From 00cd0d736331e6cd03fc9a775976b62c19731471 Mon Sep 17 00:00:00 2001 From: Phlya Date: Tue, 10 Jan 2023 19:11:57 +0100 Subject: [PATCH 1/9] optional split files, small changes --- hmm_bigwigs/__main__.py | 43 ++++++++++++++++++++++++--------------- hmm_bigwigs/bigwig_hmm.py | 26 +++++++++++++++-------- 2 files changed, 44 insertions(+), 25 deletions(-) diff --git a/hmm_bigwigs/__main__.py b/hmm_bigwigs/__main__.py index 90ca0e9..d574f9f 100755 --- a/hmm_bigwigs/__main__.py +++ b/hmm_bigwigs/__main__.py @@ -3,9 +3,11 @@ from hmm_bigwigs import * + def parse_args(): parser = argparse.ArgumentParser( - description="Create bedfile of HMM states from bigwig") + description="Create bedfile of HMM states from bigwig" + ) parser.add_argument( "-i", "--inputfile", @@ -34,7 +36,7 @@ def parse_args(): help="Colormap to map states to colors", action="store", required=False, - default='coolwarm', + default="coolwarm", ) parser.add_argument( "-o", @@ -44,30 +46,39 @@ def parse_args(): action="store", required=False, ) + parser.add_argument( + "-save-split-files", + dest="savesplit", + help="Whether to save separate bed files split by state in addition to the output file", + action="store", + required=False, + ) args = parser.parse_args() if args.outputfile is None: - args.outputfile=args.inputfile + args.outputfile = args.inputfile return args + def main(): args = parse_args() - print("Starting HMM on " + args.inputfile) + # print("Starting HMM on " + args.inputfile) chroms = get_chroms(args.genome) df = create_df(inputfile=args.inputfile, chroms=chroms) df = hmm(df, args.num_states) - print("Finished hmm!") + # print("Finished hmm!") df_sparse = sparse(df) write_to_file(df_sparse, args.outputfile, args.num_states, cmap=args.cmap) # df_final=merge_different_hmmstates(df_sparse, cLAD=cLAD, open=open_state) # df_final.to_csv(args.outputfile+'_combined_state.bed', sep='\t', header=False, index=False) - print("write first file") - df_sparse[df_sparse["state"] == 0].to_csv( - args.outputfile + "_0_state.bed", sep="\t", header=False, index=False - ) - df_sparse[df_sparse["state"] == 1].to_csv( - args.outputfile + "_1_state.bed", sep="\t", header=False, index=False - ) - df_sparse[df_sparse["state"] == 2].to_csv( - args.outputfile + "_2_state.bed", sep="\t", header=False, index=False - ) - print("Finished writing to file") \ No newline at end of file + # print("write first file") + if args.savesplit: + df_sparse[df_sparse["state"] == 0].to_csv( + args.outputfile + "_0_state.bed", sep="\t", header=False, index=False + ) + df_sparse[df_sparse["state"] == 1].to_csv( + args.outputfile + "_1_state.bed", sep="\t", header=False, index=False + ) + df_sparse[df_sparse["state"] == 2].to_csv( + args.outputfile + "_2_state.bed", sep="\t", header=False, index=False + ) + # print("Finished writing to file") diff --git a/hmm_bigwigs/bigwig_hmm.py b/hmm_bigwigs/bigwig_hmm.py index b9e7464..9d770cf 100644 --- a/hmm_bigwigs/bigwig_hmm.py +++ b/hmm_bigwigs/bigwig_hmm.py @@ -44,10 +44,10 @@ def hmm(df, num_states): states = model.predict(vals) # Rename states to increase with mean signal - order = np.argsort(df['value'].groupby(states).mean()) + order = np.argsort(df["value"].groupby(states).mean()) states = [order[s] for s in states] df["state"] = states - df['state'][np.isnan(df['value'])] = np.nan + df["state"][np.isnan(df["value"])] = np.nan return df @@ -83,7 +83,7 @@ def sparse(df): def merge_different_hmmstates(df, cLAD, open): - "merge strong and weak HMM states into 2 " + "merge strong and weak HMM states into 2" import pandas as pd chr_list = [] @@ -193,14 +193,15 @@ def strong_region(df, open_state): return mode(open_list) -def write_to_file(df, outputfile, num_states, cmap='coolwarm'): - states = list(range(num_states)) +def write_to_file(df, outputfile=None, cmap="coolwarm"): + states = list(sorted(df["state"].unique())) cmap = plt.get_cmap(cmap) - colors = {s:cmap(s/states[-1]) for s in states} + colors = {s: cmap(s / states[-1]) for s in states} df["score"] = "0" df["strand"] = "." - filename = outputfile + "_" + str(num_states) + "_state_HMM_colored.bed" - df['RGB'] = df["state"].apply(lambda x: ','.join([str(int(round(c*255))) for c in colors[x][:-1]])) + df["RGB"] = df["state"].apply( + lambda x: ",".join([str(int(round(c * 255))) for c in colors[x][:-1]]) + ) cols_to_keep = [ "chrom", "start", @@ -212,4 +213,11 @@ def write_to_file(df, outputfile, num_states, cmap='coolwarm'): "start", "RGB", ] - df.to_csv(filename, sep="\t", header=False, columns=cols_to_keep, index=False) + if outputfile: + df.to_csv(outputfile, sep="\t", header=False, columns=cols_to_keep, index=False) + else: + print( + df.to_csv( + outputfile, sep="\t", header=False, columns=cols_to_keep, index=False + ) + ) From b89a77290a394d7504b56b32ca4a886f7ee3a118 Mon Sep 17 00:00:00 2001 From: Phlya Date: Wed, 11 Jan 2023 13:32:25 +0100 Subject: [PATCH 2/9] better splitting flag --- hmm_bigwigs/__main__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hmm_bigwigs/__main__.py b/hmm_bigwigs/__main__.py index d574f9f..486096e 100755 --- a/hmm_bigwigs/__main__.py +++ b/hmm_bigwigs/__main__.py @@ -47,11 +47,12 @@ def parse_args(): required=False, ) parser.add_argument( - "-save-split-files", + "--save-split-files", dest="savesplit", help="Whether to save separate bed files split by state in addition to the output file", - action="store", + action="store_true", required=False, + default=False, ) args = parser.parse_args() if args.outputfile is None: From e41308d16c971f6434548ac7dc36fe9b546a6768 Mon Sep 17 00:00:00 2001 From: Phlya Date: Wed, 11 Jan 2023 13:35:35 +0100 Subject: [PATCH 3/9] fix --- hmm_bigwigs/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hmm_bigwigs/__main__.py b/hmm_bigwigs/__main__.py index 486096e..2678188 100755 --- a/hmm_bigwigs/__main__.py +++ b/hmm_bigwigs/__main__.py @@ -68,7 +68,7 @@ def main(): df = hmm(df, args.num_states) # print("Finished hmm!") df_sparse = sparse(df) - write_to_file(df_sparse, args.outputfile, args.num_states, cmap=args.cmap) + write_to_file(df_sparse, args.outputfile, cmap=args.cmap) # df_final=merge_different_hmmstates(df_sparse, cLAD=cLAD, open=open_state) # df_final.to_csv(args.outputfile+'_combined_state.bed', sep='\t', header=False, index=False) # print("write first file") From c96be9f6f54d883699b00816faa70327c79e04d9 Mon Sep 17 00:00:00 2001 From: Phlya Date: Wed, 11 Jan 2023 13:42:52 +0100 Subject: [PATCH 4/9] better split saving --- hmm_bigwigs/__main__.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/hmm_bigwigs/__main__.py b/hmm_bigwigs/__main__.py index 2678188..678a3ce 100755 --- a/hmm_bigwigs/__main__.py +++ b/hmm_bigwigs/__main__.py @@ -73,13 +73,11 @@ def main(): # df_final.to_csv(args.outputfile+'_combined_state.bed', sep='\t', header=False, index=False) # print("write first file") if args.savesplit: - df_sparse[df_sparse["state"] == 0].to_csv( - args.outputfile + "_0_state.bed", sep="\t", header=False, index=False - ) - df_sparse[df_sparse["state"] == 1].to_csv( - args.outputfile + "_1_state.bed", sep="\t", header=False, index=False - ) - df_sparse[df_sparse["state"] == 2].to_csv( - args.outputfile + "_2_state.bed", sep="\t", header=False, index=False - ) + for state in range(args.num_states): + df_sparse[df_sparse["state"] == state].to_csv( + f"{args.outputfile}_{state}_state.bed", + sep="\t", + header=False, + index=False, + ) # print("Finished writing to file") From c70a71520d5dc33dfac2500621237d2a777c6c28 Mon Sep 17 00:00:00 2001 From: Phlya Date: Wed, 11 Jan 2023 14:23:58 +0100 Subject: [PATCH 5/9] Make states int --- hmm_bigwigs/bigwig_hmm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hmm_bigwigs/bigwig_hmm.py b/hmm_bigwigs/bigwig_hmm.py index 9d770cf..0c17945 100644 --- a/hmm_bigwigs/bigwig_hmm.py +++ b/hmm_bigwigs/bigwig_hmm.py @@ -41,7 +41,7 @@ def hmm(df, num_states): model = HiddenMarkovModel.from_samples( NormalDistribution, X=[vals], n_components=num_states ) - states = model.predict(vals) + states = model.predict(vals).astype(int) # Rename states to increase with mean signal order = np.argsort(df["value"].groupby(states).mean()) From 677f9d0b17c5f34a9e6f278c075181d424ca8557 Mon Sep 17 00:00:00 2001 From: Phlya Date: Wed, 11 Jan 2023 14:29:48 +0100 Subject: [PATCH 6/9] Undo --- hmm_bigwigs/bigwig_hmm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hmm_bigwigs/bigwig_hmm.py b/hmm_bigwigs/bigwig_hmm.py index 0c17945..9d770cf 100644 --- a/hmm_bigwigs/bigwig_hmm.py +++ b/hmm_bigwigs/bigwig_hmm.py @@ -41,7 +41,7 @@ def hmm(df, num_states): model = HiddenMarkovModel.from_samples( NormalDistribution, X=[vals], n_components=num_states ) - states = model.predict(vals).astype(int) + states = model.predict(vals) # Rename states to increase with mean signal order = np.argsort(df["value"].groupby(states).mean()) From e144d34d9d6e0cae3c365679bdadf351e25c0a16 Mon Sep 17 00:00:00 2001 From: Phlya Date: Fri, 3 Feb 2023 12:06:56 +0100 Subject: [PATCH 7/9] Add option to use view --- hmm_bigwigs/__init__.py | 2 +- hmm_bigwigs/__main__.py | 31 ++++++++++++++++++++++--------- hmm_bigwigs/bigwig_hmm.py | 12 ++++++------ 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/hmm_bigwigs/__init__.py b/hmm_bigwigs/__init__.py index 4455c5b..da967d9 100755 --- a/hmm_bigwigs/__init__.py +++ b/hmm_bigwigs/__init__.py @@ -3,7 +3,7 @@ """ Created on Fri Feb 28 15:30:21 2020 -@author: s1529682 +@author: Ilya Flyamer """ from .bigwig_hmm import * \ No newline at end of file diff --git a/hmm_bigwigs/__main__.py b/hmm_bigwigs/__main__.py index 678a3ce..6ea4661 100755 --- a/hmm_bigwigs/__main__.py +++ b/hmm_bigwigs/__main__.py @@ -1,6 +1,9 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- +from natsort import natsort_keygen +import bbi +import bioframe as bf from hmm_bigwigs import * @@ -20,6 +23,9 @@ def parse_args(): parser.add_argument( "-g", "--genome", dest="genome", help="genome (i.e. hg38)", action="store" ) + parser.add_argument( + "--view", dest="view_file", help="file with genomic view", action="store" + ) parser.add_argument( "-n", "--num_states", @@ -62,16 +68,24 @@ def parse_args(): def main(): args = parse_args() - # print("Starting HMM on " + args.inputfile) - chroms = get_chroms(args.genome) - df = create_df(inputfile=args.inputfile, chroms=chroms) - df = hmm(df, args.num_states) - # print("Finished hmm!") + if args.view_file is not None: + view = bf.make_viewframe( + bf.read_table(args.view_file, schema="bed4", index_col=False) + ) + elif args.genome is not None: + view = bf.make_viewframe(bf.fetch_chromsizes(args.genome, as_bed=True)) + else: + with bbi.open(args.inputfile) as f: + view = ( + bf.make_viewframe(dict(f.chromsizes)) + .sort_values("chrom", key=natsort_keygen()) + .reset_index(drop=True) + ) + df = create_df(inputfile=args.inputfile, view=view) + df = bf.assign_view(df, view) + df = df.groupby("view_region").apply(hmm, num_states=args.num_states) df_sparse = sparse(df) write_to_file(df_sparse, args.outputfile, cmap=args.cmap) - # df_final=merge_different_hmmstates(df_sparse, cLAD=cLAD, open=open_state) - # df_final.to_csv(args.outputfile+'_combined_state.bed', sep='\t', header=False, index=False) - # print("write first file") if args.savesplit: for state in range(args.num_states): df_sparse[df_sparse["state"] == state].to_csv( @@ -80,4 +94,3 @@ def main(): header=False, index=False, ) - # print("Finished writing to file") diff --git a/hmm_bigwigs/bigwig_hmm.py b/hmm_bigwigs/bigwig_hmm.py index 9d770cf..dc0adb5 100644 --- a/hmm_bigwigs/bigwig_hmm.py +++ b/hmm_bigwigs/bigwig_hmm.py @@ -23,14 +23,14 @@ def get_chroms(genome, ignoreXYMT=True): return chr_list -def create_df(inputfile, chroms): +def create_df(inputfile, view): "Create dataframe from bigwig" - df = pd.DataFrame(columns=["chrom", "start", "end", "value"]) - for item in chroms: - ivals = list(bbi.fetch_intervals(inputfile, item, 0, -1)) + df_list = [] + for i, row in view.iterrows(): + ivals = bbi.fetch_intervals(inputfile, row.chrom, row.start, row.end) df_new = pd.DataFrame(ivals, columns=["chrom", "start", "end", "value"]) - df = df.append(df_new, ignore_index=True) - return df + df_list.append(df_new) + return pd.concat(df_list).reset_index(drop=True) def hmm(df, num_states): From 7f7e7a6c8c3dfee17e5ac476b6b6a295ba510f6f Mon Sep 17 00:00:00 2001 From: Ilya Flyamer Date: Fri, 3 Feb 2023 13:04:26 +0100 Subject: [PATCH 8/9] Update setup.py --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index d14e9bf..eaa3e6a 100755 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f: long_description = f.read() -INSTALL_REQUIRES = ['numpy', 'pandas', 'bioframe', 'pomegranate', 'pybbi'] +INSTALL_REQUIRES = ['numpy', 'pandas', 'bioframe', 'pomegranate', 'pybbi', 'natsort'] setup( name='hmm_bigwigs', @@ -26,4 +26,4 @@ "Programming Language :: Python", ], zip_safe=False -) \ No newline at end of file +) From bcf278f932c135d9e0ef15f33a1d119ce66813ef Mon Sep 17 00:00:00 2001 From: Ilya Flyamer Date: Wed, 1 Mar 2023 13:50:12 +0100 Subject: [PATCH 9/9] Fix for missing data (#3) * Fix na (#2) fix for empty or tiny view regions * sort sparse df --- hmm_bigwigs/__main__.py | 8 +++++++- hmm_bigwigs/bigwig_hmm.py | 40 +++++++++++---------------------------- 2 files changed, 18 insertions(+), 30 deletions(-) diff --git a/hmm_bigwigs/__main__.py b/hmm_bigwigs/__main__.py index 6ea4661..c86a051 100755 --- a/hmm_bigwigs/__main__.py +++ b/hmm_bigwigs/__main__.py @@ -83,7 +83,13 @@ def main(): ) df = create_df(inputfile=args.inputfile, view=view) df = bf.assign_view(df, view) - df = df.groupby("view_region").apply(hmm, num_states=args.num_states) + df = ( + df.dropna(subset=["value"]) + .groupby("view_region") + .filter(lambda x: len(x) > 1) + .groupby("view_region") + .apply(hmm, num_states=args.num_states) + ) df_sparse = sparse(df) write_to_file(df_sparse, args.outputfile, cmap=args.cmap) if args.savesplit: diff --git a/hmm_bigwigs/bigwig_hmm.py b/hmm_bigwigs/bigwig_hmm.py index cebd93c..d5e7afb 100644 --- a/hmm_bigwigs/bigwig_hmm.py +++ b/hmm_bigwigs/bigwig_hmm.py @@ -35,8 +35,10 @@ def create_df(inputfile, view): def hmm(df, num_states): "HMM program" - # df['value']=df['value'].replace(0,np.nan) #this removes unmappable areas of chr - # df_dropna=df.dropna(subset=['value']) #this removes unmappable areas of chr (NaN is otherwise considered 0) + # df["value"] = df["value"].replace(0, np.nan) # this removes unmappable areas of chr + # df = df.dropna( + # subset=["value"] + # ) # this removes unmappable areas of chr (NaN is otherwise considered 0) vals = df["value"].values model = HiddenMarkovModel.from_samples( NormalDistribution, X=[vals], n_components=num_states @@ -53,33 +55,13 @@ def hmm(df, num_states): def sparse(df): "Merge neighboring bins with same state" - chr_list = [] - start_list = [] - state_list = [] - end_list = [] - - for item in df["chrom"].unique(): - chrom_df = df[df["chrom"] == item].reset_index() - - chr_list.append((chrom_df["chrom"].iloc[0])) - start_list.append((chrom_df["start"].iloc[0])) - state_list.append((chrom_df["state"].iloc[0])) - for index, row in chrom_df[1:].iterrows(): - if chrom_df["state"].iloc[index] == chrom_df["state"].iloc[(index - 1)]: - continue - else: - end_list.append(chrom_df["end"].iloc[(index - 1)]) - chr_list.append(chrom_df["chrom"].iloc[index]) - start_list.append(chrom_df["start"].iloc[index]) - state_list.append(chrom_df["state"].iloc[index]) - if len(start_list) != len(end_list): - end_list.append(chrom_df["end"].iloc[(index)]) - - keys = ["chrom", "start", "end", "state"] - values = [chr_list, start_list, end_list, state_list] - dictionary = dict(zip(keys, values)) - df_sparse = pd.DataFrame.from_dict(dictionary) - return df_sparse.dropna() + df_sparse = ( + bioframe.merge(df, on=["state"]) + .dropna() + .sort_values(["chrom", "start"]) + .reset_index(drop=True) + ) + return df_sparse def merge_different_hmmstates(df, cLAD, open):