diff --git a/hmm_bigwigs/__init__.py b/hmm_bigwigs/__init__.py index 4455c5b..da967d9 100755 --- a/hmm_bigwigs/__init__.py +++ b/hmm_bigwigs/__init__.py @@ -3,7 +3,7 @@ """ Created on Fri Feb 28 15:30:21 2020 -@author: s1529682 +@author: Ilya Flyamer """ from .bigwig_hmm import * \ No newline at end of file diff --git a/hmm_bigwigs/__main__.py b/hmm_bigwigs/__main__.py index 90ca0e9..c86a051 100755 --- a/hmm_bigwigs/__main__.py +++ b/hmm_bigwigs/__main__.py @@ -1,11 +1,16 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- +from natsort import natsort_keygen +import bbi +import bioframe as bf from hmm_bigwigs import * + def parse_args(): parser = argparse.ArgumentParser( - description="Create bedfile of HMM states from bigwig") + description="Create bedfile of HMM states from bigwig" + ) parser.add_argument( "-i", "--inputfile", @@ -18,6 +23,9 @@ def parse_args(): parser.add_argument( "-g", "--genome", dest="genome", help="genome (i.e. hg38)", action="store" ) + parser.add_argument( + "--view", dest="view_file", help="file with genomic view", action="store" + ) parser.add_argument( "-n", "--num_states", @@ -34,7 +42,7 @@ def parse_args(): help="Colormap to map states to colors", action="store", required=False, - default='coolwarm', + default="coolwarm", ) parser.add_argument( "-o", @@ -44,30 +52,51 @@ def parse_args(): action="store", required=False, ) + parser.add_argument( + "--save-split-files", + dest="savesplit", + help="Whether to save separate bed files split by state in addition to the output file", + action="store_true", + required=False, + default=False, + ) args = parser.parse_args() if args.outputfile is None: - args.outputfile=args.inputfile + args.outputfile = args.inputfile return args + def main(): args = parse_args() - print("Starting HMM on " + args.inputfile) - chroms = get_chroms(args.genome) - df = create_df(inputfile=args.inputfile, chroms=chroms) - df = hmm(df, args.num_states) - print("Finished hmm!") - df_sparse = sparse(df) - write_to_file(df_sparse, args.outputfile, args.num_states, cmap=args.cmap) - # df_final=merge_different_hmmstates(df_sparse, cLAD=cLAD, open=open_state) - # df_final.to_csv(args.outputfile+'_combined_state.bed', sep='\t', header=False, index=False) - print("write first file") - df_sparse[df_sparse["state"] == 0].to_csv( - args.outputfile + "_0_state.bed", sep="\t", header=False, index=False + if args.view_file is not None: + view = bf.make_viewframe( + bf.read_table(args.view_file, schema="bed4", index_col=False) + ) + elif args.genome is not None: + view = bf.make_viewframe(bf.fetch_chromsizes(args.genome, as_bed=True)) + else: + with bbi.open(args.inputfile) as f: + view = ( + bf.make_viewframe(dict(f.chromsizes)) + .sort_values("chrom", key=natsort_keygen()) + .reset_index(drop=True) + ) + df = create_df(inputfile=args.inputfile, view=view) + df = bf.assign_view(df, view) + df = ( + df.dropna(subset=["value"]) + .groupby("view_region") + .filter(lambda x: len(x) > 1) + .groupby("view_region") + .apply(hmm, num_states=args.num_states) ) - df_sparse[df_sparse["state"] == 1].to_csv( - args.outputfile + "_1_state.bed", sep="\t", header=False, index=False - ) - df_sparse[df_sparse["state"] == 2].to_csv( - args.outputfile + "_2_state.bed", sep="\t", header=False, index=False - ) - print("Finished writing to file") \ No newline at end of file + df_sparse = sparse(df) + write_to_file(df_sparse, args.outputfile, cmap=args.cmap) + if args.savesplit: + for state in range(args.num_states): + df_sparse[df_sparse["state"] == state].to_csv( + f"{args.outputfile}_{state}_state.bed", + sep="\t", + header=False, + index=False, + ) diff --git a/hmm_bigwigs/bigwig_hmm.py b/hmm_bigwigs/bigwig_hmm.py index 4629d27..d5e7afb 100644 --- a/hmm_bigwigs/bigwig_hmm.py +++ b/hmm_bigwigs/bigwig_hmm.py @@ -23,20 +23,22 @@ def get_chroms(genome, ignoreXYMT=True): return chr_list -def create_df(inputfile, chroms): +def create_df(inputfile, view): "Create dataframe from bigwig" - df = pd.DataFrame(columns=["chrom", "start", "end", "value"]) - for item in chroms: - ivals = list(bbi.fetch_intervals(inputfile, item, 0, -1)) + df_list = [] + for i, row in view.iterrows(): + ivals = bbi.fetch_intervals(inputfile, row.chrom, row.start, row.end) df_new = pd.DataFrame(ivals, columns=["chrom", "start", "end", "value"]) - df = df.append(df_new, ignore_index=True) - return df + df_list.append(df_new) + return pd.concat(df_list).reset_index(drop=True) def hmm(df, num_states): "HMM program" - # df['value']=df['value'].replace(0,np.nan) #this removes unmappable areas of chr - # df_dropna=df.dropna(subset=['value']) #this removes unmappable areas of chr (NaN is otherwise considered 0) + # df["value"] = df["value"].replace(0, np.nan) # this removes unmappable areas of chr + # df = df.dropna( + # subset=["value"] + # ) # this removes unmappable areas of chr (NaN is otherwise considered 0) vals = df["value"].values model = HiddenMarkovModel.from_samples( NormalDistribution, X=[vals], n_components=num_states @@ -44,46 +46,26 @@ def hmm(df, num_states): states = model.predict(vals) # Rename states to increase with mean signal - order = np.argsort(df['value'].groupby(states).mean()) + order = np.argsort(df["value"].groupby(states).mean()) states = [order[s] for s in states] df["state"] = states - df['state'][np.isnan(df['value'])] = np.nan + df["state"][np.isnan(df["value"])] = np.nan return df def sparse(df): "Merge neighboring bins with same state" - chr_list = [] - start_list = [] - state_list = [] - end_list = [] - - for item in df["chrom"].unique(): - chrom_df = df[df["chrom"] == item].reset_index() - - chr_list.append((chrom_df["chrom"].iloc[0])) - start_list.append((chrom_df["start"].iloc[0])) - state_list.append((chrom_df["state"].iloc[0])) - for index, row in chrom_df[1:].iterrows(): - if chrom_df["state"].iloc[index] == chrom_df["state"].iloc[(index - 1)]: - continue - else: - end_list.append(chrom_df["end"].iloc[(index - 1)]) - chr_list.append(chrom_df["chrom"].iloc[index]) - start_list.append(chrom_df["start"].iloc[index]) - state_list.append(chrom_df["state"].iloc[index]) - if len(start_list) != len(end_list): - end_list.append(chrom_df["end"].iloc[(index)]) - - keys = ["chrom", "start", "end", "state"] - values = [chr_list, start_list, end_list, state_list] - dictionary = dict(zip(keys, values)) - df_sparse = pd.DataFrame.from_dict(dictionary) - return df_sparse.dropna() + df_sparse = ( + bioframe.merge(df, on=["state"]) + .dropna() + .sort_values(["chrom", "start"]) + .reset_index(drop=True) + ) + return df_sparse def merge_different_hmmstates(df, cLAD, open): - "merge strong and weak HMM states into 2 " + "merge strong and weak HMM states into 2" import pandas as pd chr_list = [] @@ -121,14 +103,16 @@ def merge_different_hmmstates(df, cLAD, open): df_merge = pd.DataFrame.from_dict(dictionary) return df_merge -def write_to_file(df, outputfile, num_states, cmap='coolwarm'): - states = list(range(num_states)) + +def write_to_file(df, outputfile=None, cmap="coolwarm"): + states = list(sorted(df["state"].unique())) cmap = plt.get_cmap(cmap) - colors = {s:cmap(s/states[-1]) for s in states} + colors = {s: cmap(s / states[-1]) for s in states} df["score"] = "0" df["strand"] = "." - filename = outputfile + "_" + str(num_states) + "_state_HMM_colored.bed" - df['RGB'] = df["state"].apply(lambda x: ','.join([str(int(round(c*255))) for c in colors[x][:-1]])) + df["RGB"] = df["state"].apply( + lambda x: ",".join([str(int(round(c * 255))) for c in colors[x][:-1]]) + ) cols_to_keep = [ "chrom", "start", @@ -140,4 +124,11 @@ def write_to_file(df, outputfile, num_states, cmap='coolwarm'): "start", "RGB", ] - df.to_csv(filename, sep="\t", header=False, columns=cols_to_keep, index=False) + if outputfile: + df.to_csv(outputfile, sep="\t", header=False, columns=cols_to_keep, index=False) + else: + print( + df.to_csv( + outputfile, sep="\t", header=False, columns=cols_to_keep, index=False + ) + ) diff --git a/setup.py b/setup.py index d14e9bf..eaa3e6a 100755 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f: long_description = f.read() -INSTALL_REQUIRES = ['numpy', 'pandas', 'bioframe', 'pomegranate', 'pybbi'] +INSTALL_REQUIRES = ['numpy', 'pandas', 'bioframe', 'pomegranate', 'pybbi', 'natsort'] setup( name='hmm_bigwigs', @@ -26,4 +26,4 @@ "Programming Language :: Python", ], zip_safe=False -) \ No newline at end of file +)