gspracklin · Phlya · Jan 10, 2023 · Jan 11, 2023 · Jan 11, 2023 · Jan 11, 2023
diff --git a/hmm_bigwigs/__init__.py b/hmm_bigwigs/__init__.py
@@ -3,7 +3,7 @@
 """
 Created on Fri Feb 28 15:30:21 2020
 
-@author: s1529682
+@author: Ilya Flyamer
 """
 
 from .bigwig_hmm import *
diff --git a/hmm_bigwigs/__main__.py b/hmm_bigwigs/__main__.py
@@ -1,11 +1,16 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
+from natsort import natsort_keygen
+import bbi
+import bioframe as bf
 from hmm_bigwigs import *
 
+
 def parse_args():
     parser = argparse.ArgumentParser(
-    description="Create bedfile of HMM states from bigwig")
+        description="Create bedfile of HMM states from bigwig"
+    )
     parser.add_argument(
         "-i",
         "--inputfile",
@@ -18,6 +23,9 @@ def parse_args():
     parser.add_argument(
         "-g", "--genome", dest="genome", help="genome (i.e. hg38)", action="store"
     )
+    parser.add_argument(
+        "--view", dest="view_file", help="file with genomic view", action="store"
+    )
     parser.add_argument(
         "-n",
         "--num_states",
@@ -34,7 +42,7 @@ def parse_args():
         help="Colormap to map states to colors",
         action="store",
         required=False,
-        default='coolwarm',
+        default="coolwarm",
     )
     parser.add_argument(
         "-o",
@@ -44,30 +52,51 @@ def parse_args():
         action="store",
         required=False,
     )
+    parser.add_argument(
+        "--save-split-files",
+        dest="savesplit",
+        help="Whether to save separate bed files split by state in addition to the output file",
+        action="store_true",
+        required=False,
+        default=False,
+    )
     args = parser.parse_args()
     if args.outputfile is None:
-        args.outputfile=args.inputfile
+        args.outputfile = args.inputfile
     return args
 
+
 def main():
     args = parse_args()
-    print("Starting HMM on " + args.inputfile)
-    chroms = get_chroms(args.genome)
-    df = create_df(inputfile=args.inputfile, chroms=chroms)
-    df = hmm(df, args.num_states)
-    print("Finished hmm!")
-    df_sparse = sparse(df)
-    write_to_file(df_sparse, args.outputfile, args.num_states, cmap=args.cmap)
-    # df_final=merge_different_hmmstates(df_sparse, cLAD=cLAD, open=open_state)
-    # df_final.to_csv(args.outputfile+'_combined_state.bed', sep='\t', header=False, index=False)
-    print("write first file")
-    df_sparse[df_sparse["state"] == 0].to_csv(
-        args.outputfile + "_0_state.bed", sep="\t", header=False, index=False
+    if args.view_file is not None:
+        view = bf.make_viewframe(
+            bf.read_table(args.view_file, schema="bed4", index_col=False)
+        )
+    elif args.genome is not None:
+        view = bf.make_viewframe(bf.fetch_chromsizes(args.genome, as_bed=True))
+    else:
+        with bbi.open(args.inputfile) as f:
+            view = (
+                bf.make_viewframe(dict(f.chromsizes))
+                .sort_values("chrom", key=natsort_keygen())
+                .reset_index(drop=True)
+            )
+    df = create_df(inputfile=args.inputfile, view=view)
+    df = bf.assign_view(df, view)
+    df = (
+        df.dropna(subset=["value"])
+        .groupby("view_region")
+        .filter(lambda x: len(x) > 1)
+        .groupby("view_region")
+        .apply(hmm, num_states=args.num_states)
     )
-    df_sparse[df_sparse["state"] == 1].to_csv(
-        args.outputfile + "_1_state.bed", sep="\t", header=False, index=False
-    )
-    df_sparse[df_sparse["state"] == 2].to_csv(
-        args.outputfile + "_2_state.bed", sep="\t", header=False, index=False
-    )
-    print("Finished writing to file")
+    df_sparse = sparse(df)
+    write_to_file(df_sparse, args.outputfile, cmap=args.cmap)
+    if args.savesplit:
+        for state in range(args.num_states):
+            df_sparse[df_sparse["state"] == state].to_csv(
+                f"{args.outputfile}_{state}_state.bed",
+                sep="\t",
+                header=False,
+                index=False,
+            )
diff --git a/hmm_bigwigs/bigwig_hmm.py b/hmm_bigwigs/bigwig_hmm.py
@@ -23,67 +23,49 @@ def get_chroms(genome, ignoreXYMT=True):
     return chr_list
 
 
-def create_df(inputfile, chroms):
+def create_df(inputfile, view):
     "Create dataframe from bigwig"
-    df = pd.DataFrame(columns=["chrom", "start", "end", "value"])
-    for item in chroms:
-        ivals = list(bbi.fetch_intervals(inputfile, item, 0, -1))
+    df_list = []
+    for i, row in view.iterrows():
+        ivals = bbi.fetch_intervals(inputfile, row.chrom, row.start, row.end)
         df_new = pd.DataFrame(ivals, columns=["chrom", "start", "end", "value"])
-        df = df.append(df_new, ignore_index=True)
-    return df
+        df_list.append(df_new)
+    return pd.concat(df_list).reset_index(drop=True)
 
 
 def hmm(df, num_states):
     "HMM program"
-    # df['value']=df['value'].replace(0,np.nan) #this removes unmappable areas of chr
-    # df_dropna=df.dropna(subset=['value']) #this removes unmappable areas of chr (NaN is otherwise considered 0)
+    # df["value"] = df["value"].replace(0, np.nan)  # this removes unmappable areas of chr
+    # df = df.dropna(
+    #     subset=["value"]
+    # )  # this removes unmappable areas of chr (NaN is otherwise considered 0)
     vals = df["value"].values
     model = HiddenMarkovModel.from_samples(
         NormalDistribution, X=[vals], n_components=num_states
     )
     states = model.predict(vals)
 
     # Rename states to increase with mean signal
-    order = np.argsort(df['value'].groupby(states).mean())
+    order = np.argsort(df["value"].groupby(states).mean())
     states = [order[s] for s in states]
     df["state"] = states
-    df['state'][np.isnan(df['value'])] = np.nan
+    df["state"][np.isnan(df["value"])] = np.nan
     return df
 
 
 def sparse(df):
     "Merge neighboring bins with same state"
-    chr_list = []
-    start_list = []
-    state_list = []
-    end_list = []
-
-    for item in df["chrom"].unique():
-        chrom_df = df[df["chrom"] == item].reset_index()
-
-        chr_list.append((chrom_df["chrom"].iloc[0]))
-        start_list.append((chrom_df["start"].iloc[0]))
-        state_list.append((chrom_df["state"].iloc[0]))
-        for index, row in chrom_df[1:].iterrows():
-            if chrom_df["state"].iloc[index] == chrom_df["state"].iloc[(index - 1)]:
-                continue
-            else:
-                end_list.append(chrom_df["end"].iloc[(index - 1)])
-                chr_list.append(chrom_df["chrom"].iloc[index])
-                start_list.append(chrom_df["start"].iloc[index])
-                state_list.append(chrom_df["state"].iloc[index])
-        if len(start_list) != len(end_list):
-            end_list.append(chrom_df["end"].iloc[(index)])
-
-    keys = ["chrom", "start", "end", "state"]
-    values = [chr_list, start_list, end_list, state_list]
-    dictionary = dict(zip(keys, values))
-    df_sparse = pd.DataFrame.from_dict(dictionary)
-    return df_sparse.dropna()
+    df_sparse = (
+        bioframe.merge(df, on=["state"])
+        .dropna()
+        .sort_values(["chrom", "start"])
+        .reset_index(drop=True)
+    )
+    return df_sparse
 
 
 def merge_different_hmmstates(df, cLAD, open):
-    "merge strong and weak HMM states into 2 "
+    "merge strong and weak HMM states into 2"
     import pandas as pd
 
     chr_list = []
@@ -121,14 +103,16 @@ def merge_different_hmmstates(df, cLAD, open):
     df_merge = pd.DataFrame.from_dict(dictionary)
     return df_merge
 
-def write_to_file(df, outputfile, num_states, cmap='coolwarm'):
-    states = list(range(num_states))
+
+def write_to_file(df, outputfile=None, cmap="coolwarm"):
+    states = list(sorted(df["state"].unique()))
     cmap = plt.get_cmap(cmap)
-    colors = {s:cmap(s/states[-1]) for s in states}
+    colors = {s: cmap(s / states[-1]) for s in states}
     df["score"] = "0"
     df["strand"] = "."
-    filename = outputfile + "_" + str(num_states) + "_state_HMM_colored.bed"
-    df['RGB'] = df["state"].apply(lambda x: ','.join([str(int(round(c*255))) for c in colors[x][:-1]]))
+    df["RGB"] = df["state"].apply(
+        lambda x: ",".join([str(int(round(c * 255))) for c in colors[x][:-1]])
+    )
     cols_to_keep = [
         "chrom",
         "start",
@@ -140,4 +124,11 @@ def write_to_file(df, outputfile, num_states, cmap='coolwarm'):
         "start",
         "RGB",
     ]
-    df.to_csv(filename, sep="\t", header=False, columns=cols_to_keep, index=False)
+    if outputfile:
+        df.to_csv(outputfile, sep="\t", header=False, columns=cols_to_keep, index=False)
+    else:
+        print(
+            df.to_csv(
+                outputfile, sep="\t", header=False, columns=cols_to_keep, index=False
+            )
+        )
diff --git a/setup.py b/setup.py
@@ -7,7 +7,7 @@
 with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
     long_description = f.read()
 
-INSTALL_REQUIRES = ['numpy', 'pandas', 'bioframe', 'pomegranate', 'pybbi']
+INSTALL_REQUIRES = ['numpy', 'pandas', 'bioframe', 'pomegranate', 'pybbi', 'natsort']
 
 setup(
       name='hmm_bigwigs',
@@ -26,4 +26,4 @@
         "Programming Language :: Python",
     ],
     zip_safe=False
-)
+)