From 00cd0d736331e6cd03fc9a775976b62c19731471 Mon Sep 17 00:00:00 2001
From: Phlya <flyamer@gmail.com>
Date: Tue, 10 Jan 2023 19:11:57 +0100
Subject: [PATCH 1/9] optional split files, small changes

---
 hmm_bigwigs/__main__.py   | 43 ++++++++++++++++++++++++---------------
 hmm_bigwigs/bigwig_hmm.py | 26 +++++++++++++++--------
 2 files changed, 44 insertions(+), 25 deletions(-)

diff --git a/hmm_bigwigs/__main__.py b/hmm_bigwigs/__main__.py
index 90ca0e9..d574f9f 100755
--- a/hmm_bigwigs/__main__.py
+++ b/hmm_bigwigs/__main__.py
@@ -3,9 +3,11 @@
 
 from hmm_bigwigs import *
 
+
 def parse_args():
     parser = argparse.ArgumentParser(
-    description="Create bedfile of HMM states from bigwig")
+        description="Create bedfile of HMM states from bigwig"
+    )
     parser.add_argument(
         "-i",
         "--inputfile",
@@ -34,7 +36,7 @@ def parse_args():
         help="Colormap to map states to colors",
         action="store",
         required=False,
-        default='coolwarm',
+        default="coolwarm",
     )
     parser.add_argument(
         "-o",
@@ -44,30 +46,39 @@ def parse_args():
         action="store",
         required=False,
     )
+    parser.add_argument(
+        "-save-split-files",
+        dest="savesplit",
+        help="Whether to save separate bed files split by state in addition to the output file",
+        action="store",
+        required=False,
+    )
     args = parser.parse_args()
     if args.outputfile is None:
-        args.outputfile=args.inputfile
+        args.outputfile = args.inputfile
     return args
 
+
 def main():
     args = parse_args()
-    print("Starting HMM on " + args.inputfile)
+    # print("Starting HMM on " + args.inputfile)
     chroms = get_chroms(args.genome)
     df = create_df(inputfile=args.inputfile, chroms=chroms)
     df = hmm(df, args.num_states)
-    print("Finished hmm!")
+    # print("Finished hmm!")
     df_sparse = sparse(df)
     write_to_file(df_sparse, args.outputfile, args.num_states, cmap=args.cmap)
     # df_final=merge_different_hmmstates(df_sparse, cLAD=cLAD, open=open_state)
     # df_final.to_csv(args.outputfile+'_combined_state.bed', sep='\t', header=False, index=False)
-    print("write first file")
-    df_sparse[df_sparse["state"] == 0].to_csv(
-        args.outputfile + "_0_state.bed", sep="\t", header=False, index=False
-    )
-    df_sparse[df_sparse["state"] == 1].to_csv(
-        args.outputfile + "_1_state.bed", sep="\t", header=False, index=False
-    )
-    df_sparse[df_sparse["state"] == 2].to_csv(
-        args.outputfile + "_2_state.bed", sep="\t", header=False, index=False
-    )
-    print("Finished writing to file")
\ No newline at end of file
+    # print("write first file")
+    if args.savesplit:
+        df_sparse[df_sparse["state"] == 0].to_csv(
+            args.outputfile + "_0_state.bed", sep="\t", header=False, index=False
+        )
+        df_sparse[df_sparse["state"] == 1].to_csv(
+            args.outputfile + "_1_state.bed", sep="\t", header=False, index=False
+        )
+        df_sparse[df_sparse["state"] == 2].to_csv(
+            args.outputfile + "_2_state.bed", sep="\t", header=False, index=False
+        )
+    # print("Finished writing to file")
diff --git a/hmm_bigwigs/bigwig_hmm.py b/hmm_bigwigs/bigwig_hmm.py
index b9e7464..9d770cf 100644
--- a/hmm_bigwigs/bigwig_hmm.py
+++ b/hmm_bigwigs/bigwig_hmm.py
@@ -44,10 +44,10 @@ def hmm(df, num_states):
     states = model.predict(vals)
 
     # Rename states to increase with mean signal
-    order = np.argsort(df['value'].groupby(states).mean())
+    order = np.argsort(df["value"].groupby(states).mean())
     states = [order[s] for s in states]
     df["state"] = states
-    df['state'][np.isnan(df['value'])] = np.nan
+    df["state"][np.isnan(df["value"])] = np.nan
     return df
 
 
@@ -83,7 +83,7 @@ def sparse(df):
 
 
 def merge_different_hmmstates(df, cLAD, open):
-    "merge strong and weak HMM states into 2 "
+    "merge strong and weak HMM states into 2"
     import pandas as pd
 
     chr_list = []
@@ -193,14 +193,15 @@ def strong_region(df, open_state):
     return mode(open_list)
 
 
-def write_to_file(df, outputfile, num_states, cmap='coolwarm'):
-    states = list(range(num_states))
+def write_to_file(df, outputfile=None, cmap="coolwarm"):
+    states = list(sorted(df["state"].unique()))
     cmap = plt.get_cmap(cmap)
-    colors = {s:cmap(s/states[-1]) for s in states}
+    colors = {s: cmap(s / states[-1]) for s in states}
     df["score"] = "0"
     df["strand"] = "."
-    filename = outputfile + "_" + str(num_states) + "_state_HMM_colored.bed"
-    df['RGB'] = df["state"].apply(lambda x: ','.join([str(int(round(c*255))) for c in colors[x][:-1]]))
+    df["RGB"] = df["state"].apply(
+        lambda x: ",".join([str(int(round(c * 255))) for c in colors[x][:-1]])
+    )
     cols_to_keep = [
         "chrom",
         "start",
@@ -212,4 +213,11 @@ def write_to_file(df, outputfile, num_states, cmap='coolwarm'):
         "start",
         "RGB",
     ]
-    df.to_csv(filename, sep="\t", header=False, columns=cols_to_keep, index=False)
+    if outputfile:
+        df.to_csv(outputfile, sep="\t", header=False, columns=cols_to_keep, index=False)
+    else:
+        print(
+            df.to_csv(
+                outputfile, sep="\t", header=False, columns=cols_to_keep, index=False
+            )
+        )

From b89a77290a394d7504b56b32ca4a886f7ee3a118 Mon Sep 17 00:00:00 2001
From: Phlya <flyamer@gmail.com>
Date: Wed, 11 Jan 2023 13:32:25 +0100
Subject: [PATCH 2/9] better splitting flag

---
 hmm_bigwigs/__main__.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/hmm_bigwigs/__main__.py b/hmm_bigwigs/__main__.py
index d574f9f..486096e 100755
--- a/hmm_bigwigs/__main__.py
+++ b/hmm_bigwigs/__main__.py
@@ -47,11 +47,12 @@ def parse_args():
         required=False,
     )
     parser.add_argument(
-        "-save-split-files",
+        "--save-split-files",
         dest="savesplit",
         help="Whether to save separate bed files split by state in addition to the output file",
-        action="store",
+        action="store_true",
         required=False,
+        default=False,
     )
     args = parser.parse_args()
     if args.outputfile is None:

From e41308d16c971f6434548ac7dc36fe9b546a6768 Mon Sep 17 00:00:00 2001
From: Phlya <flyamer@gmail.com>
Date: Wed, 11 Jan 2023 13:35:35 +0100
Subject: [PATCH 3/9] fix

---
 hmm_bigwigs/__main__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hmm_bigwigs/__main__.py b/hmm_bigwigs/__main__.py
index 486096e..2678188 100755
--- a/hmm_bigwigs/__main__.py
+++ b/hmm_bigwigs/__main__.py
@@ -68,7 +68,7 @@ def main():
     df = hmm(df, args.num_states)
     # print("Finished hmm!")
     df_sparse = sparse(df)
-    write_to_file(df_sparse, args.outputfile, args.num_states, cmap=args.cmap)
+    write_to_file(df_sparse, args.outputfile, cmap=args.cmap)
     # df_final=merge_different_hmmstates(df_sparse, cLAD=cLAD, open=open_state)
     # df_final.to_csv(args.outputfile+'_combined_state.bed', sep='\t', header=False, index=False)
     # print("write first file")

From c96be9f6f54d883699b00816faa70327c79e04d9 Mon Sep 17 00:00:00 2001
From: Phlya <flyamer@gmail.com>
Date: Wed, 11 Jan 2023 13:42:52 +0100
Subject: [PATCH 4/9] better split saving

---
 hmm_bigwigs/__main__.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/hmm_bigwigs/__main__.py b/hmm_bigwigs/__main__.py
index 2678188..678a3ce 100755
--- a/hmm_bigwigs/__main__.py
+++ b/hmm_bigwigs/__main__.py
@@ -73,13 +73,11 @@ def main():
     # df_final.to_csv(args.outputfile+'_combined_state.bed', sep='\t', header=False, index=False)
     # print("write first file")
     if args.savesplit:
-        df_sparse[df_sparse["state"] == 0].to_csv(
-            args.outputfile + "_0_state.bed", sep="\t", header=False, index=False
-        )
-        df_sparse[df_sparse["state"] == 1].to_csv(
-            args.outputfile + "_1_state.bed", sep="\t", header=False, index=False
-        )
-        df_sparse[df_sparse["state"] == 2].to_csv(
-            args.outputfile + "_2_state.bed", sep="\t", header=False, index=False
-        )
+        for state in range(args.num_states):
+            df_sparse[df_sparse["state"] == state].to_csv(
+                f"{args.outputfile}_{state}_state.bed",
+                sep="\t",
+                header=False,
+                index=False,
+            )
     # print("Finished writing to file")

From c70a71520d5dc33dfac2500621237d2a777c6c28 Mon Sep 17 00:00:00 2001
From: Phlya <flyamer@gmail.com>
Date: Wed, 11 Jan 2023 14:23:58 +0100
Subject: [PATCH 5/9] Make states int

---
 hmm_bigwigs/bigwig_hmm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hmm_bigwigs/bigwig_hmm.py b/hmm_bigwigs/bigwig_hmm.py
index 9d770cf..0c17945 100644
--- a/hmm_bigwigs/bigwig_hmm.py
+++ b/hmm_bigwigs/bigwig_hmm.py
@@ -41,7 +41,7 @@ def hmm(df, num_states):
     model = HiddenMarkovModel.from_samples(
         NormalDistribution, X=[vals], n_components=num_states
     )
-    states = model.predict(vals)
+    states = model.predict(vals).astype(int)
 
     # Rename states to increase with mean signal
     order = np.argsort(df["value"].groupby(states).mean())

From 677f9d0b17c5f34a9e6f278c075181d424ca8557 Mon Sep 17 00:00:00 2001
From: Phlya <flyamer@gmail.com>
Date: Wed, 11 Jan 2023 14:29:48 +0100
Subject: [PATCH 6/9] Undo

---
 hmm_bigwigs/bigwig_hmm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hmm_bigwigs/bigwig_hmm.py b/hmm_bigwigs/bigwig_hmm.py
index 0c17945..9d770cf 100644
--- a/hmm_bigwigs/bigwig_hmm.py
+++ b/hmm_bigwigs/bigwig_hmm.py
@@ -41,7 +41,7 @@ def hmm(df, num_states):
     model = HiddenMarkovModel.from_samples(
         NormalDistribution, X=[vals], n_components=num_states
     )
-    states = model.predict(vals).astype(int)
+    states = model.predict(vals)
 
     # Rename states to increase with mean signal
     order = np.argsort(df["value"].groupby(states).mean())

From e144d34d9d6e0cae3c365679bdadf351e25c0a16 Mon Sep 17 00:00:00 2001
From: Phlya <flyamer@gmail.com>
Date: Fri, 3 Feb 2023 12:06:56 +0100
Subject: [PATCH 7/9] Add option to use view

---
 hmm_bigwigs/__init__.py   |  2 +-
 hmm_bigwigs/__main__.py   | 31 ++++++++++++++++++++++---------
 hmm_bigwigs/bigwig_hmm.py | 12 ++++++------
 3 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/hmm_bigwigs/__init__.py b/hmm_bigwigs/__init__.py
index 4455c5b..da967d9 100755
--- a/hmm_bigwigs/__init__.py
+++ b/hmm_bigwigs/__init__.py
@@ -3,7 +3,7 @@
 """
 Created on Fri Feb 28 15:30:21 2020
 
-@author: s1529682
+@author: Ilya Flyamer
 """
 
 from .bigwig_hmm import *
\ No newline at end of file
diff --git a/hmm_bigwigs/__main__.py b/hmm_bigwigs/__main__.py
index 678a3ce..6ea4661 100755
--- a/hmm_bigwigs/__main__.py
+++ b/hmm_bigwigs/__main__.py
@@ -1,6 +1,9 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
 
+from natsort import natsort_keygen
+import bbi
+import bioframe as bf
 from hmm_bigwigs import *
 
 
@@ -20,6 +23,9 @@ def parse_args():
     parser.add_argument(
         "-g", "--genome", dest="genome", help="genome (i.e. hg38)", action="store"
     )
+    parser.add_argument(
+        "--view", dest="view_file", help="file with genomic view", action="store"
+    )
     parser.add_argument(
         "-n",
         "--num_states",
@@ -62,16 +68,24 @@ def parse_args():
 
 def main():
     args = parse_args()
-    # print("Starting HMM on " + args.inputfile)
-    chroms = get_chroms(args.genome)
-    df = create_df(inputfile=args.inputfile, chroms=chroms)
-    df = hmm(df, args.num_states)
-    # print("Finished hmm!")
+    if args.view_file is not None:
+        view = bf.make_viewframe(
+            bf.read_table(args.view_file, schema="bed4", index_col=False)
+        )
+    elif args.genome is not None:
+        view = bf.make_viewframe(bf.fetch_chromsizes(args.genome, as_bed=True))
+    else:
+        with bbi.open(args.inputfile) as f:
+            view = (
+                bf.make_viewframe(dict(f.chromsizes))
+                .sort_values("chrom", key=natsort_keygen())
+                .reset_index(drop=True)
+            )
+    df = create_df(inputfile=args.inputfile, view=view)
+    df = bf.assign_view(df, view)
+    df = df.groupby("view_region").apply(hmm, num_states=args.num_states)
     df_sparse = sparse(df)
     write_to_file(df_sparse, args.outputfile, cmap=args.cmap)
-    # df_final=merge_different_hmmstates(df_sparse, cLAD=cLAD, open=open_state)
-    # df_final.to_csv(args.outputfile+'_combined_state.bed', sep='\t', header=False, index=False)
-    # print("write first file")
     if args.savesplit:
         for state in range(args.num_states):
             df_sparse[df_sparse["state"] == state].to_csv(
@@ -80,4 +94,3 @@ def main():
                 header=False,
                 index=False,
             )
-    # print("Finished writing to file")
diff --git a/hmm_bigwigs/bigwig_hmm.py b/hmm_bigwigs/bigwig_hmm.py
index 9d770cf..dc0adb5 100644
--- a/hmm_bigwigs/bigwig_hmm.py
+++ b/hmm_bigwigs/bigwig_hmm.py
@@ -23,14 +23,14 @@ def get_chroms(genome, ignoreXYMT=True):
     return chr_list
 
 
-def create_df(inputfile, chroms):
+def create_df(inputfile, view):
     "Create dataframe from bigwig"
-    df = pd.DataFrame(columns=["chrom", "start", "end", "value"])
-    for item in chroms:
-        ivals = list(bbi.fetch_intervals(inputfile, item, 0, -1))
+    df_list = []
+    for i, row in view.iterrows():
+        ivals = bbi.fetch_intervals(inputfile, row.chrom, row.start, row.end)
         df_new = pd.DataFrame(ivals, columns=["chrom", "start", "end", "value"])
-        df = df.append(df_new, ignore_index=True)
-    return df
+        df_list.append(df_new)
+    return pd.concat(df_list).reset_index(drop=True)
 
 
 def hmm(df, num_states):

From 7f7e7a6c8c3dfee17e5ac476b6b6a295ba510f6f Mon Sep 17 00:00:00 2001
From: Ilya Flyamer <flyamer@gmail.com>
Date: Fri, 3 Feb 2023 13:04:26 +0100
Subject: [PATCH 8/9] Update setup.py

---
 setup.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index d14e9bf..eaa3e6a 100755
--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,7 @@
 with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
     long_description = f.read()
 
-INSTALL_REQUIRES = ['numpy', 'pandas', 'bioframe', 'pomegranate', 'pybbi']
+INSTALL_REQUIRES = ['numpy', 'pandas', 'bioframe', 'pomegranate', 'pybbi', 'natsort']
 
 setup(
       name='hmm_bigwigs',
@@ -26,4 +26,4 @@
         "Programming Language :: Python",
     ],
     zip_safe=False
-)
\ No newline at end of file
+)

From bcf278f932c135d9e0ef15f33a1d119ce66813ef Mon Sep 17 00:00:00 2001
From: Ilya Flyamer <flyamer@gmail.com>
Date: Wed, 1 Mar 2023 13:50:12 +0100
Subject: [PATCH 9/9] Fix for missing data (#3)

* Fix na (#2)

fix for empty or tiny view regions

* sort sparse df
---
 hmm_bigwigs/__main__.py   |  8 +++++++-
 hmm_bigwigs/bigwig_hmm.py | 40 +++++++++++----------------------------
 2 files changed, 18 insertions(+), 30 deletions(-)

diff --git a/hmm_bigwigs/__main__.py b/hmm_bigwigs/__main__.py
index 6ea4661..c86a051 100755
--- a/hmm_bigwigs/__main__.py
+++ b/hmm_bigwigs/__main__.py
@@ -83,7 +83,13 @@ def main():
             )
     df = create_df(inputfile=args.inputfile, view=view)
     df = bf.assign_view(df, view)
-    df = df.groupby("view_region").apply(hmm, num_states=args.num_states)
+    df = (
+        df.dropna(subset=["value"])
+        .groupby("view_region")
+        .filter(lambda x: len(x) > 1)
+        .groupby("view_region")
+        .apply(hmm, num_states=args.num_states)
+    )
     df_sparse = sparse(df)
     write_to_file(df_sparse, args.outputfile, cmap=args.cmap)
     if args.savesplit:
diff --git a/hmm_bigwigs/bigwig_hmm.py b/hmm_bigwigs/bigwig_hmm.py
index cebd93c..d5e7afb 100644
--- a/hmm_bigwigs/bigwig_hmm.py
+++ b/hmm_bigwigs/bigwig_hmm.py
@@ -35,8 +35,10 @@ def create_df(inputfile, view):
 
 def hmm(df, num_states):
     "HMM program"
-    # df['value']=df['value'].replace(0,np.nan) #this removes unmappable areas of chr
-    # df_dropna=df.dropna(subset=['value']) #this removes unmappable areas of chr (NaN is otherwise considered 0)
+    # df["value"] = df["value"].replace(0, np.nan)  # this removes unmappable areas of chr
+    # df = df.dropna(
+    #     subset=["value"]
+    # )  # this removes unmappable areas of chr (NaN is otherwise considered 0)
     vals = df["value"].values
     model = HiddenMarkovModel.from_samples(
         NormalDistribution, X=[vals], n_components=num_states
@@ -53,33 +55,13 @@ def hmm(df, num_states):
 
 def sparse(df):
     "Merge neighboring bins with same state"
-    chr_list = []
-    start_list = []
-    state_list = []
-    end_list = []
-
-    for item in df["chrom"].unique():
-        chrom_df = df[df["chrom"] == item].reset_index()
-
-        chr_list.append((chrom_df["chrom"].iloc[0]))
-        start_list.append((chrom_df["start"].iloc[0]))
-        state_list.append((chrom_df["state"].iloc[0]))
-        for index, row in chrom_df[1:].iterrows():
-            if chrom_df["state"].iloc[index] == chrom_df["state"].iloc[(index - 1)]:
-                continue
-            else:
-                end_list.append(chrom_df["end"].iloc[(index - 1)])
-                chr_list.append(chrom_df["chrom"].iloc[index])
-                start_list.append(chrom_df["start"].iloc[index])
-                state_list.append(chrom_df["state"].iloc[index])
-        if len(start_list) != len(end_list):
-            end_list.append(chrom_df["end"].iloc[(index)])
-
-    keys = ["chrom", "start", "end", "state"]
-    values = [chr_list, start_list, end_list, state_list]
-    dictionary = dict(zip(keys, values))
-    df_sparse = pd.DataFrame.from_dict(dictionary)
-    return df_sparse.dropna()
+    df_sparse = (
+        bioframe.merge(df, on=["state"])
+        .dropna()
+        .sort_values(["chrom", "start"])
+        .reset_index(drop=True)
+    )
+    return df_sparse
 
 
 def merge_different_hmmstates(df, cLAD, open):