Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Streamlined code for saving, and allow to use genomic views #12

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion hmm_bigwigs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"""
Created on Fri Feb 28 15:30:21 2020

@author: s1529682
@author: Ilya Flyamer
"""

from .bigwig_hmm import *
73 changes: 51 additions & 22 deletions hmm_bigwigs/__main__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

from natsort import natsort_keygen
import bbi
import bioframe as bf
from hmm_bigwigs import *


def parse_args():
parser = argparse.ArgumentParser(
description="Create bedfile of HMM states from bigwig")
description="Create bedfile of HMM states from bigwig"
)
parser.add_argument(
"-i",
"--inputfile",
Expand All @@ -18,6 +23,9 @@ def parse_args():
parser.add_argument(
"-g", "--genome", dest="genome", help="genome (i.e. hg38)", action="store"
)
parser.add_argument(
"--view", dest="view_file", help="file with genomic view", action="store"
)
parser.add_argument(
"-n",
"--num_states",
Expand All @@ -34,7 +42,7 @@ def parse_args():
help="Colormap to map states to colors",
action="store",
required=False,
default='coolwarm',
default="coolwarm",
)
parser.add_argument(
"-o",
Expand All @@ -44,30 +52,51 @@ def parse_args():
action="store",
required=False,
)
parser.add_argument(
"--save-split-files",
dest="savesplit",
help="Whether to save separate bed files split by state in addition to the output file",
action="store_true",
required=False,
default=False,
)
args = parser.parse_args()
if args.outputfile is None:
args.outputfile=args.inputfile
args.outputfile = args.inputfile
return args


def main():
args = parse_args()
print("Starting HMM on " + args.inputfile)
chroms = get_chroms(args.genome)
df = create_df(inputfile=args.inputfile, chroms=chroms)
df = hmm(df, args.num_states)
print("Finished hmm!")
df_sparse = sparse(df)
write_to_file(df_sparse, args.outputfile, args.num_states, cmap=args.cmap)
# df_final=merge_different_hmmstates(df_sparse, cLAD=cLAD, open=open_state)
# df_final.to_csv(args.outputfile+'_combined_state.bed', sep='\t', header=False, index=False)
print("write first file")
df_sparse[df_sparse["state"] == 0].to_csv(
args.outputfile + "_0_state.bed", sep="\t", header=False, index=False
if args.view_file is not None:
view = bf.make_viewframe(
bf.read_table(args.view_file, schema="bed4", index_col=False)
)
elif args.genome is not None:
view = bf.make_viewframe(bf.fetch_chromsizes(args.genome, as_bed=True))
else:
with bbi.open(args.inputfile) as f:
view = (
bf.make_viewframe(dict(f.chromsizes))
.sort_values("chrom", key=natsort_keygen())
.reset_index(drop=True)
)
df = create_df(inputfile=args.inputfile, view=view)
df = bf.assign_view(df, view)
df = (
df.dropna(subset=["value"])
.groupby("view_region")
.filter(lambda x: len(x) > 1)
.groupby("view_region")
.apply(hmm, num_states=args.num_states)
)
df_sparse[df_sparse["state"] == 1].to_csv(
args.outputfile + "_1_state.bed", sep="\t", header=False, index=False
)
df_sparse[df_sparse["state"] == 2].to_csv(
args.outputfile + "_2_state.bed", sep="\t", header=False, index=False
)
print("Finished writing to file")
df_sparse = sparse(df)
write_to_file(df_sparse, args.outputfile, cmap=args.cmap)
if args.savesplit:
for state in range(args.num_states):
df_sparse[df_sparse["state"] == state].to_csv(
f"{args.outputfile}_{state}_state.bed",
sep="\t",
header=False,
index=False,
)
79 changes: 35 additions & 44 deletions hmm_bigwigs/bigwig_hmm.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,67 +23,49 @@ def get_chroms(genome, ignoreXYMT=True):
return chr_list


def create_df(inputfile, chroms):
def create_df(inputfile, view):
"Create dataframe from bigwig"
df = pd.DataFrame(columns=["chrom", "start", "end", "value"])
for item in chroms:
ivals = list(bbi.fetch_intervals(inputfile, item, 0, -1))
df_list = []
for i, row in view.iterrows():
ivals = bbi.fetch_intervals(inputfile, row.chrom, row.start, row.end)
df_new = pd.DataFrame(ivals, columns=["chrom", "start", "end", "value"])
df = df.append(df_new, ignore_index=True)
return df
df_list.append(df_new)
return pd.concat(df_list).reset_index(drop=True)


def hmm(df, num_states):
"HMM program"
# df['value']=df['value'].replace(0,np.nan) #this removes unmappable areas of chr
# df_dropna=df.dropna(subset=['value']) #this removes unmappable areas of chr (NaN is otherwise considered 0)
# df["value"] = df["value"].replace(0, np.nan) # this removes unmappable areas of chr
# df = df.dropna(
# subset=["value"]
# ) # this removes unmappable areas of chr (NaN is otherwise considered 0)
vals = df["value"].values
model = HiddenMarkovModel.from_samples(
NormalDistribution, X=[vals], n_components=num_states
)
states = model.predict(vals)

# Rename states to increase with mean signal
order = np.argsort(df['value'].groupby(states).mean())
order = np.argsort(df["value"].groupby(states).mean())
states = [order[s] for s in states]
df["state"] = states
df['state'][np.isnan(df['value'])] = np.nan
df["state"][np.isnan(df["value"])] = np.nan
return df


def sparse(df):
"Merge neighboring bins with same state"
chr_list = []
start_list = []
state_list = []
end_list = []

for item in df["chrom"].unique():
chrom_df = df[df["chrom"] == item].reset_index()

chr_list.append((chrom_df["chrom"].iloc[0]))
start_list.append((chrom_df["start"].iloc[0]))
state_list.append((chrom_df["state"].iloc[0]))
for index, row in chrom_df[1:].iterrows():
if chrom_df["state"].iloc[index] == chrom_df["state"].iloc[(index - 1)]:
continue
else:
end_list.append(chrom_df["end"].iloc[(index - 1)])
chr_list.append(chrom_df["chrom"].iloc[index])
start_list.append(chrom_df["start"].iloc[index])
state_list.append(chrom_df["state"].iloc[index])
if len(start_list) != len(end_list):
end_list.append(chrom_df["end"].iloc[(index)])

keys = ["chrom", "start", "end", "state"]
values = [chr_list, start_list, end_list, state_list]
dictionary = dict(zip(keys, values))
df_sparse = pd.DataFrame.from_dict(dictionary)
return df_sparse.dropna()
df_sparse = (
bioframe.merge(df, on=["state"])
.dropna()
.sort_values(["chrom", "start"])
.reset_index(drop=True)
)
return df_sparse


def merge_different_hmmstates(df, cLAD, open):
"merge strong and weak HMM states into 2 "
"merge strong and weak HMM states into 2"
import pandas as pd

chr_list = []
Expand Down Expand Up @@ -121,14 +103,16 @@ def merge_different_hmmstates(df, cLAD, open):
df_merge = pd.DataFrame.from_dict(dictionary)
return df_merge

def write_to_file(df, outputfile, num_states, cmap='coolwarm'):
states = list(range(num_states))

def write_to_file(df, outputfile=None, cmap="coolwarm"):
states = list(sorted(df["state"].unique()))
cmap = plt.get_cmap(cmap)
colors = {s:cmap(s/states[-1]) for s in states}
colors = {s: cmap(s / states[-1]) for s in states}
df["score"] = "0"
df["strand"] = "."
filename = outputfile + "_" + str(num_states) + "_state_HMM_colored.bed"
df['RGB'] = df["state"].apply(lambda x: ','.join([str(int(round(c*255))) for c in colors[x][:-1]]))
df["RGB"] = df["state"].apply(
lambda x: ",".join([str(int(round(c * 255))) for c in colors[x][:-1]])
)
cols_to_keep = [
"chrom",
"start",
Expand All @@ -140,4 +124,11 @@ def write_to_file(df, outputfile, num_states, cmap='coolwarm'):
"start",
"RGB",
]
df.to_csv(filename, sep="\t", header=False, columns=cols_to_keep, index=False)
if outputfile:
df.to_csv(outputfile, sep="\t", header=False, columns=cols_to_keep, index=False)
else:
print(
df.to_csv(
outputfile, sep="\t", header=False, columns=cols_to_keep, index=False
)
)
4 changes: 2 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f:
long_description = f.read()

INSTALL_REQUIRES = ['numpy', 'pandas', 'bioframe', 'pomegranate', 'pybbi']
INSTALL_REQUIRES = ['numpy', 'pandas', 'bioframe', 'pomegranate', 'pybbi', 'natsort']

setup(
name='hmm_bigwigs',
Expand All @@ -26,4 +26,4 @@
"Programming Language :: Python",
],
zip_safe=False
)
)