model update (#1)
* model update

* test fix config fix

* curl added

* onnx version

* onnx version

* onnx version

* onnx version

* debug

* debug

* cores default

* cores default

* annotation update

* model update

* test case update

* bug fix for promoter

---------

Co-authored-by: mh <mh@mhs-Mac-mini.local>
MuhammedHasan and mh authored Mar 6, 2024
1 parent 99c783d commit dc8fdac
Showing 26 changed files with 3,066 additions and 119 deletions.
10 changes: 6 additions & 4 deletions .github/workflows/python-app.yml
@@ -12,15 +12,17 @@ jobs:

steps:
- uses: actions/checkout@v2
- name: Set up Python 3.7
- name: Set up Python 3.8
uses: actions/setup-python@v2
with:
python-version: 3.7
python-version: 3.8
- name: Install cURL Headers
run: sudo apt-get install libcurl4-openssl-dev
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install pytest cython pytest-cov pytest-xdist
python -m pip install -e .
pip install pytest cython pytest-cov pytest-xdist
python -m pip install -e ".[all]"
- name: Test with pytest
run: |
python -m pytest -n auto tests/ --junitxml=junit/test-results.xml --cov=epiout --cov-report=xml
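The workflow now installs the package as `pip install -e ".[all]"`, pulling in the optional extras alongside the core dependencies. Below is a minimal sketch of how such extras could be declared in `setup.py`, assuming extras named `hic` and `onnx` as suggested by the README changes further down; the real epiout packaging metadata may differ.

```
# Hypothetical sketch only; the actual epiout packaging metadata may differ.
from setuptools import setup, find_packages

extras = {
    "hic": ["hic-straw"],      # Hi-C contact scores, needs libcurl headers (see the cURL step above)
    "onnx": ["onnxruntime"],   # ONNX model for aberrant-expression prediction
}
# "all" simply unions the individual extras so CI can install everything at once.
extras["all"] = sorted({dep for deps in extras.values() for dep in deps})

setup(
    name="epiout",
    packages=find_packages(),
    extras_require=extras,     # enables `pip install -e ".[all]"`
)
```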
4 changes: 2 additions & 2 deletions README.md
@@ -29,12 +29,12 @@ conda install -c bioconda hic-straw
or
```
conda install -c conda-forge curl
pip install hic-straw
pip install epiout[hic]
```

and another optional dependency is `onnxruntime` to predict aberrant gene expression from aberrant chromatin accessibility:
```
pip install onnxruntime
pip install epiout[onnx]
```

## Usage
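Since `hic-straw` and `onnxruntime` are now extras rather than hard requirements, the package has to tolerate their absence at import time. A minimal sketch of the lazy-import pattern this implies follows; the function body and error wording are illustrative only, as the diff below merely shows that an `import_onnxruntime` helper exists in `epiout/gene.py`.

```
def import_onnxruntime():
    """Illustrative lazy import; not the actual epiout helper."""
    try:
        import onnxruntime  # only needed for gene-level prediction
    except ImportError as err:
        raise ImportError(
            "onnxruntime is required for aberrant gene prediction; "
            "install it with `pip install epiout[onnx]`."
        ) from err
    return onnxruntime
```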
16 changes: 8 additions & 8 deletions docs/environment.yml
@@ -1,4 +1,4 @@
name: lapa
name: epiout
channels:
- bioconda
- conda-forge
@@ -7,10 +7,10 @@ dependencies:
- pip=21.2
- python=3.7
- pip:
- cython==0.29.28
- sphinx==4.4.0
- sphinx-autoapi==1.8.4
- sphinx-click==3.1.0
- sphinx-rtd-theme==1.0.0
- m2r2==0.3.2
- nbsphinx==0.8.8
- cython==0.29.28
- sphinx==4.4.0
- sphinx-autoapi==1.8.4
- sphinx-click==3.1.0
- sphinx-rtd-theme==1.0.0
- m2r2==0.3.2
- nbsphinx==0.8.8
78 changes: 42 additions & 36 deletions epiout/annotate.py
@@ -214,13 +214,6 @@ def annotate_hic(self, gr):
for chrom, df_chrom in df.groupby("Chromosome"):
df_nn = self._neighbors(pr.PyRanges(df_chrom))

try:
print(f"\t Annotating contacts {chrom}:")
hic_scores = self.hic.contact_scores(chrom)
except ValueError:
warnings.warn(f"{chrom} not found in hic file. Skipping...")
continue

df_nn["peak_other"] = (
df_nn["Chromosome"].astype(str)
+ ":"
@@ -241,36 +234,50 @@

center = (df_nn.Start + df_nn.End) // 2
center_other = (df_nn.Start_b + df_nn.End_b) // 2
df_nn["distance"] = abs(center_other - center)

bins = center // self.hic_binsize
bins_other = center_other // self.hic_binsize
bins_index = hic_scores.shape[1] // 2 + (bins_other - bins)

in_vicinity = bins_index < hic_scores.shape[1]

bins = bins[in_vicinity]
bins_index = bins_index[in_vicinity]
df_nn = df_nn.loc[in_vicinity]

df_nn["hic_score"] = np.array(
[
hic_scores[
np.minimum(np.maximum(0, bins + i),
hic_scores.shape[0] - 1),
np.minimum(
np.maximum(0, bins_index +
j), hic_scores.shape[1] - 1
),
if self.hic is not None:
try:
print(f"\t Annotating contacts {chrom}:")
hic_scores = self.hic.contact_scores(chrom)
except ValueError:
warnings.warn(
f"{chrom} not found in hic file. Skipping...")
continue

bins = center // self.hic_binsize
bins_other = center_other // self.hic_binsize
bins_index = hic_scores.shape[1] // 2 + (bins_other - bins)

in_vicinity = bins_index < hic_scores.shape[1]

bins = bins[in_vicinity]
bins_index = bins_index[in_vicinity]
df_nn = df_nn.loc[in_vicinity]

df_nn["hic_score"] = np.array(
[
hic_scores[
np.minimum(np.maximum(0, bins + i),
hic_scores.shape[0] - 1),
np.minimum(
np.maximum(0, bins_index +
j), hic_scores.shape[1] - 1
),
]
for i in range(-1, 2)
for j in range(-1, 2)
]
for i in range(-1, 2)
for j in range(-1, 2)
]
).max(axis=0)

df_nn["distance"] = abs(center_other - center)
).max(axis=0)
else:
df_nn['hic_score'] = (
np.exp(
13.8552
- np.log(np.maximum(df_nn['distance'], 10_000))
+ df_nn['co_outlier'] * 0.0981)
).tolist()

df_nn["count"] = df_nn["peak_other"].map(self.counts).fillna(0)

df_nn = df_nn.set_index("peak")[
[
"peak_other",
@@ -308,9 +315,8 @@ def save_annotate(self, gr, output_prefix):
df_gtf.to_csv(output_prefix + ".gtf.csv")

print("Annotating hic contacts...")
if self.hic is not None:
df_batch_writer(self.annotate_hic(
gr), output_prefix + ".interaction.csv")
df_batch_writer(self.annotate_hic(
gr), output_prefix + ".interaction.csv")


class GTFAnnot:
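The main behavioural change in `annotate_hic` is that a Hi-C file is now optional: without one, the contact score is approximated from genomic distance (floored at 10 kb) and the co-outlier flag, using the fitted coefficients shown in the diff, `exp(13.8552 - log(max(distance, 10000)) + 0.0981 * co_outlier)`. A small standalone sketch of that fallback with made-up peak pairs, not epiout's API:

```
import numpy as np
import pandas as pd

def fallback_hic_score(distance, co_outlier):
    """Distance-based stand-in for a Hi-C contact score; coefficients taken from the diff."""
    distance = np.maximum(np.asarray(distance, dtype=float), 10_000)
    return np.exp(13.8552 - np.log(distance) + 0.0981 * np.asarray(co_outlier))

# Toy peak pairs: closer pairs and co-outlier pairs receive higher scores.
df = pd.DataFrame({
    "distance":   [5_000, 50_000, 500_000],
    "co_outlier": [1,     0,      0],
})
df["hic_score"] = fallback_hic_score(df["distance"], df["co_outlier"])
print(df)
```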
5 changes: 3 additions & 2 deletions epiout/dataset.py
@@ -25,7 +25,8 @@ def __init__(self, bed, alignments, njobs=1, slack=200, subset_chrom=False):
self.alignments = self.read_alignments(alignments)
self.njobs = njobs

def read_bed(self, bed, slack=200, subset_chrom=False):
@staticmethod
def read_bed(bed, slack=200, subset_chrom=False):
'''
Read bed file and overlapping merge peaks with slack of
by default 200bp, subset chromosomes of
@@ -39,7 +40,7 @@ def read_bed(self, bed, slack=200, subset_chrom=False):
if subset_chrom:
bed = bed[bed.Chromosome.isin(chroms)]

self.valid_bed(bed)
EpiOutDataset._valid_bed(bed)
return bed

def read_alignments(self, alignments):
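`read_bed` becomes a `@staticmethod`, so peak preprocessing (merging overlapping peaks with a slack, optionally keeping only standard chromosomes) can be reused without constructing a full dataset object. A hedged usage sketch, assuming a hypothetical `peaks.bed` input and that the method returns a PyRanges object as its use of `pyranges` suggests:

```
from epiout.dataset import EpiOutDataset

# "peaks.bed" is a placeholder path; slack merges peaks within 200 bp.
bed = EpiOutDataset.read_bed("peaks.bed", slack=200, subset_chrom=True)
print(bed.df.head())  # assumes a PyRanges return value
```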
110 changes: 77 additions & 33 deletions epiout/gene.py
@@ -21,35 +21,49 @@ def import_onnxruntime():

class GeneLink:
'''
Predict aberrant genes based on the features from promoters, proximal
and distal enhancers.
Predict aberrant genes based on the features from tss, proximal
and distal region.
'''

def __init__(self, annotation, annotation_gtf, interaction):
self.df_annotation = pd.read_csv(annotation)
def __init__(self, annotation_gtf, interaction, annotation=None):
self.df_gtf = pd.read_csv(annotation_gtf)

self.df_gtf = self.df_gtf[self.df_gtf["gene_type"] == "protein_coding"]
self.df_interaction = pd.read_csv(interaction)

self.df_peak_gene = self._peak_gene()

if annotation:
self.df_annotation = pd.read_csv(annotation)
self.df_peak_annot_gene = self._peak_annot_gene()

def _peak_gene(self):
df = (
self.df_gtf[["peak", "Feature", "gene_id"]]
.drop_duplicates()
.set_index("peak")
)
return df[~df["gene_id"].isna()]

def _peak_annot_gene(self):
df = (
self.df_annotation.set_index("peak")[["annotation"]]
.join(self.df_gtf.set_index("peak")[["Feature", "gene_id"]])
.reset_index()
.drop_duplicates()
.set_index("peak")
)

df = df[~df["gene_id"].isna()]
return df
return df[~df["gene_id"].isna()]

def _promoter_gene(self):
return self.df_peak_annot_gene[
(self.df_peak_annot_gene["annotation"] == "promoter") &
self.df_peak_annot_gene["Feature"].isin({"tss", "five_prime_utr"})
]

def _tss_gene(self):
return self.df_peak_gene[
(self.df_peak_gene["annotation"] == "promoter")
& (self.df_peak_gene["Feature"].isin({"tss", "five_prime_utr"}))
self.df_peak_gene["Feature"].isin({"tss", "five_prime_utr"})
]

def promoter_gene(self, df_result):
@@ -58,12 +72,21 @@ def promoter_gene(self, df_result):
'''
return df_result.set_index("peak").join(self._promoter_gene(), how="inner")

def tss_gene(self, df_result):
'''
Calculate tss outlier scores for each gene.
'''
return df_result.set_index("peak").join(self._tss_gene(), how="inner")

def _promoters(self):
return self._promoter_gene().index

def _tss(self):
return self._tss_gene().index

def _melt(self, df, value_name):
return (
df[self._promoters()]
df[self._tss()]
.reset_index()
.melt(id_vars="index", value_name=value_name)
.rename(columns={"index": "sample"})
@@ -72,7 +95,7 @@ def _melt(self, df, value_name):

def promoter_features(self, result, eps=1e-16):
'''
Calculate promoter outlier scores for each gene.
Calculate tss outlier scores for each gene.
'''
df = pd.concat(
[
@@ -95,34 +118,55 @@
})
)

def tss_features(self, result, eps=1e-16):
'''
Calculate tss outlier scores for each gene.
'''
df = pd.concat(
[
self._melt(result.outlier, "tss_outlier").astype("int"),
self._melt(result.l2fc.abs(), "l2fc"),
self._melt(-np.log10(np.maximum(result.pval, eps)),
"-log(pval)"),
],
axis=1,
)
return (
df.reset_index(level=0)
.join(self._tss_gene(), how="inner")
.reset_index()
.groupby(["gene_id", "sample"])
.agg({
"tss_outlier": "max",
"l2fc": "max",
"-log(pval)": "max",
})
)

def _proximal_gene(self):
return self.df_peak_gene[
~(
(self.df_peak_gene["annotation"] == "promoter")
& (self.df_peak_gene["Feature"].isin({"tss", "five_prime_utr"}))
)
~self.df_peak_gene["Feature"].isin({"tss", "five_prime_utr"})
]

def proximal_gene(self, df_result):
'''
Calculate proximal enhancer scores for each gene.
Calculate proximal region scores for each gene.
'''
return df_result.set_index("peak").join(self._proximal_gene(), how="inner")

def proximal_features(self, df_result):
df_proximal = self.proximal_gene(df_result)
df_proximal["Proximal-Enhancer"] = df_proximal["l2fc"].abs()
return df_proximal.groupby(["gene_id", "sample"])[["Proximal-Enhancer"]].max()
df_proximal["proximal"] = df_proximal["l2fc"].abs()
return df_proximal.groupby(["gene_id", "sample"])[["proximal"]].max()

def _distal_gene(self):
promoters = set(self._promoter_gene().index)
tss = set(self._tss_gene().index)
df = (
self.df_interaction[self.df_interaction["peak"].isin(promoters)]
self.df_interaction[self.df_interaction["peak"].isin(tss)]
.set_index("peak")
.join(self.df_peak_gene)
.rename(columns={"peak_other": "peak"})
)
del df["annotation"]
return (
df[["abc_score", "distance", "gene_id", "peak"]]
.drop_duplicates()
@@ -131,46 +175,46 @@ def _distal_gene(self):

def distal_gene(self, df_result):
'''
Calculate distal enhancer scores for each gene.
Calculate distal region scores for each gene.
'''
df_promoter = self._promoter_gene().set_index("gene_id", append=True)
df_tss = self._tss_gene().set_index("gene_id", append=True)
df_proximal = self._proximal_gene().set_index("gene_id", append=True)

promoter_proximal = set(df_promoter.index).union(df_proximal.index)
tss_proximal = set(df_tss.index).union(df_proximal.index)

df_distal = self._distal_gene().set_index("gene_id", append=True)
df_distal = df_distal[~df_distal.index.isin(promoter_proximal)]
df_distal = df_distal[~df_distal.index.isin(tss_proximal)]

return df_result.set_index("peak").join(
df_distal.reset_index(level=1), how="inner"
)

def distal_features(self, df_result):
'''
Calculate distal enhancer scores for each gene.
Calculate distal region scores for each gene.
'''
df_distal = self.distal_gene(df_result)
df_distal["Distal-Enhancer"] = df_distal["abc_score"] * \
df_distal["distal"] = df_distal["abc_score"] * \
df_distal["l2fc"].abs()
return df_distal.groupby(["gene_id", "sample"])[["Distal-Enhancer"]].sum()
return df_distal.groupby(["gene_id", "sample"])[["distal"]].sum()

def features(self, result):
'''
Prepare features for prediction from promoters, proximal and
distal enhancers.
Prepare features for prediction from tss, proximal and
distal region.
'''
df_result = result.results()
return (
self.promoter_features(result)
self.tss_features(result)
.join(self.proximal_features(df_result), how="outer")
.join(self.distal_features(df_result), how="outer")
.fillna(0)
)

def predict(self, result):
'''
Predict aberrant genes based on the features from promoters,
proximal and distal enhancers.
Predict aberrant genes based on the features from tss,
proximal and distal regions.
'''
features = self.features(result)

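Across `gene.py`, the features are renamed from promoter / Proximal-Enhancer / Distal-Enhancer to `tss`, `proximal`, and `distal`; the distal feature is the ABC score weighted by the absolute log2 fold change and summed per gene and sample. A toy pandas sketch of that aggregation step, with made-up values but the column names from the diff:

```
import pandas as pd

# Toy distal peak-gene links with outlier effect sizes.
df_distal = pd.DataFrame({
    "gene_id":   ["g1", "g1", "g2"],
    "sample":    ["s1", "s1", "s1"],
    "abc_score": [0.4, 0.1, 0.8],
    "l2fc":      [-2.0, 1.0, 0.5],
})

# distal = abc_score * |l2fc|, summed over all linked peaks per gene and sample.
df_distal["distal"] = df_distal["abc_score"] * df_distal["l2fc"].abs()
features = df_distal.groupby(["gene_id", "sample"])[["distal"]].sum()
print(features)
```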
