Skip to content

Commit

Permalink
Merge pull request #179 from monarch-initiative/Feature_counts
Browse files Browse the repository at this point in the history
Create Protein table so users can see list of protein features and coordinates
  • Loading branch information
lnrekerle authored Jul 11, 2024
2 parents d0edd1f + 8aaf00f commit 951d8b7
Show file tree
Hide file tree
Showing 13 changed files with 297 additions and 3,838 deletions.
3,880 changes: 67 additions & 3,813 deletions notebooks/ANKRD11/KBG.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ dependencies = [
"statsmodels>=0.13.0",
"numpy>=1.23",
"matplotlib>=3.2.0,<4.0",
"tqdm>=4.60"
"tqdm>=4.60",
]
dynamic = ["version"]

Expand Down
6 changes: 5 additions & 1 deletion src/genophenocorr/model/_protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,7 +223,11 @@ def protein_features(self) -> typing.Sequence[ProteinFeature]:
return self._features

@property
def protein_length(self):
def protein_length(self) -> int:
"""
Returns:
int: length of protein
"""
return self._protein_length

def domains(self) -> typing.Iterable[ProteinFeature]:
Expand Down
43 changes: 29 additions & 14 deletions src/genophenocorr/model/_variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,6 +356,35 @@ class FunctionalAnnotationAware(metaclass=abc.ABCMeta):
@abc.abstractmethod
def tx_annotations(self) -> typing.Sequence[TranscriptAnnotation]:
pass

def get_tx_anno_by_tx_id(self, transcript_id: str) -> typing.Optional[TranscriptAnnotation]:
    """Look up the annotation of this variant with respect to a single transcript.

    Args:
        transcript_id (str): A transcript ID - i.e. 'NM_170707.4'
    Returns:
        typing.Optional[TranscriptAnnotation]: The first item of `tx_annotations` whose
        `transcript_id` equals the given ID, or `None` if there is no such item.
    """
    # Lazy scan: stops at the first annotation with a matching transcript ID.
    matching = (ann for ann in self.tx_annotations if ann.transcript_id == transcript_id)
    return next(matching, None)

def get_hgvs_cdna_by_tx_id(self, transcript_id: str) -> typing.Optional[str]:
    """Given a transcript ID, will return the hgvs cdna string associated with that variant and transcript.

    The transcript lookup is delegated to `get_tx_anno_by_tx_id` so that both accessors
    always agree on which annotation matches a transcript ID.

    Args:
        transcript_id (str): A transcript ID - i.e. 'NM_170707.4'
    Returns:
        str or None: The hgvs cdna if available - i.e. 'NM_170707.4:c.1824C>T'.
        `None` is returned when no annotation matches the transcript ID
        or when the matching annotation has no hgvs cdna.
    """
    tx_ann = self.get_tx_anno_by_tx_id(transcript_id)
    if tx_ann is None:
        return None
    return tx_ann.hgvs_cdna


class Variant(VariantCoordinateAware, FunctionalAnnotationAware, Genotyped):
Expand Down Expand Up @@ -423,20 +452,6 @@ def tx_annotations(self) -> typing.Sequence[TranscriptAnnotation]:
@property
def genotypes(self) -> Genotypes:
return self._gts

def get_hgvs_cdna_by_tx(self, transcript_id: str) -> typing.Optional[str]:
    """Return the hgvs cdna string of this variant with respect to the given transcript.

    Args:
        transcript_id (str): A transcript ID - i.e. 'NM_170707.4'
    Returns:
        str or None: The `hgvs_cdna` of the first item of `tx_annotations` whose
        `transcript_id` equals the given ID - i.e. 'NM_170707.4:c.1824C>T' -
        or `None` if no item matches.
    """
    hgvs_of_matching = (
        ann.hgvs_cdna
        for ann in self.tx_annotations
        if ann.transcript_id == transcript_id
    )
    return next(hgvs_of_matching, None)

def __eq__(self, other) -> bool:
return isinstance(other, Variant) \
Expand Down
3 changes: 2 additions & 1 deletion src/genophenocorr/preprocessing/_uniprot.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,9 @@ def parse_uniprot_json(
for feature in protein['features']:
feature_start = int(feature['location']['start']['value'])
feature_end = int(feature['location']['end']['value'])
feature_name = feature['description']
feature_info = FeatureInfo(
feature['description'],
feature_name,
Region(start=feature_start, end=feature_end),
)
feature_type = FeatureType[feature['type'].upper()]
Expand Down
3 changes: 2 additions & 1 deletion src/genophenocorr/view/__init__.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
from ._cohort import CohortViewable
from ._disease import DiseaseViewable
from ._protein_viewer import ProteinViewable
from ._protein_visualizable import ProteinVisualizable
from ._stats import StatsViewer
from ._txp import VariantTranscriptVisualizer
from ._protein_visualizer import ProteinVisualizer

__all__ = [
'CohortViewable',
'ProteinVisualizer', 'ProteinVisualizable',
'ProteinVisualizer', 'ProteinVisualizable', 'ProteinViewable',
'DiseaseViewable',
'StatsViewer',
'VariantTranscriptVisualizer'
Expand Down
8 changes: 5 additions & 3 deletions src/genophenocorr/view/_cohort.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,11 +86,13 @@ def _prepare_context(

var_effects_list = list()
if transcript_id is not None:
has_transcript = False
has_transcript = True
data_by_tx = cohort.variant_effect_count_by_tx(tx_id=transcript_id)
# e.g., data structure -- {'effect': 'FRAMESHIFT_VARIANT', 'count': 175}, {'effect': 'STOP_GAINED', 'count': 67},
for k, v in data_by_tx.items():
var_effects_list.append({"effect": k, "count": v})
for tx_id, counter in data_by_tx.items():
if tx_id == transcript_id:
for effect, count in counter.items():
var_effects_list.append({"effect": effect, "count": count})
else:
has_transcript = False
# The following dictionary is used by the Jinja2 HTML template
Expand Down
79 changes: 79 additions & 0 deletions src/genophenocorr/view/_protein_viewer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
import typing

from dataclasses import dataclass

from jinja2 import Environment, PackageLoader
from collections import namedtuple

from genophenocorr.model import Cohort
from genophenocorr.model.genome import Region
from ._protein_visualizable import ProteinVisualizable

@dataclass
class Feature:
    """
    A private, mutable record that backs one row of the protein feature table.

    The `protein.html` Jinja template reads the attributes by name,
    so any change made here must be mirrored in the template.
    """
    # Name of the protein feature.
    name: str
    # Type of the protein feature, as a string.
    type: str
    # Region of the feature; the template shows its `start` and `end`.
    region: Region
    # Number of variants counted for the feature.
    variant_count: int


class ProteinViewable:
    """
    Class to create a pretty HTML table to display the protein information in the Jupyter notebook.

    The table has one row per protein feature and shows the feature name, type, coordinates,
    and the number of cohort variants whose protein effect location overlaps with the feature.
    """

    def __init__(self) -> None:
        # Autoescaping is on because the rendered strings (protein label, feature names)
        # come from protein metadata rather than from this package, and must be treated
        # as text, not as HTML markup.
        environment = Environment(
            loader=PackageLoader('genophenocorr.view', 'templates'),
            autoescape=True,
        )
        self._cohort_template = environment.get_template("protein.html")

    def process(self, cohort: Cohort, pvis: ProteinVisualizable) -> str:
        """
        Summarize the data regarding the protein into a HTML table.

        Args:
            cohort (Cohort): the cohort of patients being analyzed
            pvis (ProteinVisualizable): The class that collects data from the UniProt API for a given protein ID
        Returns:
            str: an HTML document for showing in Jupyter notebook
        """
        context = self._prepare_context(cohort, pvis)
        return self._cohort_template.render(context)

    def _prepare_context(self, cohort: Cohort, pvis: ProteinVisualizable) -> typing.Mapping[str, typing.Any]:
        """
        Prepare the values consumed by the `protein.html` template.

        Args:
            cohort (Cohort): the cohort whose variants are counted
            pvis (ProteinVisualizable): the source of the protein ID, transcript ID,
                protein metadata, and the parallel sequences describing the protein features
        Returns:
            typing.Mapping[str, typing.Any]: a mapping with the keys `protein_id`, `protein_label`,
            and `protein_features` (a list of `Feature` sorted by the region start)
        """
        # The protein effect location of a variant does not depend on the feature,
        # so the locations are collected once instead of once per feature.
        transcript_id = pvis.transcript_id
        locations = []
        for var in cohort.all_variants():
            tx_anno = var.get_tx_anno_by_tx_id(transcript_id)
            if tx_anno is None:
                # The variant has no annotation with respect to the transcript.
                continue
            location = tx_anno.protein_effect_location
            if location is not None:
                locations.append(location)

        protein_features = []
        feature_columns = zip(
            pvis.protein_feature_names,
            pvis.protein_feature_types,
            pvis.protein_feature_starts,
            pvis.protein_feature_ends,
        )
        for name, feature_type, start, end in feature_columns:
            region = Region(start, end)
            variant_count = sum(1 for location in locations if location.overlaps_with(region))
            protein_features.append(
                Feature(
                    name=name,
                    type=feature_type,
                    region=region,
                    variant_count=variant_count,
                )
            )

        # The sort is stable, hence features with the same start keep their original order.
        protein_features.sort(key=lambda f: f.region.start)

        return {
            'protein_id': pvis.protein_id,
            'protein_label': pvis.protein_metadata.label,
            'protein_features': protein_features,
        }

8 changes: 7 additions & 1 deletion src/genophenocorr/view/_protein_visualizable.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,12 @@ def __init__(
self._variant_effect.append(variant_effects[0])

self._protein_feature_names = list()
self._protein_feature_types = list()
self._protein_feature_starts = list()
self._protein_feature_ends = list()
for feature in protein_meta.protein_features:
self._protein_feature_names.append(feature.info.name)
self._protein_feature_types.append(feature.feature_type.name.lower())
self._protein_feature_starts.append(feature.info.start)
self._protein_feature_ends.append(feature.info.end)

Expand Down Expand Up @@ -101,6 +103,10 @@ def protein_feature_starts(self) -> typing.Sequence[int]:
@property
def protein_feature_ends(self) -> typing.Sequence[int]:
    """
    Get the `info.end` values of the protein features, in the order of the features
    in the protein metadata. The sequence is index-aligned with the other
    `protein_feature_*` sequences filled by the constructor.
    """
    return self._protein_feature_ends

@property
def protein_feature_types(self) -> typing.Sequence[str]:
    """
    Get the lower-cased names of the protein features' `feature_type`, in the order
    of the features in the protein metadata. The sequence is index-aligned with the other
    `protein_feature_*` sequences filled by the constructor.
    """
    return self._protein_feature_types

@property
def protein_length(self) -> int:
Expand All @@ -123,7 +129,7 @@ def protein_length(self) -> int:
@property
def protein_feature_names(self) -> typing.Sequence[str]:
    """
    Get the `info.name` values of the protein features, in the order of the features
    in the protein metadata. The sequence is index-aligned with the other
    `protein_feature_*` sequences filled by the constructor.
    """
    return self._protein_feature_names

@property
def variant_effects(self) -> typing.Sequence[VariantEffect]:
    """
    Get the variant effects collected by the constructor.

    NOTE(review): the constructor appears to store only the first variant effect
    of each annotation it considers - confirm against the full `__init__`.
    """
    return self._variant_effect
Expand Down
2 changes: 1 addition & 1 deletion src/genophenocorr/view/templates/cohort.html
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
}

caption {
caption-side: bottom;
caption-side: top;
text-align: left;
padding-bottom: 10px;
font-weight: bold;
Expand Down
86 changes: 86 additions & 0 deletions src/genophenocorr/view/templates/protein.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <title>Protein</title>
    {# Context variables: `protein_id`, `protein_label`, and `protein_features`. Each item of `protein_features` must expose `name`, `type`, `region.start`, `region.end`, and `variant_count`. #}
    <style>table {
        border-collapse: collapse;
        margin: 25px 0;
        font-size: 0.9em;
        font-family: sans-serif;
        min-width: 400px;
        box-shadow: 0 0 20px rgba(0, 0, 0, 0.15);
    }


    .table .column-1 {
        text-align: left;
    }
    th {
        background-color: LightSkyBlue;
        border: 1px solid #dddddd;
        text-align: left;
        padding: 5px;
        font-weight: bold;
        font-size: 120%;
    }


    tr {
        border: 1px solid #dddddd;
    }

    td {
        padding: 5px;
        font-weight: bold;
    }

    tr:nth-child(even) {
        background-color: #f2f2f2;
    }

    .table td,tr {
        text-align: left;
    }

    .table td:first-child, tr:first-child {
        text-align: left;
    }

    caption {
        caption-side: top;
        text-align: left;
        padding-bottom: 10px;
        font-weight: bold;
    }

    </style>
</head>

<body>
    <h1>genophenocorr protein analysis</h1>
    <p>The UniProt API successfully returned protein information for ID: {{ protein_id }}</p>
    <p>Protein Name: {{ protein_label }}</p>
    <table>
        <caption style="color:black;">
            <h3>Protein Features</h3>
        </caption>
        <tbody>
            <tr class="strng">
                <th>Feature Name</th>
                <th>Feature Type</th>
                <th>Feature Coordinates</th>
                <th>Variants in Feature</th>
            </tr>
            {% for feat in protein_features %}
            <tr>
                <td>{{ feat.name }}</td>
                <td>{{ feat.type }}</td>
                <td>{{ feat.region.start }} - {{ feat.region.end }}</td>
                <td>{{ feat.variant_count }}</td>
            </tr>
            {% endfor %}
        </tbody>
    </table>
</body>
</html>
2 changes: 1 addition & 1 deletion tests/model/test_variant.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,6 @@ def test_get_hgvs_cdna_by_tx(
tx_id: str,
expected: typing.Optional[str],
):
hgvs = some_variant.get_hgvs_cdna_by_tx(transcript_id=tx_id)
hgvs = some_variant.get_hgvs_cdna_by_tx_id(transcript_id=tx_id)

assert hgvs == expected
13 changes: 12 additions & 1 deletion tests/view/test_protein_visualizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pytest

from genophenocorr.model import TranscriptCoordinates, ProteinMetadata, Cohort
from genophenocorr.view import ProteinVisualizer, ProteinVisualizable
from genophenocorr.view import ProteinVisualizer, ProteinVisualizable, ProteinViewable


class TestProteinVisualizer:
Expand Down Expand Up @@ -37,3 +37,14 @@ def test_protein_visualizer(
)

fig.savefig('protein.png')

@pytest.mark.skip('Run manually on demand')
def test_protein_viewable(
    self,
    suox_cohort: Cohort,
    visualizable: ProteinVisualizable,
):
    """
    Render the protein feature table and store it as `protein_viewable.html`
    in the current working directory for a manual inspection.
    """
    protein_viewable = ProteinViewable()
    view = protein_viewable.process(suox_cohort, visualizable)
    # The document declares `<meta charset="utf-8">`, so the file must be written as UTF-8
    # regardless of the platform's default encoding.
    with open('protein_viewable.html', 'w', encoding='utf-8') as fh:
        fh.write(view)

0 comments on commit 951d8b7

Please sign in to comment.