Skip to content

Commit

Permalink
Merge pull request #273 from monarch-initiative/protein_df
Browse files Browse the repository at this point in the history
DataFrame for protein features
  • Loading branch information
ielis authored Sep 19, 2024
2 parents a6caa22 + 47e3136 commit 2ef09cf
Show file tree
Hide file tree
Showing 3 changed files with 281 additions and 52 deletions.
247 changes: 195 additions & 52 deletions src/gpsea/model/_protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import typing

import hpotk
import pandas as pd

from .genome import Region

Expand All @@ -11,11 +12,11 @@ class FeatureInfo:
"""
`FeatureInfo` represents a protein feature
(e.g. a repeated sequence given the name "ANK 1" in protein "Ankyrin repeat domain-containing protein 11")
"""
"""

def __init__(self, name: str, region: Region):
self._name = hpotk.util.validate_instance(name, str, 'name')
self._region = hpotk.util.validate_instance(region, Region, 'region')
self._name = hpotk.util.validate_instance(name, str, "name")
self._region = hpotk.util.validate_instance(region, Region, "region")

@property
def name(self) -> str:
Expand Down Expand Up @@ -53,9 +54,11 @@ def __len__(self):
return len(self._region)

def __eq__(self, other) -> bool:
return isinstance(other, FeatureInfo) \
and self.name == other.name \
return (
isinstance(other, FeatureInfo)
and self.name == other.name
and self.region == other.region
)

def __hash__(self):
return hash((self._name, self._region))
Expand All @@ -71,12 +74,12 @@ class FeatureType(enum.Enum):
"""
An enum representing the protein feature types supported in GPSEA.
"""

REPEAT = enum.auto()
"""
A repeated sequence motif or repeated domain within the protein.
"""

MOTIF = enum.auto()
"""
A short (usually not more than 20 amino acids) conserved sequence motif of biological significance.
Expand All @@ -92,11 +95,28 @@ class FeatureType(enum.Enum):
A region of interest that cannot be described in other subsections.
"""

@staticmethod
def from_string(category: str) -> "FeatureType":
cat_lover = category.lower()
if cat_lover == "repeat":
return FeatureType.REGION
elif cat_lover == "motif":
return FeatureType.MOTIF
elif cat_lover == "domain":
return FeatureType.DOMAIN
elif cat_lover == "region":
return FeatureType.REGION
else:
raise ValueError(f'Unrecognized protein feature type: "{category}"')


class ProteinFeature(metaclass=abc.ABCMeta):

@staticmethod
def create(info: FeatureInfo, feature_type: FeatureType):
def create(
info: FeatureInfo,
feature_type: FeatureType,
) -> "ProteinFeature":
return SimpleProteinFeature(info, feature_type)

@property
Expand All @@ -117,6 +137,7 @@ class SimpleProteinFeature(ProteinFeature):
"""
An implementation of a `ProteinFeature`.
"""

# Not part of the public API.

def __init__(self, info: FeatureInfo, feature_type: FeatureType):
Expand All @@ -130,7 +151,9 @@ def __init__(self, info: FeatureInfo, feature_type: FeatureType):
raise ValueError(f"info must be type FeatureInfo but was type {type(info)}")
self._info = info
if not isinstance(feature_type, FeatureType):
raise ValueError(f"feature_type must be type FeatureType but was type {type(feature_type)}")
raise ValueError(
f"feature_type must be type FeatureType but was type {type(feature_type)}"
)
self._type = feature_type

@property
Expand All @@ -150,16 +173,17 @@ def feature_type(self) -> FeatureType:
return self._type

def __str__(self) -> str:
return f"SimpleProteinFeature(type={self._type}, " \
f"info={self._info})"
return f"SimpleProteinFeature(type={self._type}, " f"info={self._info})"

def __repr__(self) -> str:
return str(self)

def __eq__(self, other) -> bool:
return isinstance(other, SimpleProteinFeature) \
and self._type == other._type \
return (
isinstance(other, SimpleProteinFeature)
and self._type == other._type
and self._info == other._info
)

def __hash__(self) -> int:
return hash((self._type, self._info))
Expand All @@ -169,88 +193,204 @@ class ProteinMetadata:
"""
An info regarding a protein sequence, including an ID, a label,
and location of protein features, such as motifs, domains, or other regions.
The information is usually retrieved from a resource such as :class:`~gpsea.preprocessing.UniprotMetadataService`,
but it can also be created manually using :meth:`~gpsea.model.ProteinMetadata.from_feature_frame` function.
Example
-------
Let's create a protein info with a domain and a region. We must provide protein accession ID,
a label, a data frame with protein features, and the number of aminoacids of the protein sequence:
>>> protein_id = 'NP_000129.3'
>>> label = 'fibrillin-1 isoform a preproprotein'
>>> protein_length = 1000
Now let's prepare a data frame with the protein features. We will prepare a domain and a region:
>>> import pandas as pd
>>> features = [
... {
... "region": "Suppresor domain",
... "category": "domain",
... "start": 1,
... "end": 223,
... },
... {
... "region": "IP3 binding",
... "category": "region",
... "start": 224,
... "end": 578,
... },
... ]
>>> df = pd.DataFrame(features)
last, we can put the protein info together:
>>> from gpsea.model import ProteinMetadata
>>> protein_meta = ProteinMetadata.from_feature_frame(
... protein_id=protein_id,
... label=label,
... features=df,
... protein_length=protein_length,
... )
and get the expected protein info:
>>> protein_meta.protein_id
'NP_000129.3'
>>> protein_meta.label
'fibrillin-1 isoform a preproprotein'
>>> len(protein_meta.protein_features)
2
"""

@staticmethod
def from_feature_frame(
protein_id: str,
label: str,
features: pd.DataFrame,
protein_length: int,
) -> "ProteinMetadata":
"""
Create `ProteinMetadata` from a user-supplied pandas DataFrame.
We expect to obtain the gene symbol, protein identifier, and regions
The DataFrame should include the following columns:
+------------------+----------+----------------+
| region | category | start | end |
+------------------+----------+--------+-------+
| Suppresor domain | domain | 1 | 223 |
+------------------+----------+--------+-------+
| IP3 binding | region | 224 | 578 |
+------------------+----------+--------+-------+
The `region` column includes the protein feature name.
The category must be one of `'repeat'`, `'motif'`, `'domain'`, or `'region'`.
Use `region` if no other option fits. Last, `start` and `end` denote 1-based start and end coordinates
of the aminoacid sequence region described by the feature.
For instance, `[1, 10]` for the first ten aminoacids of the protein.
:param protein_id: the accession id of the protein, e.g. `'NP_000129.3'`.
:param label: human-readable label, e.g. `'fibrillin-1 isoform a preproprotein'`.
:param features: a dataframe with of the protein features.
:param protein_length: a positive `int` representing the number of aminoacids included in the protein sequence.
:raises ValueError: if case of issues during parsing the provided data.
"""
expected_headers = ("region", "start", "end", "category")
if any(col_name not in features.columns for col_name in expected_headers):
missing_cols = ", ".join(set(expected_headers).difference(features.columns))
raise ValueError(
f"The column(s) {{{missing_cols}}} are missing from the `features` DataFrame: "
f"{tuple(features.columns)}"
)
region_list = list()
for _, row in features.iterrows():
region_name = row["region"]
region_start = row["start"] - 1 # convert to 0-based coordinates
region_end = row["end"]
region_category = row["category"]
feature_type = FeatureType.from_string(region_category)
finfo = FeatureInfo(
name=region_name, region=Region(start=region_start, end=region_end)
)
pfeature = ProteinFeature.create(info=finfo, feature_type=feature_type)
region_list.append(pfeature)

return ProteinMetadata(
protein_id=protein_id,
label=label,
protein_features=region_list,
protein_length=protein_length,
)

def __init__(
self,
protein_id: str,
label: str,
protein_features: typing.Sequence[ProteinFeature],
protein_length: int = 0,
protein_features: typing.Iterable[ProteinFeature],
protein_length: int,
):
if not isinstance(protein_id, str):
raise ValueError(f"Protein ID must be type string but is type {type(protein_id)}")
assert isinstance(protein_id, str)
self._id = protein_id
if not isinstance(label, str):
raise ValueError(f"Protein label must be type string but is type {type(label)}")
assert isinstance(label, str)
self._label = label
if not all(isinstance(x, ProteinFeature) for x in protein_features):
raise ValueError(
f"Protein Features must be a list of type ProteinFeature but is type {type(protein_features)}")

assert all(isinstance(x, ProteinFeature) for x in protein_features)
self._features = tuple(protein_features)
assert isinstance(protein_length, int) and protein_length > 0
self._protein_length = protein_length

@property
def protein_id(self) -> str:
"""
Returns:
string: A string unique to this protein
Get the protein's accession ID, e.g. `NP_000129.3`.
"""
return self._id

@property
def label(self) -> str:
"""
Returns:
string: The full name of the protein
Get the protein label, e.g. `fibrillin-1 isoform a preproprotein`.
"""
return self._label

@property
def protein_features(self) -> typing.Sequence[ProteinFeature]:
"""
Returns:
Sequence[ProteinFeature]: A sequence of ProteinFeatures objects
Get a sequence of protein features.
"""
return self._features

@property
def protein_length(self) -> int:
"""
Returns:
int: length of protein
Get the number of aminoacids of the protein sequence.
"""
return self._protein_length

def domains(self) -> typing.Iterable[ProteinFeature]:
"""
Returns:
Iterable[ProteinFeature]: A subgroup of protein_features, where the ProteinFeature object has a FeatureType equal to "DOMAIN"
Iterable[ProteinFeature]: A subgroup of the protein features that correspond to protein domains.
"""
return filter(lambda f: f.feature_type == FeatureType.DOMAIN, self.protein_features)
return filter(
lambda f: f.feature_type == FeatureType.DOMAIN, self.protein_features
)

def repeats(self) -> typing.Iterable[ProteinFeature]:
"""
Returns:
Iterable[ProteinFeature]: A subgroup of protein_features, where the ProteinFeature object has a FeatureType equal to "REPEAT"
Iterable[ProteinFeature]: A subgroup of the protein features that correspond to repeat regions.
"""
return filter(lambda f: f.feature_type == FeatureType.REPEAT, self.protein_features)
return filter(
lambda f: f.feature_type == FeatureType.REPEAT, self.protein_features
)

def regions(self) -> typing.Iterable[ProteinFeature]:
"""
Returns:
Iterable[ProteinFeature]: A subgroup of protein_features, where the ProteinFeature object has a FeatureType equal to "REGIONS"
Iterable[ProteinFeature]: A subgroup of the protein features that correspond to generic regions.
"""
return filter(lambda f: f.feature_type == FeatureType.REGION, self.protein_features)
return filter(
lambda f: f.feature_type == FeatureType.REGION, self.protein_features
)

def motifs(self) -> typing.Iterable[ProteinFeature]:
"""
Returns:
Iterable[ProteinFeature]: A subgroup of protein_features, where the ProteinFeature object has a FeatureType equal to "MOTIF"
Iterable[ProteinFeature]: A subgroup of the protein features that correspond to motifs.
"""
return filter(lambda f: f.feature_type == FeatureType.MOTIF, self.protein_features)
return filter(
lambda f: f.feature_type == FeatureType.MOTIF, self.protein_features
)

def get_features_variant_overlaps(self, region: Region) -> typing.Collection[ProteinFeature]:
def get_features_variant_overlaps(
self,
region: Region,
) -> typing.Collection[ProteinFeature]:
"""
Get a collection of protein features that overlap with the `region`.
Args:
Expand All @@ -259,23 +399,26 @@ def get_features_variant_overlaps(self, region: Region) -> typing.Collection[Pro
Returns:
Collection[ProteinFeature]: a collection of overlapping protein features.
"""
affected_features = set()
for feat in self.protein_features:
if feat.info.region.overlaps_with(region):
affected_features.add(feat)

return affected_features
return tuple(
feature
for feature in self._features
if feature.info.region.overlaps_with(region)
)

def __str__(self) -> str:
return f"ProteinMetadata(id={self.protein_id}, " \
f"label={self.label}, " \
f"features={str(self.protein_features)})"
return (
f"ProteinMetadata(id={self.protein_id}, "
f"label={self.label}, "
f"features={str(self.protein_features)})"
)

def __eq__(self, other) -> bool:
return isinstance(other, ProteinMetadata) \
and self.label == other.label \
and self.protein_features == other.protein_features \
return (
isinstance(other, ProteinMetadata)
and self.label == other.label
and self.protein_features == other.protein_features
and self.protein_id == other.protein_id
)

def __hash__(self) -> int:
return hash((self.protein_id, self.label, self._features))
Expand Down
Loading

0 comments on commit 2ef09cf

Please sign in to comment.