Skip to content

Commit

Permalink
Show how to create a custom protein meta.
Browse files Browse the repository at this point in the history
  • Loading branch information
ielis committed Sep 19, 2024
1 parent 2c508f3 commit 47e3136
Show file tree
Hide file tree
Showing 2 changed files with 117 additions and 47 deletions.
160 changes: 115 additions & 45 deletions src/gpsea/model/_protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ class FeatureInfo:
"""
`FeatureInfo` represents a protein feature
(e.g. a repeated sequence given the name "ANK 1" in protein "Ankyrin repeat domain-containing protein 11")
"""
"""

def __init__(self, name: str, region: Region):
self._name = hpotk.util.validate_instance(name, str, 'name')
self._region = hpotk.util.validate_instance(region, Region, 'region')
self._name = hpotk.util.validate_instance(name, str, "name")
self._region = hpotk.util.validate_instance(region, Region, "region")

@property
def name(self) -> str:
Expand Down Expand Up @@ -54,9 +54,11 @@ def __len__(self):
return len(self._region)

def __eq__(self, other) -> bool:
return isinstance(other, FeatureInfo) \
and self.name == other.name \
return (
isinstance(other, FeatureInfo)
and self.name == other.name
and self.region == other.region
)

def __hash__(self):
return hash((self._name, self._region))
Expand All @@ -72,12 +74,12 @@ class FeatureType(enum.Enum):
"""
An enum representing the protein feature types supported in GPSEA.
"""

REPEAT = enum.auto()
"""
A repeated sequence motif or repeated domain within the protein.
"""

MOTIF = enum.auto()
"""
A short (usually not more than 20 amino acids) conserved sequence motif of biological significance.
Expand Down Expand Up @@ -105,7 +107,7 @@ def from_string(category: str) -> "FeatureType":
elif cat_lover == "region":
return FeatureType.REGION
else:
raise ValueError(f"Unrecognized protein feature type: \"{category}\"")
raise ValueError(f'Unrecognized protein feature type: "{category}"')


class ProteinFeature(metaclass=abc.ABCMeta):
Expand Down Expand Up @@ -135,6 +137,7 @@ class SimpleProteinFeature(ProteinFeature):
"""
An implementation of a `ProteinFeature`.
"""

# Not part of the public API.

def __init__(self, info: FeatureInfo, feature_type: FeatureType):
Expand All @@ -148,7 +151,9 @@ def __init__(self, info: FeatureInfo, feature_type: FeatureType):
raise ValueError(f"info must be type FeatureInfo but was type {type(info)}")
self._info = info
if not isinstance(feature_type, FeatureType):
raise ValueError(f"feature_type must be type FeatureType but was type {type(feature_type)}")
raise ValueError(
f"feature_type must be type FeatureType but was type {type(feature_type)}"
)
self._type = feature_type

@property
Expand All @@ -168,16 +173,17 @@ def feature_type(self) -> FeatureType:
return self._type

def __str__(self) -> str:
return f"SimpleProteinFeature(type={self._type}, " \
f"info={self._info})"
return f"SimpleProteinFeature(type={self._type}, " f"info={self._info})"

def __repr__(self) -> str:
return str(self)

def __eq__(self, other) -> bool:
return isinstance(other, SimpleProteinFeature) \
and self._type == other._type \
return (
isinstance(other, SimpleProteinFeature)
and self._type == other._type
and self._info == other._info
)

def __hash__(self) -> int:
return hash((self._type, self._info))
Expand All @@ -190,19 +196,67 @@ class ProteinMetadata:
The information is usually retrieved from a resource such as :class:`~gpsea.preprocessing.UniprotMetadataService`,
but it can also be created manually using :meth:`~gpsea.model.ProteinMetadata.from_feature_frame` function.
Example
-------
Let's create a protein info with a domain and a region. We must provide the protein accession ID,
a label, a data frame with protein features, and the number of amino acids in the protein sequence:
>>> protein_id = 'NP_000129.3'
>>> label = 'fibrillin-1 isoform a preproprotein'
>>> protein_length = 1000
Now let's prepare a data frame with the protein features. We will prepare a domain and a region:
>>> import pandas as pd
>>> features = [
... {
... "region": "Suppresor domain",
... "category": "domain",
... "start": 1,
... "end": 223,
... },
... {
... "region": "IP3 binding",
... "category": "region",
... "start": 224,
... "end": 578,
... },
... ]
>>> df = pd.DataFrame(features)
Last, we can put the protein info together:
>>> from gpsea.model import ProteinMetadata
>>> protein_meta = ProteinMetadata.from_feature_frame(
... protein_id=protein_id,
... label=label,
... features=df,
... protein_length=protein_length,
... )
and get the expected protein info:
>>> protein_meta.protein_id
'NP_000129.3'
>>> protein_meta.label
'fibrillin-1 isoform a preproprotein'
>>> len(protein_meta.protein_features)
2
"""

@staticmethod
def from_feature_frame(
protein_id: str,
label: str,
features: pd.DataFrame,
length: int,
protein_length: int,
) -> "ProteinMetadata":
"""
Create `ProteinMetadata` from a user-supplied pandas DataFrame.
We expect to obtain the gene symbol, protein identifier, and regions
The DataFrame should include the following columns:
+------------------+----------+----------------+
Expand All @@ -218,16 +272,16 @@ def from_feature_frame(
Use `region` if no other option fits. Last, `start` and `end` denote 1-based start and end coordinates
of the amino acid sequence region described by the feature.
For instance, `[1, 10]` for the first ten amino acids of the protein.
:param protein_id: the accession id of the protein, e.g. `NP_000129.3`.
:param label: human-readable label, e.g. `fibrillin-1 isoform a preproprotein`.
:param protein_id: the accession id of the protein, e.g. `'NP_000129.3'`.
:param label: human-readable label, e.g. `'fibrillin-1 isoform a preproprotein'`.
:param features: a dataframe with the protein features.
:param length: a positive `int` representing the number of aminoacids included in the protein sequence.
:param protein_length: a positive `int` representing the number of amino acids included in the protein sequence.
:raises ValueError: in case of issues while parsing the provided data.
"""
expected_headers = ("region", "start", "end", "category")
if any(col_name not in features.columns for col_name in expected_headers):
missing_cols = ', '.join(set(expected_headers).difference(features.columns))
missing_cols = ", ".join(set(expected_headers).difference(features.columns))
raise ValueError(
f"The column(s) {{{missing_cols}}} are missing from the `features` DataFrame: "
f"{tuple(features.columns)}"
Expand All @@ -239,11 +293,18 @@ def from_feature_frame(
region_end = row["end"]
region_category = row["category"]
feature_type = FeatureType.from_string(region_category)
finfo = FeatureInfo(name=region_name, region=Region(start=region_start, end=region_end))
finfo = FeatureInfo(
name=region_name, region=Region(start=region_start, end=region_end)
)
pfeature = ProteinFeature.create(info=finfo, feature_type=feature_type)
region_list.append(pfeature)

return ProteinMetadata(protein_id=protein_id, label=label, protein_features=region_list, protein_length=length)

return ProteinMetadata(
protein_id=protein_id,
label=label,
protein_features=region_list,
protein_length=protein_length,
)

def __init__(
self,
Expand All @@ -265,32 +326,28 @@ def __init__(
@property
def protein_id(self) -> str:
"""
Returns:
string: A string unique to this protein
Get the protein's accession ID, e.g. `NP_000129.3`.
"""
return self._id

@property
def label(self) -> str:
"""
Returns:
string: The full name of the protein
Get the protein label, e.g. `fibrillin-1 isoform a preproprotein`.
"""
return self._label

@property
def protein_features(self) -> typing.Sequence[ProteinFeature]:
"""
Returns:
Sequence[ProteinFeature]: A sequence of ProteinFeatures objects
Get a sequence of protein features.
"""
return self._features

@property
def protein_length(self) -> int:
"""
Returns:
int: length of protein
Get the number of amino acids of the protein sequence.
"""
return self._protein_length

Expand All @@ -299,28 +356,36 @@ def domains(self) -> typing.Iterable[ProteinFeature]:
Returns:
Iterable[ProteinFeature]: A subgroup of the protein features that correspond to protein domains.
"""
return filter(lambda f: f.feature_type == FeatureType.DOMAIN, self.protein_features)
return filter(
lambda f: f.feature_type == FeatureType.DOMAIN, self.protein_features
)

def repeats(self) -> typing.Iterable[ProteinFeature]:
"""
Returns:
Iterable[ProteinFeature]: A subgroup of the protein features that correspond to repeat regions.
"""
return filter(lambda f: f.feature_type == FeatureType.REPEAT, self.protein_features)
return filter(
lambda f: f.feature_type == FeatureType.REPEAT, self.protein_features
)

def regions(self) -> typing.Iterable[ProteinFeature]:
"""
Returns:
Iterable[ProteinFeature]: A subgroup of the protein features that correspond to generic regions.
"""
return filter(lambda f: f.feature_type == FeatureType.REGION, self.protein_features)
return filter(
lambda f: f.feature_type == FeatureType.REGION, self.protein_features
)

def motifs(self) -> typing.Iterable[ProteinFeature]:
"""
Returns:
Iterable[ProteinFeature]: A subgroup of the protein features that correspond to motifs.
"""
return filter(lambda f: f.feature_type == FeatureType.MOTIF, self.protein_features)
return filter(
lambda f: f.feature_type == FeatureType.MOTIF, self.protein_features
)

def get_features_variant_overlaps(
self,
Expand All @@ -334,21 +399,26 @@ def get_features_variant_overlaps(
Returns:
Collection[ProteinFeature]: a collection of overlapping protein features.
"""
return [
feature for feature in self._features
return tuple(
feature
for feature in self._features
if feature.info.region.overlaps_with(region)
]
)

def __str__(self) -> str:
return f"ProteinMetadata(id={self.protein_id}, " \
f"label={self.label}, " \
f"features={str(self.protein_features)})"
return (
f"ProteinMetadata(id={self.protein_id}, "
f"label={self.label}, "
f"features={str(self.protein_features)})"
)

def __eq__(self, other) -> bool:
return isinstance(other, ProteinMetadata) \
and self.label == other.label \
and self.protein_features == other.protein_features \
return (
isinstance(other, ProteinMetadata)
and self.label == other.label
and self.protein_features == other.protein_features
and self.protein_id == other.protein_id
)

def __hash__(self) -> int:
return hash((self.protein_id, self.label, self._features))
Expand Down
4 changes: 2 additions & 2 deletions tests/preprocessing/test_protein_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def itpr1_protein_metadata(self) -> ProteinMetadata:
protein_id=ITPR1_protein_id,
label=ITPR1_protein_id,
features=df,
length=ITPR1_protein_len,
protein_length=ITPR1_protein_len,
)

def test_general_info(self, itpr1_protein_metadata: ProteinMetadata):
Expand Down Expand Up @@ -78,7 +78,7 @@ def test_malformed_protein_metadata(self):
protein_id=ITPR1_protein_id,
label=ITPR1_protein_id,
features=df,
length=ITPR1_protein_len,
protein_length=ITPR1_protein_len,
)

assert e.value.args[0] == "The column(s) {category} are missing from the `features` DataFrame: ('region', 'start', 'end')"
Expand Down

0 comments on commit 47e3136

Please sign in to comment.