Show how to create a custom protein meta.

monarch-initiative · Sep 19, 2024 · 47e3136 · 47e3136
1 parent 2c508f3
commit 47e3136
Show file tree

Hide file tree

Showing 2 changed files with 117 additions and 47 deletions.
diff --git a/src/gpsea/model/_protein.py b/src/gpsea/model/_protein.py
@@ -12,11 +12,11 @@ class FeatureInfo:
     """
     `FeatureInfo` represents a protein feature
     (e.g. a repeated sequence given the name "ANK 1" in protein "Ankyrin repeat domain-containing protein 11")
-"""
+    """
 
     def __init__(self, name: str, region: Region):
-        self._name = hpotk.util.validate_instance(name, str, 'name')
-        self._region = hpotk.util.validate_instance(region, Region, 'region')
+        self._name = hpotk.util.validate_instance(name, str, "name")
+        self._region = hpotk.util.validate_instance(region, Region, "region")
 
     @property
     def name(self) -> str:
@@ -54,9 +54,11 @@ def __len__(self):
         return len(self._region)
 
     def __eq__(self, other) -> bool:
-        return isinstance(other, FeatureInfo) \
-            and self.name == other.name \
+        return (
+            isinstance(other, FeatureInfo)
+            and self.name == other.name
             and self.region == other.region
+        )
 
     def __hash__(self):
         return hash((self._name, self._region))
@@ -72,12 +74,12 @@ class FeatureType(enum.Enum):
     """
     An enum representing the protein feature types supported in GPSEA.
     """
-    
+
     REPEAT = enum.auto()
     """
     A repeated sequence motif or repeated domain within the protein.
     """
-    
+
     MOTIF = enum.auto()
     """
     A short (usually not more than 20 amino acids) conserved sequence motif of biological significance.
@@ -105,7 +107,7 @@ def from_string(category: str) -> "FeatureType":
         elif cat_lover == "region":
             return FeatureType.REGION
         else:
-            raise ValueError(f"Unrecognized protein feature type: \"{category}\"")
+            raise ValueError(f'Unrecognized protein feature type: "{category}"')
 
 
 class ProteinFeature(metaclass=abc.ABCMeta):
@@ -135,6 +137,7 @@ class SimpleProteinFeature(ProteinFeature):
     """
     An implementation of a `ProteinFeature`.
     """
+
     # Not part of the public API.
 
     def __init__(self, info: FeatureInfo, feature_type: FeatureType):
@@ -148,7 +151,9 @@ def __init__(self, info: FeatureInfo, feature_type: FeatureType):
             raise ValueError(f"info must be type FeatureInfo but was type {type(info)}")
         self._info = info
         if not isinstance(feature_type, FeatureType):
-            raise ValueError(f"feature_type must be type FeatureType but was type {type(feature_type)}")
+            raise ValueError(
+                f"feature_type must be type FeatureType but was type {type(feature_type)}"
+            )
         self._type = feature_type
 
     @property
@@ -168,16 +173,17 @@ def feature_type(self) -> FeatureType:
         return self._type
 
     def __str__(self) -> str:
-        return f"SimpleProteinFeature(type={self._type}, " \
-               f"info={self._info})"
+        return f"SimpleProteinFeature(type={self._type}, " f"info={self._info})"
 
     def __repr__(self) -> str:
         return str(self)
 
     def __eq__(self, other) -> bool:
-        return isinstance(other, SimpleProteinFeature) \
-            and self._type == other._type \
+        return (
+            isinstance(other, SimpleProteinFeature)
+            and self._type == other._type
             and self._info == other._info
+        )
 
     def __hash__(self) -> int:
         return hash((self._type, self._info))
@@ -190,19 +196,67 @@ class ProteinMetadata:
 
     The information is usually retrieved from a resource such as :class:`~gpsea.preprocessing.UniprotMetadataService`,
     but it can also be created manually using :meth:`~gpsea.model.ProteinMetadata.from_feature_frame` function.
+
+    Example
+    -------
+
+    Let's create a protein info with a domain and a region. We must provide the protein accession ID,
+    a label, a data frame with the protein features, and the number of aminoacids in the protein sequence:
+
+    >>> protein_id = 'NP_000129.3'
+    >>> label = 'fibrillin-1 isoform a preproprotein'
+    >>> protein_length = 1000
+
+    Now let's prepare a data frame with the protein features. We will prepare a domain and a region:
+
+    >>> import pandas as pd
+    >>> features = [
+    ...     {
+    ...         "region": "Suppresor domain",
+    ...         "category": "domain",
+    ...         "start": 1,
+    ...         "end": 223,
+    ...     },
+    ...     {
+    ...         "region": "IP3 binding",
+    ...         "category": "region",
+    ...         "start": 224,
+    ...         "end": 578,
+    ...     },
+    ... ]
+    >>> df = pd.DataFrame(features)
+
+    Last, we can put the protein info together:
+
+    >>> from gpsea.model import ProteinMetadata
+    >>> protein_meta = ProteinMetadata.from_feature_frame(
+    ...     protein_id=protein_id,
+    ...     label=label,
+    ...     features=df,
+    ...     protein_length=protein_length,
+    ... )
+
+    and get the expected protein info:
+
+    >>> protein_meta.protein_id
+    'NP_000129.3'
+    >>> protein_meta.label
+    'fibrillin-1 isoform a preproprotein'
+    >>> len(protein_meta.protein_features)
+    2
     """
 
     @staticmethod
     def from_feature_frame(
         protein_id: str,
         label: str,
         features: pd.DataFrame,
-        length: int,
+        protein_length: int,
     ) -> "ProteinMetadata":
         """
         Create `ProteinMetadata` from a user-supplied pandas DataFrame.
         We expect to obtain the gene symbol, protein identifier, and regions
-        
+
         The DataFrame should include the following columns:
 
         +------------------+----------+----------------+
@@ -218,16 +272,16 @@ def from_feature_frame(
         Use `region` if no other option fits. Last, `start` and `end` denote 1-based start and end coordinates
         of the aminoacid sequence region described by the feature.
         For instance, `[1, 10]` for the first ten aminoacids of the protein.
-        
-        :param protein_id: the accession id of the protein, e.g. `NP_000129.3`.
-        :param label: human-readable label, e.g. `fibrillin-1 isoform a preproprotein`.
+
+        :param protein_id: the accession id of the protein, e.g. `'NP_000129.3'`.
+        :param label: human-readable label, e.g. `'fibrillin-1 isoform a preproprotein'`.
         :param features: a dataframe with of the protein features.
-        :param length: a positive `int` representing the number of aminoacids included in the protein sequence.
+        :param protein_length: a positive `int` representing the number of aminoacids included in the protein sequence.
         :raises ValueError: if case of issues during parsing the provided data.
         """
         expected_headers = ("region", "start", "end", "category")
         if any(col_name not in features.columns for col_name in expected_headers):
-            missing_cols = ', '.join(set(expected_headers).difference(features.columns))
+            missing_cols = ", ".join(set(expected_headers).difference(features.columns))
             raise ValueError(
                 f"The column(s) {{{missing_cols}}} are missing from the `features` DataFrame: "
                 f"{tuple(features.columns)}"
@@ -239,11 +293,18 @@ def from_feature_frame(
             region_end = row["end"]
             region_category = row["category"]
             feature_type = FeatureType.from_string(region_category)
-            finfo = FeatureInfo(name=region_name, region=Region(start=region_start, end=region_end))
+            finfo = FeatureInfo(
+                name=region_name, region=Region(start=region_start, end=region_end)
+            )
             pfeature = ProteinFeature.create(info=finfo, feature_type=feature_type)
             region_list.append(pfeature)
-
-        return ProteinMetadata(protein_id=protein_id, label=label, protein_features=region_list, protein_length=length)
+
+        return ProteinMetadata(
+            protein_id=protein_id,
+            label=label,
+            protein_features=region_list,
+            protein_length=protein_length,
+        )
 
     def __init__(
         self,
@@ -265,32 +326,28 @@ def __init__(
     @property
     def protein_id(self) -> str:
         """
-        Returns:
-            string: A string unique to this protein
+        Get the protein's accession ID, e.g. `NP_000129.3`.
         """
         return self._id
 
     @property
     def label(self) -> str:
         """
-        Returns:
-            string: The full name of the protein
+        Get the protein label, e.g. `fibrillin-1 isoform a preproprotein`.
         """
         return self._label
 
     @property
     def protein_features(self) -> typing.Sequence[ProteinFeature]:
         """
-        Returns:
-            Sequence[ProteinFeature]: A sequence of ProteinFeatures objects
+        Get a sequence of protein features.
         """
         return self._features
-    
+
     @property
     def protein_length(self) -> int:
         """
-        Returns:
-            int: length of protein
+        Get the number of aminoacids of the protein sequence.
         """
         return self._protein_length
 
@@ -299,28 +356,36 @@ def domains(self) -> typing.Iterable[ProteinFeature]:
         Returns:
             Iterable[ProteinFeature]: A subgroup of the protein features that correspond to protein domains.
         """
-        return filter(lambda f: f.feature_type == FeatureType.DOMAIN, self.protein_features)
+        return filter(
+            lambda f: f.feature_type == FeatureType.DOMAIN, self.protein_features
+        )
 
     def repeats(self) -> typing.Iterable[ProteinFeature]:
         """
         Returns:
             Iterable[ProteinFeature]: A subgroup of the protein features that correspond to repeat regions.
         """
-        return filter(lambda f: f.feature_type == FeatureType.REPEAT, self.protein_features)
+        return filter(
+            lambda f: f.feature_type == FeatureType.REPEAT, self.protein_features
+        )
 
     def regions(self) -> typing.Iterable[ProteinFeature]:
         """
         Returns:
             Iterable[ProteinFeature]: A subgroup of the protein features that correspond to generic regions.
         """
-        return filter(lambda f: f.feature_type == FeatureType.REGION, self.protein_features)
+        return filter(
+            lambda f: f.feature_type == FeatureType.REGION, self.protein_features
+        )
 
     def motifs(self) -> typing.Iterable[ProteinFeature]:
         """
         Returns:
             Iterable[ProteinFeature]: A subgroup of the protein features that correspond to motifs.
         """
-        return filter(lambda f: f.feature_type == FeatureType.MOTIF, self.protein_features)
+        return filter(
+            lambda f: f.feature_type == FeatureType.MOTIF, self.protein_features
+        )
 
     def get_features_variant_overlaps(
         self,
@@ -334,21 +399,26 @@ def get_features_variant_overlaps(
         Returns:
             Collection[ProteinFeature]: a collection of overlapping protein features.
         """
-        return [
-            feature for feature in self._features 
+        return tuple(
+            feature
+            for feature in self._features
             if feature.info.region.overlaps_with(region)
-        ]        
+        )
 
     def __str__(self) -> str:
-        return f"ProteinMetadata(id={self.protein_id}, " \
-               f"label={self.label}, " \
-               f"features={str(self.protein_features)})"
+        return (
+            f"ProteinMetadata(id={self.protein_id}, "
+            f"label={self.label}, "
+            f"features={str(self.protein_features)})"
+        )
 
     def __eq__(self, other) -> bool:
-        return isinstance(other, ProteinMetadata) \
-            and self.label == other.label \
-            and self.protein_features == other.protein_features \
+        return (
+            isinstance(other, ProteinMetadata)
+            and self.label == other.label
+            and self.protein_features == other.protein_features
             and self.protein_id == other.protein_id
+        )
 
     def __hash__(self) -> int:
         return hash((self.protein_id, self.label, self._features))

diff --git a/tests/preprocessing/test_protein_metadata.py b/tests/preprocessing/test_protein_metadata.py
@@ -32,7 +32,7 @@ def itpr1_protein_metadata(self) -> ProteinMetadata:
             protein_id=ITPR1_protein_id,
             label=ITPR1_protein_id,
             features=df,
-            length=ITPR1_protein_len,
+            protein_length=ITPR1_protein_len,
         )
 
     def test_general_info(self, itpr1_protein_metadata: ProteinMetadata):
@@ -78,7 +78,7 @@ def test_malformed_protein_metadata(self):
                 protein_id=ITPR1_protein_id,
                 label=ITPR1_protein_id,
                 features=df,
-                length=ITPR1_protein_len,
+                protein_length=ITPR1_protein_len,
             )
 
         assert e.value.args[0] == "The column(s) {category} are missing from the `features` DataFrame: ('region', 'start', 'end')"