Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adding zinc finger to ProteinFeatures #346

Merged
merged 6 commits into from
Nov 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 24 additions & 20 deletions src/gpsea/model/_protein.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,21 +118,28 @@ class FeatureType(enum.Enum):
A region of interest that cannot be described in other subsections.
"""

ZINC_FINGER = enum.auto()
"""
A zinc finger is a small, functional, independently folded domain that coordinates one or more zinc ions through cysteine and/or histidine residues to stabilize its structure.
"""

@staticmethod
def from_string(category: str) -> "FeatureType":
cat_lover = category.lower()
if cat_lover == "repeat":
cat_lower = category.lower()
if cat_lower == "repeat":
return FeatureType.REGION
elif cat_lover == "motif":
elif cat_lower == "motif":
return FeatureType.MOTIF
elif cat_lover == "domain":
elif cat_lower == "domain":
return FeatureType.DOMAIN
elif cat_lover == "region":
elif cat_lower == "region":
return FeatureType.REGION
elif cat_lover == "coiled coil":
elif cat_lower == "coiled coil":
return FeatureType.REGION
elif cat_lover == "compositional bias":
elif cat_lower == "compositional bias":
return FeatureType.COMPOSITIONAL_BIAS
elif cat_lower == "zinc finger":
return FeatureType.ZINC_FINGER
else:
raise ValueError(f'Unrecognized protein feature type: "{category}"')

Expand Down Expand Up @@ -361,19 +368,16 @@ def from_uniprot_json(

regions = list()
for feature in data["features"]:
try:
region_name = feature["description"]
locus = feature["location"]
region_start = int(locus["start"]["value"]) - 1 # convert to 0-based coordinates
region_end = int(locus["end"]["value"])
feature_type = FeatureType.from_string(feature["type"])
finfo = FeatureInfo(
name=region_name, region=Region(start=region_start, end=region_end)
)
pfeature = ProteinFeature.create(info=finfo, feature_type=feature_type)
regions.append(pfeature)
except Exception as feature_exception:
print(f"Could not parse feature: {str(feature_exception)} (skipping)")
region_name = feature["description"]
locus = feature["location"]
region_start = int(locus["start"]["value"]) - 1 # convert to 0-based coordinates
region_end = int(locus["end"]["value"])
feature_type = FeatureType.from_string(feature["type"])
finfo = FeatureInfo(
name=region_name, region=Region(start=region_start, end=region_end)
)
pfeature = ProteinFeature.create(info=finfo, feature_type=feature_type)
regions.append(pfeature)

return ProteinMetadata(
protein_id=protein_id,
Expand Down
4 changes: 3 additions & 1 deletion src/gpsea/preprocessing/_uniprot.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,9 @@ def annotate(self, protein_id: str) -> ProteinMetadata:
Args:
protein_id (str): A protein ID
Returns:
Sequence[ProteinMetadata]: A sequence of ProteinMetadata objects, or an empty sequence if no data was found.
ProteinMetadata: A :class:`~gpsea.model.ProteinMetadata` corresponding to the input `protein_id`.
Raises:
ValueError: if there is a problem with `protein_id`, an I/O error occurs, or the REST response cannot be parsed.
"""
if not isinstance(protein_id, str):
raise ValueError(f'Protein ID must be a str but it was {type(protein_id)}')
Expand Down
282 changes: 282 additions & 0 deletions tests/preprocessing/data/uniprot_response/P17010_manual_download.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,282 @@
{
"entryType": "UniProtKB reviewed (Swiss-Prot)",
"primaryAccession": "P17010",
"features": [
{
"type": "Zinc finger",
"location": {
"start": {
"value": 425,
"modifier": "EXACT"
},
"end": {
"value": 447,
"modifier": "EXACT"
}
},
"description": "C2H2-type 1",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
},
{
"type": "Zinc finger",
"location": {
"start": {
"value": 456,
"modifier": "EXACT"
},
"end": {
"value": 478,
"modifier": "EXACT"
}
},
"description": "C2H2-type 2",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
},
{
"type": "Zinc finger",
"location": {
"start": {
"value": 488,
"modifier": "EXACT"
},
"end": {
"value": 510,
"modifier": "EXACT"
}
},
"description": "C2H2-type 3",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
},
{
"type": "Zinc finger",
"location": {
"start": {
"value": 519,
"modifier": "EXACT"
},
"end": {
"value": 542,
"modifier": "EXACT"
}
},
"description": "C2H2-type 4",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
},
{
"type": "Zinc finger",
"location": {
"start": {
"value": 548,
"modifier": "EXACT"
},
"end": {
"value": 570,
"modifier": "EXACT"
}
},
"description": "C2H2-type 5",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
},
{
"type": "Zinc finger",
"location": {
"start": {
"value": 576,
"modifier": "EXACT"
},
"end": {
"value": 599,
"modifier": "EXACT"
}
},
"description": "C2H2-type 6",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
},
{
"type": "Zinc finger",
"location": {
"start": {
"value": 605,
"modifier": "EXACT"
},
"end": {
"value": 627,
"modifier": "EXACT"
}
},
"description": "C2H2-type 7",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
},
{
"type": "Zinc finger",
"location": {
"start": {
"value": 633,
"modifier": "EXACT"
},
"end": {
"value": 656,
"modifier": "EXACT"
}
},
"description": "C2H2-type 8",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
},
{
"type": "Zinc finger",
"location": {
"start": {
"value": 662,
"modifier": "EXACT"
},
"end": {
"value": 684,
"modifier": "EXACT"
}
},
"description": "C2H2-type 9",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
},
{
"type": "Zinc finger",
"location": {
"start": {
"value": 690,
"modifier": "EXACT"
},
"end": {
"value": 713,
"modifier": "EXACT"
}
},
"description": "C2H2-type 10",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
},
{
"type": "Zinc finger",
"location": {
"start": {
"value": 719,
"modifier": "EXACT"
},
"end": {
"value": 741,
"modifier": "EXACT"
}
},
"description": "C2H2-type 11",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
},
{
"type": "Zinc finger",
"location": {
"start": {
"value": 747,
"modifier": "EXACT"
},
"end": {
"value": 770,
"modifier": "EXACT"
}
},
"description": "C2H2-type 12",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
},
{
"type": "Zinc finger",
"location": {
"start": {
"value": 776,
"modifier": "EXACT"
},
"end": {
"value": 798,
"modifier": "EXACT"
}
},
"description": "C2H2-type 13",
"evidences": [
{
"evidenceCode": "ECO:0000255",
"source": "PROSITE-ProRule",
"id": "PRU00042"
}
]
}
],
"extraAttributes": {
"uniParcId": "UPI000013C504"
}
}
Loading
Loading