From 9beeb0fff3fd7afbea128194e96903f46187291f Mon Sep 17 00:00:00 2001 From: Sam Cox Date: Sat, 22 Jun 2024 20:22:17 -0700 Subject: [PATCH] Pdb tools (#141) --- mdagent/agent/agent.py | 1 - mdagent/tools/base_tools/__init__.py | 40 +- .../base_tools/preprocess_tools/pdb_get.py | 3 +- .../base_tools/preprocess_tools/uniprot.py | 1273 +++++++++++++++++ mdagent/tools/maketools.py | 36 + tests/test_preprocess/test_uniprot.py | 564 ++++++++ 6 files changed, 1914 insertions(+), 3 deletions(-) create mode 100644 mdagent/tools/base_tools/preprocess_tools/uniprot.py create mode 100644 tests/test_preprocess/test_uniprot.py diff --git a/mdagent/agent/agent.py b/mdagent/agent/agent.py index a5ce7092..760e8a8c 100644 --- a/mdagent/agent/agent.py +++ b/mdagent/agent/agent.py @@ -40,7 +40,6 @@ def __init__( model="gpt-4-1106-preview", # current name for gpt-4 turbo tools_model="gpt-4-1106-preview", temp=0.1, - max_iterations=40, verbose=True, ckpt_dir="ckpt", top_k_tools=20, # set "all" if you want to use all tools diff --git a/mdagent/tools/base_tools/__init__.py b/mdagent/tools/base_tools/__init__.py index 05ede489..cdedc3dd 100644 --- a/mdagent/tools/base_tools/__init__.py +++ b/mdagent/tools/base_tools/__init__.py @@ -15,6 +15,26 @@ from .preprocess_tools.clean_tools import CleaningToolFunction from .preprocess_tools.packing import PackMolTool from .preprocess_tools.pdb_get import ProteinName2PDBTool, SmallMolPDB, get_pdb +from .preprocess_tools.uniprot import ( + GetActiveSites, + GetAllKnownSites, + GetAllSequences, + GetBindingSites, + GetGeneNames, + GetInteractions, + GetKineticProperties, + GetPDB3DInfo, + GetPDBProcessingInfo, + GetProteinAssociatedKeywords, + GetProteinFunction, + GetRelevantSites, + GetSequenceInfo, + GetSubunitStructure, + GetTurnsBetaSheetsHelices, + GetUniprotID, + MapProteinRepresentation, + UniprotID2Name, +) from .simulation_tools.create_simulation import ModifyBaseSimulationScriptTool from .simulation_tools.setup_and_run import ( 
SetUpandRunFunction, @@ -27,6 +47,22 @@ __all__ = [ "CleaningToolFunction", + "GetActiveSites", + "GetAllKnownSites", + "GetAllSequences", + "GetBindingSites", + "GetGeneNames", + "GetInteractions", + "GetKineticProperties", + "GetPDB3DInfo", + "GetPDBProcessingInfo", + "GetProteinAssociatedKeywords", + "GetProteinFunction", + "GetRelevantSites", + "GetSequenceInfo", + "GetSubunitStructure", + "GetTurnsBetaSheetsHelices", + "GetUniprotID", "ComputeLPRMSD", "ComputeRMSD", "ComputeRMSF", @@ -34,6 +70,7 @@ "DistanceMatrixTool", "ListRegistryPaths", "MapPath2Name", + "MapProteinRepresentation", "ModifyBaseSimulationScriptTool", "MomentOfInertia", "PackMolTool", @@ -47,11 +84,12 @@ "RMSDCalculator", "Scholar2ResultLLM", "SerpGitTool", - "SetUpandRunFunction", "SetUpAndRunTool", + "SetUpandRunFunction", "SimulationFunctions", "SimulationOutputFigures", "SmallMolPDB", + "UniprotID2Name", "SolventAccessibleSurfaceArea", "VisFunctions", "VisualizeProtein", diff --git a/mdagent/tools/base_tools/preprocess_tools/pdb_get.py b/mdagent/tools/base_tools/preprocess_tools/pdb_get.py index 031e6885..eac925ec 100644 --- a/mdagent/tools/base_tools/preprocess_tools/pdb_get.py +++ b/mdagent/tools/base_tools/preprocess_tools/pdb_get.py @@ -33,7 +33,8 @@ def get_pdb(query_string: str, path_registry: PathRegistry): else: filetype = "pdb" if "result_set" in r.json() and len(r.json()["result_set"]) > 0: - pdbid = r.json()["result_set"][0]["identifier"] + results = r.json()["result_set"] + pdbid = max(results, key=lambda x: x["score"])["identifier"] print(f"PDB file found with this ID: {pdbid}") st.markdown(f"PDB file found with this ID: {pdbid}", unsafe_allow_html=True) url = f"https://files.rcsb.org/download/{pdbid}.{filetype}" diff --git a/mdagent/tools/base_tools/preprocess_tools/uniprot.py b/mdagent/tools/base_tools/preprocess_tools/uniprot.py new file mode 100644 index 00000000..3d6c03e0 --- /dev/null +++ b/mdagent/tools/base_tools/preprocess_tools/uniprot.py @@ -0,0 +1,1273 @@ +import 
time +from enum import Enum + +import requests +from langchain.tools import BaseTool +from requests.adapters import HTTPAdapter +from urllib3.util import Retry + + +class SiteType(Enum): + ACTIVE = ("ft_act_site", "active site") + BINDING = ("ft_binding", "binding site") + SITES = ("ft_site", "site") + + +class PTMType(Enum): + CHAIN = ("ft_chain", "Chain") + CROSSLINK = ("ft_crosslnk", "Cross-link") + DISULFIDE_BOND = ("ft_disulfid", "Disulfide bond") + GLYCOSYLATION = ("ft_carbohyd", "Glycosylation") + INITIATOR_METHIONINE = ("ft_init_met", "Initiator methionine") + LIPIDATION = ("ft_lipid", "Lipidation") + MODIFIED_RESIDUE = ("ft_mod_res", "Modified residue") + PEPTIDE = ("ft_peptide", "Peptide") + PROPEPTIDE = ("ft_propep", "Propeptide") + SIGNAL_PEPTIDE = ("ft_signal", "Signal peptide") + TRANSIT_PEPTIDE = ("ft_transit", "Transit peptide") + + +class StructureMap(Enum): + BETA = ("ft_strand", "Beta strand") + HELIX = ("ft_helix", "Helix") + TURN = ("ft_turn", "Turn") + + +class QueryUniprot: + API_URL = "https://rest.uniprot.org" + + def get_sequence_mapping( + self, + query: str, + from_db: str = "UniProtKB_AC-ID", + to_db: str = "PDB", + polling_interval: int = 3, + ) -> list: + """ + Fetch specific ID mapping from UniProt and extract the 'to' field f + rom results. + + Args: + query: The UniProt ID to map (e.g. 'P05067') + from_db: The source database to map from. + Defaults to 'UniProtKB_AC-ID'. + to_db: The target database to map to. Defaults to 'PDB'. + polling_interval: The interval to poll the API for results. + Defaults to 3 seconds. + + Returns: + A list of mapped database entries from the 'to' field if + successful, otherwise an empty list. 
+ """ + with requests.Session() as session: + session.mount( + "https://", + HTTPAdapter( + max_retries=Retry( + total=5, + backoff_factor=0.25, + status_forcelist=[500, 502, 503, 504], + ) + ), + ) + try: + response = session.post( + f"{self.API_URL}/idmapping/run", + data={"from": from_db, "to": to_db, "ids": query}, + ) + response.raise_for_status() + job_id = response.json()["jobId"] + + while True: + response = session.get(f"{self.API_URL}/idmapping/status/{job_id}") + response.raise_for_status() + status_data = response.json() + if status_data.get("jobStatus") == "RUNNING": + print(f"Job is running. Retrying in {polling_interval}s.") + time.sleep(polling_interval) + else: + break + + response = session.get(f"{self.API_URL}/idmapping/details/{job_id}") + response.raise_for_status() + results_link = response.json().get("redirectURL") + + response = session.get(results_link) + response.raise_for_status() + if response.headers["Content-Type"] != "application/json": + raise ValueError( + "Expected JSON response but got a different format." + ) + + results_json = response.json() + results = results_json.get("results", []) + return [r["to"] for r in results] + except requests.HTTPError as http_err: + print(f"HTTP error occurred: {http_err}") + return [] + except Exception as err: + print(f"An error occurred: {err}") + return [] + + def get_data( + self, query: str, desired_field: str, format_type: str = "json" + ) -> list | None: + """ + Helper function to get data from the Uniprot API. + + Args: + query: The query string to search (e.g. 'hemoglobin') + desired_field: The desired field to retrieve from the API + format_type: The format of the data to retrieve. Defaults to 'json'. + + Returns: + The data retrieved from the API or None if no data is found. 
+ """ + with requests.Session() as session: + session.mount( + "https://", + HTTPAdapter( + max_retries=Retry( + total=5, + backoff_factor=0.25, + status_forcelist=[500, 502, 503, 504], + ) + ), + ) + url = f"https://rest.uniprot.org/uniprotkb/search?fields={desired_field}&format={format_type}&query={query}" + try: + response = session.get(url) + response.raise_for_status() + data = response.json() + except requests.HTTPError: + print( + "Requested query not found, " + "please try again with a valid protein identifier." + ) + return None + if "results" not in data or not data["results"]: + raise ValueError( + "Requested query not found, " + "please try again with a valid protein identifier." + ) + return data["results"] + + def _match_primary_accession(self, data: list, primary_accession: str = "") -> list: + """ + Helper function to match the primary accession number with the data. + + Args: + data: The data to search through + primary_accession: The primary accession number to match + + Returns: + The relevant data entry for the primary accession number or + the first entry if no match is found. + """ + if primary_accession: + matched_data = next( + ( + entry + for entry in data + if entry["primaryAccession"] == primary_accession + ), + None, + ) + if matched_data: + return [matched_data] + print( + "The primary accession number provided does not " + "match any entry in the data, using the first entry instead." + ) + return [data][0] + return [data][0] + + def get_protein_name( + self, + query: str, + primary_accession: str | None = None, + short_names: bool = True, + alternative_names: bool = True, + ) -> list: + """ + Get the protein name for a specific protein, with the option to + filter by primary accession number and to include alternative + and shortened names. + + Args: + query: The query string to search + primary_accession: The primary accession number of the protein. + Defaults to None. + short_names: Whether to include short names in the results. 
Defaults + to True. + alternative_names: Whether to include alternative names in the + results. Defaults to True. + + Returns: + The protein name for the protein if found, otherwise an empty list. + If primary_accession is provided, returns the protein name + associated with that primary accession number, otherwise returns + all the protein names associated with the protein. + """ + data = self.get_data(query, desired_field="protein_name") + if not data: + return [] + if primary_accession: + data = [ + entry + for entry in data + if entry["primaryAccession"] == primary_accession + ] + + def _parse_names(recommended_names: dict, short_names: bool = True): + full_name = recommended_names["fullName"]["value"].split(",") + if not short_names: + return full_name + all_shortnames = recommended_names.get("shortNames", []) + short = [name["value"] for name in all_shortnames] if all_shortnames else [] + return full_name + short + + names = [] + for d in data: + protein_description = d["proteinDescription"] + recommended_names = protein_description["recommendedName"] + names.extend(_parse_names(recommended_names, short_names=short_names)) + if alternative_names: + alt_names_data = protein_description.get("alternativeNames", []) + names.extend( + _parse_names(alt_names_data[0], short_names=short_names) + if alt_names_data + else [] + ) + return names + + def _site_key(self, site_type: str) -> tuple[str, str]: + """ + Helper function to get the desired field and associated key for + sites (active, binding, or sites). + + Args: + site_type: The type of site to retrieve + + Returns: + The desired field and associated key for the type + + Raises: + ValueError: If an invalid type is provided + """ + try: + site_type_map = SiteType[site_type.upper()] + except KeyError as e: + valid_types = ", ".join(f"'{s_type.name}'" for s_type in SiteType) + raise ValueError( + f"Invalid site type '{site_type}'. Valid types are: {valid_types}." 
+ ) from e + + return site_type_map.value + + def get_relevant_sites( + self, + query: str, + primary_accession: str, + site_type: str, + ) -> list[dict]: + """ + Get the relevant sites, active sites, or binding sites for a + specific protein, given the primary accession number. + + Args: + query: The query string to search (e.g. 'hemoglobin') + primary_accession: The primary accession number of the protein + site_type: The type of site to retrieve + + Returns: + The relevant sites for the protein with the given primary accession number + The list contains a dict for each site with the following keys: + - 'start': The start position of the site + - 'start_modifier': The start position modifier of the site + - 'end': The end position of the site + - 'end_modifier': The end position modifier of the site + - 'description': The description of the site + - 'evidences': The evidences for the site + """ + desired_field, associated_key = self._site_key(site_type) + if not desired_field: + return [] + data = self.get_data(query, desired_field=desired_field) + if not data: + return [] + data = self._match_primary_accession(data, primary_accession) + all_sites = {} + features = [ + feature + for feature in data[0]["features"] + if feature["type"].lower() == associated_key + ] + all_sites[primary_accession] = features + if not all_sites: + return [] + relevant_sites = all_sites.get(primary_accession) + if not relevant_sites: + return [] + + sites = [] + for site in relevant_sites: + start = site["location"]["start"]["value"] + start_modifier = site["location"]["start"].get("modifier", "") + end = site["location"]["end"]["value"] + end_modifier = site["location"]["end"].get("modifier", "") + description = site["description"] + evidences = site.get("evidences", []) + sites.append( + { + "start": start, + "start_modifier": start_modifier, + "end": end, + "end_modifier": end_modifier, + "description": description, + "evidences": evidences, + } + ) + return sites + + def 
get_protein_function( + self, query: str, primary_accession: str | None = None + ) -> list: + """ + Get the protein function for a specific protein, with the option to + filter by primary accession number. + + Args: + query: The query string to search (e.g. 'hemoglobin') + primary_accession: The primary accession number of the protein. + Defaults to None. + + Returns: + The protein function for the protein. + If primary_accession is provided, returns the protein function + associated with that primary accession number, otherwise returns + all the protein functions associated with the protein. + """ + data = self.get_data(query, desired_field="cc_function") + if not data: + return [] + if primary_accession: + data = self._match_primary_accession(data, primary_accession) + return [ + entry["comments"] + for entry in data + if "commentType" not in entry["comments"] + ] + + def get_keywords(self, query: str, primary_accession: str | None = None) -> list: + """ + Get the keywords for a specific protein, with the option to filter by + primary accession number. + + Args: + query: The query string to search (e.g. 'hemoglobin') + primary_accession: The primary accession number of the protein. + Defaults to None. + + Returns: + The keywords for the protein. + If primary_accession is provided, returns the keywords + associated with that primary accession number. Otherwise, + returns all the keywords associated with the protein + """ + keywords = self.get_data(query, desired_field="keyword") + if not keywords: + return [] + if primary_accession: + keywords = self._match_primary_accession(keywords, primary_accession) + return [ + f"{entry['category']}: {entry['name']}" + for entry in keywords[0]["keywords"] + ] + return [ + f"{entry['category']}: {entry['name']}" + for kw_row in keywords + for entry in kw_row["keywords"] + ] + + def get_all_sequences(self, query: str) -> list: + """ + Get all the sequences for a specific protein. 
+ + Args: + query: The query string to search (e.g. 'hemoglobin') + + Returns: + The sequences for the protein + """ + data = self.get_data(query, desired_field="sequence") + return [entry["sequence"]["value"] for entry in data] if data else [] + + def get_interactions(self, query: str, primary_accession: str) -> list: + """ + Get the interactions for a specific protein, given the primary accession + number. + + Args: + query: The query string to search (e.g. 'hemoglobin') + primary_accession: The primary accession number of the protein + (required) + + Returns: + The interactions for the protein with the given primary accession + number + """ + data = self.get_data(query, desired_field="cc_interaction") + if not data: + return [] + data = self._match_primary_accession(data, primary_accession) + return next( + comment["interactions"] + for interaction in data + for comment in interaction["comments"] + ) + + def get_subunit_structure(self, query: str, primary_accession: str) -> list: + """ + Get the subunit structure information for a specific protein, given the + primary accession number. + + Args: + query: The query string to search (e.g. 
'hemoglobin') + primary_accession: The primary accession number of the protein + + Returns: + The subunit structure information for the protein with the given + primary accession number, along with the evidence + """ + data = self.get_data(query, desired_field="cc_subunit") + if not data: + return [] + data = self._match_primary_accession(data, primary_accession) + texts = [comment["texts"] for comment in data[0]["comments"]] + if not texts: + print("No subunit structure information found.") + return [] + return [ + { + "subunit structure": text["value"], + "evidence": text.get("evidences", "No evidence provided"), + } + for text_list in texts + for text in text_list + ] + + def get_sequence_info(self, query: str, primary_accession: str) -> dict: + """ + Get the sequence information for a specific protein, given the primary + accession number. + + Args: + query: The query string to search (e.g. 'hemoglobin') + primary_accession: The primary accession number of the protein + + Returns: + The sequence information for the protein with the given accession + The dictionary contains the following keys: + - 'sequence': The sequence of the protein + - 'length': The length of the protein sequence + - 'molWeight': The molecular weight of the protein + - 'crc64': The CRC64 hash of the protein sequence (probably not useful) + - 'md5': The MD5 hash of the protein sequence (probably not useful) + """ + seq_info = self.data = self.get_data(query, desired_field="sequence") + if not seq_info: + return {} + seq_info_specific = self._match_primary_accession(seq_info, primary_accession)[ + 0 + ]["sequence"] + seq_info_specific["sequence"] = seq_info_specific.pop("value") + return seq_info_specific + + def _ptm_key(self, ptm_key: str) -> tuple[str, str]: + """ + Helper function to get the desired field and associated key for PTM/ + Processing (e.g., chain, crosslink, disulfide-bond, etc.). + + Args: + ptm_key: The PTM/Processing key to retrieve. 
+ + Returns: + The desired field and associated key for the PTM/Processing key. + + Raises: + ValueError: If an invalid PTM/Processing key is provided. + """ + normalized_key = ptm_key.replace(" ", "_").replace("-", "_").lower() + try: + ptm_type = PTMType[normalized_key.upper()] + except KeyError as e: + valid_keys = ", ".join( + f"'{key.name.replace('_', ' ').lower()}'" for key in PTMType + ) + raise ValueError( + "Invalid PTM/Processing key, " + f"please use one of the following: {valid_keys}." + ) from e + return ptm_type.value + + def get_ptm_processing_info( + self, + query: str, + primary_accession: str, + ptm_key: str, + ) -> list[dict]: + """ + Get the ptm/processing information for a specific protein, given the + primary accession number. + + Args: + query: The query string to search (e.g. 'hemoglobin') + primary_accession: The primary accession number of the protein + ptm_key: The PTM/Processing key to retrieve + + Returns: + The relevant information for the protein with the given primary + accession number + The list contains a dictionary for each object with the + following keys: + - 'start': The start position + - 'start_modifier': The start position modifier + - 'end': The end position + - 'end_modifier': The end position modifier + - 'description': The description + - 'featureId': The feature ID + """ + desired_field, associated_key = self._ptm_key(ptm_key) + if not desired_field: + return [] + data = self.get_data(query, desired_field=desired_field) + if not data: + return [] + data = self._match_primary_accession(data, primary_accession) + + structure_info = [] + relevant_fields = [ + feature + for feature in data[0]["features"] + if feature["type"] == associated_key + ] + for field in relevant_fields: + start_ = field["location"]["start"]["value"] + start_modifier = field["location"]["start"].get("modifier", "") + end_ = field["location"]["end"]["value"] + end_modifier = field["location"]["end"].get("modifier", "") + description = 
field.get("description", "") + featureid = field.get("featureId", "") + structure_info.append( + { + "start": start_, + "start_modifier": start_modifier, + "end": end_, + "end_modifier": end_modifier, + "description": description, + "featureId": featureid, + } + ) + return structure_info + + def _structure_key(self, structure_key: str) -> tuple[str, str]: + """ + Helper function to get the desired field and associated key for + structure beta, helix, turn). + + Args: + structure_key: The structure key to retrieve + + Returns: + The desired field and associated key for the structure key + + Raises: + ValueError: If an invalid structure key is provided + """ + try: + structure_key_map = StructureMap[structure_key.upper()] + except KeyError as e: + valid_keys = ", ".join(f"'{key.name}'" for key in StructureMap) + raise ValueError( + f"Invalid structure key '{structure_key}'. " + f"Valid keys are: {valid_keys}." + ) from e + return structure_key_map.value + + def get_3d_info(self, query: str, primary_accession: str) -> list: + """ + Get the 3D structure information for a specific protein, given the + primary accession number. + + Args: + query: The query string to search (e.g. 'hemoglobin') + primary_accession: The primary accession number of the protein + + Returns: + The 3D structure information for the protein with the given primary + accession number + """ + data = self.get_data(query, desired_field="structure_3d") + if not data: + return [] + data = self._match_primary_accession(data, primary_accession) + return data[0]["uniProtKBCrossReferences"] + + def get_structure_info( + self, + query: str, + primary_accession: str, + structure_key: str, + ) -> list[dict]: + """ + Get the structure information for a specific protein, given the primary + accession number, including either beta sheets, helices, or turns. + + Args: + query: The query string to search (e.g. 
'hemoglobin') + primary_accession: The primary accession number of the protein + structure_key: The structure key to retrieve + + Returns: + The structure information for the protein with the given primary + accession number + The list contains a dictionary for each structure with the + following keys: + - 'start': The start position + - 'start_modifier': The start position modifier + - 'end': The end position + - 'end_modifier': The end position modifier + - 'evidences': The evidences for the structure + """ + desired_field, associated_key = self._structure_key(structure_key) + if not desired_field: + return [] + data = self.get_data(query, desired_field=desired_field) + if not data: + return [] + data = self._match_primary_accession(data, primary_accession) + + structure_info = [] + relevant_fields = [ + feature + for feature in data[0]["features"] + if feature["type"] == associated_key + ] + + for field in relevant_fields: + start_ = field["location"]["start"]["value"] + start_modifier = field["location"]["start"].get("modifier", "") + end_ = field["location"]["end"]["value"] + end_modifier = field["location"]["end"].get("modifier", "") + evidences = field.get("evidences", []) + structure_info.append( + { + "start": start_, + "start_modifier": start_modifier, + "end": end_, + "end_modifier": end_modifier, + "evidences": evidences, + } + ) + return structure_info + + def get_ids( + self, query: str, single_id: bool = False, include_uniprotkbids=False + ) -> list: + """ + Get the IDs for a specific protein. + + Args: + query: The query string to search (e.g. 'hemoglobin') + single_id: Whether to return a single ID or all IDs. Defaults to + False. + include_uniprotkbids: Whether to include UniProtKB IDs in the + results. Defaults to False. 
+ + Returns: + The IDs for the protein + """ + ids_ = self.get_data(query, desired_field="id") + all_ids = [entry["primaryAccession"] for entry in ids_] if ids_ else [] + if include_uniprotkbids: + all_ids + [entry["uniProtkbId"] for entry in ids_] if ids_ else [] + accession = self.get_data(query, desired_field="accession") + all_ids + [ + entry["primaryAccession"] for entry in accession + ] if accession else [] + if single_id: + return all_ids.pop() + return list(set(all_ids)) + + def get_gene_names(self, query: str, primary_accession: str | None = None) -> list: + """ + Get the gene names for a specific protein, with the option to filter by + primary accession number. + + Args: + query: The query string to search (e.g. 'hemoglobin') + primary_accession: The primary accession number of the protein. + Defaults to None. + + Returns: + The gene names for the protein if gene names are found, otherwise an + empty list. + If primary_accession is provided, returns the gene names + associated with that primary accession number, otherwise returns + all the gene names associated with the protein. 
+ """ + data = self.get_data(query, desired_field="gene_names") + if not data: + return [] + if primary_accession: + data = [ + entry + for entry in data + if entry["primaryAccession"] == primary_accession + ] + all_genes = [] + for i in range(len(data)): + if "genes" not in data[i]: + continue + gene_info = data[i]["genes"] + gene_name = [gene_name["geneName"]["value"] for gene_name in gene_info] + synonyms = [ + value["value"] + for synonym in gene_info + if "synonyms" in synonym + for value in synonym["synonyms"] + ] + orfNames = [ + value["value"] + for orf in gene_info + if "orfNames" in orf + for value in orf["orfNames"] + ] + orderedlocus = [ + value["value"] + for ordered in gene_info + if "orderedLocusNames" in ordered + for value in ordered["orderedLocusNames"] + ] + all_genes.extend(gene_name + synonyms + orfNames + orderedlocus) + return all_genes + + def get_kinetics(self, query: str, primary_accession: str | None = None) -> list: + """ + Get the kinetics information for a specific protein, given the primary + accession number. + + Args: + query: The query string to search (e.g. 'hemoglobin') + primary_accession: The primary accession number of the protein + + Returns: + The kinetics information for the protein with the given primary + accession number + """ + data = self.get_data(query, desired_field="kinetics") + if not data: + return [] + + if primary_accession: + data = self._match_primary_accession(data, primary_accession) + + return [entry["comments"] for entry in data if entry["comments"]] + + +class MapProteinRepresentation(BaseTool): + name = "MapProteinRepresentation" + description = ( + "Fetch specific ID mapping from UniProt. " + "You must specify the database to map from and to, " + "as well as the representation of the protein. " + "The defaults are 'UniProtKB_AC-ID' and 'PDB', respectively." 
class UniprotID2Name(BaseTool):
    """Tool that returns the UniProt name(s) of a protein.

    ``all_names`` controls whether short and alternative names are included
    alongside the recommended full name.
    """

    name = "UniprotID2Name"
    # Each fragment ends with a space so the concatenated description no
    # longer reads "accessionnumber" / "IDof".
    description = (
        "Get the protein name for a specific protein, "
        "with the option to filter by primary accession "
        "number. If you have the primary accession "
        "number, you can use it to filter the results. "
        "Otherwise, all names associated with the "
        "protein will be returned. Input the uniprot ID "
        "of the protein."
    )
    uniprot = QueryUniprot()
    # Declared as a field so the assignment in __init__ is accepted.
    # NOTE(review): assumes BaseTool is a pydantic model that rejects
    # assignment to undeclared attributes — confirm against the installed
    # langchain version.
    all_names: bool = True

    def __init__(self, all_names: bool = True):
        super().__init__()
        self.all_names = all_names

    def _run(self, query: str, primary_accession: str = "") -> str:
        """Return the names as a comma-separated string, or the error text."""
        try:
            names = self.uniprot.get_protein_name(
                query,
                primary_accession=primary_accession,
                short_names=self.all_names,
                alternative_names=self.all_names,
            )
            return ", ".join(names)
        except Exception as e:
            # Tools report failures as text instead of raising.
            return str(e)

    async def _arun(self, query: str, primary_accession: str) -> str:
        """use the tool asynchronously."""
        raise NotImplementedError("This tool does not support asynchronous execution.")
" + ) + uniprot = QueryUniprot() + + def _run(self, query: str, primary_accession: str = "") -> str: + """use the tool.""" + try: + sites = self.uniprot.get_relevant_sites(query, primary_accession, "binding") + return str(sites) + except Exception as e: + return str(e) + + async def _arun(self, query: str, primary_accession: str) -> str: + """use the tool asynchronously.""" + raise NotImplementedError("This tool does not support asynchronous execution.") + + +class GetActiveSites(BaseTool): + name = "GetActiveSites" + description = ( + "Get the active sites known for a specific " + "protein, given the primary accession number. " + "Both the query string and primary accession " + "number are required. " + ) + uniprot = QueryUniprot() + + def _run(self, query: str, primary_accession: str = "") -> str: + """use the tool.""" + try: + sites = self.uniprot.get_relevant_sites(query, primary_accession, "active") + return str(sites) + except Exception as e: + return str(e) + + async def _arun(self, query: str, primary_accession: str) -> str: + """use the tool asynchronously.""" + raise NotImplementedError("This tool does not support asynchronous execution.") + + +class GetRelevantSites(BaseTool): + name = "GetRelevantSites" + description = ( + "Get the relevant sites for a specific protein, " + "given the primary accession number. You must " + "provide the query string and primary accession " + "number. The relevant sites are sites that are " + "known to be important for the protein's function, " + "but are not necessarily active or binding sites." 
# Agent tool: one report combining active, binding and other relevant sites.
class GetAllKnownSites(BaseTool):
    name = "GetAllKnownSites"
    description = (
        "Get all known sites for a specific protein, "
        "given the primary accession number. You must "
        "provide the query string and primary accession "
        "number. This tool is a one-stop shop to get all known sites "
        "for the protein, including active sites, binding "
        "sites, and other relevant sites."
    )
    # Shared client used for every lookup made through this tool.
    uniprot = QueryUniprot()

    def _run(self, query: str, primary_accession: str = "") -> str:
        """Return a three-line report: active, binding, then other sites.

        Each line is either "<label><records>" or a fixed "no sites" sentence
        when the lookup result is falsy. Any exception raised along the way is
        returned as its message text instead of the report.
        """
        # (site kind passed to the client, label prefix, text used when empty)
        report_plan = (
            ("active", "Active Sites: ", "No known active sites."),
            ("binding", "Binding Sites: ", "No known binding sites."),
            ("sites", "Other Relevant Sites: ", "No other relevant sites."),
        )
        try:
            report_lines = []
            for site_kind, label, empty_text in report_plan:
                found = self.uniprot.get_relevant_sites(
                    query, primary_accession, site_kind
                )
                report_lines.append(f"{label}{found}" if found else empty_text)
            return "\n".join(report_lines)
        except Exception as err:
            return str(err)

    async def _arun(self, query: str, primary_accession: str) -> str:
        """Asynchronous entry point; always raises NotImplementedError."""
        raise NotImplementedError("This tool does not support asynchronous execution.")
# Agent tool: lists every sequence UniProt returns for a protein query.
class GetAllSequences(BaseTool):
    name = "GetAllSequences"
    # The middle fragment previously lacked a trailing space, so the text shown
    # to the agent read "...of the protein.This tool will return...".
    description = (
        "Get all the sequences for a specific protein. "
        "Input the uniprot ID of the protein. "
        "This tool will return all sequences associated with the protein."
    )
    # Shared client used for every lookup made through this tool.
    uniprot = QueryUniprot()

    def _run(self, query: str) -> str:
        """Return all sequences for ``query`` joined by ", ".

        Any exception raised by the lookup (or by joining the result) is
        returned as its message text so the agent always receives a string.
        """
        try:
            sequences = self.uniprot.get_all_sequences(query)
            return ", ".join(sequences)
        except Exception as e:
            return str(e)

    async def _arun(self, query: str) -> str:
        """Asynchronous entry point; always raises NotImplementedError."""
        raise NotImplementedError("This tool does not support asynchronous execution.")
# Agent tool: exposes UniProt subunit-structure annotations as text.
class GetSubunitStructure(BaseTool):
    name = "GetSubunitStructure"
    description = (
        "Get the subunit structure information for a specific protein, "
        "given the primary accession number. Both the query string and "
        "primary accession number are required. This tool will return "
        "the subunit structure information for the protein."
    )
    # Shared client used for every lookup made through this tool.
    uniprot = QueryUniprot()

    def _run(self, query: str, primary_accession: str = "") -> str:
        """Return the stringified subunit-structure records.

        Any exception raised by the lookup is returned as its message text
        rather than propagated.
        """
        try:
            return str(self.uniprot.get_subunit_structure(query, primary_accession))
        except Exception as err:
            return str(err)

    async def _arun(self, query: str, primary_accession: str) -> str:
        """Asynchronous entry point; always raises NotImplementedError."""
        raise NotImplementedError("This tool does not support asynchronous execution.")
# Agent tool: returns one category of PTM/processing features for a protein.
class GetPDBProcessingInfo(BaseTool):
    name = "GetPDBProcessingInfo"
    description = (
        "Get the processing information for a specific protein, "
        "given the primary accession number. Both the query string "
        "and primary accession number are required. Input the query, accession "
        "number, and the type of processing information to retrieve (e.g., "
        "chain, crosslink, disulfide-bond, etc.). Here is a list of the "
        "processing types you can retrieve: chain, crosslink, disulfide-bond, "
        "glycosylation, initiator-methionine, lipidation, modified-residue, "
        "peptide, propeptide, signal-peptide, transit-peptide"
    )
    # Shared client used for every lookup made through this tool.
    uniprot = QueryUniprot()

    def _run(
        self, query: str, processing_type: str, primary_accession: str = ""
    ) -> str:
        """Return the stringified processing features of ``processing_type``.

        Args:
            query: Search string for the protein.
            processing_type: One of the categories listed in ``description``.
            primary_accession: Accession used to select a single entry.

        Any exception raised by the lookup is returned as its message text
        rather than propagated.
        """
        try:
            processing_info = self.uniprot.get_ptm_processing_info(
                query, primary_accession, processing_type
            )
            return str(processing_info)
        except Exception as e:
            return str(e)

    async def _arun(
        self, query: str, processing_type: str = "", primary_accession: str = ""
    ) -> str:
        """Asynchronous entry point; always raises NotImplementedError.

        The signature now accepts ``processing_type`` like ``_run`` does, so an
        async call made with the ``_run`` arguments reaches the explicit
        NotImplementedError instead of failing with a TypeError for an
        unexpected keyword. Both trailing parameters are optional so earlier
        call forms still bind.
        """
        raise NotImplementedError("This tool does not support asynchronous execution.")
# Agent tool: summarises secondary-structure features (turns, strands, helices).
class GetTurnsBetaSheetsHelices(BaseTool):
    name = "GetTurnsBetaSheetsHelices"
    description = (
        "Get the number and location of turns, beta sheets, and helices "
        "for a specific protein, given the primary accession number. Both "
        "the query string and primary accession number are required. This "
        "tool will return the number and location of turns, beta sheets, and "
        "helices for the protein. "
    )
    # Shared client used for every lookup made through this tool.
    uniprot = QueryUniprot()

    def _run(self, query: str, primary_accession: str = "") -> str:
        """Return a three-line report: turns, beta sheets, then helices.

        Any exception raised by a lookup is returned as its message text
        instead of the report.
        """
        # (label shown in the report, structure kind passed to the client)
        sections = (("Turns", "turn"), ("Beta sheets", "beta"), ("Helices", "helix"))
        try:
            rendered = []
            for label, structure_kind in sections:
                features = self.uniprot.get_structure_info(
                    query, primary_accession, structure_kind
                )
                rendered.append(f"{label}: {features}")
            return "\n".join(rendered)
        except Exception as err:
            return str(err)

    async def _arun(self, query: str, primary_accession: str) -> str:
        """Asynchronous entry point; always raises NotImplementedError."""
        raise NotImplementedError("This tool does not support asynchronous execution.")
# Agent tool: returns the kinetics annotations UniProt holds for a protein.
class GetKineticProperties(BaseTool):
    name = "GetKineticProperties"
    description = (
        "Get the kinetics information for a specific protein, "
        "given the primary accession number. "
        "Both the query string and primary accession number are required. "
    )
    # Shared client used for every lookup made through this tool.
    uniprot = QueryUniprot()

    def _run(self, query: str, primary_accession: str = "") -> str:
        """Return the stringified kinetics records for the protein.

        Any exception raised by the lookup is returned as its message text
        rather than propagated.
        """
        try:
            kinetics = self.uniprot.get_kinetics(query, primary_accession)
            return str(kinetics)
        except Exception as e:
            return str(e)

    async def _arun(
        self, query: str, primary_accession: str = "", dependency: str = ""
    ) -> str:
        """Asynchronous entry point; always raises NotImplementedError.

        ``_run`` has no ``dependency`` argument, yet this method used to require
        one, so an async call made with the ``_run`` arguments failed with a
        TypeError before reaching the intended error. ``dependency`` is kept
        only as an ignored optional parameter for callers that still pass it.
        """
        raise NotImplementedError("This tool does not support asynchronous execution.")
def test_match_primary_accession(query_uniprot):
    """Only the entry whose primaryAccession matches should be returned."""
    reviewed = "UniProtKB reviewed (Swiss-Prot)"
    mock_data = [
        {"entryType": reviewed, "primaryAccession": accession}
        for accession in ("P68871", "P69905")
    ]
    expected = [{"entryType": reviewed, "primaryAccession": "P69905"}]

    matched = query_uniprot._match_primary_accession(mock_data, "P69905")

    assert matched == expected
def test_get_relevant_sites(query_uniprot):
    """P70619 should expose one known active site and no binding/other sites.

    The local names now match the site type actually queried; previously the
    "active" result was stored in ``binding_sites`` and vice versa, which made
    the assertions read as the opposite of what they check.
    """
    active_sites = query_uniprot.get_relevant_sites("gsr", "P70619", "active")
    binding_sites = query_uniprot.get_relevant_sites("gsr", "P70619", "binding")
    sites = query_uniprot.get_relevant_sites("gsr", "P70619", "sites")

    expected_active_site = {
        "start": 413,
        "start_modifier": "EXACT",
        "end": 413,
        "end_modifier": "EXACT",
        "description": "Proton acceptor",
        "evidences": [{"evidenceCode": "ECO:0000250"}],
    }
    assert expected_active_site in active_sites
    assert not binding_sites
    assert not sites
def test_get_interactions(query_uniprot):
    """Hemoglobin alpha (P69905) interactions list the expected partners."""
    interactions = query_uniprot.get_interactions("hemoglobin", "P69905")
    length_interactions = 13
    assert len(interactions) >= length_interactions

    # Iterate the records directly instead of indexing via range(len(...)).
    first_partners = {
        entry["interactantOne"]["uniProtKBAccession"] for entry in interactions
    }
    assert first_partners == {"P69905"}

    second_partners = {
        entry["interactantTwo"]["uniProtKBAccession"] for entry in interactions
    }
    expected_partners = {
        "Q9NZD4",
        "Q2TAC2",
        "Q15323",
        "O76011",
        "P02042",
        "P00387",
        "P02100",
        "P29474",
        "Q6A162",
        "P0DPK4",
        "P09105",
        "P69892",
        "P68871",
    }
    # Subset check: every expected partner must appear; extras are allowed.
    assert expected_partners <= second_partners
def test_get_ptm_processing_info(query_uniprot):
    """Check each processing category reported for hemoglobin alpha (P69905)."""

    def fetch(processing_type):
        # All lookups in this test target the same protein entry.
        return query_uniprot.get_ptm_processing_info(
            "hemoglobin", "P69905", processing_type
        )

    def feature(start, end, description, feature_id=""):
        # Build the record shape the assertions below compare against.
        return {
            "start": start,
            "start_modifier": "EXACT",
            "end": end,
            "end_modifier": "EXACT",
            "description": description,
            "featureId": feature_id,
        }

    chains = fetch("chain")
    assert feature(2, 142, "Hemoglobin subunit alpha", "PRO_0000052653") in chains

    assert not fetch("crosslink")

    assert not fetch("disulfide-bond")

    glyco = fetch("glycosylation")
    alternate_glycation = "N-linked (Glc) (glycation) lysine; alternate"
    expected_glyco = [
        feature(8, 8, alternate_glycation),
        feature(17, 17, alternate_glycation),
        feature(41, 41, alternate_glycation),
        feature(62, 62, "N-linked (Glc) (glycation) lysine"),
    ]
    assert all(g in glyco for g in expected_glyco)

    i_m = fetch("initiator-methionine")
    assert feature(1, 1, "Removed") in i_m

    assert not fetch("lipidation")

    mr = fetch("modified-residue")
    expected_modified = [
        feature(position, position, description)
        for position, description in (
            (4, "Phosphoserine"),
            (8, "N6-succinyllysine; alternate"),
            (9, "Phosphothreonine"),
            (12, "N6-succinyllysine"),
            (17, "N6-acetyllysine; alternate"),
            (17, "N6-succinyllysine; alternate"),
            (25, "Phosphotyrosine"),
        )
    ]
    assert all(m in mr for m in expected_modified)

    pep = fetch("peptide")
    assert feature(96, 104, "Hemopressin", "PRO_0000455882") in pep

    assert not fetch("propeptide")

    assert not fetch("signal-peptide")

    assert not fetch("transit-peptide")
def test_get_ids(query_uniprot):
    """Known hemoglobin accessions appear in the IDs returned for "hemoglobin".

    Renamed from ``get_ids``: without the ``test_`` prefix pytest never
    collected this function, so its assertions were dead code.
    """
    hg_ids = [
        "P84792",
        "P02042",
        "P69891",
        "P69892",
        "P68871",
        "P02089",
        "P02070",
        "O13163",
        "Q10733",
        "P02008",
        "B3EWR7",
        "Q90487",
        "P04244",
        "P02094",
        "P83479",
        "P01966",
        "O93349",
        "P68872",
        "P02110",
        "P69905",
        "P02088",
        "P02100",
        "P09105",
        "P11517",
        "P02091",
    ]
    all_ids = query_uniprot.get_ids("hemoglobin")
    single_id = query_uniprot.get_ids("hemoglobin", single_id=True)

    # NOTE(review): the return type of get_ids(single_id=True) is not visible
    # here; accept either a bare accession string or a sequence of accessions.
    single_ids = [single_id] if isinstance(single_id, str) else list(single_id)
    assert single_ids
    assert all(i in hg_ids for i in single_ids)
    assert all(i in all_ids for i in hg_ids)
def test_get_kinetics(query_uniprot):
    """Filtering rubisco kinetics by accession narrows the result to one entry."""
    filtered = query_uniprot.get_kinetics("rubisco", primary_accession="O85040")
    unfiltered = query_uniprot.get_kinetics("rubisco")

    assert len(filtered) == 1
    assert len(unfiltered) > len(filtered)

    # Drill into the first kinetics comment of the single filtered entry.
    first_comment = filtered[0][0]
    max_velocities = first_comment["kineticParameters"]["maximumVelocities"]
    assert max_velocities[0]["velocity"] == 2.9