def get_bgc_genome_mapping(self) -> dict[str, str]:
-    """Get the mapping from BGC to genome.
-
-    Note that the directory name of the gbk file is treated as genome id.
-
-    Returns:
-        The key is BGC name (gbk file name) and value is genome id (the directory name of the
-        gbk file).
-    """
-    return {
-        bid: os.path.basename(os.path.dirname(bpath)) for bid, bpath in self._file_dict.items()
-    }
def get_bgc_genome_mapping(self) -> dict[str, str]:
+    """Get the mapping from BGC to genome.
+
+    Note that the directory name of the gbk file is treated as genome id.
+
+    Returns:
+        The key is BGC name (gbk file name) and value is genome id (the directory name of the
+        gbk file).
+    """
+    return {
+        bid: os.path.basename(os.path.dirname(bpath)) for bid, bpath in self._file_dict.items()
+    }
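A minimal usage sketch of the mapping above (assuming the `AntismashBGCLoader` class this page documents; the import path and data directory are illustrative, and the directory must be laid out as `<root>/<genome_id>/<bgc>.gbk`):

```python
from nplinker.genomics.antismash import AntismashBGCLoader  # import path assumed

loader = AntismashBGCLoader("data/antismash")
mapping = loader.get_bgc_genome_mapping()
# e.g. {"NZ_AZWO01000004.region001": "GCF_000514775.1", ...}
for bgc_name, genome_id in mapping.items():
    print(bgc_name, "->", genome_id)
```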
@@ -1812,19 +1898,19 @@
Source code in src/nplinker/genomics/antismash/antismash_loader.py
-
def get_files(self) -> dict[str, str]:
-    """Get BGC gbk files.
-
-    Returns:
-        The key is BGC name (gbk file name) and value is path to the gbk file.
-    """
-    return self._file_dict
def get_files(self) -> dict[str, str]:
+    """Get BGC gbk files.
+
+    Returns:
+        The key is BGC name (gbk file name) and value is path to the gbk file.
+    """
+    return self._file_dict
@@ -1874,19 +1960,19 @@
Source code in src/nplinker/genomics/antismash/antismash_loader.py
-
if download_root and extract_root dirs are the same.
+
if download_root and extract_root dirs are the same.
@@ -2472,7 +2641,7 @@
-
if /antismash/ dir is not empty.
+
if <extract_root>/antismash/<refseq_assembly_id> dir is not empty.
@@ -2562,8 +2731,8 @@
it doesn't exist. The files will be extracted to `<extract_root>/antismash/<antismash_id>` directory.

Raises:
- ValueError: if download_root and extract_root dirs are the same.
- ValueError: if <extract_root>/antismash/<refseq_assembly_id> dir is not empty.
+ ValueError: if `download_root` and `extract_root` dirs are the same.
+    ValueError: if `<extract_root>/antismash/<refseq_assembly_id>` dir is not empty.

Examples:
    >>> download_and_extract_antismash_metadata("GCF_004339725.1", "/data/download", "/data/extracted")
@@ -2686,10 +2855,7 @@
Source code in src/nplinker/genomics/antismash/antismash_loader.py
-
def __init__(self) -> None:
-    """Arrange the dataset required by NPLinker.
-
-    This class is used to arrange the datasets required by NPLinker according to the
-    configuration. The datasets include MIBiG, GNPS, antiSMASH, and BiG-SCAPE.
-
-    If `config.mode` is "local", the datasets are validated.
-    If `config.mode` is "podp", the datasets are downloaded or generated.
-
-    It uses the default downloads directory `globals.DOWNLOADS_DEFAULT_PATH` to store the
-    downloaded files. Default data paths for MIBiG, GNPS, antiSMASH, and BiG-SCAPE are defined
-    in `nplinker.globals`.
-    """
-    # Prepare the downloads directory and/or PODP json file which are required for other methods
-    globals.DOWNLOADS_DEFAULT_PATH.mkdir(exist_ok=True)
-    self.arrange_podp_project_json()
def __init__(self) -> None:
+    # Prepare the downloads directory and/or PODP json file which are required for other methods
+    globals.DOWNLOADS_DEFAULT_PATH.mkdir(exist_ok=True)
+    self.arrange_podp_project_json()
def arrange(self) -> None:
-    """Arrange the datasets according to the configuration.
-
-    The datasets include MIBiG, GNPS, antiSMASH, and BiG-SCAPE.
-    """
-    # The order of arranging the datasets matters, as some datasets depend on others
-    self.arrange_mibig()
-    self.arrange_gnps()
-    self.arrange_antismash()
-    self.arrange_bigscape()
-    self.arrange_strain_mappings()
-    self.arrange_strains_selected()
def arrange(self) -> None:
+    """Arrange the datasets according to the configuration.
+
+    The datasets include MIBiG, GNPS, antiSMASH, and BiG-SCAPE.
+    """
+    # The order of arranging the datasets matters, as some datasets depend on others
+    self.arrange_mibig()
+    self.arrange_gnps()
+    self.arrange_antismash()
+    self.arrange_bigscape()
+    self.arrange_strain_mappings()
+    self.arrange_strains_selected()
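A minimal sketch of driving the arranger (assuming the `DatasetArranger` class documented here and a loaded NPLinker configuration; the import path may differ between versions):

```python
from nplinker.arranger import DatasetArranger  # import path assumed

arranger = DatasetArranger()  # prepares the downloads dir and, in "podp" mode, the project JSON
arranger.arrange()  # MIBiG -> GNPS -> antiSMASH -> BiG-SCAPE -> strain mappings -> selected strains
```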
def arrange_podp_project_json(self) -> None:
-    """Arrange the PODP project JSON file.
-
-    If `config.mode` is "podp", download the PODP project JSON file if it doesn't exist. Then
-    validate the PODP project JSON file if it exists or is downloaded.
-
-    The validation is controlled by the json schema `schemas/podp_adapted_schema.json`.
-    """
-    if config.mode == "podp":
-        file_name = f"paired_datarecord_{config.podp_id}.json"
-        podp_file = globals.DOWNLOADS_DEFAULT_PATH / file_name
-        if not podp_file.exists():
-            download_url(
-                PODP_PROJECT_URL.format(config.podp_id),
-                globals.DOWNLOADS_DEFAULT_PATH,
-                file_name,
-            )
-
-        with open(podp_file, "r") as f:
-            json_data = json.load(f)
-        validate_podp_json(json_data)
def arrange_podp_project_json(self) -> None:
+    """Arrange the PODP project JSON file.
+
+    If `config.mode` is "podp", download the PODP project JSON file if it doesn't exist. Then
+    validate the PODP project JSON file if it exists or is downloaded.
+
+    The validation is controlled by the json schema `schemas/podp_adapted_schema.json`.
+    """
+    if config.mode == "podp":
+        file_name = f"paired_datarecord_{config.podp_id}.json"
+        podp_file = globals.DOWNLOADS_DEFAULT_PATH / file_name
+        if not podp_file.exists():
+            download_url(
+                PODP_PROJECT_URL.format(config.podp_id),
+                globals.DOWNLOADS_DEFAULT_PATH,
+                file_name,
+            )
+
+        with open(podp_file, "r") as f:
+            json_data = json.load(f)
+        validate_podp_json(json_data)
def arrange_mibig(self) -> None:
-    """Arrange the MIBiG metadata.
-
-    Always download and extract the MIBiG metadata if `config.mibig.to_use` is True.
-    If the default directory has already existed, it will be removed and re-downloaded to ensure
-    the latest version is used. So it's not allowed to manually put MIBiG metadata in the
-    default directory.
-    """
-    if config.mibig.to_use:
-        if globals.MIBIG_DEFAULT_PATH.exists():
-            # remove existing mibig data
-            shutil.rmtree(globals.MIBIG_DEFAULT_PATH)
-        download_and_extract_mibig_metadata(
-            globals.DOWNLOADS_DEFAULT_PATH,
-            globals.MIBIG_DEFAULT_PATH,
-            version=config.mibig.version,
-        )
+
def arrange_mibig(self) -> None:
+    """Arrange the MIBiG metadata.
+
+    Always download and extract the MIBiG metadata if `config.mibig.to_use` is True.
+    If the default directory already exists, it will be removed and the data re-downloaded to
+    ensure the latest version is used. So it's not allowed to manually put MIBiG metadata in the
+    default directory.
+    """
+    if config.mibig.to_use:
+        if globals.MIBIG_DEFAULT_PATH.exists():
+            # remove existing mibig data
+            shutil.rmtree(globals.MIBIG_DEFAULT_PATH)
+        download_and_extract_mibig_metadata(
+            globals.DOWNLOADS_DEFAULT_PATH,
+            globals.MIBIG_DEFAULT_PATH,
+            version=config.mibig.version,
+        )
def arrange_gnps(self) -> None:
-    """Arrange the GNPS data.
-
-    If `config.mode` is "local", validate the GNPS data directory.
-    If `config.mode` is "podp", download the GNPS data if it doesn't exist or remove the
-    existing GNPS data and re-download it if it is invalid.
-
-    The validation process includes:
-
-    - Check if the GNPS data directory exists.
-    - Check if the required files exist in the GNPS data directory, including:
-        - file_mappings.tsv or file_mappings.csv
-        - spectra.mgf
-        - molecular_families.tsv
-        - annotations.tsv
-    """
-    pass_validation = False
-    if config.mode == "podp":
-        # retry downloading at most 3 times if downloaded data has problems
-        for _ in range(3):
-            try:
-                validate_gnps(globals.GNPS_DEFAULT_PATH)
-                pass_validation = True
-                break
-            except (FileNotFoundError, ValueError):
-                # Don't need to remove downloaded archive, as it'll be overwritten
-                shutil.rmtree(globals.GNPS_DEFAULT_PATH, ignore_errors=True)
-                self._download_and_extract_gnps()
-
-    if not pass_validation:
-        validate_gnps(globals.GNPS_DEFAULT_PATH)
-
-    # get the path to file_mappings file (csv or tsv)
-    self.gnps_file_mappings_file = self._get_gnps_file_mappings_file()
def arrange_gnps(self) -> None:
+    """Arrange the GNPS data.
+
+    If `config.mode` is "local", validate the GNPS data directory.
+    If `config.mode` is "podp", download the GNPS data if it doesn't exist or remove the
+    existing GNPS data and re-download it if it is invalid.
+
+    The validation process includes:
+
+    - Check if the GNPS data directory exists.
+    - Check if the required files exist in the GNPS data directory, including:
+        - file_mappings.tsv or file_mappings.csv
+        - spectra.mgf
+        - molecular_families.tsv
+        - annotations.tsv
+    """
+    pass_validation = False
+    if config.mode == "podp":
+        # retry downloading at most 3 times if downloaded data has problems
+        for _ in range(3):
+            try:
+                validate_gnps(globals.GNPS_DEFAULT_PATH)
+                pass_validation = True
+                break
+            except (FileNotFoundError, ValueError):
+                # Don't need to remove downloaded archive, as it'll be overwritten
+                shutil.rmtree(globals.GNPS_DEFAULT_PATH, ignore_errors=True)
+                self._download_and_extract_gnps()
+
+    if not pass_validation:
+        validate_gnps(globals.GNPS_DEFAULT_PATH)
+
+    # get the path to file_mappings file (csv or tsv)
+    self.gnps_file_mappings_file = self._get_gnps_file_mappings_file()
def arrange_antismash(self) -> None:
-    """Arrange the antiSMASH data.
-
-    If `config.mode` is "local", validate the antiSMASH data directory.
-    If `config.mode` is "podp", download the antiSMASH data if it doesn't exist or remove the
-    existing antiSMASH data and re-download it if it is invalid.
-
-    The validation process includes:
-    - Check if the antiSMASH data directory exists.
-    - Check if the antiSMASH data directory contains at least one sub-directory, and each
-        sub-directory contains at least one BGC file (with the suffix ".region???.gbk" where ???
-        is a number).
-
-    AntiSMASH BGC directory must follow the structure below:
-    ```
-    antismash
-        ├── genome_id_1 (one AntiSMASH output, e.g. GCF_000514775.1)
-        │   ├── GCF_000514775.1.gbk
-        │   ├── NZ_AZWO01000004.region001.gbk
-        │   └── ...
-        ├── genome_id_2
-        │   ├── ...
-        └── ...
-    ```
-    """
-    pass_validation = False
-    if config.mode == "podp":
-        for _ in range(3):
-            try:
-                validate_antismash(globals.ANTISMASH_DEFAULT_PATH)
-                pass_validation = True
-                break
-            except FileNotFoundError:
-                shutil.rmtree(globals.ANTISMASH_DEFAULT_PATH, ignore_errors=True)
-                self._download_and_extract_antismash()
-
-    if not pass_validation:
-        validate_antismash(globals.ANTISMASH_DEFAULT_PATH)
def arrange_antismash(self) -> None:
+    """Arrange the antiSMASH data.
+
+    If `config.mode` is "local", validate the antiSMASH data directory.
+    If `config.mode` is "podp", download the antiSMASH data if it doesn't exist or remove the
+    existing antiSMASH data and re-download it if it is invalid.
+
+    The validation process includes:
+    - Check if the antiSMASH data directory exists.
+    - Check if the antiSMASH data directory contains at least one sub-directory, and each
+        sub-directory contains at least one BGC file (with the suffix ".region???.gbk" where ???
+        is a number).
+
+    AntiSMASH BGC directory must follow the structure below:
+    ```
+    antismash
+        ├── genome_id_1 (one AntiSMASH output, e.g. GCF_000514775.1)
+        │   ├── GCF_000514775.1.gbk
+        │   ├── NZ_AZWO01000004.region001.gbk
+        │   └── ...
+        ├── genome_id_2
+        │   ├── ...
+        └── ...
+    ```
+    """
+    pass_validation = False
+    if config.mode == "podp":
+        for _ in range(3):
+            try:
+                validate_antismash(globals.ANTISMASH_DEFAULT_PATH)
+                pass_validation = True
+                break
+            except FileNotFoundError:
+                shutil.rmtree(globals.ANTISMASH_DEFAULT_PATH, ignore_errors=True)
+                self._download_and_extract_antismash()
+
+    if not pass_validation:
+        validate_antismash(globals.ANTISMASH_DEFAULT_PATH)
@@ -2063,19 +2078,20 @@
+in the default BiG-SCAPE directory, and the clustering file
+"mix_clustering_c{config.bigscape.cutoff}.tsv" will be copied to the default BiG-SCAPE
+directory.
The validation process includes:
Check if the default BiG-SCAPE data directory exists.
Check if the clustering file "mix_clustering_c{config.bigscape.cutoff}.tsv" exists in the
BiG-SCAPE data directory.
+
Check if the 'data_sqlite.db' file exists in the BiG-SCAPE data directory.
def arrange_bigscape(self) -> None:
-    """Arrange the BiG-SCAPE data.
-
-    If `config.mode` is "local", validate the BiG-SCAPE data directory.
-    If `config.mode` is "podp", run BiG-SCAPE to generate the clustering file if it doesn't
-    exist or remove the existing BiG-SCAPE data and re-run BiG-SCAPE if it is invalid.
-    The running output of BiG-SCAPE will be saved to the directory "bigscape_running_output"
-    in the default BiG-SCAPE directory, and the clustering file "mix_clustering_c{config.bigscape.cutoff}.tsv"
-    will be copied to the default BiG-SCAPE directory.
-
-    The validation process includes:
def arrange_bigscape(self) -> None:
+    """Arrange the BiG-SCAPE data.
+
+    If `config.mode` is "local", validate the BiG-SCAPE data directory.
+    If `config.mode` is "podp", run BiG-SCAPE to generate the clustering file if it doesn't
+    exist or remove the existing BiG-SCAPE data and re-run BiG-SCAPE if it is invalid.
+    The running output of BiG-SCAPE will be saved to the directory "bigscape_running_output"
+    in the default BiG-SCAPE directory, and the clustering file
+    "mix_clustering_c{config.bigscape.cutoff}.tsv" will be copied to the default BiG-SCAPE
+    directory.
-    - Check if the default BiG-SCAPE data directory exists.
-    - Check if the clustering file "mix_clustering_c{config.bigscape.cutoff}.tsv" exists in the
-        BiG-SCAPE data directory.
-    """
-    pass_validation = False
-    if config.mode == "podp":
-        for _ in range(3):
-            try:
-                validate_bigscape(globals.BIGSCAPE_DEFAULT_PATH)
-                pass_validation = True
-                break
-            except FileNotFoundError:
-                shutil.rmtree(globals.BIGSCAPE_DEFAULT_PATH, ignore_errors=True)
-                self._run_bigscape()
-
-    if not pass_validation:
-        validate_bigscape(globals.BIGSCAPE_DEFAULT_PATH)
+    The validation process includes:
+
+    - Check if the default BiG-SCAPE data directory exists.
+    - Check if the clustering file "mix_clustering_c{config.bigscape.cutoff}.tsv" exists in the
+        BiG-SCAPE data directory.
+    - Check if the 'data_sqlite.db' file exists in the BiG-SCAPE data directory.
+    """
+    pass_validation = False
+    if config.mode == "podp":
+        for _ in range(3):
+            try:
+                validate_bigscape(globals.BIGSCAPE_DEFAULT_PATH)
+                pass_validation = True
+                break
+            except FileNotFoundError:
+                shutil.rmtree(globals.BIGSCAPE_DEFAULT_PATH, ignore_errors=True)
+                self._run_bigscape()
+
+    if not pass_validation:
+        validate_bigscape(globals.BIGSCAPE_DEFAULT_PATH)
def arrange_strain_mappings(self) -> None:
-    """Arrange the strain mappings file.
-
-    If `config.mode` is "local", validate the strain mappings file.
-    If `config.mode` is "podp", always generate the strain mappings file and validate it.
def arrange_strain_mappings(self) -> None:
+    """Arrange the strain mappings file.
-    The valiation checks if the strain mappings file exists and if it is a valid JSON file
-    according to the schema defined in `schemas/strain_mappings_schema.json`.
-    """
-    if config.mode == "podp":
-        self._generate_strain_mappings()
-
-    self._validate_strain_mappings()
+    If `config.mode` is "local", validate the strain mappings file.
+    If `config.mode` is "podp", always generate the strain mappings file and validate it.
+
+    The validation checks if the strain mappings file exists and if it is a valid JSON file
+    according to the schema defined in `schemas/strain_mappings_schema.json`.
+    """
+    if config.mode == "podp":
+        self._generate_strain_mappings()
+
+    self._validate_strain_mappings()
def arrange_strains_selected(self) -> None:
-    """Arrange the strains selected file.
-
-    Validate the strains selected file if it exists.
-    The validation checks if the strains selected file is a valid JSON file according to the
-    schema defined in `schemas/user_strains.json`.
-    """
-    strains_selected_file = config.root_dir / globals.STRAINS_SELECTED_FILENAME
-    if strains_selected_file.exists():
-        with open(strains_selected_file, "r") as f:
-            json_data = json.load(f)
-        validate(instance=json_data, schema=USER_STRAINS_SCHEMA)
def arrange_strains_selected(self) -> None:
+    """Arrange the strains selected file.
+
+    Validate the strains selected file if it exists.
+    The validation checks if the strains selected file is a valid JSON file according to the
+    schema defined in `schemas/user_strains.json`.
+    """
+    strains_selected_file = config.root_dir / globals.STRAINS_SELECTED_FILENAME
+    if strains_selected_file.exists():
+        with open(strains_selected_file, "r") as f:
+            json_data = json.load(f)
+        validate(instance=json_data, schema=USER_STRAINS_SCHEMA)
def validate_gnps(gnps_dir: Path) -> None:
-    """Validate the GNPS data directory and its contents.
-
-    The GNPS data directory must contain the following files:
-
-    - file_mappings.tsv or file_mappings.csv
-    - spectra.mgf
-    - molecular_families.tsv
-    - annotations.tsv
-
-    Args:
-        gnps_dir: Path to the GNPS data directory.
def validate_gnps(gnps_dir: Path) -> None:
+    """Validate the GNPS data directory and its contents.
+
+    The GNPS data directory must contain the following files:
+
+    - file_mappings.tsv or file_mappings.csv
+    - spectra.mgf
+    - molecular_families.tsv
+    - annotations.tsv
-    Raises:
-        FileNotFoundError: If the GNPS data directory is not found or any of the required files
-            is not found.
-        ValueError: If both file_mappings.tsv and file_mapping.csv are found.
-    """
-    if not gnps_dir.exists():
-        raise FileNotFoundError(f"GNPS data directory not found at {gnps_dir}")
-
-    file_mappings_tsv = gnps_dir / globals.GNPS_FILE_MAPPINGS_TSV
-    file_mappings_csv = gnps_dir / globals.GNPS_FILE_MAPPINGS_CSV
-    if file_mappings_tsv.exists() and file_mappings_csv.exists():
-        raise ValueError(
-            f"Both {file_mappings_tsv.name} and {file_mappings_csv.name} found in GNPS directory "
-            f"{gnps_dir}, only one is allowed."
-        )
-    elif not file_mappings_tsv.exists() and not file_mappings_csv.exists():
-        raise FileNotFoundError(
-            f"Neither {file_mappings_tsv.name} nor {file_mappings_csv.name} found in GNPS directory"
-            f" {gnps_dir}"
-        )
-
-    required_files = [
-        gnps_dir / globals.GNPS_SPECTRA_FILENAME,
-        gnps_dir / globals.GNPS_MOLECULAR_FAMILY_FILENAME,
-        gnps_dir / globals.GNPS_ANNOTATIONS_FILENAME,
-    ]
-    list_not_found = [f.name for f in required_files if not f.exists()]
-    if list_not_found:
-        raise FileNotFoundError(
-            f"Files not found in GNPS directory {gnps_dir}: ', '.join({list_not_found})"
-        )
+    Args:
+        gnps_dir: Path to the GNPS data directory.
+
+    Raises:
+        FileNotFoundError: If the GNPS data directory is not found or any of the required files
+            is not found.
+        ValueError: If both file_mappings.tsv and file_mappings.csv are found.
+    """
+    if not gnps_dir.exists():
+        raise FileNotFoundError(f"GNPS data directory not found at {gnps_dir}")
+
+    file_mappings_tsv = gnps_dir / globals.GNPS_FILE_MAPPINGS_TSV
+    file_mappings_csv = gnps_dir / globals.GNPS_FILE_MAPPINGS_CSV
+    if file_mappings_tsv.exists() and file_mappings_csv.exists():
+        raise ValueError(
+            f"Both {file_mappings_tsv.name} and {file_mappings_csv.name} found in GNPS directory "
+            f"{gnps_dir}, only one is allowed."
+        )
+    elif not file_mappings_tsv.exists() and not file_mappings_csv.exists():
+        raise FileNotFoundError(
+            f"Neither {file_mappings_tsv.name} nor {file_mappings_csv.name} found in GNPS directory"
+            f" {gnps_dir}"
+        )
+
+    required_files = [
+        gnps_dir / globals.GNPS_SPECTRA_FILENAME,
+        gnps_dir / globals.GNPS_MOLECULAR_FAMILY_FILENAME,
+        gnps_dir / globals.GNPS_ANNOTATIONS_FILENAME,
+    ]
+    list_not_found = [f.name for f in required_files if not f.exists()]
+    if list_not_found:
+        raise FileNotFoundError(
+            f"Files not found in GNPS directory {gnps_dir}: {', '.join(list_not_found)}"
+        )
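A quick way to exercise the validator above on a local GNPS results directory (the path is illustrative):

```python
from pathlib import Path

try:
    validate_gnps(Path("data/gnps"))
    print("GNPS data looks complete")
except (FileNotFoundError, ValueError) as err:
    print(f"GNPS data is not ready: {err}")
```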
def validate_antismash(antismash_dir: Path) -> None:
-    """Validate the antiSMASH data directory and its contents.
-
-    The validation only checks the structure of the antiSMASH data directory and file names.
-    It does not check
def validate_antismash(antismash_dir: Path) -> None:
+    """Validate the antiSMASH data directory and its contents.
-    - the content of the BGC files
-    - the consistency between the antiSMASH data and the PODP project JSON file for the PODP
-        mode
-
-    The antiSMASH data directory must exist and contain at least one sub-directory. The name of the
-    sub-directories must not contain any space. Each sub-directory must contain at least one BGC
-    file (with the suffix ".region???.gbk" where ??? is the region number).
-
-    Args:
-        antismash_dir: Path to the antiSMASH data directory.
+    The validation only checks the structure of the antiSMASH data directory and file names.
+    It does not check
+
+    - the content of the BGC files
+    - the consistency between the antiSMASH data and the PODP project JSON file for the PODP
+        mode
+
+    The antiSMASH data directory must exist and contain at least one sub-directory. The name of the
+    sub-directories must not contain any space. Each sub-directory must contain at least one BGC
+    file (with the suffix ".region???.gbk" where ??? is the region number).
-    Raises:
-        FileNotFoundError: If the antiSMASH data directory is not found, or no sub-directories
-            are found in the antiSMASH data directory, or no BGC files are found in any
-            sub-directory.
-        ValueError: If any sub-directory name contains a space.
-    """
-    if not antismash_dir.exists():
-        raise FileNotFoundError(f"antiSMASH data directory not found at {antismash_dir}")
-
-    sub_dirs = list_dirs(antismash_dir)
-    if not sub_dirs:
-        raise FileNotFoundError(
-            "No BGC directories found in antiSMASH data directory {antismash_dir}"
-        )
-
-    for sub_dir in sub_dirs:
-        dir_name = Path(sub_dir).name
-        if " " in dir_name:
-            raise ValueError(
-                f"antiSMASH sub-directory name {dir_name} contains space, which is not allowed"
-            )
-
-        gbk_files = list_files(sub_dir, suffix=".gbk", keep_parent=False)
-        bgc_files = fnmatch.filter(gbk_files, "*.region???.gbk")
-        if not bgc_files:
-            raise FileNotFoundError(f"No BGC files found in antiSMASH sub-directory {sub_dir}")
+    Args:
+        antismash_dir: Path to the antiSMASH data directory.
+
+    Raises:
+        FileNotFoundError: If the antiSMASH data directory is not found, or no sub-directories
+            are found in the antiSMASH data directory, or no BGC files are found in any
+            sub-directory.
+        ValueError: If any sub-directory name contains a space.
+    """
+    if not antismash_dir.exists():
+        raise FileNotFoundError(f"antiSMASH data directory not found at {antismash_dir}")
+
+    sub_dirs = list_dirs(antismash_dir)
+    if not sub_dirs:
+        raise FileNotFoundError(
+            f"No BGC directories found in antiSMASH data directory {antismash_dir}"
+        )
+
+    for sub_dir in sub_dirs:
+        dir_name = Path(sub_dir).name
+        if " " in dir_name:
+            raise ValueError(
+                f"antiSMASH sub-directory name {dir_name} contains space, which is not allowed"
+            )
+
+        gbk_files = list_files(sub_dir, suffix=".gbk", keep_parent=False)
+        bgc_files = fnmatch.filter(gbk_files, "*.region???.gbk")
+        if not bgc_files:
+            raise FileNotFoundError(f"No BGC files found in antiSMASH sub-directory {sub_dir}")
@@ -2641,6 +2662,9 @@
The BiG-SCAPE data directory must exist and contain the clustering file
"mix_clustering_c{config.bigscape.cutoff}.tsv" where {config.bigscape.cutoff} is the
bigscape cutoff value set in the config file.
+
Alternatively, the directory can contain the BiG-SCAPE database file generated by BiG-SCAPE v2.
+At the moment, all the family assignments in the database will be used, so this database should
+contain results from a single run with the desired cutoff.
def validate_bigscape(bigscape_dir: Path) -> None:
-    """Validate the BiG-SCAPE data directory and its contents.
-
-    The BiG-SCAPE data directory must exist and contain the clustering file
-    "mix_clustering_c{config.bigscape.cutoff}.tsv" where {config.bigscape.cutoff} is the
-    bigscape cutoff value set in the config file.
-
-    Args:
-        bigscape_dir: Path to the BiG-SCAPE data directory.
def validate_bigscape(bigscape_dir: Path) -> None:
+    """Validate the BiG-SCAPE data directory and its contents.
+
+    The BiG-SCAPE data directory must exist and contain the clustering file
+    "mix_clustering_c{config.bigscape.cutoff}.tsv" where {config.bigscape.cutoff} is the
+    bigscape cutoff value set in the config file.
-    Raises:
-        FileNotFoundError: If the BiG-SCAPE data directory or the clustering file is not found.
-    """
-    if not bigscape_dir.exists():
-        raise FileNotFoundError(f"BiG-SCAPE data directory not found at {bigscape_dir}")
-
-    clustering_file = bigscape_dir / f"mix_clustering_c{config.bigscape.cutoff}.tsv"
-    if not clustering_file.exists():
-        raise FileNotFoundError(f"BiG-SCAPE clustering file not found: {clustering_file}")
+    Alternatively, the directory can contain the BiG-SCAPE database file generated by BiG-SCAPE v2.
+    At the moment, all the family assignments in the database will be used, so this database should
+    contain results from a single run with the desired cutoff.
+
+    Args:
+        bigscape_dir: Path to the BiG-SCAPE data directory.
+
+    Raises:
+        FileNotFoundError: If the BiG-SCAPE data directory or the clustering file is not found.
+    """
+    if not bigscape_dir.exists():
+        raise FileNotFoundError(f"BiG-SCAPE data directory not found at {bigscape_dir}")
+
+    clustering_file = bigscape_dir / f"mix_clustering_c{config.bigscape.cutoff}.tsv"
+    database_file = bigscape_dir / "data_sqlite.db"
+    if not clustering_file.exists() and not database_file.exists():
+        raise FileNotFoundError(f"BiG-SCAPE data not found in {clustering_file} or {database_file}")
def __init__(self, cluster_file: str | PathLike, /) -> None:
+    """Initialize the BiG-SCAPE GCF loader.
+
+    Args:
+        cluster_file: Path to the BiG-SCAPE cluster file,
+            the filename has a pattern of "<class>_clustering_c0.xx.tsv".
+    """
+    self.cluster_file: str = str(cluster_file)
+    self._gcf_list = self._parse_gcf(self.cluster_file)
+
def get_gcfs(self, keep_mibig_only: bool = False, keep_singleton: bool = False) -> list[GCF]:
+    """Get all GCF objects.
+
+    Args:
+        keep_mibig_only: True to keep GCFs that contain only MIBiG
+            BGCs.
+        keep_singleton: True to keep singleton GCFs. A singleton GCF
+            is a GCF that contains only one BGC.
+
+    Returns:
+        A list of GCF objects.
+    """
+    gcf_list = self._gcf_list
+    if not keep_mibig_only:
+        gcf_list = [gcf for gcf in gcf_list if not gcf.has_mibig_only()]
+    if not keep_singleton:
+        gcf_list = [gcf for gcf in gcf_list if not gcf.is_singleton()]
+    return gcf_list
+
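A minimal usage sketch (assuming the `BigscapeGCFLoader` class documented here; the import path, clustering file path and cutoff are illustrative):

```python
from nplinker.genomics.bigscape import BigscapeGCFLoader  # import path assumed

loader = BigscapeGCFLoader("data/bigscape/mix_clustering_c0.30.tsv")
gcfs = loader.get_gcfs(keep_mibig_only=False, keep_singleton=False)
print(f"{len(gcfs)} GCFs loaded (MIBiG-only and singleton GCFs filtered out)")
```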
def __init__(self, cluster_file: str | PathLike, /) -> None:
-    """Build a loader for BiG-SCAPE GCF cluster file.
-
-    Args:
-        cluster_file: Path to the BiG-SCAPE cluster file,
-            the filename has a pattern of "<class>_clustering_c0.xx.tsv".
-
-    Attributes:
-        cluster_file: path to the BiG-SCAPE cluster file.
-    """
-    self.cluster_file = str(cluster_file)
-    self._gcf_list = self._parse_gcf(self.cluster_file)
+
def get_gcfs(self, keep_mibig_only: bool = False, keep_singleton: bool = False) -> list[GCF]:
+    """Get all GCF objects.
+
+    Args:
+        keep_mibig_only: True to keep GCFs that contain only MIBiG
+            BGCs.
+        keep_singleton: True to keep singleton GCFs. A singleton GCF
+            is a GCF that contains only one BGC.
+
+    Returns:
+        a list of GCF objects.
+    """
+    gcf_list = self._gcf_list
+    if not keep_mibig_only:
+        gcf_list = [gcf for gcf in gcf_list if not gcf.has_mibig_only()]
+    if not keep_singleton:
+        gcf_list = [gcf for gcf in gcf_list if not gcf.is_singleton()]
+    return gcf_list
+
def get_gcfs(self, keep_mibig_only: bool = False, keep_singleton: bool = False) -> list[GCF]:
-    """Get all GCF objects.
def run_bigscape(
+    antismash_path: str | PathLike,
+    output_path: str | PathLike,
+    extra_params: str,
+):
+    bigscape_py_path = "bigscape.py"
+    logger.info(
+        f'run_bigscape: input="{antismash_path}", output="{output_path}", extra_params={extra_params}"'
+    )
+
+    try:
+        subprocess.run([bigscape_py_path, "-h"], capture_output=True, check=True)
+    except Exception as e:
+        raise Exception(f"Failed to find/run bigscape.py (path={bigscape_py_path}, err={e})") from e
-    Args:
-        keep_mibig_only: True to keep GCFs that contain only MIBiG
-            BGCs.
-        keep_singleton: True to keep singleton GCFs. A singleton GCF
-            is a GCF that contains only one BGC.
+    if not os.path.exists(antismash_path):
+        raise Exception(f'antismash_path "{antismash_path}" does not exist!')
+
+    # configure the IO-related parameters, including pfam_dir
+    args = [bigscape_py_path, "-i", antismash_path, "-o", output_path, "--pfam_dir", PFAM_PATH]
-    Returns:
-        list[GCF]: a list of GCF objects.
-    """
-    gcf_list = self._gcf_list
-    if not keep_mibig_only:
-        gcf_list = [gcf for gcf in gcf_list if not gcf.has_mibig_only()]
-    if not keep_singleton:
-        gcf_list = [gcf for gcf in gcf_list if not gcf.is_singleton()]
-    return gcf_list
+    # append the user supplied params, if any
+    if len(extra_params) > 0:
+        args.extend(extra_params.split(" "))
+
+    logger.info(f"BiG-SCAPE command: {args}")
+    result = subprocess.run(args, stdout=sys.stdout, stderr=sys.stderr, check=True)
+    logger.info(f"BiG-SCAPE completed with return code {result.returncode}")
+    # use subprocess.CompletedProcess.check_returncode() to test if the BiG-SCAPE
+    # process exited successfully. This throws an exception for non-zero returncodes
+    # which will indicate to the PODPDownloader module that something went wrong.
+    result.check_returncode()
+
+    return True
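An illustrative call (requires `bigscape.py` on the PATH and a Pfam database at the configured `PFAM_PATH`; the paths and extra parameters below are examples, not NPLinker defaults):

```python
run_bigscape(
    antismash_path="data/antismash",
    output_path="data/bigscape/bigscape_running_output",
    extra_params="--mibig --mix --include_singletons --cutoffs 0.30",
)
```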
def __init__(self, bgc_id: str, /, *product_prediction: str):
-    """Class to model BGC (biosynthetic gene cluster) data.
-
-    BGC data include both annotations and sequence data. This class is
-    mainly designed to model the annotations or metadata.
-
-    The raw BGC data is stored in GenBank format (.gbk). Additional
-    `GenBank features`_ could be added to the GenBank file to annotate
-    BGCs, e.g. antiSMASH has some self-defined features (like "region") in
-    its output GenBank files.
-
-    The annotations of BGC can be stored in JSON format, which is defined
-    and used by MIBiG.
-
-    Args:
-        bgc_id: BGC identifier, e.g. MIBiG accession, GenBank accession.
-        product_prediction: BGC's (predicted) natural products
-            or product classes.
-
-    Attributes:
-        bgc_id: BGC identifier, e.g. MIBiG accession, GenBank accession.
-        product_prediction: A tuple of (predicted) natural
-            products or product classes of the BGC.
-            For antiSMASH's GenBank data, the feature `region /product`
-            gives product information.
-            For MIBiG metadata, its biosynthetic class provides such info.
-        mibig_bgc_class: A tuple of MIBiG biosynthetic
-            classes to which the BGC belongs.
-            Defaults to None.
-            MIBiG defines 6 major biosynthetic classes for natural products,
-            including "NRP", "Polyketide", "RiPP", "Terpene", "Saccharide"
-            and "Alkaloid". Note that natural products created by all other
-            biosynthetic mechanisms fall under the category "Other".
-            More details see the publication: https://doi.org/10.1186/s40793-018-0318-y.
-        description: Brief description of the BGC.
-            Defaults to None.
-        smiles: A tuple of SMILES formulas of the BGC's
-            products.
-            Defaults to None.
-        antismash_file: The path to the antiSMASH GenBank file.
-            Defaults to None.
-        antismash_id: Identifier of the antiSMASH BGC, referring
-            to the feature `VERSION` of GenBank file.
-            Defaults to None.
-        antismash_region: AntiSMASH BGC region number, referring
-            to the feature `region` of GenBank file.
-            Defaults to None.
-        parents: The set of GCFs that contain the BGC.
-        strain: The strain of the BGC.
-
-    .. GenBank features:
-        https://www.insdc.org/submitting-standards/feature-table/
def is_mibig(self) -> bool:
-    """Check if the BGC is MIBiG reference BGC or not.
-
-    Note:
-        This method evaluates MIBiG BGC based on the pattern that MIBiG
-        BGC names start with "BGC". It might give false positive result.
-
-    Returns:
-        True if it's MIBiG reference BGC
-    """
-    return self.bgc_id.startswith("BGC")
def is_mibig(self) -> bool:
+    """Check if the BGC is a MIBiG reference BGC or not.
+
+    Note:
+        This method evaluates MIBiG BGC based on the pattern that MIBiG
+        BGC names start with "BGC". It might give a false positive result.
+
+    Returns:
+        True if it's a MIBiG reference BGC
+    """
+    return self.bgc_id.startswith("BGC")
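A quick illustration of the naming convention (the accessions are made up):

```python
print(BGC("BGC0000001", "NRP").is_mibig())                  # True
print(BGC("NZ_AZWO01000004.region001", "NRPS").is_mibig())  # False
```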
BiG-SCAPE's BGC class.
+BiG-SCAPE's BGC classes are similar to those defined in MiBIG
+but have more categories (7 classes). More details see:
+https://doi.org/10.1038%2Fs41589-019-0400-9.
BiG-SCAPE's BGC class.
-BiG-SCAPE's BGC classes are similar to those defined in MiBIG
-but have more categories (7 classes). More details see:
-https://doi.org/10.1038%2Fs41589-019-0400-9.
def __init__(self, gcf_id: str, /) -> None:
-    """Class to model gene cluster family (GCF).
-
-    GCF is a group of similar BGCs and generated by clustering BGCs with
-    tools such as BiG-SCAPE and BiG-SLICE.
-
-    Args:
-        gcf_id: id of the GCF object.
-
-    Attributes:
-        gcf_id: id of the GCF object.
-        bgc_ids: a set of BGC ids that belongs to the GCF.
-        bigscape_class: BiG-SCAPE's BGC class.
-            BiG-SCAPE's BGC classes are similar to those defined in MiBIG
-            but have more categories (7 classes). More details see:
-            https://doi.org/10.1038%2Fs41589-019-0400-9.
-    """
-    self.gcf_id = gcf_id
-    self.bgc_ids: set[str] = set()
-    self.bigscape_class: str | None = None
-    self._bgcs: set[BGC] = set()
-    self._strains: StrainCollection = StrainCollection()
def __init__(self, gcf_id: str, /) -> None:
+    """Initialize the GCF object.
+
+    Args:
+        gcf_id: id of the GCF object.
+    """
+    self.gcf_id = gcf_id
+    self.bgc_ids: set[str] = set()
+    self.bigscape_class: str | None = None
+    self._bgcs: set[BGC] = set()
+    self._strains: StrainCollection = StrainCollection()
def add_bgc(self, bgc: BGC) -> None:
-    """Add a BGC object to the GCF."""
-    bgc.parents.add(self)
-    self._bgcs.add(bgc)
-    self.bgc_ids.add(bgc.bgc_id)
-    if bgc.strain is not None:
-        self._strains.add(bgc.strain)
-    else:
-        logger.warning("No strain specified for the BGC %s", bgc.bgc_id)
def add_bgc(self, bgc: BGC) -> None:
+    """Add a BGC object to the GCF."""
+    bgc.parents.add(self)
+    self._bgcs.add(bgc)
+    self.bgc_ids.add(bgc.bgc_id)
+    if bgc.strain is not None:
+        self._strains.add(bgc.strain)
+    else:
+        logger.warning("No strain specified for the BGC %s", bgc.bgc_id)
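A small sketch of building a GCF from two BGC objects (ids are illustrative; `add_bgc` logs a warning here because neither BGC has a strain assigned):

```python
gcf = GCF("1")
for bgc in (BGC("BGC0000001", "NRP"), BGC("NZ_AZWO01000004.region001", "NRPS")):
    gcf.add_bgc(bgc)

print(gcf.bgc_ids)           # {'BGC0000001', 'NZ_AZWO01000004.region001'}
print(gcf.has_mibig_only())  # False - one member is not a MIBiG BGC
print(gcf.is_singleton())    # False - the GCF holds two BGC ids
```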
def has_strain(self, strain: Strain) -> bool:
-    """Check if the given strain exists.
-
-    Args:
-        strain: `Strain` object.
def has_strain(self, strain: Strain) -> bool:
+    """Check if the given strain exists.
-    Returns:
-        True when the given strain exist.
-    """
-    return strain in self._strains
+    Args:
+        strain: `Strain` object.
+
+    Returns:
+        True when the given strain exists.
+    """
+    return strain in self._strains
def has_mibig_only(self) -> bool:
-    """Check if the GCF's children are only MIBiG BGCs.
-
-    Returns:
-        True if `GCF.bgc_ids` are only MIBiG BGC ids.
-    """
-    return all(map(lambda id: id.startswith("BGC"), self.bgc_ids))
def has_mibig_only(self) -> bool:
+    """Check if the GCF's children are only MIBiG BGCs.
+
+    Returns:
+        True if `GCF.bgc_ids` are only MIBiG BGC ids.
+    """
+    return all(map(lambda id: id.startswith("BGC"), self.bgc_ids))
def is_singleton(self) -> bool:
-    """Check if the GCF contains only one BGC.
-
-    Returns:
-        True if `GCF.bgc_ids` contains only one BGC id.
-    """
-    return len(self.bgc_ids) == 1
def is_singleton(self) -> bool:
+    """Check if the GCF contains only one BGC.
+
+    Returns:
+        True if `GCF.bgc_ids` contains only one BGC id.
+    """
+    return len(self.bgc_ids) == 1
def __init__(self, data_dir: str):
-    """Abstract base class for BGC loader.
-
-    Args:
-        data_dir: Path to directory that contains BGC metadata files
-            (.json) or full data genbank files (.gbk).
-    """
-    self.data_dir = data_dir
def __init__(self, data_dir: str):
+    """Initialize the BGC loader.
+
+    Args:
+        data_dir: Path to directory that contains BGC metadata files
+            (.json) or full data genbank files (.gbk).
+    """
+    self.data_dir = data_dir
@abstractmethod
-def get_files(self) -> dict[str, str]:
-    """Get path to BGC files.
-
-    Returns:
-        The key is BGC name and value is path to BGC file
-    """
@abstractmethod
+def get_files(self) -> dict[str, str]:
+    """Get path to BGC files.
+
+    Returns:
+        The key is BGC name and value is path to BGC file
+    """
@abstractmethod
-def get_gcfs(self, keep_mibig_only: bool, keep_singleton: bool) -> Sequence[GCF]:
-    """Get GCF objects.
-
-    Args:
-        keep_mibig_only: True to keep GCFs that contain only MIBiG
-            BGCs.
-        keep_singleton: True to keep singleton GCFs. A singleton GCF
-            is a GCF that contains only one BGC.
-
-    Returns:
-        A list of GCF objects
-    """
@abstractmethod
+def get_gcfs(self, keep_mibig_only: bool, keep_singleton: bool) -> Sequence[GCF]:
+    """Get GCF objects.
+
+    Args:
+        keep_mibig_only: True to keep GCFs that contain only MIBiG
+            BGCs.
+        keep_singleton: True to keep singleton GCFs. A singleton GCF
+            is a GCF that contains only one BGC.
+
+    Returns:
+        A list of GCF objects
+    """
    # sort mappings by genome_id and construct json data
    genome_bgc_mappings = dict(sorted(genome_bgc_mappings.items()))
-    json_data = [{"genome_ID": k, "BGC_ID": v} for k, v in genome_bgc_mappings.items()]
-    json_data = {"mappings": json_data, "version": "1.0"}
+    json_data_mappings = [{"genome_ID": k, "BGC_ID": v} for k, v in genome_bgc_mappings.items()]
+    json_data = {"mappings": json_data_mappings, "version": "1.0"}
    # validate json data
    validate(instance=json_data, schema=GENOME_BGC_MAPPINGS_SCHEMA)
@@ -1732,10 +1771,12 @@
-
A tuple of two lists of BGC objects. The
-first list contains BGC objects that are updated with Strain object;
-the second list contains BGC objects that are not updated with
-Strain object because no Strain object is found.
+
A tuple of two lists of BGC objects,
+
+
the first list contains BGC objects that are updated with Strain object;
+
the second list contains BGC objects that are not updated with
+ Strain object because no Strain object is found.
def add_strain_to_bgc(strains: StrainCollection, bgcs: list[BGC]) -> tuple[list[BGC], list[BGC]]:
    """Assign a Strain object to `BGC.strain` for input BGCs.

    BGC id is used to find the corresponding Strain object. It's possible that
@@ -1822,35 +1864,36 @@
        bgcs: A list of BGC objects.

    Returns:
-        A tuple of two lists of BGC objects. The
-        first list contains BGC objects that are updated with Strain object;
-        the second list contains BGC objects that are not updated with
-        Strain object because no Strain object is found.
-
-    Raises:
-        ValueError: Multiple strain objects found for a BGC id.
-    """
-    bgc_with_strain = []
-    bgc_without_strain = []
-    for bgc in bgcs:
-        try:
-            strain_list = strains.lookup(bgc.bgc_id)
-        except ValueError:
-            bgc_without_strain.append(bgc)
-            continue
-        if len(strain_list) > 1:
-            raise ValueError(
-                f"Multiple strain objects found for BGC id '{bgc.bgc_id}'."
-                f"BGC object accept only one strain."
-            )
-        bgc.strain = strain_list[0]
-        bgc_with_strain.append(bgc)
-
-    logger.info(
-        f"{len(bgc_with_strain)} BGC objects updated with Strain object.\n"
-        f"{len(bgc_without_strain)} BGC objects not updated with Strain object."
-    )
-    return bgc_with_strain, bgc_without_strain
+        A tuple of two lists of BGC objects,
+
+        - the first list contains BGC objects that are updated with Strain object;
+        - the second list contains BGC objects that are not updated with
+            Strain object because no Strain object is found.
+
+    Raises:
+        ValueError: Multiple strain objects found for a BGC id.
+    """
+    bgc_with_strain = []
+    bgc_without_strain = []
+    for bgc in bgcs:
+        try:
+            strain_list = strains.lookup(bgc.bgc_id)
+        except ValueError:
+            bgc_without_strain.append(bgc)
+            continue
+        if len(strain_list) > 1:
+            raise ValueError(
+                f"Multiple strain objects found for BGC id '{bgc.bgc_id}'."
+                f"BGC object accept only one strain."
+            )
+        bgc.strain = strain_list[0]
+        bgc_with_strain.append(bgc)
+
+    logger.info(
+        f"{len(bgc_with_strain)} BGC objects updated with Strain object.\n"
+        f"{len(bgc_without_strain)} BGC objects not updated with Strain object."
+    )
+    return bgc_with_strain, bgc_without_strain
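A small sketch of the id-based lookup (assuming `Strain` and `StrainCollection` from `nplinker.strain`; the ids are illustrative):

```python
from nplinker.strain import Strain, StrainCollection  # import path assumed

strains = StrainCollection()
strains.add(Strain("GCF_000514775.1"))

bgcs = [BGC("GCF_000514775.1", "NRPS"), BGC("BGC0000001", "NRP")]
with_strain, without_strain = add_strain_to_bgc(strains, bgcs)
print(len(with_strain), len(without_strain))  # 1 1 - only the first BGC id matches a strain
```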
@@ -1941,11 +1984,14 @@
-
The first list contains GCF objects that are updated with BGC objects;
-The second list contains GCF objects that are not updated with BGC objects
-because no BGC objects are found;
-The dictionary contains GCF objects as keys and a set of ids of missing
-BGC objects as values.
+
A tuple of two lists and a dictionary,
+
+
The first list contains GCF objects that are updated with BGC objects;
+
The second list contains GCF objects that are not updated with BGC objects
+ because no BGC objects are found;
+
The dictionary contains GCF objects as keys and a set of ids of missing
+ BGC objects as values.
def add_bgc_to_gcf(
-    bgcs: list[BGC], gcfs: list[GCF]
-) -> tuple[list[GCF], list[GCF], dict[GCF, set[str]]]:
-    """Add BGC objects to GCF object based on GCF's BGC ids.
-
-    The attribute of `GCF.bgc_ids` contains the ids of BGC objects. These ids
-    are used to find BGC objects from the input `bgcs` list. The found BGC
-    objects are added to the `bgcs` attribute of GCF object. It is possible that
-    some BGC ids are not found in the input `bgcs` list, and so their BGC
-    objects are missing in the GCF object.
-
-    This method changes the lists `bgcs` and `gcfs` in place.
-
-    Args:
-        bgcs: A list of BGC objects.
-        gcfs: A list of GCF objects.
-
-    Returns:
-        The first list contains GCF objects that are updated with BGC objects;
-        The second list contains GCF objects that are not updated with BGC objects
-        because no BGC objects are found;
-        The dictionary contains GCF objects as keys and a set of ids of missing
-        BGC objects as values.
-    """
-    bgc_dict = {bgc.bgc_id: bgc for bgc in bgcs}
-    gcf_with_bgc = []
-    gcf_without_bgc = []
-    gcf_missing_bgc: dict[GCF, set[str]] = {}
-    for gcf in gcfs:
-        for bgc_id in gcf.bgc_ids:
-            try:
-                bgc = bgc_dict[bgc_id]
-            except KeyError:
-                if gcf not in gcf_missing_bgc:
-                    gcf_missing_bgc[gcf] = {bgc_id}
-                else:
-                    gcf_missing_bgc[gcf].add(bgc_id)
-                continue
-            gcf.add_bgc(bgc)
-
-        if gcf.bgcs:
-            gcf_with_bgc.append(gcf)
-        else:
-            gcf_without_bgc.append(gcf)
-
-    logger.info(
-        f"{len(gcf_with_bgc)} GCF objects updated with BGC objects.\n"
-        f"{len(gcf_without_bgc)} GCF objects not updated with BGC objects.\n"
-        f"{len(gcf_missing_bgc)} GCF objects have missing BGC objects."
-    )
-    return gcf_with_bgc, gcf_without_bgc, gcf_missing_bgc
def add_bgc_to_gcf(
+    bgcs: list[BGC], gcfs: list[GCF]
+) -> tuple[list[GCF], list[GCF], dict[GCF, set[str]]]:
+    """Add BGC objects to GCF object based on GCF's BGC ids.
+
+    The attribute of `GCF.bgc_ids` contains the ids of BGC objects. These ids
+    are used to find BGC objects from the input `bgcs` list. The found BGC
+    objects are added to the `bgcs` attribute of GCF object. It is possible that
+    some BGC ids are not found in the input `bgcs` list, and so their BGC
+    objects are missing in the GCF object.
+
+    This method changes the lists `bgcs` and `gcfs` in place.
+
+    Args:
+        bgcs: A list of BGC objects.
+        gcfs: A list of GCF objects.
+
+    Returns:
+        A tuple of two lists and a dictionary,
+
+        - The first list contains GCF objects that are updated with BGC objects;
+        - The second list contains GCF objects that are not updated with BGC objects
+            because no BGC objects are found;
+        - The dictionary contains GCF objects as keys and a set of ids of missing
+            BGC objects as values.
+    """
+    bgc_dict = {bgc.bgc_id: bgc for bgc in bgcs}
+    gcf_with_bgc = []
+    gcf_without_bgc = []
+    gcf_missing_bgc: dict[GCF, set[str]] = {}
+    for gcf in gcfs:
+        for bgc_id in gcf.bgc_ids:
+            try:
+                bgc = bgc_dict[bgc_id]
+            except KeyError:
+                if gcf not in gcf_missing_bgc:
+                    gcf_missing_bgc[gcf] = {bgc_id}
+                else:
+                    gcf_missing_bgc[gcf].add(bgc_id)
+                continue
+            gcf.add_bgc(bgc)
+
+        if gcf.bgcs:
+            gcf_with_bgc.append(gcf)
+        else:
+            gcf_without_bgc.append(gcf)
+
+    logger.info(
+        f"{len(gcf_with_bgc)} GCF objects updated with BGC objects.\n"
+        f"{len(gcf_without_bgc)} GCF objects not updated with BGC objects.\n"
+        f"{len(gcf_missing_bgc)} GCF objects have missing BGC objects."
+    )
+    return gcf_with_bgc, gcf_without_bgc, gcf_missing_bgc
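A small sketch of the id matching and the missing-BGC bookkeeping (ids are illustrative; `add_bgc` will warn that the BGC has no strain):

```python
gcf = GCF("1")
gcf.bgc_ids = {"BGC0000001", "BGC0000002"}

bgcs = [BGC("BGC0000001", "NRP")]
with_bgc, without_bgc, missing = add_bgc_to_gcf(bgcs, [gcf])
print(len(with_bgc), len(without_bgc))  # 1 0
print(missing)  # maps the GCF to {'BGC0000002'}, the id not found in the input bgcs
```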
@@ -2125,9 +2175,12 @@
-
tuple[list[BGC], StrainCollection]: The first is a list of MIBiG BGC
-objects used in the GCFs; the second is a StrainCollection object
-that contains all Strain objects used in the GCFs.
+
A tuple of two objects,
+
+
the first is a list of MIBiG BGC objects used in the GCFs;
+
the second is a StrainCollection object that contains all Strain objects used in the
+GCFs.
def get_mibig_from_gcf(gcfs: list[GCF]) -> tuple[list[BGC], StrainCollection]:
-    """Get MIBiG BGCs and strains from GCF objects.
-
-    Args:
-        gcfs: A list of GCF objects.
def get_mibig_from_gcf(gcfs: list[GCF]) -> tuple[list[BGC], StrainCollection]:
+    """Get MIBiG BGCs and strains from GCF objects.
-    Returns:
-        tuple[list[BGC], StrainCollection]: The first is a list of MIBiG BGC
-        objects used in the GCFs; the second is a StrainCollection object
-        that contains all Strain objects used in the GCFs.
-    """
-    mibig_bgcs_in_use = []
-    mibig_strains_in_use = StrainCollection()
-    for gcf in gcfs:
-        for bgc in gcf.bgcs:
-            if bgc.is_mibig():
-                mibig_bgcs_in_use.append(bgc)
-                if bgc.strain is not None:
-                    mibig_strains_in_use.add(bgc.strain)
-    return mibig_bgcs_in_use, mibig_strains_in_use
+    Args:
+        gcfs: A list of GCF objects.
+
+    Returns:
+        A tuple of two objects,
+
+        - the first is a list of MIBiG BGC objects used in the GCFs;
+        - the second is a StrainCollection object that contains all Strain objects used in the
+            GCFs.
+    """
+    mibig_bgcs_in_use = []
+    mibig_strains_in_use = StrainCollection()
+    for gcf in gcfs:
+        for bgc in gcf.bgcs:
+            if bgc.is_mibig():
+                mibig_bgcs_in_use.append(bgc)
+                if bgc.strain is not None:
+                    mibig_strains_in_use.add(bgc.strain)
+    return mibig_bgcs_in_use, mibig_strains_in_use
def extract_mappings_strain_id_original_genome_id(
-    podp_project_json_file: str | PathLike
-) -> dict[str, set[str]]:
-    """Extract mappings "strain id <-> original genome id".
-
-    Args:
-        podp_project_json_file: The path to the PODP project
-            JSON file.
-
-    Returns:
-        Key is strain id and value is a set of original genome ids.
-
-    Notes:
-        The `podp_project_json_file` is the project JSON file downloaded from
-        PODP platform. For example, for project MSV000079284, its json file is
-        https://pairedomicsdata.bioinformatics.nl/api/projects/4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4.
-    """
-    mappings_dict = {}
-    with open(podp_project_json_file, "r") as f:
-        json_data = json.load(f)
-
-    validate_podp_json(json_data)
-
-    for record in json_data["genomes"]:
-        strain_id = record["genome_label"]
-        genome_id = get_best_available_genome_id(record["genome_ID"])
-        if genome_id is None:
-            logger.warning("Failed to extract genome ID from genome with label %s", strain_id)
-            continue
-        if strain_id in mappings_dict:
-            mappings_dict[strain_id].add(genome_id)
-        else:
-            mappings_dict[strain_id] = {genome_id}
-    return mappings_dict
def extract_mappings_strain_id_original_genome_id(
+    podp_project_json_file: str | PathLike,
+) -> dict[str, set[str]]:
+    """Extract mappings "strain id <-> original genome id".
+
+    Args:
+        podp_project_json_file: The path to the PODP project
+            JSON file.
+
+    Returns:
+        Key is strain id and value is a set of original genome ids.
+
+    Notes:
+        The `podp_project_json_file` is the project JSON file downloaded from
+        PODP platform. For example, for project MSV000079284, its json file is
+        https://pairedomicsdata.bioinformatics.nl/api/projects/4b29ddc3-26d0-40d7-80c5-44fb6631dbf9.4.
+    """
+    mappings_dict: dict[str, set[str]] = {}
+    with open(podp_project_json_file, "r") as f:
+        json_data = json.load(f)
+
+    validate_podp_json(json_data)
+
+    for record in json_data["genomes"]:
+        strain_id = record["genome_label"]
+        genome_id = get_best_available_genome_id(record["genome_ID"])
+        if genome_id is None:
+            logger.warning("Failed to extract genome ID from genome with label %s", strain_id)
+            continue
+        if strain_id in mappings_dict:
+            mappings_dict[strain_id].add(genome_id)
+        else:
+            mappings_dict[strain_id] = {genome_id}
+    return mappings_dict
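An illustrative call (the file name follows the `paired_datarecord_<podp_id>.json` pattern used by `arrange_podp_project_json`; the path and the returned labels are hypothetical):

```python
mappings = extract_mappings_strain_id_original_genome_id(
    "downloads/paired_datarecord_example.json"
)
# e.g. {"strain_label_1": {"GCF_000514775.1"}, ...}
for strain_id, genome_ids in mappings.items():
    print(strain_id, sorted(genome_ids))
```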