Skip to content

Commit

Permalink
Update format_ncbi_taxonomy.py
Browse files Browse the repository at this point in the history
add quotes around name for generated enum
  • Loading branch information
spiekos authored Jul 4, 2024
1 parent 26f8251 commit 9a9fb70
Showing 1 changed file with 12 additions and 6 deletions.
18 changes: 12 additions & 6 deletions scripts/biomedical/NCBI_Taxonomy/scripts/format_ncbi_taxonomy.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
"""
Author: Pradeep Kumar Krishnaswamy
Date: 18-Apr-2024
Edited By: Samantha Piekos
Last Edited: 03-Jul-2024
Name: format_ncbi_taxonomy
Description: Cleans and formats the NCBI Taxonomy data.
@source data: Download taxdump.tar.Z from the NCBI Taxonomy FTP download page
Expand All @@ -33,13 +35,14 @@
from absl import flags
from absl import logging

SOURCE_FILE_PATH = None
OUTPUT_FILE_PATH = None
SOURCE_FILE_PATH = 'input/'
OUTPUT_FILE_PATH = 'output/'
OUTPUT_MCF_FILE = 'ncbi_taxonomy_schema_enum.mcf'
OUTPUT_TAXID_DCID_MAPPING_FILE = 'tax_id_dcid_mapping.txt'
OUTPUT_NCBI_TAXONOMY_CSV = 'ncbi_taxonomy.csv'

FIRST_MCF_ENTRY = """Node:dcid:BiologicalTaxonomicDivisionEnum
FIRST_MCF_ENTRY = """# This schema file is generated by format_ncbi_taxonomy.py
Node:dcid:BiologicalTaxonomicDivisionEnum
name: "BiologicalTaxonomicDivisionEnum"
typeOf: schema:Class
subClassOf: schema:Enumeration
Expand All @@ -50,7 +53,7 @@
DIVISION_COL = [
'division_code', 'division_acronym', 'division_name', 'comments'
]
DIVISION_MCF = "Node:dcid:BiologicalTaxonomicDivision{PascalCaseDivisionName}\nname: {division_name}\ntypeOf: dcs:BiologicalTaxonomicDivisionEnum\nacronym: {division_acronym}\ndescription: {comments}\n\n" # pylint: disable=line-too-long
DIVISION_MCF = 'Node:dcid:BiologicalTaxonomicDivision{PascalCaseDivisionName}\nname: "{division_name}"\ntypeOf: dcs:BiologicalTaxonomicDivisionEnum\nabbreviation: "{division_acronym}"\ndescription: "{comments}"\n\n' # pylint: disable=line-too-long

DIVISION_DICT = {}
NCBI_TAXONOMY_CSV_HEADER = [
Expand Down Expand Up @@ -158,7 +161,7 @@
subClassOf: schema:Enumeration
description: "A host is a larger organism that harbors a smaller organism. The relationship between the two organisms can be parasitic, mutualistic, or commensalist. This encodes the type of larger organism that is serving as a host."
\n"""
HOST_MCF = "Node:dcid:BiologicalHost{biological_case_item}\nname: {item}\ntypeOf: dcs:BiologicalHostEnum\n\n" # pylint: disable=line-too-long
HOST_MCF = 'Node:dcid:BiologicalHost{biological_case_item}\nname: "{item}"\ntypeOf: dcs:BiologicalHostEnum\n\n' # pylint: disable=line-too-long
HOST_DICT = {}

NODES_DMP = 'nodes.dmp'
Expand All @@ -174,7 +177,7 @@
descriptionUrl: "https://en.wikipedia.org/wiki/Taxonomic_rank"
\n
"""
NODES_MCF = "Node:dcid:BiologicalTaxonomicRank{rank_case}\nname: {rank}\ntypeOf: dcs:BiologicalTaxonomicRankEnum\n\n" # pylint: disable=line-too-long
NODES_MCF = 'Node:dcid:BiologicalTaxonomicRank{rank_case}\nname: "{rank}"\ntypeOf: dcs:BiologicalTaxonomicRankEnum\n\n' # pylint: disable=line-too-long
NODES_DICT = {}

CATEGORIES_DMP = 'categories.dmp'
Expand Down Expand Up @@ -511,6 +514,8 @@ def update_nodes_enum(self, df: pd.DataFrame) -> None:
current_node = FINAL_NCBI_TAXONOMY[row['tax_id']]
current_node['parentDcid'] = TAX_ID_DCID_MAPPING[
row["parent_tax_id"]]
rank_case = string.capwords(row["rank"]).replace(" ", "")
current_node['taxonRank'] = "dcs:BiologicalTaxonomicRank" + rank_case
current_node['division'] = DIVISION_DICT[row["division_code"]]
current_node['hasInheritedDivision'] = True if row[
'inherited_division'] == 1 else False
Expand Down Expand Up @@ -654,3 +659,4 @@ def main(_):

if __name__ == "__main__":
app.run(main)

0 comments on commit 9a9fb70

Please sign in to comment.