diff --git a/scripts/biomedical/NCBI_Gene/scripts/format_ncbi_gene.py b/scripts/biomedical/NCBI_Gene/scripts/format_ncbi_gene.py index 233ba8cdf..5788841fc 100644 --- a/scripts/biomedical/NCBI_Gene/scripts/format_ncbi_gene.py +++ b/scripts/biomedical/NCBI_Gene/scripts/format_ncbi_gene.py @@ -321,37 +321,54 @@ } GENE_QUALIFIER_DICT = { - 'NOT acts_upstream_of': 'dcs:GOTermQualifierNotActsUpstreamOf', + 'NOT acts_upstream_of': + 'dcs:GOTermQualifierNotActsUpstreamOf', 'NOT acts_upstream_of_or_within': - 'dcs:GOTermQualifierNotActsUpstreamOfOrWithin', + 'dcs:GOTermQualifierNotActsUpstreamOfOrWithin', 'NOT acts_upstream_of_or_within_negative_effect': - 'dcs:GOTermQualifierNotActsUpstreamOfOrWithinNegativeEffect', + 'dcs:GOTermQualifierNotActsUpstreamOfOrWithinNegativeEffect', 'NOT acts_upstream_of_or_within_positive_effect': - 'dcs:GOTermQualifierNotActsUpstreamOfOrWithinPositiveEffect', - 'NOT colocalizes_with': 'dcs:GOTermQualifierNotColocalizesWith', - 'NOT contributes_to': 'dcs:GOTermQualifierNotContributesTo', - 'NOT enables': 'dcs:GOTermQualifierNotEnables', - 'NOT involved_in': 'dcs:GOTermQualifierNotInvolvedIn', - 'NOT is_active_in': 'dcs:GOTermQualifierNotIsActiveIn', - 'NOT located_in': 'dcs:GOTermQualifierNotLocatedIn', - 'NOT part_of': 'dcs:GOTermQualifierNotPartOf', - 'acts_upstream_of': 'dcs:GOTermQualifierActsUpstreamOf', + 'dcs:GOTermQualifierNotActsUpstreamOfOrWithinPositiveEffect', + 'NOT colocalizes_with': + 'dcs:GOTermQualifierNotColocalizesWith', + 'NOT contributes_to': + 'dcs:GOTermQualifierNotContributesTo', + 'NOT enables': + 'dcs:GOTermQualifierNotEnables', + 'NOT involved_in': + 'dcs:GOTermQualifierNotInvolvedIn', + 'NOT is_active_in': + 'dcs:GOTermQualifierNotIsActiveIn', + 'NOT located_in': + 'dcs:GOTermQualifierNotLocatedIn', + 'NOT part_of': + 'dcs:GOTermQualifierNotPartOf', + 'acts_upstream_of': + 'dcs:GOTermQualifierActsUpstreamOf', 'acts_upstream_of_negative_effect': - 'dcs:GOTermQualifierActsUpstreamOfNegativeEffect', - 
'acts_upstream_of_or_within': 'dcs:GOTermQualifierActsUpstreamOfOrWithin', + 'dcs:GOTermQualifierActsUpstreamOfNegativeEffect', + 'acts_upstream_of_or_within': + 'dcs:GOTermQualifierActsUpstreamOfOrWithin', 'acts_upstream_of_or_within_negative_effect': - 'dcs:GOTermQualifierActsUpstreamOfOrWithinNegativeEffect', + 'dcs:GOTermQualifierActsUpstreamOfOrWithinNegativeEffect', 'acts_upstream_of_or_within_positive_effect': - 'dcs:GOTermQualifierActsUpstreamOfOrWithinPositiveEffect', + 'dcs:GOTermQualifierActsUpstreamOfOrWithinPositiveEffect', 'acts_upstream_of_positive_effect': - 'dcs:GOTermQualifierActsUpstreamOfPositiveEffect', - 'colocalizes_with': 'dcs:GOTermQualifierColocalizesWith', - 'contributes_to': 'dcs:GOTermQualifierContributesTo', - 'enables': 'dcs:GOTermQualifierEnables', - 'involved_in': 'dcs:GOTermQualifierInvolvedIn', - 'is_active_in': 'dcs:GOTermQualifierIsActiveIn', - 'located_in': 'dcs:GOTermQualifierLocatedIn', - 'part_of': 'dcs:GOTermQualifierPartOf' + 'dcs:GOTermQualifierActsUpstreamOfPositiveEffect', + 'colocalizes_with': + 'dcs:GOTermQualifierColocalizesWith', + 'contributes_to': + 'dcs:GOTermQualifierContributesTo', + 'enables': + 'dcs:GOTermQualifierEnables', + 'involved_in': + 'dcs:GOTermQualifierInvolvedIn', + 'is_active_in': + 'dcs:GOTermQualifierIsActiveIn', + 'located_in': + 'dcs:GOTermQualifierLocatedIn', + 'part_of': + 'dcs:GOTermQualifierPartOf' } GENE_CATEGORY_DICT = { @@ -522,8 +539,7 @@ def process_gene_info(self, file_to_process: str, return_dict) -> None: feature_type_entries = set() unique_dbXrefs_list = [] unique_dbXrefs = set() - input_file = path_join(SOURCE_FILE_PATH + '/gene_info', - file_to_process) + input_file = path_join(SOURCE_FILE_PATH + '/gene_info', file_to_process) with open( path_join(OUTPUT_FILE_PATH + '/gene_info', file_to_process.replace('txt', 'csv')), @@ -696,9 +712,7 @@ def save_gene_info_files(self, mcf_file_path: str): for feature_type in FEATURE_TYPE_ENTRIES: ftype = json.loads(feature_type) mfc_entry = 
NCBI_GENE_SCHEMA_EMUN_MCF.format( - type=ftype['type'], - item=ftype['entry'], - name=ftype['name']) + type=ftype['type'], item=ftype['entry'], name=ftype['name']) file.write(mfc_entry) def _get_feature_type_list(self, value): @@ -787,7 +801,7 @@ def process_csv_file(self) -> None: else: Gene_PubMedID[input_row[1]] = { 'dcid': - f"dcid:{GENE_ID_DCID_MAPPING[input_row[1]]}", + f"dcid:{GENE_ID_DCID_MAPPING[input_row[1]]}", 'PubMedID': [input_row[2]] } #Gene_PubMedID[input_row[1]].append() @@ -937,7 +951,7 @@ def process_csv_file(self) -> None: if input_row[4] in GENE_ID_DCID_MAPPING: Gene_orthologs[input_row[1]] = { 'dcid': - f"dcid:{GENE_ID_DCID_MAPPING[input_row[1]]}", + f"dcid:{GENE_ID_DCID_MAPPING[input_row[1]]}", 'ortholog': [ f"dcid:{GENE_ID_DCID_MAPPING[input_row[4]]}" ] @@ -1019,7 +1033,7 @@ def process_csv_file(self) -> None: try: Gene_group[input_row[1]] = { 'GeneID': - f'dcid:{GENE_ID_DCID_MAPPING[input_row[1]]}', + f'dcid:{GENE_ID_DCID_MAPPING[input_row[1]]}', column_name: [ f'dcid:{GENE_ID_DCID_MAPPING[input_row[4]]}' ] @@ -1119,7 +1133,8 @@ def parse_gene_mim2gene_row(self, dcid, input_row): if len(input_row[3]) > 1: Source_lst = [ f'{GENE_OMIM_SOURCE_DICT.get(x.strip(), x.strip())}' - for x in input_row[3].strip().split(';') if len(x) > 1 + for x in input_row[3].strip().split(';') + if len(x) > 1 ] row['Source'] = ",".join(Source_lst) @@ -1562,7 +1577,7 @@ def main(_): ] for neighbors_file in gene_Neighbors_shard_files: neighbors_proc = Process(target=GeneNeighbors().process_csv_file, - args=(neighbors_file, )) + args=(neighbors_file,)) procs_Neighbors.append(neighbors_proc) neighbors_proc.start() @@ -1575,7 +1590,7 @@ def main(_): f for f in listdir(join(SOURCE_FILE_PATH, 'gene2go')) ] for go_file in gene_Go_shard_files: - go_proc = Process(target=Gene2Go().process_csv_file, args=(go_file, )) + go_proc = Process(target=Gene2Go().process_csv_file, args=(go_file,)) procs_Go.append(go_proc) go_proc.start() @@ -1589,7 +1604,7 @@ def main(_): ] for 
accession_file in gene_Accession_shard_files: accession_proc = Process(target=Gene2Accession().process_csv_file, - args=(accession_file, )) + args=(accession_file,)) procs_Accession.append(accession_proc) accession_proc.start() diff --git a/scripts/biomedical/NCBI_Gene/scripts/format_ncbi_gene_test.py b/scripts/biomedical/NCBI_Gene/scripts/format_ncbi_gene_test.py index 98c5679d6..2efe95cbf 100644 --- a/scripts/biomedical/NCBI_Gene/scripts/format_ncbi_gene_test.py +++ b/scripts/biomedical/NCBI_Gene/scripts/format_ncbi_gene_test.py @@ -15,12 +15,19 @@ Author: Pradeep Kumar Krishnaswamy Date: 20-Sep-2024 """ - +import os +import sys import unittest -from .format_ncbi_gene import * + +_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(_SCRIPT_DIR) + +from . import format_ncbi_gene as gene  # alias the tests call as gene.<Class>() + +from .format_ncbi_gene import GeneInfo, Gene2Accession, Gene2Go, GeneNeighbors, GeneMim2gene, GeneRifs_Basic, Gene2Ensembl -class NCBIGeneTest(unittest.TestCase): +class NcbiGeneTest(unittest.TestCase): def test_check_gene_info_parser(self): """ Unit test to parse gene_info row @@ -30,36 +37,51 @@ def test_check_gene_info_parser(self): unique_dbXrefs_list = [] unique_dbXrefs = set() input_row = [ - '2010893', '33370007', 'rbcL', 'CGW41_pgp092', 'Lo_mil1Pt0025', - '-', '-', '-', + '2010893', '33370007', 'rbcL', 'CGW41_pgp092', 'Lo_mil1Pt0025', '-', + '-', '-', 'ribulose-1,5-bisphosphate carboxylase/oxygenase large subunit', 'protein-coding', '-', '-', '-', '-', '20230531', '-' ] expected_result = { - 'taxID': 'dcid:bio/LobeliaMildbraedii', - 'GeneID': '33370007', - 'dcid': 'bio/33370007', - 'Symbol': '"rbcL"', - 'synonym': 'Lo_mil1Pt0025', - 'chromosome': '', - 'map_location': '', + 'taxID': + 'dcid:bio/LobeliaMildbraedii', + 'GeneID': + '33370007', + 'dcid': + 'bio/33370007', + 'Symbol': + '"rbcL"', + 'synonym': + 'Lo_mil1Pt0025', + 'chromosome': + '', + 'map_location': + '', 'description': - '"ribulose-15-bisphosphate carboxylaseoxygenase large subunit"', - 
'type_of_gene': '', - 'Full_name_from_nomenclature_authority': '', - 'Nomenclature_status': '', - 'Other_designations': '', - 'Modification_date': '2023-05-31', - 'regulatory': '', - 'misc_feature': '', - 'misc_recomb': '' + '"ribulose-15-bisphosphate carboxylaseoxygenase large subunit"', + 'type_of_gene': + '', + 'Full_name_from_nomenclature_authority': + '', + 'Nomenclature_status': + '', + 'Other_designations': + '', + 'Modification_date': + '2023-05-31', + 'regulatory': + '', + 'misc_feature': + '', + 'misc_recomb': + '' } - row = GeneInfo().parse_gene_info_row('dcid:bio/LobeliaMildbraedii', - gene_id_dcid_mapping, - feature_type_entries, - unique_dbXrefs_list, - unique_dbXrefs, input_row) + row = gene.GeneInfo().parse_gene_info_row('dcid:bio/LobeliaMildbraedii', + gene_id_dcid_mapping, + feature_type_entries, + unique_dbXrefs_list, + unique_dbXrefs, input_row) self.assertDictEqual(row, expected_result) def test_check_gene_neighbors_parser(self): @@ -83,8 +105,8 @@ def test_check_gene_neighbors_parser(self): 'chromosome': 'chr9.part0', 'assembly': '"Reference PAN1.0 Primary Assembly"' } - row = GeneNeighbors().parse_gene_neighbors_row('bio/122811710', - input_row) + row = gene.GeneNeighbors().parse_gene_neighbors_row( + 'bio/122811710', input_row) self.assertDictEqual(row, expected_result) def test_check_gene_mim2gene_parser(self): @@ -102,8 +124,8 @@ def test_check_gene_mim2gene_parser(self): 'dcid': 'bio/ncbi_100652748_omim_620758', 'MedGenCUI_dcid': '' } - row = GeneMim2gene().parse_gene_mim2gene_row('bio/ncbi_100652748', - input_row) + row = gene.GeneMim2gene().parse_gene_mim2gene_row( + 'bio/ncbi_100652748', input_row) self.assertDictEqual(row, expected_result) def test_check_gene_gene2go_parser(self): @@ -111,21 +133,28 @@ def test_check_gene_gene2go_parser(self): """ input_row = [ '75485', '128904962', 'GO:0048025', 'IEA', 'involved_in', - 'negative regulation of mRNA splicing, via spliceosome', - '30032202', 'Process' + 'negative regulation of mRNA 
splicing, via spliceosome', '30032202', + 'Process' ] expected_result = { - 'GeneID': 'dcid:bio/128904962', - 'dcid': 'bio/GO_0048025', - 'GO_ID': 'GO:0048025', - 'Evidence': 'dcs:GOTermEvidenceCodeElectronicAnnotation', - 'Qualifier': 'dcs:GOTermQualifierInvolvedIn', + 'GeneID': + 'dcid:bio/128904962', + 'dcid': + 'bio/GO_0048025', + 'GO_ID': + 'GO:0048025', + 'Evidence': + 'dcs:GOTermEvidenceCodeElectronicAnnotation', + 'Qualifier': + 'dcs:GOTermQualifierInvolvedIn', 'GO_term': - '"negative regulation of mRNA splicing, via spliceosome"', - 'PubMed': '"30032202"', - 'Category': 'dcs:GeneOntologyCategoryBiologicalProcess' + '"negative regulation of mRNA splicing, via spliceosome"', + 'PubMed': + '"30032202"', + 'Category': + 'dcs:GeneOntologyCategoryBiologicalProcess' } - row = Gene2Go().parse_gene_gene2go_row('bio/128904962', input_row) + row = gene.Gene2Go().parse_gene_gene2go_row('bio/128904962', input_row) self.assertDictEqual(row, expected_result) def test_check_gene_gene2ensembl_parser(self): @@ -145,7 +174,7 @@ def test_check_gene_gene2ensembl_parser(self): 'Ensembl_protein_identifier': '' } - row = Gene2Ensembl().parse_gene_gene2ensembl_row( + row = gene.Gene2Ensembl().parse_gene_gene2ensembl_row( 'bio/ncbi_113218477', input_row) self.assertDictEqual(row, expected_result) @@ -158,21 +187,21 @@ def test_check_gene_rifs_basic_parser(self): ] expected_result = { 'GeneID': - 'dcid:bio/ncbi_3188', + 'dcid:bio/ncbi_3188', 'dcid': - 'bio/ncbi_3188_16171461', + 'bio/ncbi_3188_16171461', 'name': - '"ncbi 3188 PubMed 16171461 Reference Into Function"', + '"ncbi 3188 PubMed 16171461 Reference Into Function"', 'dateModified': - '2010-01-21', + '2010-01-21', 'pubMedId': - '16171461', + '16171461', 'GeneRifText': - '"the relative levels of hnRNP F and H2 in cells, as well as the target sequences in the downstream GRS on pre-mRNA, influence gene expression"' + '"the relative levels of hnRNP F and H2 in cells, as well as the target sequences in the downstream GRS on 
pre-mRNA, influence gene expression"' } - row = GeneRifs_Basic().parse_gene_generifs_row('bio/ncbi_3188', - input_row) + row = gene.GeneRifs_Basic().parse_gene_generifs_row( + 'bio/ncbi_3188', input_row) self.assertDictEqual(row, expected_result) def test_check_gene_gene2accession_parser(self): @@ -203,7 +232,7 @@ def test_check_gene_gene2accession_parser(self): 'mature_peptide_gi': '' } - row = Gene2Accession().parse_gene_gene2accession_row( + row = gene.Gene2Accession().parse_gene_gene2accession_row( 'bio/ncbi_113218477', input_row) self.assertDictEqual(row, expected_result)