From 95ad3fbbf80bea304488ef258a0349400eb102da Mon Sep 17 00:00:00 2001 From: Vedanth Ramji Date: Tue, 2 Apr 2024 10:28:38 +1000 Subject: [PATCH 1/6] ADD general.py --- argnorm/general.py | 44 ++++++++++++++++++++++++++++++++++++++++++ argnorm/normalizers.py | 40 +++----------------------------------- 2 files changed, 47 insertions(+), 37 deletions(-) create mode 100644 argnorm/general.py diff --git a/argnorm/general.py b/argnorm/general.py new file mode 100644 index 0000000..d5c1650 --- /dev/null +++ b/argnorm/general.py @@ -0,0 +1,44 @@ +import os +import pandas as pd + +ORIGINAL_ID_COL = 'Original ID' +MAPPING_TABLE_ARO_COL = 'ARO' +TARGET_ARO_COL = 'ARO' + +_ROOT = os.path.abspath(os.path.dirname(__file__)) + +def is_number(num): + try: + int(num) + except ValueError: + return False + + return True + +def get_data_path(path, getting_manual_curation): + if getting_manual_curation: + return os.path.join(_ROOT, 'data/manual_curation', path) + + return os.path.join(_ROOT, 'data', path) + +def get_aro_mapping_table(database): + df = pd.read_csv(get_data_path(f'{database}_ARO_mapping.tsv', False), sep='\t') + + manual_curation = pd.read_csv(get_data_path(f'{database}_curation.tsv', True), sep='\t') + manual_curation['Database'] = df['Database'] + + aro_mapping_table = pd.concat([df, manual_curation]) + aro_mapping_table[TARGET_ARO_COL] = aro_mapping_table[TARGET_ARO_COL].map(lambda a: f'ARO:{int(a)}' if is_number(a) else a) + + return aro_mapping_table + +def map_to_aro(gene, database): + mapping_table = get_aro_mapping_table(database).set_index('Original ID') + result = mapping_table.loc[gene, 'ARO'] + + # Dealing with duplicated genes in ARO mapping table. + # Getting only one ARO number + if type(result) != str: + return list(set(result))[0] + else: + return result \ No newline at end of file diff --git a/argnorm/normalizers.py b/argnorm/normalizers.py index 08c7be1..80bdc7e 100644 --- a/argnorm/normalizers.py +++ b/argnorm/normalizers.py @@ -1,31 +1,12 @@ import os import pandas as pd from .drug_categorization import confers_resistance_to, drugs_to_drug_classes - -ORIGINAL_ID_COL = 'Original ID' -MAPPING_TABLE_ARO_COL = 'ARO' -TARGET_ARO_COL = 'ARO' +from .general import * # Column headings for drug categorization output CONFERS_RESISTANCE_TO_COL = 'confers_resistance_to' RESISTANCE_TO_DRUG_CLASSES_COL = 'resistance_to_drug_classes' -_ROOT = os.path.abspath(os.path.dirname(__file__)) - -def is_number(num): - try: - int(num) - except ValueError: - return False - - return True - -def get_data_path(path, getting_manual_curation): - if getting_manual_curation: - return os.path.join(_ROOT, 'data/manual_curation', path) - - return os.path.join(_ROOT, 'data', path) - class BaseNormalizer: """ Inherit this class and customize subclass methods to implement the normalization of tools. @@ -45,7 +26,7 @@ def run(self, input_file : str): input_genes = self.preprocess_input_genes( original_annot[self._input_gene_col].str.lower() ) - aro_table = self.get_aro_mapping_table() + aro_table = get_aro_mapping_table(self.database) aro_table.set_index(self.preprocess_ref_genes( aro_table[ORIGINAL_ID_COL].str.lower() ), inplace=True) @@ -58,7 +39,6 @@ def run(self, input_file : str): return original_annot - def preprocess_ref_genes(self, ref_genes): """ Customize this when ref gene and input gene can not exactly match. @@ -78,20 +58,6 @@ def _set_input_gene_col(self): """ self._input_gene_col = '' - def get_aro_mapping_table(self): - """ - Don't customize this unless you're using your own (not package built-in) reference data. - """ - df = pd.read_csv(get_data_path(f'{self.database}_ARO_mapping.tsv', False), sep='\t') - - manual_curation = pd.read_csv(get_data_path(f'{self.database}_curation.tsv', True), sep='\t') - manual_curation['Database'] = df['Database'] - - aro_mapping_table = pd.concat([df, manual_curation]) - aro_mapping_table[TARGET_ARO_COL] = aro_mapping_table[TARGET_ARO_COL].map(lambda a: f'ARO:{int(a)}' if is_number(a) else a) - - return aro_mapping_table - def load_input(self, input_file): """ Customize this when it fails to parse the input data. @@ -221,4 +187,4 @@ def preprocess_ref_genes(self, ref_genes): megares=lambda x: x.split('|')[0], argannot=lambda x: x.split('~~~')[-1] ) - return ref_genes.apply(process_funcs_by_db[self.database]) + return ref_genes.apply(process_funcs_by_db[self.database]) \ No newline at end of file From 8cb37c44bde375d12ffe6ad56629b4ddfa3ef7f2 Mon Sep 17 00:00:00 2001 From: Vedanth Ramji Date: Tue, 2 Apr 2024 11:43:20 +1000 Subject: [PATCH 2/6] ENH db validation and errors when db and gene not recognized in map_to_aro() --- argnorm/general.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/argnorm/general.py b/argnorm/general.py index d5c1650..1cbe601 100644 --- a/argnorm/general.py +++ b/argnorm/general.py @@ -1,5 +1,6 @@ import os import pandas as pd +import pronto ORIGINAL_ID_COL = 'Original ID' MAPPING_TABLE_ARO_COL = 'ARO' @@ -33,12 +34,20 @@ def get_aro_mapping_table(database): return aro_mapping_table def map_to_aro(gene, database): + if database not in ['ncbi', 'deeparg', 'resfinder', 'sarg', 'megares', 'argannot']: + raise Exception(f'{database} is not a supported database.') + mapping_table = get_aro_mapping_table(database).set_index('Original ID') - result = mapping_table.loc[gene, 'ARO'] - # Dealing with duplicated genes in ARO mapping table. - # Getting only one ARO number - if type(result) != str: - return list(set(result))[0] + try: + result = mapping_table.loc[gene, 'ARO'] + except: + raise Exception(f'{gene} is not in {database} database') else: - return result \ No newline at end of file + # Dealing with duplicated genes in ARO mapping table. + # Getting only one ARO number + ARO = pronto.Ontology.from_obo_library('aro.obo') + if type(result) != str: + return ARO[list(set(result))[0]] + else: + return ARO[result] \ No newline at end of file From 2f6c840f566887775e892d74fcc7b6a474786257 Mon Sep 17 00:00:00 2001 From: Vedanth Ramji Date: Tue, 2 Apr 2024 14:41:57 +1000 Subject: [PATCH 3/6] renamed general.py to lib.py. added test for map_to_aro() --- argnorm/{general.py => lib.py} | 2 +- argnorm/normalizers.py | 13 +------------ tests/test_lib.py | 21 +++++++++++++++++++++ 3 files changed, 23 insertions(+), 13 deletions(-) rename argnorm/{general.py => lib.py} (98%) create mode 100644 tests/test_lib.py diff --git a/argnorm/general.py b/argnorm/lib.py similarity index 98% rename from argnorm/general.py rename to argnorm/lib.py index 1cbe601..734f80f 100644 --- a/argnorm/general.py +++ b/argnorm/lib.py @@ -41,7 +41,7 @@ def map_to_aro(gene, database): try: result = mapping_table.loc[gene, 'ARO'] - except: + except KeyError: raise Exception(f'{gene} is not in {database} database') else: # Dealing with duplicated genes in ARO mapping table. diff --git a/argnorm/normalizers.py b/argnorm/normalizers.py index 80bdc7e..ebb3cbe 100644 --- a/argnorm/normalizers.py +++ b/argnorm/normalizers.py @@ -1,7 +1,7 @@ import os import pandas as pd from .drug_categorization import confers_resistance_to, drugs_to_drug_classes -from .general import * +from .lib import * # Column headings for drug categorization output CONFERS_RESISTANCE_TO_COL = 'confers_resistance_to' @@ -167,17 +167,6 @@ def _set_input_gene_col(self): ) self._input_gene_col = gene_col_by_db[self.database] - def preprocess_input_genes(self, input_genes): - process_funcs_by_db = dict( - ncbi=lambda x: x, - deeparg=lambda x: x, - resfinder=lambda x: x, - sarg=lambda x: x, - megares=lambda x: x, - argannot=lambda x: x - ) - return input_genes.apply(process_funcs_by_db[self.database]) - def preprocess_ref_genes(self, ref_genes): process_funcs_by_db = dict( ncbi=lambda x: x.split('|')[5], diff --git a/tests/test_lib.py b/tests/test_lib.py new file mode 100644 index 0000000..490b8f4 --- /dev/null +++ b/tests/test_lib.py @@ -0,0 +1,21 @@ +from argnorm.lib import map_to_aro +import pronto + +def test_map_to_aro(): + test_cases = [ + ["argannot~~~(AGly)AAC(6')-Isa~~~NG_047311:101-574", 'argannot'], + ["MEG_21|Drugs|Aminoglycosides|Aminoglycoside_N-acetyltransferases|AAC3", 'megares'], + ["1028085756|WP_063844287.1|1|1|cpt|cpt|phosphotransferase|2|CHLORAMPHENICOL|PHENICOL|chloramphenicol_phosphotransferase_CPT", 'ncbi'], + ["gb|AAG57600.1|ARO:3000318|mphB", "sarg"] + ] + + ARO = pronto.Ontology.from_obo_library('aro.obo') + expected_output = [ + ARO.get_term('ARO:3002563'), + ARO.get_term('ARO:3004623'), + ARO.get_term('ARO:3000249'), + ARO.get_term('ARO:3000318') + ] + + for t, e in zip(test_cases, expected_output): + assert map_to_aro(t[0], t[1]) == e \ No newline at end of file From b828c341c3f55a681126738dbb6b942e8ce5f168 Mon Sep 17 00:00:00 2001 From: Vedanth Ramji Date: Wed, 10 Apr 2024 14:42:58 +1000 Subject: [PATCH 4/6] Avoid 'import *' --- argnorm/normalizers.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/argnorm/normalizers.py b/argnorm/normalizers.py index ebb3cbe..ad76e9b 100644 --- a/argnorm/normalizers.py +++ b/argnorm/normalizers.py @@ -1,7 +1,8 @@ import os import pandas as pd from .drug_categorization import confers_resistance_to, drugs_to_drug_classes -from .lib import * +from .lib import get_aro_mapping_table +from .lib import ORIGINAL_ID_COL, MAPPING_TABLE_ARO_COL, TARGET_ARO_COL # Column headings for drug categorization output CONFERS_RESISTANCE_TO_COL = 'confers_resistance_to' From 8aab7daa4b54aa2e92d2199278d3602b7522ec59 Mon Sep 17 00:00:00 2001 From: Vedanth Ramji Date: Thu, 11 Apr 2024 09:45:22 +1000 Subject: [PATCH 5/6] Added notes for lib.py in CHANGELOG.md --- CHANGELOG.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a917681..a816807 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,3 +32,11 @@ - Initial source code started - Normalizers: added BaseNormalizer, ARGSOAPNormalizer, DeepARGNormalizer, AbricateNormalizer - Testing: added basic ARO column test + +## Unreleased + +### argnorm.lib: Making argNorm more usable as a library +- A file called `lib.py` will be introduced so that users can use argNorm as a library more easily. +- Users can import the `map_to_aro` function using `from argnorm.lib import map_to_aro`. The function takes a gene name as input, maps the gene to the ARO and returns a pronto term object with the ARO mapping. +- The `get_aro_mapping_table` function, previously within the BaseNormalizer class, has also been moved to `lib.py` to give users the ability to access the mapping tables being used for normalization. +- With the introduction of `lib.py`, users will be able to access core mapping utilities through `argnorm.lib`, drug categorization through `argnorm.drug_categorization`, and the traditional normalizers through `argnorm.normalizers`. \ No newline at end of file From 17875168109f7b8f7459414d3f58b332348cde83 Mon Sep 17 00:00:00 2001 From: Vedanth Date: Mon, 15 Apr 2024 10:18:28 +0530 Subject: [PATCH 6/6] Update test_lib.py test case for argannot to reflect argannot protein update --- tests/test_lib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lib.py b/tests/test_lib.py index 490b8f4..11ced10 100644 --- a/tests/test_lib.py +++ b/tests/test_lib.py @@ -3,7 +3,7 @@ def test_map_to_aro(): test_cases = [ - ["argannot~~~(AGly)AAC(6')-Isa~~~NG_047311:101-574", 'argannot'], + ["(AGly)AAC(6')-Isa:NG_047311:101-574:474", 'argannot'], ["MEG_21|Drugs|Aminoglycosides|Aminoglycoside_N-acetyltransferases|AAC3", 'megares'], ["1028085756|WP_063844287.1|1|1|cpt|cpt|phosphotransferase|2|CHLORAMPHENICOL|PHENICOL|chloramphenicol_phosphotransferase_CPT", 'ncbi'], ["gb|AAG57600.1|ARO:3000318|mphB", "sarg"]