BigDataBiology · luispedro · Apr 15, 2024 · Apr 2, 2024 · Apr 2, 2024 · Apr 2, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -38,4 +38,9 @@
 ### Using amino acid file for argannot rather than nucleotide file
 - ARG-ANNOT is comprised of coding sequences. The data wasn't being handled properly before as contig mode was used when passing coding sequences to RGI. Now, the amino acid version of ARG-ANNOT is used with protein mode when running the database in RGI.
 - One to many ARO mapping such as NG_047831:101-955 to Erm(K) and almG eliminated as protein mode used
-- A total of 10 ARO mappings changed
+- A total of 10 ARO mappings changed
+### argnorm.lib: Making argNorm more usable as a library 
+- A file called `lib.py` will be introduced so that users can use argNorm as a library more easily.
+- Users can import the `map_to_aro` function using `from argnorm.lib import map_to_aro`. The function takes a gene name and the name of the database the gene comes from as input, maps the gene to the ARO, and returns a pronto term object for the mapped ARO entry.
+- The `get_aro_mapping_table` function, previously within the BaseNormalizer class, has also been moved to `lib.py` to give users the ability to access the mapping tables being used for normalization.
+- With the introduction of `lib.py`, users will be able to access core mapping utilities through `argnorm.lib`, drug categorization through `argnorm.drug_categorization`, and the traditional normalizers through `argnorm.normalizers`.
diff --git a/argnorm/lib.py b/argnorm/lib.py
@@ -0,0 +1,53 @@
+import os
+import pandas as pd
+import pronto
+
+# Column of the ARO mapping tables that holds the gene identifier as it
+# appears in the source database.
+ORIGINAL_ID_COL = 'Original ID'
+# Name of the ARO accession column in the mapping tables.
+MAPPING_TABLE_ARO_COL = 'ARO'
+# Name of the ARO column whose values get_aro_mapping_table() rewrites into
+# the 'ARO:<number>' form.
+TARGET_ARO_COL = 'ARO'
+
+# Absolute path of the directory containing this module; get_data_path()
+# resolves the bundled data files relative to it.
+_ROOT = os.path.abspath(os.path.dirname(__file__))
+
+def is_number(num):
+    """
+    Return True when `num` can be converted with `int()`, False otherwise.
+
+    Values that `int()` rejects for any reason count as "not a number":
+    non-numeric strings and NaN (ValueError), None and other unsupported
+    types (TypeError), and infinities (OverflowError).
+    """
+    try:
+        int(num)
+    except (TypeError, ValueError, OverflowError):
+        # Catching only ValueError let None or inf crash callers that merely
+        # wanted a yes/no answer.
+        return False
+
+    return True
+
+def get_data_path(path, getting_manual_curation):
+    """
+    Return the location of the bundled data file `path`.
+
+    Files are looked up under `data/manual_curation` inside the package
+    directory when `getting_manual_curation` is truthy, and directly under
+    `data` otherwise.
+    """
+    data_dir = 'data/manual_curation' if getting_manual_curation else 'data'
+    return os.path.join(_ROOT, data_dir, path)
+
+def get_aro_mapping_table(database):
+    """
+    Load the ARO mapping table for `database` as a pandas DataFrame.
+
+    Reads `<database>_ARO_mapping.tsv` from the data directory and
+    `<database>_curation.tsv` from the manual curation directory, stacks the
+    curation rows below the mapping rows, and rewrites integer-like values of
+    the ARO column as `ARO:<number>`. Other ARO values are left untouched.
+    """
+    def with_aro_prefix(value):
+        # Integer-like entries become 'ARO:<n>'; everything else passes through.
+        if is_number(value):
+            return f'ARO:{int(value)}'
+        return value
+
+    base_table = pd.read_csv(get_data_path(f'{database}_ARO_mapping.tsv', False), sep='\t')
+    curation_table = pd.read_csv(get_data_path(f'{database}_curation.tsv', True), sep='\t')
+
+    # Index-aligned assignment: each curation row takes the 'Database' value
+    # of the base-table row that has the same index label.
+    curation_table['Database'] = base_table['Database']
+
+    combined = pd.concat([base_table, curation_table])
+    combined[TARGET_ARO_COL] = combined[TARGET_ARO_COL].map(with_aro_prefix)
+    return combined
+
+def map_to_aro(gene, database):
+    """
+    Map a gene name from one of the supported databases to its ARO term.
+
+    `gene` is matched exactly against the 'Original ID' column of the mapping
+    table of `database`, which must be one of 'ncbi', 'deeparg', 'resfinder',
+    'sarg', 'megares' or 'argannot'.
+
+    Returns the entry of the ARO ontology (loaded through pronto) for the
+    gene's ARO accession. When the gene occurs on several rows, the first
+    row with a non-missing accession is used.
+
+    Raises Exception when the database is not supported, when the gene is not
+    in the mapping table, or when the table holds no ARO accession for it.
+    """
+    if database not in ['ncbi', 'deeparg', 'resfinder', 'sarg', 'megares', 'argannot']:
+        raise Exception(f'{database} is not a supported database.')
+
+    mapping_table = get_aro_mapping_table(database).set_index(ORIGINAL_ID_COL)
+
+    try:
+        result = mapping_table.loc[gene, TARGET_ARO_COL]
+    except KeyError as err:
+        raise Exception(f'{gene} is not in {database} database') from err
+
+    if isinstance(result, str):
+        aro_id = result
+    else:
+        # A duplicated gene yields a Series and a missing accession a scalar
+        # NaN; pd.Series() handles both. Taking the first non-missing value in
+        # table order keeps the choice deterministic (picking from a set of
+        # strings is not stable between runs).
+        candidates = pd.Series(result).dropna()
+        if candidates.empty:
+            raise Exception(f'{gene} has no ARO mapping in {database} database')
+        aro_id = candidates.iloc[0]
+
+    # Load the ontology only once an accession is known to exist.
+    ARO = pronto.Ontology.from_obo_library('aro.obo')
+    return ARO[aro_id]
diff --git a/argnorm/normalizers.py b/argnorm/normalizers.py
@@ -1,31 +1,13 @@
 import os
 import pandas as pd
 from .drug_categorization import confers_resistance_to, drugs_to_drug_classes
-
-ORIGINAL_ID_COL = 'Original ID'
-MAPPING_TABLE_ARO_COL = 'ARO'
-TARGET_ARO_COL = 'ARO'
+from .lib import get_aro_mapping_table
+from .lib import ORIGINAL_ID_COL, MAPPING_TABLE_ARO_COL, TARGET_ARO_COL
 
 # Column headings for drug categorization output
 CONFERS_RESISTANCE_TO_COL = 'confers_resistance_to'
 RESISTANCE_TO_DRUG_CLASSES_COL = 'resistance_to_drug_classes'
 
-_ROOT = os.path.abspath(os.path.dirname(__file__))
-
-def is_number(num):
-    try:
-        int(num)
-    except ValueError:
-        return False
-
-    return True
-
-def get_data_path(path, getting_manual_curation):
-    if getting_manual_curation:
-        return os.path.join(_ROOT, 'data/manual_curation', path)
-
-    return os.path.join(_ROOT, 'data', path)
-
 class BaseNormalizer:
     """
     Inherit this class and customize subclass methods to implement the normalization of tools.
@@ -45,7 +27,7 @@ def run(self, input_file : str):
         input_genes = self.preprocess_input_genes(
             original_annot[self._input_gene_col].str.lower()
         )
-        aro_table = self.get_aro_mapping_table()
+        aro_table = get_aro_mapping_table(self.database)
         aro_table.set_index(self.preprocess_ref_genes(
             aro_table[ORIGINAL_ID_COL].str.lower()
         ), inplace=True)
@@ -58,7 +40,6 @@ def run(self, input_file : str):
 
         return original_annot
 
-
     def preprocess_ref_genes(self, ref_genes):
         """
         Customize this when ref gene and input gene can not exactly match.
@@ -78,20 +59,6 @@ def _set_input_gene_col(self):
         """
         self._input_gene_col = ''
 
-    def get_aro_mapping_table(self):
-        """
-        Don't customize this unless you're using your own (not package built-in) reference data.
-        """
-        df = pd.read_csv(get_data_path(f'{self.database}_ARO_mapping.tsv', False), sep='\t')
-
-        manual_curation = pd.read_csv(get_data_path(f'{self.database}_curation.tsv', True), sep='\t')
-        manual_curation['Database'] = df['Database']
-
-        aro_mapping_table = pd.concat([df, manual_curation])
-        aro_mapping_table[TARGET_ARO_COL] = aro_mapping_table[TARGET_ARO_COL].map(lambda a: f'ARO:{int(a)}' if is_number(a) else a)
-
-        return aro_mapping_table
-
     def load_input(self, input_file):
         """
         Customize this when it fails to parse the input data.
@@ -202,6 +169,7 @@ def _set_input_gene_col(self):
         )
         self._input_gene_col = gene_col_by_db[self.database]
 
+
     def preprocess_input_genes(self, input_genes):
         process_funcs_by_db = dict(
             ncbi=lambda x: x,
@@ -231,4 +199,4 @@ def preprocess_ref_genes(self, ref_genes):
             argannot=self.preprocess_argannot_ref_genes,
             resfinderfg=lambda x: x.split('|')[1]
         )
-        return ref_genes.apply(process_funcs_by_db[self.database])
+        return ref_genes.apply(process_funcs_by_db[self.database])
diff --git a/tests/test_lib.py b/tests/test_lib.py
@@ -0,0 +1,21 @@
+from argnorm.lib import map_to_aro
+import pronto
+
+def test_map_to_aro():
+    """
+    Check that map_to_aro returns the expected ARO term for one gene from
+    each of the argannot, megares, ncbi and sarg mapping tables.
+    """
+    # (gene identifier, database, expected ARO accession)
+    cases = [
+        ("(AGly)AAC(6')-Isa:NG_047311:101-574:474", 'argannot', 'ARO:3002563'),
+        ("MEG_21|Drugs|Aminoglycosides|Aminoglycoside_N-acetyltransferases|AAC3", 'megares', 'ARO:3004623'),
+        ("1028085756|WP_063844287.1|1|1|cpt|cpt|phosphotransferase|2|CHLORAMPHENICOL|PHENICOL|chloramphenicol_phosphotransferase_CPT", 'ncbi', 'ARO:3000249'),
+        ("gb|AAG57600.1|ARO:3000318|mphB", "sarg", 'ARO:3000318'),
+    ]
+
+    ARO = pronto.Ontology.from_obo_library('aro.obo')
+    expected_terms = [ARO.get_term(accession) for _, _, accession in cases]
+
+    for (gene, database, _), expected in zip(cases, expected_terms):
+        assert map_to_aro(gene, database) == expected