From 95ad3fbbf80bea304488ef258a0349400eb102da Mon Sep 17 00:00:00 2001
From: Vedanth Ramji <vedanth.ramji@outlook.com>
Date: Tue, 2 Apr 2024 10:28:38 +1000
Subject: [PATCH 1/6] ADD general.py

---
 argnorm/general.py     | 44 ++++++++++++++++++++++++++++++++++++++++++
 argnorm/normalizers.py | 40 +++-----------------------------------
 2 files changed, 47 insertions(+), 37 deletions(-)
 create mode 100644 argnorm/general.py

diff --git a/argnorm/general.py b/argnorm/general.py
new file mode 100644
index 0000000..d5c1650
--- /dev/null
+++ b/argnorm/general.py
@@ -0,0 +1,44 @@
+import os
+import pandas as pd
+
+ORIGINAL_ID_COL = 'Original ID'
+MAPPING_TABLE_ARO_COL = 'ARO'
+TARGET_ARO_COL = 'ARO'
+
+_ROOT = os.path.abspath(os.path.dirname(__file__))
+
+def is_number(num):
+    try:
+        int(num)
+    except ValueError:
+        return False
+
+    return True
+
+def get_data_path(path, getting_manual_curation):
+    if getting_manual_curation:
+        return os.path.join(_ROOT, 'data/manual_curation', path)
+
+    return os.path.join(_ROOT, 'data', path)
+
+def get_aro_mapping_table(database):
+    df = pd.read_csv(get_data_path(f'{database}_ARO_mapping.tsv', False), sep='\t')
+
+    manual_curation = pd.read_csv(get_data_path(f'{database}_curation.tsv', True), sep='\t')
+    manual_curation['Database'] = df['Database']
+
+    aro_mapping_table = pd.concat([df, manual_curation])
+    aro_mapping_table[TARGET_ARO_COL] = aro_mapping_table[TARGET_ARO_COL].map(lambda a: f'ARO:{int(a)}' if is_number(a) else a)
+    
+    return aro_mapping_table
+
+def map_to_aro(gene, database):
+    mapping_table = get_aro_mapping_table(database).set_index('Original ID')
+    result = mapping_table.loc[gene, 'ARO']
+
+    # Dealing with duplicated genes in ARO mapping table.
+    # Getting only one ARO number
+    if type(result) != str:
+        return list(set(result))[0]
+    else:
+        return result
\ No newline at end of file
diff --git a/argnorm/normalizers.py b/argnorm/normalizers.py
index 08c7be1..80bdc7e 100644
--- a/argnorm/normalizers.py
+++ b/argnorm/normalizers.py
@@ -1,31 +1,12 @@
 import os
 import pandas as pd
 from .drug_categorization import confers_resistance_to, drugs_to_drug_classes
-
-ORIGINAL_ID_COL = 'Original ID'
-MAPPING_TABLE_ARO_COL = 'ARO'
-TARGET_ARO_COL = 'ARO'
+from .general import *
 
 # Column headings for drug categorization output
 CONFERS_RESISTANCE_TO_COL = 'confers_resistance_to'
 RESISTANCE_TO_DRUG_CLASSES_COL = 'resistance_to_drug_classes'
 
-_ROOT = os.path.abspath(os.path.dirname(__file__))
-
-def is_number(num):
-    try:
-        int(num)
-    except ValueError:
-        return False
-
-    return True
-
-def get_data_path(path, getting_manual_curation):
-    if getting_manual_curation:
-        return os.path.join(_ROOT, 'data/manual_curation', path)
-
-    return os.path.join(_ROOT, 'data', path)
-
 class BaseNormalizer:
     """
     Inherit this class and customize subclass methods to implement the normalization of tools.
@@ -45,7 +26,7 @@ def run(self, input_file : str):
         input_genes = self.preprocess_input_genes(
             original_annot[self._input_gene_col].str.lower()
         )
-        aro_table = self.get_aro_mapping_table()
+        aro_table = get_aro_mapping_table(self.database)
         aro_table.set_index(self.preprocess_ref_genes(
             aro_table[ORIGINAL_ID_COL].str.lower()
         ), inplace=True)
@@ -58,7 +39,6 @@ def run(self, input_file : str):
 
         return original_annot
 
-
     def preprocess_ref_genes(self, ref_genes):
         """
         Customize this when ref gene and input gene can not exactly match.
@@ -78,20 +58,6 @@ def _set_input_gene_col(self):
         """
         self._input_gene_col = ''
 
-    def get_aro_mapping_table(self):
-        """
-        Don't customize this unless you're using your own (not package built-in) reference data.
-        """
-        df = pd.read_csv(get_data_path(f'{self.database}_ARO_mapping.tsv', False), sep='\t')
-
-        manual_curation = pd.read_csv(get_data_path(f'{self.database}_curation.tsv', True), sep='\t')
-        manual_curation['Database'] = df['Database']
-
-        aro_mapping_table = pd.concat([df, manual_curation])
-        aro_mapping_table[TARGET_ARO_COL] = aro_mapping_table[TARGET_ARO_COL].map(lambda a: f'ARO:{int(a)}' if is_number(a) else a)
-        
-        return aro_mapping_table
-
     def load_input(self, input_file):
         """
         Customize this when it fails to parse the input data.
@@ -221,4 +187,4 @@ def preprocess_ref_genes(self, ref_genes):
             megares=lambda x: x.split('|')[0],
             argannot=lambda x: x.split('~~~')[-1]
         )
-        return ref_genes.apply(process_funcs_by_db[self.database])
+        return ref_genes.apply(process_funcs_by_db[self.database])
\ No newline at end of file

From 8cb37c44bde375d12ffe6ad56629b4ddfa3ef7f2 Mon Sep 17 00:00:00 2001
From: Vedanth Ramji <vedanth.ramji@outlook.com>
Date: Tue, 2 Apr 2024 11:43:20 +1000
Subject: [PATCH 2/6] ENH db validation and errors when db and gene not
 recognized in map_to_aro()

---
 argnorm/general.py | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/argnorm/general.py b/argnorm/general.py
index d5c1650..1cbe601 100644
--- a/argnorm/general.py
+++ b/argnorm/general.py
@@ -1,5 +1,6 @@
 import os
 import pandas as pd
+import pronto
 
 ORIGINAL_ID_COL = 'Original ID'
 MAPPING_TABLE_ARO_COL = 'ARO'
@@ -33,12 +34,20 @@ def get_aro_mapping_table(database):
     return aro_mapping_table
 
 def map_to_aro(gene, database):
+    if database not in ['ncbi', 'deeparg', 'resfinder', 'sarg', 'megares', 'argannot']:
+        raise Exception(f'{database} is not a supported database.')
+    
     mapping_table = get_aro_mapping_table(database).set_index('Original ID')
-    result = mapping_table.loc[gene, 'ARO']
 
-    # Dealing with duplicated genes in ARO mapping table.
-    # Getting only one ARO number
-    if type(result) != str:
-        return list(set(result))[0]
+    try:
+        result = mapping_table.loc[gene, 'ARO']
+    except:
+        raise Exception(f'{gene} is not in {database} database')
     else:
-        return result
\ No newline at end of file
+        # Dealing with duplicated genes in ARO mapping table.
+        # Getting only one ARO number
+        ARO = pronto.Ontology.from_obo_library('aro.obo')
+        if type(result) != str:
+            return ARO[list(set(result))[0]]
+        else:
+            return ARO[result]
\ No newline at end of file

From 2f6c840f566887775e892d74fcc7b6a474786257 Mon Sep 17 00:00:00 2001
From: Vedanth Ramji <vedanth.ramji@outlook.com>
Date: Tue, 2 Apr 2024 14:41:57 +1000
Subject: [PATCH 3/6] renamed general.py to lib.py. added test for map_to_aro()

---
 argnorm/{general.py => lib.py} |  2 +-
 argnorm/normalizers.py         | 13 +------------
 tests/test_lib.py              | 21 +++++++++++++++++++++
 3 files changed, 23 insertions(+), 13 deletions(-)
 rename argnorm/{general.py => lib.py} (98%)
 create mode 100644 tests/test_lib.py

diff --git a/argnorm/general.py b/argnorm/lib.py
similarity index 98%
rename from argnorm/general.py
rename to argnorm/lib.py
index 1cbe601..734f80f 100644
--- a/argnorm/general.py
+++ b/argnorm/lib.py
@@ -41,7 +41,7 @@ def map_to_aro(gene, database):
 
     try:
         result = mapping_table.loc[gene, 'ARO']
-    except:
+    except KeyError:
         raise Exception(f'{gene} is not in {database} database')
     else:
         # Dealing with duplicated genes in ARO mapping table.
diff --git a/argnorm/normalizers.py b/argnorm/normalizers.py
index 80bdc7e..ebb3cbe 100644
--- a/argnorm/normalizers.py
+++ b/argnorm/normalizers.py
@@ -1,7 +1,7 @@
 import os
 import pandas as pd
 from .drug_categorization import confers_resistance_to, drugs_to_drug_classes
-from .general import *
+from .lib import *
 
 # Column headings for drug categorization output
 CONFERS_RESISTANCE_TO_COL = 'confers_resistance_to'
@@ -167,17 +167,6 @@ def _set_input_gene_col(self):
         )
         self._input_gene_col = gene_col_by_db[self.database]
 
-    def preprocess_input_genes(self, input_genes):
-        process_funcs_by_db = dict(
-            ncbi=lambda x: x,
-            deeparg=lambda x: x,
-            resfinder=lambda x: x,
-            sarg=lambda x: x,
-            megares=lambda x: x,
-            argannot=lambda x: x
-        )
-        return input_genes.apply(process_funcs_by_db[self.database])
-
     def preprocess_ref_genes(self, ref_genes):
         process_funcs_by_db = dict(
             ncbi=lambda x: x.split('|')[5],
diff --git a/tests/test_lib.py b/tests/test_lib.py
new file mode 100644
index 0000000..490b8f4
--- /dev/null
+++ b/tests/test_lib.py
@@ -0,0 +1,21 @@
+from argnorm.lib import map_to_aro
+import pronto
+
+def test_map_to_aro():
+    test_cases = [
+        ["argannot~~~(AGly)AAC(6')-Isa~~~NG_047311:101-574", 'argannot'],
+        ["MEG_21|Drugs|Aminoglycosides|Aminoglycoside_N-acetyltransferases|AAC3", 'megares'],
+        ["1028085756|WP_063844287.1|1|1|cpt|cpt|phosphotransferase|2|CHLORAMPHENICOL|PHENICOL|chloramphenicol_phosphotransferase_CPT", 'ncbi'],
+        ["gb|AAG57600.1|ARO:3000318|mphB", "sarg"]
+    ]
+
+    ARO = pronto.Ontology.from_obo_library('aro.obo')
+    expected_output = [
+        ARO.get_term('ARO:3002563'),
+        ARO.get_term('ARO:3004623'),
+        ARO.get_term('ARO:3000249'),
+        ARO.get_term('ARO:3000318')
+    ]
+
+    for t, e in zip(test_cases, expected_output):
+        assert map_to_aro(t[0], t[1]) == e
\ No newline at end of file

From b828c341c3f55a681126738dbb6b942e8ce5f168 Mon Sep 17 00:00:00 2001
From: Vedanth Ramji <vedanth.ramji@outlook.com>
Date: Wed, 10 Apr 2024 14:42:58 +1000
Subject: [PATCH 4/6] Avoid 'import *'

---
 argnorm/normalizers.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/argnorm/normalizers.py b/argnorm/normalizers.py
index ebb3cbe..ad76e9b 100644
--- a/argnorm/normalizers.py
+++ b/argnorm/normalizers.py
@@ -1,7 +1,8 @@
 import os
 import pandas as pd
 from .drug_categorization import confers_resistance_to, drugs_to_drug_classes
-from .lib import *
+from .lib import get_aro_mapping_table
+from .lib import ORIGINAL_ID_COL, MAPPING_TABLE_ARO_COL, TARGET_ARO_COL
 
 # Column headings for drug categorization output
 CONFERS_RESISTANCE_TO_COL = 'confers_resistance_to'

From 8aab7daa4b54aa2e92d2199278d3602b7522ec59 Mon Sep 17 00:00:00 2001
From: Vedanth Ramji <vedanth.ramji@outlook.com>
Date: Thu, 11 Apr 2024 09:45:22 +1000
Subject: [PATCH 5/6] Added notes for lib.py in CHANGELOG.md

---
 CHANGELOG.md | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a917681..a816807 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -32,3 +32,11 @@
 - Initial source code started
 - Normalizers: added BaseNormalizer, ARGSOAPNormalizer, DeepARGNormalizer, AbricateNormalizer
 - Testing: added basic ARO column test
+
+## Unreleased
+
+### argnorm.lib: Making argNorm more usable as a library
+- A file called `lib.py` will be introduced so that users can use argNorm as a library more easily.
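+- Users can import the `map_to_aro` function using `from argnorm.lib import map_to_aro`. The function takes a gene name and the name of the database it comes from as input (e.g. `map_to_aro(gene, 'ncbi')`), maps the gene to the ARO, and returns a pronto term object with the ARO mapping. An exception is raised if the database is not supported or the gene is not present in that database's mapping table.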
+- The `get_aro_mapping_table` function, previously within the BaseNormalizer class, has also been moved to `lib.py` to give users the ability to access the mapping tables being used for normalization.
+- With the introduction of `lib.py`, users will be able to access core mapping utilities through `argnorm.lib`, drug categorization through `argnorm.drug_categorization`, and the traditional normalizers through `argnorm.normalizers`.
\ No newline at end of file

From 17875168109f7b8f7459414d3f58b332348cde83 Mon Sep 17 00:00:00 2001
From: Vedanth <vedanth.ramji@outlook.com>
Date: Mon, 15 Apr 2024 10:18:28 +0530
Subject: [PATCH 6/6] Update test_lib.py test case for argannot to reflect
 argannot protein update

---
 tests/test_lib.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_lib.py b/tests/test_lib.py
index 490b8f4..11ced10 100644
--- a/tests/test_lib.py
+++ b/tests/test_lib.py
@@ -3,7 +3,7 @@
 
 def test_map_to_aro():
     test_cases = [
-        ["argannot~~~(AGly)AAC(6')-Isa~~~NG_047311:101-574", 'argannot'],
+        ["(AGly)AAC(6')-Isa:NG_047311:101-574:474", 'argannot'],
         ["MEG_21|Drugs|Aminoglycosides|Aminoglycoside_N-acetyltransferases|AAC3", 'megares'],
         ["1028085756|WP_063844287.1|1|1|cpt|cpt|phosphotransferase|2|CHLORAMPHENICOL|PHENICOL|chloramphenicol_phosphotransferase_CPT", 'ncbi'],
         ["gb|AAG57600.1|ARO:3000318|mphB", "sarg"]