Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ADD lib.py to make argnorm usable as a library #28

Merged
merged 9 commits into from
Apr 15, 2024
7 changes: 6 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,4 +38,9 @@
### Using amino acid file for argannot rather than nucleotide file
- ARG-ANNOT is composed of coding sequences. Previously the data was handled incorrectly because contig mode was used when passing these coding sequences to RGI. Now the amino acid version of ARG-ANNOT is used, and RGI is run on it in protein mode.
- One-to-many ARO mappings, such as NG_047831:101-955 mapping to both Erm(K) and almG, are eliminated now that protein mode is used
- A total of 10 ARO mappings changed
- A total of 10 ARO mappings changed
### argnorm.lib: Making argNorm more usable as a library
- A file called `lib.py` will be introduced so that users can use argNorm as a library more easily.
- Users can import the `map_to_aro` function using `from argnorm.lib import map_to_aro`. The function takes a gene name and the name of its source database as input, maps the gene to the ARO, and returns a pronto term object with the ARO mapping.
- The `get_aro_mapping_table` function, previously within the BaseNormalizer class, has also been moved to `lib.py` to give users the ability to access the mapping tables being used for normalization.
- With the introduction of `lib.py`, users will be able to access core mapping utilities through `argnorm.lib`, drug categorization through `argnorm.drug_categorization`, and the traditional normalizers through `argnorm.normalizers`.
53 changes: 53 additions & 0 deletions argnorm/lib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import os
import pandas as pd
import pronto

# Column of the ARO mapping tables holding the gene identifier as it appears
# in the source database.
ORIGINAL_ID_COL = 'Original ID'
# Column holding the ARO accession in the mapping tables.
# NOTE(review): not referenced in the visible part of this module — presumably
# used by the normalizers; confirm against callers.
MAPPING_TABLE_ARO_COL = 'ARO'
# Column whose values get_aro_mapping_table() rewrites into 'ARO:<number>' form.
TARGET_ARO_COL = 'ARO'

# Absolute path of the directory containing this module; base for data lookups.
_ROOT = os.path.abspath(os.path.dirname(__file__))

def is_number(num):
    """Return True if ``num`` can be converted to an integer with ``int()``.

    Note that this follows ``int()`` semantics: ``'12'`` and ``3.7`` are
    accepted, while ``'3.7'`` and ``'abc'`` are not.

    Values that ``int()`` cannot handle at all (``None``, containers, NaN,
    infinities) are reported as "not a number" rather than raising, so the
    function is safe to map over columns that contain missing values.
    """
    try:
        int(num)
    except (ValueError, TypeError, OverflowError):
        # ValueError: unparsable string or NaN; TypeError: None / non-numeric
        # object; OverflowError: +/- infinity.
        return False

    return True

def get_data_path(path, getting_manual_curation):
    """Build the absolute path of a bundled data file.

    ``path`` is joined onto the package's ``data/manual_curation`` directory
    when ``getting_manual_curation`` is truthy, and onto the plain ``data``
    directory otherwise.
    """
    data_dir = 'data/manual_curation' if getting_manual_curation else 'data'
    return os.path.join(_ROOT, data_dir, path)

def get_aro_mapping_table(database):
    """Load the ARO mapping table for ``database`` as a DataFrame.

    Reads the tab-separated ``<database>_ARO_mapping.tsv`` file and the
    ``<database>_curation.tsv`` manual-curation file, stacks the curated rows
    underneath the mapping rows, and rewrites every value in the ARO column
    that ``is_number`` accepts into the ``'ARO:<integer>'`` form. Other values
    are left as they are.
    """
    def _as_aro_accession(value):
        # Only integer-convertible values get the 'ARO:' prefix.
        return f'ARO:{int(value)}' if is_number(value) else value

    mapping_path = get_data_path(f'{database}_ARO_mapping.tsv', False)
    curation_path = get_data_path(f'{database}_curation.tsv', True)

    base_table = pd.read_csv(mapping_path, sep='\t')
    curated = pd.read_csv(curation_path, sep='\t')
    # Copies the 'Database' column over from the main table (pandas aligns
    # this assignment on the row index).
    curated['Database'] = base_table['Database']

    combined = pd.concat([base_table, curated])
    combined[TARGET_ARO_COL] = combined[TARGET_ARO_COL].map(_as_aro_accession)
    return combined

def map_to_aro(gene, database):
    """Map a gene identifier from ``database`` to its ARO ontology term.

    Parameters:
        gene: Gene identifier exactly as it appears in the 'Original ID'
            column of the database's ARO mapping table.
        database: One of 'ncbi', 'deeparg', 'resfinder', 'sarg', 'megares'
            or 'argannot'.

    Returns:
        The entry for the mapped ARO accession from the ontology loaded with
        ``pronto.Ontology.from_obo_library('aro.obo')``.

    Raises:
        Exception: if ``database`` is not supported, if ``gene`` is not in
            the mapping table, or if the gene has no ARO accession recorded.
    """
    supported_databases = ('ncbi', 'deeparg', 'resfinder', 'sarg', 'megares', 'argannot')
    if database not in supported_databases:
        raise Exception(f'{database} is not a supported database.')

    mapping_table = get_aro_mapping_table(database).set_index(ORIGINAL_ID_COL)

    try:
        result = mapping_table.loc[gene, MAPPING_TABLE_ARO_COL]
    except KeyError as err:
        raise Exception(f'{gene} is not in {database} database') from err

    if isinstance(result, pd.Series):
        # The gene occurs several times in the mapping table. Pick the first
        # non-missing accession in table order so the answer is the same on
        # every run (taking an element of a set of strings is not).
        candidates = result.dropna()
        aro_id = candidates.iloc[0] if len(candidates) else None
    else:
        aro_id = None if pd.isna(result) else result

    if aro_id is None:
        raise Exception(f'{gene} has no ARO mapping in {database} database')

    # NOTE: the ontology is loaded on every call, after the cheap lookups
    # above have succeeded.
    ARO = pronto.Ontology.from_obo_library('aro.obo')
    return ARO[aro_id]
42 changes: 5 additions & 37 deletions argnorm/normalizers.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,13 @@
import os
import pandas as pd
from .drug_categorization import confers_resistance_to, drugs_to_drug_classes

ORIGINAL_ID_COL = 'Original ID'
MAPPING_TABLE_ARO_COL = 'ARO'
TARGET_ARO_COL = 'ARO'
from .lib import get_aro_mapping_table
from .lib import ORIGINAL_ID_COL, MAPPING_TABLE_ARO_COL, TARGET_ARO_COL

# Column headings for drug categorization output
CONFERS_RESISTANCE_TO_COL = 'confers_resistance_to'
RESISTANCE_TO_DRUG_CLASSES_COL = 'resistance_to_drug_classes'

_ROOT = os.path.abspath(os.path.dirname(__file__))

def is_number(num):
    """Return True if ``num`` can be converted to an integer with ``int()``.

    Follows ``int()`` semantics: ``'12'`` and ``3.7`` are accepted, while
    ``'3.7'`` and ``'abc'`` are not. Values ``int()`` cannot handle at all
    (``None``, containers, NaN, infinities) yield False instead of raising.
    """
    try:
        int(num)
    except (ValueError, TypeError, OverflowError):
        # ValueError: unparsable string or NaN; TypeError: None / non-numeric
        # object; OverflowError: +/- infinity.
        return False

    return True

def get_data_path(path, getting_manual_curation):
    """Return the absolute location of a packaged data file.

    Files are looked up under ``data/manual_curation`` when
    ``getting_manual_curation`` is truthy, otherwise under ``data``.
    """
    if not getting_manual_curation:
        return os.path.join(_ROOT, 'data', path)

    return os.path.join(_ROOT, 'data/manual_curation', path)

class BaseNormalizer:
"""
Inherit this class and customize subclass methods to implement the normalization of tools.
Expand All @@ -45,7 +27,7 @@ def run(self, input_file : str):
input_genes = self.preprocess_input_genes(
original_annot[self._input_gene_col].str.lower()
)
aro_table = self.get_aro_mapping_table()
aro_table = get_aro_mapping_table(self.database)
aro_table.set_index(self.preprocess_ref_genes(
aro_table[ORIGINAL_ID_COL].str.lower()
), inplace=True)
Expand All @@ -58,7 +40,6 @@ def run(self, input_file : str):

return original_annot


def preprocess_ref_genes(self, ref_genes):
"""
Customize this when ref gene and input gene can not exactly match.
Expand All @@ -78,20 +59,6 @@ def _set_input_gene_col(self):
"""
self._input_gene_col = ''

def get_aro_mapping_table(self):
    """
    Load the ARO mapping table for ``self.database``, including the manual
    curation rows, with integer-convertible ARO values rewritten as
    ``'ARO:<integer>'``.

    Don't customize this unless you're using your own (not package built-in) reference data.
    """
    def _as_aro_accession(value):
        # Only values accepted by is_number get the 'ARO:' prefix.
        return f'ARO:{int(value)}' if is_number(value) else value

    mapping_path = get_data_path(f'{self.database}_ARO_mapping.tsv', False)
    curation_path = get_data_path(f'{self.database}_curation.tsv', True)

    base_table = pd.read_csv(mapping_path, sep='\t')
    curated = pd.read_csv(curation_path, sep='\t')
    # Copy the 'Database' column from the main table (aligned on row index).
    curated['Database'] = base_table['Database']

    combined = pd.concat([base_table, curated])
    combined[TARGET_ARO_COL] = combined[TARGET_ARO_COL].map(_as_aro_accession)
    return combined

def load_input(self, input_file):
"""
Customize this when it fails to parse the input data.
Expand Down Expand Up @@ -202,6 +169,7 @@ def _set_input_gene_col(self):
)
self._input_gene_col = gene_col_by_db[self.database]


def preprocess_input_genes(self, input_genes):
process_funcs_by_db = dict(
ncbi=lambda x: x,
Expand Down Expand Up @@ -231,4 +199,4 @@ def preprocess_ref_genes(self, ref_genes):
argannot=self.preprocess_argannot_ref_genes,
resfinderfg=lambda x: x.split('|')[1]
)
return ref_genes.apply(process_funcs_by_db[self.database])
return ref_genes.apply(process_funcs_by_db[self.database])
Vedanth-Ramji marked this conversation as resolved.
Show resolved Hide resolved
21 changes: 21 additions & 0 deletions tests/test_lib.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from argnorm.lib import map_to_aro
import pronto

def test_map_to_aro():
    """Check that map_to_aro returns the expected ARO term for sample genes."""
    ontology = pronto.Ontology.from_obo_library('aro.obo')

    # (gene identifier, database, expected ARO accession)
    cases = (
        ("(AGly)AAC(6')-Isa:NG_047311:101-574:474", 'argannot', 'ARO:3002563'),
        ("MEG_21|Drugs|Aminoglycosides|Aminoglycoside_N-acetyltransferases|AAC3", 'megares', 'ARO:3004623'),
        ("1028085756|WP_063844287.1|1|1|cpt|cpt|phosphotransferase|2|CHLORAMPHENICOL|PHENICOL|chloramphenicol_phosphotransferase_CPT", 'ncbi', 'ARO:3000249'),
        ("gb|AAG57600.1|ARO:3000318|mphB", "sarg", 'ARO:3000318'),
    )

    for gene, database, aro_id in cases:
        assert map_to_aro(gene, database) == ontology.get_term(aro_id)
Loading