Skip to content

Commit

Permalink
Add manually curated tables
Browse files Browse the repository at this point in the history
Add manually curated tables (curation by @SvetlanaUP & @Vedanth-Ramji) to complete/fix automated outputs.
  • Loading branch information
Vedanth-Ramji authored Nov 28, 2023
1 parent 554f29b commit 8b09c54
Show file tree
Hide file tree
Showing 10 changed files with 1,460 additions and 64 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
Original ID ARO Replacement Gene Name in CARD
0 argannot~~~(Bla)PBP1a~~~JN645776:1-2160 3003041 (Bla)PBP1a
1 argannot~~~(Ntmdz)nimj_Nitroimidazole_Gene~~~KJ816753.1:19295-19792 3007112 (Ntmdz)nimj_Nitroimidazole_Gene
2 argannot~~~(Phe)cpt_strepv~~~U09991:1412-1948 (Phe)cpt_strepv (Phe)cpt_strepv
3 argannot~~~(Bla)Penicillin_Binding_Protein_Ecoli~~~CP002291:664439-666340 3007423 (Bla)Penicillin_Binding_Protein_Ecoli
4 argannot~~~(AGly)aac3-I~~~AJ877225:5293-5757 3007384 (AGly)aac3-I
5 argannot~~~(Bla)penA~~~AB511945:1298-3049 (Bla)penA (Bla)penA
6 argannot~~~(Phe)catB6~~~AJ223604:3727-4341 3002678 (Phe)catB6
7 argannot~~~(Tet)tetR~~~HF545434:53576-54226 3003479 (Tet)tetR
8 argannot~~~(Tet)tetR(G)~~~S52438:113-745 (Tet)tetR(G) (Tet)tetR(G)
9 argannot~~~(Bla)PBP1b~~~AF101781:1-2466 (Bla)PBP1b (Bla)PBP1b
398 changes: 398 additions & 0 deletions argnorm/data/manual_curation/abricate_megares_both_manual_curation.tsv

Large diffs are not rendered by default.

15 changes: 15 additions & 0 deletions argnorm/data/manual_curation/argsoap_sarg_orfs_manual_curation.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
Categories_in_database ARO Replacement
0 beta-lactam__fmtC fmtC
1 beta-lactam__PBP-1A PBP-1A
2 beta-lactam__PBP-1B PBP-1B
3 beta-lactam__PBP-2X PBP-2X
4 beta-lactam__penA penA
5 macrolide-lincosamide-streptogramin__ermC {'ARO:3000250'}
6 multidrug__mexT {'ARO:3000814'}
7 tetracycline__tetC {'ARO:3000656', 'ARO:3000167'}
8 tetracycline__tetD {'ARO:3000168'}
9 tetracycline__tetR {'ARO:0000051'}
10 tetracycline__tetracycline_resistance_protein tetracycline_resistance_protein
11 tetracycline__tetU {'ARO:3004650'}
12 unclassified__cob(I)alamin adenolsyltransferase cob(I)alamin adenolsyltransferase
13 unclassified__cystathionine beta-lyase patB cystathionine beta-lyase patB
271 changes: 271 additions & 0 deletions argnorm/data/manual_curation/argsoap_sarg_reads_manual_curation.tsv

Large diffs are not rendered by default.

294 changes: 294 additions & 0 deletions argnorm/data/manual_curation/deeparg_deeparg_both_manual_curation.tsv

Large diffs are not rendered by default.

229 changes: 229 additions & 0 deletions argnorm/data/manual_curation/ncbi_manual_curation.tsv

Large diffs are not rendered by default.

78 changes: 78 additions & 0 deletions argnorm/data/manual_curation/resfinder_manual_curation.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
Original ID ARO Replacement Gene Name in CARD
0 qnrB11_1_EF653270 3002725 qnrB11
1 blaOXY-1-5_5_AY077486 blaOXY-1-5 blaOXY-1-5
2 aph(2'')-If_2_AY701528 3004191 aph(2'')-If
3 qnrB10_2_HM439644 3002724 qnrB10
4 blaLEN10_1_AJ635419 blaLEN10 blaLEN10
5 qnrB6_1_EF520349 3002720 qnrB6
6 qnrB24_1_HM192542 3002739 qnrB24
7 blaACT-4_2_AJ311172 blaACT-4 blaACT-4
8 blaCARB-3_1_S46063 blaCARB-3 blaCARB-3
9 blaLEN13_1_AJ635403 blaLEN13 blaLEN13
10 qnrB49_1_JQ582718 3002764 qnrB49
11 blaLEN9_1_AJ635405 blaLEN9 blaLEN9
12 blaCMY-59_1_NG_048854 blaCMY-59 blaCMY-59
13 aac(6')-Ib3_1_X60321 3002576 aac(6')-Ib3
14 qnrB59_1_JX259320 3002772 qnrB59
15 qnrB6_2_EF523819 3002720 qnrB6
16 qnrB3_1_DQ303920 3002716 qnrB3
17 nimJ_1_NZ_JH815495 3007112 nimJ
18 qnrB5_1_DQ303919 3002719 qnrB5
19 qnrB23_1_FJ981622 3002738 qnrB23
20 qnrB29_1_HM439649 3002744 qnrB29
21 qnrB13_1_EU273755 3002727 qnrB13
22 sul2_8_AJ877041 3000412 sul2
23 blaVMB-1_1_MN719868 blaVMB-1 blaVMB-1
24 qnrB17_1_JN173051 3002732 qnrB17
25 fomB_1_AB016934 3000449 fomB
26 fosA4_1_AB908992 3003210 fosA4
27 blaB-6_1_AF189302 3005554 blaB-6
28 mcr-1.26_1_NG_068217 3007280 mcr-1.26
29 formA_1_X73835 formA formA
30 qnrB31_1_HQ418999 3002746 qnrB31
31 qnrB41_1_JN166690 3002756 qnrB41
32 qnrB14_1_EU273757 3002728 qnrB14
33 aac(6')-29a_1_AF263519 3002583 aac(6')-29a
34 vat(D)_1_L12033 3002843 vat(D)
35 blaACC-4_1_KM087831 blaACC-4 blaACC-4
36 qnrB58_1_JX259319 3002771 qnrB58
37 penA_1_AF515059 penA penA
38 qnrB15_1_EU302865 3002730 qnrB15
39 EstDL136_1_JN242251 EstDL136 EstDL136
40 blaCTX-M-131_1_JN969893 blaCTX-M-131 blaCTX-M-131
41 qnrB20_1_AB379831 3002735 qnrB20
42 qnrB57_1_JX259318 3002770 qnrB57
43 mcr-3.2_1_NMWW01000143 3004511 mcr-3.2
44 tet(Q)_4_Z21523 3000191 tet(Q)
45 qnrB22_1_FJ981621 3002737 qnrB22
46 blaOXA-85_1_JANA01000064 blaOXA-85 blaOXA-85
47 qnrB56_1_JX259317 3002769 qnrB56
48 aac(6')-Ib-cr_2_EF636461 3005113 aac(6')-Ib-cr
49 qnrB53_1_HQ704413 3002761 qnrB53
50 ant(3'')-Ia_1_X02340 3000232 ant(3'')-Ia
51 qnrB30_1_HM439650 3002745 qnrB30
52 blaCARB-21_1_NG_048724 blaCARB-21 blaCARB-21
53 blaOXA-289_1_APRM01000005 blaOXA-289 blaOXA-289
54 blaBRO-1_1_Z54180 blaBRO-1 blaBRO-1
55 erm(N)_2_MZ015744 erm(N) erm(N)
56 qnrB16_1_EU136183 3002731 qnrB16
57 mcr-3.4_1_FLXA01000011 3004691 mcr-3.4
58 qnrB26_1_HQ386846 3002741 qnrB26
59 blaLEN12_1_AJ635406 blaLEN12 blaLEN12
60 qnrB12_2_AM774474 3002726 qnrB12
61 aac(3)-Ia_1_X15852 3002528 aac(3)-Ia
62 blaCARB-23_1_NG_048726 blaCARB-23 blaCARB-23
63 aac(3)-I_1_AJ877225 3007384 aac(3)-I
64 mcr-3.5_1_MF489760 3004505 mcr-3.5
65 qnrB62_1_JX987101 3002775 qnrB62
66 mcr-3.3_1_MF495680 3004662 mcr-3.3
67 blaACC-1_2_HG530658 blaACC-1 blaACC-1
68 aac(6')-29b_1_AF263519 3002584 aac(6')-29b
69 sul2_9_FJ197818 3000412 sul2
70 qnrB34_1_JN173056 3002749 qnrB34
71 blaLEN7_1_AJ635425 blaLEN7 blaLEN7
72 aadA9_1_AJ420072 3002609 aadA9
73 qnrB10_1_DQ631414 3002724 qnrB10
74 qnrB18_1_AM919399 3002733 qnrB18
75 blaLEN8_1_AJ635424 blaLEN8 blaLEN8
76 qnrB17_2_AM919398 3002732 qnrB17
2 changes: 1 addition & 1 deletion argnorm/drug_categorization.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pronto
from typing import List, Tuple

# Load the ArgNorm ontology from the 'aro.obo' file
# Load the ARO ontology ('aro.obo') from the OBO library, which is fetched over the internet
ARO = pronto.Ontology.from_obo_library('aro.obo')

def get_immediate_drug_classes(aro_num: str) -> List[Tuple]:
Expand Down
76 changes: 60 additions & 16 deletions argnorm/normalizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,21 +16,40 @@

_ROOT = os.path.abspath(os.path.dirname(__file__))

def is_number(num):
    """
    Tell whether `num` can be read as a finite number.

    Used when checking ARO mappings to discern numeric ARO identifiers from
    other string identifiers (e.g. gene names kept as replacements).

    Parameters
    ----------
    num : object
        Value to test; typically a string, an int or a float.

    Returns
    -------
    bool
        True when `float(num)` succeeds and the result is finite.
        False for non-numeric strings, for `None` and other non-convertible
        objects, and for NaN / infinity.
    """
    import math  # local import keeps this helper self-contained

    try:
        value = float(num)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None and other objects float() cannot accept.
        # ValueError: strings that are not numeric literals.
        # OverflowError: integers too large to be represented as a float.
        return False

    # NaN and infinity pass float() but cannot be converted with int(),
    # so they must not be reported as usable numbers.
    return math.isfinite(value)

def get_data_path(path, getting_manual_curation=False):
    """
    Build the path of a table stored under this package's `data` directory.

    Parameters
    ----------
    path : str
        File name of the table.
    getting_manual_curation : bool, optional
        When true, `path` is resolved inside `data/manual_curation`
        (manual curation tables); otherwise inside `data` (mapping tables).
        Defaults to False, so a one-argument call resolves a mapping table.

    Returns
    -------
    str
        `_ROOT` joined with the chosen directory and `path`.
    """
    if getting_manual_curation:
        # Separate components let os.path.join pick the platform separator.
        return os.path.join(_ROOT, 'data', 'manual_curation', path)

    return os.path.join(_ROOT, 'data', path)

class BaseNormalizer:
"""
Inherit this class and customize subclass methods to implement the normalization of tools.
"""

def __init__(self, database=None, is_hamronized=False, mode=None) -> None:
def __init__(self, database=None, is_hamronized=False, mode=None, uses_manual_curation=True) -> None:
self.tool = ''
self.database = database
self.mode = mode
self.is_hamronized = is_hamronized
self.uses_manual_curation = uses_manual_curation
self._set_input_gene_col()
self._set_ref_gene_and_aro_cols()

Expand Down Expand Up @@ -103,9 +122,34 @@ def get_aro_mapping_table(self):
"""
Don't customize this unless you're using your own (not package built-in) reference data.
"""
df = pd.read_csv(get_data_path(f'{self.tool}_{self.database}_{self.mode}_ARO_mapping.tsv'), sep='\t', index_col=0)
if self.tool != 'argsoap' or self.mode != 'orfs':
df[TARGET_ARO_COL] = df[TARGET_ARO_COL].map(lambda a: f'ARO:{int(a) if a == a else "nan"}') # a == a checks that a is not nan
df = pd.read_csv(get_data_path(f'{self.tool}_{self.database}_{self.mode}_ARO_mapping.tsv', False), sep='\t', index_col=0)

if self.uses_manual_curation:
if self.database == 'sarg' and self.mode == 'orfs':
gene_identifier = 'Categories_in_database'
else:
gene_identifier = 'Original ID'

if self.database == 'ncbi':
manual_curation = pd.read_csv(get_data_path('ncbi_manual_curation.tsv', True), sep='\t')
elif self.database == 'resfinder':
manual_curation = pd.read_csv(get_data_path('resfinder_manual_curation.tsv', True), sep='\t')
else:
manual_curation = pd.read_csv(get_data_path(f'{self.tool}_{self.database}_{self.mode}_manual_curation.tsv', True), sep='\t')

aro_nan_indices = [(list(df[gene_identifier]).index(manual_curation.loc[i, gene_identifier])) for i in range(manual_curation.shape[0])]

for i in range(len(aro_nan_indices)):
df.loc[aro_nan_indices[i], 'ARO'] = manual_curation.loc[i, 'ARO Replacement']

if self.tool != 'argsoap' and self.mode != 'orfs':
df.loc[aro_nan_indices[i], 'Gene Name in CARD'] = manual_curation.loc[i, 'Gene Name in CARD']
if self.tool != 'argsoap' or self.mode != 'orfs':
df[TARGET_ARO_COL] = df[TARGET_ARO_COL].map(lambda a: f'ARO:{int(float(a)) if is_number(a) == True else a}')
else:
if self.tool != 'argsoap' or self.mode != 'orfs':
df[TARGET_ARO_COL] = df[TARGET_ARO_COL].map(lambda a: f'ARO:{int(a) if a == a else "nan"}') # a == a checks that a is not nan

return df

def load_input(self, input_file):
Expand All @@ -116,14 +160,14 @@ def load_input(self, input_file):


class ARGSOAPNormalizer(BaseNormalizer):
def __init__(self, database=None, is_hamronized=False, mode=None) -> None:
def __init__(self, database=None, is_hamronized=False, mode=None, uses_manual_curation=True) -> None:
if not database:
warnings.warn('No `database` specified. Will try using SARG.')
database = 'sarg'
elif database != 'sarg':
warnings.warn('The `database` is not supported. Will try using SARG instead.')
database = 'sarg'
super().__init__(database, is_hamronized, mode)
super().__init__(database, is_hamronized, mode, uses_manual_curation)
self.tool = 'argsoap'

def _set_ref_gene_and_aro_cols(self):
Expand Down Expand Up @@ -182,7 +226,7 @@ def _raise_incorrect_mode_error(self):

class DeepARGNormalizer(BaseNormalizer):

def __init__(self, database=None, is_hamronized=False, mode=None) -> None:
def __init__(self, database=None, is_hamronized=False, mode=None, uses_manual_curation=True) -> None:
if mode:
warnings.warn('`mode` is not relavant for DeepARG and will be ignored.')
mode = 'both'
Expand All @@ -195,7 +239,7 @@ def __init__(self, database=None, is_hamronized=False, mode=None) -> None:
elif database != 'deeparg':
warnings.warn('The `database` is not supported. Will try using DeepARG instead.')
database = 'deeparg'
super().__init__(database, is_hamronized, mode)
super().__init__(database, is_hamronized, mode, uses_manual_curation)
self.tool = 'deeparg'

def _set_input_gene_col(self):
Expand All @@ -210,7 +254,7 @@ def _set_input_gene_col(self):

class ResFinderNormalizer(BaseNormalizer):

def __init__(self, database=None, is_hamronized=False, mode=None) -> None:
def __init__(self, database=None, is_hamronized=False, mode=None, uses_manual_curation=True) -> None:
if mode:
warnings.warn('`mode` is not relavant for ResFinder and will be ignored.')
mode = 'both'
Expand All @@ -223,7 +267,7 @@ def __init__(self, database=None, is_hamronized=False, mode=None) -> None:
elif database != 'resfinder':
warnings.warn('The `database` is not supported. Will try using ResFinder instead.')
database = 'resfinder'
super().__init__(database, is_hamronized, mode)
super().__init__(database, is_hamronized, mode, uses_manual_curation)
self.tool = 'resfinder'

def _set_input_gene_col(self):
Expand All @@ -241,7 +285,7 @@ def preprocess_ref_genes(self, ref_genes):

class AMRFinderPlusNormalizer(BaseNormalizer):

def __init__(self, database=None, is_hamronized=False, mode=None) -> None:
def __init__(self, database=None, is_hamronized=False, mode=None, uses_manual_curation=True) -> None:
if mode:
warnings.warn('`mode` is not relavant for AMRFinderPlus and will be ignored.')
mode = 'both'
Expand All @@ -254,7 +298,7 @@ def __init__(self, database=None, is_hamronized=False, mode=None) -> None:
elif database != 'ncbi':
warnings.warn('The `database` is not supported. Will try using NCBI instead.')
database = 'ncbi'
super().__init__(database, is_hamronized, mode)
super().__init__(database, is_hamronized, mode, uses_manual_curation)
self.tool = 'amrfinderplus'

def _set_input_gene_col(self):
Expand All @@ -272,14 +316,14 @@ def preprocess_ref_genes(self, ref_genes):

class AbricateNormalizer(BaseNormalizer):

def __init__(self, database=None, is_hamronized=False, mode=None) -> None:
def __init__(self, database=None, is_hamronized=False, mode=None, uses_manual_curation=True) -> None:
if mode:
warnings.warn('`mode` is not relavant for Abricate and will be ignored.')
mode = 'both'
else:
warnings.warn('`mode` is not specified. Will use default setting "both".')
mode = 'both'
super().__init__(database, is_hamronized, mode)
super().__init__(database, is_hamronized, mode, uses_manual_curation)
self.tool = 'abricate'

def _set_input_gene_col(self):
Expand Down
Loading

0 comments on commit 8b09c54

Please sign in to comment.