diff --git a/setup.py b/setup.py index d6d71c5..7455ca3 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,7 @@ long_description=long_description, long_description_content_type='text/markdown', author='Center for Computational Biomedicine, Harvard Medical School', - author_email='rafael_goncalves@hms.harvard.edu', + author_email='rsgoncalves@gmx.com', classifiers=[ 'Development Status :: 4 - Beta', 'License :: OSI Approved :: MIT License', diff --git a/test/simple_tests.py b/test/simple_tests.py index 305281e..25dfb60 100644 --- a/test/simple_tests.py +++ b/test/simple_tests.py @@ -145,11 +145,19 @@ def test_mapping_zooma_ontologies(self): assert df_zooma[self.MAPPED_TERM_CURIE_COLUMN].str.contains("EFO:").any() assert df_zooma[self.MAPPED_TERM_CURIE_COLUMN].str.contains("NCIT:").any() + def test_mapping_bioportal_ontologies_no_apikey(self): + # Test mapping a list of terms to multiple ontologies using the BioPortal Annotator mapper without API Key + print("Test mapping a list of terms to multiple ontologies using the BioPortal Annotator mapper...") + df_bioportal = text2term.map_terms(["asthma", "location", "food allergy"], target_ontology="EFO,NCIT", + mapper=Mapper.BIOPORTAL, term_type=OntologyTermType.ANY) + assert df_bioportal.empty is True + def test_mapping_bioportal_ontologies(self): # Test mapping a list of terms to multiple ontologies using the BioPortal Annotator mapper print("Test mapping a list of terms to multiple ontologies using the BioPortal Annotator mapper...") df_bioportal = text2term.map_terms(["asthma", "location", "food allergy"], target_ontology="EFO,NCIT", - mapper=Mapper.BIOPORTAL, term_type=OntologyTermType.ANY) + mapper=Mapper.BIOPORTAL, term_type=OntologyTermType.ANY, + bioportal_apikey="8f0cbe43-2906-431a-9572-8600d3f4266e") print(f"{df_bioportal}\n") assert df_bioportal.size > 0 assert df_bioportal[self.MAPPED_TERM_CURIE_COLUMN].str.contains("EFO:").any() @@ -207,6 +215,13 @@ def test_mapping_with_min_score_filter(self): term_type=OntologyTermType.ANY, min_score=min_score) assert (df_leven[self.MAPPING_SCORE_COLUMN] >= min_score).all() + def test_mapping_with_min_score_filter_empty_results(self): + self.ensure_cache_exists("EFO", self.EFO_URL) + print("Test mapping to EFO using TFIDF similarity metric and min_score filter that results in no mappings...") + df_tfidf = text2term.map_terms(["carbon monoxide"], target_ontology="EFO", use_cache=True, mapper=Mapper.TFIDF, + term_type=OntologyTermType.ANY, min_score=0.99) + assert df_tfidf.empty is True + def test_include_unmapped_terms(self): self.ensure_cache_exists("EFO", self.EFO_URL) df = text2term.map_terms(["asthma", "margarita"], target_ontology="EFO", use_cache=True, mapper=Mapper.TFIDF, diff --git a/text2term/__main__.py b/text2term/__main__.py index ed94223..54dacc6 100644 --- a/text2term/__main__.py +++ b/text2term/__main__.py @@ -7,7 +7,7 @@ if __name__ == "__main__": parser = argparse.ArgumentParser(description='A tool for mapping free-text descriptions of (biomedical) ' - 'entities to controlled terms in an ontology') + 'entities to ontology terms') parser.add_argument("-s", "--source", required=True, type=str, help="Input file containing 'source' terms to map to ontology terms: list of terms or CSV file") parser.add_argument("-t", "--target", required=True, type=str, @@ -42,6 +42,8 @@ help="Define whether to map to ontology classes, properties, or both") parser.add_argument('-u', "--incl_unmapped", required=False, default=False, action="store_true", help="Include all unmapped terms in the output") + parser.add_argument('-bp', "--bioportal_apikey", required=False, type=str, default="", + help="BioPortal API Key to use along with the BioPortal mapper option") arguments = parser.parse_args() if not os.path.exists(arguments.source): @@ -63,4 +65,5 @@ excl_deprecated=arguments.excl_deprecated, mapper=mapper, max_mappings=arguments.top_mappings, min_score=arguments.min_score, base_iris=iris, save_graphs=arguments.save_term_graphs, save_mappings=True, separator=arguments.separator, use_cache=cache_exists(target), - term_type=arguments.term_type, incl_unmapped=arguments.incl_unmapped) + term_type=arguments.term_type, incl_unmapped=arguments.incl_unmapped, + bioportal_apikey=arguments.bioportal_apikey) diff --git a/text2term/config.py b/text2term/config.py index 773464b..f045f1a 100644 --- a/text2term/config.py +++ b/text2term/config.py @@ -1 +1 @@ -VERSION = "4.2.0" +VERSION = "4.2.1" diff --git a/text2term/t2t.py b/text2term/t2t.py index a2e27a4..def86a8 100644 --- a/text2term/t2t.py +++ b/text2term/t2t.py @@ -31,7 +31,7 @@ def map_terms(source_terms, target_ontology, base_iris=(), csv_columns=(), excl_deprecated=False, max_mappings=3, min_score=0.3, mapper=Mapper.TFIDF, output_file='', save_graphs=False, save_mappings=False, source_terms_ids=(), separator=',', use_cache=False, term_type=OntologyTermType.CLASS, - incl_unmapped=False): + incl_unmapped=False, bioportal_apikey=""): """ Maps the terms in the given list to the specified target ontology. @@ -75,6 +75,8 @@ def map_terms(source_terms, target_ontology, base_iris=(), csv_columns=(), excl_ The type(s) of ontology terms to map to, which can be 'class' or 'property' or 'any' incl_unmapped : bool Include unmapped terms in the output data frame + bioportal_apikey : str + BioPortal API Key to use along with the BioPortal mapper option Returns ---------- @@ -101,8 +103,9 @@ def map_terms(source_terms, target_ontology, base_iris=(), csv_columns=(), excl_ # Run the mapper LOGGER.info(f"Mapping {len(source_terms)} source terms to {target_ontology}") mappings_df = _do_mapping(source_terms, source_terms_ids, target_terms, mapper, max_mappings, min_score, tags, - incl_unmapped) - mappings_df["Mapping Score"] = mappings_df["Mapping Score"].astype(float).round(decimals=3) + incl_unmapped, bioportal_apikey) + if not mappings_df.empty: + mappings_df["Mapping Score"] = mappings_df["Mapping Score"].astype(float).round(decimals=3) if save_mappings: _save_mappings(mappings_df, output_file, min_score, mapper, target_ontology, base_iris, excl_deprecated, max_mappings, term_type, source_terms, incl_unmapped) @@ -194,7 +197,8 @@ def _load_ontology(ontology, iris, exclude_deprecated, use_cache=False, term_typ return onto_terms -def _do_mapping(source_terms, source_term_ids, ontology_terms, mapper, max_mappings, min_score, tags, incl_unmapped): +def _do_mapping(source_terms, source_term_ids, ontology_terms, mapper, max_mappings, min_score, tags, incl_unmapped, + bioportal_apikey): to_map, tags = _process_tags(source_terms, tags) start = time.time() if mapper == Mapper.TFIDF: @@ -204,7 +208,10 @@ def _do_mapping(source_terms, source_term_ids, ontology_terms, mapper, max_mappi term_mapper = ZoomaMapper() mappings_df = term_mapper.map(to_map, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) elif mapper == Mapper.BIOPORTAL: - term_mapper = BioPortalAnnotatorMapper("8f0cbe43-2906-431a-9572-8600d3f4266e") + if bioportal_apikey == "": + LOGGER.error("A BioPortal API Key must be specified via the parameter `bioportal_apikey`") + return pd.DataFrame() + term_mapper = BioPortalAnnotatorMapper(bioportal_apikey) mappings_df = term_mapper.map(to_map, source_term_ids, ontologies=ontology_terms, max_mappings=max_mappings) elif mapper in {Mapper.LEVENSHTEIN, Mapper.JARO, Mapper.JARO_WINKLER, Mapper.INDEL, Mapper.FUZZY, Mapper.JACCARD}: term_mapper = SyntacticMapper(ontology_terms)