diff --git a/README-UI.md b/README-UI.md
index a096884..7846136 100644
--- a/README-UI.md
+++ b/README-UI.md
@@ -9,7 +9,7 @@ The following information pertains to the text2term UI, which is written [here](
 - npm >= 8.0.0
 - Python >= 3.9.0
 - pip >= 21.0.0
-- text2term >= 1.1.0
+- text2term >= 4.1.2
 
 **\*** These are the versions I have that work; while I know Python 3.9 or higher is necessary, the others may not strictly require the listed versions.
diff --git a/README.md b/README.md
index f936def..feaf75d 100644
--- a/README.md
+++ b/README.md
@@ -13,40 +13,40 @@ pip install text2term
 import text2term
 import pandas
-df1 = text2term.map_terms("test/unstruct_terms.txt", "http://www.ebi.ac.uk/efo/efo.owl")
-df2 = text2term.map_terms(["asthma", "acute bronchitis"], "http://www.ebi.ac.uk/efo/efo.owl")
-df3 = text2term.map_terms({"asthma":"disease", "acute bronchitis":["disease", "lungs"]}, "http://www.ebi.ac.uk/efo/efo.owl")
+df1 = text2term.map_terms("test/unstruct_terms.txt", "http://purl.obolibrary.org/obo/mondo.owl")
+df2 = text2term.map_terms(["asthma", "acute bronchitis"], "http://purl.obolibrary.org/obo/mondo.owl")
+df3 = text2term.map_terms({"asthma":"disease", "acute bronchitis":["disease", "lungs"]}, "http://purl.obolibrary.org/obo/mondo.owl")
 ```
 
 Below is an example of caching, assuming the same imports as above:
 ```python
-text2term.cache_ontology("http://www.ebi.ac.uk/efo/efo.owl", "EFO")
-df1 = text2term.map_terms("test/unstruct_terms.txt", "EFO", use_cache=True)
-df2 = text2term.map_terms(["asthma", "acute bronchitis"], "EFO", use_cache=True)
-text2term.clear_cache("EFO")
+text2term.cache_ontology("http://purl.obolibrary.org/obo/mondo.owl", "MONDO")
+df1 = text2term.map_terms("test/unstruct_terms.txt", "MONDO", use_cache=True)
+df2 = text2term.map_terms(["asthma", "acute bronchitis"], "MONDO", use_cache=True)
+text2term.clear_cache("MONDO")
 ```
 
 ### Command Line
 The basic use of the tool requires a `source` file containing a list of terms to map to the given `target` ontology:
-`python text2term -s test/unstruct_terms.txt -t http://www.ebi.ac.uk/efo/efo.owl`
+`python text2term -s test/unstruct_terms.txt -t http://purl.obolibrary.org/obo/mondo.owl`
 
 Specify an output file where the mappings should be saved using `-o`:
-`python text2term -s test/unstruct_terms.txt -t efo.owl -o /Documents/my-mappings.csv`
+`python text2term -s test/unstruct_terms.txt -t mondo.owl -o /Documents/my-mappings.csv`
 
 Set the minimum acceptable similarity score for mapping each given term to an ontology term using `-min`:
-`python text2term -s test/unstruct_terms.txt -t efo.owl -min 0.8`
+`python text2term -s test/unstruct_terms.txt -t mondo.owl -min 0.8`
 The mapped terms returned will have been determined to be 0.8 similar to their source terms in a 0-1 scale.
 
 Exclude deprecated ontology terms (declared as such via *owl:deprecated true*) using `-d`:
-`python text2term -s test/unstruct_terms.txt -t efo.owl -d`
+`python text2term -s test/unstruct_terms.txt -t mondo.owl -d`
 
 Limit search to only terms whose IRIs start with any IRI given in a list specified using `-iris`:
-`python text2term.py -s test/unstruct_terms.txt -t efo.owl -iris http://www.ebi.ac.uk/efo/EFO,http://purl.obolibrary.org/obo/HP`
+`python text2term.py -s test/unstruct_terms.txt -t mondo.owl -iris http://purl.obolibrary.org/obo/mondo.owl,http://purl.obolibrary.org/obo/HP`
 Here, because EFO reuses terms from other ontologies such as HP and GO, the HP terms would be included but the GO terms would be excluded.
 
 Use the cache on the command line, first by flagging it, then in the future using the acronym:
-`python text2term -s test/unstruct_terms.txt -t http://www.ebi.ac.uk/efo/efo.owl -c EFO`
+`python text2term -s test/unstruct_terms.txt -t http://purl.obolibrary.org/obo/mondo.owl -c MONDO`
 Then, after running this, the following command is equivalent:
-`python text2term -s test/unstruct_terms.txt -t EFO`
+`python text2term -s test/unstruct_terms.txt -t MONDO`
 
 ## Programmatic Usage
 The tool can be executed in Python with the `map_terms` function:
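For readers following the updated README examples, here is a minimal end-to-end sketch of the cached-ontology workflow. It is illustrative only and not part of the diff; it assumes `cache_exists` is exposed at the package level (the `__main__.py` changes below use it that way), while the remaining calls appear verbatim in the README diff above.

```python
import text2term

# Cache MONDO once under an acronym, unless a cached copy already exists
# (cache_exists at package level is an assumption, mirroring the CLI module).
if not text2term.cache_exists("MONDO"):
    text2term.cache_ontology("http://purl.obolibrary.org/obo/mondo.owl", "MONDO")

# Subsequent calls can reuse the cached ontology instead of re-downloading it
df = text2term.map_terms(["asthma", "acute bronchitis"], "MONDO", use_cache=True, min_score=0.8)
print(df.head())

# Remove the cached ontology when it is no longer needed
text2term.clear_cache("MONDO")
```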
diff --git a/text2term/__main__.py b/text2term/__main__.py
index df9863b..9560fac 100644
--- a/text2term/__main__.py
+++ b/text2term/__main__.py
@@ -17,7 +17,7 @@
                              "'all' to search all ontologies")
     parser.add_argument("-o", "--output", required=False, type=str, default="",
                         help="Path to desired output file for the mappings (default=current working directory)")
-    parser.add_argument("-m", "--mapper", required=False, type=str, default=Mapper.TFIDF,
+    parser.add_argument("-m", "--mapper", required=False, type=str, default="tfidf",
                         help="Method used to compare source terms with ontology terms. One of: " + str(Mapper.list()) +
                              " (default=tfidf)")
     parser.add_argument("-csv", "--csv_input", required=False, type=str, default=(),
@@ -39,8 +39,10 @@
                         help="Save vis.js graphs representing the neighborhood of each ontology term (default=False)")
     parser.add_argument("-c", "--store_in_cache", required=False, type=str, default="",
                         help="Store the target ontology into local cache under acronym")
-    parser.add_argument("-type", "--term_type", required=False, type=str, default=OntologyTermType.CLASS,
+    parser.add_argument("-type", "--term_type", required=False, type=str, default="class",
                         help="Define whether to return ontology classes, properties, or both")
+    parser.add_argument('-u', "--incl_unmapped", required=False, default=False, action="store_true",
+                        help="Include all unmapped terms in the output")
 
     arguments = parser.parse_args()
     if not os.path.exists(arguments.source):
@@ -62,4 +64,4 @@
                   excl_deprecated=arguments.excl_deprecated, mapper=mapper, max_mappings=arguments.top_mappings,
                   min_score=arguments.min_score, base_iris=iris, save_graphs=arguments.save_term_graphs,
                   save_mappings=True, separator=arguments.separator, use_cache=cache_exists(target),
-                  term_type=arguments.term_type)
+                  term_type=arguments.term_type, incl_unmapped=arguments.incl_unmapped)
diff --git a/text2term/config.py b/text2term/config.py
index 546a837..aa1adbb 100644
--- a/text2term/config.py
+++ b/text2term/config.py
@@ -1 +1 @@
-VERSION = "4.1.2"
+VERSION = "4.1.3"
diff --git a/text2term/onto_utils.py b/text2term/onto_utils.py
index 9cbd9ac..d0bc45b 100644
--- a/text2term/onto_utils.py
+++ b/text2term/onto_utils.py
@@ -11,7 +11,7 @@
 STOP_WORDS = {'in', 'the', 'any', 'all', 'for', 'and', 'or', 'dx', 'on', 'fh', 'tx', 'only', 'qnorm', 'w', 'iqb', 's',
               'ds', 'rd', 'rdgwas', 'ICD', 'excluded', 'excluding', 'unspecified', 'certain', 'also', 'undefined',
               'ordinary', 'least', 'squares', 'FINNGEN', 'elsewhere', 'more', 'excluded', 'classified', 'classifeid',
-              'unspcified', 'unspesified', 'specified', 'acquired', 'combined', 'unspeficied', 'elsewhere', 'not', 'by',
+              'unspcified', 'unspesified', 'specified', 'acquired', 'combined', 'unspeficied', 'elsewhere', 'by',
               'strict', 'wide', 'definition', 'definitions', 'confirmed', 'chapter', 'chapters', 'controls',
               'characterized', 'main', 'diagnosis', 'hospital', 'admissions', 'other', 'resulting', 'from'}
 
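The new `-u`/`--incl_unmapped` flag added in `__main__.py` above is forwarded as `incl_unmapped` to the mapping call in the final hunk. A hedged sketch of how the option is expected to be used, on the command line and programmatically; the exact `map_terms` keyword names other than `incl_unmapped`, `use_cache`, and `min_score` are taken from the README examples above:

```python
import text2term

# CLI equivalent (assumed): python text2term -s test/unstruct_terms.txt -t MONDO -u -min 0.8
# With incl_unmapped=True, source terms whose best candidate falls below min_score are
# expected to be kept in the output data frame and tagged as unmapped rather than dropped.
df = text2term.map_terms("test/unstruct_terms.txt", "MONDO", use_cache=True,
                         min_score=0.8, incl_unmapped=True)

# Inspect the rows that carry the "unmapped" tag (UNMAPPED_TAG in t2t.py below)
print(df[df["Tags"].astype(str).str.contains("unmapped")])
```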
diff --git a/text2term/t2t.py b/text2term/t2t.py
index ca89c34..a2e27a4 100644
--- a/text2term/t2t.py
+++ b/text2term/t2t.py
@@ -23,7 +23,7 @@
 IGNORE_TAGS = ["ignore", "Ignore", "ignore ", "Ignore "]
 UNMAPPED_TAG = "unmapped"
 OUTPUT_COLUMNS = ["Source Term", "Source Term ID", "Mapped Term Label",
-                  "Mapped Term CURIE", "Mapped Term IRI", "Mapping Score", "Tags"]
+                  "Mapped Term CURIE", "Mapped Term IRI", "Mapping Score", "Tags"]
 
 LOGGER = onto_utils.get_logger(__name__, level=logging.INFO)
 
@@ -217,15 +217,26 @@ def _do_mapping(source_terms, source_term_ids, ontology_terms, mapper, max_mappi
     if mapper == Mapper.BIOPORTAL:
         LOGGER.warning("The BioPortal mapper does not return a 'mapping score' for its mappings, so the min_score "
                        "filter has no effect on BioPortal mappings. The mapping score is hardcoded to 1 by text2term.")
-        df = mappings_df
     else:
-        df = _filter_mappings(mappings_df, min_score)
+        LOGGER.debug("Filtering mappings by their score...")
+        start_filter = time.time()
+        mappings_df = _filter_mappings(mappings_df, min_score)
+        LOGGER.debug("...done (filtering time: %.2fs seconds)", time.time() - start_filter)
+    # Include in output data frame any input terms that did not meet min_score threshold
     if incl_unmapped:
-        df = _add_unmapped_terms(df, tags, source_terms, source_term_ids)
+        LOGGER.debug("Adding unmapped terms...")
+        start_unmapped = time.time()
+        mappings_df = _add_unmapped_terms(mappings_df, tags, source_terms, source_term_ids)
+        LOGGER.debug("...done (adding unmapped time: %.2fs seconds)", time.time() - start_unmapped)
+    # Add tags
-    df = _add_tags_to_df(df, tags)
-    return df
+    if not mappings_df.empty:
+        LOGGER.debug("Adding tags...")
+        start_tagging = time.time()
+        mappings_df = _add_tags_to_df(mappings_df, tags)
+        LOGGER.debug("...done (adding tags time: %.2fs seconds)", time.time() - start_tagging)
+    return mappings_df
 
 
 # Takes in the tags and source terms and processes them accordingly
@@ -270,6 +281,7 @@ def _filter_mappings(mappings_df, min_score):
     new_df = mappings_df.loc[mappings_df["Mapping Score"] >= min_score]
     return new_df
 
+
 def _add_unmapped_terms(mappings_df, tags, source_terms, source_terms_ids):
     if mappings_df.size == 0:
         mapped = []
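The `_do_mapping` changes above reorder the post-processing into three steps: filter candidates by `min_score`, optionally append the source terms that ended up unmapped, and only add tags when the data frame is non-empty. A self-contained sketch of that flow is below; the helper functions, column values, and example terms are hypothetical stand-ins, not the package's internal `_filter_mappings`/`_add_unmapped_terms`.

```python
import pandas as pd

OUTPUT_COLUMNS = ["Source Term", "Source Term ID", "Mapped Term Label",
                  "Mapped Term CURIE", "Mapped Term IRI", "Mapping Score", "Tags"]
UNMAPPED_TAG = "unmapped"


def filter_by_score(mappings_df, min_score):
    # Keep only candidate mappings that meet the minimum similarity score
    return mappings_df.loc[mappings_df["Mapping Score"] >= min_score]


def add_unmapped_terms(mappings_df, source_terms, source_term_ids):
    # Append a placeholder row, tagged as unmapped, for every source term
    # that has no surviving mapping in the filtered data frame
    mapped = set(mappings_df["Source Term"])
    rows = [[term, term_id, None, None, None, 0.0, UNMAPPED_TAG]
            for term, term_id in zip(source_terms, source_term_ids) if term not in mapped]
    return pd.concat([mappings_df, pd.DataFrame(rows, columns=OUTPUT_COLUMNS)], ignore_index=True)


# Hypothetical candidate mappings for two source terms, one below the threshold
source_terms, source_term_ids = ["asthma", "xyzzy"], ["t1", "t2"]
candidates = pd.DataFrame([["asthma", "t1", "asthma", "MONDO:0004979",
                            "http://purl.obolibrary.org/obo/MONDO_0004979", 0.92, ""],
                           ["xyzzy", "t2", "disease", "MONDO:0000001",
                            "http://purl.obolibrary.org/obo/MONDO_0000001", 0.12, ""]],
                          columns=OUTPUT_COLUMNS)

result = filter_by_score(candidates, min_score=0.8)
result = add_unmapped_terms(result, source_terms, source_term_ids)
if not result.empty:  # tagging/printing only makes sense when something survived
    print(result)
```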
diff --git a/text2term/tfidf_mapper.py b/text2term/tfidf_mapper.py
index c90c7f9..f8e4f07 100644
--- a/text2term/tfidf_mapper.py
+++ b/text2term/tfidf_mapper.py
@@ -17,7 +17,7 @@ def __init__(self, target_ontology_terms):
         self.target_ontology_terms = target_ontology_terms
         self.target_labels, self.target_terms = self._get_target_labels_terms(target_ontology_terms)
 
-    def map(self, source_terms, source_terms_ids, max_mappings=3, min_score=0.3):
+    def map(self, source_terms, source_terms_ids, max_mappings=3, min_score=0.3, ngram_length=3):
         """
         Main mapping function. Default settings return only the top candidate for every source string.
         :param source_terms: List of source terms to be mapped with ontology terms
@@ -25,9 +25,10 @@ def map(self, source_terms, source_terms_ids, max_mappings=3, min_score=0.3):
         :param max_mappings: The maximum number of (top scoring) ontology term mappings that should be returned
         :param min_score: The lower-bound threshold for keeping a candidate term mapping, between 0-1.
                 Default set to 0, so consider all candidates
+        :param ngram_length: The gram length n for the string tokenizer
         """
         source_terms_norm = onto_utils.normalize_list(source_terms)
-        vectorizer = self._tokenize(source_terms_norm, self.target_labels)
+        vectorizer = self._tokenize(source_terms_norm, self.target_labels, n=ngram_length)
         results_mtx = self._sparse_dot_top(vectorizer, source_terms_norm, self.target_labels, min_score)
         results_df = self._get_mappings(results_mtx, max_mappings, source_terms, source_terms_ids, self.target_terms)
         return results_df
@@ -80,9 +81,15 @@ def _get_target_labels_terms(self, ontology_terms):
         target_labels, target_terms = [], []
         for term in ontology_terms.values():
             for label in term.labels:
-                target_labels.append(label)
-                target_terms.append(term)
+                if not isinstance(label, str):
+                    self.logger.debug(f"ontology term label {label} is not a string")
+                else:
+                    target_labels.append(label)
+                    target_terms.append(term)
             for synonym in term.synonyms:
-                target_labels.append(synonym)
-                target_terms.append(term)
+                if not isinstance(synonym, str):
+                    self.logger.debug(f"ontology term synonym {synonym} is not a string")
+                else:
+                    target_labels.append(synonym)
+                    target_terms.append(term)
         return target_labels, target_terms
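The `ngram_length` parameter added to `TfidfMapper.map` sets the size of the character n-grams used by the TF-IDF tokenizer, and the `_get_target_labels_terms` change skips labels or synonyms that are not strings. A rough, self-contained illustration of character n-gram TF-IDF matching is given below; it uses scikit-learn directly, whereas the package relies on its own `_tokenize`/`_sparse_dot_top` helpers, so the vectorizer settings here are assumptions made for the sketch.

```python
from sklearn.feature_extraction.text import TfidfVectorizer

source_terms = ["asthma", "acute bronchitis"]
target_labels = ["asthma", "bronchitis", "chronic bronchitis", 42]  # includes a non-string label

# Mirror the new isinstance guard: ignore labels that are not strings
target_labels = [label for label in target_labels if isinstance(label, str)]

ngram_length = 3  # analogous to the new ngram_length parameter
vectorizer = TfidfVectorizer(analyzer="char_wb", ngram_range=(ngram_length, ngram_length))
vectorizer.fit(source_terms + target_labels)

src = vectorizer.transform(source_terms)   # rows are source terms
tgt = vectorizer.transform(target_labels)  # rows are ontology labels

# TF-IDF rows are L2-normalised by default, so the sparse dot product is cosine similarity
similarities = (src @ tgt.T).toarray()
for term, row in zip(source_terms, similarities):
    best = row.argmax()
    print(f"{term} -> {target_labels[best]} (score={row[best]:.2f})")
```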