Merge pull request #50 from ccb-hms/development

Minor improvements and bug fixes
ccb-hms · Jun 4, 2024 · 7faefc9 · 7faefc9
2 parents b8eaf76 + 3e38e9e
commit 7faefc9
Show file tree

Hide file tree

Showing 7 changed files with 53 additions and 32 deletions.
diff --git a/README-UI.md b/README-UI.md
@@ -9,7 +9,7 @@ The following information pertains to the text2term UI, which is written [here](
 -   npm >= 8.0.0
 -   Python >= 3.9.0
 -   pip >= 21.0.0
--   text2term >= 1.1.0
+-   text2term >= 4.1.2
 
 **\*** These are the versions I have that work; while I know Python 3.9 or higher is necessary, the others may not strictly require the listed versions.
 

diff --git a/README.md b/README.md
@@ -13,40 +13,40 @@ pip install text2term
 import text2term
 import pandas
 
-df1 = text2term.map_terms("test/unstruct_terms.txt", "http://www.ebi.ac.uk/efo/efo.owl")
-df2 = text2term.map_terms(["asthma", "acute bronchitis"], "http://www.ebi.ac.uk/efo/efo.owl")
-df3 = text2term.map_terms({"asthma":"disease", "acute bronchitis":["disease", "lungs"]}, "http://www.ebi.ac.uk/efo/efo.owl")
+df1 = text2term.map_terms("test/unstruct_terms.txt", "http://purl.obolibrary.org/obo/mondo.owl")
+df2 = text2term.map_terms(["asthma", "acute bronchitis"], "http://purl.obolibrary.org/obo/mondo.owl")
+df3 = text2term.map_terms({"asthma":"disease", "acute bronchitis":["disease", "lungs"]}, "http://purl.obolibrary.org/obo/mondo.owl")
 ```
 Below is an example of caching, assuming the same imports as above:
 ```python
-text2term.cache_ontology("http://www.ebi.ac.uk/efo/efo.owl", "EFO")
-df1 = text2term.map_terms("test/unstruct_terms.txt", "EFO", use_cache=True)
-df2 = text2term.map_terms(["asthma", "acute bronchitis"], "EFO", use_cache=True)
-text2term.clear_cache("EFO")
+text2term.cache_ontology("http://purl.obolibrary.org/obo/mondo.owl", "MONDO")
+df1 = text2term.map_terms("test/unstruct_terms.txt", "MONDO", use_cache=True)
+df2 = text2term.map_terms(["asthma", "acute bronchitis"], "MONDO", use_cache=True)
+text2term.clear_cache("MONDO")
 ```
 
 ### Command Line
 The basic use of the tool requires a `source` file containing a list of terms to map to the given `target` ontology:  
-`python text2term -s test/unstruct_terms.txt -t http://www.ebi.ac.uk/efo/efo.owl`
+`python text2term -s test/unstruct_terms.txt -t http://purl.obolibrary.org/obo/mondo.owl`
 
 Specify an output file where the mappings should be saved using `-o`:  
-`python text2term -s test/unstruct_terms.txt -t efo.owl -o /Documents/my-mappings.csv`
+`python text2term -s test/unstruct_terms.txt -t mondo.owl -o /Documents/my-mappings.csv`
 
 Set the minimum acceptable similarity score for mapping each given term to an ontology term using `-min`:  
-`python text2term -s test/unstruct_terms.txt -t efo.owl -min 0.8`  
+`python text2term -s test/unstruct_terms.txt -t mondo.owl -min 0.8`  
 Only mappings whose similarity score to the source term is at least 0.8 (on a 0–1 scale) will be returned.  
 
 Exclude deprecated ontology terms (declared as such via *owl:deprecated true*) using `-d`:  
-`python text2term -s test/unstruct_terms.txt -t efo.owl -d`
+`python text2term -s test/unstruct_terms.txt -t mondo.owl -d`
 
 Limit search to only terms whose IRIs start with any IRI given in a list specified using `-iris`:  
-`python text2term.py -s test/unstruct_terms.txt -t efo.owl -iris http://www.ebi.ac.uk/efo/EFO,http://purl.obolibrary.org/obo/HP`  
+`python text2term.py -s test/unstruct_terms.txt -t mondo.owl -iris http://purl.obolibrary.org/obo/MONDO,http://purl.obolibrary.org/obo/HP`  
 Here, if the target ontology reuses terms from other ontologies such as HP and GO, the HP terms would be included but the GO terms would be excluded.
 
 To use the cache on the command line, first store the target ontology in the cache under an acronym using `-c`; in later runs, pass that acronym as the target:
-`python text2term -s test/unstruct_terms.txt -t http://www.ebi.ac.uk/efo/efo.owl -c EFO`
+`python text2term -s test/unstruct_terms.txt -t http://purl.obolibrary.org/obo/mondo.owl -c MONDO`
 Then, after running this, the following command is equivalent:
-`python text2term -s test/unstruct_terms.txt -t EFO`
+`python text2term -s test/unstruct_terms.txt -t MONDO`
 
 ## Programmatic Usage
 The tool can be executed in Python with the `map_terms` function:

diff --git a/text2term/__main__.py b/text2term/__main__.py
@@ -17,7 +17,7 @@
                              "'all' to search all ontologies")
     parser.add_argument("-o", "--output", required=False, type=str, default="",
                         help="Path to desired output file for the mappings (default=current working directory)")
-    parser.add_argument("-m", "--mapper", required=False, type=str, default=Mapper.TFIDF,
+    parser.add_argument("-m", "--mapper", required=False, type=str, default="tfidf",
                         help="Method used to compare source terms with ontology terms. One of: " + str(Mapper.list()) +
                              " (default=tfidf)")
     parser.add_argument("-csv", "--csv_input", required=False, type=str, default=(),
@@ -39,8 +39,10 @@
                         help="Save vis.js graphs representing the neighborhood of each ontology term (default=False)")
     parser.add_argument("-c", "--store_in_cache", required=False, type=str, default="",
                         help="Store the target ontology into local cache under acronym")
-    parser.add_argument("-type", "--term_type", required=False, type=str, default=OntologyTermType.CLASS,
+    parser.add_argument("-type", "--term_type", required=False, type=str, default="class",
                         help="Define whether to return ontology classes, properties, or both")
+    parser.add_argument('-u', "--incl_unmapped", required=False, default=False, action="store_true",
+                        help="Include all unmapped terms in the output")
 
     arguments = parser.parse_args()
     if not os.path.exists(arguments.source):
@@ -62,4 +64,4 @@
               excl_deprecated=arguments.excl_deprecated, mapper=mapper, max_mappings=arguments.top_mappings,
               min_score=arguments.min_score, base_iris=iris, save_graphs=arguments.save_term_graphs,
               save_mappings=True, separator=arguments.separator, use_cache=cache_exists(target),
-              term_type=arguments.term_type)
+              term_type=arguments.term_type, incl_unmapped=arguments.incl_unmapped)
diff --git a/text2term/config.py b/text2term/config.py
@@ -1 +1 @@
-VERSION = "4.1.2"
+VERSION = "4.1.3"
diff --git a/text2term/onto_utils.py b/text2term/onto_utils.py
@@ -11,7 +11,7 @@
 STOP_WORDS = {'in', 'the', 'any', 'all', 'for', 'and', 'or', 'dx', 'on', 'fh', 'tx', 'only', 'qnorm', 'w', 'iqb', 's',
               'ds', 'rd', 'rdgwas', 'ICD', 'excluded', 'excluding', 'unspecified', 'certain', 'also', 'undefined',
               'ordinary', 'least', 'squares', 'FINNGEN', 'elsewhere', 'more', 'excluded', 'classified', 'classifeid',
-              'unspcified', 'unspesified', 'specified', 'acquired', 'combined', 'unspeficied', 'elsewhere', 'not', 'by',
+              'unspcified', 'unspesified', 'specified', 'acquired', 'combined', 'unspeficied', 'elsewhere', 'by',
               'strict', 'wide', 'definition', 'definitions', 'confirmed', 'chapter', 'chapters', 'controls',
               'characterized', 'main', 'diagnosis', 'hospital', 'admissions', 'other', 'resulting', 'from'}
 

diff --git a/text2term/t2t.py b/text2term/t2t.py
@@ -23,7 +23,7 @@
 IGNORE_TAGS = ["ignore", "Ignore", "ignore ", "Ignore "]
 UNMAPPED_TAG = "unmapped"
 OUTPUT_COLUMNS = ["Source Term", "Source Term ID", "Mapped Term Label",
-                    "Mapped Term CURIE", "Mapped Term IRI", "Mapping Score", "Tags"]
+                  "Mapped Term CURIE", "Mapped Term IRI", "Mapping Score", "Tags"]
 
 LOGGER = onto_utils.get_logger(__name__, level=logging.INFO)
 
@@ -217,15 +217,26 @@ def _do_mapping(source_terms, source_term_ids, ontology_terms, mapper, max_mappi
     if mapper == Mapper.BIOPORTAL:
         LOGGER.warning("The BioPortal mapper does not return a 'mapping score' for its mappings, so the min_score "
                        "filter has no effect on BioPortal mappings. The mapping score is hardcoded to 1 by text2term.")
-        df = mappings_df
     else:
-        df = _filter_mappings(mappings_df, min_score)
+        LOGGER.debug("Filtering mappings by their score...")
+        start_filter = time.time()
+        mappings_df = _filter_mappings(mappings_df, min_score)
+        LOGGER.debug("...done (filtering time: %.2fs seconds)", time.time() - start_filter)
+
     # Include in output data frame any input terms that did not meet min_score threshold
     if incl_unmapped:
-        df = _add_unmapped_terms(df, tags, source_terms, source_term_ids)
+        LOGGER.debug("Adding unmapped terms...")
+        start_unmapped = time.time()
+        mappings_df = _add_unmapped_terms(mappings_df, tags, source_terms, source_term_ids)
+        LOGGER.debug("...done (adding unmapped time: %.2fs seconds)", time.time() - start_unmapped)
+
     # Add tags
-    df = _add_tags_to_df(df, tags)
-    return df
+    if not mappings_df.empty:
+        LOGGER.debug("Adding tags...")
+        start_tagging = time.time()
+        mappings_df = _add_tags_to_df(mappings_df, tags)
+        LOGGER.debug("...done (adding tags time: %.2fs seconds)", time.time() - start_tagging)
+    return mappings_df
 
 
 # Takes in the tags and source terms and processes them accordingly
@@ -270,6 +281,7 @@ def _filter_mappings(mappings_df, min_score):
     new_df = mappings_df.loc[mappings_df["Mapping Score"] >= min_score]
     return new_df
 
+
 def _add_unmapped_terms(mappings_df, tags, source_terms, source_terms_ids):
     if mappings_df.size == 0:
         mapped = []

diff --git a/text2term/tfidf_mapper.py b/text2term/tfidf_mapper.py
@@ -17,17 +17,18 @@ def __init__(self, target_ontology_terms):
         self.target_ontology_terms = target_ontology_terms
         self.target_labels, self.target_terms = self._get_target_labels_terms(target_ontology_terms)
 
-    def map(self, source_terms, source_terms_ids, max_mappings=3, min_score=0.3):
+    def map(self, source_terms, source_terms_ids, max_mappings=3, min_score=0.3, ngram_length=3):
         """
         Main mapping function. Default settings return only the top candidate for every source string.
         :param source_terms: List of source terms to be mapped with ontology terms
         :param source_terms_ids: List of identifiers for the given source terms
         :param max_mappings: The maximum number of (top scoring) ontology term mappings that should be returned
         :param min_score: The lower-bound threshold for keeping a candidate term mapping, between 0-1.
                             Default set to 0, so consider all candidates
+        :param ngram_length: The gram length n for the string tokenizer
         """
         source_terms_norm = onto_utils.normalize_list(source_terms)
-        vectorizer = self._tokenize(source_terms_norm, self.target_labels)
+        vectorizer = self._tokenize(source_terms_norm, self.target_labels, n=ngram_length)
         results_mtx = self._sparse_dot_top(vectorizer, source_terms_norm, self.target_labels, min_score)
         results_df = self._get_mappings(results_mtx, max_mappings, source_terms, source_terms_ids, self.target_terms)
         return results_df
@@ -80,9 +81,15 @@ def _get_target_labels_terms(self, ontology_terms):
         target_labels, target_terms = [], []
         for term in ontology_terms.values():
             for label in term.labels:
-                target_labels.append(label)
-                target_terms.append(term)
+                if not isinstance(label, str):
+                    self.logger.debug(f"ontology term label {label} is not a string")
+                else:
+                    target_labels.append(label)
+                    target_terms.append(term)
             for synonym in term.synonyms:
-                target_labels.append(synonym)
-                target_terms.append(term)
+                if not isinstance(synonym, str):
+                    self.logger.debug(f"ontology term synonym {synonym} is not a string")
+                else:
+                    target_labels.append(synonym)
+                    target_terms.append(term)
         return target_labels, target_terms