From d2f7efc70d3c73447524b9d2f4801e4bc331146e Mon Sep 17 00:00:00 2001 From: Rafael Goncalves Date: Mon, 3 Jun 2024 16:57:36 -0400 Subject: [PATCH] Parameterize ngram length. Ensure inputs are strings closes #49 --- text2term/config.py | 2 +- text2term/tfidf_mapper.py | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/text2term/config.py b/text2term/config.py index 546a837..aa1adbb 100644 --- a/text2term/config.py +++ b/text2term/config.py @@ -1 +1 @@ -VERSION = "4.1.2" +VERSION = "4.1.3" diff --git a/text2term/tfidf_mapper.py b/text2term/tfidf_mapper.py index c90c7f9..f8e4f07 100644 --- a/text2term/tfidf_mapper.py +++ b/text2term/tfidf_mapper.py @@ -17,7 +17,7 @@ def __init__(self, target_ontology_terms): self.target_ontology_terms = target_ontology_terms self.target_labels, self.target_terms = self._get_target_labels_terms(target_ontology_terms) - def map(self, source_terms, source_terms_ids, max_mappings=3, min_score=0.3): + def map(self, source_terms, source_terms_ids, max_mappings=3, min_score=0.3, ngram_length=3): """ Main mapping function. Default settings return only the top candidate for every source string. :param source_terms: List of source terms to be mapped with ontology terms @@ -25,9 +25,10 @@ def map(self, source_terms, source_terms_ids, max_mappings=3, min_score=0.3): :param max_mappings: The maximum number of (top scoring) ontology term mappings that should be returned :param min_score: The lower-bound threshold for keeping a candidate term mapping, between 0-1. Default set to 0, so consider all candidates + :param ngram_length: The gram length n for the string tokenizer """ source_terms_norm = onto_utils.normalize_list(source_terms) - vectorizer = self._tokenize(source_terms_norm, self.target_labels) + vectorizer = self._tokenize(source_terms_norm, self.target_labels, n=ngram_length) results_mtx = self._sparse_dot_top(vectorizer, source_terms_norm, self.target_labels, min_score) results_df = self._get_mappings(results_mtx, max_mappings, source_terms, source_terms_ids, self.target_terms) return results_df @@ -80,9 +81,15 @@ def _get_target_labels_terms(self, ontology_terms): target_labels, target_terms = [], [] for term in ontology_terms.values(): for label in term.labels: - target_labels.append(label) - target_terms.append(term) + if not isinstance(label, str): + self.logger.debug(f"ontology term label {label} is not a string") + else: + target_labels.append(label) + target_terms.append(term) for synonym in term.synonyms: - target_labels.append(synonym) - target_terms.append(term) + if not isinstance(synonym, str): + self.logger.debug(f"ontology term synonym {synonym} is not a string") + else: + target_labels.append(synonym) + target_terms.append(term) return target_labels, target_terms