diff --git a/find_similar/calc_functions.py b/find_similar/calc_functions.py
index 4be06ce..6d41738 100644
--- a/find_similar/calc_functions.py
+++ b/find_similar/calc_functions.py
@@ -45,6 +45,7 @@ def __init__(self, # pylint: disable=too-many-arguments
         :return: cos similarity
         """
         self.text = text
+        self.key = 0
         for k, val in kwargs.items():
             setattr(self, k, val)
         self.tokens = tokens if tokens else get_tokens(text,
@@ -79,3 +80,13 @@ def get_tokens(text, dictionary=None, language="russian", remove_stopwords=True)
     """
     tokens = tokenize(text, language, dictionary, remove_stopwords)
     return tokens
+
+
+def calc_keywords_rating(text, keywords):
+    """
+    Sum the weights of the keywords found among the tokens of a text.
+    :param text: object with a ``tokens`` iterable (a TokenText), not the bare tokens
+    :param keywords: dict where the key is a keyword token and the value is its weight
+    :return: total weight of the tokens that are keywords, 0 when none of them match
+    """
+    return sum(keywords.get(token, 0) for token in text.tokens)
diff --git a/find_similar/core.py b/find_similar/core.py
index f692a10..eed7ada 100644
--- a/find_similar/core.py
+++ b/find_similar/core.py
@@ -2,7 +2,7 @@
 Core module with search functions
 """
 
-from .calc_functions import TokenText, calc_cosine_similarity_opt
+from .calc_functions import TokenText, calc_cosine_similarity_opt, calc_keywords_rating
 from .tokenize import tokenize
 
 
@@ -14,6 +14,7 @@ def find_similar( # pylint: disable=too-many-arguments
         count=5,
         dictionary=None,
         remove_stopwords=True,
+        keywords=None,
 ) -> list[TokenText]:
     """
     The main function to search similar texts.
@@ -22,6 +23,7 @@ def find_similar( # pylint: disable=too-many-arguments
     :param language: Language, default='russian'
     :param count: Count results
     :param dictionary: default = None.
     If you want to replace one words to others you can send the dictionary.
+    :param keywords: default = None. Dict of token weights used to rate each text (``key``).
     :param remove_stopwords: default = True. Remove or not stopwords
     :return: Result list sorted by similarity percent
@@ -42,6 +44,9 @@ def find_similar( # pylint: disable=too-many-arguments
                              remove_stopwords=remove_stopwords)
         cos = calc_cosine_similarity_opt(text.tokens, text_to_check_tokens)
         text.cos = cos
+        if keywords:
+            keywords_rating = calc_keywords_rating(text, keywords)
+            text.key = keywords_rating
         token_texts.append(text)
     text_rated_sorted = sorted(token_texts, key=lambda item: item.cos, reverse=True)
     return text_rated_sorted[:count]