From 2268626f99e320655b158b7af34d774019102011 Mon Sep 17 00:00:00 2001
From: Klein Tahiraj
Date: Fri, 15 Mar 2024 11:50:57 +0000
Subject: [PATCH 1/6] Add(SpaCy to pre-splitting options)

---
 semantic_router/splitters/rolling_window.py | 16 +++++++++++++++-
 semantic_router/splitters/utils.py          | 17 +++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/semantic_router/splitters/rolling_window.py b/semantic_router/splitters/rolling_window.py
index 092433fe..e9a5ff9f 100644
--- a/semantic_router/splitters/rolling_window.py
+++ b/semantic_router/splitters/rolling_window.py
@@ -6,7 +6,7 @@
 from semantic_router.encoders.base import BaseEncoder
 from semantic_router.schema import DocumentSplit
 from semantic_router.splitters.base import BaseSplitter
-from semantic_router.splitters.utils import split_to_sentences, tiktoken_length
+from semantic_router.splitters.utils import split_to_sentences, split_to_sentences_spacy, tiktoken_length
 from semantic_router.utils.logger import logger
 
 
@@ -39,6 +39,7 @@ class RollingWindowSplitter(BaseSplitter):
     def __init__(
         self,
         encoder: BaseEncoder,
+        pre_splitter: str = "regex",
         threshold_adjustment=0.01,
         dynamic_threshold: bool = True,
         window_size=5,
@@ -51,6 +52,7 @@ def __init__(
         super().__init__(name=name, encoder=encoder)
         self.calculated_threshold: float
         self.encoder = encoder
+        self.pre_splitter = pre_splitter
         self.threshold_adjustment = threshold_adjustment
         self.dynamic_threshold = dynamic_threshold
         self.window_size = window_size
@@ -79,6 +81,14 @@ def __call__(self, docs: List[str]) -> List[DocumentSplit]:
                 f"of {self.max_split_tokens}. "
                 "Splitting to sentences before semantically splitting."
             )
+        try:
+            if self.pre_splitter == "spacy":
+                docs = split_to_sentences_spacy(docs[0])
+            elif self.pre_splitter == "regex":
+                docs = split_to_sentences(docs[0])
+        except Exception as e:
+            logger.error(f"Error splitting document to sentences: {e}")
+            raise
         docs = split_to_sentences(docs[0])
         encoded_docs = self._encode_documents(docs)
         similarities = self._calculate_similarity_scores(encoded_docs)
@@ -401,6 +411,10 @@ def plot_sentence_similarity_scores(
         sentence after a similarity score below a specified threshold.
         """
+        if self.pre_splitter == "spacy":
+            docs = split_to_sentences_spacy(docs)
+        elif self.pre_splitter == "regex":
+            docs = split_to_sentences(docs)
         sentences = [sentence for doc in docs for sentence in split_to_sentences(doc)]
         encoded_sentences = self._encode_documents(sentences)
         similarity_scores = []
diff --git a/semantic_router/splitters/utils.py b/semantic_router/splitters/utils.py
index 349c3eaa..3a2cdc62 100644
--- a/semantic_router/splitters/utils.py
+++ b/semantic_router/splitters/utils.py
@@ -1,4 +1,5 @@
 import regex
+import spacy
 import tiktoken
 
 
@@ -57,6 +58,22 @@ def split_to_sentences(text: str) -> list[str]:
     return sentences
 
 
+def split_to_sentences_spacy(text: str) -> list[str]:
+    """
+    Use SpaCy to split a given text into sentences. Supported languages: English.
+
+    Args:
+        text (str): The text to split into sentences.
+
+    Returns:
+        list: A list of sentences extracted from the text.
+    """
+    nlp = spacy.load("en_core_web_sm")
+    doc = nlp(text)
+    sentences = [sentence.text.strip() for sentence in doc.sents]
+    return sentences
+
+
 def tiktoken_length(text: str) -> int:
     tokenizer = tiktoken.get_encoding("cl100k_base")
     tokens = tokenizer.encode(text, disallowed_special=())

From 2f7acd49cc4ff2b75a863a37ff9775fbe08b0461 Mon Sep 17 00:00:00 2001
From: Klein Tahiraj
Date: Fri, 15 Mar 2024 12:29:17 +0000
Subject: [PATCH 2/6] Add(check for spacy language model)

---
 semantic_router/splitters/rolling_window.py |  5 ++++-
 semantic_router/splitters/utils.py          | 18 ++++++++++++++++++
 2 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/semantic_router/splitters/rolling_window.py b/semantic_router/splitters/rolling_window.py
index e9a5ff9f..4a881706 100644
--- a/semantic_router/splitters/rolling_window.py
+++ b/semantic_router/splitters/rolling_window.py
@@ -6,7 +6,7 @@
 from semantic_router.encoders.base import BaseEncoder
 from semantic_router.schema import DocumentSplit
 from semantic_router.splitters.base import BaseSplitter
-from semantic_router.splitters.utils import split_to_sentences, split_to_sentences_spacy, tiktoken_length
+from semantic_router.splitters.utils import split_to_sentences, split_to_sentences_spacy, check_and_download_spacy_model, tiktoken_length
 from semantic_router.utils.logger import logger
 
 
@@ -40,6 +40,7 @@ def __init__(
         self,
         encoder: BaseEncoder,
         pre_splitter: str = "regex",
+        spacy_model: str = "en_core_web_sm",
         threshold_adjustment=0.01,
         dynamic_threshold: bool = True,
         window_size=5,
@@ -53,6 +54,7 @@ def __init__(
         self.calculated_threshold: float
         self.encoder = encoder
         self.pre_splitter = pre_splitter
+        self.spacy_model = spacy_model
         self.threshold_adjustment = threshold_adjustment
         self.dynamic_threshold = dynamic_threshold
         self.window_size = window_size
@@ -83,6 +85,7 @@ def __call__(self, docs: List[str]) -> List[DocumentSplit]:
             )
         try:
             if self.pre_splitter == "spacy":
+                check_and_download_spacy_model(self.spacy_model)
                 docs = split_to_sentences_spacy(docs[0])
             elif self.pre_splitter == "regex":
                 docs = split_to_sentences(docs[0])
diff --git a/semantic_router/splitters/utils.py b/semantic_router/splitters/utils.py
index 3a2cdc62..c4a3f746 100644
--- a/semantic_router/splitters/utils.py
+++ b/semantic_router/splitters/utils.py
@@ -73,6 +73,24 @@ def split_to_sentences_spacy(text: str) -> list[str]:
     sentences = [sentence.text.strip() for sentence in doc.sents]
     return sentences
 
+def check_and_download_spacy_model(model_name="en_core_web_sm"):
+    """
+    Checks if the specified SpaCy language model is installed, and if not, attempts to download and install it.
+
+    Args:
+    - model_name (str): The name of the SpaCy model to check and download. Defaults to 'en_core_web_sm'.
+
+    """
+    try:
+        # Try loading the model to see if it's already installed
+        spacy.load(model_name)
+        print(f"Spacy model '{model_name}' is installed.")
+    except OSError:
+        print(f"Spacy model '{model_name}' not found, downloading...")
+        from spacy.cli import download
+        download(model_name)
+        print(f"Downloaded and installed model '{model_name}'.")
+
 
 def tiktoken_length(text: str) -> int:
     tokenizer = tiktoken.get_encoding("cl100k_base")

From e94231e7199ac57baaec52ffdfcf37710c2b5a17 Mon Sep 17 00:00:00 2001
From: Klein Tahiraj
Date: Fri, 15 Mar 2024 12:29:56 +0000
Subject: [PATCH 3/6] Update(add spacy to pyproject.toml)

---
 pyproject.toml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index d19bf636..cfa7a942 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,11 +32,13 @@ black = "^23.12.1"
 colorama = "^0.4.6"
 pinecone-client = {version="^3.0.0", optional = true}
 regex = "^2023.12.25"
+spacy = "^3.0"
 torchvision = { version = "^0.17.0", optional = true}
 pillow = { version= "^10.2.0", optional = true}
 tiktoken = "^0.6.0"
 matplotlib = { version="^3.8.3", optional = true}
 
+
 [tool.poetry.extras]
 hybrid = ["pinecone-text"]
 fastembed = ["fastembed"]

From 176e4162bd6ae19356a0114895dd200e941744cc Mon Sep 17 00:00:00 2001
From: Klein Tahiraj
Date: Fri, 15 Mar 2024 12:52:21 +0000
Subject: [PATCH 4/6] Fix(dumb bug)

---
 semantic_router/splitters/rolling_window.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/semantic_router/splitters/rolling_window.py b/semantic_router/splitters/rolling_window.py
index 4a881706..ef4efbbb 100644
--- a/semantic_router/splitters/rolling_window.py
+++ b/semantic_router/splitters/rolling_window.py
@@ -92,7 +92,6 @@ def __call__(self, docs: List[str]) -> List[DocumentSplit]:
         except Exception as e:
             logger.error(f"Error splitting document to sentences: {e}")
             raise
-        docs = split_to_sentences(docs[0])
         encoded_docs = self._encode_documents(docs)
         similarities = self._calculate_similarity_scores(encoded_docs)
         if self.dynamic_threshold:
@@ -415,10 +414,11 @@ def plot_sentence_similarity_scores(
             a specified threshold.
         """
         if self.pre_splitter == "spacy":
-            docs = split_to_sentences_spacy(docs)
+            sentences = [sentence for doc in docs for sentence in split_to_sentences_spacy(doc)]
         elif self.pre_splitter == "regex":
-            docs = split_to_sentences(docs)
-        sentences = [sentence for doc in docs for sentence in split_to_sentences(doc)]
+            sentences = [sentence for doc in docs for sentence in split_to_sentences(doc)]
+        else:
+            raise ValueError("Invalid pre_splitter value. Supported values are 'spacy' and 'regex'.")
         encoded_sentences = self._encode_documents(sentences)
         similarity_scores = []

From 2e6188254e2d5cb8b332cc1fb7d916ba4877cf15 Mon Sep 17 00:00:00 2001
From: Klein Tahiraj
Date: Fri, 15 Mar 2024 15:56:47 +0000
Subject: [PATCH 5/6] Update(made SpaCy optional, try-except import)

---
 pyproject.toml                              |  4 +-
 semantic_router/splitters/rolling_window.py |  3 +-
 semantic_router/splitters/utils.py          | 41 +++++++++++----------
 3 files changed, 25 insertions(+), 23 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index cfa7a942..4081ffc3 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,7 +32,7 @@ black = "^23.12.1"
 colorama = "^0.4.6"
 pinecone-client = {version="^3.0.0", optional = true}
 regex = "^2023.12.25"
-spacy = "^3.0"
+spacy = { version = "^3.0", optional = true }
 torchvision = { version = "^0.17.0", optional = true}
 pillow = { version= "^10.2.0", optional = true}
 tiktoken = "^0.6.0"
@@ -45,7 +45,7 @@ fastembed = ["fastembed"]
 local = ["torch", "transformers", "llama-cpp-python"]
 pinecone = ["pinecone-client"]
 vision = ["torch", "torchvision", "transformers", "pillow"]
-processing = ["matplotlib"]
+processing = ["matplotlib", "spacy"]
 mistralai = ["mistralai"]
 
 [tool.poetry.group.dev.dependencies]
diff --git a/semantic_router/splitters/rolling_window.py b/semantic_router/splitters/rolling_window.py
index ef4efbbb..e629e51d 100644
--- a/semantic_router/splitters/rolling_window.py
+++ b/semantic_router/splitters/rolling_window.py
@@ -85,8 +85,7 @@ def __call__(self, docs: List[str]) -> List[DocumentSplit]:
             )
         try:
             if self.pre_splitter == "spacy":
-                check_and_download_spacy_model(self.spacy_model)
-                docs = split_to_sentences_spacy(docs[0])
+                docs = split_to_sentences_spacy(docs[0], self.spacy_model)
             elif self.pre_splitter == "regex":
                 docs = split_to_sentences(docs[0])
         except Exception as e:
diff --git a/semantic_router/splitters/utils.py b/semantic_router/splitters/utils.py
index c4a3f746..c1492d2c 100644
--- a/semantic_router/splitters/utils.py
+++ b/semantic_router/splitters/utils.py
@@ -1,7 +1,8 @@
 import regex
-import spacy
 import tiktoken
 
+from semantic_router.utils.logger import logger
+
 
 def split_to_sentences(text: str) -> list[str]:
     """
@@ -58,7 +59,7 @@ def split_to_sentences(text: str) -> list[str]:
     return sentences
 
 
-def split_to_sentences_spacy(text: str) -> list[str]:
+def split_to_sentences_spacy(text: str, spacy_model: str = "en_core_web_sm") -> list[str]:
     """
     Use SpaCy to split a given text into sentences. Supported languages: English.
 
@@ -67,29 +68,31 @@ def split_to_sentences_spacy(text: str) -> list[str]:
 
     Returns:
         list: A list of sentences extracted from the text.
-    """
-    nlp = spacy.load("en_core_web_sm")
-    doc = nlp(text)
-    sentences = [sentence.text.strip() for sentence in doc.sents]
-    return sentences
-
-def check_and_download_spacy_model(model_name="en_core_web_sm"):
     """
-    Checks if the specified SpaCy language model is installed, and if not, attempts to download and install it.
 
-    Args:
-    - model_name (str): The name of the SpaCy model to check and download. Defaults to 'en_core_web_sm'.
+
+    # Check if SpaCy is installed
+    try:
+        import spacy
+    except ImportError:
+        logger.warning(
+            "SpaCy is not installed. Please `pip install "
+            "semantic-router[processing]`."
+ ) + return - """ + # Check if the SpaCy model is installed try: - # Try loading the model to see if it's already installed - spacy.load(model_name) - print(f"Spacy model '{model_name}' is installed.") + spacy.load(spacy_model) except OSError: - print(f"Spacy model '{model_name}' not found, downloading...") + print(f"Spacy model '{spacy_model}' not found, downloading...") from spacy.cli import download - download(model_name) - print(f"Downloaded and installed model '{model_name}'.") + download(spacy_model) + print(f"Downloaded and installed model '{spacy_model}'.") + + nlp = spacy.load("en_core_web_sm") + doc = nlp(text) + sentences = [sentence.text.strip() for sentence in doc.sents] + return sentences def tiktoken_length(text: str) -> int: From dc339b1d40490610228223aecf4f1089dd40a1e9 Mon Sep 17 00:00:00 2001 From: Klein Tahiraj <62718109+klein-t@users.noreply.github.com> Date: Mon, 18 Mar 2024 17:46:10 +0000 Subject: [PATCH 6/6] Update rolling_window.py Fixing a small bug --- semantic_router/splitters/rolling_window.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/semantic_router/splitters/rolling_window.py b/semantic_router/splitters/rolling_window.py index e629e51d..7f0eb1d6 100644 --- a/semantic_router/splitters/rolling_window.py +++ b/semantic_router/splitters/rolling_window.py @@ -6,7 +6,7 @@ from semantic_router.encoders.base import BaseEncoder from semantic_router.schema import DocumentSplit from semantic_router.splitters.base import BaseSplitter -from semantic_router.splitters.utils import split_to_sentences, split_to_sentences_spacy, check_and_download_spacy_model, tiktoken_length +from semantic_router.splitters.utils import split_to_sentences, split_to_sentences_spacy, tiktoken_length from semantic_router.utils.logger import logger