From ce8a26a08e9b2e7fb2eb668b1a7923de658e9394 Mon Sep 17 00:00:00 2001
From: James Briggs
Date: Sat, 2 Mar 2024 17:54:27 +0800
Subject: [PATCH 1/2] add docstrings

---
 coverage.xml                                 | 810 +++++++++----------
 semantic_router/splitters/consecutive_sim.py |   9 +-
 semantic_router/splitters/cumulative_sim.py  |   9 +-
 semantic_router/splitters/rolling_window.py  |  71 +-
 4 files changed, 460 insertions(+), 439 deletions(-)

diff --git a/coverage.xml b/coverage.xml
index 2144ec09..818c0304 100644
--- a/coverage.xml
+++ b/coverage.xml
[coverage.xml hunks omitted: machine-generated coverage report whose XML content did not survive extraction; only timestamp, line-rate, and line-hit attribute churn]

diff --git a/semantic_router/splitters/consecutive_sim.py b/semantic_router/splitters/consecutive_sim.py
index 4a2e1106..775d5d2c 100644
--- a/semantic_router/splitters/consecutive_sim.py
+++ b/semantic_router/splitters/consecutive_sim.py
@@ -23,7 +23,14 @@ def __init__(
         encoder.score_threshold = score_threshold
         self.score_threshold = score_threshold
 
-    def __call__(self, docs: List[Any]):
+    def __call__(self, docs: List[Any]) -> List[DocumentSplit]:
+        """Split documents into smaller chunks based on semantic similarity.
+
+        :param docs: list of text documents to be split; to split a single
+            document, pass it as a list with one element.
+
+        :return: list of DocumentSplit objects containing the split documents.
+        """
         # Check if there's only a single document
         if len(docs) == 1:
             raise ValueError(

diff --git a/semantic_router/splitters/cumulative_sim.py b/semantic_router/splitters/cumulative_sim.py
index e9dd8deb..0e50a354 100644
--- a/semantic_router/splitters/cumulative_sim.py
+++ b/semantic_router/splitters/cumulative_sim.py
@@ -23,7 +23,14 @@ def __init__(
         encoder.score_threshold = score_threshold
         self.score_threshold = score_threshold
 
-    def __call__(self, docs: List[str]):
+    def __call__(self, docs: List[str]) -> List[DocumentSplit]:
+        """Split documents into smaller chunks based on semantic similarity.
+
+        :param docs: list of text documents to be split; to split a single
+            document, pass it as a list with one element.
+
+        :return: list of DocumentSplit objects containing the split documents.
+        """
         total_docs = len(docs)
         # Check if there's only a single document
         if total_docs == 1:

diff --git a/semantic_router/splitters/rolling_window.py b/semantic_router/splitters/rolling_window.py
index dc5110a6..092433fe 100644
--- a/semantic_router/splitters/rolling_window.py
+++ b/semantic_router/splitters/rolling_window.py
@@ -60,7 +60,39 @@ def __init__(
         self.split_tokens_tolerance = split_tokens_tolerance
         self.statistics: SplitStatistics
 
-    def encode_documents(self, docs: List[str]) -> np.ndarray:
+    def __call__(self, docs: List[str]) -> List[DocumentSplit]:
+        """Split documents into smaller chunks based on semantic similarity.
+
+        :param docs: list of text documents to be split; to split a single
+            document, pass it as a list with one element.
+
+        :return: list of DocumentSplit objects containing the split documents.
+        """
+        if not docs:
+            raise ValueError("At least one document is required for splitting.")
+
+        if len(docs) == 1:
+            token_count = tiktoken_length(docs[0])
+            if token_count > self.max_split_tokens:
+                logger.warning(
+                    f"Single document exceeds the maximum token limit "
+                    f"of {self.max_split_tokens}. "
+                    "Splitting to sentences before semantically splitting."
+                )
+                docs = split_to_sentences(docs[0])
+        encoded_docs = self._encode_documents(docs)
+        similarities = self._calculate_similarity_scores(encoded_docs)
+        if self.dynamic_threshold:
+            self._find_optimal_threshold(docs, similarities)
+        else:
+            self.calculated_threshold = self.encoder.score_threshold
+        split_indices = self._find_split_indices(similarities=similarities)
+        splits = self._split_documents(docs, split_indices, similarities)
+        self.plot_similarity_scores(similarities, split_indices, splits)
+        logger.info(self.statistics)
+        return splits
+
+    def _encode_documents(self, docs: List[str]) -> np.ndarray:
         try:
             embeddings = self.encoder(docs)
             return np.array(embeddings)
@@ -68,7 +100,7 @@ def encode_documents(self, docs: List[str]) -> np.ndarray:
             logger.error(f"Error encoding documents {docs}: {e}")
             raise
 
-    def calculate_similarity_scores(self, encoded_docs: np.ndarray) -> List[float]:
+    def _calculate_similarity_scores(self, encoded_docs: np.ndarray) -> List[float]:
         raw_similarities = []
         for idx in range(1, len(encoded_docs)):
             window_start = max(0, idx - self.window_size)
@@ -80,7 +112,7 @@ def calculate_similarity_scores(self, encoded_docs: np.ndarray) -> List[float]:
             raw_similarities.append(curr_sim_score)
         return raw_similarities
 
-    def find_split_indices(self, similarities: List[float]) -> List[int]:
+    def _find_split_indices(self, similarities: List[float]) -> List[int]:
         split_indices = []
         for idx, score in enumerate(similarities):
             logger.debug(f"Similarity score at index {idx}: {score}")
@@ -93,7 +125,7 @@ def find_split_indices(self, similarities: List[float]) -> List[int]:
             split_indices.append(idx + 1)
         return split_indices
 
-    def find_optimal_threshold(self, docs: List[str], similarity_scores: List[float]):
+    def _find_optimal_threshold(self, docs: List[str], similarity_scores: List[float]):
         token_counts = [tiktoken_length(doc) for doc in docs]
         cumulative_token_counts = np.cumsum([0] + token_counts)
 
@@ -109,7 +141,7 @@ def find_optimal_threshold(self, docs: List[str], similarity_scores: List[float]
         median_tokens = 0
         while low <= high:
             self.calculated_threshold = (low + high) / 2
-            split_indices = self.find_split_indices(similarity_scores)
+            split_indices = self._find_split_indices(similarity_scores)
             logger.debug(
                 f"Iteration {iteration}: Trying threshold: {self.calculated_threshold}"
             )
@@ -150,7 +182,7 @@ def find_optimal_threshold(self, docs: List[str], similarity_scores: List[float]
 
         return self.calculated_threshold
 
-    def split_documents(
+    def _split_documents(
         self, docs: List[str], split_indices: List[int], similarities: List[float]
     ) -> List[DocumentSplit]:
         """
@@ -370,7 +402,7 @@ def plot_sentence_similarity_scores(
         a specified threshold.
         """
         sentences = [sentence for doc in docs for sentence in split_to_sentences(doc)]
-        encoded_sentences = self.encode_documents(sentences)
+        encoded_sentences = self._encode_documents(sentences)
         similarity_scores = []
 
         for i in range(window_size, len(encoded_sentences)):
@@ -399,28 +431,3 @@ def plot_sentence_similarity_scores(
             f"First sentence after similarity score "
             f"below {threshold}: {sentences[i + window_size]}"
         )
-
-    def __call__(self, docs: List[str]) -> List[DocumentSplit]:
-        if not docs:
-            raise ValueError("At least one document is required for splitting.")
-
-        if len(docs) == 1:
-            token_count = tiktoken_length(docs[0])
-            if token_count > self.max_split_tokens:
-                logger.warning(
-                    f"Single document exceeds the maximum token limit "
-                    f"of {self.max_split_tokens}. "
-                    "Splitting to sentences before semantically splitting."
-                )
-            docs = split_to_sentences(docs[0])
-        encoded_docs = self.encode_documents(docs)
-        similarities = self.calculate_similarity_scores(encoded_docs)
-        if self.dynamic_threshold:
-            self.find_optimal_threshold(docs, similarities)
-        else:
-            self.calculated_threshold = self.encoder.score_threshold
-        split_indices = self.find_split_indices(similarities=similarities)
-        splits = self.split_documents(docs, split_indices, similarities)
-        self.plot_similarity_scores(similarities, split_indices, splits)
-        logger.info(self.statistics)
-        return splits
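Before the second patch, a note on usage: the three `__call__` methods documented in Patch 1 share one contract, taking a list of documents and returning `List[DocumentSplit]`. A minimal usage sketch follows; the class name `RollingWindowSplitter` and the `OpenAIEncoder` import path are assumptions based on this diff's file paths, not names the diff itself shows.

```python
# Sketch only: class and encoder names are assumed, not taken from this diff.
from semantic_router.encoders import OpenAIEncoder  # assumed import path
from semantic_router.splitters.rolling_window import RollingWindowSplitter  # assumed class name

encoder = OpenAIEncoder()  # any encoder exposing a score_threshold attribute
# Other constructor parameters seen in the diff (window_size, max_split_tokens,
# min_split_tokens, dynamic_threshold, ...) are assumed to have defaults.
splitter = RollingWindowSplitter(encoder=encoder)

# __call__ expects a list of documents; as the new docstrings note, wrap a
# single document in a one-element list.
splits = splitter(["A long document to be split into semantic chunks..."])
for split in splits:
    print(split)  # each element is a DocumentSplit
```

Per the new `__call__` above, passing a single document whose token count exceeds `max_split_tokens` triggers sentence pre-splitting before the semantic split.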
From 97fa1d3098d8e5be965d5cf0c29bd974c17f68d3 Mon Sep 17 00:00:00 2001
From: James Briggs
Date: Sat, 2 Mar 2024 17:57:27 +0800
Subject: [PATCH 2/2] update course URL

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 28755c79..d8cb1a83 100644
--- a/README.md
+++ b/README.md
@@ -133,7 +133,7 @@ In this case, no decision could be made as we had no matches — so our route la
 
 ### Online Course
 
-[![Semantic Router Course](https://github.com/aurelio-labs/assets/blob/main/images/aurelio-1080p-header-dark-semantic-router.jpg)](https://www.youtube.com/watch?v=ro312jDqAh0&list=PLIUOU7oqGTLhYDPiDKlALecva3jab531-&index=1)
+[![Semantic Router Course](https://github.com/aurelio-labs/assets/blob/main/images/aurelio-1080p-header-dark-semantic-router.jpg)](https://www.aurelio.ai/course/semantic-router)
 
 ### Community
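Returning to Patch 1's `_find_optimal_threshold`: the hunks above show a binary search over the similarity threshold (`self.calculated_threshold = (low + high) / 2`, re-running `_find_split_indices` each iteration) until the median split size in tokens lands in the configured range. The sketch below reproduces that idea in isolation; the function name, defaults, and iteration cap are illustrative assumptions, not the library's exact implementation.

```python
import numpy as np

def find_optimal_threshold(similarities: list, token_counts: list,
                           min_tokens: int = 100, max_tokens: int = 300,
                           max_iterations: int = 32) -> float:
    """Binary-search a similarity threshold so that the median chunk size,
    measured in tokens, falls within [min_tokens, max_tokens].
    Illustrative sketch only; names and stopping policy are simplified."""
    low, high = min(similarities), max(similarities)
    threshold = (low + high) / 2
    for _ in range(max_iterations):
        threshold = (low + high) / 2
        # Split after every position whose similarity falls below the threshold,
        # mirroring _find_split_indices in the patch above.
        split_indices = [i + 1 for i, s in enumerate(similarities) if s < threshold]
        boundaries = [0] + split_indices + [len(token_counts)]
        chunk_tokens = [sum(token_counts[a:b])
                        for a, b in zip(boundaries, boundaries[1:])]
        median = float(np.median(chunk_tokens))
        if min_tokens <= median <= max_tokens:
            break  # median chunk size is in range: threshold found
        if median < min_tokens:
            high = threshold  # chunks too small: search lower thresholds (fewer splits)
        else:
            low = threshold   # chunks too large: search higher thresholds (more splits)
    return threshold
```

A higher threshold marks more positions as dissimilar and so produces more, smaller chunks, which is why the search lowers the ceiling when chunks come out too small and raises the floor when they come out too large.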