-
Notifications
You must be signed in to change notification settings - Fork 37
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor: refactored embedder and chroma client
- Loading branch information
1 parent
de6eda4
commit 0594b3f
Showing
6 changed files
with
228 additions
and
280 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,80 +1,49 @@ | ||
from abc import ABC, abstractmethod | ||
from typing import Any | ||
|
||
import sentence_transformers | ||
|
||
class Embedder(ABC): | ||
@abstractmethod | ||
def embed_documents(self, texts: list[str]) -> list[list[float]]: | ||
"""Embed search docs.""" | ||
|
||
@abstractmethod | ||
def embed_query(self, text: str) -> list[float]: | ||
"""Embed query text.""" | ||
|
||
|
||
class HuggingFaceEmbedder(Embedder): | ||
"""HuggingFace sentence_transformers embedding models. | ||
To use, you should have the ``sentence_transformers`` python package installed. | ||
""" | ||
|
||
client: Any #: :meta private: | ||
model_name: str = "all-MiniLM-L6-v2" | ||
"""Model name to use.""" | ||
cache_folder: str | None = None | ||
"""Path to store models. | ||
Can also be set via the SENTENCE_TRANSFORMERS_HOME environment variable.""" | ||
model_kwargs: dict[str, Any] = {} | ||
"""Keyword arguments to pass to the model.""" | ||
encode_kwargs: dict[str, Any] = {} | ||
"""Keyword arguments to pass when calling the `encode` method of the model.""" | ||
multi_process: bool = False | ||
"""Run encode() on multiple GPUs.""" | ||
|
||
def __init__(self, **kwargs: Any): | ||
"""Initialize the sentence_transformer.""" | ||
super().__init__(**kwargs) | ||
try: | ||
import sentence_transformers | ||
|
||
except ImportError as exc: | ||
raise ImportError( | ||
"Could not import sentence_transformers python package. " | ||
"Please install it with `pip install sentence-transformers`." | ||
) from exc | ||
class Embedder: | ||
def __init__(self, model_name: str = "all-MiniLM-L6-v2", cache_folder: str | None = None, **kwargs: Any): | ||
""" | ||
Initialize the Embedder class with the specified parameters. | ||
self.client = sentence_transformers.SentenceTransformer( | ||
self.model_name, cache_folder=self.cache_folder, **self.model_kwargs | ||
) | ||
Args: | ||
**kwargs (Any): Additional keyword arguments to pass to the SentenceTransformer model. | ||
""" | ||
self.client = sentence_transformers.SentenceTransformer(model_name, cache_folder=cache_folder, **kwargs) | ||
|
||
def embed_documents(self, texts: list[str]) -> list[list[float]]: | ||
"""Compute doc embeddings using a HuggingFace transformer model. | ||
def embed_documents(self, texts: list[str], multi_process: bool = False, **encode_kwargs: Any) -> list[list[float]]: | ||
""" | ||
Compute document embeddings using a transformer model. | ||
Args: | ||
texts: The list of texts to embed. | ||
texts (list[str]): The list of texts to embed. | ||
multi_process (bool): If True, use multiple processes to compute embeddings. | ||
**encode_kwargs (Any): Additional keyword arguments to pass when calling the `encode` method of the model. | ||
Returns: | ||
List of embeddings, one for each text. | ||
list[list[float]]: A list of embeddings, one for each text. | ||
""" | ||
import sentence_transformers | ||
|
||
texts = list(map(lambda x: x.replace("\n", " "), texts)) | ||
if self.multi_process: | ||
if multi_process: | ||
pool = self.client.start_multi_process_pool() | ||
embeddings = self.client.encode_multi_process(texts, pool) | ||
sentence_transformers.SentenceTransformer.stop_multi_process_pool(pool) | ||
else: | ||
embeddings = self.client.encode(texts, **self.encode_kwargs) | ||
embeddings = self.client.encode(texts, show_progress_bar=True, **encode_kwargs) | ||
|
||
return embeddings.tolist() | ||
|
||
def embed_query(self, text: str) -> list[float]: | ||
"""Compute query embeddings using a HuggingFace transformer model. | ||
""" | ||
Compute query embeddings using a transformer model. | ||
Args: | ||
text: The text to embed. | ||
text (str): The text to embed. | ||
Returns: | ||
Embeddings for the text. | ||
list[float]: Embeddings for the text. | ||
""" | ||
return self.embed_documents([text])[0] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.