From 695853590fc01db6df163af51097008d67ae2f34 Mon Sep 17 00:00:00 2001
From: Andrew White
Date: Mon, 11 Mar 2024 19:29:45 -0700
Subject: [PATCH] Fixed bug for mixed encoding types from numpy (#253)

---
 paperqa/llms.py       | 3 ++-
 paperqa/version.py    | 2 +-
 tests/test_paperqa.py | 6 ++++++
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/paperqa/llms.py b/paperqa/llms.py
index 2f5ef8a6..a39bb137 100644
--- a/paperqa/llms.py
+++ b/paperqa/llms.py
@@ -126,7 +126,8 @@ async def embed_documents(self, client, texts) -> list[list[float]]:  # noqa: AR
         enc_batch = self.enc.encode_ordinary_batch(texts)
         # now get frequency of each token rel to length
         return [
-            np.bincount([xi % self.ndim for xi in x], minlength=self.ndim) / len(x)
+            np.bincount([xi % self.ndim for xi in x], minlength=self.ndim).astype(float)
+            / len(x)
             for x in enc_batch
         ]
 
diff --git a/paperqa/version.py b/paperqa/version.py
index 0fd7811c..aef46acb 100644
--- a/paperqa/version.py
+++ b/paperqa/version.py
@@ -1 +1 @@
-__version__ = "4.2.0"
+__version__ = "4.2.1"
diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py
index b82f3663..7b188c0e 100644
--- a/tests/test_paperqa.py
+++ b/tests/test_paperqa.py
@@ -743,6 +743,9 @@ def test_sparse_embedding():
     )
     assert any(docs.docs["test"].embedding)  # type: ignore[arg-type]
 
+    # check the embeddings are the same size
+    assert np.shape(docs.texts[0].embedding) == np.shape(docs.texts[1].embedding)
+
     # test alias
     docs = Docs(embedding="sparse")
     assert docs._embedding_client is None
@@ -775,6 +778,9 @@ def test_hybrid_embedding():
     )
     assert any(docs.docs["test"].embedding)  # type: ignore[arg-type]
 
+    # check the embeddings are the same size
+    assert np.shape(docs.texts[0].embedding) == np.shape(docs.texts[1].embedding)
+
     # now try via alias
     docs = Docs(
         embedding="hybrid-text-embedding-3-small",