Skip to content

Commit

Permalink
Fixed bug for mixed encoding types from numpy (#253)
Browse files (browse the repository at this point in the history)
  • Loading branch information
whitead authored Mar 12, 2024
1 parent a512c2f commit 6958535
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 2 deletions.
3 changes: 2 additions & 1 deletion paperqa/llms.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,8 @@ async def embed_documents(self, client, texts) -> list[list[float]]: # noqa: AR
enc_batch = self.enc.encode_ordinary_batch(texts)
# now get frequency of each token rel to length
return [
- np.bincount([xi % self.ndim for xi in x], minlength=self.ndim) / len(x)
+ np.bincount([xi % self.ndim for xi in x], minlength=self.ndim).astype(float)
+ / len(x)
for x in enc_batch
]

Expand Down
2 changes: 1 addition & 1 deletion paperqa/version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
- __version__ = "4.2.0"
+ __version__ = "4.2.1"
6 changes: 6 additions & 0 deletions tests/test_paperqa.py
Original file line number Diff line number Diff line change
Expand Up @@ -743,6 +743,9 @@ def test_sparse_embedding():
)
assert any(docs.docs["test"].embedding) # type: ignore[arg-type]

# check the embeddings are the same size
assert np.shape(docs.texts[0].embedding) == np.shape(docs.texts[1].embedding)

# test alias
docs = Docs(embedding="sparse")
assert docs._embedding_client is None
Expand Down Expand Up @@ -775,6 +778,9 @@ def test_hybrid_embedding():
)
assert any(docs.docs["test"].embedding) # type: ignore[arg-type]

# check the embeddings are the same size
assert np.shape(docs.texts[0].embedding) == np.shape(docs.texts[1].embedding)

# now try via alias
docs = Docs(
embedding="hybrid-text-embedding-3-small",
Expand Down

0 comments on commit 6958535

Please sign in to comment.