-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CrateDB: Vector Store -- make it work using CrateDB's vector_similarity()
#31
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -232,7 +232,7 @@ def test_cratedb_with_metadatas_with_scores() -> None: | |
pre_delete_collection=True, | ||
) | ||
output = docsearch.similarity_search_with_score("foo", k=1) | ||
assert output == [(Document(page_content="foo", metadata={"page": "0"}), 2.0)] | ||
assert output == [(Document(page_content="foo", metadata={"page": "0"}), 1.0)] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This spot, and others, perfectly demonstrates that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, |
||
|
||
|
||
def test_cratedb_with_filter_match() -> None: | ||
|
@@ -250,9 +250,7 @@ def test_cratedb_with_filter_match() -> None: | |
# TODO: Original: | ||
# assert output == [(Document(page_content="foo", metadata={"page": "0"}), 0.0)] # noqa: E501 | ||
output = docsearch.similarity_search_with_score("foo", k=1, filter={"page": "0"}) | ||
assert output == [ | ||
(Document(page_content="foo", metadata={"page": "0"}), pytest.approx(2.2, 0.3)) | ||
] | ||
assert output == [(Document(page_content="foo", metadata={"page": "0"}), 1.0)] | ||
|
||
|
||
def test_cratedb_with_filter_distant_match() -> None: | ||
|
@@ -269,9 +267,7 @@ def test_cratedb_with_filter_distant_match() -> None: | |
) | ||
output = docsearch.similarity_search_with_score("foo", k=2, filter={"page": "2"}) | ||
# Original score value: 0.0013003906671379406 | ||
assert output == [ | ||
(Document(page_content="baz", metadata={"page": "2"}), pytest.approx(1.5, 0.2)) | ||
] | ||
assert output == [(Document(page_content="baz", metadata={"page": "2"}), 0.2)] | ||
|
||
|
||
def test_cratedb_with_filter_no_match() -> None: | ||
|
@@ -425,8 +421,8 @@ def test_cratedb_with_filter_in_set() -> None: | |
) | ||
# Original score values: 0.0, 0.0013003906671379406 | ||
assert output == [ | ||
(Document(page_content="foo", metadata={"page": "0"}), pytest.approx(3.0, 0.1)), | ||
(Document(page_content="baz", metadata={"page": "2"}), pytest.approx(2.2, 0.1)), | ||
(Document(page_content="foo", metadata={"page": "0"}), 1.0), | ||
(Document(page_content="baz", metadata={"page": "2"}), 0.2), | ||
] | ||
|
||
|
||
|
@@ -474,9 +470,9 @@ def test_cratedb_relevance_score() -> None: | |
output = docsearch.similarity_search_with_relevance_scores("foo", k=3) | ||
# Original score values: 1.0, 0.9996744261675065, 0.9986996093328621 | ||
assert output == [ | ||
(Document(page_content="foo", metadata={"page": "0"}), pytest.approx(1.4, 0.1)), | ||
(Document(page_content="bar", metadata={"page": "1"}), pytest.approx(1.1, 0.1)), | ||
(Document(page_content="baz", metadata={"page": "2"}), pytest.approx(0.8, 0.1)), | ||
(Document(page_content="foo", metadata={"page": "0"}), 0.7071067811865475), | ||
(Document(page_content="bar", metadata={"page": "1"}), 0.35355339059327373), | ||
(Document(page_content="baz", metadata={"page": "2"}), 0.1414213562373095), | ||
Comment on lines
470
to
+475
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is why we need to have a custom |
||
] | ||
|
||
|
||
|
@@ -495,9 +491,9 @@ def test_cratedb_retriever_search_threshold() -> None: | |
|
||
retriever = docsearch.as_retriever( | ||
search_type="similarity_score_threshold", | ||
search_kwargs={"k": 3, "score_threshold": 0.999}, | ||
search_kwargs={"k": 3, "score_threshold": 0.35}, # Original value: 0.999 | ||
Comment on lines
-498
to
+494
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's certainly an anomaly that had to be applied to an input value now, in order to get the expected result here. |
||
) | ||
output = retriever.get_relevant_documents("summer") | ||
output = retriever.invoke("summer") | ||
assert output == [ | ||
Document(page_content="foo", metadata={"page": "0"}), | ||
Document(page_content="bar", metadata={"page": "1"}), | ||
|
@@ -522,7 +518,7 @@ def test_cratedb_retriever_search_threshold_custom_normalization_fn() -> None: | |
search_type="similarity_score_threshold", | ||
search_kwargs={"k": 3, "score_threshold": 0.5}, | ||
) | ||
output = retriever.get_relevant_documents("foo") | ||
output = retriever.invoke("foo") | ||
assert output == [] | ||
|
||
|
||
|
@@ -551,7 +547,7 @@ def test_cratedb_max_marginal_relevance_search_with_score() -> None: | |
pre_delete_collection=True, | ||
) | ||
output = docsearch.max_marginal_relevance_search_with_score("foo", k=1, fetch_k=3) | ||
assert output == [(Document(page_content="foo"), 2.0)] | ||
assert output == [(Document(page_content="foo"), 1.0)] | ||
|
||
|
||
def test_cratedb_multicollection_search_success() -> None: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
See also:
UnsupportedOperationException: Can't handle Symbol [ParameterSymbol: $1]]
when using JOINs and parameters to an aliased and sortedvector_similarity()
together crate/crate#16912There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not quite. The snippet above uses a workaround, by marshalling the
embedding
argument tovector_similarity()
manually, instead of submitting it as an SQL parameter. In this spirit, we can proceed, and submit an improvement later.If you agree with this patch, and then also with langchain-ai#27710 in its current form, we can toggle it ready to be reviewed by upstream maintainers.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If you are ready for a bigger review, I am fine with toggling it ready 😄