Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vectorstore: use a retriever query for hybrid search #2666

Merged
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 34 additions & 19 deletions elasticsearch/helpers/vectorstore/_async/strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,31 +283,46 @@ def _hybrid(
) -> Dict[str, Any]:
# Add a query to the knn query.
# RRF is used to even the score from the knn query and text query
# RRF has two optional parameters: {'rank_constant':int, 'window_size':int}
# RRF has two optional parameters: {'rank_constant':int, 'rank_window_size':int}
# https://www.elastic.co/guide/en/elasticsearch/reference/current/rrf.html
rrf_options = {}
if isinstance(self.rrf, Dict):
if "rank_constant" in self.rrf:
rrf_options["rank_constant"] = self.rrf["rank_constant"]
if "window_size" in self.rrf:
# 'window_size' was renamed to 'rank_window_size', but we support
# the older name for backwards compatibility
rrf_options["rank_window_size"] = self.rrf["window_size"]
if "rank_window_size" in self.rrf:
rrf_options["rank_window_size"] = self.rrf["rank_window_size"]
query_body = {
"knn": knn,
"query": {
"bool": {
"must": [
"retriever": {
"rrf": {
"retrievers": [
{
"match": {
self.text_field: {
"query": query,
}
}
}
"standard": {
"query": {
"bool": {
"must": [
{
"match": {
self.text_field: {
"query": query,
}
}
}
],
"filter": filter,
}
},
},
},
{"knn": knn},
],
"filter": filter,
}
**rrf_options,
},
},
}

if isinstance(self.rrf, Dict):
query_body["rank"] = {"rrf": self.rrf}
elif isinstance(self.rrf, bool) and self.rrf is True:
query_body["rank"] = {"rrf": {}}

return query_body

def needs_inference(self) -> bool:
Expand Down
53 changes: 34 additions & 19 deletions elasticsearch/helpers/vectorstore/_sync/strategies.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,31 +283,46 @@ def _hybrid(
) -> Dict[str, Any]:
# Add a query to the knn query.
# RRF is used to even the score from the knn query and text query
# RRF has two optional parameters: {'rank_constant':int, 'window_size':int}
# RRF has two optional parameters: {'rank_constant':int, 'rank_window_size':int}
# https://www.elastic.co/guide/en/elasticsearch/reference/current/rrf.html
rrf_options = {}
if isinstance(self.rrf, Dict):
if "rank_constant" in self.rrf:
rrf_options["rank_constant"] = self.rrf["rank_constant"]
if "window_size" in self.rrf:
# 'window_size' was renamed to 'rank_window_size', but we support
# the older name for backwards compatibility
rrf_options["rank_window_size"] = self.rrf["window_size"]
if "rank_window_size" in self.rrf:
rrf_options["rank_window_size"] = self.rrf["rank_window_size"]
query_body = {
"knn": knn,
"query": {
"bool": {
"must": [
"retriever": {
"rrf": {
"retrievers": [
{
"match": {
self.text_field: {
"query": query,
}
}
}
"standard": {
"query": {
"bool": {
"must": [
{
"match": {
self.text_field: {
"query": query,
}
}
}
],
"filter": filter,
}
},
},
},
{"knn": knn},
],
"filter": filter,
}
**rrf_options,
},
},
}

if isinstance(self.rrf, Dict):
query_body["rank"] = {"rrf": self.rrf}
elif isinstance(self.rrf, bool) and self.rrf is True:
query_body["rank"] = {"rrf": {}}

return query_body

def needs_inference(self) -> bool:
Expand Down
181 changes: 129 additions & 52 deletions test_elasticsearch/test_server/test_vectorstore/test_vectorstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
VectorStore,
)
from elasticsearch.helpers.vectorstore._sync._utils import model_is_deployed
from test_elasticsearch.utils import es_version

from . import ConsistentFakeEmbeddings, FakeEmbeddings

Expand Down Expand Up @@ -337,6 +338,9 @@ def test_search_knn_with_hybrid_search(
self, sync_client: Elasticsearch, index: str
) -> None:
"""Test end to end construction and search with metadata."""
if es_version(sync_client) < (8, 14):
pytest.skip("This test requires Elasticsearch 8.14 or newer")

store = VectorStore(
index=index,
retrieval_strategy=DenseVectorStrategy(hybrid=True),
Expand All @@ -349,20 +353,48 @@ def test_search_knn_with_hybrid_search(

def assert_query(query_body: dict, query: Optional[str]) -> dict:
assert query_body == {
"knn": {
"field": "vector_field",
"filter": [],
"k": 1,
"num_candidates": 50,
"query_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0],
},
"query": {
"bool": {
"filter": [],
"must": [{"match": {"text_field": {"query": "foo"}}}],
"retriever": {
"rrf": {
"retrievers": [
{
"standard": {
"query": {
"bool": {
"filter": [],
"must": [
{
"match": {
"text_field": {"query": "foo"}
}
}
],
}
},
},
},
{
"knn": {
"field": "vector_field",
"filter": [],
"k": 1,
"num_candidates": 50,
"query_vector": [
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
0.0,
],
},
},
],
}
},
"rank": {"rrf": {}},
}
}
return query_body

Expand All @@ -373,6 +405,9 @@ def test_search_knn_with_hybrid_search_rrf(
self, sync_client: Elasticsearch, index: str
) -> None:
"""Test end to end construction and rrf hybrid search with metadata."""
if es_version(sync_client) < (8, 14):
pytest.skip("This test requires Elasticsearch 8.14 or newer")

texts = ["foo", "bar", "baz"]

def assert_query(
Expand All @@ -381,36 +416,52 @@ def assert_query(
expected_rrf: Union[dict, bool],
) -> dict:
cmp_query_body = {
"knn": {
"field": "vector_field",
"filter": [],
"k": 3,
"num_candidates": 50,
"query_vector": [
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
0.0,
],
},
"query": {
"bool": {
"filter": [],
"must": [{"match": {"text_field": {"query": "foo"}}}],
"retriever": {
"rrf": {
"retrievers": [
{
"standard": {
"query": {
"bool": {
"filter": [],
"must": [
{
"match": {
"text_field": {"query": "foo"}
}
}
],
}
},
},
},
{
"knn": {
"field": "vector_field",
"filter": [],
"k": 3,
"num_candidates": 50,
"query_vector": [
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
0.0,
],
},
},
],
}
},
}
}

if isinstance(expected_rrf, dict):
cmp_query_body["rank"] = {"rrf": expected_rrf}
elif isinstance(expected_rrf, bool) and expected_rrf is True:
cmp_query_body["rank"] = {"rrf": {}}
cmp_query_body["retriever"]["rrf"].update(expected_rrf)

assert query_body == cmp_query_body

Expand All @@ -420,7 +471,7 @@ def assert_query(
rrf_test_cases: List[Union[dict, bool]] = [
True,
False,
{"rank_constant": 1, "window_size": 5},
{"rank_constant": 1, "rank_window_size": 5},
]
for rrf_test_case in rrf_test_cases:
store = VectorStore(
Expand All @@ -441,21 +492,47 @@ def assert_query(
# 2. check query result is okay
es_output = store.client.search(
index=index,
query={
"bool": {
"filter": [],
"must": [{"match": {"text_field": {"query": "foo"}}}],
retriever={
"rrf": {
"retrievers": [
{
"knn": {
"field": "vector_field",
"filter": [],
"k": 3,
"num_candidates": 50,
"query_vector": [
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
1.0,
0.0,
],
},
},
{
"standard": {
"query": {
"bool": {
"filter": [],
"must": [
{"match": {"text_field": {"query": "foo"}}}
],
}
},
},
},
],
"rank_constant": 1,
"rank_window_size": 5,
}
},
knn={
"field": "vector_field",
"filter": [],
"k": 3,
"num_candidates": 50,
"query_vector": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0],
},
size=3,
rank={"rrf": {"rank_constant": 1, "window_size": 5}},
)

assert [o["_source"]["text_field"] for o in output] == [
Expand Down
Loading