From 582459780dfe4025111890ed936fb1cb02b54288 Mon Sep 17 00:00:00 2001 From: SeeknnDestroy Date: Sun, 31 Dec 2023 19:25:11 +0300 Subject: [PATCH 1/6] implement setup connection method --- autollm/utils/lancedb_vectorstore.py | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/autollm/utils/lancedb_vectorstore.py b/autollm/utils/lancedb_vectorstore.py index 13996733..803a5ecc 100644 --- a/autollm/utils/lancedb_vectorstore.py +++ b/autollm/utils/lancedb_vectorstore.py @@ -9,6 +9,7 @@ class LanceDBVectorStore(LanceDBVectorStoreBase): + """Advanced LanceDB Vector Store supporting cloud storage and prefiltering.""" def __init__( self, @@ -20,26 +21,27 @@ def __init__( region: Optional[str] = None, **kwargs: Any, ) -> None: + """Init params.""" + self._setup_connection(uri, api_key, region) + self.uri = uri + self.table_name = table_name + self.nprobes = nprobes + self.refine_factor = refine_factor + self.api_key = api_key + self.region = region + + def _setup_connection(self, uri: str, api_key: Optional[str], region: Optional[str]): + """Establishes a robust connection to LanceDB.""" + api_key = api_key or os.getenv('LANCEDB_API_KEY') + region = region or os.getenv('LANCEDB_REGION') + import_err_msg = "`lancedb` package not found, please run `pip install lancedb`" try: import lancedb except ImportError: raise ImportError(import_err_msg) - # Check for API key and region in environment variables if not provided - if api_key is None: - api_key = os.getenv('LANCEDB_API_KEY') - if region is None: - region = os.getenv('LANCEDB_REGION') - if api_key and region: self.connection = lancedb.connect(uri, api_key=api_key, region=region) else: self.connection = lancedb.connect(uri) - - self.uri = uri - self.table_name = table_name - self.nprobes = nprobes - self.refine_factor = refine_factor - self.api_key = api_key - self.region = region From b89ab6a07022c8c2e596abf758957f5604ac31bc Mon Sep 17 00:00:00 2001 From: SeeknnDestroy Date: Sun, 31 Dec 2023 20:07:48 +0300 Subject: [PATCH 2/6] implement prefilter feature --- autollm/utils/lancedb_vectorstore.py | 59 ++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/autollm/utils/lancedb_vectorstore.py b/autollm/utils/lancedb_vectorstore.py index 803a5ecc..b45e35e4 100644 --- a/autollm/utils/lancedb_vectorstore.py +++ b/autollm/utils/lancedb_vectorstore.py @@ -3,13 +3,19 @@ from typing import Any, Optional from dotenv import load_dotenv +from llama_index.schema import NodeRelationship, RelatedNodeInfo, TextNode from llama_index.vector_stores import LanceDBVectorStore as LanceDBVectorStoreBase +from llama_index.vector_stores.lancedb import _to_lance_filter, _to_llama_similarities +from llama_index.vector_stores.types import VectorStoreQuery, VectorStoreQueryResult +from pandas import DataFrame load_dotenv() class LanceDBVectorStore(LanceDBVectorStoreBase): """Advanced LanceDB Vector Store supporting cloud storage and prefiltering.""" + from lancedb.query import LanceQueryBuilder + from lancedb.table import Table def __init__( self, @@ -45,3 +51,56 @@ def _setup_connection(self, uri: str, api_key: Optional[str], region: Optional[s self.connection = lancedb.connect(uri, api_key=api_key, region=region) else: self.connection = lancedb.connect(uri) + + def query( + self, + query: VectorStoreQuery, + where: Optional[str] = None, + prefilter: bool = False, + ) -> VectorStoreQueryResult: + """Enhanced query method to support prefiltering in LanceDB queries.""" + table = self.connection.open_table(self.table_name) + lance_query = self._prepare_lance_query(query, table, where, prefilter) + + results = lance_query.to_df() + return self._construct_query_result(results) + + def _prepare_lance_query( + self, query: VectorStoreQuery, table: Table, where: str, prefilter: bool) -> LanceQueryBuilder: + """Prepares the LanceDB query considering prefiltering and additional parameters.""" + if query.filters is not None: + if where: + raise ValueError( + "Cannot specify filter via both query and lance-specific " + "Use kwargs only for lancedb specific items that are " + "not supported via the generic query interface.") + where = _to_lance_filter(query.filters) + + table = self.connection.open_table(self.table_name) + lance_query = ( + table.search(query.query_embedding).limit(query.similarity_top_k).where( + where, prefilter=prefilter).nprobes(self.nprobes)) + + if self.refine_factor is not None: + lance_query.refine_factor(self.refine_factor) + + return lance_query + + def _construct_query_result(self, results: DataFrame) -> VectorStoreQueryResult: + """Constructs a VectorStoreQueryResult from a LanceDB query result.""" + nodes = [] + + for _, row in results.iterrows(): + node = TextNode( + text=row.get('text', ''), # ensure text is a string + id_=row['id'], + relationships={ + NodeRelationship.SOURCE: RelatedNodeInfo(node_id=row['doc_id']), + }) + nodes.append(node) + + return VectorStoreQueryResult( + nodes=nodes, + similarities=_to_llama_similarities(results), + ids=results["id"].tolist(), + ) From 0c341abb2ad7c8c3ad1e15f5b2892706654f03f3 Mon Sep 17 00:00:00 2001 From: SeeknnDestroy Date: Sun, 31 Dec 2023 20:15:47 +0300 Subject: [PATCH 3/6] fixes.. --- autollm/utils/lancedb_vectorstore.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/autollm/utils/lancedb_vectorstore.py b/autollm/utils/lancedb_vectorstore.py index b45e35e4..2b4e1a5d 100644 --- a/autollm/utils/lancedb_vectorstore.py +++ b/autollm/utils/lancedb_vectorstore.py @@ -55,26 +55,27 @@ def _setup_connection(self, uri: str, api_key: Optional[str], region: Optional[s def query( self, query: VectorStoreQuery, - where: Optional[str] = None, - prefilter: bool = False, + **kwargs: Any, ) -> VectorStoreQueryResult: """Enhanced query method to support prefiltering in LanceDB queries.""" table = self.connection.open_table(self.table_name) - lance_query = self._prepare_lance_query(query, table, where, prefilter) + lance_query = self._prepare_lance_query(query, table, **kwargs) results = lance_query.to_df() return self._construct_query_result(results) - def _prepare_lance_query( - self, query: VectorStoreQuery, table: Table, where: str, prefilter: bool) -> LanceQueryBuilder: + def _prepare_lance_query(self, query: VectorStoreQuery, table: Table, **kwargs) -> LanceQueryBuilder: """Prepares the LanceDB query considering prefiltering and additional parameters.""" if query.filters is not None: - if where: + if "where" in kwargs: raise ValueError( - "Cannot specify filter via both query and lance-specific " + "Cannot specify filter via both query and kwargs. " "Use kwargs only for lancedb specific items that are " "not supported via the generic query interface.") where = _to_lance_filter(query.filters) + else: + where = kwargs.pop("where", None) + prefilter = kwargs.pop("prefilter", False) table = self.connection.open_table(self.table_name) lance_query = ( From 57db4c9245155ff2cc9acad376fe72b1d86027b3 Mon Sep 17 00:00:00 2001 From: SeeknnDestroy Date: Sun, 31 Dec 2023 20:56:42 +0300 Subject: [PATCH 4/6] more fixes --- autollm/auto/vector_store_index.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/autollm/auto/vector_store_index.py b/autollm/auto/vector_store_index.py index 9ca267b1..181a61ef 100644 --- a/autollm/auto/vector_store_index.py +++ b/autollm/auto/vector_store_index.py @@ -85,8 +85,6 @@ def from_defaults( region=lancedb_region, **kwargs) - vector_store = VectorStoreClass(uri=lancedb_uri, table_name=lancedb_table_name, **kwargs) - else: vector_store = VectorStoreClass(**kwargs) From 6972a03c73b068ea7b39ea7505ecc9df00b24570 Mon Sep 17 00:00:00 2001 From: SeeknnDestroy Date: Sun, 31 Dec 2023 21:06:11 +0300 Subject: [PATCH 5/6] minor fix --- autollm/utils/lancedb_vectorstore.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autollm/utils/lancedb_vectorstore.py b/autollm/utils/lancedb_vectorstore.py index 2b4e1a5d..33fc10f7 100644 --- a/autollm/utils/lancedb_vectorstore.py +++ b/autollm/utils/lancedb_vectorstore.py @@ -36,7 +36,7 @@ def __init__( self.api_key = api_key self.region = region - def _setup_connection(self, uri: str, api_key: Optional[str], region: Optional[str]): + def _setup_connection(self, uri: str, api_key: Optional[str] = None, region: Optional[str] = None): """Establishes a robust connection to LanceDB.""" api_key = api_key or os.getenv('LANCEDB_API_KEY') region = region or os.getenv('LANCEDB_REGION') From a9500447c49831eb344f25f02f2e306982f11d47 Mon Sep 17 00:00:00 2001 From: SeeknnDestroy Date: Sun, 31 Dec 2023 21:07:32 +0300 Subject: [PATCH 6/6] bumpt autollm --- autollm/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autollm/__init__.py b/autollm/__init__.py index 763b64ba..12519bbd 100644 --- a/autollm/__init__.py +++ b/autollm/__init__.py @@ -4,7 +4,7 @@ and vector databases, along with various utility functions. """ -__version__ = '0.1.5' +__version__ = '0.1.6' __author__ = 'safevideo' __license__ = 'AGPL-3.0'