Skip to content
This repository has been archived by the owner on Sep 12, 2024. It is now read-only.

refactor AutoVectorStore and move methods to db_utils #11

Merged
merged 9 commits into from
Oct 16, 2023
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions autollm/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""QuickLLM: A Base Package for Large Language Model Applications.
"""AutoLLM: A Base Package for Large Language Model Applications.

This package provides automated integrations with leading large language models
and vector databases, along with various utility functions.
Expand All @@ -11,6 +11,6 @@
from autollm.auto.llm import AutoLLM
from autollm.auto.query_engine import AutoQueryEngine
from autollm.auto.service_context import AutoServiceContext
from autollm.auto.vector_store import AutoVectorStore
from autollm.auto.vector_store_index import AutoVectorStoreIndex

__all__ = ['AutoLLM', 'AutoServiceContext', 'AutoVectorStore', 'AutoQueryEngine']
__all__ = ['AutoLLM', 'AutoServiceContext', 'AutoVectorStoreIndex', 'AutoQueryEngine']
24 changes: 11 additions & 13 deletions autollm/auto/query_engine.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,39 @@
from llama_index import ServiceContext
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.indices.query.base import BaseQueryEngine

from autollm.auto.llm import AutoLLM
from autollm.auto.service_context import AutoServiceContext
from autollm.auto.vector_store import AutoVectorStore
from autollm.vectorstores.base import BaseVS
from autollm.auto.vector_store_index import AutoVectorStoreIndex


class AutoQueryEngine:
"""AutoQueryEngine for query execution and optionally logging the query cost."""

@staticmethod
def from_instances(vector_store: BaseVS, service_context: ServiceContext, **kwargs) -> BaseQueryEngine:
def from_instances(
vector_store_index: VectorStoreIndex, service_context: ServiceContext,
**kwargs) -> BaseQueryEngine:
"""
Create an AutoQueryEngine from a vector store and a service context.
Create an AutoQueryEngine from a vector store index and a service context.

Parameters:
vector_store: Vector store instance.
vector_store_index: Vector store index instance.
service_context: Service context instance.
**kwargs: Keyword arguments for the query engine.

Returns:
An AutoQueryEngine instance.
"""

return vector_store.vectorindex.as_query_engine(service_context=service_context, **kwargs)
return vector_store_index.as_query_engine(service_context=service_context, **kwargs)

@staticmethod
def from_parameters(
system_prompt: str = None,
query_wrapper_prompt: str = None,
enable_cost_calculator: bool = True,
llm_params: dict = None,
vector_store_params: dict = {"vector_store_type": "in_memory"},
vector_store_params: dict = {"vector_store_type": "VectorStoreIndex"},
service_context_params: dict = None,
query_engine_params: dict = None) -> BaseQueryEngine:
"""
Expand All @@ -57,15 +58,12 @@ def from_parameters(
query_engine_params = {} if query_engine_params is None else query_engine_params

llm = AutoLLM.from_defaults(**llm_params)
vector_store = AutoVectorStore.from_defaults(**vector_store_params)
vector_store.initialize_vectorindex()
vector_store.connect_vectorstore()
vector_store_index = AutoVectorStoreIndex.from_defaults(**vector_store_params)
service_context = AutoServiceContext.from_defaults(
llm=llm,
system_prompt=system_prompt,
query_wrapper_prompt=query_wrapper_prompt,
enable_cost_calculator=enable_cost_calculator,
**service_context_params)

return vector_store.vectorindex.as_query_engine(
service_context=service_context, **query_engine_params)
return vector_store_index.as_query_engine(service_context=service_context, **query_engine_params)
60 changes: 0 additions & 60 deletions autollm/auto/vector_store.py

This file was deleted.

48 changes: 48 additions & 0 deletions autollm/auto/vector_store_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from typing import Optional, Sequence

from llama_index import Document, VectorStoreIndex


def import_vector_store_class(
        vector_store_class_name: str, module_name: str = "llama_index.vector_stores"):
    """
    Imports a vector store class by class name.

    Args:
        vector_store_class_name (str): Name of the class to look up,
            e.g. 'PineconeVectorStore'.
        module_name (str): Dotted path of the module that exposes the class.
            Defaults to 'llama_index.vector_stores'.

    Returns:
        The imported VectorStore class (the class object itself, not an instance).

    Raises:
        ImportError: If `module_name` cannot be imported.
        AttributeError: If the module does not define `vector_store_class_name`.
    """
    # Local import keeps this helper self-contained within the module.
    import importlib

    module = importlib.import_module(module_name)
    try:
        return getattr(module, vector_store_class_name)
    except AttributeError:
        # Re-raise the same exception type with a message naming both the module
        # and the class, so a typo in the class name is easy to spot.
        raise AttributeError(
            f"Module '{module_name}' has no vector store class named "
            f"'{vector_store_class_name}'.") from None


class AutoVectorStoreIndex:
    """AutoVectorStoreIndex lets you dynamically initialize any Vector Store index based on the vector store
    class name and additional parameters.
    """

    @staticmethod
    def from_defaults(
            vector_store_type: str,
            documents: Optional[Sequence[Document]] = None,
            *args,
            **kwargs) -> VectorStoreIndex:
        """
        Initializes a Vector Store index from Vector Store type and additional parameters.

        Parameters:
            vector_store_type (str): The class name of the vector store
                (e.g., 'PineconeVectorStore'), or 'VectorStoreIndex' to build an
                index directly from `documents`.
            documents (Optional[Sequence[Document]]): Documents used to build the index
                when `vector_store_type` is 'VectorStoreIndex'. Treated as an empty
                collection when omitted. Not used for other vector store types.
            *args: For 'VectorStoreIndex', extra positional arguments forwarded to
                `VectorStoreIndex.from_documents`; otherwise positional arguments for
                the vector store constructor.
            **kwargs: For 'VectorStoreIndex', extra keyword arguments forwarded to
                `VectorStoreIndex.from_documents`; otherwise keyword arguments for
                the vector store constructor.

        Returns:
            index (VectorStoreIndex): The initialized Vector Store index instance for
                the given vector store type and parameter set.
        """
        if vector_store_type == "VectorStoreIndex":
            # Hand the documents over as a flat list. Wrapping the incoming sequence
            # in another list would produce a list-of-lists (or [None] when omitted).
            docs = [] if documents is None else list(documents)
            return VectorStoreIndex.from_documents(docs, *args, **kwargs)

        # import_vector_store_class returns the class itself, so it has to be
        # instantiated with the caller's parameters before an index can wrap it.
        vector_store_class = import_vector_store_class(vector_store_type)
        vector_store = vector_store_class(*args, **kwargs)
        return VectorStoreIndex.from_vector_store(vector_store=vector_store)
165 changes: 127 additions & 38 deletions autollm/utils/db_utils.py
Original file line number Diff line number Diff line change
@@ -1,70 +1,159 @@
# db_utils.py
import logging
from typing import Sequence

from llama_index import Document
import pinecone
SeeknnDestroy marked this conversation as resolved.
Show resolved Hide resolved
from llama_index import Document, StorageContext, VectorStoreIndex
from llama_index.vector_stores import PineconeVectorStore, QdrantVectorStore
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams

from autollm.auto.vector_store import AutoVectorStore
from autollm.utils.constants import DEFAULT_INDEX_NAME, DEFAULT_VECTORE_STORE_TYPE
from autollm.auto.vector_store_index import AutoVectorStoreIndex
from autollm.utils.constants import DEFAULT_INDEX_NAME
from autollm.utils.env_utils import read_env_variable
from autollm.utils.hash_utils import check_for_changes

logger = logging.getLogger(__name__)


def initialize_database(
documents: Sequence[Document], vectore_store_type: str = DEFAULT_VECTORE_STORE_TYPE) -> None:
def initialize_pinecone_index(
        index_name: str, dimension: int = 1536, metric: str = 'euclidean', pod_type: str = 'p1'):
    """
    Create a Pinecone index using credentials taken from the environment.

    Parameters:
        index_name (str): Name of the index to create.
        dimension (int): Vector dimension passed to `pinecone.create_index`.
        metric (str): Metric name passed to `pinecone.create_index`.
        pod_type (str): Pod type passed to `pinecone.create_index`.

    Returns:
        None
    """
    # Credentials come from PINECONE_API_KEY / PINECONE_ENVIRONMENT.
    pinecone.init(
        api_key=read_env_variable('PINECONE_API_KEY'),
        environment=read_env_variable('PINECONE_ENVIRONMENT'),
    )

    index_settings = {'dimension': dimension, 'metric': metric, 'pod_type': pod_type}
    pinecone.create_index(index_name, **index_settings)


def initialize_qdrant_index(index_name: str, size: int = 1536, distance: str = 'EUCLID'):
    """
    (Re)create a Qdrant collection using connection settings from the environment.

    Parameters:
        index_name (str): Name of the collection to create.
        size (int): Vector size passed to `VectorParams`.
        distance (str): Member name of the `Distance` enum (e.g. 'EUCLID').

    Returns:
        None

    NOTE(review): `recreate_collection` presumably replaces an existing collection
    with the same name - confirm before pointing this at a populated collection.
    """
    # Connection settings come from QDRANT_URL / QDRANT_API_KEY.
    qdrant = QdrantClient(
        url=read_env_variable('QDRANT_URL'),
        api_key=read_env_variable('QDRANT_API_KEY'),
    )

    # Look the metric up by enum member name, e.g. 'EUCLID' -> Distance.EUCLID.
    vector_params = VectorParams(size=size, distance=Distance[distance])

    qdrant.recreate_collection(collection_name=index_name, vectors_config=vector_params)


def connect_vectorstore(vector_store, **params):
    """
    Attach a backend client/index handle to an existing vector store object.

    Parameters:
        vector_store: The vector store to connect. Only `PineconeVectorStore` and
            `QdrantVectorStore` instances are handled; any other object is left untouched.
        **params: Connection settings. Pinecone reads `index_name`; Qdrant reads
            `url` and `api_key`.

    Returns:
        None
    """
    if isinstance(vector_store, PineconeVectorStore):
        vector_store.pinecone_index = pinecone.Index(params['index_name'])
        return

    if isinstance(vector_store, QdrantVectorStore):
        vector_store.client = QdrantClient(url=params['url'], api_key=params['api_key'])
    # TODO: Add more elif conditions for other vector stores as needed


def update_vector_store_index(vector_store_index: VectorStoreIndex, documents: Sequence[Document]):
"""
Initializes the vector database for the first time from given documents.
Update the vector store index with new documents.

Parameters:
documents (Sequence[Document]): List of documents to initialize the vector store with.
vectore_store_type (str): Type of vector store to use ('qdrant', 'pinecone', etc.).
vector_store_index: An instance of AutoVectorStoreIndex or any compatible vector store.
documents (Sequence[Document]): List of documents to update.

Returns:
None
"""
logger.info('Initializing vector store')
for document in documents:
delete_documents_by_id(vector_store_index, [document.id_])
vector_store_index.insert(document)


# Create a new index and connect to it
vector_store = AutoVectorStore.from_defaults(
vector_store_type=vectore_store_type, collection_name=DEFAULT_INDEX_NAME)
vector_store.initialize_vectorindex()
vector_store.connect_vectorstore()
def overwrite_vectorindex(vector_store, documents: Sequence[Document]):
"""
Overwrite the vector store index with new documents.

logger.info('Updating vector store with documents')
Parameters:
vector_store: An instance of AutoVectorStore or any compatible vector store.
documents (Sequence[Document]): List of documents to overwrite.

# Update the index with the documents
vector_store.overwrite_vectorindex(documents)
Returns:
None
"""
# Create storage context
storage_context = StorageContext.from_defaults(vector_store=vector_store)

logger.info('Vector database successfully initialized.')
# Create index, which will insert documents/vectors to vector store
_ = VectorStoreIndex.from_documents(documents, storage_context=storage_context)


def update_database(documents: Sequence[Document], vectore_store_type: str) -> None:
def delete_documents_by_id(vector_store_index: VectorStoreIndex, document_ids: Sequence[str]):
"""
Update the vector database to synchronize it with the provided list of documents.

This function performs the following actions:
1. Updates or adds new documents in the vector database that match the input list.
2. Removes any documents from the vector database that are not present in the input list.
Delete documents from vector store by their ids.

Parameters:
documents (Sequence[Document]): Complete set of documents that should exist in the vector database after the update.
vectore_store_type (str): Specifies the type of vector store to use (e.g., 'qdrant', 'pinecone'). Defaults to DEFAULT_VECTORE_STORE_TYPE.
vector_store_index: An instance of AutoVectorStoreIndex or any compatible vector store.
document_ids (Sequence[str]): List of document ids to delete.

Returns:
None

Note:
Ensure that the 'documents' list includes all documents that should remain in the database, as any missing items will be deleted.
"""
logger.info('Updating vector store')
# Check if there are any document IDs to delete.
if not document_ids:
return

# Proceed with deletion.
for document_id in document_ids:
vector_store_index.delete_ref_doc(document_id, delete_from_docstore=True)


# TODO: refactor and update.
# def initialize_database(
# documents: Sequence[Document], vector_store_class_name: str, **vector_store_params) -> None:
# logger.info('Initializing vector store')

# vector_store = AutoVectorStore.from_defaults(vector_store_class_name, **vector_store_params)

# if vector_store_class_name == 'PineconeVectorStore':
# initialize_pinecone_index(vector_store, **vector_store_params)
# elif vector_store_class_name == 'QdrantVectorStore':
# initialize_qdrant_index(vector_store, **vector_store_params)
# # TODO: Add more elif conditions for other vector stores as needed

# connect_vectorstore(vector_store, **vector_store_params)

# logger.info('Updating vector store with documents')

# update_vector_store_index(vector_store, documents)

# logger.info('Vector database successfully initialized.')

# # TODO: refactor and update.
# def update_database(documents: Sequence[Document], vectore_store_type: str) -> None:
# """
# Update the vector database to synchronize it with the provided list of documents.

# This function performs the following actions:
# 1. Updates or adds new documents in the vector database that match the input list.
# 2. Removes any documents from the vector database that are not present in the input list.

# Parameters:
# documents (Sequence[Document]): Complete set of documents that should exist in the vector database after the update.
# vectore_store_type (str): Specifies the type of vector store to use (e.g., 'qdrant', 'pinecone'). Defaults to DEFAULT_VECTORE_STORE_TYPE.

# Returns:
# None

# Note:
# Ensure that the 'documents' list includes all documents that should remain in the database, as any missing items will be deleted.
# """
# logger.info('Updating vector store')

# Get changed document ids using the hash of the documents available in the vector store index item metadata
vector_store = AutoVectorStore.from_defaults(
vector_store_type=vectore_store_type, index_name=DEFAULT_INDEX_NAME)
changed_documents, deleted_document_ids = check_for_changes(documents, vector_store)
# # Get changed document ids using the hash of the documents available in the vector store index item metadata
# vector_store = AutoVectorStore.from_defaults(
# vector_store_type=vectore_store_type, index_name=DEFAULT_INDEX_NAME)
# changed_documents, deleted_document_ids = check_for_changes(documents, vector_store)

# Update the index with the changed documents
vector_store.update_vectorindex(changed_documents)
vector_store.delete_documents_by_id(deleted_document_ids)
# # Update the index with the changed documents
# vector_store.update_vectorindex(changed_documents)
# vector_store.delete_documents_by_id(deleted_document_ids)

logger.info('Vector database successfully updated.')
# logger.info('Vector database successfully updated.')
5 changes: 0 additions & 5 deletions autollm/vectorstores/__init__.py

This file was deleted.

Loading