This repository has been archived by the owner on Jan 5, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 640
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #240 from lvalics/main
Metadata, delete chatbot namespace and other changes.
- Loading branch information
Showing
17 changed files
with
897 additions
and
401 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
{ | ||
"[python]": { | ||
"editor.defaultFormatter": "ms-python.python" | ||
"editor.defaultFormatter": "ms-python.black-formatter" | ||
}, | ||
"python.formatting.provider": "none" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,30 +1,57 @@ | ||
# views.py | ||
import logging.config | ||
from django.http import JsonResponse | ||
from django.views.decorators.csrf import csrf_exempt | ||
from langchain.text_splitter import RecursiveCharacterTextSplitter | ||
from api.utils import get_embeddings | ||
from langchain_community.document_loaders import GitLoader | ||
from django.conf import settings | ||
from api.utils import init_vector_store | ||
from api.utils import get_embeddings | ||
from api.interfaces import StoreOptions | ||
from langchain_community.document_loaders import GitLoader | ||
from langchain.text_splitter import RecursiveCharacterTextSplitter | ||
from web.models.codebase_data_sources import CodebaseDataSource | ||
from typing import Optional, Dict, Any, List | ||
|
||
logging.config.dictConfig(settings.LOGGING) | ||
logger = logging.getLogger(__name__) | ||
|
||
|
||
# https://python.langchain.com/docs/integrations/document_loaders/git | ||
# https://python.langchain.com/docs/integrations/document_loaders/git
@csrf_exempt
def codebase_handler(repo_path: str, namespace: str, metadata: Dict[str, Any]):
    """Clone a git repository, split its files into chunks, embed them, and
    index the chunks into the vector store under *namespace*.

    Args:
        repo_path: Clone URL of the git repository to ingest.
        namespace: Vector-store namespace; also names the local checkout
            folder under ``website_data_sources/``.
        metadata: Extra metadata supplied by the caller.
            NOTE(review): currently unused in the body — confirm whether it
            should be merged into the per-chunk metadata dict below.
    """
    try:
        folder_path = f"website_data_sources/{namespace}"
        # NOTE(review): branch is hard-coded; repositories whose default
        # branch is "main" will fail to load — consider parameterizing.
        loader = GitLoader(repo_path=folder_path, clone_url=repo_path, branch="master")

        raw_docs = loader.load()
        logger.debug("Loaded documents")

        # Some loaders can yield documents with metadata=None; normalize to
        # an empty dict so downstream metadata merging cannot crash.
        for doc in raw_docs:
            if getattr(doc, "metadata", None) is None:
                doc.metadata = {}

        text_splitter = RecursiveCharacterTextSplitter(
            separators=["\n"], chunk_size=1000, chunk_overlap=200, length_function=len
        )
        docs = text_splitter.split_documents(raw_docs)

        embeddings = get_embeddings()

        init_vector_store(
            docs,
            embeddings,
            options=StoreOptions(namespace),
            metadata={
                # TODO(review): these read fields on the CodebaseDataSource
                # *class*, not an instance — at runtime Django returns
                # DeferredAttribute descriptors, not values. An actual
                # data-source row should be looked up and used here.
                "bot_id": str(CodebaseDataSource.chatbot.id),
                # Fix: "repository" previously duplicated the bot id
                # (copy-paste bug); record the repository clone URL instead.
                "repository": repo_path,
                "last_update": CodebaseDataSource.ingested_at.strftime(
                    "%Y-%m-%d %H:%M:%S"
                ),
                "type": "codebase",
            },
        )

        logger.info("Indexed documents. all done!")
    except Exception as e:
        # Log the full traceback via the configured module logger instead of
        # silently printing to stdout and discarding it.
        logger.exception("Codebase ingestion failed: %s", e)
Oops, something went wrong.