This repository has been archived by the owner on Jan 5, 2025. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 640
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #240 from lvalics/main
Metadata, delete chatbot namespace and other changes.
- Loading branch information
Showing
17 changed files
with
897 additions
and
401 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
{ | ||
"[python]": { | ||
"editor.defaultFormatter": "ms-python.python" | ||
"editor.defaultFormatter": "ms-python.black-formatter" | ||
}, | ||
"python.formatting.provider": "none" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,30 +1,57 @@ | ||
# views.py | ||
import logging.config | ||
from django.http import JsonResponse | ||
from django.views.decorators.csrf import csrf_exempt | ||
from langchain.text_splitter import RecursiveCharacterTextSplitter | ||
from api.utils import get_embeddings | ||
from langchain_community.document_loaders import GitLoader | ||
from django.conf import settings | ||
from api.utils import init_vector_store | ||
from api.utils import get_embeddings | ||
from api.interfaces import StoreOptions | ||
from langchain_community.document_loaders import GitLoader | ||
from langchain.text_splitter import RecursiveCharacterTextSplitter | ||
from web.models.codebase_data_sources import CodebaseDataSource | ||
from typing import Optional, Dict, Any, List | ||
|
||
logging.config.dictConfig(settings.LOGGING) | ||
logger = logging.getLogger(__name__) | ||
|
||
|
||
# https://python.langchain.com/docs/integrations/document_loaders/git | ||
# https://python.langchain.com/docs/integrations/document_loaders/git
@csrf_exempt
def codebase_handler(repo_path: str, namespace: str, metadata: Dict[str, Any]):
    """Clone a git repository, split its files into chunks, embed them, and
    index the chunks into the vector store under *namespace*.

    Args:
        repo_path: Clone URL of the git repository to ingest.
        namespace: Vector-store namespace; also names the local checkout
            folder under ``website_data_sources/``.
        metadata: Extra metadata supplied by the caller.
            NOTE(review): currently unused in the body — confirm whether it
            should be merged into the per-chunk metadata dict below.
    """
    try:
        folder_path = f"website_data_sources/{namespace}"
        # NOTE(review): branch is hard-coded; repositories whose default
        # branch is "main" will fail to load — consider parameterizing.
        loader = GitLoader(repo_path=folder_path, clone_url=repo_path, branch="master")

        raw_docs = loader.load()
        logger.debug("Loaded documents")

        # Some loaders can yield documents with metadata=None; normalize to
        # an empty dict so downstream metadata merging cannot crash.
        for doc in raw_docs:
            if getattr(doc, "metadata", None) is None:
                doc.metadata = {}

        text_splitter = RecursiveCharacterTextSplitter(
            separators=["\n"], chunk_size=1000, chunk_overlap=200, length_function=len
        )
        docs = text_splitter.split_documents(raw_docs)

        embeddings = get_embeddings()

        init_vector_store(
            docs,
            embeddings,
            options=StoreOptions(namespace),
            metadata={
                # TODO(review): these read fields on the CodebaseDataSource
                # *class*, not an instance — at runtime Django returns
                # DeferredAttribute descriptors, not values. An actual
                # data-source row should be looked up and used here.
                "bot_id": str(CodebaseDataSource.chatbot.id),
                # Fix: "repository" previously duplicated the bot id
                # (copy-paste bug); record the repository clone URL instead.
                "repository": repo_path,
                "last_update": CodebaseDataSource.ingested_at.strftime(
                    "%Y-%m-%d %H:%M:%S"
                ),
                "type": "codebase",
            },
        )

        logger.info("Indexed documents. all done!")
    except Exception as e:
        # Log the full traceback via the configured module logger instead of
        # silently printing to stdout and discarding it.
        logger.exception("Codebase ingestion failed: %s", e)
Oops, something went wrong.