Skip to content
This repository has been archived by the owner on Jan 5, 2025. It is now read-only.

Metadata will now show in chat, with more info. #241

Merged
merged 5 commits into from
Feb 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,5 @@ dj_backend_server.code-workspace
.aiderignore
dj_backend_server/.vscode/settings.json

dj_backend_server/a.py
dj_backend_server/1.pdf
3 changes: 2 additions & 1 deletion dj_backend_server/.vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter"
},
"python.formatting.provider": "none"
"python.formatting.provider": "black",
"editor.formatOnSave": false
}
3 changes: 3 additions & 0 deletions dj_backend_server/CHANGELOG.MD
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
2.22.2024
- We've recently enhanced our chat interface to display metadata values, such as the data source and web links, from our vector database when available. However, it's important to note that, as of now, there is no option to toggle this feature on or off. This means that whenever this metadata is available for newer database entries, it will be automatically displayed. Please be aware that older database records might not include this information due to the feature's recent implementation.

2.20.2024
- Implemented functionality to delete a chatbot namespace from the vector database, along with all records associated with that chatbot, upon chatbot deletion.
- The Directory Data Loader must be updated to include filename metadata to enable filtering. PR#138
Expand Down
38 changes: 22 additions & 16 deletions dj_backend_server/api/data_sources/pdf_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,26 +126,31 @@ def pdf_handler(

@csrf_exempt
def process_pdf_with_pypdfium(file_path, directory_path):
    """Extract text from a PDF with pypdfium2 and write it to a sibling .txt file.

    Each page's text is prefixed with its 1-based page number so downstream
    consumers can attribute extracted content to a specific page. The output
    file shares the PDF's basename, with a .txt extension.

    Args:
        file_path: Path to the source PDF file.
        directory_path: Directory context for the operation (only logged here).
    """
    pdf_document = PdfDocument(file_path)
    text_pages_with_numbers = []

    try:
        for page_index in range(len(pdf_document)):
            page = pdf_document.get_page(page_index)
            text_page = page.get_textpage()  # get a text page handle for this page
            text = text_page.get_text_range()  # extract text from the text page
            text_pages_with_numbers.append(
                (page_index + 1, text)
            )  # Store page number and text
            text_page.close()  # close the text page handle

        # Combine texts from all pages, prepending each with its page number
        combined_text = "\n".join(
            f"Page {num}: {text}" for num, text in text_pages_with_numbers
        )
        txt_file_path = os.path.splitext(file_path)[0] + ".txt"
        logging.debug(
            f"Debug: Writing text with page numbers to {txt_file_path}, directory_path: {directory_path}"
        )

        with open(txt_file_path, "w") as f:
            f.write(combined_text)
    finally:
        # Release the native PDF handle even if extraction or writing fails.
        pdf_document.close()


@csrf_exempt
Expand All @@ -159,6 +164,7 @@ def process_pdf(FilePath, directory_path):
resturl = "http://www.ocrwebservice.com/restservices/processDocument"

RequestUrl = f"{resturl}?pagerange={pagerange}&language={language}&outputformat={outputformat}&gettext={gettext}"
logging.debug(f"Debug: RequestUrl: {RequestUrl}")

try:
with open(FilePath, "rb") as image_file:
Expand Down Expand Up @@ -216,7 +222,7 @@ def process_pdf(FilePath, directory_path):
f"\nThe text: {{text}}. "
)

# print (f"Debug: initial_prompt: {initial_prompt}")
logging.debug(f"Debug: initial_prompt: {initial_prompt}")

# Call LLM and write the result into a new text file
process_text_with_llm(txt_file, mode, initial_prompt)
Expand Down Expand Up @@ -291,13 +297,8 @@ def txt_to_vectordb(
)

docs = text_splitter.split_documents(raw_docs)

logging.debug("external files docs -->", docs)

if not docs:
print("No documents were processed successfully.")
return

embeddings = get_embeddings()

logging.debug(
Expand All @@ -311,6 +312,11 @@ def txt_to_vectordb(
"bot_id": str(pdf_data_source.chatbot.id),
"last_update": pdf_data_source.updated_at.strftime("%Y-%m-%d %H:%M:%S"),
"type": "document",
"doc_type": (
pdf_data_source.files_info[0]["original_name"].split(".")[-1]
if pdf_data_source.files_info
else "unknown"
),
"page": "1", # @TODO to extract the page number.
"folder": pdf_data_source.folder_name,
"original_filename": (
Expand All @@ -321,7 +327,7 @@ def txt_to_vectordb(
},
)
logging.debug(
f"Vector store initialized successfully for namespace: {namespace}."
f"Vector store initialized successfully for metadata: {metadata}."
)

logging.debug(f"Folder need or not to delete. {delete_folder_flag}")
Expand Down
14 changes: 6 additions & 8 deletions dj_backend_server/api/utils/make_chain.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,14 +145,11 @@ def process_text_with_llm(txt_file_path: str, mode, initial_prompt: str):

# Send the formatted prompt to LLM and get the result
llm = get_llm()
result = llm(prompt=initial_prompt.format(text=text), temperature=0)

# Check if result is a string
if isinstance(result, str):
response = result
elif isinstance(result, dict):
# Extract only the response from the result
response = result["choices"][0]["message"]["content"]
result = llm.invoke(input=initial_prompt.format(text=text), temperature=0)

# Extract the response from the result
if hasattr(result, "content"):
response = result.content
else:
print(
f"Error: LLM result is not a dictionary or a string. It is a {type(result)} with value {result}"
Expand All @@ -166,6 +163,7 @@ def process_text_with_llm(txt_file_path: str, mode, initial_prompt: str):
print(f"Write with value {txt_file_path}")
else:
# Write the response into a new text file
result_file_path = txt_file_path.replace(".txt", "_processed.txt")
result_file_path = txt_file_path.replace(".txt", ".txt")
with open(result_file_path, "w") as result_file:
result_file.write(response)
Expand Down
64 changes: 43 additions & 21 deletions dj_backend_server/api/views/views_chat.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,33 +43,31 @@ def chat(request):
"""
try:

logger.debug("Received chat request from view_messages.py - /api/chat/")
body = json.loads(request.body.decode("utf-8"))
question = body.get("question")
namespace = body.get("namespace")
mode = body.get("mode")
initial_prompt = body.get("initial_prompt")
token = body.get("token")
session_id = body.get("session_id")
metadata = body.get("metadata", {})

logger.debug(f"Request body parsed: {body}")
logger.debug(f"Question: {question}")
bot = get_object_or_404(Chatbot, token=token)
logger.debug(f"Chatbot found: {bot.name}")
if not question:
return JsonResponse({"error": "No question in the request"}, status=400)
sanitized_question = question.strip().replace("\n", " ")
logger.debug(f"Sanitized question: {sanitized_question}")
vector_store = get_vector_store(StoreOptions(namespace=namespace))
logger.debug(f"Vector store obtained")
response_text = get_completion_response(

response_text, metadata = get_completion_response(
vector_store=vector_store,
initial_prompt=initial_prompt,
mode=mode,
sanitized_question=sanitized_question,
session_id=session_id,
metadata=metadata,
)
logger.debug(f"Response text: {response_text}")

if isinstance(response_text, dict) and "text" in response_text:
ChatHistory.objects.bulk_create(
[
Expand All @@ -90,9 +88,9 @@ def chat(request):
]
)
logger.debug(
f"Response after creating ChatHistory: {json.dumps(response_text, indent=2)}"
f"Response after creating ChatHistory: {json.dumps(response_text, indent=2)}, metadata: {metadata}"
)
return JsonResponse({"text": response_text})
return JsonResponse({"text": response_text, "metadata": metadata})

elif isinstance(response_text, str):
ChatHistory.objects.bulk_create(
Expand All @@ -114,9 +112,9 @@ def chat(request):
]
)
logger.debug(
f"Response after creating ChatHistory 2: {json.dumps(response_text, indent=2)}"
f"Response after creating ChatHistory 2: {json.dumps(response_text, indent=2)}, metadata: {metadata}"
)
return JsonResponse({"text": response_text})
return JsonResponse({"text": response_text, "metadata": metadata})

else:
return JsonResponse({"error": "Unexpected response from API"}, status=500)
Expand All @@ -132,7 +130,7 @@ def chat(request):


def get_completion_response(
vector_store, mode, initial_prompt, sanitized_question, session_id
vector_store, mode, initial_prompt, sanitized_question, session_id, metadata
):
"""
This function generates a response based on a given question. It uses either the 'retrieval_qa' or 'conversation_retrieval'
Expand All @@ -151,15 +149,18 @@ def get_completion_response(
is a string, it is returned after removing markdown code block formatting.
"""

logger.debug(f"Entering get_completion_response function")
logger.debug(
f"Mode: {mode}, Initial Prompt: {initial_prompt}, Sanitized Question: {sanitized_question}, Session ID: {session_id}"
)
# logger.debug(f"Entering get_completion_response function")
# logger.debug(
# f"Mode: {mode}, Initial Prompt: {initial_prompt}, Sanitized Question: {sanitized_question}, Session ID: {session_id}"
# )
chain_type = os.getenv("CHAIN_TYPE", "conversation_retrieval")
chain: QAWithSourcesChain
if chain_type == "retrieval_qa":
chain = getRetrievalQAWithSourcesChain(vector_store, mode, initial_prompt)
response = chain({"question": sanitized_question}, return_only_outputs=True)
response = chain.invoke(
{"question": sanitized_question, "metadata": metadata},
return_only_outputs=True,
)
response_text = response["answer"]
logger.debug(f"RetrievalQA response: {response_text}")
elif chain_type == "conversation_retrieval":
Expand All @@ -171,13 +172,33 @@ def get_completion_response(
logger.debug(f"Formatted Chat_history {chat_history}")

response = chain.invoke(
{"question": sanitized_question, "chat_history": chat_history},
{
"question": sanitized_question,
"chat_history": chat_history,
"metadata": metadata,
},
)
response_text = response.get("answer")
# Assuming 'response' is the JSON object you've provided
source_documents = response["source_documents"]

# Initialize an empty list to hold metadata from all documents
all_metadata = []

# Iterate through each document in the source documents
for document in source_documents:
# Correctly access the metadata attribute or method of the Document object
# Assuming the Document object has a 'metadata' attribute
metadata = document.metadata

# Add the metadata dictionary to the list
all_metadata.append(metadata)

response_text = response.get("answer", "")

try:
# Attempt to parse the response_text as JSON
response_text = json.loads(response_text)
logger.debug(f"Response text after JSON parsing: {response_text}")

except json.JSONDecodeError:
# If response_text is not a JSON string, leave it as is
pass
Expand All @@ -194,4 +215,5 @@ def get_completion_response(
response_text.replace("```", "").replace("markdown\n", "").strip()
)
logger.debug(f"Response text after markdown removal: {response_text}")
return response_text
# print(f"metadata {metadata}")
return response_text, all_metadata
64 changes: 51 additions & 13 deletions dj_backend_server/api/views/views_message.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,13 +182,9 @@ def send_chat(request):
) # {'from': 'user', 'type': 'text', 'content': 'input text from chat'}
# Validate the request data
content = data.get("content")
history = data.get("history")
logger.debug(f"Content: {content}")
logger.debug(
f"History: {history}"
) # history is a list of chat history - None????
content_type = data.get("type")
metadata = data.get("metadata") or {}
# history = data.get("history")
# logger.debug(f"Content: {content}")
# logger.debug(f"History: {history}")

session_id = get_session_id(request=request, bot_id=bot.id)
history = ChatHistory.objects.filter(session_id=session_id)
Expand Down Expand Up @@ -219,11 +215,9 @@ def send_chat(request):
"history": history_entries,
"token": bot_token,
"session_id": session_id,
"metadata": metadata,
},
timeout=200,
)
logger.debug(f"External API response: {response.text} and {response}")

"""
This block will first check if the response content is not empty. If it is empty,
Expand All @@ -242,7 +236,7 @@ def send_chat(request):
else:
try:
response_json = response.json()
logger.debug(f"Response JSON: {response_json}")
logger.debug(f"External API response 2")
except json.JSONDecodeError:
logger.error("JSONDecodeError occurred")
return JsonResponse(
Expand All @@ -255,18 +249,21 @@ def send_chat(request):
)

bot_response = ChatbotResponse(response.json())
# context = {'APP_URL': settings.APP_URL, session_id: session_id}

feedback_form_html = render_to_string(
"widgets/feedback.html",
{"APP_URL": settings.APP_URL, "session_id": session_id},
)
print(f"Response in JSON {session_id}")

html_compose = (
metadata_html_append(response_json, session_id) + feedback_form_html
)
return JsonResponse(
{
"type": "text",
"response": {
"text": bot_response.get_bot_reply(),
"html": feedback_form_html,
"html": html_compose,
"session_id": session_id,
},
}
Expand Down Expand Up @@ -313,3 +310,44 @@ def handle_feedback(request):
return JsonResponse({"error": "Chat history not found"}, status=404)
except Exception as e:
return JsonResponse({"error": "An error occurred"}, status=500)


def metadata_html_append(response_json, session_id):
    """Render the metadata widget HTML for a chat response.

    Collects unique source references from the response metadata — document
    filenames for "document" entries, URLs for "website" entries — and renders
    them through the ``widgets/metadata.html`` template. Each filename/link is
    shown only once even if it appears in several metadata entries.

    Args:
        response_json: Parsed API response; may contain a "metadata" list of
            dicts with "type", "source", and either "original_filename" or
            "link" keys.
        session_id: Current chat session identifier, forwarded to the template.

    Returns:
        The rendered HTML string for the metadata widget.
    """
    # Default type when no metadata entries are present; otherwise the last
    # entry's type wins (mirrors the template's single "type" parameter).
    entry_type = "document"
    seen_keys = set()  # dedupes filenames/links so each source renders once
    metadata_items = []

    # Single pass: each entry contributes only its own filename or link,
    # according to its own type.
    for entry in response_json.get("metadata", []):
        entry_type = entry.get("type")
        if entry_type == "document":
            original_filename = entry.get("original_filename")
            if original_filename not in seen_keys:
                metadata_items.append(
                    {
                        "source": entry.get("source"),
                        "original_filename": original_filename,
                    }
                )
                seen_keys.add(original_filename)
        elif entry_type == "website":
            link = entry.get("link")
            if link not in seen_keys:
                metadata_items.append({"source": entry.get("source"), "link": link})
                seen_keys.add(link)

    return render_to_string(
        "widgets/metadata.html",
        {
            "APP_URL": settings.APP_URL,
            "session_id": session_id,
            "metadata_items": metadata_items,
            "type": entry_type,
        },
    )
2 changes: 1 addition & 1 deletion dj_backend_server/web/services/chat_history_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,6 @@ def get_chat_history_for_retrieval_chain(
memory.save_context({"input": user_query}, {"output": entry.message})
user_query = None

logger.debug(f"Memory PRINT: {memory}")
# logger.debug(f"Memory PRINT: {memory}")
# chat_history = memory.load_memory_variables({})
return chat_history
Loading
Loading