-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Updated File Fix + Updated Env + Add a Few Tests
@wesslen brought up a great point about what happens if you want to update a document. Previously, if I uploaded an updated version of a document, the previous chunks would still exist alongside the new ones. To address this, I’ve updated the logic to identify when someone uploads the same document based on matching criteria (date, file type, and meeting type). The system now checks for matching chunks, lists how many exist, deletes them, and uploads the new ones. I’ve also updated the .env_example file to include AZURE_STORAGE_CONTAINER, which people need to configure. I forgot to include that earlier—oops! Additionally, I’ve added a few tests related to this functionality: metadata_deletion_test.py: Allows someone to test deleting chunks based on a specific date. metadata_by_date.py: Lets users retrieve all chunks associated with a specific date.
- Loading branch information
1 parent
74562ea
commit d4198c5
Showing
7 changed files
with
556 additions
and
57 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
# Find every chunk stored for a specific meeting date and export its metadata to a Word document. | ||
|
||
import os | ||
import weaviate | ||
from dotenv import load_dotenv | ||
from docx import Document | ||
|
||
# Read key/value pairs from the local .env file into the process environment.
load_dotenv()

# Weaviate connection settings, taken from the environment.
WEAVIATE_URL = os.environ.get("WEAVIATE_URL")
WEAVIATE_API_KEY = os.environ.get("WEAVIATE_API_KEY")

# Module-level client, authenticated with the API key.
client = weaviate.Client(
    auth_client_secret=weaviate.AuthApiKey(api_key=WEAVIATE_API_KEY),
    url=WEAVIATE_URL,
)
|
||
def _field_or_na(record, key):
    """Return ``record[key]``, or ``'N/A'`` when the key is absent or its value is None."""
    value = record.get(key)
    # Compare against None explicitly so falsy-but-valid values (chunk_index 0) survive.
    return "N/A" if value is None else value


def fetch_documents_by_date_and_export_to_word(date):
    """
    Fetch MeetingDocument chunks for one meeting date and export them to Word.

    Runs a raw GraphQL ``Get`` query through the module-level Weaviate
    ``client`` for objects whose ``meeting_date`` equals ``date``, prints a
    summary of each chunk, and writes the same metadata (with the full chunk
    content) to ``Weaviate_Metadata_List_<date>.docx`` in the current working
    directory. Failures are reported on stdout rather than raised.

    NOTE(review): the query sets no ``limit``, so the server's default result
    cap applies -- confirm it is large enough to return every chunk for a date.

    Args:
        date (str): The date to filter by (YYYY-MM-DD format).

    Returns:
        None
    """
    import json  # local import: only needed to quote the filter value safely

    # json.dumps yields a double-quoted, escaped literal, so quotes or
    # backslashes in `date` cannot break out of the GraphQL string.
    date_literal = json.dumps(str(date))
    query = f"""
    {{
      Get {{
        MeetingDocument(where: {{
          path: ["meeting_date"],
          operator: Equal,
          valueString: {date_literal}
        }}) {{
          content
          meeting_date
          meeting_type
          file_type
          chunk_index
          source_document
        }}
      }}
    }}
    """

    # Label/property pairs shared by the console summary and the Word export.
    labelled_fields = (
        ("Meeting Date", "meeting_date"),
        ("Meeting Type", "meeting_type"),
        ("File Type", "file_type"),
        ("Source Document", "source_document"),
    )

    try:
        print(f"Querying Weaviate for documents on {date}...")
        response = client.query.raw(query)

        # A GraphQL-level failure arrives as an "errors" payload, not an
        # exception; report it instead of claiming there are no documents.
        errors = response.get("errors")
        if errors:
            print(f"Error querying Weaviate: {errors}")
            return

        # Any level may be null in the response, so fall back at each step.
        data = response.get("data") or {}
        documents = (data.get("Get") or {}).get("MeetingDocument") or []
    except Exception as e:
        # Broad on purpose: this is a script-level boundary that reports and stops.
        print(f"Error querying Weaviate: {e}")
        return

    if not documents:
        print(f"No documents found for the date: {date}.")
        return

    print(f"\nRetrieved Documents for {date}:")
    for chunk in documents:
        print(f"- Chunk Index: {_field_or_na(chunk, 'chunk_index')}")
        for label, key in labelled_fields:
            print(f"  {label}: {_field_or_na(chunk, key)}")
        # str() keeps the slice safe even if content is not a string.
        preview = str(_field_or_na(chunk, "content"))[:100]
        print(f"  Content Preview: {preview}...")
        print()

    # Export metadata to Word
    print(f"Exporting metadata for {date} to Word document...")
    word_file_path = f"Weaviate_Metadata_List_{date}.docx"
    try:
        report = Document()
        report.add_heading(f'Document Metadata for {date}', level=1)

        for chunk in documents:
            report.add_heading(f"Chunk Index: {_field_or_na(chunk, 'chunk_index')}", level=2)
            for label, key in labelled_fields:
                report.add_paragraph(f"{label}: {_field_or_na(chunk, key)}")
            # The export carries the full chunk content, not a truncated preview.
            report.add_paragraph(f"Content Preview: {_field_or_na(chunk, 'content')}")
            report.add_paragraph("\n")

        report.save(word_file_path)
    except Exception as e:
        # Kept separate from the query handler so export failures are labelled correctly.
        print(f"Error exporting metadata to Word: {e}")
        return

    print(f"Metadata exported to {word_file_path} successfully.")
|
||
if __name__ == "__main__":
    # Meeting date to look up, written as YYYY-MM-DD.
    fetch_documents_by_date_and_export_to_word("2000-10-27")
Oops, something went wrong.