diff --git a/Preprocessing/.env.example b/Preprocessing/.env.example index 39a39ba5..85a4136e 100644 --- a/Preprocessing/.env.example +++ b/Preprocessing/.env.example @@ -11,6 +11,7 @@ ASSEMBLY_AI_KEY = #Azure Storage Container Connection AZURE_STORAGE_CONNECTION_STRING= +AZURE_STORAGE_CONTAINER = # Pathing Setup PYTHONPATH= \ No newline at end of file diff --git a/Preprocessing/App/main.py b/Preprocessing/App/main.py index e6546dfa..d6f339f8 100644 --- a/Preprocessing/App/main.py +++ b/Preprocessing/App/main.py @@ -18,7 +18,11 @@ from preprocessing_pipeline.pdf_conversion import convert_pdf_to_text from preprocessing_pipeline.audio_transcription import transcribe_audio from preprocessing_pipeline.text_cleaning import clean_text -from preprocessing_pipeline.chunking_vector_embedding import tokenize_and_embed_text +from preprocessing_pipeline.chunking_vector_embedding import ( + tokenize_and_embed_text, + fetch_matching_chunks, + delete_matching_chunks +) from utils.azure_blob_utils import ( upload_to_azure, download_from_azure, @@ -209,11 +213,21 @@ def upload_files_page(): upload_to_azure("clean", clean_file_name, cleaned_text) st.write(f"Uploaded cleaned text to `clean/` folder: {clean_file_name}") - # Display cleaned text - st.text_area("Cleaned Text:", cleaned_text, height=200) - st.download_button("Download Cleaned Text", data=cleaned_text, file_name=clean_file_name) + # Stage 4: Check and Delete Existing Embeddings + with st.spinner("Checking for existing embeddings..."): + matching_chunks = fetch_matching_chunks( + str(metadata["meeting_date"]), + metadata["meeting_type"], + metadata["file_type"], + clean_file_name + ) + if matching_chunks: + st.write(f"Found {len(matching_chunks)} existing chunks. Deleting...") + delete_matching_chunks(matching_chunks) + else: + st.write("No existing chunks found.") - # Stage 4: Chunk and Embed into Weaviate + # Stage 5: Chunk and Embed into Weaviate with st.spinner("Chunking and embedding text into Weaviate..."): tokenize_and_embed_text(clean_file_name, metadata) st.success("Document processed and embedded successfully!") @@ -242,33 +256,23 @@ def group_blobs_by_date(blobs): grouped = {} for blob in blobs: try: - # Extract the file name without folder prefix (e.g., "raw/") - file_name = blob.split("/")[-1] # Get only the file name part - - # Extract the date from the file name (assuming format: YYYY_MM_DD) - parts = file_name.split("_") # Split into ['2023', '12', '12', 'BOC', 'Agenda', ...] + file_name = blob.split("/")[-1] # Extract the file name + parts = file_name.split("_") # Split into parts: ['2023', '12', '12', 'BOC', 'Agenda', ...] 
date_str = "_".join(parts[:3]) # Join the first three parts: '2023_12_12' - - # Convert the date string to a readable format readable_date = datetime.strptime(date_str, "%Y_%m_%d").strftime("%B %d, %Y") - - # Group by the readable date if readable_date not in grouped: grouped[readable_date] = [] grouped[readable_date].append(blob) except (ValueError, IndexError): - # Handle files with unexpected formats if "Unknown Date" not in grouped: grouped["Unknown Date"] = [] grouped["Unknown Date"].append(blob) return grouped - # Group blobs by date raw_grouped = group_blobs_by_date(raw_blobs) dirty_grouped = group_blobs_by_date(dirty_blobs) clean_grouped = group_blobs_by_date(clean_blobs) - # Function to display blobs within a group def display_grouped_blobs(grouped_blobs, category): if grouped_blobs: st.subheader(f"{category.capitalize()} Documents") @@ -282,7 +286,6 @@ def display_grouped_blobs(grouped_blobs, category): else: st.info(f"No documents found in the {category} category.") - # Display grouped blobs display_grouped_blobs(raw_grouped, "raw") display_grouped_blobs(dirty_grouped, "dirty") display_grouped_blobs(clean_grouped, "clean") @@ -308,4 +311,4 @@ def display_grouped_blobs(grouped_blobs, category): elif st.session_state.page == "upload": upload_files_page() elif st.session_state.page == "view": - view_documents_page() \ No newline at end of file + view_documents_page() diff --git a/Preprocessing/preprocessing_pipeline/chunking_vector_embedding.py b/Preprocessing/preprocessing_pipeline/chunking_vector_embedding.py index 6e88de4c..380d96cd 100644 --- a/Preprocessing/preprocessing_pipeline/chunking_vector_embedding.py +++ b/Preprocessing/preprocessing_pipeline/chunking_vector_embedding.py @@ -23,6 +23,56 @@ # Initialize tiktoken for OpenAI's embedding model tokenizer = tiktoken.encoding_for_model("text-embedding-ada-002") + +def fetch_matching_chunks(meeting_date, meeting_type, file_type, source_document): + """ + Fetch matching chunks from Weaviate based on metadata. + + Args: + meeting_date (str): Date of the meeting. + meeting_type (str): Type of the meeting (e.g., "Board of Commissioners"). + file_type (str): File type (e.g., "Minutes"). + source_document (str): Name of the source document. + + Returns: + list: A list of matching documents. + """ + query = f""" + {{ + Get {{ + MeetingDocument(where: {{ + operator: And, + operands: [ + {{ path: ["meeting_date"], operator: Equal, valueString: "{meeting_date}" }}, + {{ path: ["meeting_type"], operator: Equal, valueString: "{meeting_type}" }}, + {{ path: ["file_type"], operator: Equal, valueString: "{file_type}" }}, + {{ path: ["source_document"], operator: Equal, valueString: "{source_document}" }} + ] + }}) {{ + _additional {{ + id + }} + }} + }} + }} + """ + response = client.query.raw(query) + return response.get("data", {}).get("Get", {}).get("MeetingDocument", []) + + +def delete_matching_chunks(documents): + """ + Delete matching chunks from Weaviate. + + Args: + documents (list): List of documents with IDs to delete. + """ + for doc in documents: + doc_id = doc["_additional"]["id"] + client.data_object.delete(doc_id) + print(f"Deleted chunk ID: {doc_id}") + + def tokenize_and_embed_text(clean_file_name, metadata, max_chunk_size=250): """ Tokenizes, chunks, and embeds cleaned text into Weaviate. @@ -33,68 +83,46 @@ def tokenize_and_embed_text(clean_file_name, metadata, max_chunk_size=250): max_chunk_size (int): Maximum token size for each chunk. 
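+
+    Notes:
+        Before embedding, any chunks already stored for the same meeting_date,
+        meeting_type, file_type, and source_document are fetched and deleted,
+        so re-processing a cleaned file replaces its embeddings rather than
+        duplicating them. Chunk size is measured in tokens of the
+        text-embedding-ada-002 tokenizer, not in characters.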
""" try: - # Step 1: Download cleaned text from Azure + # Download cleaned text from Azure clean_text = download_from_azure("clean", clean_file_name) - print(f"Downloaded cleaned text from Azure for file: {clean_file_name}") - - # Step 2: Tokenize the text using tiktoken tokens = tokenizer.encode(clean_text) - - # Step 3: Chunk tokens into groups of max_chunk_size (default: 250 tokens per chunk) chunks = [ tokenizer.decode(tokens[i:i + max_chunk_size]) for i in range(0, len(tokens), max_chunk_size) ] - print(f"Tokenized and split text into {len(chunks)} chunks of {max_chunk_size} tokens each.") - # Extract metadata for embedding + # Metadata fields meeting_date = str(metadata["meeting_date"]) meeting_type = metadata["meeting_type"] file_type = metadata["file_type"] + source_document = clean_file_name - # Step 4: Check and delete existing embeddings in Weaviate (to prevent duplication) - query = f""" - {{ - Get {{ - MeetingDocument(where: {{ - path: ["meeting_date", "meeting_type", "file_type"], - operator: And, - valueString: "{meeting_date}" - }}) {{ - id - }} - }} - }} - """ - response = client.query.raw(query) - existing_documents = response.get("data", {}).get("Get", {}).get("MeetingDocument", []) - - for doc in existing_documents: - client.data_object.delete(doc["id"]) - print(f"Deleted {len(existing_documents)} existing embeddings for this file.") + # Check for existing embeddings + matching_chunks = fetch_matching_chunks(meeting_date, meeting_type, file_type, source_document) + if matching_chunks: + print(f"Found {len(matching_chunks)} existing chunks. Deleting...") + delete_matching_chunks(matching_chunks) + else: + print("No existing chunks found.") - # Step 5: Embed each chunk using OpenAI and store in Weaviate + # Embed and upload each chunk for i, chunk in enumerate(chunks): - # Generate embedding using OpenAI - response = openai_client.embeddings.create( - input=chunk, - model="text-embedding-ada-002" - ) - embedding = response.data[0].embedding # Correctly access embedding from the response object + response = openai_client.embeddings.create(input=chunk, model="text-embedding-ada-002") + embedding = response.data[0].embedding - # Upload chunk to Weaviate client.data_object.create( data_object={ "content": chunk, "meeting_date": meeting_date, "meeting_type": meeting_type, "file_type": file_type, - "chunk_index": i # Include chunk index for ordering + "chunk_index": i, + "source_document": source_document }, vector=embedding, class_name="MeetingDocument" ) - print(f"Uploaded chunk {i+1}/{len(chunks)} to Weaviate.") + print(f"Uploaded chunk {i + 1}/{len(chunks)} to Weaviate.") print("Successfully processed and embedded all chunks.") diff --git a/Preprocessing/tests/Weaviate_Metadata_List_2000-10-27.docx b/Preprocessing/tests/Weaviate_Metadata_List_2000-10-27.docx new file mode 100644 index 00000000..8d72bb6f Binary files /dev/null and b/Preprocessing/tests/Weaviate_Metadata_List_2000-10-27.docx differ diff --git a/Preprocessing/tests/metadata_by_date.py b/Preprocessing/tests/metadata_by_date.py new file mode 100644 index 00000000..ac0f2dab --- /dev/null +++ b/Preprocessing/tests/metadata_by_date.py @@ -0,0 +1,87 @@ +# This allows you to find all the chunks by a specific meeting date. 
+ +import os +import weaviate +from dotenv import load_dotenv +from docx import Document + +# Load environment variables from .env +load_dotenv() + +# Initialize Weaviate client +WEAVIATE_URL = os.getenv("WEAVIATE_URL") +WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY") +client = weaviate.Client( + url=WEAVIATE_URL, + auth_client_secret=weaviate.AuthApiKey(api_key=WEAVIATE_API_KEY) +) + +def fetch_documents_by_date_and_export_to_word(date): + """ + Fetch documents from Weaviate filtered by a specific date and export metadata, including source_document, to a Word document. + + Args: + date (str): The date to filter by (YYYY-MM-DD format). + """ + query = f""" + {{ + Get {{ + MeetingDocument(where: {{ + path: ["meeting_date"], + operator: Equal, + valueString: "{date}" + }}) {{ + content + meeting_date + meeting_type + file_type + chunk_index + source_document + }} + }} + }} + """ + try: + print(f"Querying Weaviate for documents on {date}...") + response = client.query.raw(query) + documents = response.get("data", {}).get("Get", {}).get("MeetingDocument", []) + + if not documents: + print(f"No documents found for the date: {date}.") + return + + print(f"\nRetrieved Documents for {date}:") + for doc in documents: + print(f"- Chunk Index: {doc.get('chunk_index', 'N/A')}") + print(f" Meeting Date: {doc.get('meeting_date', 'N/A')}") + print(f" Meeting Type: {doc.get('meeting_type', 'N/A')}") + print(f" File Type: {doc.get('file_type', 'N/A')}") + print(f" Source Document: {doc.get('source_document', 'N/A')}") + print(f" Content Preview: {doc.get('content', 'N/A')[:100]}...") + print() + + # Export metadata to Word + print(f"Exporting metadata for {date} to Word document...") + doc = Document() + doc.add_heading(f'Document Metadata for {date}', level=1) + + for doc_data in documents: + doc.add_heading(f"Chunk Index: {doc_data.get('chunk_index', 'N/A')}", level=2) + doc.add_paragraph(f"Meeting Date: {doc_data.get('meeting_date', 'N/A')}") + doc.add_paragraph(f"Meeting Type: {doc_data.get('meeting_type', 'N/A')}") + doc.add_paragraph(f"File Type: {doc_data.get('file_type', 'N/A')}") + doc.add_paragraph(f"Source Document: {doc_data.get('source_document', 'N/A')}") + doc.add_paragraph(f"Content Preview: {doc_data.get('content', 'N/A')}") + doc.add_paragraph("\n") + + word_file_path = f"Weaviate_Metadata_List_{date}.docx" + doc.save(word_file_path) + print(f"Metadata exported to {word_file_path} successfully.") + + except Exception as e: + print(f"Error querying Weaviate: {e}") + +if __name__ == "__main__": + # Filter by specific date (YYYY-MM-DD format) + specific_date = "2000-10-27" + fetch_documents_by_date_and_export_to_word(specific_date) \ No newline at end of file diff --git a/Preprocessing/tests/metadata_deletion_test.py b/Preprocessing/tests/metadata_deletion_test.py new file mode 100644 index 00000000..a1281834 --- /dev/null +++ b/Preprocessing/tests/metadata_deletion_test.py @@ -0,0 +1,125 @@ +# This allows you to test out deleted specific meeting date, meeting types, file type chunks. + +import os +import weaviate +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() +WEAVIATE_URL = os.getenv("WEAVIATE_URL") +WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY") + +# Initialize Weaviate client +client = weaviate.Client( + url=WEAVIATE_URL, + auth_client_secret=weaviate.AuthApiKey(api_key=WEAVIATE_API_KEY) +) + +def fetch_documents(date, meeting_type, file_type): + """ + Fetch documents from Weaviate based on specific criteria. 
+ + Args: + date (str): The date to filter by (YYYY-MM-DD format). + meeting_type (str): The meeting type to filter by (e.g., "Board of Commissioners"). + file_type (str): The file type to filter by (e.g., "Minutes"). + + Returns: + list: A list of matching documents. + """ + query = f""" + {{ + Get {{ + MeetingDocument(where: {{ + operator: And, + operands: [ + {{ + path: ["meeting_date"], + operator: Equal, + valueString: "{date}" + }}, + {{ + path: ["meeting_type"], + operator: Equal, + valueString: "{meeting_type}" + }}, + {{ + path: ["file_type"], + operator: Equal, + valueString: "{file_type}" + }} + ] + }}) {{ + _additional {{ + id + }} + meeting_date + meeting_type + file_type + chunk_index + content + }} + }} + }} + """ + response = client.query.raw(query) + documents = response.get("data", {}).get("Get", {}).get("MeetingDocument", []) + return documents + +def delete_documents(documents): + """ + Delete all documents in the provided list from Weaviate. + + Args: + documents (list): A list of documents with `_additional.id` to delete. + """ + for doc in documents: + doc_id = doc.get("_additional", {}).get("id") + if doc_id: + client.data_object.delete(doc_id) + print(f"Deleted document ID: {doc_id}") + else: + print("Document ID not found; skipping deletion.") + +if __name__ == "__main__": + # Specify the criteria for deletion + specific_date = "2000-10-27" + specific_meeting_type = "Board of Commissioners" + specific_file_type = "Minutes" + + # Step 1: Fetch documents + print(f"Fetching documents for {specific_date}, {specific_meeting_type}, {specific_file_type}...") + matching_documents = fetch_documents(specific_date, specific_meeting_type, specific_file_type) + if matching_documents: + print(f"\nFound {len(matching_documents)} matching documents:") + for doc in matching_documents: + print(f"- ID: {doc.get('_additional', {}).get('id')}") + print(f" Chunk Index: {doc.get('chunk_index', 'N/A')}") + print(f" Meeting Date: {doc.get('meeting_date', 'N/A')}") + print(f" Meeting Type: {doc.get('meeting_type', 'N/A')}") + print(f" File Type: {doc.get('file_type', 'N/A')}") + print(f" Content Preview: {doc.get('content', 'N/A')[:100]}...") + print() + else: + print("No matching documents found.") + + # Step 2: Delete documents + if matching_documents: + print("Deleting matching documents...") + delete_documents(matching_documents) + + # Step 3: Confirm deletion by re-fetching + print(f"Fetching documents again for {specific_date}, {specific_meeting_type}, {specific_file_type}...") + remaining_documents = fetch_documents(specific_date, specific_meeting_type, specific_file_type) + if remaining_documents: + print(f"\nFound {len(remaining_documents)} remaining documents (deletion failed for some):") + for doc in remaining_documents: + print(f"- ID: {doc.get('_additional', {}).get('id')}") + print(f" Chunk Index: {doc.get('chunk_index', 'N/A')}") + print(f" Meeting Date: {doc.get('meeting_date', 'N/A')}") + print(f" Meeting Type: {doc.get('meeting_type', 'N/A')}") + print(f" File Type: {doc.get('file_type', 'N/A')}") + print(f" Content Preview: {doc.get('content', 'N/A')[:100]}...") + print() + else: + print("All matching documents have been successfully deleted.") diff --git a/Preprocessing/tests/pytest.py b/Preprocessing/tests/pytest.py new file mode 100644 index 00000000..08b70282 --- /dev/null +++ b/Preprocessing/tests/pytest.py @@ -0,0 +1,255 @@ +#This is a few different pytests! 
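+# Note: pytest only collects files matching test_*.py or *_test.py by default, and a
+# module named pytest.py may collide with the installed pytest package on import, so
+# these tests are easiest to run after renaming the file (e.g. to test_preprocessing.py):
+#   pytest Preprocessing/tests/ -v
+# Several tests also expect Test_Minutes.pdf in the working directory and the
+# 2023_08_01_BOC_Agenda_* files to already exist in the Azure container.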
+ +import os +import sys +import importlib +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +# Add the parent directory (Preprocessing) to the Python module search path +current_dir = os.path.dirname(os.path.abspath(__file__)) +parent_dir = os.path.abspath(os.path.join(current_dir, "..")) +if parent_dir not in sys.path: + sys.path.append(parent_dir) + +def test_dependencies_installed(): + dependencies = [ + "streamlit", "requests", "azure.storage.blob", "openai", "weaviate", + "fitz", "assemblyai", "transformers", "chardet", "pytest", "easyocr", "os", "sys", "importlib" + ] + for lib in dependencies: + assert importlib.util.find_spec(lib) is not None, f"{lib} is not installed!" + +def test_env_variables(): + required_vars = [ + "OPENAI_API_KEY", "OPENAI_BASE_URL", "WEAVIATE_URL", "WEAVIATE_API_KEY", + "ASSEMBLY_AI_KEY", "AZURE_STORAGE_CONNECTION_STRING", "AZURE_STORAGE_CONTAINER", "PYTHONPATH" + ] + + # Debugging + for var in required_vars: + print(f"{var}: {os.getenv(var)}") + + missing_or_empty_vars = [ + var for var in required_vars if not os.getenv(var) or os.getenv(var).strip() == "" + ] + assert not missing_or_empty_vars, f"Missing or empty variables: {', '.join(missing_or_empty_vars)}" + +from utils.azure_blob_utils import download_from_azure +def test_download_from_azure(): + """ + Test downloading a specific file from Azure Blob Storage. + """ + # File details + folder_name = "raw" + file_name = "2023_08_01_BOC_Agenda_Raw.pdf" + downloaded_file_path = "downloaded_2023_08_01_BOC_Agenda_Raw.pdf" + container_name = os.getenv("AZURE_STORAGE_CONTAINER") + + # Ensure container name is loaded from environment + assert container_name, "AZURE_STORAGE_CONTAINER is not set in the .env file." + + print(f"Attempting to download {folder_name}/{file_name} from Azure Blob Storage...") + + # Download the file + try: + content = download_from_azure(folder_name, file_name, as_text=False) + # Save the downloaded content locally + with open(downloaded_file_path, "wb") as file: + file.write(content) + + # Check if the file exists locally + assert os.path.exists(downloaded_file_path), f"Downloaded file {downloaded_file_path} does not exist!" + print(f"Download successful. File saved to {downloaded_file_path}.") + except Exception as e: + assert False, f"Download failed with error: {e}" + + # Cleanup: Remove the downloaded file after the test + try: + if os.path.exists(downloaded_file_path): + os.remove(downloaded_file_path) + print(f"Cleaned up: {downloaded_file_path}.") + except Exception as cleanup_error: + print(f"Error during cleanup: {cleanup_error}") + +def test_download_from_dirty(): + """ + Test downloading a specific file from the 'dirty' folder in Azure Blob Storage. + """ + # File details + folder_name = "dirty" + file_name = "2023_08_01_BOC_Agenda_TextExtraction.txt" + downloaded_file_path = "downloaded_2023_08_01_BOC_Agenda_TextExtraction.txt" + container_name = os.getenv("AZURE_STORAGE_CONTAINER") + + # Ensure container name is loaded from environment + assert container_name, "AZURE_STORAGE_CONTAINER is not set in the .env file." 
+ + print(f"Attempting to download {folder_name}/{file_name} from Azure Blob Storage...") + + # Download the file + try: + content = download_from_azure(folder_name, file_name, as_text=True) + # Save the downloaded content locally + with open(downloaded_file_path, "w", encoding="utf-8") as file: + file.write(content) + + # Check if the file exists locally + assert os.path.exists(downloaded_file_path), f"Downloaded file {downloaded_file_path} does not exist!" + print(f"Download successful. File saved to {downloaded_file_path}.") + except Exception as e: + assert False, f"Download failed with error: {e}" + + # Cleanup: Remove the downloaded file after the test + try: + if os.path.exists(downloaded_file_path): + os.remove(downloaded_file_path) + print(f"Cleaned up: {downloaded_file_path}.") + except Exception as cleanup_error: + print(f"Error during cleanup: {cleanup_error}") + + +def test_download_from_clean(): + """ + Test downloading a specific file from the 'clean' folder in Azure Blob Storage. + """ + # File details + folder_name = "clean" + file_name = "2023_08_01_BOC_Agenda_Cleaned.txt" + downloaded_file_path = "downloaded_2023_08_01_BOC_Agenda_Cleaned.txt" + container_name = os.getenv("AZURE_STORAGE_CONTAINER") + + # Ensure container name is loaded from environment + assert container_name, "AZURE_STORAGE_CONTAINER is not set in the .env file." + + print(f"Attempting to download {folder_name}/{file_name} from Azure Blob Storage...") + + # Download the file + try: + content = download_from_azure(folder_name, file_name, as_text=True) + # Save the downloaded content locally + with open(downloaded_file_path, "w", encoding="utf-8") as file: + file.write(content) + + # Check if the file exists locally + assert os.path.exists(downloaded_file_path), f"Downloaded file {downloaded_file_path} does not exist!" + print(f"Download successful. File saved to {downloaded_file_path}.") + except Exception as e: + assert False, f"Download failed with error: {e}" + + # Cleanup: Remove the downloaded file after the test + try: + if os.path.exists(downloaded_file_path): + os.remove(downloaded_file_path) + print(f"Cleaned up: {downloaded_file_path}.") + except Exception as cleanup_error: + print(f"Error during cleanup: {cleanup_error}") + +from utils.azure_blob_utils import upload_to_azure + +def test_upload_to_raw(): + """ + Test uploading a file to the 'raw' folder in Azure Blob Storage. + """ + # File details + folder_name = "raw" + file_name = "Test_Minutes.pdf" + container_name = os.getenv("AZURE_STORAGE_CONTAINER") + + # Ensure container name is loaded from environment + assert container_name, "AZURE_STORAGE_CONTAINER is not set in the .env file." + + # Read the local file + local_file_path = "Test_Minutes.pdf" # Replace with your actual test file path + assert os.path.exists(local_file_path), f"Local file {local_file_path} does not exist!" + with open(local_file_path, "rb") as f: + file_content = f.read() + + # Upload to Azure + print(f"Uploading {local_file_path} to {folder_name}/{file_name} in Azure Blob Storage...") + try: + upload_to_azure(folder_name, file_name, file_content) + print(f"Upload successful: {folder_name}/{file_name}") + except Exception as e: + assert False, f"Upload failed with error: {e}" + + +def test_upload_to_dirty(): + """ + Test uploading a file to the 'dirty' folder in Azure Blob Storage. 
+ """ + # File details + folder_name = "dirty" + file_name = "Test_Minutes.pdf" + container_name = os.getenv("AZURE_STORAGE_CONTAINER") + + # Ensure container name is loaded from environment + assert container_name, "AZURE_STORAGE_CONTAINER is not set in the .env file." + + # Read the local file + local_file_path = "Test_Minutes.pdf" # Replace with your actual test file path + assert os.path.exists(local_file_path), f"Local file {local_file_path} does not exist!" + with open(local_file_path, "rb") as f: + file_content = f.read() + + # Upload to Azure + print(f"Uploading {local_file_path} to {folder_name}/{file_name} in Azure Blob Storage...") + try: + upload_to_azure(folder_name, file_name, file_content) + print(f"Upload successful: {folder_name}/{file_name}") + except Exception as e: + assert False, f"Upload failed with error: {e}" + + +def test_upload_to_clean(): + """ + Test uploading a file to the 'clean' folder in Azure Blob Storage. + """ + # File details + folder_name = "clean" + file_name = "Test_Minutes.pdf" + container_name = os.getenv("AZURE_STORAGE_CONTAINER") + + # Ensure container name is loaded from environment + assert container_name, "AZURE_STORAGE_CONTAINER is not set in the .env file." + + # Read the local file + local_file_path = "Test_Minutes.pdf" # Replace with your actual test file path + assert os.path.exists(local_file_path), f"Local file {local_file_path} does not exist!" + with open(local_file_path, "rb") as f: + file_content = f.read() + + # Upload to Azure + print(f"Uploading {local_file_path} to {folder_name}/{file_name} in Azure Blob Storage...") + try: + upload_to_azure(folder_name, file_name, file_content) + print(f"Upload successful: {folder_name}/{file_name}") + except Exception as e: + assert False, f"Upload failed with error: {e}" + + +from preprocessing_pipeline.pdf_conversion import convert_pdf_to_text + +def test_pdf_conversion(): + """ + Test the PDF to text conversion function. + """ + # Define the test file + test_pdf_path = "Test_Minutes.pdf" # Replace with your test PDF file path + + # Ensure the test file exists locally + assert os.path.exists(test_pdf_path), f"Test PDF file {test_pdf_path} does not exist!" + + # Attempt to convert the PDF to text + try: + print(f"Converting {test_pdf_path} to text...") + extracted_text = convert_pdf_to_text(test_pdf_path) + + # Assertions to verify the conversion worked + assert isinstance(extracted_text, str), "Extracted text is not a string!" + assert len(extracted_text) > 0, "Extracted text is empty!" + print(f"PDF conversion successful. Extracted text length: {len(extracted_text)} characters.") + except Exception as e: + assert False, f"PDF conversion failed with error: {e}" \ No newline at end of file