diff --git a/Preprocessing/App/main.py b/Preprocessing/App/main.py index 4d8d5a2e..e6546dfa 100644 --- a/Preprocessing/App/main.py +++ b/Preprocessing/App/main.py @@ -232,37 +232,61 @@ def upload_files_page(): def view_documents_page(): st.title("Uploaded Documents") try: + # Fetch blobs from each folder raw_blobs = list_blobs_in_folder("raw") dirty_blobs = list_blobs_in_folder("dirty") clean_blobs = list_blobs_in_folder("clean") - # Display documents by category - if raw_blobs: - st.subheader("Raw Documents") - for blob in raw_blobs: - st.write(f"- {blob}") - if st.button(f"Download {blob}", key=f"download_raw_{blob}"): - file_content = download_from_azure("raw", blob) - st.download_button("Download", data=file_content, file_name=blob) - - if dirty_blobs: - st.subheader("Dirty Documents") - for blob in dirty_blobs: - st.write(f"- {blob}") - if st.button(f"Download {blob}", key=f"download_dirty_{blob}"): - file_content = download_from_azure("dirty", blob) - st.download_button("Download", data=file_content, file_name=blob) - - if clean_blobs: - st.subheader("Clean Documents") - for blob in clean_blobs: - st.write(f"- {blob}") - if st.button(f"Download {blob}", key=f"download_clean_{blob}"): - file_content = download_from_azure("clean", blob) - st.download_button("Download", data=file_content, file_name=blob) - - if not raw_blobs and not dirty_blobs and not clean_blobs: - st.write("No documents found in the Azure Blob Storage.") + def group_blobs_by_date(blobs): + """Groups blobs by their date extracted from the file name.""" + grouped = {} + for blob in blobs: + try: + # Extract the file name without folder prefix (e.g., "raw/") + file_name = blob.split("/")[-1] # Get only the file name part + + # Extract the date from the file name (assuming format: YYYY_MM_DD) + parts = file_name.split("_") # Split into ['2023', '12', '12', 'BOC', 'Agenda', ...] 
+ date_str = "_".join(parts[:3]) # Join the first three parts: '2023_12_12' + + # Convert the date string to a readable format + readable_date = datetime.strptime(date_str, "%Y_%m_%d").strftime("%B %d, %Y") + + # Group by the readable date + if readable_date not in grouped: + grouped[readable_date] = [] + grouped[readable_date].append(blob) + except (ValueError, IndexError): + # Handle files with unexpected formats + if "Unknown Date" not in grouped: + grouped["Unknown Date"] = [] + grouped["Unknown Date"].append(blob) + return grouped + + # Group blobs by date + raw_grouped = group_blobs_by_date(raw_blobs) + dirty_grouped = group_blobs_by_date(dirty_blobs) + clean_grouped = group_blobs_by_date(clean_blobs) + + # Function to display blobs within a group + def display_grouped_blobs(grouped_blobs, category): + if grouped_blobs: + st.subheader(f"{category.capitalize()} Documents") + for date, blobs in grouped_blobs.items(): + with st.expander(f"Date: {date}", expanded=False): + for blob in blobs: + st.write(f"- {blob}") + if st.button(f"Download {blob}", key=f"download_{category}_{blob}"): + file_content = download_from_azure(category, blob) + st.download_button("Download", data=file_content, file_name=blob) + else: + st.info(f"No documents found in the {category} category.") + + # Display grouped blobs + display_grouped_blobs(raw_grouped, "raw") + display_grouped_blobs(dirty_grouped, "dirty") + display_grouped_blobs(clean_grouped, "clean") + except Exception as e: st.error(f"Error fetching documents from Azure Blob Storage: {e}") @@ -284,4 +308,4 @@ def view_documents_page(): elif st.session_state.page == "upload": upload_files_page() elif st.session_state.page == "view": - view_documents_page() + view_documents_page() \ No newline at end of file diff --git a/Preprocessing/README.md b/Preprocessing/README.md index 9f3d7b07..c08cfb5f 100644 --- a/Preprocessing/README.md +++ b/Preprocessing/README.md @@ -44,6 +44,7 @@ The **Preprocessing Pipeline** is a staff-facing 
application designed to streaml #### For Agendas and Minutes ๐Ÿ“„: 1. ๐Ÿ“ฅ **Upload**: Files are uploaded to Azureโ€™s "Raw Data" folder. 2. ๐Ÿ“„ **PDF-to-Text Conversion**: Files are converted to text using a PDF conversion utility. + - If the PDF contains scanned images, `easyocr` is used as a fallback for Optical Character Recognition (OCR). 3. ๐Ÿ› ๏ธ **Cleaning**: The raw text is saved in a "Dirty Folder," tokenized, chunked, and sent to OpenAI for cleaning. 4. ๐Ÿ“Š **Vectorization**: The cleaned text is embedded using **text-embedding-ada-002**. 5. ๐Ÿ’พ **Storage**: Vectorized data is stored in Weaviate Cloud for further analysis and retrieval. @@ -56,6 +57,20 @@ The **Preprocessing Pipeline** is a staff-facing application designed to streaml --- +## ๐Ÿงช Testing with Pytest + +The project includes a `pytest` file to validate key components of the pipeline. Hereโ€™s what you can test: +1. **Dependencies Check**: + - Ensure all required dependencies are installed. +2. **Environment Variables Check**: + - Verify that all environment variables (e.g., API keys, connection strings) are properly set up. +3. **Azure Upload and Download**: + - Test uploading and downloading files to/from Azure Blob Storage folders (`raw`, `dirty`, `clean`). +4. **PDF Conversion**: + - Test the `convert_pdf_to_text` function to extract text from a PDF, including fallback OCR with `easyocr` for scanned PDFs. 
+ +--- + ## โš™๏ธ Setting Up Locally ### ๐Ÿ”‘ Prerequisites diff --git a/Preprocessing/docker/requirements.txt b/Preprocessing/docker/requirements.txt index 757357bb..e90a5e3e 100644 --- a/Preprocessing/docker/requirements.txt +++ b/Preprocessing/docker/requirements.txt @@ -19,6 +19,8 @@ PyMuPDF # azure portal azure.storage.blob +# additional libraries transformers - -chardet \ No newline at end of file +chardet +pytest +easyocr \ No newline at end of file diff --git a/Preprocessing/preprocessing_pipeline/pdf_conversion.py b/Preprocessing/preprocessing_pipeline/pdf_conversion.py index 0e23c92a..7b3c7499 100644 --- a/Preprocessing/preprocessing_pipeline/pdf_conversion.py +++ b/Preprocessing/preprocessing_pipeline/pdf_conversion.py @@ -1,9 +1,13 @@ import fitz # PyMuPDF +import easyocr +from PIL import Image +from io import BytesIO +import numpy as np from utils.azure_blob_utils import download_from_azure def convert_pdf_to_text(raw_file_name): """ - Extracts text from a PDF file. + Extracts text from a PDF file. Uses EasyOCR as a fallback for scanned PDFs. Args: raw_file_name (str): Name of the PDF file in Azure Blob Storage (raw folder). 
@@ -15,17 +19,31 @@ def convert_pdf_to_text(raw_file_name): # Step 1: Download the raw file from Azure Blob Storage raw_content = download_from_azure("raw", raw_file_name, as_text=False) - # Step 2: Open the PDF content and extract text - text = "" + # Step 2: Open the PDF content pdf_document = fitz.open(stream=raw_content, filetype="pdf") + text = "" + reader = easyocr.Reader(['en']) # EasyOCR for English; NOTE: model load is costly -- consider caching at module level + for page_num in range(pdf_document.page_count): page = pdf_document[page_num] - text += page.get_text() - pdf_document.close() + # Attempt to extract text directly + page_text = page.get_text() + if page_text.strip(): # If direct text is available + print(f"Text extracted directly from page {page_num + 1}.") + text += page_text + else: # Fallback to OCR for scanned pages + print(f"Applying OCR on page {page_num + 1} of {raw_file_name}.") + pix = page.get_pixmap(dpi=300) # Render page to an image + img = Image.open(BytesIO(pix.tobytes("png"))) + img_array = np.array(img) # Convert PIL Image to NumPy array for EasyOCR + ocr_text = reader.readtext(img_array, detail=0) # Extract text with EasyOCR + text += "\n".join(ocr_text) + + pdf_document.close() print(f"Successfully extracted text from {raw_file_name}.") return text except Exception as e: - print(f"Error extracting text from PDF {raw_file_name}: {e}") + print(f"Error extracting text (direct or OCR) from PDF {raw_file_name}: {e}") return None diff --git a/Preprocessing/tests/Test_Minutes.pdf b/Preprocessing/tests/Test_Minutes.pdf new file mode 100644 index 00000000..b450e9ec Binary files /dev/null and b/Preprocessing/tests/Test_Minutes.pdf differ diff --git a/Preprocessing/tests/preprocess_test.py b/Preprocessing/tests/preprocess_test.py new file mode 100644 index 00000000..bf2795a7 --- /dev/null +++ b/Preprocessing/tests/preprocess_test.py @@ -0,0 +1,253 @@ +import os +import sys +import importlib +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +# Add the parent directory (Preprocessing) to the
Python module search path +current_dir = os.path.dirname(os.path.abspath(__file__)) +parent_dir = os.path.abspath(os.path.join(current_dir, "..")) +if parent_dir not in sys.path: + sys.path.append(parent_dir) + +def test_dependencies_installed(): + dependencies = [ + "streamlit", "requests", "azure.storage.blob", "openai", "weaviate", + "fitz", "assemblyai", "transformers", "chardet", "pytest", "easyocr", "os", "sys", "importlib" + ] + for lib in dependencies: + assert importlib.util.find_spec(lib) is not None, f"{lib} is not installed!" + +def test_env_variables(): + required_vars = [ + "OPENAI_API_KEY", "OPENAI_BASE_URL", "WEAVIATE_URL", "WEAVIATE_API_KEY", + "ASSEMBLY_AI_KEY", "AZURE_STORAGE_CONNECTION_STRING", "AZURE_STORAGE_CONTAINER", "PYTHONPATH" + ] + + # Debugging + for var in required_vars: + print(f"{var}: {os.getenv(var)}") + + missing_or_empty_vars = [ + var for var in required_vars if not os.getenv(var) or os.getenv(var).strip() == "" + ] + assert not missing_or_empty_vars, f"Missing or empty variables: {', '.join(missing_or_empty_vars)}" + +from utils.azure_blob_utils import download_from_azure +def test_download_from_azure(): + """ + Test downloading a specific file from Azure Blob Storage. + """ + # File details + folder_name = "raw" + file_name = "2023_08_01_BOC_Agenda_Raw.pdf" + downloaded_file_path = "downloaded_2023_08_01_BOC_Agenda_Raw.pdf" + container_name = os.getenv("AZURE_STORAGE_CONTAINER") + + # Ensure container name is loaded from environment + assert container_name, "AZURE_STORAGE_CONTAINER is not set in the .env file." 
+ + print(f"Attempting to download {folder_name}/{file_name} from Azure Blob Storage...") + + # Download the file + try: + content = download_from_azure(folder_name, file_name, as_text=False) + # Save the downloaded content locally + with open(downloaded_file_path, "wb") as file: + file.write(content) + + # Check if the file exists locally + assert os.path.exists(downloaded_file_path), f"Downloaded file {downloaded_file_path} does not exist!" + print(f"Download successful. File saved to {downloaded_file_path}.") + except Exception as e: + assert False, f"Download failed with error: {e}" + + # Cleanup: Remove the downloaded file after the test + try: + if os.path.exists(downloaded_file_path): + os.remove(downloaded_file_path) + print(f"Cleaned up: {downloaded_file_path}.") + except Exception as cleanup_error: + print(f"Error during cleanup: {cleanup_error}") + +def test_download_from_dirty(): + """ + Test downloading a specific file from the 'dirty' folder in Azure Blob Storage. + """ + # File details + folder_name = "dirty" + file_name = "2023_08_01_BOC_Agenda_TextExtraction.txt" + downloaded_file_path = "downloaded_2023_08_01_BOC_Agenda_TextExtraction.txt" + container_name = os.getenv("AZURE_STORAGE_CONTAINER") + + # Ensure container name is loaded from environment + assert container_name, "AZURE_STORAGE_CONTAINER is not set in the .env file." + + print(f"Attempting to download {folder_name}/{file_name} from Azure Blob Storage...") + + # Download the file + try: + content = download_from_azure(folder_name, file_name, as_text=True) + # Save the downloaded content locally + with open(downloaded_file_path, "w", encoding="utf-8") as file: + file.write(content) + + # Check if the file exists locally + assert os.path.exists(downloaded_file_path), f"Downloaded file {downloaded_file_path} does not exist!" + print(f"Download successful. 
File saved to {downloaded_file_path}.") + except Exception as e: + assert False, f"Download failed with error: {e}" + + # Cleanup: Remove the downloaded file after the test + try: + if os.path.exists(downloaded_file_path): + os.remove(downloaded_file_path) + print(f"Cleaned up: {downloaded_file_path}.") + except Exception as cleanup_error: + print(f"Error during cleanup: {cleanup_error}") + + +def test_download_from_clean(): + """ + Test downloading a specific file from the 'clean' folder in Azure Blob Storage. + """ + # File details + folder_name = "clean" + file_name = "2023_08_01_BOC_Agenda_Cleaned.txt" + downloaded_file_path = "downloaded_2023_08_01_BOC_Agenda_Cleaned.txt" + container_name = os.getenv("AZURE_STORAGE_CONTAINER") + + # Ensure container name is loaded from environment + assert container_name, "AZURE_STORAGE_CONTAINER is not set in the .env file." + + print(f"Attempting to download {folder_name}/{file_name} from Azure Blob Storage...") + + # Download the file + try: + content = download_from_azure(folder_name, file_name, as_text=True) + # Save the downloaded content locally + with open(downloaded_file_path, "w", encoding="utf-8") as file: + file.write(content) + + # Check if the file exists locally + assert os.path.exists(downloaded_file_path), f"Downloaded file {downloaded_file_path} does not exist!" + print(f"Download successful. File saved to {downloaded_file_path}.") + except Exception as e: + assert False, f"Download failed with error: {e}" + + # Cleanup: Remove the downloaded file after the test + try: + if os.path.exists(downloaded_file_path): + os.remove(downloaded_file_path) + print(f"Cleaned up: {downloaded_file_path}.") + except Exception as cleanup_error: + print(f"Error during cleanup: {cleanup_error}") + +from utils.azure_blob_utils import upload_to_azure + +def test_upload_to_raw(): + """ + Test uploading a file to the 'raw' folder in Azure Blob Storage. 
+ """ + # File details + folder_name = "raw" + file_name = "Test_Minutes.pdf" + container_name = os.getenv("AZURE_STORAGE_CONTAINER") + + # Ensure container name is loaded from environment + assert container_name, "AZURE_STORAGE_CONTAINER is not set in the .env file." + + # Read the local file + local_file_path = "Test_Minutes.pdf" # Replace with your actual test file path + assert os.path.exists(local_file_path), f"Local file {local_file_path} does not exist!" + with open(local_file_path, "rb") as f: + file_content = f.read() + + # Upload to Azure + print(f"Uploading {local_file_path} to {folder_name}/{file_name} in Azure Blob Storage...") + try: + upload_to_azure(folder_name, file_name, file_content) + print(f"Upload successful: {folder_name}/{file_name}") + except Exception as e: + assert False, f"Upload failed with error: {e}" + + +def test_upload_to_dirty(): + """ + Test uploading a file to the 'dirty' folder in Azure Blob Storage. + """ + # File details + folder_name = "dirty" + file_name = "Test_Minutes.pdf" + container_name = os.getenv("AZURE_STORAGE_CONTAINER") + + # Ensure container name is loaded from environment + assert container_name, "AZURE_STORAGE_CONTAINER is not set in the .env file." + + # Read the local file + local_file_path = "Test_Minutes.pdf" # Replace with your actual test file path + assert os.path.exists(local_file_path), f"Local file {local_file_path} does not exist!" + with open(local_file_path, "rb") as f: + file_content = f.read() + + # Upload to Azure + print(f"Uploading {local_file_path} to {folder_name}/{file_name} in Azure Blob Storage...") + try: + upload_to_azure(folder_name, file_name, file_content) + print(f"Upload successful: {folder_name}/{file_name}") + except Exception as e: + assert False, f"Upload failed with error: {e}" + + +def test_upload_to_clean(): + """ + Test uploading a file to the 'clean' folder in Azure Blob Storage. 
+ """ + # File details + folder_name = "clean" + file_name = "Test_Minutes.pdf" + container_name = os.getenv("AZURE_STORAGE_CONTAINER") + + # Ensure container name is loaded from environment + assert container_name, "AZURE_STORAGE_CONTAINER is not set in the .env file." + + # Read the local file + local_file_path = "Test_Minutes.pdf" # Replace with your actual test file path + assert os.path.exists(local_file_path), f"Local file {local_file_path} does not exist!" + with open(local_file_path, "rb") as f: + file_content = f.read() + + # Upload to Azure + print(f"Uploading {local_file_path} to {folder_name}/{file_name} in Azure Blob Storage...") + try: + upload_to_azure(folder_name, file_name, file_content) + print(f"Upload successful: {folder_name}/{file_name}") + except Exception as e: + assert False, f"Upload failed with error: {e}" + + +from preprocessing_pipeline.pdf_conversion import convert_pdf_to_text + +def test_pdf_conversion(): + """ + Test the PDF to text conversion function. + """ + # Define the test file + test_pdf_path = "Test_Minutes.pdf" # Replace with your test PDF file path + + # Ensure the test file exists locally + assert os.path.exists(test_pdf_path), f"Test PDF file {test_pdf_path} does not exist!" + + # Attempt to convert the PDF to text + try: + print(f"Converting {test_pdf_path} to text...") + extracted_text = convert_pdf_to_text(test_pdf_path) + + # Assertions to verify the conversion worked + assert isinstance(extracted_text, str), "Extracted text is not a string!" + assert len(extracted_text) > 0, "Extracted text is empty!" + print(f"PDF conversion successful. Extracted text length: {len(extracted_text)} characters.") + except Exception as e: + assert False, f"PDF conversion failed with error: {e}"