Pytest + OCR Addition #55

Merged: merged 1 commit on Nov 24, 2024
80 changes: 52 additions & 28 deletions Preprocessing/App/main.py
@@ -13,17 +13,17 @@
sys.path.append(python_path)

# Import dependencies
import streamlit as st

Check failure on line 16 (GitHub Actions / ruff): Preprocessing/App/main.py:16:1: E402 Module level import not at top of file
import weaviate # Import Weaviate client

Check failure on line 17 (GitHub Actions / ruff): Preprocessing/App/main.py:17:1: E402 Module level import not at top of file
from preprocessing_pipeline.pdf_conversion import convert_pdf_to_text

Check failure on line 18 (GitHub Actions / ruff): Preprocessing/App/main.py:18:1: E402 Module level import not at top of file
from preprocessing_pipeline.audio_transcription import transcribe_audio

Check failure on line 19 (GitHub Actions / ruff): Preprocessing/App/main.py:19:1: E402 Module level import not at top of file
from preprocessing_pipeline.text_cleaning import clean_text

Check failure on line 20 (GitHub Actions / ruff): Preprocessing/App/main.py:20:1: E402 Module level import not at top of file
from preprocessing_pipeline.chunking_vector_embedding import tokenize_and_embed_text

Check failure on line 21 (GitHub Actions / ruff): Preprocessing/App/main.py:21:1: E402 Module level import not at top of file
from utils.azure_blob_utils import (
    upload_to_azure,
    download_from_azure,
    list_blobs_in_folder
)

Check failure on line 26 (GitHub Actions / ruff): Preprocessing/App/main.py:22:1: E402 Module level import not at top of file

# Set up Weaviate client
client = weaviate.Client(
@@ -202,7 +202,7 @@
        st.error("Failed to extract text from the PDF.")

# Stage 3: Clean Text and Upload to Clean
        dirty_content = download_from_azure("dirty", dirty_file_name)

Check failure on line 205 (GitHub Actions / ruff): Preprocessing/App/main.py:205:9: F841 Local variable `dirty_content` is assigned to but never used
        with st.spinner("Cleaning text using generative AI..."):
            cleaned_text = clean_text(dirty_file_name)
            clean_file_name = generate_file_name(metadata, "Cleaned") + ".txt"
@@ -232,37 +232,61 @@
def view_documents_page():
    st.title("Uploaded Documents")
    try:
        # Fetch blobs from each folder
        raw_blobs = list_blobs_in_folder("raw")
        dirty_blobs = list_blobs_in_folder("dirty")
        clean_blobs = list_blobs_in_folder("clean")

        # Display documents by category
        if raw_blobs:
            st.subheader("Raw Documents")
            for blob in raw_blobs:
                st.write(f"- {blob}")
                if st.button(f"Download {blob}", key=f"download_raw_{blob}"):
                    file_content = download_from_azure("raw", blob)
                    st.download_button("Download", data=file_content, file_name=blob)

        if dirty_blobs:
            st.subheader("Dirty Documents")
            for blob in dirty_blobs:
                st.write(f"- {blob}")
                if st.button(f"Download {blob}", key=f"download_dirty_{blob}"):
                    file_content = download_from_azure("dirty", blob)
                    st.download_button("Download", data=file_content, file_name=blob)

        if clean_blobs:
            st.subheader("Clean Documents")
            for blob in clean_blobs:
                st.write(f"- {blob}")
                if st.button(f"Download {blob}", key=f"download_clean_{blob}"):
                    file_content = download_from_azure("clean", blob)
                    st.download_button("Download", data=file_content, file_name=blob)

        if not raw_blobs and not dirty_blobs and not clean_blobs:
            st.write("No documents found in the Azure Blob Storage.")
        def group_blobs_by_date(blobs):
            """Groups blobs by their date extracted from the file name."""
            grouped = {}
            for blob in blobs:
                try:
                    # Extract the file name without folder prefix (e.g., "raw/")
                    file_name = blob.split("/")[-1]  # Get only the file name part

                    # Extract the date from the file name (assuming format: YYYY_MM_DD)
                    parts = file_name.split("_")  # Split into ['2023', '12', '12', 'BOC', 'Agenda', ...]
                    date_str = "_".join(parts[:3])  # Join the first three parts: '2023_12_12'

                    # Convert the date string to a readable format
                    readable_date = datetime.strptime(date_str, "%Y_%m_%d").strftime("%B %d, %Y")

                    # Group by the readable date
                    if readable_date not in grouped:
                        grouped[readable_date] = []
                    grouped[readable_date].append(blob)
                except (ValueError, IndexError):
                    # Handle files with unexpected formats
                    if "Unknown Date" not in grouped:
                        grouped["Unknown Date"] = []
                    grouped["Unknown Date"].append(blob)
            return grouped
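
        # Example (illustrative, not part of the diff):
        #   group_blobs_by_date(["raw/2023_12_12_BOC_Agenda.pdf"])
        #   returns {"December 12, 2023": ["raw/2023_12_12_BOC_Agenda.pdf"]}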

        # Group blobs by date
        raw_grouped = group_blobs_by_date(raw_blobs)
        dirty_grouped = group_blobs_by_date(dirty_blobs)
        clean_grouped = group_blobs_by_date(clean_blobs)

        # Function to display blobs within a group
        def display_grouped_blobs(grouped_blobs, category):
            if grouped_blobs:
                st.subheader(f"{category.capitalize()} Documents")
                for date, blobs in grouped_blobs.items():
                    with st.expander(f"Date: {date}", expanded=False):
                        for blob in blobs:
                            st.write(f"- {blob}")
                            if st.button(f"Download {blob}", key=f"download_{category}_{blob}"):
                                file_content = download_from_azure(category, blob)
                                st.download_button("Download", data=file_content, file_name=blob)
            else:
                st.info(f"No documents found in the {category} category.")

        # Display grouped blobs
        display_grouped_blobs(raw_grouped, "raw")
        display_grouped_blobs(dirty_grouped, "dirty")
        display_grouped_blobs(clean_grouped, "clean")

    except Exception as e:
        st.error(f"Error fetching documents from Azure Blob Storage: {e}")

@@ -284,4 +308,4 @@
elif st.session_state.page == "upload":
upload_files_page()
elif st.session_state.page == "view":
view_documents_page()
view_documents_page()
15 changes: 15 additions & 0 deletions Preprocessing/README.md
@@ -44,6 +44,7 @@
#### For Agendas and Minutes 📄:
1. 📥 **Upload**: Files are uploaded to Azure’s "Raw Data" folder.
2. 📄 **PDF-to-Text Conversion**: Files are converted to text using a PDF conversion utility.
- If the PDF contains scanned images, `easyocr` is used as a fallback for Optical Character Recognition (OCR).
3. 🛠️ **Cleaning**: The raw text is saved in a "Dirty Folder," tokenized, chunked, and sent to OpenAI for cleaning.
4. 📊 **Vectorization**: The cleaned text is embedded using **text-embedding-ada-002**.
5. 💾 **Storage**: Vectorized data is stored in Weaviate Cloud for further analysis and retrieval.
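
For illustration only (this sketch is not part of the diff), step 4 amounts to a call like the one below, assuming the `openai` Python package and an `OPENAI_API_KEY` environment variable; the `embed_chunk` helper is hypothetical:

import os
from openai import OpenAI

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def embed_chunk(chunk: str) -> list[float]:
    # text-embedding-ada-002 returns one 1536-dimensional vector per input
    response = client.embeddings.create(model="text-embedding-ada-002", input=chunk)
    return response.data[0].embedding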
@@ -56,6 +57,20 @@

---

## 🧪 Testing with Pytest

The project includes a `pytest` suite to validate key components of the pipeline. Here’s what you can test (an illustrative sketch follows the list):
1. **Dependencies Check**:
- Ensure all required dependencies are installed.
2. **Environment Variables Check**:
- Verify that all environment variables (e.g., API keys, connection strings) are properly set up.
3. **Azure Upload and Download**:
- Test uploading and downloading files to/from Azure Blob Storage folders (`raw`, `dirty`, `clean`).
4. **PDF Conversion**:
- Test the `convert_pdf_to_text` function to extract text from a PDF, including fallback OCR with `easyocr` for scanned PDFs.
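
For illustration, a minimal sketch of what such tests might look like; the test names and environment-variable names here are assumptions, not the contents of the actual test file:

import os
import pytest
from preprocessing_pipeline.pdf_conversion import convert_pdf_to_text

REQUIRED_ENV_VARS = ["AZURE_STORAGE_CONNECTION_STRING", "OPENAI_API_KEY"]  # assumed names

@pytest.mark.parametrize("var", REQUIRED_ENV_VARS)
def test_environment_variable_is_set(var):
    assert os.getenv(var), f"Missing required environment variable: {var}"

def test_pdf_conversion_returns_text():
    # Assumes Test_Minutes.pdf was first uploaded to the Azure "raw" folder.
    text = convert_pdf_to_text("Test_Minutes.pdf")
    assert text and text.strip(), "Expected non-empty text from the test PDF"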

---

## ⚙️ Setting Up Locally

### 🔑 Prerequisites
6 changes: 4 additions & 2 deletions Preprocessing/docker/requirements.txt
@@ -19,6 +19,8 @@ PyMuPDF
# azure portal
azure.storage.blob

# additional libraries
transformers

chardet
pytest
easyocr
30 changes: 24 additions & 6 deletions Preprocessing/preprocessing_pipeline/pdf_conversion.py
@@ -1,9 +1,13 @@
import fitz # PyMuPDF
import easyocr
from PIL import Image
from io import BytesIO
import numpy as np
from utils.azure_blob_utils import download_from_azure

def convert_pdf_to_text(raw_file_name):
    """
    Extracts text from a PDF file. Uses EasyOCR as a fallback for scanned PDFs.

    Args:
        raw_file_name (str): Name of the PDF file in Azure Blob Storage (raw folder).
@@ -15,17 +19,31 @@ def convert_pdf_to_text(raw_file_name):
        # Step 1: Download the raw file from Azure Blob Storage
        raw_content = download_from_azure("raw", raw_file_name, as_text=False)

        # Step 2: Open the PDF content
        pdf_document = fitz.open(stream=raw_content, filetype="pdf")
        text = ""
        reader = easyocr.Reader(['en'])  # Initialize EasyOCR for English

        for page_num in range(pdf_document.page_count):
            page = pdf_document[page_num]

            # Attempt to extract text directly
            page_text = page.get_text()
            if page_text.strip():  # If direct text is available
                print(f"Text extracted directly from page {page_num + 1}.")
                text += page_text
            else:  # Fallback to OCR for scanned pages
                print(f"Applying OCR on page {page_num + 1} of {raw_file_name}.")
                pix = page.get_pixmap(dpi=300)  # Render page to an image
                img = Image.open(BytesIO(pix.tobytes("png")))
                img_array = np.array(img)  # Convert PIL Image to NumPy array for EasyOCR
                ocr_text = reader.readtext(img_array, detail=0)  # Extract text with EasyOCR
                text += "\n".join(ocr_text)

        pdf_document.close()
        print(f"Successfully extracted text from {raw_file_name}.")
        return text

    except Exception as e:
        print(f"Error in OCR for {raw_file_name}: {e}")
        return None
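
A note on the design: `easyocr.Reader(['en'])` loads (and on first use downloads) its detection and recognition models, so initializing it once per call rather than once per page keeps the OCR fallback reasonably cheap, and rendering pages at `dpi=300` trades speed for recognition accuracy. Hypothetical usage, assuming the file was already uploaded to the raw folder:

# Hypothetical usage sketch; the file name is made up.
text = convert_pdf_to_text("2023_12_12_BOC_Minutes.pdf")
if text:
    print(text[:500])  # Preview the first 500 extracted characters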
Binary file added Preprocessing/tests/Test_Minutes.pdf