diff --git a/Preprocessing/App/main.py b/Preprocessing/App/main.py index 4d8d5a2e..e6546dfa 100644 --- a/Preprocessing/App/main.py +++ b/Preprocessing/App/main.py @@ -232,37 +232,61 @@ def upload_files_page(): def view_documents_page(): st.title("Uploaded Documents") try: + # Fetch blobs from each folder raw_blobs = list_blobs_in_folder("raw") dirty_blobs = list_blobs_in_folder("dirty") clean_blobs = list_blobs_in_folder("clean") - # Display documents by category - if raw_blobs: - st.subheader("Raw Documents") - for blob in raw_blobs: - st.write(f"- {blob}") - if st.button(f"Download {blob}", key=f"download_raw_{blob}"): - file_content = download_from_azure("raw", blob) - st.download_button("Download", data=file_content, file_name=blob) - - if dirty_blobs: - st.subheader("Dirty Documents") - for blob in dirty_blobs: - st.write(f"- {blob}") - if st.button(f"Download {blob}", key=f"download_dirty_{blob}"): - file_content = download_from_azure("dirty", blob) - st.download_button("Download", data=file_content, file_name=blob) - - if clean_blobs: - st.subheader("Clean Documents") - for blob in clean_blobs: - st.write(f"- {blob}") - if st.button(f"Download {blob}", key=f"download_clean_{blob}"): - file_content = download_from_azure("clean", blob) - st.download_button("Download", data=file_content, file_name=blob) - - if not raw_blobs and not dirty_blobs and not clean_blobs: - st.write("No documents found in the Azure Blob Storage.") + def group_blobs_by_date(blobs): + """Groups blobs by their date extracted from the file name.""" + grouped = {} + for blob in blobs: + try: + # Extract the file name without folder prefix (e.g., "raw/") + file_name = blob.split("/")[-1] # Get only the file name part + + # Extract the date from the file name (assuming format: YYYY_MM_DD) + parts = file_name.split("_") # Split into ['2023', '12', '12', 'BOC', 'Agenda', ...] 
+ date_str = "_".join(parts[:3]) # Join the first three parts: '2023_12_12' + + # Convert the date string to a readable format + readable_date = datetime.strptime(date_str, "%Y_%m_%d").strftime("%B %d, %Y") + + # Group by the readable date + if readable_date not in grouped: + grouped[readable_date] = [] + grouped[readable_date].append(blob) + except (ValueError, IndexError): + # Handle files with unexpected formats + if "Unknown Date" not in grouped: + grouped["Unknown Date"] = [] + grouped["Unknown Date"].append(blob) + return grouped + + # Group blobs by date + raw_grouped = group_blobs_by_date(raw_blobs) + dirty_grouped = group_blobs_by_date(dirty_blobs) + clean_grouped = group_blobs_by_date(clean_blobs) + + # Function to display blobs within a group + def display_grouped_blobs(grouped_blobs, category): + if grouped_blobs: + st.subheader(f"{category.capitalize()} Documents") + for date, blobs in grouped_blobs.items(): + with st.expander(f"Date: {date}", expanded=False): + for blob in blobs: + st.write(f"- {blob}") + if st.button(f"Download {blob}", key=f"download_{category}_{blob}"): + file_content = download_from_azure(category, blob) + st.download_button("Download", data=file_content, file_name=blob) + else: + st.info(f"No documents found in the {category} category.") + + # Display grouped blobs + display_grouped_blobs(raw_grouped, "raw") + display_grouped_blobs(dirty_grouped, "dirty") + display_grouped_blobs(clean_grouped, "clean") + except Exception as e: st.error(f"Error fetching documents from Azure Blob Storage: {e}") @@ -284,4 +308,4 @@ def view_documents_page(): elif st.session_state.page == "upload": upload_files_page() elif st.session_state.page == "view": - view_documents_page() + view_documents_page() \ No newline at end of file diff --git a/Preprocessing/README.md b/Preprocessing/README.md index 9f3d7b07..c08cfb5f 100644 --- a/Preprocessing/README.md +++ b/Preprocessing/README.md @@ -44,6 +44,7 @@ The **Preprocessing Pipeline** is a staff-facing 
application designed to streaml #### For Agendas and Minutes ๐Ÿ“„: 1. ๐Ÿ“ฅ **Upload**: Files are uploaded to Azureโ€™s "Raw Data" folder. 2. ๐Ÿ“„ **PDF-to-Text Conversion**: Files are converted to text using a PDF conversion utility. + - If the PDF contains scanned images, `easyocr` is used as a fallback for Optical Character Recognition (OCR). 3. ๐Ÿ› ๏ธ **Cleaning**: The raw text is saved in a "Dirty Folder," tokenized, chunked, and sent to OpenAI for cleaning. 4. ๐Ÿ“Š **Vectorization**: The cleaned text is embedded using **text-embedding-ada-002**. 5. ๐Ÿ’พ **Storage**: Vectorized data is stored in Weaviate Cloud for further analysis and retrieval. @@ -56,6 +57,20 @@ The **Preprocessing Pipeline** is a staff-facing application designed to streaml --- +## ๐Ÿงช Testing with Pytest + +The project includes a `pytest` file to validate key components of the pipeline. Hereโ€™s what you can test: +1. **Dependencies Check**: + - Ensure all required dependencies are installed. +2. **Environment Variables Check**: + - Verify that all environment variables (e.g., API keys, connection strings) are properly set up. +3. **Azure Upload and Download**: + - Test uploading and downloading files to/from Azure Blob Storage folders (`raw`, `dirty`, `clean`). +4. **PDF Conversion**: + - Test the `convert_pdf_to_text` function to extract text from a PDF, including fallback OCR with `easyocr` for scanned PDFs. 
+ +--- + ## โš™๏ธ Setting Up Locally ### ๐Ÿ”‘ Prerequisites diff --git a/Preprocessing/docker/requirements.txt b/Preprocessing/docker/requirements.txt index 757357bb..e90a5e3e 100644 --- a/Preprocessing/docker/requirements.txt +++ b/Preprocessing/docker/requirements.txt @@ -19,6 +19,8 @@ PyMuPDF # azure portal azure.storage.blob +# additional libraries transformers - -chardet \ No newline at end of file +chardet +pytest +easyocr \ No newline at end of file diff --git a/Preprocessing/preprocessing_pipeline/pdf_conversion.py b/Preprocessing/preprocessing_pipeline/pdf_conversion.py index 0e23c92a..7b3c7499 100644 --- a/Preprocessing/preprocessing_pipeline/pdf_conversion.py +++ b/Preprocessing/preprocessing_pipeline/pdf_conversion.py @@ -1,9 +1,13 @@ import fitz # PyMuPDF +import easyocr +from PIL import Image +from io import BytesIO +import numpy as np from utils.azure_blob_utils import download_from_azure def convert_pdf_to_text(raw_file_name): """ - Extracts text from a PDF file. + Extracts text from a PDF file. Uses EasyOCR as a fallback for scanned PDFs. Args: raw_file_name (str): Name of the PDF file in Azure Blob Storage (raw folder). 
@@ -15,17 +19,31 @@ def convert_pdf_to_text(raw_file_name): # Step 1: Download the raw file from Azure Blob Storage raw_content = download_from_azure("raw", raw_file_name, as_text=False) - # Step 2: Open the PDF content and extract text - text = "" + # Step 2: Open the PDF content pdf_document = fitz.open(stream=raw_content, filetype="pdf") + text = "" + reader = easyocr.Reader(['en']) # EasyOCR for English; NOTE: model load is costly -- consider caching at module level + for page_num in range(pdf_document.page_count): page = pdf_document[page_num] - text += page.get_text() - pdf_document.close() + # Attempt to extract text directly + page_text = page.get_text() + if page_text.strip(): # If direct text is available + print(f"Text extracted directly from page {page_num + 1}.") + text += page_text + else: # Fallback to OCR for scanned pages + print(f"Applying OCR on page {page_num + 1} of {raw_file_name}.") + pix = page.get_pixmap(dpi=300) # Render page to an image + img = Image.open(BytesIO(pix.tobytes("png"))) + img_array = np.array(img) # Convert PIL Image to NumPy array for EasyOCR + ocr_text = reader.readtext(img_array, detail=0) # Extract text with EasyOCR + text += "\n".join(ocr_text) + + pdf_document.close() print(f"Successfully extracted text from {raw_file_name}.") return text except Exception as e: - print(f"Error extracting text from PDF {raw_file_name}: {e}") + print(f"Error extracting text (direct or OCR) from PDF {raw_file_name}: {e}") return None diff --git a/Preprocessing/tests/Test_Minutes.pdf b/Preprocessing/tests/Test_Minutes.pdf new file mode 100644 index 00000000..b450e9ec Binary files /dev/null and b/Preprocessing/tests/Test_Minutes.pdf differ diff --git a/Preprocessing/tests/preprocess_test.py b/Preprocessing/tests/preprocess_test.py new file mode 100644 index 00000000..bf2795a7 --- /dev/null +++ b/Preprocessing/tests/preprocess_test.py @@ -0,0 +1,253 @@ +import os +import sys +import importlib +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +# Add the parent directory (Preprocessing) to the
Python module search path +current_dir = os.path.dirname(os.path.abspath(__file__)) +parent_dir = os.path.abspath(os.path.join(current_dir, "..")) +if parent_dir not in sys.path: + sys.path.append(parent_dir) + +def test_dependencies_installed(): + dependencies = [ + "streamlit", "requests", "azure.storage.blob", "openai", "weaviate", + "fitz", "assemblyai", "transformers", "chardet", "pytest", "easyocr", "os", "sys", "importlib" + ] + for lib in dependencies: + assert importlib.util.find_spec(lib) is not None, f"{lib} is not installed!" + +def test_env_variables(): + required_vars = [ + "OPENAI_API_KEY", "OPENAI_BASE_URL", "WEAVIATE_URL", "WEAVIATE_API_KEY", + "ASSEMBLY_AI_KEY", "AZURE_STORAGE_CONNECTION_STRING", "AZURE_STORAGE_CONTAINER", "PYTHONPATH" + ] + + # Debugging + for var in required_vars: + print(f"{var}: {os.getenv(var)}") + + missing_or_empty_vars = [ + var for var in required_vars if not os.getenv(var) or os.getenv(var).strip() == "" + ] + assert not missing_or_empty_vars, f"Missing or empty variables: {', '.join(missing_or_empty_vars)}" + +from utils.azure_blob_utils import download_from_azure +def test_download_from_azure(): + """ + Test downloading a specific file from Azure Blob Storage. + """ + # File details + folder_name = "raw" + file_name = "2023_08_01_BOC_Agenda_Raw.pdf" + downloaded_file_path = "downloaded_2023_08_01_BOC_Agenda_Raw.pdf" + container_name = os.getenv("AZURE_STORAGE_CONTAINER") + + # Ensure container name is loaded from environment + assert container_name, "AZURE_STORAGE_CONTAINER is not set in the .env file." 
+ + print(f"Attempting to download {folder_name}/{file_name} from Azure Blob Storage...") + + # Download the file + try: + content = download_from_azure(folder_name, file_name, as_text=False) + # Save the downloaded content locally + with open(downloaded_file_path, "wb") as file: + file.write(content) + + # Check if the file exists locally + assert os.path.exists(downloaded_file_path), f"Downloaded file {downloaded_file_path} does not exist!" + print(f"Download successful. File saved to {downloaded_file_path}.") + except Exception as e: + assert False, f"Download failed with error: {e}" + + # Cleanup: Remove the downloaded file after the test + try: + if os.path.exists(downloaded_file_path): + os.remove(downloaded_file_path) + print(f"Cleaned up: {downloaded_file_path}.") + except Exception as cleanup_error: + print(f"Error during cleanup: {cleanup_error}") + +def test_download_from_dirty(): + """ + Test downloading a specific file from the 'dirty' folder in Azure Blob Storage. + """ + # File details + folder_name = "dirty" + file_name = "2023_08_01_BOC_Agenda_TextExtraction.txt" + downloaded_file_path = "downloaded_2023_08_01_BOC_Agenda_TextExtraction.txt" + container_name = os.getenv("AZURE_STORAGE_CONTAINER") + + # Ensure container name is loaded from environment + assert container_name, "AZURE_STORAGE_CONTAINER is not set in the .env file." + + print(f"Attempting to download {folder_name}/{file_name} from Azure Blob Storage...") + + # Download the file + try: + content = download_from_azure(folder_name, file_name, as_text=True) + # Save the downloaded content locally + with open(downloaded_file_path, "w", encoding="utf-8") as file: + file.write(content) + + # Check if the file exists locally + assert os.path.exists(downloaded_file_path), f"Downloaded file {downloaded_file_path} does not exist!" + print(f"Download successful. 
File saved to {downloaded_file_path}.") + except Exception as e: + assert False, f"Download failed with error: {e}" + + # Cleanup: Remove the downloaded file after the test + try: + if os.path.exists(downloaded_file_path): + os.remove(downloaded_file_path) + print(f"Cleaned up: {downloaded_file_path}.") + except Exception as cleanup_error: + print(f"Error during cleanup: {cleanup_error}") + + +def test_download_from_clean(): + """ + Test downloading a specific file from the 'clean' folder in Azure Blob Storage. + """ + # File details + folder_name = "clean" + file_name = "2023_08_01_BOC_Agenda_Cleaned.txt" + downloaded_file_path = "downloaded_2023_08_01_BOC_Agenda_Cleaned.txt" + container_name = os.getenv("AZURE_STORAGE_CONTAINER") + + # Ensure container name is loaded from environment + assert container_name, "AZURE_STORAGE_CONTAINER is not set in the .env file." + + print(f"Attempting to download {folder_name}/{file_name} from Azure Blob Storage...") + + # Download the file + try: + content = download_from_azure(folder_name, file_name, as_text=True) + # Save the downloaded content locally + with open(downloaded_file_path, "w", encoding="utf-8") as file: + file.write(content) + + # Check if the file exists locally + assert os.path.exists(downloaded_file_path), f"Downloaded file {downloaded_file_path} does not exist!" + print(f"Download successful. File saved to {downloaded_file_path}.") + except Exception as e: + assert False, f"Download failed with error: {e}" + + # Cleanup: Remove the downloaded file after the test + try: + if os.path.exists(downloaded_file_path): + os.remove(downloaded_file_path) + print(f"Cleaned up: {downloaded_file_path}.") + except Exception as cleanup_error: + print(f"Error during cleanup: {cleanup_error}") + +from utils.azure_blob_utils import upload_to_azure + +def test_upload_to_raw(): + """ + Test uploading a file to the 'raw' folder in Azure Blob Storage. 
+ """ + # File details + folder_name = "raw" + file_name = "Test_Minutes.pdf" + container_name = os.getenv("AZURE_STORAGE_CONTAINER") + + # Ensure container name is loaded from environment + assert container_name, "AZURE_STORAGE_CONTAINER is not set in the .env file." + + # Read the local file + local_file_path = "Test_Minutes.pdf" # Replace with your actual test file path + assert os.path.exists(local_file_path), f"Local file {local_file_path} does not exist!" + with open(local_file_path, "rb") as f: + file_content = f.read() + + # Upload to Azure + print(f"Uploading {local_file_path} to {folder_name}/{file_name} in Azure Blob Storage...") + try: + upload_to_azure(folder_name, file_name, file_content) + print(f"Upload successful: {folder_name}/{file_name}") + except Exception as e: + assert False, f"Upload failed with error: {e}" + + +def test_upload_to_dirty(): + """ + Test uploading a file to the 'dirty' folder in Azure Blob Storage. + """ + # File details + folder_name = "dirty" + file_name = "Test_Minutes.pdf" + container_name = os.getenv("AZURE_STORAGE_CONTAINER") + + # Ensure container name is loaded from environment + assert container_name, "AZURE_STORAGE_CONTAINER is not set in the .env file." + + # Read the local file + local_file_path = "Test_Minutes.pdf" # Replace with your actual test file path + assert os.path.exists(local_file_path), f"Local file {local_file_path} does not exist!" + with open(local_file_path, "rb") as f: + file_content = f.read() + + # Upload to Azure + print(f"Uploading {local_file_path} to {folder_name}/{file_name} in Azure Blob Storage...") + try: + upload_to_azure(folder_name, file_name, file_content) + print(f"Upload successful: {folder_name}/{file_name}") + except Exception as e: + assert False, f"Upload failed with error: {e}" + + +def test_upload_to_clean(): + """ + Test uploading a file to the 'clean' folder in Azure Blob Storage. 
+ """ + # File details + folder_name = "clean" + file_name = "Test_Minutes.pdf" + container_name = os.getenv("AZURE_STORAGE_CONTAINER") + + # Ensure container name is loaded from environment + assert container_name, "AZURE_STORAGE_CONTAINER is not set in the .env file." + + # Read the local file + local_file_path = "Test_Minutes.pdf" # Replace with your actual test file path + assert os.path.exists(local_file_path), f"Local file {local_file_path} does not exist!" + with open(local_file_path, "rb") as f: + file_content = f.read() + + # Upload to Azure + print(f"Uploading {local_file_path} to {folder_name}/{file_name} in Azure Blob Storage...") + try: + upload_to_azure(folder_name, file_name, file_content) + print(f"Upload successful: {folder_name}/{file_name}") + except Exception as e: + assert False, f"Upload failed with error: {e}" + + +from preprocessing_pipeline.pdf_conversion import convert_pdf_to_text + +def test_pdf_conversion(): + """ + Test the PDF to text conversion function. + """ + # Define the test file + test_pdf_path = "Test_Minutes.pdf" # Replace with your test PDF file path + + # Ensure the test file exists locally + assert os.path.exists(test_pdf_path), f"Test PDF file {test_pdf_path} does not exist!" + + # Attempt to convert the PDF to text + try: + print(f"Converting {test_pdf_path} to text...") + extracted_text = convert_pdf_to_text(test_pdf_path) + + # Assertions to verify the conversion worked + assert isinstance(extracted_text, str), "Extracted text is not a string!" + assert len(extracted_text) > 0, "Extracted text is empty!" + print(f"PDF conversion successful. Extracted text length: {len(extracted_text)} characters.") + except Exception as e: + assert False, f"PDF conversion failed with error: {e}"