diff --git a/Preprocessing/.env.example b/Preprocessing/.env.example index 39a39ba5..85a4136e 100644 --- a/Preprocessing/.env.example +++ b/Preprocessing/.env.example @@ -11,6 +11,7 @@ ASSEMBLY_AI_KEY = #Azure Storage Container Connection AZURE_STORAGE_CONNECTION_STRING= +AZURE_STORAGE_CONTAINER = # Pathing Setup PYTHONPATH= \ No newline at end of file diff --git a/Preprocessing/App/main.py b/Preprocessing/App/main.py index e6546dfa..d6f339f8 100644 --- a/Preprocessing/App/main.py +++ b/Preprocessing/App/main.py @@ -18,7 +18,11 @@ from preprocessing_pipeline.pdf_conversion import convert_pdf_to_text from preprocessing_pipeline.audio_transcription import transcribe_audio from preprocessing_pipeline.text_cleaning import clean_text -from preprocessing_pipeline.chunking_vector_embedding import tokenize_and_embed_text +from preprocessing_pipeline.chunking_vector_embedding import ( + tokenize_and_embed_text, + fetch_matching_chunks, + delete_matching_chunks +) from utils.azure_blob_utils import ( upload_to_azure, download_from_azure, @@ -209,11 +213,21 @@ def upload_files_page(): upload_to_azure("clean", clean_file_name, cleaned_text) st.write(f"Uploaded cleaned text to `clean/` folder: {clean_file_name}") - # Display cleaned text - st.text_area("Cleaned Text:", cleaned_text, height=200) - st.download_button("Download Cleaned Text", data=cleaned_text, file_name=clean_file_name) + # Stage 4: Check and Delete Existing Embeddings + with st.spinner("Checking for existing embeddings..."): + matching_chunks = fetch_matching_chunks( + str(metadata["meeting_date"]), + metadata["meeting_type"], + metadata["file_type"], + clean_file_name + ) + if matching_chunks: + st.write(f"Found {len(matching_chunks)} existing chunks. Deleting...") + delete_matching_chunks(matching_chunks) + else: + st.write("No existing chunks found.") - # Stage 4: Chunk and Embed into Weaviate + # Stage 5: Chunk and Embed into Weaviate with st.spinner("Chunking and embedding text into Weaviate..."): tokenize_and_embed_text(clean_file_name, metadata) st.success("Document processed and embedded successfully!") @@ -242,33 +256,23 @@ def group_blobs_by_date(blobs): grouped = {} for blob in blobs: try: - # Extract the file name without folder prefix (e.g., "raw/") - file_name = blob.split("/")[-1] # Get only the file name part - - # Extract the date from the file name (assuming format: YYYY_MM_DD) - parts = file_name.split("_") # Split into ['2023', '12', '12', 'BOC', 'Agenda', ...] + file_name = blob.split("/")[-1] # Extract the file name + parts = file_name.split("_") # Split into parts: ['2023', '12', '12', 'BOC', 'Agenda', ...] 
date_str = "_".join(parts[:3]) # Join the first three parts: '2023_12_12' - - # Convert the date string to a readable format readable_date = datetime.strptime(date_str, "%Y_%m_%d").strftime("%B %d, %Y") - - # Group by the readable date if readable_date not in grouped: grouped[readable_date] = [] grouped[readable_date].append(blob) except (ValueError, IndexError): - # Handle files with unexpected formats if "Unknown Date" not in grouped: grouped["Unknown Date"] = [] grouped["Unknown Date"].append(blob) return grouped - # Group blobs by date raw_grouped = group_blobs_by_date(raw_blobs) dirty_grouped = group_blobs_by_date(dirty_blobs) clean_grouped = group_blobs_by_date(clean_blobs) - # Function to display blobs within a group def display_grouped_blobs(grouped_blobs, category): if grouped_blobs: st.subheader(f"{category.capitalize()} Documents") @@ -282,7 +286,6 @@ def display_grouped_blobs(grouped_blobs, category): else: st.info(f"No documents found in the {category} category.") - # Display grouped blobs display_grouped_blobs(raw_grouped, "raw") display_grouped_blobs(dirty_grouped, "dirty") display_grouped_blobs(clean_grouped, "clean") @@ -308,4 +311,4 @@ def display_grouped_blobs(grouped_blobs, category): elif st.session_state.page == "upload": upload_files_page() elif st.session_state.page == "view": - view_documents_page() \ No newline at end of file + view_documents_page() diff --git a/Preprocessing/preprocessing_pipeline/chunking_vector_embedding.py b/Preprocessing/preprocessing_pipeline/chunking_vector_embedding.py index 6e88de4c..380d96cd 100644 --- a/Preprocessing/preprocessing_pipeline/chunking_vector_embedding.py +++ b/Preprocessing/preprocessing_pipeline/chunking_vector_embedding.py @@ -23,6 +23,56 @@ # Initialize tiktoken for OpenAI's embedding model tokenizer = tiktoken.encoding_for_model("text-embedding-ada-002") + +def fetch_matching_chunks(meeting_date, meeting_type, file_type, source_document): + """ + Fetch matching chunks from Weaviate based on metadata. + + Args: + meeting_date (str): Date of the meeting. + meeting_type (str): Type of the meeting (e.g., "Board of Commissioners"). + file_type (str): File type (e.g., "Minutes"). + source_document (str): Name of the source document. + + Returns: + list: A list of matching documents. + """ + query = f""" + {{ + Get {{ + MeetingDocument(where: {{ + operator: And, + operands: [ + {{ path: ["meeting_date"], operator: Equal, valueString: "{meeting_date}" }}, + {{ path: ["meeting_type"], operator: Equal, valueString: "{meeting_type}" }}, + {{ path: ["file_type"], operator: Equal, valueString: "{file_type}" }}, + {{ path: ["source_document"], operator: Equal, valueString: "{source_document}" }} + ] + }}) {{ + _additional {{ + id + }} + }} + }} + }} + """ + response = client.query.raw(query) + return response.get("data", {}).get("Get", {}).get("MeetingDocument", []) + + +def delete_matching_chunks(documents): + """ + Delete matching chunks from Weaviate. + + Args: + documents (list): List of documents with IDs to delete. + """ + for doc in documents: + doc_id = doc["_additional"]["id"] + client.data_object.delete(doc_id) + print(f"Deleted chunk ID: {doc_id}") + + def tokenize_and_embed_text(clean_file_name, metadata, max_chunk_size=250): """ Tokenizes, chunks, and embeds cleaned text into Weaviate. @@ -33,68 +83,46 @@ def tokenize_and_embed_text(clean_file_name, metadata, max_chunk_size=250): max_chunk_size (int): Maximum token size for each chunk. 
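+
+    Notes:
+        Before embedding, any chunks already stored for the same meeting_date,
+        meeting_type, file_type, and source_document are fetched and deleted,
+        so re-processing a cleaned file replaces its embeddings rather than
+        duplicating them. Chunk size is measured in tokens of the
+        text-embedding-ada-002 tokenizer, not in characters.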
""" try: - # Step 1: Download cleaned text from Azure + # Download cleaned text from Azure clean_text = download_from_azure("clean", clean_file_name) - print(f"Downloaded cleaned text from Azure for file: {clean_file_name}") - - # Step 2: Tokenize the text using tiktoken tokens = tokenizer.encode(clean_text) - - # Step 3: Chunk tokens into groups of max_chunk_size (default: 250 tokens per chunk) chunks = [ tokenizer.decode(tokens[i:i + max_chunk_size]) for i in range(0, len(tokens), max_chunk_size) ] - print(f"Tokenized and split text into {len(chunks)} chunks of {max_chunk_size} tokens each.") - # Extract metadata for embedding + # Metadata fields meeting_date = str(metadata["meeting_date"]) meeting_type = metadata["meeting_type"] file_type = metadata["file_type"] + source_document = clean_file_name - # Step 4: Check and delete existing embeddings in Weaviate (to prevent duplication) - query = f""" - {{ - Get {{ - MeetingDocument(where: {{ - path: ["meeting_date", "meeting_type", "file_type"], - operator: And, - valueString: "{meeting_date}" - }}) {{ - id - }} - }} - }} - """ - response = client.query.raw(query) - existing_documents = response.get("data", {}).get("Get", {}).get("MeetingDocument", []) - - for doc in existing_documents: - client.data_object.delete(doc["id"]) - print(f"Deleted {len(existing_documents)} existing embeddings for this file.") + # Check for existing embeddings + matching_chunks = fetch_matching_chunks(meeting_date, meeting_type, file_type, source_document) + if matching_chunks: + print(f"Found {len(matching_chunks)} existing chunks. Deleting...") + delete_matching_chunks(matching_chunks) + else: + print("No existing chunks found.") - # Step 5: Embed each chunk using OpenAI and store in Weaviate + # Embed and upload each chunk for i, chunk in enumerate(chunks): - # Generate embedding using OpenAI - response = openai_client.embeddings.create( - input=chunk, - model="text-embedding-ada-002" - ) - embedding = response.data[0].embedding # Correctly access embedding from the response object + response = openai_client.embeddings.create(input=chunk, model="text-embedding-ada-002") + embedding = response.data[0].embedding - # Upload chunk to Weaviate client.data_object.create( data_object={ "content": chunk, "meeting_date": meeting_date, "meeting_type": meeting_type, "file_type": file_type, - "chunk_index": i # Include chunk index for ordering + "chunk_index": i, + "source_document": source_document }, vector=embedding, class_name="MeetingDocument" ) - print(f"Uploaded chunk {i+1}/{len(chunks)} to Weaviate.") + print(f"Uploaded chunk {i + 1}/{len(chunks)} to Weaviate.") print("Successfully processed and embedded all chunks.") diff --git a/Preprocessing/tests/Weaviate_Metadata_List_2000-10-27.docx b/Preprocessing/tests/Weaviate_Metadata_List_2000-10-27.docx new file mode 100644 index 00000000..8d72bb6f Binary files /dev/null and b/Preprocessing/tests/Weaviate_Metadata_List_2000-10-27.docx differ diff --git a/Preprocessing/tests/metadata_by_date.py b/Preprocessing/tests/metadata_by_date.py new file mode 100644 index 00000000..ac0f2dab --- /dev/null +++ b/Preprocessing/tests/metadata_by_date.py @@ -0,0 +1,87 @@ +# This allows you to find all the chunks by a specific meeting date. 
+ +import os +import weaviate +from dotenv import load_dotenv +from docx import Document + +# Load environment variables from .env +load_dotenv() + +# Initialize Weaviate client +WEAVIATE_URL = os.getenv("WEAVIATE_URL") +WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY") +client = weaviate.Client( + url=WEAVIATE_URL, + auth_client_secret=weaviate.AuthApiKey(api_key=WEAVIATE_API_KEY) +) + +def fetch_documents_by_date_and_export_to_word(date): + """ + Fetch documents from Weaviate filtered by a specific date and export metadata, including source_document, to a Word document. + + Args: + date (str): The date to filter by (YYYY-MM-DD format). + """ + query = f""" + {{ + Get {{ + MeetingDocument(where: {{ + path: ["meeting_date"], + operator: Equal, + valueString: "{date}" + }}) {{ + content + meeting_date + meeting_type + file_type + chunk_index + source_document + }} + }} + }} + """ + try: + print(f"Querying Weaviate for documents on {date}...") + response = client.query.raw(query) + documents = response.get("data", {}).get("Get", {}).get("MeetingDocument", []) + + if not documents: + print(f"No documents found for the date: {date}.") + return + + print(f"\nRetrieved Documents for {date}:") + for doc in documents: + print(f"- Chunk Index: {doc.get('chunk_index', 'N/A')}") + print(f" Meeting Date: {doc.get('meeting_date', 'N/A')}") + print(f" Meeting Type: {doc.get('meeting_type', 'N/A')}") + print(f" File Type: {doc.get('file_type', 'N/A')}") + print(f" Source Document: {doc.get('source_document', 'N/A')}") + print(f" Content Preview: {doc.get('content', 'N/A')[:100]}...") + print() + + # Export metadata to Word + print(f"Exporting metadata for {date} to Word document...") + doc = Document() + doc.add_heading(f'Document Metadata for {date}', level=1) + + for doc_data in documents: + doc.add_heading(f"Chunk Index: {doc_data.get('chunk_index', 'N/A')}", level=2) + doc.add_paragraph(f"Meeting Date: {doc_data.get('meeting_date', 'N/A')}") + doc.add_paragraph(f"Meeting Type: {doc_data.get('meeting_type', 'N/A')}") + doc.add_paragraph(f"File Type: {doc_data.get('file_type', 'N/A')}") + doc.add_paragraph(f"Source Document: {doc_data.get('source_document', 'N/A')}") + doc.add_paragraph(f"Content Preview: {doc_data.get('content', 'N/A')}") + doc.add_paragraph("\n") + + word_file_path = f"Weaviate_Metadata_List_{date}.docx" + doc.save(word_file_path) + print(f"Metadata exported to {word_file_path} successfully.") + + except Exception as e: + print(f"Error querying Weaviate: {e}") + +if __name__ == "__main__": + # Filter by specific date (YYYY-MM-DD format) + specific_date = "2000-10-27" + fetch_documents_by_date_and_export_to_word(specific_date) \ No newline at end of file diff --git a/Preprocessing/tests/metadata_deletion_test.py b/Preprocessing/tests/metadata_deletion_test.py new file mode 100644 index 00000000..a1281834 --- /dev/null +++ b/Preprocessing/tests/metadata_deletion_test.py @@ -0,0 +1,125 @@ +# This allows you to test out deleted specific meeting date, meeting types, file type chunks. + +import os +import weaviate +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() +WEAVIATE_URL = os.getenv("WEAVIATE_URL") +WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY") + +# Initialize Weaviate client +client = weaviate.Client( + url=WEAVIATE_URL, + auth_client_secret=weaviate.AuthApiKey(api_key=WEAVIATE_API_KEY) +) + +def fetch_documents(date, meeting_type, file_type): + """ + Fetch documents from Weaviate based on specific criteria. 
+ + Args: + date (str): The date to filter by (YYYY-MM-DD format). + meeting_type (str): The meeting type to filter by (e.g., "Board of Commissioners"). + file_type (str): The file type to filter by (e.g., "Minutes"). + + Returns: + list: A list of matching documents. + """ + query = f""" + {{ + Get {{ + MeetingDocument(where: {{ + operator: And, + operands: [ + {{ + path: ["meeting_date"], + operator: Equal, + valueString: "{date}" + }}, + {{ + path: ["meeting_type"], + operator: Equal, + valueString: "{meeting_type}" + }}, + {{ + path: ["file_type"], + operator: Equal, + valueString: "{file_type}" + }} + ] + }}) {{ + _additional {{ + id + }} + meeting_date + meeting_type + file_type + chunk_index + content + }} + }} + }} + """ + response = client.query.raw(query) + documents = response.get("data", {}).get("Get", {}).get("MeetingDocument", []) + return documents + +def delete_documents(documents): + """ + Delete all documents in the provided list from Weaviate. + + Args: + documents (list): A list of documents with `_additional.id` to delete. + """ + for doc in documents: + doc_id = doc.get("_additional", {}).get("id") + if doc_id: + client.data_object.delete(doc_id) + print(f"Deleted document ID: {doc_id}") + else: + print("Document ID not found; skipping deletion.") + +if __name__ == "__main__": + # Specify the criteria for deletion + specific_date = "2000-10-27" + specific_meeting_type = "Board of Commissioners" + specific_file_type = "Minutes" + + # Step 1: Fetch documents + print(f"Fetching documents for {specific_date}, {specific_meeting_type}, {specific_file_type}...") + matching_documents = fetch_documents(specific_date, specific_meeting_type, specific_file_type) + if matching_documents: + print(f"\nFound {len(matching_documents)} matching documents:") + for doc in matching_documents: + print(f"- ID: {doc.get('_additional', {}).get('id')}") + print(f" Chunk Index: {doc.get('chunk_index', 'N/A')}") + print(f" Meeting Date: {doc.get('meeting_date', 'N/A')}") + print(f" Meeting Type: {doc.get('meeting_type', 'N/A')}") + print(f" File Type: {doc.get('file_type', 'N/A')}") + print(f" Content Preview: {doc.get('content', 'N/A')[:100]}...") + print() + else: + print("No matching documents found.") + + # Step 2: Delete documents + if matching_documents: + print("Deleting matching documents...") + delete_documents(matching_documents) + + # Step 3: Confirm deletion by re-fetching + print(f"Fetching documents again for {specific_date}, {specific_meeting_type}, {specific_file_type}...") + remaining_documents = fetch_documents(specific_date, specific_meeting_type, specific_file_type) + if remaining_documents: + print(f"\nFound {len(remaining_documents)} remaining documents (deletion failed for some):") + for doc in remaining_documents: + print(f"- ID: {doc.get('_additional', {}).get('id')}") + print(f" Chunk Index: {doc.get('chunk_index', 'N/A')}") + print(f" Meeting Date: {doc.get('meeting_date', 'N/A')}") + print(f" Meeting Type: {doc.get('meeting_type', 'N/A')}") + print(f" File Type: {doc.get('file_type', 'N/A')}") + print(f" Content Preview: {doc.get('content', 'N/A')[:100]}...") + print() + else: + print("All matching documents have been successfully deleted.") diff --git a/Preprocessing/tests/pytest.py b/Preprocessing/tests/pytest.py new file mode 100644 index 00000000..08b70282 --- /dev/null +++ b/Preprocessing/tests/pytest.py @@ -0,0 +1,255 @@ +#This is a few different pytests! 
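+# Note: pytest only collects files matching test_*.py or *_test.py by default, and a
+# module named pytest.py may collide with the installed pytest package on import, so
+# these tests are easiest to run after renaming the file (e.g. to test_preprocessing.py):
+#   pytest Preprocessing/tests/ -v
+# Several tests also expect Test_Minutes.pdf in the working directory and the
+# 2023_08_01_BOC_Agenda_* files to already exist in the Azure container.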
+ +import os +import sys +import importlib +from dotenv import load_dotenv + +# Load environment variables +load_dotenv() + +# Add the parent directory (Preprocessing) to the Python module search path +current_dir = os.path.dirname(os.path.abspath(__file__)) +parent_dir = os.path.abspath(os.path.join(current_dir, "..")) +if parent_dir not in sys.path: + sys.path.append(parent_dir) + +def test_dependencies_installed(): + dependencies = [ + "streamlit", "requests", "azure.storage.blob", "openai", "weaviate", + "fitz", "assemblyai", "transformers", "chardet", "pytest", "easyocr", "os", "sys", "importlib" + ] + for lib in dependencies: + assert importlib.util.find_spec(lib) is not None, f"{lib} is not installed!" + +def test_env_variables(): + required_vars = [ + "OPENAI_API_KEY", "OPENAI_BASE_URL", "WEAVIATE_URL", "WEAVIATE_API_KEY", + "ASSEMBLY_AI_KEY", "AZURE_STORAGE_CONNECTION_STRING", "AZURE_STORAGE_CONTAINER", "PYTHONPATH" + ] + + # Debugging + for var in required_vars: + print(f"{var}: {os.getenv(var)}") + + missing_or_empty_vars = [ + var for var in required_vars if not os.getenv(var) or os.getenv(var).strip() == "" + ] + assert not missing_or_empty_vars, f"Missing or empty variables: {', '.join(missing_or_empty_vars)}" + +from utils.azure_blob_utils import download_from_azure +def test_download_from_azure(): + """ + Test downloading a specific file from Azure Blob Storage. + """ + # File details + folder_name = "raw" + file_name = "2023_08_01_BOC_Agenda_Raw.pdf" + downloaded_file_path = "downloaded_2023_08_01_BOC_Agenda_Raw.pdf" + container_name = os.getenv("AZURE_STORAGE_CONTAINER") + + # Ensure container name is loaded from environment + assert container_name, "AZURE_STORAGE_CONTAINER is not set in the .env file." + + print(f"Attempting to download {folder_name}/{file_name} from Azure Blob Storage...") + + # Download the file + try: + content = download_from_azure(folder_name, file_name, as_text=False) + # Save the downloaded content locally + with open(downloaded_file_path, "wb") as file: + file.write(content) + + # Check if the file exists locally + assert os.path.exists(downloaded_file_path), f"Downloaded file {downloaded_file_path} does not exist!" + print(f"Download successful. File saved to {downloaded_file_path}.") + except Exception as e: + assert False, f"Download failed with error: {e}" + + # Cleanup: Remove the downloaded file after the test + try: + if os.path.exists(downloaded_file_path): + os.remove(downloaded_file_path) + print(f"Cleaned up: {downloaded_file_path}.") + except Exception as cleanup_error: + print(f"Error during cleanup: {cleanup_error}") + +def test_download_from_dirty(): + """ + Test downloading a specific file from the 'dirty' folder in Azure Blob Storage. + """ + # File details + folder_name = "dirty" + file_name = "2023_08_01_BOC_Agenda_TextExtraction.txt" + downloaded_file_path = "downloaded_2023_08_01_BOC_Agenda_TextExtraction.txt" + container_name = os.getenv("AZURE_STORAGE_CONTAINER") + + # Ensure container name is loaded from environment + assert container_name, "AZURE_STORAGE_CONTAINER is not set in the .env file." 
+ + print(f"Attempting to download {folder_name}/{file_name} from Azure Blob Storage...") + + # Download the file + try: + content = download_from_azure(folder_name, file_name, as_text=True) + # Save the downloaded content locally + with open(downloaded_file_path, "w", encoding="utf-8") as file: + file.write(content) + + # Check if the file exists locally + assert os.path.exists(downloaded_file_path), f"Downloaded file {downloaded_file_path} does not exist!" + print(f"Download successful. File saved to {downloaded_file_path}.") + except Exception as e: + assert False, f"Download failed with error: {e}" + + # Cleanup: Remove the downloaded file after the test + try: + if os.path.exists(downloaded_file_path): + os.remove(downloaded_file_path) + print(f"Cleaned up: {downloaded_file_path}.") + except Exception as cleanup_error: + print(f"Error during cleanup: {cleanup_error}") + + +def test_download_from_clean(): + """ + Test downloading a specific file from the 'clean' folder in Azure Blob Storage. + """ + # File details + folder_name = "clean" + file_name = "2023_08_01_BOC_Agenda_Cleaned.txt" + downloaded_file_path = "downloaded_2023_08_01_BOC_Agenda_Cleaned.txt" + container_name = os.getenv("AZURE_STORAGE_CONTAINER") + + # Ensure container name is loaded from environment + assert container_name, "AZURE_STORAGE_CONTAINER is not set in the .env file." + + print(f"Attempting to download {folder_name}/{file_name} from Azure Blob Storage...") + + # Download the file + try: + content = download_from_azure(folder_name, file_name, as_text=True) + # Save the downloaded content locally + with open(downloaded_file_path, "w", encoding="utf-8") as file: + file.write(content) + + # Check if the file exists locally + assert os.path.exists(downloaded_file_path), f"Downloaded file {downloaded_file_path} does not exist!" + print(f"Download successful. File saved to {downloaded_file_path}.") + except Exception as e: + assert False, f"Download failed with error: {e}" + + # Cleanup: Remove the downloaded file after the test + try: + if os.path.exists(downloaded_file_path): + os.remove(downloaded_file_path) + print(f"Cleaned up: {downloaded_file_path}.") + except Exception as cleanup_error: + print(f"Error during cleanup: {cleanup_error}") + +from utils.azure_blob_utils import upload_to_azure + +def test_upload_to_raw(): + """ + Test uploading a file to the 'raw' folder in Azure Blob Storage. + """ + # File details + folder_name = "raw" + file_name = "Test_Minutes.pdf" + container_name = os.getenv("AZURE_STORAGE_CONTAINER") + + # Ensure container name is loaded from environment + assert container_name, "AZURE_STORAGE_CONTAINER is not set in the .env file." + + # Read the local file + local_file_path = "Test_Minutes.pdf" # Replace with your actual test file path + assert os.path.exists(local_file_path), f"Local file {local_file_path} does not exist!" + with open(local_file_path, "rb") as f: + file_content = f.read() + + # Upload to Azure + print(f"Uploading {local_file_path} to {folder_name}/{file_name} in Azure Blob Storage...") + try: + upload_to_azure(folder_name, file_name, file_content) + print(f"Upload successful: {folder_name}/{file_name}") + except Exception as e: + assert False, f"Upload failed with error: {e}" + + +def test_upload_to_dirty(): + """ + Test uploading a file to the 'dirty' folder in Azure Blob Storage. 
+ """ + # File details + folder_name = "dirty" + file_name = "Test_Minutes.pdf" + container_name = os.getenv("AZURE_STORAGE_CONTAINER") + + # Ensure container name is loaded from environment + assert container_name, "AZURE_STORAGE_CONTAINER is not set in the .env file." + + # Read the local file + local_file_path = "Test_Minutes.pdf" # Replace with your actual test file path + assert os.path.exists(local_file_path), f"Local file {local_file_path} does not exist!" + with open(local_file_path, "rb") as f: + file_content = f.read() + + # Upload to Azure + print(f"Uploading {local_file_path} to {folder_name}/{file_name} in Azure Blob Storage...") + try: + upload_to_azure(folder_name, file_name, file_content) + print(f"Upload successful: {folder_name}/{file_name}") + except Exception as e: + assert False, f"Upload failed with error: {e}" + + +def test_upload_to_clean(): + """ + Test uploading a file to the 'clean' folder in Azure Blob Storage. + """ + # File details + folder_name = "clean" + file_name = "Test_Minutes.pdf" + container_name = os.getenv("AZURE_STORAGE_CONTAINER") + + # Ensure container name is loaded from environment + assert container_name, "AZURE_STORAGE_CONTAINER is not set in the .env file." + + # Read the local file + local_file_path = "Test_Minutes.pdf" # Replace with your actual test file path + assert os.path.exists(local_file_path), f"Local file {local_file_path} does not exist!" + with open(local_file_path, "rb") as f: + file_content = f.read() + + # Upload to Azure + print(f"Uploading {local_file_path} to {folder_name}/{file_name} in Azure Blob Storage...") + try: + upload_to_azure(folder_name, file_name, file_content) + print(f"Upload successful: {folder_name}/{file_name}") + except Exception as e: + assert False, f"Upload failed with error: {e}" + + +from preprocessing_pipeline.pdf_conversion import convert_pdf_to_text + +def test_pdf_conversion(): + """ + Test the PDF to text conversion function. + """ + # Define the test file + test_pdf_path = "Test_Minutes.pdf" # Replace with your test PDF file path + + # Ensure the test file exists locally + assert os.path.exists(test_pdf_path), f"Test PDF file {test_pdf_path} does not exist!" + + # Attempt to convert the PDF to text + try: + print(f"Converting {test_pdf_path} to text...") + extracted_text = convert_pdf_to_text(test_pdf_path) + + # Assertions to verify the conversion worked + assert isinstance(extracted_text, str), "Extracted text is not a string!" + assert len(extracted_text) > 0, "Extracted text is empty!" + print(f"PDF conversion successful. Extracted text length: {len(extracted_text)} characters.") + except Exception as e: + assert False, f"PDF conversion failed with error: {e}" \ No newline at end of file