Streamlit Hosting + Removal of Junk Files #62

Merged: 1 commit, Dec 10, 2024
21 changes: 20 additions & 1 deletion Preprocessing/.streamlit/config.toml
@@ -1,2 +1,21 @@
[server]
maxUploadSize = 1000 # Set the upload size limit in MB
headless = true
port = 8501 # Default port for local testing
enableCORS = false
enableXsrfProtection = false
maxUploadSize = 1000


[theme]
base = "light"
primaryColor = "#0D6051"
secondaryBackgroundColor = "#f0f2e9"
textColor = "#263d36"
font = "IBM Plex Mono"

[global]
pageTitle = "Minute Mate"
favicon = "assets/favicon.ico"

[home]
welcomeMessage = "Welcome to Minute Mate: Your Meeting Transcription and Summarization Tool!"
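
Not part of the diff: the `[server]` and `[theme]` sections above are standard Streamlit configuration, but `pageTitle`, `favicon`, and the whole `[home]` section are not keys Streamlit reads on its own, so the app presumably loads them itself. A minimal sketch of how that lookup might work, assuming the `toml` package and these exact key names; the loading code is illustrative, not taken from main.py:

```python
import toml
import streamlit as st

# Read the custom sections from the same config file Streamlit uses.
config = toml.load("Preprocessing/.streamlit/config.toml")

st.set_page_config(
    page_title=config["global"]["pageTitle"],  # "Minute Mate"
    page_icon=config["global"]["favicon"],     # "assets/favicon.ico"
)
st.write(config["home"]["welcomeMessage"])
```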
489 changes: 337 additions & 152 deletions Preprocessing/App/main.py

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion Preprocessing/docker/requirements.txt
@@ -23,4 +23,5 @@ azure.storage.blob
transformers
chardet
pytest
easyocr
easyocr
tiktoken
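
Aside (not in this commit): `tiktoken`, newly added here, backs the token-based chunking in `chunking_vector_embedding.py` and `text_cleaning.py` below. A small sketch of the pattern both files rely on; the helper name is illustrative:

```python
import tiktoken

tokenizer = tiktoken.encoding_for_model("text-embedding-ada-002")

def split_into_chunks(text: str, max_chunk_size: int = 250) -> list[str]:
    """Split text into chunks of at most max_chunk_size tokens."""
    tokens = tokenizer.encode(text)
    return [
        tokenizer.decode(tokens[i:i + max_chunk_size])
        for i in range(0, len(tokens), max_chunk_size)
    ]
```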
20 changes: 14 additions & 6 deletions Preprocessing/preprocessing_pipeline/audio_transcription.py
@@ -1,11 +1,15 @@
import os
import requests
import streamlit as st
from utils.azure_blob_utils import download_from_azure
from utils.env_setup import load_env

# Load environment variables
load_env()
ASSEMBLY_AI_KEY = os.getenv("ASSEMBLY_AI_KEY")
# Dynamically fetch AssemblyAI API key from Streamlit session state
def get_assembly_ai_key():
    api_keys = st.session_state.get("api_keys", {})
    assembly_ai_key = api_keys.get("ASSEMBLY_AI_KEY")
    if not assembly_ai_key:
        raise ValueError("AssemblyAI API key is missing. Please configure it in the Streamlit app.")
    return assembly_ai_key

ASSEMBLY_AI_ENDPOINT = "https://api.assemblyai.com/v2"

def transcribe_audio(raw_file_name, model=None, speaker_labels=False):
@@ -20,8 +24,11 @@ def transcribe_audio(raw_file_name, model=None, speaker_labels=False):
    Returns:
    - str: Transcribed text, or None if transcription fails.
    """
    headers = {"authorization": ASSEMBLY_AI_KEY}
    try:
        # Fetch the AssemblyAI key dynamically
        assembly_ai_key = get_assembly_ai_key()
        headers = {"authorization": assembly_ai_key}

        # Step 1: Download the raw audio file from Azure
        raw_content = download_from_azure("raw", raw_file_name, as_text=False)
        print(f"Downloaded {raw_file_name} from Azure for transcription.")
@@ -82,3 +89,4 @@ def transcribe_audio(raw_file_name, model=None, speaker_labels=False):
    except Exception as e:
        print(f"Error during transcription: {e}")
        return None
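
Note, not part of the diff: `get_assembly_ai_key()` assumes the Streamlit app has already stored the key in `st.session_state["api_keys"]`. A hedged sketch of how a home page might populate it; the widget label and flow are assumptions, and main.py may differ:

```python
import streamlit as st

# Hypothetical home-page snippet: collect the key once and stash it where
# get_assembly_ai_key() expects to find it.
if "api_keys" not in st.session_state:
    st.session_state["api_keys"] = {}

assembly_key = st.text_input("AssemblyAI API key", type="password")
if assembly_key:
    st.session_state["api_keys"]["ASSEMBLY_AI_KEY"] = assembly_key
```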

55 changes: 38 additions & 17 deletions Preprocessing/preprocessing_pipeline/chunking_vector_embedding.py
@@ -1,29 +1,35 @@
import os
from openai import OpenAI
import streamlit as st
import requests
import weaviate
import tiktoken # Use tiktoken for OpenAI-compatible tokenization
from utils.env_setup import load_env
from utils.azure_blob_utils import download_from_azure

# Load environment variables
load_env()
WEAVIATE_URL = os.getenv("WEAVIATE_URL")
WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Dynamic API Key Retrieval
def get_weaviate_client():
    api_keys = st.session_state.get("api_keys", {})
    weaviate_url = api_keys.get("WEAVIATE_URL")
    weaviate_api_key = api_keys.get("WEAVIATE_API_KEY")

# Initialize Weaviate client
client = weaviate.Client(
    url=WEAVIATE_URL,
    auth_client_secret=weaviate.AuthApiKey(api_key=WEAVIATE_API_KEY)
    if not weaviate_url or not weaviate_api_key:
        raise ValueError("Weaviate API configuration is missing. Please configure it in the Streamlit app.")

    return weaviate.Client(
        url=weaviate_url,
        auth_client_secret=weaviate.AuthApiKey(api_key=weaviate_api_key)
    )

# Initialize OpenAI client for embedding
openai_client = OpenAI(api_key=OPENAI_API_KEY)
def get_openai_api_key():
    api_keys = st.session_state.get("api_keys", {})
    openai_api_key = api_keys.get("OPENAI_API_KEY")

    if not openai_api_key:
        raise ValueError("OpenAI API key is missing. Please configure it in the Streamlit app.")

    return openai_api_key

# Initialize tiktoken for OpenAI's embedding model
tokenizer = tiktoken.encoding_for_model("text-embedding-ada-002")


def fetch_matching_chunks(meeting_date, meeting_type, file_type, source_document):
"""
Fetch matching chunks from Weaviate based on metadata.
Expand All @@ -37,6 +43,7 @@
Returns:
list: A list of matching documents.
"""
client = get_weaviate_client()
query = f"""
{{
Get {{
@@ -67,6 +74,7 @@
    Args:
        documents (list): List of documents with IDs to delete.
    """
    client = get_weaviate_client()
    for doc in documents:
        doc_id = doc["_additional"]["id"]
        client.data_object.delete(doc_id)
@@ -83,6 +91,10 @@
        max_chunk_size (int): Maximum token size for each chunk.
    """
    try:
        # Initialize clients dynamically
        client = get_weaviate_client()
        openai_api_key = get_openai_api_key()

        # Download cleaned text from Azure
        clean_text = download_from_azure("clean", clean_file_name)
        tokens = tokenizer.encode(clean_text)
@@ -107,8 +119,17 @@

        # Embed and upload each chunk
        for i, chunk in enumerate(chunks):
            response = openai_client.embeddings.create(input=chunk, model="text-embedding-ada-002")
            embedding = response.data[0].embedding
            # Request embedding from OpenAI
            headers = {"Authorization": f"Bearer {openai_api_key}"}
            response = requests.post(
                "https://api.openai.com/v1/embeddings",
[Check warning (Code scanning / Bandit) on lines +124 to +125: Call to requests without timeout]
                headers=headers,
                json={"input": chunk, "model": "text-embedding-ada-002"}
            )
            if response.status_code != 200:
                raise ValueError(f"OpenAI embedding error: {response.status_code} - {response.text}")

            embedding = response.json()["data"][0]["embedding"]

            client.data_object.create(
                data_object={
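
Not part of the diff: the Bandit finding above flags the embedding request for having no timeout, and this commit leaves it as is. One way to address it would be an explicit `timeout` on the call; the value below is an assumption:

```python
            response = requests.post(
                "https://api.openai.com/v1/embeddings",
                headers=headers,
                json={"input": chunk, "model": "text-embedding-ada-002"},
                timeout=30,  # seconds; clears Bandit's "request without timeout" warning
            )
```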
29 changes: 16 additions & 13 deletions Preprocessing/preprocessing_pipeline/pdf_conversion.py
@@ -3,8 +3,10 @@
from PIL import Image
from io import BytesIO
import numpy as np
import streamlit as st

[Check failure (GitHub Actions / ruff) on line 6 in Preprocessing/preprocessing_pipeline/pdf_conversion.py: Ruff F401, `streamlit` imported but unused]
from utils.azure_blob_utils import download_from_azure


def convert_pdf_to_text(raw_file_name):
"""
Extracts text from a PDF file. Uses EasyOCR as a fallback for scanned PDFs.
Expand All @@ -17,33 +19,34 @@
"""
try:
# Step 1: Download the raw file from Azure Blob Storage
print(f"Downloading {raw_file_name} from Azure Blob Storage (raw folder)...")
raw_content = download_from_azure("raw", raw_file_name, as_text=False)

# Step 2: Open the PDF content
# Step 2: Open the PDF content using PyMuPDF (fitz)
pdf_document = fitz.open(stream=raw_content, filetype="pdf")
text = ""
reader = easyocr.Reader(['en']) # Initialize EasyOCR for English
text = "" # Initialize a string to hold extracted text
reader = easyocr.Reader(['en'], gpu=False) # Initialize EasyOCR for English (disable GPU for portability)

for page_num in range(pdf_document.page_count):
page = pdf_document[page_num]

# Attempt to extract text directly
# Attempt to extract text directly from the page
page_text = page.get_text()
if page_text.strip(): # If direct text is available
print(f"Text extracted directly from page {page_num + 1}.")
if page_text.strip(): # If direct text extraction is successful
print(f"Direct text extracted from page {page_num + 1}.")
text += page_text
else: # Fallback to OCR for scanned pages
print(f"Applying OCR on page {page_num + 1} of {raw_file_name}.")
pix = page.get_pixmap(dpi=300) # Render page to an image
img = Image.open(BytesIO(pix.tobytes("png")))
print(f"Direct text extraction failed on page {page_num + 1}. Applying OCR.")
pix = page.get_pixmap(dpi=300) # Render the page as a high-resolution image
img = Image.open(BytesIO(pix.tobytes("png"))) # Convert rendered image to a PIL Image
img_array = np.array(img) # Convert PIL Image to NumPy array for EasyOCR
ocr_text = reader.readtext(img_array, detail=0) # Extract text with EasyOCR
text += "\n".join(ocr_text)
ocr_text = reader.readtext(img_array, detail=0) # Perform OCR with EasyOCR
text += "\n".join(ocr_text) # Append the OCR results to the text string

pdf_document.close()
pdf_document.close() # Close the PDF document
print(f"Successfully extracted text from {raw_file_name}.")
return text

except Exception as e:
print(f"Error in OCR for {raw_file_name}: {e}")
print(f"Error processing PDF {raw_file_name}: {e}")
return None
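
Aside, not part of the diff: the Ruff failure above (F401) is raised because `streamlit` is imported in pdf_conversion.py but never used. Assuming the import really is unneeded, the fix is simply to delete it; if it is kept deliberately, it can be marked for Ruff instead:

```python
import streamlit as st  # noqa: F401  (only if the import is intentionally kept)
```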
105 changes: 76 additions & 29 deletions Preprocessing/preprocessing_pipeline/text_cleaning.py
@@ -1,16 +1,28 @@
import os
import streamlit as st
import tiktoken # For OpenAI-compatible tokenization
from openai import OpenAI
import tiktoken # Use tiktoken for OpenAI-compatible tokenization
from utils.env_setup import load_env
from utils.azure_blob_utils import download_from_azure, upload_to_azure

# Load environment variables
load_env()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
from utils.azure_blob_utils import download_from_azure

# Initialize tiktoken for OpenAI's GPT models
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo") # Specify the OpenAI model


def get_openai_client():
"""
Retrieves the OpenAI client using the API key from Streamlit session state.

Returns:
OpenAI: OpenAI client object.
"""
api_keys = st.session_state.get("api_keys", {})
openai_api_key = api_keys.get("OPENAI_API_KEY")

if not openai_api_key:
raise ValueError("OpenAI API Key is missing. Please configure it on the Home Page.")

return OpenAI(api_key=openai_api_key)


def tokenize_and_split_text(text, max_chunk_size=250):
"""
Tokenizes and splits text into smaller chunks within the token size limit.
Expand All @@ -22,6 +34,10 @@ def tokenize_and_split_text(text, max_chunk_size=250):
Returns:
list of str: List of smaller text chunks.
"""
# Validate text input
if not text or text.strip() == "":
raise ValueError("Text input is empty or invalid.")

# Tokenize the text into tokens
tokens = tokenizer.encode(text)

@@ -32,12 +48,14 @@ def tokenize_and_split_text(text, max_chunk_size=250):
    ]
    return chunks

def clean_text_chunk(chunk):

def clean_text_chunk(chunk, openai_client):
    """
    Cleans a single chunk of text using OpenAI GPT.

    Args:
        chunk (str): Text chunk to clean.
        openai_client (OpenAI): OpenAI client instance.

    Returns:
        str: Cleaned text.
@@ -51,13 +69,18 @@ def clean_text_chunk(chunk):
        {"role": "user", "content": f"Clean the following text for readability: {chunk}"}
    ]

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_tokens=2000,
        temperature=0.5
    )
    return response.choices[0].message.content.strip()
    try:
        response = openai_client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=messages,
            max_tokens=2000,
            temperature=0.5
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error during chunk cleaning: {e}")
        return f"Error in chunk cleaning: {e}"


def clean_text(dirty_file_name):
"""
Expand All @@ -69,16 +92,40 @@ def clean_text(dirty_file_name):
Returns:
str: Combined cleaned text.
"""
print(f"Downloading {dirty_file_name} from Azure Blob Storage...")
dirty_content = download_from_azure("dirty", dirty_file_name)

# Tokenize and split the text into chunks of 250 tokens
chunks = tokenize_and_split_text(dirty_content, max_chunk_size=250)
cleaned_chunks = []

for i, chunk in enumerate(chunks):
print(f"Cleaning chunk {i + 1}/{len(chunks)}...")
cleaned_chunk = clean_text_chunk(chunk)
cleaned_chunks.append(cleaned_chunk)

return "\n\n".join(cleaned_chunks)
try:
print(f"Downloading {dirty_file_name} from Azure Blob Storage (dirty folder)...")
dirty_content = download_from_azure("dirty", dirty_file_name)

# Validate dirty content
if not dirty_content or dirty_content.strip() == "":
raise ValueError("The downloaded content is empty. Please check the file content.")

# Initialize OpenAI client dynamically
openai_client = get_openai_client()

# Tokenize and split the text into chunks
print("Tokenizing and splitting text into manageable chunks...")
chunks = tokenize_and_split_text(dirty_content, max_chunk_size=250)
cleaned_chunks = []

for i, chunk in enumerate(chunks):
print(f"Cleaning chunk {i + 1}/{len(chunks)}: {chunk[:100]}...")
try:
cleaned_chunk = clean_text_chunk(chunk, openai_client)
except Exception as e:
print(f"Error cleaning chunk {i + 1}: {e}")
cleaned_chunk = f"Error cleaning this chunk: {e}"

if not cleaned_chunk.strip():
print(f"Chunk {i + 1} returned empty after cleaning.")
raise ValueError(f"Chunk {i + 1} cleaning failed. Received empty content.")

cleaned_chunks.append(cleaned_chunk)

print(f"Successfully cleaned {len(chunks)} chunks.")
return "\n\n".join(cleaned_chunks)

except Exception as e:
print(f"Error during text cleaning: {e}")
return None
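
Not part of the diff: a rough end-to-end sketch of how these pipeline functions might be driven from the app once the session-state keys are set. File names are illustrative, the import paths assume the Preprocessing directory is on the Python path, and `upload_to_azure` (imported in the pre-change code) is assumed to take a folder, file name, and content:

```python
from preprocessing_pipeline.pdf_conversion import convert_pdf_to_text
from preprocessing_pipeline.text_cleaning import clean_text
from utils.azure_blob_utils import upload_to_azure

raw_name = "2024-12-02_meeting_minutes.pdf"           # illustrative file name
dirty_name = raw_name.replace(".pdf", ".txt")

dirty_text = convert_pdf_to_text(raw_name)            # direct extraction with OCR fallback
if dirty_text:
    upload_to_azure("dirty", dirty_name, dirty_text)  # assumed signature

cleaned = clean_text(dirty_name)                      # GPT-based cleanup in token chunks
if cleaned:
    upload_to_azure("clean", dirty_name, cleaned)     # assumed signature
```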
