Streamlit Hosting + Removal of Junk Files #62

Merged: 1 commit, Dec 10, 2024
21 changes: 20 additions & 1 deletion Preprocessing/.streamlit/config.toml
@@ -1,2 +1,21 @@
[server]
maxUploadSize = 1000 # Set the upload size limit in MB
headless = true
port = 8501 # Default port for local testing
enableCORS = false
enableXsrfProtection = false
maxUploadSize = 1000


[theme]
base = "light"
primaryColor = "#0D6051"
secondaryBackgroundColor = "#f0f2e9"
textColor = "#263d36"
font = "IBM Plex Mono"

[global]
pageTitle = "Minute Mate"
favicon = "assets/favicon.ico"

[home]
welcomeMessage = "Welcome to Minute Mate: Your Meeting Transcription and Summarization Tool!"
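
Not part of the diff: the `[server]` and `[theme]` sections above are standard Streamlit configuration, but `pageTitle`, `favicon`, and the whole `[home]` section are not keys Streamlit reads on its own, so the app presumably loads them itself. A minimal sketch of how that lookup might work, assuming the `toml` package and these exact key names; the loading code is illustrative, not taken from main.py:

```python
import toml
import streamlit as st

# Read the custom sections from the same config file Streamlit uses.
config = toml.load("Preprocessing/.streamlit/config.toml")

st.set_page_config(
    page_title=config["global"]["pageTitle"],  # "Minute Mate"
    page_icon=config["global"]["favicon"],     # "assets/favicon.ico"
)
st.write(config["home"]["welcomeMessage"])
```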
489 changes: 337 additions & 152 deletions Preprocessing/App/main.py

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion Preprocessing/docker/requirements.txt
@@ -23,4 +23,5 @@ azure.storage.blob
transformers
chardet
pytest
easyocr
easyocr
tiktoken
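
Aside (not in this commit): `tiktoken`, newly added here, backs the token-based chunking in `chunking_vector_embedding.py` and `text_cleaning.py` below. A small sketch of the pattern both files rely on; the helper name is illustrative:

```python
import tiktoken

tokenizer = tiktoken.encoding_for_model("text-embedding-ada-002")

def split_into_chunks(text: str, max_chunk_size: int = 250) -> list[str]:
    """Split text into chunks of at most max_chunk_size tokens."""
    tokens = tokenizer.encode(text)
    return [
        tokenizer.decode(tokens[i:i + max_chunk_size])
        for i in range(0, len(tokens), max_chunk_size)
    ]
```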
20 changes: 14 additions & 6 deletions Preprocessing/preprocessing_pipeline/audio_transcription.py
@@ -1,11 +1,15 @@
import os
import requests
import streamlit as st
from utils.azure_blob_utils import download_from_azure
from utils.env_setup import load_env

# Load environment variables
load_env()
ASSEMBLY_AI_KEY = os.getenv("ASSEMBLY_AI_KEY")
# Dynamically fetch AssemblyAI API key from Streamlit session state
def get_assembly_ai_key():
    api_keys = st.session_state.get("api_keys", {})
    assembly_ai_key = api_keys.get("ASSEMBLY_AI_KEY")
    if not assembly_ai_key:
        raise ValueError("AssemblyAI API key is missing. Please configure it in the Streamlit app.")
    return assembly_ai_key

ASSEMBLY_AI_ENDPOINT = "https://api.assemblyai.com/v2"

def transcribe_audio(raw_file_name, model=None, speaker_labels=False):
@@ -20,8 +24,11 @@ def transcribe_audio(raw_file_name, model=None, speaker_labels=False):
    Returns:
    - str: Transcribed text, or None if transcription fails.
    """
    headers = {"authorization": ASSEMBLY_AI_KEY}
    try:
        # Fetch the AssemblyAI key dynamically
        assembly_ai_key = get_assembly_ai_key()
        headers = {"authorization": assembly_ai_key}

        # Step 1: Download the raw audio file from Azure
        raw_content = download_from_azure("raw", raw_file_name, as_text=False)
        print(f"Downloaded {raw_file_name} from Azure for transcription.")
@@ -82,3 +89,4 @@ def transcribe_audio(raw_file_name, model=None, speaker_labels=False):
    except Exception as e:
        print(f"Error during transcription: {e}")
        return None
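
Note, not part of the diff: `get_assembly_ai_key()` assumes the Streamlit app has already stored the key in `st.session_state["api_keys"]`. A hedged sketch of how a home page might populate it; the widget label and flow are assumptions, and main.py may differ:

```python
import streamlit as st

# Hypothetical home-page snippet: collect the key once and stash it where
# get_assembly_ai_key() expects to find it.
if "api_keys" not in st.session_state:
    st.session_state["api_keys"] = {}

assembly_key = st.text_input("AssemblyAI API key", type="password")
if assembly_key:
    st.session_state["api_keys"]["ASSEMBLY_AI_KEY"] = assembly_key
```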

55 changes: 38 additions & 17 deletions Preprocessing/preprocessing_pipeline/chunking_vector_embedding.py
@@ -1,29 +1,35 @@
import os
from openai import OpenAI
import streamlit as st
import requests
import weaviate
import tiktoken # Use tiktoken for OpenAI-compatible tokenization
from utils.env_setup import load_env
from utils.azure_blob_utils import download_from_azure

# Load environment variables
load_env()
WEAVIATE_URL = os.getenv("WEAVIATE_URL")
WEAVIATE_API_KEY = os.getenv("WEAVIATE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
# Dynamic API Key Retrieval
def get_weaviate_client():
    api_keys = st.session_state.get("api_keys", {})
    weaviate_url = api_keys.get("WEAVIATE_URL")
    weaviate_api_key = api_keys.get("WEAVIATE_API_KEY")

# Initialize Weaviate client
client = weaviate.Client(
    url=WEAVIATE_URL,
    auth_client_secret=weaviate.AuthApiKey(api_key=WEAVIATE_API_KEY)
    if not weaviate_url or not weaviate_api_key:
        raise ValueError("Weaviate API configuration is missing. Please configure it in the Streamlit app.")

    return weaviate.Client(
        url=weaviate_url,
        auth_client_secret=weaviate.AuthApiKey(api_key=weaviate_api_key)
    )

# Initialize OpenAI client for embedding
openai_client = OpenAI(api_key=OPENAI_API_KEY)
def get_openai_api_key():
    api_keys = st.session_state.get("api_keys", {})
    openai_api_key = api_keys.get("OPENAI_API_KEY")

    if not openai_api_key:
        raise ValueError("OpenAI API key is missing. Please configure it in the Streamlit app.")

    return openai_api_key

# Initialize tiktoken for OpenAI's embedding model
tokenizer = tiktoken.encoding_for_model("text-embedding-ada-002")


def fetch_matching_chunks(meeting_date, meeting_type, file_type, source_document):
"""
Fetch matching chunks from Weaviate based on metadata.
Expand All @@ -37,6 +43,7 @@
Returns:
list: A list of matching documents.
"""
client = get_weaviate_client()
query = f"""
{{
Get {{
@@ -67,6 +74,7 @@
    Args:
        documents (list): List of documents with IDs to delete.
    """
    client = get_weaviate_client()
    for doc in documents:
        doc_id = doc["_additional"]["id"]
        client.data_object.delete(doc_id)
@@ -83,6 +91,10 @@
        max_chunk_size (int): Maximum token size for each chunk.
    """
    try:
        # Initialize clients dynamically
        client = get_weaviate_client()
        openai_api_key = get_openai_api_key()

        # Download cleaned text from Azure
        clean_text = download_from_azure("clean", clean_file_name)
        tokens = tokenizer.encode(clean_text)
@@ -107,8 +119,17 @@

        # Embed and upload each chunk
        for i, chunk in enumerate(chunks):
            response = openai_client.embeddings.create(input=chunk, model="text-embedding-ada-002")
            embedding = response.data[0].embedding
            # Request embedding from OpenAI
            headers = {"Authorization": f"Bearer {openai_api_key}"}
            response = requests.post(
                "https://api.openai.com/v1/embeddings",
[Check warning (Code scanning / Bandit) on lines +124 to +125: Call to requests without timeout]
                headers=headers,
                json={"input": chunk, "model": "text-embedding-ada-002"}
            )
            if response.status_code != 200:
                raise ValueError(f"OpenAI embedding error: {response.status_code} - {response.text}")

            embedding = response.json()["data"][0]["embedding"]

            client.data_object.create(
                data_object={
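
Not part of the diff: the Bandit finding above flags the embedding request for having no timeout, and this commit leaves it as is. One way to address it would be an explicit `timeout` on the call; the value below is an assumption:

```python
            response = requests.post(
                "https://api.openai.com/v1/embeddings",
                headers=headers,
                json={"input": chunk, "model": "text-embedding-ada-002"},
                timeout=30,  # seconds; clears Bandit's "request without timeout" warning
            )
```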
29 changes: 16 additions & 13 deletions Preprocessing/preprocessing_pipeline/pdf_conversion.py
@@ -3,8 +3,10 @@
from PIL import Image
from io import BytesIO
import numpy as np
import streamlit as st

[Check failure (GitHub Actions / ruff) on line 6 in Preprocessing/preprocessing_pipeline/pdf_conversion.py: Ruff F401, `streamlit` imported but unused]
from utils.azure_blob_utils import download_from_azure


def convert_pdf_to_text(raw_file_name):
"""
Extracts text from a PDF file. Uses EasyOCR as a fallback for scanned PDFs.
Expand All @@ -17,33 +19,34 @@
"""
try:
# Step 1: Download the raw file from Azure Blob Storage
print(f"Downloading {raw_file_name} from Azure Blob Storage (raw folder)...")
raw_content = download_from_azure("raw", raw_file_name, as_text=False)

# Step 2: Open the PDF content
# Step 2: Open the PDF content using PyMuPDF (fitz)
pdf_document = fitz.open(stream=raw_content, filetype="pdf")
text = ""
reader = easyocr.Reader(['en']) # Initialize EasyOCR for English
text = "" # Initialize a string to hold extracted text
reader = easyocr.Reader(['en'], gpu=False) # Initialize EasyOCR for English (disable GPU for portability)

for page_num in range(pdf_document.page_count):
page = pdf_document[page_num]

# Attempt to extract text directly
# Attempt to extract text directly from the page
page_text = page.get_text()
if page_text.strip(): # If direct text is available
print(f"Text extracted directly from page {page_num + 1}.")
if page_text.strip(): # If direct text extraction is successful
print(f"Direct text extracted from page {page_num + 1}.")
text += page_text
else: # Fallback to OCR for scanned pages
print(f"Applying OCR on page {page_num + 1} of {raw_file_name}.")
pix = page.get_pixmap(dpi=300) # Render page to an image
img = Image.open(BytesIO(pix.tobytes("png")))
print(f"Direct text extraction failed on page {page_num + 1}. Applying OCR.")
pix = page.get_pixmap(dpi=300) # Render the page as a high-resolution image
img = Image.open(BytesIO(pix.tobytes("png"))) # Convert rendered image to a PIL Image
img_array = np.array(img) # Convert PIL Image to NumPy array for EasyOCR
ocr_text = reader.readtext(img_array, detail=0) # Extract text with EasyOCR
text += "\n".join(ocr_text)
ocr_text = reader.readtext(img_array, detail=0) # Perform OCR with EasyOCR
text += "\n".join(ocr_text) # Append the OCR results to the text string

pdf_document.close()
pdf_document.close() # Close the PDF document
print(f"Successfully extracted text from {raw_file_name}.")
return text

except Exception as e:
print(f"Error in OCR for {raw_file_name}: {e}")
print(f"Error processing PDF {raw_file_name}: {e}")
return None
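
Aside, not part of the diff: the Ruff failure above (F401) is raised because `streamlit` is imported in pdf_conversion.py but never used. Assuming the import really is unneeded, the fix is simply to delete it; if it is kept deliberately, it can be marked for Ruff instead:

```python
import streamlit as st  # noqa: F401  (only if the import is intentionally kept)
```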
105 changes: 76 additions & 29 deletions Preprocessing/preprocessing_pipeline/text_cleaning.py
@@ -1,16 +1,28 @@
import os
import streamlit as st
import tiktoken # For OpenAI-compatible tokenization
from openai import OpenAI
import tiktoken # Use tiktoken for OpenAI-compatible tokenization
from utils.env_setup import load_env
from utils.azure_blob_utils import download_from_azure, upload_to_azure

# Load environment variables
load_env()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
from utils.azure_blob_utils import download_from_azure

# Initialize tiktoken for OpenAI's GPT models
tokenizer = tiktoken.encoding_for_model("gpt-3.5-turbo") # Specify the OpenAI model


def get_openai_client():
"""
Retrieves the OpenAI client using the API key from Streamlit session state.

Returns:
OpenAI: OpenAI client object.
"""
api_keys = st.session_state.get("api_keys", {})
openai_api_key = api_keys.get("OPENAI_API_KEY")

if not openai_api_key:
raise ValueError("OpenAI API Key is missing. Please configure it on the Home Page.")

return OpenAI(api_key=openai_api_key)


def tokenize_and_split_text(text, max_chunk_size=250):
"""
Tokenizes and splits text into smaller chunks within the token size limit.
Expand All @@ -22,6 +34,10 @@ def tokenize_and_split_text(text, max_chunk_size=250):
Returns:
list of str: List of smaller text chunks.
"""
# Validate text input
if not text or text.strip() == "":
raise ValueError("Text input is empty or invalid.")

# Tokenize the text into tokens
tokens = tokenizer.encode(text)

@@ -32,12 +48,14 @@ def tokenize_and_split_text(text, max_chunk_size=250):
    ]
    return chunks

def clean_text_chunk(chunk):

def clean_text_chunk(chunk, openai_client):
    """
    Cleans a single chunk of text using OpenAI GPT.

    Args:
        chunk (str): Text chunk to clean.
        openai_client (OpenAI): OpenAI client instance.

    Returns:
        str: Cleaned text.
@@ -51,13 +69,18 @@ def clean_text_chunk(chunk):
        {"role": "user", "content": f"Clean the following text for readability: {chunk}"}
    ]

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=messages,
        max_tokens=2000,
        temperature=0.5
    )
    return response.choices[0].message.content.strip()
    try:
        response = openai_client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=messages,
            max_tokens=2000,
            temperature=0.5
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"Error during chunk cleaning: {e}")
        return f"Error in chunk cleaning: {e}"


def clean_text(dirty_file_name):
"""
Expand All @@ -69,16 +92,40 @@ def clean_text(dirty_file_name):
Returns:
str: Combined cleaned text.
"""
print(f"Downloading {dirty_file_name} from Azure Blob Storage...")
dirty_content = download_from_azure("dirty", dirty_file_name)

# Tokenize and split the text into chunks of 250 tokens
chunks = tokenize_and_split_text(dirty_content, max_chunk_size=250)
cleaned_chunks = []

for i, chunk in enumerate(chunks):
print(f"Cleaning chunk {i + 1}/{len(chunks)}...")
cleaned_chunk = clean_text_chunk(chunk)
cleaned_chunks.append(cleaned_chunk)

return "\n\n".join(cleaned_chunks)
try:
print(f"Downloading {dirty_file_name} from Azure Blob Storage (dirty folder)...")
dirty_content = download_from_azure("dirty", dirty_file_name)

# Validate dirty content
if not dirty_content or dirty_content.strip() == "":
raise ValueError("The downloaded content is empty. Please check the file content.")

# Initialize OpenAI client dynamically
openai_client = get_openai_client()

# Tokenize and split the text into chunks
print("Tokenizing and splitting text into manageable chunks...")
chunks = tokenize_and_split_text(dirty_content, max_chunk_size=250)
cleaned_chunks = []

for i, chunk in enumerate(chunks):
print(f"Cleaning chunk {i + 1}/{len(chunks)}: {chunk[:100]}...")
try:
cleaned_chunk = clean_text_chunk(chunk, openai_client)
except Exception as e:
print(f"Error cleaning chunk {i + 1}: {e}")
cleaned_chunk = f"Error cleaning this chunk: {e}"

if not cleaned_chunk.strip():
print(f"Chunk {i + 1} returned empty after cleaning.")
raise ValueError(f"Chunk {i + 1} cleaning failed. Received empty content.")

cleaned_chunks.append(cleaned_chunk)

print(f"Successfully cleaned {len(chunks)} chunks.")
return "\n\n".join(cleaned_chunks)

except Exception as e:
print(f"Error during text cleaning: {e}")
return None
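
Not part of the diff: a rough end-to-end sketch of how these pipeline functions might be driven from the app once the session-state keys are set. File names are illustrative, the import paths assume the Preprocessing directory is on the Python path, and `upload_to_azure` (imported in the pre-change code) is assumed to take a folder, file name, and content:

```python
from preprocessing_pipeline.pdf_conversion import convert_pdf_to_text
from preprocessing_pipeline.text_cleaning import clean_text
from utils.azure_blob_utils import upload_to_azure

raw_name = "2024-12-02_meeting_minutes.pdf"           # illustrative file name
dirty_name = raw_name.replace(".pdf", ".txt")

dirty_text = convert_pdf_to_text(raw_name)            # direct extraction with OCR fallback
if dirty_text:
    upload_to_azure("dirty", dirty_name, dirty_text)  # assumed signature

cleaned = clean_text(dirty_name)                      # GPT-based cleanup in token chunks
if cleaned:
    upload_to_azure("clean", dirty_name, cleaned)     # assumed signature
```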
