Skip to content

Commit

Permalink
Merge pull request #46 from dsba6010-llm-applications/Preprocess-Updates
Browse files Browse the repository at this point in the history
Preprocess updates
  • Loading branch information
neal-logan authored Nov 21, 2024
2 parents 03dec04 + 94bfff6 commit 8920d30
Show file tree
Hide file tree
Showing 16 changed files with 710 additions and 17 deletions.
File renamed without changes.
2 changes: 2 additions & 0 deletions Preprocessing/.streamlit/config.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Streamlit server settings for the preprocessing app.
[server]
# Raise the file-upload cap so full meeting audio recordings can be uploaded.
maxUploadSize = 1000 # Set the upload size limit in MB
287 changes: 287 additions & 0 deletions Preprocessing/App/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,287 @@
# Standard Python imports
import os
import sys
from datetime import datetime

# Load environment variables and set Python path
from dotenv import load_dotenv
load_dotenv()

# Set PYTHONPATH from .env if available
python_path = os.getenv("PYTHONPATH")
if python_path:
sys.path.append(python_path)

# Import dependencies
import streamlit as st

Check failure on line 16 in Preprocessing/App/main.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (E402)

Preprocessing/App/main.py:16:1: E402 Module level import not at top of file
import weaviate # Import Weaviate client

Check failure on line 17 in Preprocessing/App/main.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (E402)

Preprocessing/App/main.py:17:1: E402 Module level import not at top of file
from preprocessing_pipeline.pdf_conversion import convert_pdf_to_text

Check failure on line 18 in Preprocessing/App/main.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (E402)

Preprocessing/App/main.py:18:1: E402 Module level import not at top of file
from preprocessing_pipeline.audio_transcription import transcribe_audio

Check failure on line 19 in Preprocessing/App/main.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (E402)

Preprocessing/App/main.py:19:1: E402 Module level import not at top of file
from preprocessing_pipeline.text_cleaning import clean_text

Check failure on line 20 in Preprocessing/App/main.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (E402)

Preprocessing/App/main.py:20:1: E402 Module level import not at top of file
from preprocessing_pipeline.chunking_vector_embedding import tokenize_and_embed_text

Check failure on line 21 in Preprocessing/App/main.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (E402)

Preprocessing/App/main.py:21:1: E402 Module level import not at top of file
from utils.azure_blob_utils import (
upload_to_azure,
download_from_azure,
list_blobs_in_folder
)

Check failure on line 26 in Preprocessing/App/main.py

View workflow job for this annotation

GitHub Actions / ruff

Ruff (E402)

Preprocessing/App/main.py:22:1: E402 Module level import not at top of file

# Set up Weaviate client using connection settings from the environment
# (.env is loaded near the top of this file).
# NOTE(review): if WEAVIATE_URL / WEAVIATE_API_KEY are unset, os.getenv()
# returns None and is passed straight through — presumably this fails at the
# first query rather than at startup; verify.
client = weaviate.Client(
    url=os.getenv("WEAVIATE_URL"),
    auth_client_secret=weaviate.AuthApiKey(api_key=os.getenv("WEAVIATE_API_KEY"))
)

# Helper: build standardized file names for each pipeline stage.
def generate_file_name(metadata, stage):
    """Return a standardized base file name (no extension).

    Format: ``YYYY_MM_DD_<BOC|PB>_<file_type>_<stage>`` where BOC is used
    for "Board of Commissioners" meetings and PB for everything else.
    """
    date_part = metadata["meeting_date"].strftime("%Y_%m_%d")
    if metadata["meeting_type"] == "Board of Commissioners":
        type_code = "BOC"
    else:
        type_code = "PB"
    return "_".join([date_part, type_code, metadata["file_type"], stage])

# Helper function: Check and overwrite files in local storage
def save_file_with_overwrite(file_path, content):
    """Write *content* to *file_path*, replacing any existing file.

    Opening in "w" mode already truncates an existing file, so the previous
    explicit os.remove() was redundant (and race-prone between the remove
    and the open).  UTF-8 is used explicitly so the result does not depend
    on the platform's default locale encoding.
    """
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(content)

# Helper function: Fetch documents from Weaviate
def fetch_uploaded_documents():
    """Fetch all records of the Weaviate ``Documents`` class.

    Returns a list of dicts with keys file_name, file_type, meeting_date,
    meeting_type, clean_text, and chunks; returns [] when the response has
    no matching data.
    NOTE(review): not called anywhere in this file — possibly dead code or
    used by another module; verify before removing.
    """
    query = """
    {
      Get {
        Documents {
          file_name
          file_type
          meeting_date
          meeting_type
          clean_text
          chunks
        }
      }
    }
    """
    response = client.query.raw(query)
    # Defensive chained .get() so a malformed or error response yields [].
    documents = response.get("data", {}).get("Get", {}).get("Documents", [])
    return documents

# Home Page
def home_page():
    """Render the landing page: styled title, description, and two
    navigation buttons that set ``st.session_state.page`` to "upload" or
    "view" (picked up by the routing logic at the bottom of this file)."""
    # Custom styling with IBM Plex Mono; injected as raw HTML/CSS.
    st.markdown("""
        <style>
        .main {
            background: #f0f2e9;
            font-family: 'IBM Plex Mono', monospace;
        }
        .title-container {
            display: flex;
            align-items: center;
            justify-content: center;
            gap: 50px;
            height: 50vh;
            flex-direction: column;
        }
        .main-text {
            font-size: 150px;
            color: #0D6051;
            opacity: 0.9;
            font-weight: 700;
            font-family: 'IBM Plex Mono', monospace;
            line-height: 1;
            text-align: center;
        }
        .description {
            font-family: 'IBM Plex Mono', monospace;
            font-size: 18px;
            color: #263d36;
            text-align: center;
            margin-top: 20px;
        }
        .stButton>button {
            background-color: #0D6051;
            color: white;
            font-size: 25px;
            font-weight: bold;
            padding: 15px 30px;
            border-radius: 10px;
            border: none;
            cursor: pointer;
        }
        .stButton>button:hover {
            background-color: #2f8479;
        }
        </style>
    """, unsafe_allow_html=True)

    # Title and description block styled by the CSS above.
    st.markdown("""
        <div class="title-container">
            <h1 class="main-text">Minute Mate</h1>
            <p class="description">
                Welcome to Minute Mate; this is a staff-level application to upload meeting audios, minutes, and agendas to provide further context to the front end.
            </p>
        </div>
    """, unsafe_allow_html=True)

    # Navigation buttons
    col1, col2 = st.columns([1, 1])
    with col1:
        if st.button("Upload Files", key="upload", help="Upload meeting documents and audio files"):
            st.session_state.page = "upload"
    with col2:
        if st.button("View Documents", key="view", help="View the documents that have been uploaded"):
            st.session_state.page = "view"

# Upload Files Page
def upload_files_page():
    """Render the upload page and run the full preprocessing pipeline.

    Stages:
      1. Upload the raw file to the Azure ``raw/`` folder.
      2. Transcribe audio or extract PDF text into ``dirty/``.
      3. Clean the dirty text with generative AI into ``clean/``.
      4. Chunk and embed the cleaned text into Weaviate.

    Stages 3-4 run only when stage 2 produced a dirty file.  Previously
    ``dirty_file_name`` was referenced unconditionally, raising NameError
    when transcription/extraction failed or when the chosen file type did
    not match the uploaded file's extension.
    """
    st.title("Upload Municipal Meeting Documents")

    # Sidebar for metadata and options selection
    st.sidebar.header("Document Metadata & Transcription Options")
    meeting_date = st.sidebar.date_input("Select Meeting Date", datetime.today())
    meeting_type = st.sidebar.selectbox("Meeting Type", ["Planning Board", "Board of Commissioners"])
    file_type = st.sidebar.radio("File Type", ["Agenda", "Minutes", "Audio"])
    model_option = st.sidebar.selectbox("Select Transcription Model", ["default", "best", "nano"])
    speaker_labels = st.sidebar.checkbox("Enable Speaker Diarization")

    # Persist metadata in session state so it survives Streamlit reruns.
    if st.sidebar.button("Save Metadata"):
        st.session_state["metadata"] = {
            "meeting_date": meeting_date,
            "meeting_type": meeting_type,
            "file_type": file_type,
            "model": model_option,
            "speaker_labels": speaker_labels
        }

    st.header("Upload New Document")
    file = st.file_uploader("Choose a file to upload", type=["pdf", "mp3", "wav"])

    # Initialize progress bar
    progress_bar = st.progress(0)

    if file and "metadata" in st.session_state:
        metadata = st.session_state["metadata"]

        # Preserve the original file extension
        file_extension = os.path.splitext(file.name)[1]
        raw_file_name = f"{generate_file_name(metadata, 'Raw')}{file_extension}"

        # Stage 1: Upload to Raw
        upload_to_azure("raw", raw_file_name, file.read())
        st.write(f"Uploaded file to Azure `raw/` folder: {raw_file_name}")
        progress_bar.progress(20)

        # Stage 2: Process based on file type.  Stays None unless a branch
        # succeeds, which gates stages 3-4 below.
        dirty_file_name = None

        if metadata["file_type"] == "Audio" and file_extension in [".mp3", ".wav"]:
            with st.spinner(f"Transcribing audio using {metadata['model']} model..."):
                transcribed_text = transcribe_audio(
                    raw_file_name=raw_file_name,
                    model=metadata["model"],
                    speaker_labels=metadata["speaker_labels"]
                )
            if transcribed_text:
                dirty_file_name = generate_file_name(metadata, "Transcription") + ".txt"
                upload_to_azure("dirty", dirty_file_name, transcribed_text)
                st.write(f"Uploaded transcription to `dirty/` folder: {dirty_file_name}")
                st.text_area("Transcribed Audio Text:", transcribed_text, height=200)
                st.download_button("Download Transcribed Text", data=transcribed_text, file_name=dirty_file_name)
            else:
                st.error("Failed to transcribe the audio.")

        elif metadata["file_type"] in ["Agenda", "Minutes"] and file_extension == ".pdf":
            with st.spinner("Extracting text from PDF..."):
                extracted_text = convert_pdf_to_text(raw_file_name)
            if extracted_text:
                dirty_file_name = generate_file_name(metadata, "TextExtraction") + ".txt"
                upload_to_azure("dirty", dirty_file_name, extracted_text)
                st.write(f"Uploaded extracted text to `dirty/` folder: {dirty_file_name}")
                st.text_area("Extracted PDF Text:", extracted_text, height=200)
                st.download_button("Download Extracted Text", data=extracted_text, file_name=dirty_file_name)
            else:
                st.error("Failed to extract text from the PDF.")

        else:
            # Metadata file type and uploaded extension disagree (e.g. Audio
            # selected but a .pdf uploaded) — report it instead of silently
            # doing nothing after the raw upload.
            st.error("Selected file type does not match the uploaded file's extension.")

        if dirty_file_name:
            # Stage 3: Clean Text and Upload to Clean.
            # clean_text() is given the dirty file's name; the previous
            # download_from_azure("dirty", ...) call here was never used
            # (ruff F841) and has been dropped.
            with st.spinner("Cleaning text using generative AI..."):
                cleaned_text = clean_text(dirty_file_name)
                clean_file_name = generate_file_name(metadata, "Cleaned") + ".txt"
                upload_to_azure("clean", clean_file_name, cleaned_text)
                st.write(f"Uploaded cleaned text to `clean/` folder: {clean_file_name}")

            # Display cleaned text
            st.text_area("Cleaned Text:", cleaned_text, height=200)
            st.download_button("Download Cleaned Text", data=cleaned_text, file_name=clean_file_name)

            # Stage 4: Chunk and Embed into Weaviate
            with st.spinner("Chunking and embedding text into Weaviate..."):
                tokenize_and_embed_text(clean_file_name, metadata)
            st.success("Document processed and embedded successfully!")
            progress_bar.progress(100)

    # Navigation buttons
    col1, col2 = st.columns([1, 1])
    with col1:
        if st.button("Return Home"):
            st.session_state.page = "home"
    with col2:
        if st.button("View Documents"):
            st.session_state.page = "view"

# View Documents Page
def _render_document_section(title, folder, blobs):
    """Render one storage folder's blob list with per-file download buttons."""
    st.subheader(title)
    for blob in blobs:
        st.write(f"- {blob}")
        # Widget keys follow the original download_<folder>_<blob> pattern.
        if st.button(f"Download {blob}", key=f"download_{folder}_{blob}"):
            file_content = download_from_azure(folder, blob)
            st.download_button("Download", data=file_content, file_name=blob)

def view_documents_page():
    """List documents stored in the Azure raw/, dirty/, and clean/ folders
    with download buttons, plus navigation back to home/upload."""
    st.title("Uploaded Documents")
    try:
        raw_blobs = list_blobs_in_folder("raw")
        dirty_blobs = list_blobs_in_folder("dirty")
        clean_blobs = list_blobs_in_folder("clean")

        # One section per pipeline stage; the identical rendering logic was
        # previously triplicated inline.
        if raw_blobs:
            _render_document_section("Raw Documents", "raw", raw_blobs)
        if dirty_blobs:
            _render_document_section("Dirty Documents", "dirty", dirty_blobs)
        if clean_blobs:
            _render_document_section("Clean Documents", "clean", clean_blobs)

        if not raw_blobs and not dirty_blobs and not clean_blobs:
            st.write("No documents found in the Azure Blob Storage.")
    except Exception as e:
        st.error(f"Error fetching documents from Azure Blob Storage: {e}")

    # Navigation buttons
    col1, col2 = st.columns([1, 1])
    with col1:
        if st.button("Return Home"):
            st.session_state.page = "home"
    with col2:
        if st.button("Upload Files"):
            st.session_state.page = "upload"

# Main page selection logic: default to the home page on first load, then
# dispatch on st.session_state.page.  Unknown values render nothing, matching
# the original if/elif chain.
if "page" not in st.session_state:
    st.session_state.page = "home"

_page_renderers = {
    "home": home_page,
    "upload": upload_files_page,
    "view": view_documents_page,
}
_renderer = _page_renderers.get(st.session_state.page)
if _renderer:
    _renderer()
11 changes: 0 additions & 11 deletions Preprocessing/App/requirements.txt

This file was deleted.

6 changes: 0 additions & 6 deletions Preprocessing/Dockerfile

This file was deleted.

20 changes: 20 additions & 0 deletions Preprocessing/docker/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Dockerfile to set up the environment for Streamlit app.
# Build context is expected to be the Preprocessing/ directory
# (e.g. docker build -f docker/Dockerfile .), given the COPY paths below.

# Use Python base image
FROM python:3.9-slim

# Set working directory
WORKDIR /app

# Copy requirements first so dependency installation is layer-cached
# independently of application-code changes.
COPY docker/requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application files
COPY . .

# Expose port for Streamlit
EXPOSE 8501

# Run the Streamlit application.
# The app lives at App/main.py (capital A) in this repo; container paths are
# case-sensitive, so the previous "app/main.py" would not be found.
CMD ["streamlit", "run", "App/main.py", "--server.port=8501", "--server.address=0.0.0.0"]
24 changes: 24 additions & 0 deletions Preprocessing/docker/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Environment variables management
python-dotenv==1.0.0

# Audio transcription
assemblyai==0.35.1

# Vector database (Weaviate client)
weaviate-client==4.7.1

# Embedding and generation services
openai==1.54.3

# Streamlit for web UI
# NOTE(review): unpinned — consider pinning for reproducible builds.
streamlit

# PDF handling (for PDF to text conversion)
PyMuPDF

# Azure Blob Storage client (canonical PyPI name; the previous dotted form
# "azure.storage.blob" only resolved via PEP 503 name normalization)
azure-storage-blob

# Tokenization support for chunking/embedding
transformers

# Character-encoding detection for uploaded text
chardet
Empty file.
Loading

0 comments on commit 8920d30

Please sign in to comment.