Pytest + OCR Addition #55

Merged: merged 1 commit on Nov 24, 2024
80 changes: 52 additions & 28 deletions Preprocessing/App/main.py
@@ -13,17 +13,17 @@
sys.path.append(python_path)

# Import dependencies
import streamlit as st

Check failure on line 16 (GitHub Actions / ruff): Preprocessing/App/main.py:16:1: E402 Module level import not at top of file
import weaviate # Import Weaviate client

Check failure on line 17 (GitHub Actions / ruff): Preprocessing/App/main.py:17:1: E402 Module level import not at top of file
from preprocessing_pipeline.pdf_conversion import convert_pdf_to_text

Check failure on line 18 (GitHub Actions / ruff): Preprocessing/App/main.py:18:1: E402 Module level import not at top of file
from preprocessing_pipeline.audio_transcription import transcribe_audio

Check failure on line 19 (GitHub Actions / ruff): Preprocessing/App/main.py:19:1: E402 Module level import not at top of file
from preprocessing_pipeline.text_cleaning import clean_text

Check failure on line 20 (GitHub Actions / ruff): Preprocessing/App/main.py:20:1: E402 Module level import not at top of file
from preprocessing_pipeline.chunking_vector_embedding import tokenize_and_embed_text

Check failure on line 21 (GitHub Actions / ruff): Preprocessing/App/main.py:21:1: E402 Module level import not at top of file
from utils.azure_blob_utils import (
    upload_to_azure,
    download_from_azure,
    list_blobs_in_folder
)

Check failure on line 26 (GitHub Actions / ruff): Preprocessing/App/main.py:22:1: E402 Module level import not at top of file

# Set up Weaviate client
client = weaviate.Client(
@@ -202,7 +202,7 @@
        st.error("Failed to extract text from the PDF.")

# Stage 3: Clean Text and Upload to Clean
        dirty_content = download_from_azure("dirty", dirty_file_name)

Check failure on line 205 (GitHub Actions / ruff): Preprocessing/App/main.py:205:9: F841 Local variable `dirty_content` is assigned to but never used
        with st.spinner("Cleaning text using generative AI..."):
            cleaned_text = clean_text(dirty_file_name)
            clean_file_name = generate_file_name(metadata, "Cleaned") + ".txt"
@@ -232,37 +232,61 @@
def view_documents_page():
    st.title("Uploaded Documents")
    try:
        # Fetch blobs from each folder
        raw_blobs = list_blobs_in_folder("raw")
        dirty_blobs = list_blobs_in_folder("dirty")
        clean_blobs = list_blobs_in_folder("clean")

        # Display documents by category
        if raw_blobs:
            st.subheader("Raw Documents")
            for blob in raw_blobs:
                st.write(f"- {blob}")
                if st.button(f"Download {blob}", key=f"download_raw_{blob}"):
                    file_content = download_from_azure("raw", blob)
                    st.download_button("Download", data=file_content, file_name=blob)

        if dirty_blobs:
            st.subheader("Dirty Documents")
            for blob in dirty_blobs:
                st.write(f"- {blob}")
                if st.button(f"Download {blob}", key=f"download_dirty_{blob}"):
                    file_content = download_from_azure("dirty", blob)
                    st.download_button("Download", data=file_content, file_name=blob)

        if clean_blobs:
            st.subheader("Clean Documents")
            for blob in clean_blobs:
                st.write(f"- {blob}")
                if st.button(f"Download {blob}", key=f"download_clean_{blob}"):
                    file_content = download_from_azure("clean", blob)
                    st.download_button("Download", data=file_content, file_name=blob)

        if not raw_blobs and not dirty_blobs and not clean_blobs:
            st.write("No documents found in the Azure Blob Storage.")
        def group_blobs_by_date(blobs):
            """Groups blobs by their date extracted from the file name."""
            grouped = {}
            for blob in blobs:
                try:
                    # Extract the file name without folder prefix (e.g., "raw/")
                    file_name = blob.split("/")[-1]  # Get only the file name part

                    # Extract the date from the file name (assuming format: YYYY_MM_DD)
                    parts = file_name.split("_")  # Split into ['2023', '12', '12', 'BOC', 'Agenda', ...]
                    date_str = "_".join(parts[:3])  # Join the first three parts: '2023_12_12'

                    # Convert the date string to a readable format
                    readable_date = datetime.strptime(date_str, "%Y_%m_%d").strftime("%B %d, %Y")

                    # Group by the readable date
                    if readable_date not in grouped:
                        grouped[readable_date] = []
                    grouped[readable_date].append(blob)
                except (ValueError, IndexError):
                    # Handle files with unexpected formats
                    if "Unknown Date" not in grouped:
                        grouped["Unknown Date"] = []
                    grouped["Unknown Date"].append(blob)
            return grouped
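
        # Example (illustrative, not part of the diff):
        #   group_blobs_by_date(["raw/2023_12_12_BOC_Agenda.pdf"])
        #   returns {"December 12, 2023": ["raw/2023_12_12_BOC_Agenda.pdf"]}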

        # Group blobs by date
        raw_grouped = group_blobs_by_date(raw_blobs)
        dirty_grouped = group_blobs_by_date(dirty_blobs)
        clean_grouped = group_blobs_by_date(clean_blobs)

        # Function to display blobs within a group
        def display_grouped_blobs(grouped_blobs, category):
            if grouped_blobs:
                st.subheader(f"{category.capitalize()} Documents")
                for date, blobs in grouped_blobs.items():
                    with st.expander(f"Date: {date}", expanded=False):
                        for blob in blobs:
                            st.write(f"- {blob}")
                            if st.button(f"Download {blob}", key=f"download_{category}_{blob}"):
                                file_content = download_from_azure(category, blob)
                                st.download_button("Download", data=file_content, file_name=blob)
            else:
                st.info(f"No documents found in the {category} category.")

        # Display grouped blobs
        display_grouped_blobs(raw_grouped, "raw")
        display_grouped_blobs(dirty_grouped, "dirty")
        display_grouped_blobs(clean_grouped, "clean")

    except Exception as e:
        st.error(f"Error fetching documents from Azure Blob Storage: {e}")

@@ -284,4 +308,4 @@
elif st.session_state.page == "upload":
upload_files_page()
elif st.session_state.page == "view":
view_documents_page()
view_documents_page()
15 changes: 15 additions & 0 deletions Preprocessing/README.md
@@ -44,6 +44,7 @@
#### For Agendas and Minutes 📄:
1. 📥 **Upload**: Files are uploaded to Azure’s "Raw Data" folder.
2. 📄 **PDF-to-Text Conversion**: Files are converted to text using a PDF conversion utility.
- If the PDF contains scanned images, `easyocr` is used as a fallback for Optical Character Recognition (OCR).
3. 🛠️ **Cleaning**: The raw text is saved in a "Dirty Folder," tokenized, chunked, and sent to OpenAI for cleaning.
4. 📊 **Vectorization**: The cleaned text is embedded using **text-embedding-ada-002**.
5. 💾 **Storage**: Vectorized data is stored in Weaviate Cloud for further analysis and retrieval.
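
For illustration only (this sketch is not part of the diff), step 4 amounts to a call like the one below, assuming the `openai` Python package and an `OPENAI_API_KEY` environment variable; the `embed_chunk` helper is hypothetical:

import os
from openai import OpenAI

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def embed_chunk(chunk: str) -> list[float]:
    # text-embedding-ada-002 returns one 1536-dimensional vector per input
    response = client.embeddings.create(model="text-embedding-ada-002", input=chunk)
    return response.data[0].embedding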
@@ -56,6 +57,20 @@

---

## 🧪 Testing with Pytest

The project includes a `pytest` suite to validate key components of the pipeline. Here’s what you can test (an illustrative sketch follows the list):
1. **Dependencies Check**:
- Ensure all required dependencies are installed.
2. **Environment Variables Check**:
- Verify that all environment variables (e.g., API keys, connection strings) are properly set up.
3. **Azure Upload and Download**:
- Test uploading and downloading files to/from Azure Blob Storage folders (`raw`, `dirty`, `clean`).
4. **PDF Conversion**:
- Test the `convert_pdf_to_text` function to extract text from a PDF, including fallback OCR with `easyocr` for scanned PDFs.
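
For illustration, a minimal sketch of what such tests might look like; the test names and environment-variable names here are assumptions, not the contents of the actual test file:

import os
import pytest
from preprocessing_pipeline.pdf_conversion import convert_pdf_to_text

REQUIRED_ENV_VARS = ["AZURE_STORAGE_CONNECTION_STRING", "OPENAI_API_KEY"]  # assumed names

@pytest.mark.parametrize("var", REQUIRED_ENV_VARS)
def test_environment_variable_is_set(var):
    assert os.getenv(var), f"Missing required environment variable: {var}"

def test_pdf_conversion_returns_text():
    # Assumes Test_Minutes.pdf was first uploaded to the Azure "raw" folder.
    text = convert_pdf_to_text("Test_Minutes.pdf")
    assert text and text.strip(), "Expected non-empty text from the test PDF"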

---

## ⚙️ Setting Up Locally

### 🔑 Prerequisites
6 changes: 4 additions & 2 deletions Preprocessing/docker/requirements.txt
@@ -19,6 +19,8 @@ PyMuPDF
# azure portal
azure.storage.blob

# additional libraries
transformers

chardet
pytest
easyocr
30 changes: 24 additions & 6 deletions Preprocessing/preprocessing_pipeline/pdf_conversion.py
@@ -1,9 +1,13 @@
import fitz # PyMuPDF
import easyocr
from PIL import Image
from io import BytesIO
import numpy as np
from utils.azure_blob_utils import download_from_azure

def convert_pdf_to_text(raw_file_name):
    """
    Extracts text from a PDF file. Uses EasyOCR as a fallback for scanned PDFs.

    Args:
        raw_file_name (str): Name of the PDF file in Azure Blob Storage (raw folder).
@@ -15,17 +19,31 @@ def convert_pdf_to_text(raw_file_name):
        # Step 1: Download the raw file from Azure Blob Storage
        raw_content = download_from_azure("raw", raw_file_name, as_text=False)

        # Step 2: Open the PDF content
        pdf_document = fitz.open(stream=raw_content, filetype="pdf")
        text = ""
        reader = easyocr.Reader(['en'])  # Initialize EasyOCR for English

        for page_num in range(pdf_document.page_count):
            page = pdf_document[page_num]

            # Attempt to extract text directly
            page_text = page.get_text()
            if page_text.strip():  # If direct text is available
                print(f"Text extracted directly from page {page_num + 1}.")
                text += page_text
            else:  # Fallback to OCR for scanned pages
                print(f"Applying OCR on page {page_num + 1} of {raw_file_name}.")
                pix = page.get_pixmap(dpi=300)  # Render page to an image
                img = Image.open(BytesIO(pix.tobytes("png")))
                img_array = np.array(img)  # Convert PIL Image to NumPy array for EasyOCR
                ocr_text = reader.readtext(img_array, detail=0)  # Extract text with EasyOCR
                text += "\n".join(ocr_text)

        pdf_document.close()
        print(f"Successfully extracted text from {raw_file_name}.")
        return text

    except Exception as e:
        print(f"Error in OCR for {raw_file_name}: {e}")
        return None
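
A note on the design: `easyocr.Reader(['en'])` loads (and on first use downloads) its detection and recognition models, so initializing it once per call rather than once per page keeps the OCR fallback reasonably cheap, and rendering pages at `dpi=300` trades speed for recognition accuracy. Hypothetical usage, assuming the file was already uploaded to the raw folder:

# Hypothetical usage sketch; the file name is made up.
text = convert_pdf_to_text("2023_12_12_BOC_Minutes.pdf")
if text:
    print(text[:500])  # Preview the first 500 extracted characters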
Binary file added Preprocessing/tests/Test_Minutes.pdf