minor clean up

onyx-dot-app · Dec 18, 2024 · 34c7bee · 34c7bee
1 parent 2290aa9
commit 34c7bee
Show file tree

Hide file tree

Showing 2 changed files with 85 additions and 52 deletions.
diff --git a/backend/onyx/connectors/google_drive/doc_conversion.py b/backend/onyx/connectors/google_drive/doc_conversion.py
@@ -28,7 +28,7 @@
 
 logger = setup_logger()
 
-
+TEXT_SECTION_SEPARATOR = "\n\n"
 # these errors don't represent a failure in the connector, but simply files
 # that can't / shouldn't be indexed
 ERRORS_TO_CONTINUE_ON = [
@@ -44,89 +44,109 @@ def _extract_sections_basic(
     """
     Extracts text from a Google Drive file based on its MIME type.
 
-    This function handles different file types:
-    - Google Sheets are processed using specialized logic via the Sheets API
-    - Other Google formats use MarkItDown when possible
-    - If configured, falls back to Unstructured for text extraction
-
-    The function returns a list of Section objects, each containing a link and the extracted text.
+    This function uses a combination of specialized extraction methods and
+    fallback approaches to handle various file types effectively.
 
-    file_meta: Dict with file metadata (id, name, mimeType, webViewLink)
-    service: Authorized GoogleDriveService instance
+    Args:
+        file_meta: Dict with file metadata (id, name, mimeType, webViewLink)
+        service: Authorized GoogleDriveService instance
 
-    Returns: List of Section objects
+    Returns:
+        List of Section objects containing extracted text
     """
     mime_type = file_meta["mimeType"]
     link = file_meta["webViewLink"]
-    file_id = file_meta["id"]
+    file_meta["id"]
 
-    # If mime_type not recognized, fallback to at least returning something with the link.
+    # Handle unsupported MIME types
     if mime_type not in {item.value for item in GDriveMimeType}:
         return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
 
-    # Handle Google Sheets separately
+    # Specialized handling for Google Sheets
     if mime_type == GDriveMimeType.SPREADSHEET.value:
-        # Attempt Sheets API extraction
         try:
             return _extract_google_sheets(file_meta, service)
         except Exception as e:
             logger.warning(
-                f"Ran into exception '{e}' when pulling data from Google "
-                f"Sheet '{file_meta['name']}'. Falling back to basic extraction."
+                f"Error extracting data from Google Sheet '{file_meta['name']}': {e}. "
+                "Falling back to basic extraction."
             )
-    # Handle PDF files
+
+    # PDF handling
     if mime_type == GDriveMimeType.PDF.value:
-        response = service.files().get_media(fileId=file_meta["id"]).execute()
-        if get_unstructured_api_key():
-            return [
-                Section(
-                    link=link,
-                    text=unstructured_to_text(
-                        file=io.BytesIO(response),
-                        file_name=file_meta.get("name", file_meta["id"]),
-                    ),
-                )
-            ]
-        else:
-            text, _ = read_pdf_file(file=io.BytesIO(response))
-            return [Section(link=link, text=text)]
+        return _extract_pdf_content(file_meta, service)
+
+    # PowerPoint (Google Slides & MS PowerPoint) handling
+    if mime_type in [GDriveMimeType.PPT.value, GDriveMimeType.POWERPOINT.value]:
+        return _extract_presentation_content(file_meta, service)
+
+    # General handling for Google-native and text-based formats
+    return _extract_general_content(file_meta, service)
+
+
+def _extract_pdf_content(
+    file_meta: dict[str, Any], service: GoogleDriveService
+) -> list[Section]:
+    """
+    Downloads a PDF from Google Drive and extracts its text.
+
+    The text comes from unstructured_to_text when get_unstructured_api_key()
+    returns a truthy value, and from read_pdf_file otherwise.
+
+    Args:
+        file_meta: Dict with file metadata (id, name, webViewLink)
+        service: Authorized GoogleDriveService instance
+
+    Returns:
+        Single-element list with a Section linking to the file
+    """
+    pdf_bytes = service.files().get_media(fileId=file_meta["id"]).execute()
+
+    if not get_unstructured_api_key():
+        # read_pdf_file also returns metadata, which is not used here
+        pdf_text, _ = read_pdf_file(file=io.BytesIO(pdf_bytes))
+        return [Section(link=file_meta["webViewLink"], text=pdf_text)]
+
+    pdf_text = unstructured_to_text(
+        file=io.BytesIO(pdf_bytes),
+        # fall back to the file id when the metadata carries no name
+        file_name=file_meta.get("name", file_meta["id"]),
+    )
+    return [Section(link=file_meta["webViewLink"], text=pdf_text)]
 
-    # From here on, either it’s not a spreadsheet or the spreadsheet extraction failed.
-    # Try exporting the file for Drive’s native formats or just downloading for
-    # non-native MS Office/PDF, etc.
+
+def _extract_presentation_content(
+    file_meta: dict[str, Any], service: GoogleDriveService
+) -> list[Section]:
+    """
+    Extracts the text of a presentation stored in Google Drive.
+
+    Google Slides files are exported by Drive as plain text. Any other
+    presentation (e.g. an uploaded MS PowerPoint file) is downloaded and parsed
+    locally, because the Drive export endpoint only converts Google Workspace
+    documents and rejects regular uploaded files.
+
+    Args:
+        file_meta: Dict with file metadata (id, mimeType, webViewLink)
+        service: Authorized GoogleDriveService instance
+
+    Returns:
+        Single-element list with a Section holding the extracted text, or
+        UNSUPPORTED_FILE_TYPE_CONTENT if extraction fails for any reason
+    """
+    try:
+        if file_meta["mimeType"] == GDriveMimeType.PPT.value:
+            # Google-native presentation: Drive can render it to text directly
+            text = (
+                service.files()
+                .export(fileId=file_meta["id"], mimeType="text/plain")
+                .execute()
+                .decode("utf-8")
+            )
+        else:
+            # NOTE(review): import path inferred from the location of
+            # extract_file_text.py in this commit - confirm the package root.
+            # Imported here (inside the try) so a missing dependency degrades
+            # to the "unsupported" fallback instead of crashing the connector.
+            from onyx.file_processing.extract_file_text import pptx_to_text
+
+            content = service.files().get_media(fileId=file_meta["id"]).execute()
+            text = pptx_to_text(io.BytesIO(content))
+        return [Section(link=file_meta["webViewLink"], text=text)]
+    except Exception:
+        logger.exception("Error extracting presentation text.")
+        return [
+            Section(link=file_meta["webViewLink"], text=UNSUPPORTED_FILE_TYPE_CONTENT)
+        ]
+
+
+def _extract_general_content(
+    file_meta: dict[str, Any], service: GoogleDriveService
+) -> list[Section]:
+    """
+    Extracts text from a Drive file by export or by direct download.
+
+    DOC, SPREADSHEET and PPT MIME types are exported through the Drive API
+    ("text/csv" for spreadsheets, "text/plain" otherwise) and decoded as UTF-8.
+    Every other MIME type is downloaded with get_media and converted with
+    _convert_gdrive_content_to_text.
+
+    Args:
+        file_meta: Dict with file metadata (id, mimeType, webViewLink)
+        service: Authorized GoogleDriveService instance
+
+    Returns:
+        Single-element list with a Section holding the extracted text, or
+        UNSUPPORTED_FILE_TYPE_CONTENT if any exception is raised
+    """
     try:
-        # If it's a Google-native doc or ppt, we can export to text/plain.
-        # If it's a Google-native spreadsheet (and we reached here), we treat it as text/csv.
-        drive_export_mime = None
+        mime_type = file_meta["mimeType"]
+        # NOTE(review): _extract_sections_basic routes PPT to
+        # _extract_presentation_content before calling this function, so the
+        # PPT entry below looks unreachable from that path - confirm.
         if mime_type in [
             GDriveMimeType.DOC.value,
-            GDriveMimeType.PPT.value,
             GDriveMimeType.SPREADSHEET.value,
+            GDriveMimeType.PPT.value,
         ]:
-            # Decide on export type
-            drive_export_mime = (
+            export_mime_type = (
                 "text/csv"
                 if mime_type == GDriveMimeType.SPREADSHEET.value
                 else "text/plain"
             )
-            content = (
+            text = (
                 service.files()
-                .export(fileId=file_id, mimeType=drive_export_mime)
+                .export(fileId=file_meta["id"], mimeType=export_mime_type)
                 .execute()
+                .decode("utf-8")
             )
         else:
-            # Non-native files (e.g., PDF, MS Word, etc.) -> direct download
-            content = service.files().get_media(fileId=file_id).execute()
-
-        print(type(content))
-        print("THAT WAS THE TYPE OF THE CONTENT")
-        # Convert the content to text via MarkItDown or Unstructured
-        text_extracted = _convert_gdrive_content_to_text(content, file_meta)
-        return [Section(link=link, text=text_extracted)]
+            content = service.files().get_media(fileId=file_meta["id"]).execute()
+            text = _convert_gdrive_content_to_text(content, file_meta)
 
+        return [Section(link=file_meta["webViewLink"], text=text)]
     except Exception:
-        logger.exception("Unexpected error extracting text from file.")
-        return [Section(link=link, text=UNSUPPORTED_FILE_TYPE_CONTENT)]
+        # Best-effort: log the traceback and hand back a placeholder section
+        logger.exception("Error extracting file content.")
+        return [
+            Section(link=file_meta["webViewLink"], text=UNSUPPORTED_FILE_TYPE_CONTENT)
+        ]
 
 
 def _extract_google_sheets(

diff --git a/backend/onyx/file_processing/extract_file_text.py b/backend/onyx/file_processing/extract_file_text.py
@@ -13,6 +13,7 @@
 from typing import IO
 
 import chardet
+import pptx  # type: ignore
 from fastapi import UploadFile
 from markitdown import MarkItDown  # type: ignore
 from pypdf import PdfReader
@@ -244,6 +245,18 @@ def read_pdf_file(
     return "", metadata
 
 
+def pptx_to_text(file: IO[Any]) -> str:
+    """Extract text from a PowerPoint file, slide by slide.
+
+    Each slide contributes a "Slide N:" header (numbered from 1) followed by
+    the text of every shape that exposes a ``text`` attribute, one shape per
+    line. The per-slide chunks are joined with TEXT_SECTION_SEPARATOR.
+    """
+    deck = pptx.Presentation(file)
+    slide_chunks = []
+    for index, slide in enumerate(deck.slides, start=1):
+        pieces = [f"\nSlide {index}:\n"]
+        # Shapes without a ``text`` attribute contribute nothing
+        pieces.extend(
+            shape.text + "\n" for shape in slide.shapes if hasattr(shape, "text")
+        )
+        slide_chunks.append("".join(pieces))
+    return TEXT_SECTION_SEPARATOR.join(slide_chunks)
+
+
 def pdf_to_text(file: IO[Any], pdf_pass: str | None = None) -> str:
     """Extract text from a PDF file."""
     # Return only the extracted text from read_pdf_file