herrschmidt · herrschmidt · Dec 26, 2024 · Dec 26, 2024
diff --git a/backend/app/api/routes/convert.py b/backend/app/api/routes/convert.py
@@ -10,22 +10,42 @@
     "/convert",
     response_model=ConversionResponse,
     responses={
-        413: {"model": ErrorResponse, "description": "File too large"},
+        413: {"model": ErrorResponse, "description": "File too large (max 10MB)"},
         415: {"model": ErrorResponse, "description": "Unsupported file type"},
-        500: {"model": ErrorResponse, "description": "Internal server error"},
+        500: {"model": ErrorResponse, "description": "Internal server error during conversion"},
     },
     description="Convert an uploaded document to markdown format",
+    summary="Convert Document to Markdown",
+    tags=["Conversion"],
 )
 async def convert_document(file: UploadFile) -> ConversionResponse:
-    """
-    Convert an uploaded document to markdown format.
+    """Convert an uploaded document to markdown format.
+
+    This endpoint accepts various document formats and converts them to markdown,
+    preserving the document structure and content as much as possible.
+
+    Supported formats:
+    - PDF files (with OCR for scanned documents)
+    - Images (JPEG, PNG, GIF, WebP with OCR)
+    - Microsoft Word documents (DOC, DOCX)
+    - HTML files (with table and list preservation)
+    - Microsoft PowerPoint presentations (PPTX)
+
+    Features:
+    - Automatic file type detection
+    - OCR for scanned documents and images
+    - Table structure recognition
+    - List and heading preservation
+    - Image extraction and embedding
+    - File size validation (max 10MB)
+
+    Returns:
+    - Markdown content
+    - Original filename
+    - Detected MIME type
+    - File size
 
-    Supports the following formats:
-    - PDF
-    - Images (JPEG, PNG, GIF, WebP)
-    - Microsoft Word (DOC, DOCX)
-    - HTML
-    - Microsoft PowerPoint (PPTX)
+    Note: Speaker notes in PowerPoint presentations are not currently supported.
     """
     converter = DocumentConverter()
 

diff --git a/backend/app/core/converter.py b/backend/app/core/converter.py
@@ -6,10 +6,28 @@
 from fastapi import HTTPException, UploadFile
 from docling.document_converter import DocumentConverter as DoclingConverter
 from docling.datamodel.base_models import InputFormat
-from docling.document_converter import PdfFormatOption, WordFormatOption, ImageFormatOption
-from docling.datamodel.pipeline_options import PdfPipelineOptions, ImagePipelineOptions
+from docling_core.types.doc.labels import GroupLabel, DocItemLabel
+from docling.document_converter import (
+    PdfFormatOption, WordFormatOption, ImageFormatOption,
+    HTMLFormatOption, PowerpointFormatOption
+)
+from docling.datamodel.pipeline_options import PipelineOptions, PdfPipelineOptions
+from docling.pipeline.simple_pipeline import SimplePipeline
 
 class DocumentConverter:
+    """A wrapper class for docling's DocumentConverter that handles file uploads and conversion.
+
+    This class provides a high-level interface for converting various document formats to markdown.
+    It supports the following formats:
+    - PDF files (with OCR and table structure recognition)
+    - Images (JPEG, PNG, GIF, WebP with OCR)
+    - Microsoft Word documents (DOCX)
+    - HTML files
+    - Microsoft PowerPoint presentations (PPTX)
+
+    The converter handles file type detection, validation, and cleanup automatically.
+    """
+
     SUPPORTED_FORMATS = {
         'application/pdf': InputFormat.PDF,
         'image/jpeg': InputFormat.IMAGE,
@@ -25,15 +43,22 @@ class DocumentConverter:
     MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB
 
     def __init__(self):
+        """Initialize the DocumentConverter with format-specific options.
+
+        This sets up the docling converter with appropriate pipeline options for each format:
+        - PDF: OCR and table structure recognition enabled
+        - Images: Base PipelineOptions (OCR is not configured explicitly here)
+        - Word: Default options
+        - HTML: Default options
+        - PowerPoint: Default options with SimplePipeline
+        """
         # Configure PDF pipeline options
         pdf_pipeline_options = PdfPipelineOptions()
         pdf_pipeline_options.do_ocr = True  # Enable OCR for scanned documents
         pdf_pipeline_options.do_table_structure = True  # Enable table structure recognition
 
-        # Configure image pipeline options
-        image_pipeline_options = ImagePipelineOptions()
-        image_pipeline_options.do_ocr = True  # Enable OCR for images
-        image_pipeline_options.do_layout_analysis = True  # Enable layout analysis for better structure detection
+        # Configure base pipeline options for other formats
+        base_pipeline_options = PipelineOptions()
 
         # Create converter with format-specific options
         self.converter = DoclingConverter(
@@ -46,45 +71,82 @@ def __init__(self):
             ],
             format_options={
                 InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options),
-                InputFormat.IMAGE: ImageFormatOption(pipeline_options=image_pipeline_options),
-                InputFormat.DOCX: WordFormatOption(),
+                InputFormat.IMAGE: ImageFormatOption(pipeline_options=base_pipeline_options),
+                InputFormat.DOCX: WordFormatOption(pipeline_options=base_pipeline_options),
+                InputFormat.HTML: HTMLFormatOption(pipeline_options=base_pipeline_options),
+                InputFormat.PPTX: PowerpointFormatOption(pipeline_options=base_pipeline_options, pipeline_cls=SimplePipeline),
             }
         )
 
     async def detect_file_type(self, file_path: Path) -> str:
-        """Detect the MIME type of a file using python-magic."""
+        """Detect the MIME type of a file using python-magic.
+
+        Args:
+            file_path (Path): Path to the file to analyze
+
+        Returns:
+            str: The detected MIME type (e.g., 'application/pdf', 'image/jpeg')
+        """
         mime = magic.Magic(mime=True)
         return mime.from_file(str(file_path))
 
     def validate_file_size(self, file_size: int) -> None:
-        """Validate that the file size is within acceptable limits."""
+        """Validate that the file size is within acceptable limits.
+
+        Args:
+            file_size (int): Size of the file in bytes
+
+        Raises:
+            HTTPException: If the file size exceeds MAX_FILE_SIZE (413 Payload Too Large)
+        """
         if file_size > self.MAX_FILE_SIZE:
             raise HTTPException(
                 status_code=413,
-                detail=f"File size exceeds maximum limit of {self.MAX_FILE_SIZE / 1024 / 1024}MB"
+                detail=f"File size exceeds maximum limit of {self.MAX_FILE_SIZE / 1024 / 1024:g}MB"
             )
 
     def validate_file_type(self, mime_type: str) -> None:
-        """Validate that the file type is supported."""
+        """Validate that the file type is supported.
+
+        Args:
+            mime_type (str): MIME type to validate
+
+        Raises:
+            HTTPException: If the MIME type is not in SUPPORTED_FORMATS (415 Unsupported Media Type)
+        """
         if mime_type not in self.SUPPORTED_FORMATS:
             raise HTTPException(
                 status_code=415,
                 detail=f"Unsupported file type: {mime_type}"
             )
 
     async def convert(self, file: UploadFile, save_path: Path) -> dict:
-        """
-        Convert an uploaded file to markdown format.
+        """Convert an uploaded file to markdown format.
+
+        This method handles the complete conversion process:
+        1. Reads and validates the uploaded file
+        2. Saves it temporarily to disk
+        3. Detects the file type
+        4. Converts the file to markdown using docling
+        5. Cleans up temporary files
 
         Args:
-            file: The uploaded file
-            save_path: Path where the file should be temporarily saved
+            file (UploadFile): The uploaded file from FastAPI
+            save_path (Path): Path where the file should be temporarily saved
 
         Returns:
-            dict: Contains markdown content and metadata
+            dict: A dictionary containing:
+                - content (str): The markdown content
+                - metadata (dict):
+                    - original_file (str): Original filename
+                    - mime_type (str): Detected MIME type
+                    - file_size (int): Size in bytes
 
         Raises:
-            HTTPException: If file validation fails or conversion errors occur
+            HTTPException:
+                - 413 Payload Too Large: If file size exceeds MAX_FILE_SIZE
+                - 415 Unsupported Media Type: If file type is not supported
+                - 500 Internal Server Error: If conversion fails
         """
         try:
             # Validate file size
@@ -100,7 +162,18 @@ async def convert(self, file: UploadFile, save_path: Path) -> dict:
 
             # Convert document using the file path
             result = self.converter.convert(str(save_path))
-            markdown_content = result.document.export_to_markdown()
+
+            # PowerPoint: join the text of each item's children manually instead of calling export_to_markdown()
+            if self.SUPPORTED_FORMATS[mime_type] == InputFormat.PPTX:
+                markdown_content = ""
+                for item, level in result.document.iterate_items(with_groups=True):
+                    if hasattr(item, 'children'):
+                        for child_ref in item.children:
+                            child = child_ref.resolve(result.document)
+                            if hasattr(child, 'text'):
+                                markdown_content += child.text + "\n\n"
+            else:
+                markdown_content = result.document.export_to_markdown()
 
             return {
                 "content": markdown_content,

diff --git a/backend/app/main.py b/backend/app/main.py
@@ -6,7 +6,37 @@
 app = FastAPI(
     title=settings.app_name,
     version=settings.version,
-    description="API for converting various document formats to markdown",
+    description="""
+    API for converting various document formats to markdown.
+
+    This service provides a simple way to convert documents from different formats
+    to markdown while preserving their structure and content. It supports:
+
+    * PDF files (with OCR for scanned documents)
+    * Images (JPEG, PNG, GIF, WebP with OCR)
+    * Microsoft Word documents (DOC, DOCX)
+    * HTML files (with table and list preservation)
+    * Microsoft PowerPoint presentations (PPTX)
+
+    Features:
+    * Automatic file type detection
+    * OCR for scanned documents and images
+    * Table structure recognition
+    * List and heading preservation
+    * Image extraction and embedding
+    * File size validation (max 10MB)
+
+    The API is designed to be simple to use with a single endpoint for conversion
+    and a health check endpoint for monitoring.
+    """,
+    contact={
+        "name": "OpenHands Team",
+        "email": "openhands@all-hands.dev",
+    },
+    license_info={
+        "name": "MIT",
+        "url": "https://opensource.org/licenses/MIT",
+    },
 )
 
 # Configure CORS
@@ -28,7 +58,29 @@
 @app.get(
     "/api/health",
     tags=["health"],
-    description="Check if the service is healthy",
+    summary="Health Check",
+    description="Check if the service is healthy and ready to accept requests",
+    response_description="Service health status",
+    responses={
+        200: {
+            "description": "Service is healthy",
+            "content": {
+                "application/json": {
+                    "example": {"status": "healthy"}
+                }
+            }
+        }
+    }
 )
 async def health_check():
+    """Check the health status of the service.
+
+    This endpoint can be used by monitoring tools to verify that:
+    - The service is running and responding to requests
+    - The web server is properly configured
+    - The API routes are accessible
+
+    Returns:
+        dict: A simple status message indicating the service is healthy
+    """
     return {"status": "healthy"}
diff --git a/backend/app/schemas/documents.py b/backend/app/schemas/documents.py
@@ -1,8 +1,44 @@
+from typing import Dict
 from pydantic import BaseModel, Field
 
+class ConversionMetadata(BaseModel):
+    """Metadata about the converted document."""
+    original_file: str = Field(..., description="Original filename of the uploaded document")
+    mime_type: str = Field(..., description="Detected MIME type of the document")
+    file_size: int = Field(..., description="Size of the document in bytes")
+
 class ConversionResponse(BaseModel):
-    content: str = Field(..., description="The converted markdown content")
-    metadata: dict = Field(..., description="Additional information about the conversion")
+    """Response model for successful document conversion."""
+    content: str = Field(
+        ...,
+        description="The converted markdown content",
+        examples=["# Document Title\n\nThis is a paragraph.\n\n* List item 1\n* List item 2"]
+    )
+    metadata: ConversionMetadata = Field(
+        ...,
+        description="Additional information about the converted document"
+    )
+
+    class Config:
+        json_schema_extra = {
+            "example": {
+                "content": "# Sample Document\n\nThis is a paragraph.\n\n* List item 1\n* List item 2",
+                "metadata": {
+                    "original_file": "document.pdf",
+                    "mime_type": "application/pdf",
+                    "file_size": 12345
+                }
+            }
+        }
 
 class ErrorResponse(BaseModel):
-    detail: str = Field(..., description="Error message describing what went wrong")
+    """Response model for conversion errors."""
+    detail: str = Field(
+        ...,
+        description="Error message describing what went wrong",
+        examples=[
+            "File size exceeds maximum limit of 10MB",
+            "Unsupported file type: application/x-binary",
+            "Error during document conversion: Failed to parse PDF structure"
+        ]
+    )
diff --git a/backend/requirements.txt b/backend/requirements.txt
@@ -15,3 +15,5 @@ httpx>=0.24.1  # Required by TestClient
 # Additional test dependencies
 reportlab>=4.1.0  # For creating test PDFs
 pillow>=10.4.0  # For creating test images
+python-pptx>=0.6.21  # For creating test PowerPoint files
+python-magic>=0.4.27  # For file type detection (used at runtime by app/core/converter.py, not only in tests)