Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add PowerPoint support and API documentation #17

Merged
merged 1 commit into from
Dec 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 30 additions & 10 deletions backend/app/api/routes/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,22 +10,42 @@
"/convert",
response_model=ConversionResponse,
responses={
413: {"model": ErrorResponse, "description": "File too large"},
413: {"model": ErrorResponse, "description": "File too large (max 10MB)"},
415: {"model": ErrorResponse, "description": "Unsupported file type"},
500: {"model": ErrorResponse, "description": "Internal server error"},
500: {"model": ErrorResponse, "description": "Internal server error during conversion"},
},
description="Convert an uploaded document to markdown format",
summary="Convert Document to Markdown",
tags=["Conversion"],
)
async def convert_document(file: UploadFile) -> ConversionResponse:
"""
Convert an uploaded document to markdown format.
"""Convert an uploaded document to markdown format.

This endpoint accepts various document formats and converts them to markdown,
preserving the document structure and content as much as possible.

Supported formats:
- PDF files (with OCR for scanned documents)
- Images (JPEG, PNG, GIF, WebP with OCR)
- Microsoft Word documents (DOC, DOCX)
- HTML files (with table and list preservation)
- Microsoft PowerPoint presentations (PPTX)

Features:
- Automatic file type detection
- OCR for scanned documents and images
- Table structure recognition
- List and heading preservation
- Image extraction and embedding
- File size validation (max 10MB)

Returns:
- Markdown content
- Original filename
- Detected MIME type
- File size

Supports the following formats:
- PDF
- Images (JPEG, PNG, GIF, WebP)
- Microsoft Word (DOC, DOCX)
- HTML
- Microsoft PowerPoint (PPTX)
Note: Speaker notes in PowerPoint presentations are not currently supported.
"""
converter = DocumentConverter()

Expand Down
109 changes: 91 additions & 18 deletions backend/app/core/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,28 @@
from fastapi import HTTPException, UploadFile
from docling.document_converter import DocumentConverter as DoclingConverter
from docling.datamodel.base_models import InputFormat
from docling.document_converter import PdfFormatOption, WordFormatOption, ImageFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, ImagePipelineOptions
from docling_core.types.doc.labels import GroupLabel, DocItemLabel
from docling.document_converter import (
PdfFormatOption, WordFormatOption, ImageFormatOption,
HTMLFormatOption, PowerpointFormatOption
)
from docling.datamodel.pipeline_options import PipelineOptions, PdfPipelineOptions
from docling.pipeline.simple_pipeline import SimplePipeline

class DocumentConverter:
"""A wrapper class for docling's DocumentConverter that handles file uploads and conversion.

This class provides a high-level interface for converting various document formats to markdown.
It supports the following formats:
- PDF files (with OCR and table structure recognition)
- Images (JPEG, PNG, GIF, WebP with OCR)
- Microsoft Word documents (DOCX)
- HTML files
- Microsoft PowerPoint presentations (PPTX)

The converter handles file type detection, validation, and cleanup automatically.
"""

SUPPORTED_FORMATS = {
'application/pdf': InputFormat.PDF,
'image/jpeg': InputFormat.IMAGE,
Expand All @@ -25,15 +43,22 @@ class DocumentConverter:
MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB

def __init__(self):
"""Initialize the DocumentConverter with format-specific options.

This sets up the docling converter with appropriate pipeline options for each format:
- PDF: OCR and table structure recognition enabled
- Images: Basic OCR enabled
- Word: Default options
- HTML: Default options
- PowerPoint: Default options with SimplePipeline
"""
# Configure PDF pipeline options
pdf_pipeline_options = PdfPipelineOptions()
pdf_pipeline_options.do_ocr = True # Enable OCR for scanned documents
pdf_pipeline_options.do_table_structure = True # Enable table structure recognition

# Configure image pipeline options
image_pipeline_options = ImagePipelineOptions()
image_pipeline_options.do_ocr = True # Enable OCR for images
image_pipeline_options.do_layout_analysis = True # Enable layout analysis for better structure detection
# Configure base pipeline options for other formats
base_pipeline_options = PipelineOptions()

# Create converter with format-specific options
self.converter = DoclingConverter(
Expand All @@ -46,45 +71,82 @@ def __init__(self):
],
format_options={
InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options),
InputFormat.IMAGE: ImageFormatOption(pipeline_options=image_pipeline_options),
InputFormat.DOCX: WordFormatOption(),
InputFormat.IMAGE: ImageFormatOption(pipeline_options=base_pipeline_options),
InputFormat.DOCX: WordFormatOption(pipeline_options=base_pipeline_options),
InputFormat.HTML: HTMLFormatOption(pipeline_options=base_pipeline_options),
InputFormat.PPTX: PowerpointFormatOption(pipeline_options=base_pipeline_options, pipeline_cls=SimplePipeline),
}
)

async def detect_file_type(self, file_path: Path) -> str:
    """Return the MIME type that python-magic reports for a file on disk.

    Args:
        file_path (Path): Location of the file to inspect

    Returns:
        str: The detected MIME type (e.g., 'application/pdf', 'image/jpeg')
    """
    detector = magic.Magic(mime=True)
    # The Path is converted to a plain string before being handed to python-magic.
    return detector.from_file(str(file_path))

def validate_file_size(self, file_size: int) -> None:
    """Validate that the file size is within acceptable limits.

    A file whose size is exactly MAX_FILE_SIZE is accepted; only strictly
    larger files are rejected.

    Args:
        file_size (int): Size of the file in bytes

    Raises:
        HTTPException: If the file size exceeds MAX_FILE_SIZE (413 Payload Too Large)
    """
    if file_size > self.MAX_FILE_SIZE:
        # True division always produces a float, so interpolating it directly
        # renders "10.0MB". The "g" presentation type drops the trailing ".0"
        # while still showing fractional limits (e.g. "1.5MB").
        limit_mb = self.MAX_FILE_SIZE / 1024 / 1024
        raise HTTPException(
            status_code=413,
            detail=f"File size exceeds maximum limit of {limit_mb:g}MB"
        )

def validate_file_type(self, mime_type: str) -> None:
    """Reject MIME types that have no entry in SUPPORTED_FORMATS.

    Args:
        mime_type (str): MIME type to validate

    Raises:
        HTTPException: If the MIME type is not in SUPPORTED_FORMATS (415 Unsupported Media Type)
    """
    # Guard clause: a known MIME type needs no further handling.
    if mime_type in self.SUPPORTED_FORMATS:
        return

    raise HTTPException(
        status_code=415,
        detail=f"Unsupported file type: {mime_type}"
    )

async def convert(self, file: UploadFile, save_path: Path) -> dict:
"""
Convert an uploaded file to markdown format.
"""Convert an uploaded file to markdown format.

This method handles the complete conversion process:
1. Reads and validates the uploaded file
2. Saves it temporarily to disk
3. Detects the file type
4. Converts the file to markdown using docling
5. Cleans up temporary files

Args:
file: The uploaded file
save_path: Path where the file should be temporarily saved
file (UploadFile): The uploaded file from FastAPI
save_path (Path): Path where the file should be temporarily saved

Returns:
dict: Contains markdown content and metadata
dict: A dictionary containing:
- content (str): The markdown content
- metadata (dict):
- original_file (str): Original filename
- mime_type (str): Detected MIME type
- file_size (int): Size in bytes

Raises:
HTTPException: If file validation fails or conversion errors occur
HTTPException:
- 413 Payload Too Large: If file size exceeds MAX_FILE_SIZE
- 415 Unsupported Media Type: If file type is not supported
- 500 Internal Server Error: If conversion fails
"""
try:
# Validate file size
Expand All @@ -100,7 +162,18 @@ async def convert(self, file: UploadFile, save_path: Path) -> dict:

# Convert document using the file path
result = self.converter.convert(str(save_path))
markdown_content = result.document.export_to_markdown()

# Handle PowerPoint files specially
if self.SUPPORTED_FORMATS[mime_type] == InputFormat.PPTX:
markdown_content = ""
for item, level in result.document.iterate_items(with_groups=True):
if hasattr(item, 'children'):
for child_ref in item.children:
child = child_ref.resolve(result.document)
if hasattr(child, 'text'):
markdown_content += child.text + "\n\n"
else:
markdown_content = result.document.export_to_markdown()

return {
"content": markdown_content,
Expand Down
56 changes: 54 additions & 2 deletions backend/app/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,37 @@
# Application instance. `description` is passed exactly once: a repeated
# keyword argument is a SyntaxError, so only the long-form text is kept.
# The text uses Markdown bullet syntax for the format and feature lists.
app = FastAPI(
    title=settings.app_name,
    version=settings.version,
    description="""
API for converting various document formats to markdown.

This service provides a simple way to convert documents from different formats
to markdown while preserving their structure and content. It supports:

* PDF files (with OCR for scanned documents)
* Images (JPEG, PNG, GIF, WebP with OCR)
* Microsoft Word documents (DOC, DOCX)
* HTML files (with table and list preservation)
* Microsoft PowerPoint presentations (PPTX)

Features:
* Automatic file type detection
* OCR for scanned documents and images
* Table structure recognition
* List and heading preservation
* Image extraction and embedding
* File size validation (max 10MB)

The API is designed to be simple to use with a single endpoint for conversion
and a health check endpoint for monitoring.
""",
    contact={
        "name": "OpenHands Team",
        "email": "openhands@all-hands.dev",
    },
    license_info={
        "name": "MIT",
        "url": "https://opensource.org/licenses/MIT",
    },
)

# Configure CORS
Expand All @@ -28,7 +58,29 @@
# `description` is passed exactly once; a repeated keyword argument is a
# SyntaxError, so only the more specific wording is kept.
@app.get(
    "/api/health",
    tags=["health"],
    summary="Health Check",
    description="Check if the service is healthy and ready to accept requests",
    response_description="Service health status",
    responses={
        200: {
            "description": "Service is healthy",
            "content": {
                "application/json": {
                    "example": {"status": "healthy"}
                }
            }
        }
    }
)
async def health_check():
    """Report that the service is up.

    The handler performs no dependency checks: it always returns the same
    static payload, so a successful response only shows that the application
    is running and that this route is reachable.

    Returns:
        dict: ``{"status": "healthy"}``
    """
    return {"status": "healthy"}
42 changes: 39 additions & 3 deletions backend/app/schemas/documents.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,44 @@
from typing import Dict
from pydantic import BaseModel, Field

class ConversionMetadata(BaseModel):
    """Details about the uploaded file that a conversion result refers to."""

    # All three fields are required (Ellipsis default).
    original_file: str = Field(
        ...,
        description="Original filename of the uploaded document"
    )
    mime_type: str = Field(
        ...,
        description="Detected MIME type of the document"
    )
    file_size: int = Field(
        ...,
        description="Size of the document in bytes"
    )

class ConversionResponse(BaseModel):
    """Response model for successful document conversion."""

    # Each field is declared once. The docstring above must stay the first
    # statement in the class body, otherwise it is not stored as __doc__.
    content: str = Field(
        ...,
        description="The converted markdown content",
        examples=["# Document Title\n\nThis is a paragraph.\n\n* List item 1\n* List item 2"]
    )
    # Typed sub-model rather than a bare dict, so the metadata keys are
    # validated and appear in the generated schema.
    metadata: ConversionMetadata = Field(
        ...,
        description="Additional information about the converted document"
    )

    # NOTE(review): `json_schema_extra` is the pydantic v2 key name, and under
    # v2 a class-based Config is deprecated in favour of `model_config`.
    # Confirm the pinned pydantic version before migrating.
    class Config:
        json_schema_extra = {
            "example": {
                "content": "# Sample Document\n\nThis is a paragraph.\n\n* List item 1\n* List item 2",
                "metadata": {
                    "original_file": "document.pdf",
                    "mime_type": "application/pdf",
                    "file_size": 12345
                }
            }
        }

class ErrorResponse(BaseModel):
    """Response model for conversion errors."""

    # `detail` is declared once, after the docstring, so the string above is
    # stored as the class docstring rather than being a stray expression.
    detail: str = Field(
        ...,
        description="Error message describing what went wrong",
        examples=[
            "File size exceeds maximum limit of 10MB",
            "Unsupported file type: application/x-binary",
            "Error during document conversion: Failed to parse PDF structure"
        ]
    )
2 changes: 2 additions & 0 deletions backend/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,5 @@ httpx>=0.24.1 # Required by TestClient
# Additional test dependencies
reportlab>=4.1.0 # For creating test PDFs
pillow>=10.4.0 # For creating test images
python-pptx>=0.6.21 # For creating test PowerPoint files
python-magic>=0.4.27 # Runtime dependency (not test-only): MIME type detection in app/core/converter.py
Loading