From 9c669e94ce4f6afe213509580e6d700b654b4ec1 Mon Sep 17 00:00:00 2001 From: James Braza Date: Tue, 24 Sep 2024 11:19:07 -0700 Subject: [PATCH] Fixing `pymupdf.mupdf.FzErrorFormat` crash by recasting as an `ImpossibleParsingError` (#474) --- paperqa/readers.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/paperqa/readers.py b/paperqa/readers.py index 496b1f62..db3e9513 100644 --- a/paperqa/readers.py +++ b/paperqa/readers.py @@ -21,7 +21,14 @@ def parse_pdf_to_pages(path: Path) -> ParsedText: total_length = 0 for i in range(file.page_count): - page = file.load_page(i) + try: + page = file.load_page(i) + except pymupdf.mupdf.FzErrorFormat as exc: + raise ImpossibleParsingError( + f"Page loading via {pymupdf.__name__} failed on page {i} of" + f" {file.page_count} for the PDF at path {path}, likely this PDF" + " file is corrupt" + ) from exc pages[str(i + 1)] = page.get_text("text", sort=True) total_length += len(pages[str(i + 1)])