From 36c82e0659db14effdd0b3a95ab758027a7f9f9a Mon Sep 17 00:00:00 2001
From: "James R. Barlow"
Date: Wed, 1 Jan 2025 18:03:15 -0800
Subject: [PATCH] Add debugging helper scripts

---
 misc/ocrmypdf_compare.py | 134 +++++++++++++++++++++++++++++++++++++++
 misc/pdf_compare.py      |  92 +++++++++++++++++++++++++++
 misc/pdf_text_diff.py    |  57 +++++++++++++++++
 3 files changed, 283 insertions(+)
 create mode 100644 misc/ocrmypdf_compare.py
 create mode 100644 misc/pdf_compare.py
 create mode 100644 misc/pdf_text_diff.py

diff --git a/misc/ocrmypdf_compare.py b/misc/ocrmypdf_compare.py
new file mode 100644
index 000000000..c504c9b8e
--- /dev/null
+++ b/misc/ocrmypdf_compare.py
@@ -0,0 +1,134 @@
+# SPDX-FileCopyrightText: 2025 James R. Barlow
+# SPDX-License-Identifier: MIT
+
+"""Run OCRmyPDF on the same PDF with different options."""
+
+from __future__ import annotations
+
+import os
+import shlex
+from io import BytesIO
+from pathlib import Path
+from subprocess import check_output, run
+from tempfile import TemporaryDirectory
+
+import pikepdf
+import pymupdf
+import streamlit as st
+from lxml import etree
+from streamlit_pdf_viewer import pdf_viewer
+
+
+def _parse_env(text):
+    """Return os.environ overlaid with the mapping expression in *text*.
+
+    An empty string means no overrides.
+    """
+    # NOTE(review): eval() runs whatever is typed into the text area. Kept
+    # because this is a local debugging tool; never expose this app to
+    # untrusted users.
+    return dict(os.environ, **eval(text or "{}"))
+
+
+def _side_inputs(d, label, index):
+    """Show the inputs for one side and return (argv, environment text)."""
+    cli = st.text_area(
+        f"Command line arguments for {label}",
+        key=f"args{index}",
+        value="ocrmypdf {in_} {out}",
+    )
+    env_text = st.text_area(f"Environment variables for {label}", key=f"env{index}")
+    args = shlex.split(
+        cli.format(
+            in_=os.path.join(d, "input.pdf"),
+            out=os.path.join(d, f"output{index}.pdf"),
+        )
+    )
+    st.code(shlex.join(args))
+    return args, env_text
+
+
+def main():
+    """Run two OCRmyPDF command lines on one uploaded PDF and compare them."""
+    st.set_page_config(layout="wide")
+
+    st.title("OCRmyPDF Compare")
+    st.write("Run OCRmyPDF on the same PDF with different options.")
+
+    uploaded_pdf = st.file_uploader("Upload a PDF", type=["pdf"])
+    if uploaded_pdf is None:
+        return
+
+    # getvalue() returns the whole upload regardless of the read position.
+    pdf_bytes = uploaded_pdf.getvalue()
+
+    with pikepdf.open(BytesIO(pdf_bytes)) as p, TemporaryDirectory() as d:
+        with st.expander("PDF Metadata"):
+            with p.open_metadata() as meta:
+                xml_txt = str(meta)
+                parser = etree.XMLParser(remove_blank_text=True)
+                tree = etree.fromstring(xml_txt, parser=parser)
+                st.code(
+                    etree.tostring(tree, pretty_print=True).decode("utf-8"),
+                    language="xml",
+                )
+            st.write(p.docinfo)
+            st.write("Number of pages:", len(p.pages))
+
+        col1, col2 = st.columns(2)
+        with col1:
+            args1, env_text1 = _side_inputs(d, "A", 1)
+        with col2:
+            args2, env_text2 = _side_inputs(d, "B", 2)
+
+        if not st.button("Execute and Compare"):
+            return
+
+        # Evaluate each environment once and reuse it for every subprocess.
+        env1 = _parse_env(env_text1)
+        env2 = _parse_env(env_text2)
+        out1 = Path(d, "output1.pdf")
+        out2 = Path(d, "output2.pdf")
+
+        with st.spinner("Executing..."):
+            Path(d, "input.pdf").write_bytes(pdf_bytes)
+            for label, args, env in (("A", args1, env1), ("B", args2, env2)):
+                returncode = run(args, env=env).returncode
+                if returncode != 0:
+                    st.warning(f"Command {label} exited with status {returncode}")
+
+            col1, col2 = st.columns(2)
+            with col1:
+                st.text(
+                    "Ghostscript version A: "
+                    + check_output(["gs", "--version"], env=env1, text=True)
+                )
+            with col2:
+                st.text(
+                    "Ghostscript version B: "
+                    + check_output(["gs", "--version"], env=env2, text=True)
+                )
+
+        if not (out1.exists() and out2.exists()):
+            st.error("At least one command did not produce an output PDF.")
+            return
+
+        # Close both documents before the temporary directory is removed.
+        with pymupdf.open(str(out1)) as doc1, pymupdf.open(str(out2)) as doc2:
+            for i, (page1, page2) in enumerate(zip(doc1, doc2), start=1):
+                st.write(f"Page {i}")
+                col1, col2 = st.columns(2)
+                with col1, st.container(border=True):
+                    st.write(page1.get_text())
+                with col2, st.container(border=True):
+                    st.write(page2.get_text())
+
+        col1, col2 = st.columns(2)
+        with col1, st.expander("PDF Viewer"):
+            pdf_viewer(out1)
+        with col2, st.expander("PDF Viewer"):
+            pdf_viewer(out2)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/misc/pdf_compare.py b/misc/pdf_compare.py
new file mode 100644
index 000000000..d2807bbb9
--- /dev/null
+++ b/misc/pdf_compare.py
@@ -0,0 +1,92 @@
+# SPDX-FileCopyrightText: 2025 James R. Barlow
+# SPDX-License-Identifier: MIT
+
+"""Compare two PDFs."""
+
+from __future__ import annotations
+
+import os
+from io import BytesIO
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+import pikepdf
+import pymupdf
+import streamlit as st
+from lxml import etree
+from streamlit_pdf_viewer import pdf_viewer
+
+
+def do_metadata(pdf):
+    """Render the XMP metadata, docinfo and page count of a PDF.
+
+    *pdf* is passed straight to ``pikepdf.open``; ``main`` supplies a BytesIO.
+    """
+    # Bind the opened document to its own name instead of shadowing *pdf*.
+    with pikepdf.open(pdf) as opened:
+        with opened.open_metadata() as meta:
+            xml_txt = str(meta)
+            parser = etree.XMLParser(remove_blank_text=True)
+            tree = etree.fromstring(xml_txt, parser=parser)
+            st.code(
+                etree.tostring(tree, pretty_print=True).decode("utf-8"),
+                language="xml",
+            )
+        st.write(opened.docinfo)
+        st.write("Number of pages:", len(opened.pages))
+
+
+def main():
+    """Upload two PDFs and show their metadata, text and pages side by side."""
+    st.set_page_config(layout="wide")
+
+    st.title("PDF Compare")
+    st.write("Compare two PDFs.")
+
+    col1, col2 = st.columns(2)
+    with col1:
+        uploaded_pdf1 = st.file_uploader("Upload a PDF", type=["pdf"], key='pdf1')
+    with col2:
+        uploaded_pdf2 = st.file_uploader("Upload a PDF", type=["pdf"], key='pdf2')
+    if uploaded_pdf1 is None or uploaded_pdf2 is None:
+        return
+
+    pdf_bytes1 = uploaded_pdf1.getvalue()
+    pdf_bytes2 = uploaded_pdf2.getvalue()
+
+    with st.expander("PDF Metadata"):
+        col1, col2 = st.columns(2)
+        with col1:
+            do_metadata(BytesIO(pdf_bytes1))
+        with col2:
+            do_metadata(BytesIO(pdf_bytes2))
+
+    with TemporaryDirectory() as d:
+        path1 = os.path.join(d, "1.pdf")
+        path2 = os.path.join(d, "2.pdf")
+        Path(path1).write_bytes(pdf_bytes1)
+        Path(path2).write_bytes(pdf_bytes2)
+
+        with st.expander("Text"):
+            # Close both documents before the temporary directory is removed.
+            with pymupdf.open(path1) as doc1, pymupdf.open(path2) as doc2:
+                if doc1.page_count != doc2.page_count:
+                    st.warning("Page counts differ; only the common pages are shown.")
+                for i, (page1, page2) in enumerate(zip(doc1, doc2), start=1):
+                    st.write(f"Page {i}")
+                    col1, col2 = st.columns(2)
+                    with col1, st.container(border=True):
+                        st.write(page1.get_text())
+                    with col2, st.container(border=True):
+                        st.write(page2.get_text())
+
+        with st.expander("PDF Viewer"):
+            col1, col2 = st.columns(2)
+            with col1:
+                pdf_viewer(Path(path1), key='pdf_viewer1', render_text=True)
+            with col2:
+                pdf_viewer(Path(path2), key='pdf_viewer2', render_text=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/misc/pdf_text_diff.py b/misc/pdf_text_diff.py
new file mode 100644
index 000000000..5b93ec23c
--- /dev/null
+++ b/misc/pdf_text_diff.py
@@ -0,0 +1,57 @@
+# SPDX-FileCopyrightText: 2025 James R. Barlow
+# SPDX-License-Identifier: MPL-2.0
+
+"""Compare text in PDFs."""
+
+from __future__ import annotations
+
+from subprocess import run
+from tempfile import NamedTemporaryFile
+from typing import Annotated
+
+import typer
+
+
+def _extract_text(pdf) -> bytes:
+    """Return the text of an open PDF file as extracted by pdftotext -layout."""
+    result = run(
+        ['pdftotext', '-layout', '-', '-'], stdin=pdf, capture_output=True, check=True
+    )
+    return result.stdout
+
+
+def main(
+    pdf1: Annotated[typer.FileBinaryRead, typer.Argument()],
+    pdf2: Annotated[typer.FileBinaryRead, typer.Argument()],
+    engine: Annotated[str, typer.Option()] = 'pdftotext',
+):
+    """Compare text in PDFs.
+
+    Shows a side-by-side diff of the extracted text in a pager, then exits
+    with status 1 if the text differs and 0 otherwise.
+    """
+    # NOTE(review): ``engine`` is accepted but not used yet; text is always
+    # extracted with pdftotext.
+    text1 = _extract_text(pdf1)
+    text2 = _extract_text(pdf2)
+
+    with NamedTemporaryFile() as f1, NamedTemporaryFile() as f2:
+        f1.write(text1)
+        f1.flush()
+        f2.write(text2)
+        f2.flush()
+        # diff exits non-zero when the inputs differ, so check=True is not used.
+        diff = run(
+            ['diff', '--color=always', '--side-by-side', f1.name, f2.name],
+            capture_output=True,
+        )
+        run(['less', '-R'], input=diff.stdout, check=True)
+
+    if text1.strip() != text2.strip():
+        # Typer ignores a command's return value, so the exit status must be
+        # set by raising typer.Exit.
+        raise typer.Exit(code=1)
+
+
+if __name__ == '__main__':
+    typer.run(main)