From 36c82e0659db14effdd0b3a95ab758027a7f9f9a Mon Sep 17 00:00:00 2001
From: "James R. Barlow" <james@purplerock.ca>
Date: Wed, 1 Jan 2025 18:03:15 -0800
Subject: [PATCH] Add debugging helper scripts

---
 misc/ocrmypdf_compare.py | 123 +++++++++++++++++++++++++++++++++++++++
 misc/pdf_compare.py      |  83 ++++++++++++++++++++++++++
 misc/pdf_text_diff.py    |  46 +++++++++++++++
 3 files changed, 252 insertions(+)
 create mode 100644 misc/ocrmypdf_compare.py
 create mode 100644 misc/pdf_compare.py
 create mode 100644 misc/pdf_text_diff.py

diff --git a/misc/ocrmypdf_compare.py b/misc/ocrmypdf_compare.py
new file mode 100644
index 000000000..c504c9b8e
--- /dev/null
+++ b/misc/ocrmypdf_compare.py
@@ -0,0 +1,161 @@
+# SPDX-FileCopyrightText: 2025 James R. Barlow
+# SPDX-License-Identifier: MIT
+
+"""Run OCRmyPDF on the same PDF with different options."""
+
+from __future__ import annotations
+
+import os
+import shlex
+from io import BytesIO
+from itertools import zip_longest
+from pathlib import Path
+from subprocess import check_output, run
+from tempfile import TemporaryDirectory
+
+import pikepdf
+import pymupdf
+import streamlit as st
+from lxml import etree
+from streamlit_pdf_viewer import pdf_viewer
+
+
+def _command_inputs(side, number, workdir):
+    """Render the command/environment inputs for one side of the comparison.
+
+    ``side`` is the label shown to the user ("A" or "B"); ``number`` is the
+    suffix of the widget keys and of the output file name. Returns the
+    ``(args, env_text)`` pair: the split command line and the raw text of the
+    environment box.
+    """
+    cli = st.text_area(
+        f"Command line arguments for {side}",
+        key=f"args{number}",
+        value="ocrmypdf {in_} {out}",
+    )
+    env_text = st.text_area(f"Environment variables for {side}", key=f"env{number}")
+    # Quote the substituted paths so each one survives shlex.split() as a
+    # single argument even if the temporary directory contains spaces or
+    # backslashes.
+    args = shlex.split(
+        cli.format(
+            in_=shlex.quote(os.path.join(workdir, "input.pdf")),
+            out=shlex.quote(os.path.join(workdir, f"output{number}.pdf")),
+        )
+    )
+    st.code(shlex.join(args))
+    return args, env_text
+
+
+def _build_env(env_text):
+    """Return a copy of ``os.environ`` overlaid with the user's variables.
+
+    ``env_text`` is the content of an "Environment variables" box: either
+    empty or a Python expression evaluating to a ``{name: value}`` mapping.
+    """
+    # NOTE(security): eval() runs whatever Python expression was typed into the
+    # text area. This is tolerable only for a debugging helper used locally by
+    # its own operator; do not expose this app to untrusted users.
+    return dict(os.environ, **eval(env_text or "{}"))
+
+
+def _page_text(page):
+    """Return the text of ``page``, or a placeholder when the page is missing."""
+    return page.get_text() if page is not None else "*(no such page)*"
+
+
+def main():
+    """Run two OCRmyPDF command lines on one uploaded PDF and compare them.
+
+    The page shows the metadata of the uploaded PDF, lets the user edit a
+    command line template and extra environment variables for sides A and B,
+    and on "Execute and Compare" runs both commands against a copy of the
+    upload in a temporary directory. The Ghostscript version seen by each
+    environment, the text of every output page and the output PDFs are then
+    displayed side by side.
+    """
+    st.set_page_config(layout="wide")
+
+    st.title("OCRmyPDF Compare")
+    st.write("Run OCRmyPDF on the same PDF with different options.")
+
+    uploaded_pdf = st.file_uploader("Upload a PDF", type=["pdf"])
+    if uploaded_pdf is None:
+        return
+
+    # getvalue() returns the whole upload regardless of the stream position.
+    pdf_bytes = uploaded_pdf.getvalue()
+
+    with pikepdf.open(BytesIO(pdf_bytes)) as p, TemporaryDirectory() as d:
+        with st.expander("PDF Metadata"):
+            with p.open_metadata() as meta:
+                xml_txt = str(meta)
+                parser = etree.XMLParser(remove_blank_text=True)
+                tree = etree.fromstring(xml_txt, parser=parser)
+                st.code(
+                    etree.tostring(tree, pretty_print=True).decode("utf-8"),
+                    language="xml",
+                )
+            st.write(p.docinfo)
+            st.write("Number of pages:", len(p.pages))
+
+        col1, col2 = st.columns(2)
+        with col1:
+            args1, env_text1 = _command_inputs("A", 1, d)
+        with col2:
+            args2, env_text2 = _command_inputs("B", 2, d)
+
+        if not st.button("Execute and Compare"):
+            return
+        with st.spinner("Executing..."):
+            Path(d, "input.pdf").write_bytes(pdf_bytes)
+            # Evaluate each environment once and reuse it for both the OCR run
+            # and the Ghostscript version probe.
+            env1 = _build_env(env_text1)
+            env2 = _build_env(env_text2)
+            for side, args, env in (("A", args1, env1), ("B", args2, env2)):
+                status = run(args, env=env).returncode
+                if status != 0:
+                    st.warning(f"Command {side} exited with status {status}")
+
+            col1, col2 = st.columns(2)
+            with col1:
+                st.text(
+                    "Ghostscript version A: "
+                    + check_output(["gs", "--version"], env=env1, text=True)
+                )
+            with col2:
+                st.text(
+                    "Ghostscript version B: "
+                    + check_output(["gs", "--version"], env=env2, text=True)
+                )
+
+            out1 = Path(d, "output1.pdf")
+            out2 = Path(d, "output2.pdf")
+            missing = [out.name for out in (out1, out2) if not out.exists()]
+            if missing:
+                # A failed command leaves nothing to compare; say so instead
+                # of failing later while opening a nonexistent file.
+                st.error("No output was produced: " + ", ".join(missing))
+                return
+
+            # Close both documents before the temporary directory is removed.
+            with pymupdf.open(str(out1)) as doc1, pymupdf.open(str(out2)) as doc2:
+                # zip_longest keeps pages that exist in only one output visible.
+                for i, (page1, page2) in enumerate(zip_longest(doc1, doc2)):
+                    st.write(f"Page {i+1}")
+                    col1, col2 = st.columns(2)
+                    with col1, st.container(border=True):
+                        st.write(_page_text(page1))
+                    with col2, st.container(border=True):
+                        st.write(_page_text(page2))
+
+            col1, col2 = st.columns(2)
+            with col1, st.expander("PDF Viewer"):
+                pdf_viewer(out1, key="pdf_viewer1")
+            with col2, st.expander("PDF Viewer"):
+                pdf_viewer(out2, key="pdf_viewer2")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/misc/pdf_compare.py b/misc/pdf_compare.py
new file mode 100644
index 000000000..d2807bbb9
--- /dev/null
+++ b/misc/pdf_compare.py
@@ -0,0 +1,103 @@
+# SPDX-FileCopyrightText: 2025 James R. Barlow
+# SPDX-License-Identifier: MIT
+
+"""Compare two PDFs."""
+
+from __future__ import annotations
+
+import os
+from io import BytesIO
+from itertools import zip_longest
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+import pikepdf
+import pymupdf
+import streamlit as st
+from lxml import etree
+from streamlit_pdf_viewer import pdf_viewer
+
+
+def do_metadata(pdf):
+    """Display the XMP metadata, document info and page count of a PDF.
+
+    ``pdf`` is passed straight to :func:`pikepdf.open`; the callers in this
+    script supply an in-memory binary stream.
+    """
+    # Bind the opened document to its own name rather than rebinding ``pdf``.
+    with pikepdf.open(pdf) as doc:
+        with doc.open_metadata() as meta:
+            xml_txt = str(meta)
+            # Drop blank text nodes so pretty_print can re-indent the XML.
+            parser = etree.XMLParser(remove_blank_text=True)
+            tree = etree.fromstring(xml_txt, parser=parser)
+            st.code(
+                etree.tostring(tree, pretty_print=True).decode("utf-8"),
+                language="xml",
+            )
+        st.write(doc.docinfo)
+        st.write("Number of pages:", len(doc.pages))
+
+
+def _page_text(page):
+    """Return the text of ``page``, or a placeholder when the page is missing."""
+    return page.get_text() if page is not None else "*(no such page)*"
+
+
+def main():
+    """Show two uploaded PDFs side by side.
+
+    Once both files are uploaded the page displays, in two columns, their
+    metadata, the extracted text of each page and a rendered viewer.
+    """
+    st.set_page_config(layout="wide")
+
+    st.title("PDF Compare")
+    st.write("Compare two PDFs.")
+
+    col1, col2 = st.columns(2)
+    with col1:
+        uploaded_pdf1 = st.file_uploader("Upload a PDF", type=["pdf"], key='pdf1')
+    with col2:
+        uploaded_pdf2 = st.file_uploader("Upload a PDF", type=["pdf"], key='pdf2')
+    if uploaded_pdf1 is None or uploaded_pdf2 is None:
+        return
+
+    pdf_bytes1 = uploaded_pdf1.getvalue()
+    pdf_bytes2 = uploaded_pdf2.getvalue()
+
+    with st.expander("PDF Metadata"):
+        col1, col2 = st.columns(2)
+        with col1:
+            do_metadata(BytesIO(pdf_bytes1))
+        with col2:
+            do_metadata(BytesIO(pdf_bytes2))
+
+    with TemporaryDirectory() as d:
+        path1 = os.path.join(d, "1.pdf")
+        path2 = os.path.join(d, "2.pdf")
+        Path(path1).write_bytes(pdf_bytes1)
+        Path(path2).write_bytes(pdf_bytes2)
+
+        with st.expander("Text"):
+            # Close both documents before the temporary directory is removed.
+            with pymupdf.open(path1) as doc1, pymupdf.open(path2) as doc2:
+                # zip_longest keeps pages that exist in only one PDF visible.
+                for i, (page1, page2) in enumerate(zip_longest(doc1, doc2)):
+                    st.write(f"Page {i+1}")
+                    col1, col2 = st.columns(2)
+                    with col1, st.container(border=True):
+                        st.write(_page_text(page1))
+                    with col2, st.container(border=True):
+                        st.write(_page_text(page2))
+
+        with st.expander("PDF Viewer"):
+            col1, col2 = st.columns(2)
+            with col1:
+                pdf_viewer(Path(path1), key='pdf_viewer1', render_text=True)
+            with col2:
+                pdf_viewer(Path(path2), key='pdf_viewer2', render_text=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/misc/pdf_text_diff.py b/misc/pdf_text_diff.py
new file mode 100644
index 000000000..5b93ec23c
--- /dev/null
+++ b/misc/pdf_text_diff.py
@@ -0,0 +1,56 @@
+# SPDX-FileCopyrightText: 2025 James R. Barlow
+# SPDX-License-Identifier: MPL-2.0
+
+"""Compare text in PDFs."""
+
+from __future__ import annotations
+
+from subprocess import run
+from tempfile import NamedTemporaryFile
+from typing import Annotated
+
+import typer
+
+
+def main(
+    pdf1: Annotated[typer.FileBinaryRead, typer.Argument()],
+    pdf2: Annotated[typer.FileBinaryRead, typer.Argument()],
+    engine: Annotated[
+        str, typer.Option(help="pdftotext-compatible executable to extract text with")
+    ] = 'pdftotext',
+):
+    """Compare text in PDFs."""
+    # Typer shows the docstring as the command's help text, so the details
+    # are kept in comments. Each PDF is piped to the extractor on stdin and
+    # its text is captured from stdout; check=True aborts if extraction fails.
+    extract = [engine, '-layout', '-', '-']
+    text1 = run(extract, stdin=pdf1, capture_output=True, check=True)
+    text2 = run(extract, stdin=pdf2, capture_output=True, check=True)
+
+    # diff needs named files, so spill both texts to temporary files.
+    with NamedTemporaryFile() as f1, NamedTemporaryFile() as f2:
+        f1.write(text1.stdout)
+        f1.flush()
+        f2.write(text2.stdout)
+        f2.flush()
+        diff = run(
+            ['diff', '--color=always', '--side-by-side', f1.name, f2.name],
+            capture_output=True,
+        )
+
+    # diff exits with 0 (identical) or 1 (different); anything else is a failure.
+    if diff.returncode > 1:
+        typer.echo(diff.stderr.decode(errors='replace'), err=True)
+        raise typer.Exit(code=2)
+
+    run(['less', '-R'], input=diff.stdout, check=True)
+    if text1.stdout.strip() != text2.stdout.strip():
+        # Typer ignores the command's return value, so a nonzero exit status
+        # has to be requested by raising typer.Exit.
+        raise typer.Exit(code=1)
+
+    return 0
+
+
+if __name__ == '__main__':
+    typer.run(main)