Skip to content

Commit

Permalink
Add debugging helper scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
jbarlow83 committed Jan 2, 2025
1 parent 522f9d5 commit 36c82e0
Show file tree
Hide file tree
Showing 3 changed files with 252 additions and 0 deletions.
123 changes: 123 additions & 0 deletions misc/ocrmypdf_compare.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MIT

"""Run OCRmyPDF on the same PDF with different options."""

from __future__ import annotations

import os
import shlex
from io import BytesIO
from pathlib import Path
from subprocess import check_output, run
from tempfile import TemporaryDirectory

import pikepdf
import pymupdf
import streamlit as st
from lxml import etree
from streamlit_pdf_viewer import pdf_viewer


def _command_inputs(label, index, workdir):
    """Render the command-line and environment inputs for one side.

    Draws two text areas for side *label* ("A" or "B") into the current
    Streamlit container and echoes the expanded command below them.

    Args:
        label: Letter shown in the widget captions.
        index: Number used in the widget keys and the output file name.
        workdir: Directory that holds ``input.pdf`` and receives the output.

    Returns:
        ``(args, env_text)``: the argument list with ``{in_}`` and ``{out}``
        replaced by paths inside *workdir*, and the raw environment text.
    """
    cli = st.text_area(
        f"Command line arguments for {label}",
        key=f"args{index}",
        value="ocrmypdf {in_} {out}",
    )
    env_text = st.text_area(f"Environment variables for {label}", key=f"env{index}")
    paths = {
        "in_": os.path.join(workdir, "input.pdf"),
        "out": os.path.join(workdir, f"output{index}.pdf"),
    }
    # Split first, substitute second: substituting before shlex.split() would
    # break any temporary path that contains spaces or backslashes.
    args = [word.format(**paths) for word in shlex.split(cli)]
    st.code(shlex.join(args))
    return args, env_text


def _child_env(env_text):
    """Return a copy of ``os.environ`` overlaid with the typed-in overrides.

    *env_text* is expected to be a Python dict expression such as
    ``{"OMP_THREAD_LIMIT": "1"}``; an empty box means no overrides.
    """
    # SECURITY: eval() executes whatever Python expression was typed into the
    # page. That is tolerable only because this is a local debugging helper;
    # do not expose this app to untrusted users without replacing it
    # (ast.literal_eval would cover plain dict literals).
    return dict(os.environ, **eval(env_text or "{}"))


def _show_page_text(doc1, doc2):
    """Write the extracted text of both documents, page by page, in two columns."""
    from itertools import zip_longest  # function scope: file-level imports stay as is

    # zip_longest pads the shorter document with None so that pages present
    # in only one output are still shown instead of being silently dropped.
    for page_no, pages in enumerate(zip_longest(doc1, doc2), start=1):
        st.write(f"Page {page_no}")
        for column, page in zip(st.columns(2), pages):
            with column, st.container(border=True):
                st.write(page.get_text() if page is not None else "(no such page)")


def main():
    """Streamlit page: run two command lines on one uploaded PDF and compare.

    Shows the metadata of the upload, lets the user edit a command line and
    environment overrides for sides A and B, runs both commands in a
    temporary directory and then displays the Ghostscript version seen by
    each environment, the per-page text and a viewer for both outputs.
    """
    st.set_page_config(layout="wide")

    st.title("OCRmyPDF Compare")
    st.write("Run OCRmyPDF on the same PDF with different options.")

    uploaded_pdf = st.file_uploader("Upload a PDF", type=["pdf"])
    if uploaded_pdf is None:
        return

    # getvalue() does not depend on the current stream position, and matches
    # how pdf_compare.py reads its uploads.
    pdf_bytes = uploaded_pdf.getvalue()

    with pikepdf.open(BytesIO(pdf_bytes)) as p, TemporaryDirectory() as d:
        with st.expander("PDF Metadata"):
            with p.open_metadata() as meta:
                xml_txt = str(meta)
                parser = etree.XMLParser(remove_blank_text=True)
                tree = etree.fromstring(xml_txt, parser=parser)
                st.code(
                    etree.tostring(tree, pretty_print=True).decode("utf-8"),
                    language="xml",
                )
            st.write(p.docinfo)
            st.write("Number of pages:", len(p.pages))

        col1, col2 = st.columns(2)
        with col1:
            args1, env_text1 = _command_inputs("A", 1, d)
        with col2:
            args2, env_text2 = _command_inputs("B", 2, d)

        if not st.button("Execute and Compare"):
            return

        # Evaluate each environment once and reuse it for every subprocess.
        env1 = _child_env(env_text1)
        env2 = _child_env(env_text2)
        output1 = os.path.join(d, "output1.pdf")
        output2 = os.path.join(d, "output2.pdf")

        with st.spinner("Executing..."):
            Path(d, "input.pdf").write_bytes(pdf_bytes)
            for label, args, env in (("A", args1, env1), ("B", args2, env2)):
                status = run(args, env=env).returncode
                if status != 0:
                    # Only a warning: a non-zero status does not by itself
                    # prove that no output file was written.
                    st.warning(f"Command {label} exited with status {status}")

        col1, col2 = st.columns(2)
        with col1:
            st.text(
                "Ghostscript version A: "
                + check_output(["gs", "--version"], env=env1, text=True)
            )
        with col2:
            st.text(
                "Ghostscript version B: "
                + check_output(["gs", "--version"], env=env2, text=True)
            )

        # Without this check a failed command surfaces later as an opaque
        # file-open error from pymupdf.
        missing = [
            label
            for label, path in (("A", output1), ("B", output2))
            if not os.path.exists(path)
        ]
        if missing:
            st.error("No output PDF was produced by: " + ", ".join(missing))
            return

        # Close both documents before the temporary directory is removed.
        with pymupdf.open(output1) as doc1, pymupdf.open(output2) as doc2:
            _show_page_text(doc1, doc2)

        col1, col2 = st.columns(2)
        with col1, st.expander("PDF Viewer"):
            pdf_viewer(Path(output1), key="pdf_viewer1")
        with col2, st.expander("PDF Viewer"):
            pdf_viewer(Path(output2), key="pdf_viewer2")


# Build the page only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
83 changes: 83 additions & 0 deletions misc/pdf_compare.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MIT

"""Compare two PDFs."""

from __future__ import annotations

import os
from io import BytesIO
from pathlib import Path
from tempfile import TemporaryDirectory

import pikepdf
import pymupdf
import streamlit as st
from lxml import etree
from streamlit_pdf_viewer import pdf_viewer


def do_metadata(pdf):
    """Show a PDF's XMP metadata, document info and page count.

    *pdf* is handed straight to ``pikepdf.open``; the caller in this file
    passes a ``BytesIO`` wrapping the uploaded bytes. Everything is written
    into whichever Streamlit container is currently active.
    """
    with pikepdf.open(pdf) as doc:
        with doc.open_metadata() as xmp:
            # Parse the XMP text so it can be serialised again with
            # consistent indentation.
            root = etree.fromstring(
                str(xmp), parser=etree.XMLParser(remove_blank_text=True)
            )
            pretty_xml = etree.tostring(root, pretty_print=True).decode("utf-8")
            st.code(pretty_xml, language="xml")
        st.write(doc.docinfo)
        st.write("Number of pages:", len(doc.pages))


def main():
    """Streamlit page: upload two PDFs and compare them side by side.

    Once both files are uploaded the page shows three expanders: the
    metadata of each file, the text extracted from each page, and an
    embedded viewer for each file.
    """
    from itertools import zip_longest  # function scope: file-level imports stay as is

    st.set_page_config(layout="wide")

    st.title("PDF Compare")
    st.write("Compare two PDFs.")

    col1, col2 = st.columns(2)
    with col1:
        uploaded_pdf1 = st.file_uploader("Upload a PDF", type=["pdf"], key='pdf1')
    with col2:
        uploaded_pdf2 = st.file_uploader("Upload a PDF", type=["pdf"], key='pdf2')
    if uploaded_pdf1 is None or uploaded_pdf2 is None:
        return

    pdf_bytes1 = uploaded_pdf1.getvalue()
    pdf_bytes2 = uploaded_pdf2.getvalue()

    with st.expander("PDF Metadata"):
        col1, col2 = st.columns(2)
        with col1:
            do_metadata(BytesIO(pdf_bytes1))
        with col2:
            do_metadata(BytesIO(pdf_bytes2))

    with TemporaryDirectory() as d:
        # The text extractor and the viewer are both given file paths, so
        # the uploads are written to disk first.
        Path(d, "1.pdf").write_bytes(pdf_bytes1)
        Path(d, "2.pdf").write_bytes(pdf_bytes2)

        with st.expander("Text"):
            # Context managers close both documents before the temporary
            # directory is removed.
            with (
                pymupdf.open(os.path.join(d, "1.pdf")) as doc1,
                pymupdf.open(os.path.join(d, "2.pdf")) as doc2,
            ):
                # zip_longest pads the shorter document with None, so pages
                # that exist in only one file still show up in the comparison.
                for page_no, pages in enumerate(zip_longest(doc1, doc2), start=1):
                    st.write(f"Page {page_no}")
                    for column, page in zip(st.columns(2), pages):
                        with column, st.container(border=True):
                            st.write(
                                page.get_text()
                                if page is not None
                                else "(no such page)"
                            )

        with st.expander("PDF Viewer"):
            col1, col2 = st.columns(2)
            with col1:
                pdf_viewer(Path(d, "1.pdf"), key='pdf_viewer1', render_text=True)
            with col2:
                pdf_viewer(Path(d, "2.pdf"), key='pdf_viewer2', render_text=True)


# Build the page only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
46 changes: 46 additions & 0 deletions misc/pdf_text_diff.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Compare text in PDFs."""

from __future__ import annotations

from subprocess import run
from tempfile import NamedTemporaryFile
from typing import Annotated

import typer


def _extract_text(pdf) -> bytes:
    """Return the output of ``pdftotext -layout`` for the open binary file *pdf*."""
    # '-' '-' makes pdftotext read the PDF from stdin and write text to stdout.
    result = run(
        ['pdftotext', '-layout', '-', '-'], stdin=pdf, capture_output=True, check=True
    )
    return result.stdout


def main(
    pdf1: Annotated[typer.FileBinaryRead, typer.Argument()],
    pdf2: Annotated[typer.FileBinaryRead, typer.Argument()],
    engine: Annotated[str, typer.Option()] = 'pdftotext',
):
    """Compare text in PDFs."""
    # The docstring above is kept to one line on purpose: Typer shows it as
    # the command's --help text.
    #
    # Extracts the text of both PDFs, shows a coloured side-by-side diff in
    # `less`, and ends with exit status 1 when the texts differ (ignoring
    # leading/trailing whitespace) or 0 when they match.
    #
    # NOTE(review): `engine` is accepted but not used; extraction always goes
    # through pdftotext. Confirm whether other engines were intended.
    text1 = _extract_text(pdf1)
    text2 = _extract_text(pdf2)

    # diff needs real files, so both texts go into named temporary files.
    with NamedTemporaryFile() as f1, NamedTemporaryFile() as f2:
        f1.write(text1)
        f1.flush()
        f2.write(text2)
        f2.flush()
        diff = run(
            ['diff', '--color=always', '--side-by-side', f1.name, f2.name],
            capture_output=True,
        )
    # diff exits 0 (same) or 1 (different) on success; anything higher is a
    # real failure and would otherwise show up as an empty pager.
    if diff.returncode > 1:
        diff.check_returncode()

    run(['less', '-R'], input=diff.stdout, check=True)

    if text1.strip() != text2.strip():
        # typer.run() discards the command's return value, so a plain
        # `return 1` never reaches the shell; typer.Exit sets the status.
        raise typer.Exit(code=1)

    return 0


# When executed as a script, let Typer run main as the command-line entry point.
if __name__ == '__main__':
    typer.run(main)

0 comments on commit 36c82e0

Please sign in to comment.