Merge pull request #24 from bytinbit/refactor/switch-to-pikepdf

Switch from PyPDF2 to pikepdf
bytinbit · Jan 11, 2021 · c5170f9 · c5170f9
2 parents a8b1378 + 8cada69
commit c5170f9
Show file tree

Hide file tree

Showing 7 changed files with 57 additions and 64 deletions.
diff --git a/nobubo/calc.py b/nobubo/calc.py
@@ -24,7 +24,7 @@
 from dataclasses import dataclass
 from typing import List
 
-import PyPDF2
+import pikepdf
 
 from nobubo import core
 
@@ -41,23 +41,21 @@ class Factor:
 def parse_cli_input(input_layout: (int, int, int), output_layout_cli: str, print_margin: int,
                     reverse_assembly: bool, input_path: str, output_path: str
                     ) -> (core.InputProperties, core.OutputProperties):
-    with open(pathlib.Path(input_path), "rb") as inputfile:
-        reader = PyPDF2.PdfFileReader(inputfile, strict=False)
-
-        width, height = calculate_page_dimensions(
-            reader.getPage(1))  # first page (getPage(0)) may contain overview
+    with pikepdf.open(pathlib.Path(input_path)) as inputfile:
+        # the first page (pages[0]) may contain an overview, so measure the second one (pages[1])
+        width, height = calculate_page_dimensions(inputfile.pages[1])
         input_properties = core.InputProperties(
             input_filepath=pathlib.Path(input_path),
             output_path=pathlib.Path(output_path),
-            number_of_pages=reader.getNumPages(),
+            number_of_pages=len(inputfile.pages),
             pagesize=core.PageSize(width=width, height=height),
             layout=parse_input_layouts(input_layout),
             reverse_assembly=reverse_assembly)
 
         output_properties = core.OutputProperties(output_path=pathlib.Path(output_path),
                                                   output_layout=parse_output_layout(output_layout_cli, print_margin),
                                                   )
-        return input_properties, output_properties
+    return input_properties, output_properties
 
 
 def parse_input_layouts(input_layout: (int, int, int)) ->[core.Layout]:
@@ -83,15 +81,17 @@ def calculate_pages_needed(layout: core.Layout, n_up_factor: Factor) -> int:
     return math.ceil(layout.columns/n_up_factor.x) * math.ceil(layout.rows/n_up_factor.y)
 
 
-def calculate_page_dimensions(page: PyPDF2.pdf.PageObject) -> (float, float):
+def calculate_page_dimensions(page: pikepdf.Page) -> (float, float):
     """
     Calculates the x, y value for the offset in default user space units as defined in the pdf standard.
-    Uses the cropBox value, since this is the area visible to the printer.
-    :param page: A pattern page.
+    :param page: A PDF page.
     :return: list with x, y value.
     """
-    return round(float(page.cropBox[2])-float(page.cropBox[0]), 2), \
-           round(float(page.cropBox[3])-float(page.cropBox[1]), 2)
+    if not hasattr(page, "CropBox"):
+        box = page.MediaBox
+    else:
+        box = page.CropBox
+    return round(float(box[2])-float(box[0]), 2), round(float(box[3])-float(box[1]), 2)
 
 
 def convert_to_userspaceunits(width_height: [int, int]) -> core.PageSize:

diff --git a/nobubo/disassembly.py b/nobubo/disassembly.py
@@ -22,7 +22,7 @@
 from copy import copy
 import pathlib
 
-import PyPDF2
+from pikepdf import Pdf, Page
 
 from nobubo import core, calc, output
 
@@ -31,25 +31,23 @@ def create_output_files(temp_collage_paths: [pathlib.Path],
                         input_properties: core.InputProperties,
                         output_properties: core.OutputProperties):
     for counter, collage_path in enumerate(temp_collage_paths):
-        with collage_path.open("rb") as collagefile:
-            reader = PyPDF2.PdfFileReader(collagefile, strict=False)
-            collage = reader.getPage(0)
-            new_outputpath = calc.generate_new_outputpath(output_properties.output_path, counter)
-            print(f"\nChopping up the collage...")
-            chopped_up_files = _create_output_files(collage, input_properties.pagesize,
-                                                    input_properties.layout[counter], output_properties.output_layout)
-            print(f"Successfully chopped up the collage.\n")
-            output.write_chops(chopped_up_files, new_outputpath)
-            print(f"Final pdf written to {new_outputpath}. Enjoy your sewing :)")
-
-
-def _create_output_files(assembled_collage: PyPDF2.pdf.PageObject,
+        collage = Pdf.open(collage_path)
+        new_outputpath = calc.generate_new_outputpath(output_properties.output_path, counter)
+        print(f"\nChopping up the collage...")
+        chopped_up_files = _create_output_files(collage, input_properties.pagesize,
+                                                input_properties.layout[counter], output_properties.output_layout)
+        print(f"Successfully chopped up the collage.\n")
+        output.write_chops(chopped_up_files, new_outputpath)
+        print(f"Final pdf written to {new_outputpath}. Enjoy your sewing :)")
+
+
+def _create_output_files(collage: Pdf,
                          pagesize: core.PageSize,
                          current_layout: core.Layout,
-                         output_layout: [int]) -> PyPDF2.PdfFileWriter:
+                         output_layout: [int]) -> Pdf:
     """
     Chops up the collage that consists of all the pattern pages to individual pages of the desired output size.
-    :param assembled_collage: One pdf page that contains all assembled pattern pages.
+    :param collage: The pdf whose first page contains all assembled pattern pages.
     :param input_properties: Properties of the pdf.
     :param output_layout: The desired output layout.
     :return: The pdf with several pages, ready to write to disk.
@@ -59,10 +57,11 @@ def _create_output_files(assembled_collage: PyPDF2.pdf.PageObject,
     lowerleft_factor = calc.Factor(x=0, y=0)
     upperright_factor = calc.Factor(x=1, y=1)
 
-    writer = PyPDF2.PdfFileWriter()
-    for x in range(0, calc.calculate_pages_needed(current_layout, n_up_factor)):
-        page = copy(assembled_collage)
-        # cf. https://stackoverflow.com/questions/52315259/pypdf2-cant-add-multiple-cropped-pages#
+    output = Pdf.new()
+    output.copy_foreign(collage.Root) # TODO must Root be updated if new pages are added?
+    # Root must be copied too, not only the page: thanks to https://github.com/cfcurtis/sewingutils for this!
+    for i in range(0, calc.calculate_pages_needed(current_layout, n_up_factor)):
+        page = output.copy_foreign(collage.pages[0])
 
         lowerleft: core.Point = _calculate_lowerleft_point(lowerleft_factor, n_up_factor, pagesize)
         upperright: core.Point = _calculate_upperright_point(upperright_factor, n_up_factor, current_layout, pagesize)
@@ -71,11 +70,10 @@ def _create_output_files(assembled_collage: PyPDF2.pdf.PageObject,
         colsleft = _calculate_colsrows_left(current_layout.columns, upperright_factor.x, n_up_factor.x)
         lowerleft_factor, upperright_factor = _adjust_factors(lowerleft_factor, upperright_factor, colsleft)
 
-        page.cropBox.lowerLeft = (lowerleft.x, lowerleft.y)
-        page.cropBox.upperRight = (upperright.x, upperright.y)
-        writer.addPage(page)
+        page.CropBox = [lowerleft.x, lowerleft.y, upperright.x, upperright.y]
+        output.pages.append(page)
 
-    return writer
+    return output
 
 
 def _calculate_colsrows_left(layout_element: int, factor: int, nup_factor: int) -> int:

diff --git a/nobubo/output.py b/nobubo/output.py
@@ -14,29 +14,25 @@
 #
 # You should have received a copy of the GNU Affero General Public License
 # along with Nobubo.  If not, see <https://www.gnu.org/licenses/>.
-import PyPDF2
+from pikepdf import Pdf
 import pathlib
 import sys
 
 from nobubo import core, calc
 
 
-def write_chops(pypdf2_writer: PyPDF2.PdfFileWriter, output_path: pathlib.Path):
+def write_chops(collage: Pdf, output_path: pathlib.Path):
     print("Writing file...")
     try:
-        with open(output_path, "wb") as output:
-            pypdf2_writer.write(output)
+        collage.save(output_path)
     except OSError as e:
         print(f"While writing the file, this error occurred:\n{e}")
         sys.exit(1)
 
 
 def write_collage(temp_collage_paths: [pathlib.Path], output_properties: core.OutputProperties):
     for counter, collage_path in enumerate(temp_collage_paths):
-        writer = PyPDF2.PdfFileWriter()
         new_outputpath = calc.generate_new_outputpath(output_properties.output_path, counter)
-        with collage_path.open("rb") as collagefile:
-            reader = PyPDF2.PdfFileReader(collagefile, strict=False)
-            writer.addPage(reader.getPage(0))
-            write_chops(writer, new_outputpath)
+        temp_collage = Pdf.open(collage_path)
+        temp_collage.save(new_outputpath)
         print(f"Collage written to {new_outputpath}. Enjoy your sewing :)")
diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1,3 @@
 click >= 7.1.2
-PyPDF2 >= 1.26.0
+pikepdf >= 1.19.3
+
diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name="nobubo-bytinbit",
-    version="1.1.0",
+    version="1.2.0",
     description="Nobubo assembles a digital pdf sewing pattern and chops it into a desired output size to be printed.",
     long_description=long_description,
     long_description_content_type="text/markdown",    
@@ -17,7 +17,7 @@
         "console_scripts": ["nobubo = nobubo.nobubo:main"]
         },
     python_requires=">=3.7",
-    install_requires=["click", "PyPDF2"],
+    install_requires=["click", "pikepdf"],
     classifiers=[
         "Topic :: Printing",
         "Topic :: Utilities",

diff --git a/tests/conftest.py b/tests/conftest.py
@@ -1,5 +1,6 @@
 import pathlib
-import PyPDF2
+# import PyPDF2
+import pikepdf
 import pytest
 
 import textract
@@ -15,22 +16,27 @@ def __init__(self, outputdir: pathlib.Path) -> None:
 
     def read(self):
         for filepath in self.outputdir.glob("*.pdf"):
-            file = open(filepath, "rb")
+            file = pikepdf.open(filepath)
             self._files.append(file)
-            self.readers[filepath.name] = PyPDF2.PdfFileReader(file)
+            self.readers[filepath.name] = file
         return sorted(self.readers.keys())
 
     def pagesize(self, filename: str, pagenumber: int=0) -> [float, float]:
         reader = self.readers[filename]
-        page = reader.getPage(pagenumber)
-        return [round(float(page.cropBox[2])-float(page.cropBox[0]), 2), round(float(page.cropBox[3])-float(page.cropBox[1]), 2)]
+        page = reader.pages[pagenumber]
+        if not hasattr(page, "CropBox"):
+            box = page.MediaBox
+        else:
+            box = page.CropBox
+        return [round(float(box[2])-float(box[0]), 2),
+                round(float(box[3])-float(box[1]), 2)]
 
     def pagecount(self, filename: str) -> int:
         reader = self.readers[filename]
-        return reader.getNumPages()
+        return len(reader.pages)
 
     # TODO is there a better way to check the order of the pages?
-    def pages_order(self, filepath: str, pageamount: int=1) -> [str, str]:
+    def pages_order(self, filepath: str) -> [str, str]:
         text = str(textract.process(filepath, encoding="utf-8"), "utf-8").split("\n\n")
         # texteract finds ascii value '\f' (form feed, \x0c) that must be removed
         res = list(filter(lambda a: a not in '\x0c', text))
@@ -80,11 +86,6 @@ def two_overviews() -> [core.Layout, core.Layout]:
     return [first, second]
 
 
-@pytest.fixture()
-def one_pdf_page_same_boxes() -> PyPDF2.pdf.PageObject:
-    return PyPDF2.pdf.PageObject.createBlankPage(None, 483.307, 729.917)
-
-
 @pytest.fixture()
 def n_up_factor_a0() -> calc.Factor:
     return calc.Factor(x=4, y=4)

diff --git a/tests/test_calc.py b/tests/test_calc.py
@@ -16,9 +16,6 @@ def test_calculate_pages_needed_oneoverview_a0_unevenlayout(self, one_overview_u
     def test_calculate_pages_needed_oneoverview_custom_unevenlayout(self, one_overview_uneven, nup_factor_custom):
         assert calc.calculate_pages_needed(one_overview_uneven, nup_factor_custom) == 2
 
-    def test_calculate_offset(self, one_pdf_page_same_boxes):
-        assert calc.calculate_page_dimensions(one_pdf_page_same_boxes) == (483.31, 729.92)
-
     def test_userspaceunits_conversion_a0(self):
         paper = calc.convert_to_userspaceunits([841, 1189])
         assert paper.width == 2383.937