Skip to content

Commit

Permalink
[IMP]: add support for file_bytes argument with managed_file_context()
Browse files Browse the repository at this point in the history
  • Loading branch information
cscanlin-kwh authored and bosd committed Sep 5, 2024
1 parent e3c1115 commit bd01e8e
Show file tree
Hide file tree
Showing 5 changed files with 169 additions and 85 deletions.
150 changes: 96 additions & 54 deletions camelot/handlers.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,26 @@
import multiprocessing as mp
from contextlib import contextmanager
import io
import os
import sys
from pathlib import Path
from typing import Union
from typing import Union, Any, IO, TypeVar

from pypdf import PdfReader
from pypdf import PdfWriter
from pypdf._utils import StrByteType

from .core import TableList
from .parsers import Lattice
from .parsers import Stream
from .utils import TemporaryDirectory
from .utils import download_url
from .utils import InvalidArguments
from .utils import get_url_bytes
from .utils import get_page_layout
from .utils import get_rotation
from .utils import get_text_objects
from .utils import is_url

# Type alias for everything accepted as the ``filepath`` argument.
# This must be a plain alias, not a TypeVar: TypeVar's first argument is a
# string name, so TypeVar(Union[...]) raises TypeError on Python 3.12+ and
# never described a generic type parameter in the first place.
FilePathType = Union[str, IO[Any], Path, None]

class PDFHandler:
"""Handles all operations like temp directory creation, splitting
Expand All @@ -26,21 +29,35 @@ class PDFHandler:
Parameters
----------
filepath : str
Filepath or URL of the PDF file.
filepath : str | pathlib.Path, optional (default: None)
Filepath or URL of the PDF file. Required if file_bytes is not given
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
password : str, optional (default: None)
Password for decryption.
file_bytes : io.IOBase, optional (default: None)
A file-like stream. Required if filepath is not given
"""

def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None):
def __init__(self, filepath: FilePathType = None, pages="1", password=None, file_bytes=None):
if is_url(filepath):
filepath = download_url(filepath)
self.filepath: Union[StrByteType, Path] = filepath
file_bytes = get_url_bytes(filepath)

if not filepath and not file_bytes:
raise InvalidArguments('Either `filepath` or `file_bytes` is required')
if not filepath:
# filepath must either be passed, or taken from the name attribute
try:
filepath = getattr(file_bytes, 'name')
except AttributeError:
msg = ('Either pass a `filepath`, or give the '
'`file_bytes` argument a name attribute')
raise InvalidArguments(msg)
self.file_bytes = file_bytes # ok to be None

self.filepath = filepath
if isinstance(filepath, str) and not filepath.lower().endswith(".pdf"):
raise NotImplementedError("File format not supported")

Expand All @@ -52,13 +69,35 @@ def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None)
self.password = self.password.encode("ascii")
self.pages = self._get_pages(pages)

@contextmanager
def managed_file_context(self):
    """Yield a readable, seekable binary stream for this handler's PDF.

    Uses ``self.file_bytes`` when it is set, otherwise opens
    ``self.filepath`` in binary mode.

    Yields
    ------
    file_bytes : io.IOBase
        A readable, seekable, file-like object positioned at offset 0.

    Notes
    -----
    Only the handle opened from ``filepath`` is closed on exit or error.
    A caller-supplied ``file_bytes`` stream is left open so it can be
    read again by later calls. If that stream is not seekable, its
    remaining content is copied into an ``io.BytesIO`` which then
    replaces ``self.file_bytes``.
    """
    if self.file_bytes:
        stream = self.file_bytes
        # Having a ``seek`` attribute is not enough: every io.IOBase
        # subclass (pipes, sockets, HTTP responses) exposes ``seek`` but
        # raises when it is called. Ask ``seekable()`` when it exists and
        # only fall back to the attribute check for duck-typed objects.
        probe = getattr(stream, "seekable", None)
        can_seek = probe() if callable(probe) else hasattr(stream, "seek")
        if not can_seek:
            # Buffer into memory so the content can be re-read from the start.
            stream = self.file_bytes = io.BytesIO(stream.read())
        stream.seek(0)
        yield stream
    else:
        with open(self.filepath, "rb") as file_bytes:
            yield file_bytes

def _get_pages(self, pages):
"""Converts pages string to list of ints.
Parameters
----------
filepath : str
Filepath or URL of the PDF file.
managed_file_context : io.IOBase
A readable, seekable, file-like object
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
Expand All @@ -74,74 +113,77 @@ def _get_pages(self, pages):
if pages == "1":
page_numbers.append({"start": 1, "end": 1})
else:
infile = PdfReader(self.filepath, strict=False)
with self.managed_file_context() as f:
infile = PdfReader(f, strict=False)

if infile.is_encrypted:
infile.decrypt(self.password)
if infile.is_encrypted:
infile.decrypt(self.password)

if pages == "all":
page_numbers.append({"start": 1, "end": len(infile.pages)})
else:
for r in pages.split(","):
if "-" in r:
a, b = r.split("-")
if b == "end":
b = len(infile.pages)
page_numbers.append({"start": int(a), "end": int(b)})
else:
page_numbers.append({"start": int(r), "end": int(r)})
if pages == "all":
page_numbers.append({"start": 1, "end": len(infile.pages)})
else:
for r in pages.split(","):
if "-" in r:
a, b = r.split("-")
if b == "end":
b = len(infile.pages)
page_numbers.append({"start": int(a), "end": int(b)})
else:
page_numbers.append({"start": int(r), "end": int(r)})

result = []
for p in page_numbers:
result.extend(range(p["start"], p["end"] + 1))
return sorted(set(result))

def _save_page(self, filepath: Union[StrByteType, Path], page, temp):
def _save_page(self, filepath: FilePathType, page, temp):
"""Saves specified page from PDF into a temporary directory.
Parameters
----------
filepath : str
Filepath or URL of the PDF file.
managed_file_context : io.IOBase
A readable, seekable, file-like object
page : int
Page number.
temp : str
Tmp directory.
"""
infile = PdfReader(filepath, strict=False)
if infile.is_encrypted:
infile.decrypt(self.password)
fpath = os.path.join(temp, f"page-{page}.pdf")
froot, fext = os.path.splitext(fpath)
p = infile.pages[page - 1]
outfile = PdfWriter()
outfile.add_page(p)
with open(fpath, "wb") as f:
outfile.write(f)
layout, dim = get_page_layout(fpath)
# fix rotated PDF
chars = get_text_objects(layout, ltype="char")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
vertical_text = get_text_objects(layout, ltype="vertical_text")
rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != "":
fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
os.rename(fpath, fpath_new)
instream = open(fpath_new, "rb")
infile = PdfReader(instream, strict=False)

with self.managed_file_context() as fileobj:
infile = PdfReader(fileobj, strict=False)
if infile.is_encrypted:
infile.decrypt(self.password)
fpath = os.path.join(temp, f"page-{page}.pdf")
froot, fext = os.path.splitext(fpath)
p = infile.pages[page - 1]
outfile = PdfWriter()
p = infile.pages[0]
if rotation == "anticlockwise":
p.rotate(90)
elif rotation == "clockwise":
p.rotate(-90)
outfile.add_page(p)
with open(fpath, "wb") as f:
outfile.write(f)
instream.close()
layout, dim = get_page_layout(fpath)
# fix rotated PDF
chars = get_text_objects(layout, ltype="char")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
vertical_text = get_text_objects(layout, ltype="vertical_text")
rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != "":
fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
os.rename(fpath, fpath_new)
instream = open(fpath_new, "rb")
infile = PdfReader(instream, strict=False)
if infile.is_encrypted:
infile.decrypt(self.password)
outfile = PdfWriter()
p = infile.pages[0]
if rotation == "anticlockwise":
p.rotate(90)
elif rotation == "clockwise":
p.rotate(-90)
outfile.add_page(p)
with open(fpath, "wb") as f:
outfile.write(f)
instream.close()

def parse(
self,
Expand Down
25 changes: 16 additions & 9 deletions camelot/io.py
Original file line number Diff line number Diff line change
@@ -1,22 +1,24 @@
import warnings
from pathlib import Path
from typing import Union

from pypdf._utils import StrByteType
from .handlers import PDFHandler, FilePathType

from .handlers import PDFHandler
from .utils import remove_extra
from .utils import validate_input
from .utils import (
InvalidArguments,
validate_input,
remove_extra,
)


def read_pdf(
filepath: Union[StrByteType, Path],
filepath: FilePathType = None,
pages="1",
password=None,
flavor="lattice",
suppress_stdout=False,
parallel=False,
layout_kwargs=None,
file_bytes=None,
**kwargs
):
"""Read PDF and return extracted tables.
Expand All @@ -26,8 +28,8 @@ def read_pdf(
Parameters
----------
filepath : str, Path, IO
Filepath or URL of the PDF file.
filepath : str | pathlib.Path, optional (default: None)
Filepath or URL of the PDF file. Required if file_bytes is not given
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
Expand All @@ -40,6 +42,8 @@ def read_pdf(
Print all logs and warnings.
parallel : bool, optional (default: False)
Process pages in parallel using all available cpu cores.
file_bytes : io.IOBase, optional (default: None)
A file-like stream. Required if filepath is not given
layout_kwargs : dict, optional (default: {})
A dict of `pdfminer.layout.LAParams
<https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
Expand Down Expand Up @@ -115,12 +119,15 @@ def read_pdf(
"Unknown flavor specified." " Use either 'lattice' or 'stream'"
)

if not filepath and not file_bytes:
raise InvalidArguments('Either `filepath` or `file_bytes` is required')

with warnings.catch_warnings():
if suppress_stdout:
warnings.simplefilter("ignore")

validate_input(kwargs, flavor=flavor)
p = PDFHandler(filepath, pages=pages, password=password)
p = PDFHandler(filepath, pages=pages, password=password, file_bytes=file_bytes)
kwargs = remove_extra(kwargs, flavor=flavor)
tables = p.parse(
flavor=flavor,
Expand Down
40 changes: 20 additions & 20 deletions camelot/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import os
import io
import random
import re
import shutil
Expand Down Expand Up @@ -34,6 +34,10 @@
_VALID_URLS.discard("")


class InvalidArguments(Exception):
    """Raised when a call is made with a missing or inconsistent set of arguments."""


# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
def is_url(url):
"""Check to see if a URL has a valid protocol.
Expand Down Expand Up @@ -64,34 +68,30 @@ def random_string(length):
return ret


def download_url(url):
"""Download file from specified URL.
def get_url_bytes(url):
"""Get a stream of bytes for url
Parameters
----------
url : str or unicode
Returns
-------
filepath : str or unicode
Temporary filepath.
file_bytes : io.BytesIO
A file-like object that can be read
"""
filename = f"{random_string(6)}.pdf"
with tempfile.NamedTemporaryFile("wb", delete=False) as f:
headers = {
"User-Agent": "Mozilla/5.0",
"Accept-Encoding": "gzip;q=1.0, deflate;q=0.9, br;q=0.8, compress;q=0.7, *;q=0.1"
}
request = Request(url, None, headers)
obj = urlopen(request)
content_type = obj.info().get_content_type()
if content_type != "application/pdf":
raise NotImplementedError("File format not supported")
f.write(obj.read())
filepath = os.path.join(os.path.dirname(f.name), filename)
shutil.move(f.name, filepath)
return filepath
file_bytes = io.BytesIO()
file_bytes.name = url
headers = {"User-Agent": "Mozilla/5.0"}
request = Request(url, data=None, headers=headers)
obj = urlopen(request)
content_type = obj.info().get_content_type()
if content_type != "application/pdf":
raise NotImplementedError("File format not supported")
file_bytes.write(obj.read())
file_bytes.seek(0)
return file_bytes


stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"]
Expand Down
Loading

0 comments on commit bd01e8e

Please sign in to comment.