Skip to content

Commit

Permalink
Merge branch 'processor-result-object' into new-processor-api
Browse files Browse the repository at this point in the history
  • Loading branch information
kba committed Aug 15, 2024
2 parents f8b6896 + 7045318 commit a9dba73
Show file tree
Hide file tree
Showing 7 changed files with 81 additions and 39 deletions.
4 changes: 4 additions & 0 deletions src/ocrd/processor/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
Processor,
ResourceNotFoundError
)
from .ocrd_page_result import (
OcrdPageResult,
OcrdPageResultImage
)
from .helpers import (
run_cli,
run_processor,
Expand Down
75 changes: 44 additions & 31 deletions src/ocrd/processor/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,23 @@
'run_processor'
]

from os.path import exists
from os.path import exists, join
from shutil import copyfileobj
import json
import os
from os import getcwd
from pathlib import Path
from typing import Optional
from typing import List, Optional, Union
import sys
import inspect
import tarfile
import io
from deprecated import deprecated

from ocrd.workspace import Workspace
from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile
from ocrd.processor.ocrd_page_result import OcrdPageResult
from ocrd_models.ocrd_page_generateds import PcGtsType
from ocrd_utils import (
VERSION as OCRD_VERSION,
MIMETYPE_PAGE,
Expand Down Expand Up @@ -198,10 +201,12 @@ def verify(self):
assert self.output_file_grp is not None
input_file_grps = self.input_file_grp.split(',')
output_file_grps = self.output_file_grp.split(',')
def assert_file_grp_cardinality(grps, spec, msg):
if isinstance(spec, int) and spec > 0:
assert len(grps) == spec, msg % (len(grps), str(spec))
def assert_file_grp_cardinality(grps : List[str], spec : Union[int, List[int]], msg):
if isinstance(spec, int):
if spec > 0:
assert len(grps) == spec, msg % (len(grps), str(spec))
else:
assert isinstance(spec, list)
minimum = spec[0]
maximum = spec[1]
if minimum > 0:
Expand Down Expand Up @@ -289,7 +294,7 @@ def process_workspace(self, workspace: Workspace) -> None:
# - ResourceNotFoundError → use ResourceManager to download (once), then retry
# - transient (I/O or OOM) error → maybe sleep, retry
# - persistent (data) error → skip / dummy / raise
input_files = [None] * len(input_file_tuple)
input_files : List[Optional[Union[OcrdFile, ClientSideOcrdFile]]] = [None] * len(input_file_tuple)
for i, input_file in enumerate(input_file_tuple):
if i == 0:
log.info("processing page %s", input_file.pageId)
Expand All @@ -309,7 +314,7 @@ def process_workspace(self, workspace: Workspace) -> None:
# fall back to deprecated method
self.process()

def process_page_file(self, *input_files) -> None:
def process_page_file(self, *input_files : Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None:
"""
Process the given ``input_files`` of the :py:attr:`workspace`,
representing one physical page (passed as one opened
Expand All @@ -321,49 +326,55 @@ def process_page_file(self, *input_files) -> None:
to handle cases like multiple fileGrps, non-PAGE input etc.)
"""
log = getLogger('ocrd.processor.base')
input_pcgts = [None] * len(input_files)
input_pcgts : List[Optional[OcrdPage]] = [None] * len(input_files)
assert isinstance(input_files[0], (OcrdFile, ClientSideOcrdFile))
page_id = input_files[0].pageId
for i, input_file in enumerate(input_files):
# FIXME: what about non-PAGE input like image or JSON ???
assert isinstance(input_file, (OcrdFile, ClientSideOcrdFile))
log.debug("parsing file %s for page %s", input_file.ID, input_file.pageId)
try:
input_pcgts[i] = page_from_file(input_file)
page_ = page_from_file(input_file)
assert isinstance(page_, PcGtsType)
input_pcgts[i] = page_
except ValueError as e:
log.info("non-PAGE input for page %s: %s", page_id, e)
output_file_id = make_file_id(input_files[0], self.output_file_grp)
output_pcgts = self.process_page_pcgts(*input_pcgts, output_file_id=output_file_id, page_id=page_id)
if isinstance(output_pcgts, (list, tuple)):
output_images = output_pcgts[1:]
output_pcgts = output_pcgts[0]
for output_image_pil, output_image_id, output_image_path in output_images:
self.workspace.save_image_file(
output_image_pil,
output_image_id,
self.output_file_grp,
page_id=page_id,
file_path=output_image_path)
output_pcgts.set_pcGtsId(output_file_id)
self.add_metadata(output_pcgts)
result = self.process_page_pcgts(*input_pcgts, page_id=page_id)
for image_result in result.images:
image_file_id = f'{output_file_id}_{image_result.file_id_suffix}'
image_file_path = join(self.output_file_grp, f'{image_file_id}.png')
image_result.alternative_image.set_filename(image_file_path)
self.workspace.save_image_file(
image_result.pil,
image_file_id,
self.output_file_grp,
page_id=page_id,
file_path=image_file_path)
result.pcgts.set_pcGtsId(output_file_id)
self.add_metadata(result.pcgts)
# FIXME: what about non-PAGE output like JSON ???
self.workspace.add_file(file_id=output_file_id,
file_grp=self.output_file_grp,
page_id=page_id,
local_filename=os.path.join(self.output_file_grp, output_file_id + '.xml'),
mimetype=MIMETYPE_PAGE,
content=to_xml(output_pcgts))
content=to_xml(result.pcgts))

def process_page_pcgts(self, *input_pcgts : OcrdPage, output_file_id : Optional[str] = None, page_id : Optional[str] = None) -> OcrdPage:
def process_page_pcgts(self, *input_pcgts : Optional[OcrdPage], page_id : Optional[str] = None) -> OcrdPageResult:
"""
Process the given ``input_pcgts`` of the :py:attr:`workspace`,
representing one physical page (passed as one parsed
:py:class:`~ocrd_models.OcrdPage` per input fileGrp)
under the given :py:attr:`parameter`, and return the
resulting :py:class:`~ocrd_models.OcrdPage`.
resulting :py:class:`~ocrd.processor.OcrdPageResult`.
Optionally, return a list or tuple of the :py:class:`~ocrd_models.OcrdPage`
and one or more lists or tuples of :py:class:`PIL.Image` (image data),
:py:class:str (file ID) and :py:class:str (file path) of derived images
to be annotated along with the resulting PAGE file.
Optionally, add to the ``images`` attribute of the resulting
:py:class:`~ocrd.processor.OcrdPageResult` instances
of :py:class:`~ocrd.processor.OcrdPageResultImage`,
which have required fields for ``pil`` (:py:class:`PIL.Image` image data),
``file_id_suffix`` (used for generating IDs of the saved image) and
``alternative_image`` (reference of the :py:class:`ocrd_models.ocrd_page.AlternativeImageType`
for setting the filename of the saved image).
(This contains the main functionality and must be overridden by subclasses.)
"""
Expand All @@ -374,7 +385,9 @@ def add_metadata(self, pcgts: OcrdPage) -> None:
Add PAGE-XML :py:class:`~ocrd_models.ocrd_page.MetadataItemType` ``MetadataItem`` describing
the processing step and runtime parameters to :py:class:`~ocrd_models.ocrd_page.PcGtsType` ``pcgts``.
"""
pcgts.get_Metadata().add_MetadataItem(
metadata_obj = pcgts.get_Metadata()
assert metadata_obj is not None
metadata_obj.add_MetadataItem(
MetadataItemType(type_="processingStep",
name=self.ocrd_tool['steps'][0],
value=self.ocrd_tool['executable'],
Expand Down
19 changes: 13 additions & 6 deletions src/ocrd/processor/builtin/dummy_processor.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
# pylint: disable=missing-module-docstring,invalid-name
from os.path import join, basename
from typing import Optional, Union

import click

from ocrd import Processor
from ocrd.decorators import ocrd_cli_options, ocrd_cli_wrap_processor
from ocrd_models.ocrd_page import to_xml
from ocrd.processor.ocrd_page_result import OcrdPageResult
from ocrd_models.ocrd_file import ClientSideOcrdFile, OcrdFile
from ocrd_models.ocrd_page import OcrdPage, to_xml
from ocrd_models.ocrd_page_generateds import PcGtsType
from ocrd_utils import (
getLogger,
assert_file_grp_cardinality,
make_file_id,
MIME_TO_EXT,
MIMETYPE_PAGE,
Expand All @@ -24,13 +27,16 @@ class DummyProcessor(Processor):
Bare-bones processor creates PAGE-XML and optionally copies file from input group to output group
"""

def process_page_pcgts(self, *input_pcgts, output_file_id=None, page_id=None):
def process_page_pcgts(self, *input_pcgts: Optional[OcrdPage], page_id: Optional[str] = None) -> OcrdPageResult:
assert input_pcgts[0]
# nothing to do here
return input_pcgts[0]
return OcrdPageResult(input_pcgts[0])

def process_page_file(self, *input_files):
def process_page_file(self, *input_files: Optional[Union[OcrdFile, ClientSideOcrdFile]]) -> None:
LOG = getLogger('ocrd.dummy')
input_file = input_files[0]
assert input_file
assert input_file.local_filename
if self.parameter['copy_files'] and input_file.mimetype != MIMETYPE_PAGE:
# we need to mimic the actual copying in addition to the PAGE boilerplate
file_id = make_file_id(input_file, self.output_file_grp)
Expand All @@ -48,7 +54,8 @@ def process_page_file(self, *input_files):
content=content)
file_id = file_id + '_PAGE'
pcgts = page_from_file(output_file)
pcgts = self.process_page_pcgts(pcgts)
assert isinstance(pcgts, PcGtsType)
pcgts = self.process_page_pcgts(pcgts).pcgts
pcgts.set_pcGtsId(file_id)
self.add_metadata(pcgts)
LOG.info("Add PAGE-XML %s generated for %s", file_id, output_file)
Expand Down
17 changes: 17 additions & 0 deletions src/ocrd/processor/ocrd_page_result.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from dataclasses import dataclass, field
from typing import List
from ocrd_models.ocrd_page import OcrdPage
from PIL.Image import Image

from ocrd_models.ocrd_page_generateds import AlternativeImageType

@dataclass
class OcrdPageResultImage():
pil : Image
file_id_suffix : str
alternative_image : AlternativeImageType

@dataclass
class OcrdPageResult():
pcgts : OcrdPage
images : List[OcrdPageResultImage] = field(default_factory=list)
2 changes: 1 addition & 1 deletion src/ocrd/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -1073,7 +1073,7 @@ def image_from_segment(self, segment, parent_image, parent_coords,
return segment_image, segment_coords

# pylint: disable=redefined-builtin
def save_image_file(self, image : Image,
def save_image_file(self, image : Image.Image,
file_id : str,
file_grp : str,
file_path : Optional[str] = None,
Expand Down
2 changes: 1 addition & 1 deletion src/ocrd_modelfactory/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def page_from_image(input_file, with_tree=False):
revmap = dict(((node, element) for element, node in mapping.items()))
return pcgts, etree, mapping, revmap

def page_from_file(input_file, with_tree=False) -> Union[PcGtsType, Tuple[PcGtsType, ET.Element, dict, dict]]:
def page_from_file(input_file, with_tree=False) -> Union[PcGtsType, Tuple[PcGtsType, ET._Element, dict, dict]]:
"""
Create :py:class:`~ocrd_models.ocrd_page.OcrdPage`
from an :py:class:`~ocrd_models.ocrd_file.OcrdFile` or a file path
Expand Down
1 change: 1 addition & 0 deletions src/ocrd_models/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,5 +5,6 @@
from .ocrd_exif import OcrdExif
from .ocrd_file import OcrdFile, ClientSideOcrdFile
from .ocrd_mets import OcrdMets
from .ocrd_page import OcrdPage
from .ocrd_xml_base import OcrdXmlDocument
from .report import ValidationReport

0 comments on commit a9dba73

Please sign in to comment.