From 5663f4882834fd1430c5c1d55ca438a2406ce9ec Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 00:10:12 +0200 Subject: [PATCH 01/46] processor CLI: delegate --resolve-resource, too --- src/ocrd/decorators/__init__.py | 4 +++- src/ocrd/decorators/ocrd_cli_options.py | 1 + src/ocrd/processor/helpers.py | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index 580a75b0c0..7c2dd9717c 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -36,6 +36,7 @@ def ocrd_cli_wrap_processor( profile_file=None, version=False, overwrite=False, + resolve_resource=None, show_resource=None, list_resources=False, # ocrd_network params start # @@ -50,7 +51,7 @@ def ocrd_cli_wrap_processor( if not sys.argv[1:]: processorClass(None, show_help=True) sys.exit(1) - if dump_json or dump_module_dir or help or version or show_resource or list_resources: + if dump_json or dump_module_dir or help or version or resolve_resource or show_resource or list_resources: processorClass( None, dump_json=dump_json, @@ -58,6 +59,7 @@ def ocrd_cli_wrap_processor( show_help=help, subcommand=subcommand, show_version=version, + resolve_resource=resolve_resource, show_resource=show_resource, list_resources=list_resources ) diff --git a/src/ocrd/decorators/ocrd_cli_options.py b/src/ocrd/decorators/ocrd_cli_options.py index e640a20032..9c87034ab4 100644 --- a/src/ocrd/decorators/ocrd_cli_options.py +++ b/src/ocrd/decorators/ocrd_cli_options.py @@ -41,6 +41,7 @@ def cli(mets_url): option('--address', type=ServerAddressParamType()), option('--queue', type=QueueServerParamType()), option('--database', type=DatabaseParamType()), + option('-R', '--resolve-resource'), option('-C', '--show-resource'), option('-L', '--list-resources', is_flag=True, default=False), option('-J', '--dump-json', is_flag=True, default=False), diff --git a/src/ocrd/processor/helpers.py 
b/src/ocrd/processor/helpers.py index f5b6010636..921cfeac80 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -290,6 +290,7 @@ def generate_processor_help(ocrd_tool, processor_instance=None, subcommand=None) ''' information_options = '''\ + -R, --resolve-resource RESNAME Show the full path of processor resource RESNAME -C, --show-resource RESNAME Dump the content of processor resource RESNAME -L, --list-resources List names of processor resources -J, --dump-json Dump tool description as JSON From 853bdb570c861b98debf1c2af60e84f39db47fbf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:12:49 +0200 Subject: [PATCH 02/46] test_mets_server: fix arg vs kwarg --- tests/test_mets_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index 58ff6e2a9b..a313ed5239 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -55,10 +55,10 @@ def add_file_server(x): mets_server_url, i = x workspace_server = Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url) workspace_server.add_file( + 'FOO', local_filename=f'local_filename{i}', mimetype=MIMETYPE_PAGE, page_id=f'page{i}', - file_grp='FOO', file_id=f'FOO_page{i}_foo{i}', # url=f'url{i}' ) From 33c73866e5a289d83354c382b9cc34d7038027cd Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:13:46 +0200 Subject: [PATCH 03/46] mets_server: ClientSideOcrdMets needs OcrdMets-like kwargs (without deprecation) --- src/ocrd/mets_server.py | 19 +++++++++---------- tests/test_mets_server.py | 2 +- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 0d4c0a0785..da6e873c06 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -247,11 +247,9 @@ def add_agent(self, *args, **kwargs): ).json() return OcrdAgentModel.create(**kwargs) - 
@deprecated_alias(ID="file_id") - @deprecated_alias(pageId="page_id") - @deprecated_alias(fileGrp="file_grp") def find_files(self, **kwargs): self.log.debug("find_files(%s)", kwargs) + # translate from native OcrdMets kwargs to OcrdMetsServer REST params if "pageId" in kwargs: kwargs["page_id"] = kwargs.pop("pageId") if "ID" in kwargs: @@ -277,14 +275,14 @@ def find_files(self, **kwargs): def find_all_files(self, *args, **kwargs): return list(self.find_files(*args, **kwargs)) - @deprecated_alias(pageId="page_id") - @deprecated_alias(ID="file_id") def add_file( - self, file_grp, content=None, file_id=None, url=None, local_filename=None, mimetype=None, page_id=None, **kwargs + self, file_grp, content=None, ID=None, url=None, local_filename=None, mimetype=None, pageId=None, **kwargs ): data = OcrdFileModel.create( - file_id=file_id, file_grp=file_grp, page_id=page_id, mimetype=mimetype, url=url, - local_filename=local_filename + file_grp=file_grp, + # translate from native OcrdMets kwargs to OcrdMetsServer REST params + file_id=ID, page_id=pageId, + mimetype=mimetype, url=url, local_filename=local_filename ) if not self.multiplexing_mode: @@ -297,8 +295,9 @@ def add_file( raise RuntimeError(f"Add file failed: Msg: {r['error']}") return ClientSideOcrdFile( - None, ID=file_id, fileGrp=file_grp, url=url, pageId=page_id, mimetype=mimetype, - local_filename=local_filename + None, fileGrp=file_grp, + ID=ID, pageId=pageId, + url=url, mimetype=mimetype, local_filename=local_filename ) diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index a313ed5239..1487617a71 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -236,7 +236,7 @@ def test_reload(start_mets_server : Tuple[str, Workspace]): assert len(workspace_server.mets.find_all_files()) == 35, '35 files total' assert len(workspace_server_copy.mets.find_all_files()) == 35, '35 files total' - workspace_server_copy.add_file('FOO', ID='foo', mimetype='foo/bar', local_filename='mets.xml', 
pageId='foo') + workspace_server_copy.add_file('FOO', file_id='foo', mimetype='foo/bar', local_filename='mets.xml', page_id='foo') assert len(workspace_server.mets.find_all_files()) == 35, '35 files total' assert len(workspace_server_copy.mets.find_all_files()) == 36, '36 files total' From 37f7cda00f53c3f8f01a722c87c2f965dc7c7b68 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:52:09 +0200 Subject: [PATCH 04/46] use up-to-date kwargs (avoiding old deprecations) --- tests/data/__init__.py | 4 ++-- tests/processor/test_processor.py | 10 +++++----- tests/validator/test_page_validator.py | 9 +++++---- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/tests/data/__init__.py b/tests/data/__init__.py index 93a2ea49a9..c7fcfb021c 100644 --- a/tests/data/__init__.py +++ b/tests/data/__init__.py @@ -52,9 +52,9 @@ def process(self): file_id = make_file_id(input_file, self.output_file_grp) # print(input_file.ID, file_id) self.workspace.add_file( - ID=file_id, + file_id=file_id, file_grp=self.output_file_grp, - pageId=input_file.pageId, + page_id=input_file.pageId, mimetype=input_file.mimetype, local_filename=os.path.join(self.output_file_grp, file_id), content='CONTENT') diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 784f68fc3d..3a47d2c23f 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -125,8 +125,8 @@ def test_run_input(self): def test_run_output0(self): with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001') - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0002') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002') run_processor(DummyProcessorWithOutput, workspace=ws, 
input_file_grp="GRP1", output_file_grp="OCR-D-OUT") @@ -135,10 +135,10 @@ def test_run_output0(self): def test_run_output_overwrite(self): with pushd_popd(tempdir=True) as tempdir: ws = self.resolver.workspace_from_nothing(directory=tempdir) - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar1', pageId='phys_0001') - ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, ID='foobar2', pageId='phys_0002') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar1', page_id='phys_0001') + ws.add_file('GRP1', mimetype=MIMETYPE_PAGE, file_id='foobar2', page_id='phys_0002') ws.overwrite_mode = True - ws.add_file('OCR-D-OUT', mimetype=MIMETYPE_PAGE, ID='OCR-D-OUT_phys_0001', pageId='phys_0001') + ws.add_file('OCR-D-OUT', mimetype=MIMETYPE_PAGE, file_id='OCR-D-OUT_phys_0001', page_id='phys_0001') ws.overwrite_mode = False with pytest.raises(Exception) as exc: run_processor(DummyProcessorWithOutput, workspace=ws, diff --git a/tests/validator/test_page_validator.py b/tests/validator/test_page_validator.py index 79e92d90fa..e6aaff1523 100644 --- a/tests/validator/test_page_validator.py +++ b/tests/validator/test_page_validator.py @@ -16,9 +16,10 @@ def test_validate_err(self): PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, page_textequiv_strategy='best') # test with deprecated name with self.assertRaisesRegex(Exception, 'page_textequiv_strategy best not implemented'): - PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, strategy='best') + with self.assertWarnsRegex(DeprecationWarning, r'use page_textequiv_strategy'): + PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, strategy='best') with self.assertRaisesRegex(Exception, 'page_textequiv_consistency level superstrictest not implemented'): - PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, page_textequiv_consistency='superstrictest', strategy='first') + PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME, page_textequiv_consistency='superstrictest', 
page_textequiv_strategy='first') def test_validate_filename(self): report = PageValidator.validate(filename=FAULTY_GLYPH_PAGE_FILENAME) @@ -44,7 +45,7 @@ def test_validate_lax(self): report = PageValidator.validate(ocrd_page=ocrd_page) self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 26, '26 textequiv consistency errors - strict') - report = PageValidator.validate(ocrd_page=ocrd_page, strictness='lax') + report = PageValidator.validate(ocrd_page=ocrd_page, page_textequiv_consistency='lax') self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 1, '1 textequiv consistency errors - lax') def test_validate_multi_textequiv_first(self): @@ -89,7 +90,7 @@ def test_fix(self): ocrd_page = parse(FAULTY_GLYPH_PAGE_FILENAME, silence=True) report = PageValidator.validate(ocrd_page=ocrd_page) self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 17, '17 textequiv consistency errors') - PageValidator.validate(ocrd_page=ocrd_page, strictness='fix') + PageValidator.validate(ocrd_page=ocrd_page, page_textequiv_consistency='fix') report = PageValidator.validate(ocrd_page=ocrd_page) self.assertEqual(len([e for e in report.errors if isinstance(e, ConsistencyError)]), 0, 'no more textequiv consistency errors') From 44946baa17d1c44d9896ef35103a97e2f48a6d2a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:52:59 +0200 Subject: [PATCH 05/46] hide/test expected deprecation warnings --- tests/test_resolver.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/tests/test_resolver.py b/tests/test_resolver.py index 16dfd03d56..c2575b6086 100644 --- a/tests/test_resolver.py +++ b/tests/test_resolver.py @@ -292,20 +292,21 @@ def test_resolve_mets_arguments(): https://github.com/OCR-D/core/issues/517 """ resolver = Resolver() - assert resolver.resolve_mets_arguments(None, None, None, None) == (str(Path.cwd()), str(Path.cwd() / 
'mets.xml'), 'mets.xml', None) - assert resolver.resolve_mets_arguments('/', None, 'mets.xml', None) == ('/', '/mets.xml', 'mets.xml', None) - assert resolver.resolve_mets_arguments('/foo', '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) - assert resolver.resolve_mets_arguments(None, '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) - assert resolver.resolve_mets_arguments('/foo', 'foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) - assert resolver.resolve_mets_arguments('/foo', 'http://bar/foo.xml', None, None) == ('/foo', 'http://bar/foo.xml', 'foo.xml', None) - with pytest.raises(ValueError, match="Use either --mets or --mets-basename, not both"): - resolver.resolve_mets_arguments('/', '/foo/bar', 'foo.xml', None) - with pytest.raises(ValueError, match="inconsistent with --directory"): - resolver.resolve_mets_arguments('/foo', '/bar/foo.xml', None, None) - with pytest.warns(DeprecationWarning): - resolver.resolve_mets_arguments('/foo', None, 'not_mets.xml', None) - with pytest.raises(ValueError, match=r"--mets is an http\(s\) URL but no --directory was given"): - resolver.resolve_mets_arguments(None, 'http://bar/foo.xml', None, None) + with pytest.warns(DeprecationWarning, match='--mets-basename'): + assert resolver.resolve_mets_arguments(None, None, None, None) == (str(Path.cwd()), str(Path.cwd() / 'mets.xml'), 'mets.xml', None) + assert resolver.resolve_mets_arguments('/', None, 'mets.xml', None) == ('/', '/mets.xml', 'mets.xml', None) + assert resolver.resolve_mets_arguments('/foo', '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) + assert resolver.resolve_mets_arguments(None, '/foo/foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) + assert resolver.resolve_mets_arguments('/foo', 'foo.xml', None, None) == ('/foo', '/foo/foo.xml', 'foo.xml', None) + assert resolver.resolve_mets_arguments('/foo', 'http://bar/foo.xml', None, None) == ('/foo', 
'http://bar/foo.xml', 'foo.xml', None) + with pytest.raises(ValueError, match="Use either --mets or --mets-basename, not both"): + resolver.resolve_mets_arguments('/', '/foo/bar', 'foo.xml', None) + with pytest.raises(ValueError, match="inconsistent with --directory"): + resolver.resolve_mets_arguments('/foo', '/bar/foo.xml', None, None) + with pytest.warns(DeprecationWarning): + resolver.resolve_mets_arguments('/foo', None, 'not_mets.xml', None) + with pytest.raises(ValueError, match=r"--mets is an http\(s\) URL but no --directory was given"): + resolver.resolve_mets_arguments(None, 'http://bar/foo.xml', None, None) if __name__ == '__main__': main(__file__) From d0962d67ee2e5da332ff0385e417925ab1581481 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 13 Aug 2024 23:53:25 +0200 Subject: [PATCH 06/46] improve output in case of assertion failures --- tests/cli/test_validate.py | 22 ++++++++++----------- tests/validator/test_ocrd_tool_validator.py | 4 ++-- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/tests/cli/test_validate.py b/tests/cli/test_validate.py index 36ee3e5995..bf74a84c59 100644 --- a/tests/cli/test_validate.py +++ b/tests/cli/test_validate.py @@ -57,24 +57,24 @@ def test_validate_ocrd_tool(self): json_path.write_text(OCRD_TOOL) # normal call - code, _, _ = self.invoke_cli(validate_cli, ['tool-json', str(json_path)]) - self.assertEqual(code, 0) + code, _, err = self.invoke_cli(validate_cli, ['tool-json', str(json_path)]) + self.assertEqual(code, 0, err) # relative path with pushd_popd(tempdir): - code, _, _ = self.invoke_cli(validate_cli, ['tool-json', 'ocrd-tool.json']) - self.assertEqual(code, 0) + code, _, err = self.invoke_cli(validate_cli, ['tool-json', 'ocrd-tool.json']) + self.assertEqual(code, 0, err) # default path with pushd_popd(tempdir): - code, _, _ = self.invoke_cli(validate_cli, ['tool-json']) - self.assertEqual(code, 0) + code, _, err = self.invoke_cli(validate_cli, ['tool-json']) + self.assertEqual(code, 0, 
err) def test_validate_parameter(self): with TemporaryDirectory() as tempdir: json_path = Path(tempdir, 'ocrd-tool.json') json_path.write_text(OCRD_TOOL) with pushd_popd(tempdir): - code, _, _ = self.invoke_cli(validate_cli, ['parameters', 'ocrd-tool.json', 'ocrd-xyz', dumps({"baz": "foo"})]) - self.assertEqual(code, 0) + code, _, err = self.invoke_cli(validate_cli, ['parameters', 'ocrd-tool.json', 'ocrd-xyz', dumps({"baz": "foo"})]) + self.assertEqual(code, 0, err) def test_validate_page(self): page_path = assets.path_to('glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS.xml') @@ -84,11 +84,11 @@ def test_validate_page(self): def test_validate_tasks(self): # simple - code, _, _ = self.invoke_cli(validate_cli, ['tasks', + code, _, err = self.invoke_cli(validate_cli, ['tasks', "sample-processor-required-param -I FOO -O OUT1 -p '{\"param1\": true}'", "sample-processor-required-param -I FOO -O OUT2 -p '{\"param1\": true}'", ]) - self.assertEqual(code, 0) + self.assertEqual(code, 0, err) # with workspace code, out, err = self.invoke_cli(validate_cli, ['tasks', '--workspace', assets.path_to('kant_aufklaerung_1784/data'), @@ -96,7 +96,7 @@ def test_validate_tasks(self): "sample-processor-required-param -I OCR-D-IMG,OCR-D-GT-PAGE -O OUT2 -p '{\"param1\": true}'", ]) print('code=%s out=%s err=%s' % (code, out, err)) - self.assertEqual(code, 0) + self.assertEqual(code, 0, err) if __name__ == '__main__': diff --git a/tests/validator/test_ocrd_tool_validator.py b/tests/validator/test_ocrd_tool_validator.py index 3ad40d8645..6d4616c2db 100644 --- a/tests/validator/test_ocrd_tool_validator.py +++ b/tests/validator/test_ocrd_tool_validator.py @@ -29,7 +29,7 @@ def setUp(self): def test_smoke(self): report = OcrdToolValidator.validate(self.ocrd_tool) - self.assertEqual(report.is_valid, True) + self.assertTrue(report.is_valid, report) def test_additional_props(self): self.ocrd_tool['not-allowed'] = 'YUP' @@ -48,7 +48,7 @@ def test_file_param_ok(self): ocrd_tool = 
json.loads(skeleton) ocrd_tool['tools']['ocrd-xyz']['parameters'] = {"file-param": {"description": "...", "type": "string", "content-type": 'application/rdf+xml'}} report = OcrdToolValidator.validate(ocrd_tool) - self.assertEqual(report.is_valid, True) + self.assertTrue(report.is_valid, report) # Not restricted anymore since spec 3.3.0 # def test_file_param_bad_content_types(self): From 061f0231a148f09943d1c5ee35f456ad502f2755 Mon Sep 17 00:00:00 2001 From: kba Date: Thu, 15 Aug 2024 17:34:43 +0200 Subject: [PATCH 07/46] allow "from ocrd_models import OcrdPage --- src/ocrd_models/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ocrd_models/__init__.py b/src/ocrd_models/__init__.py index a89ee1dec8..330fefe97d 100644 --- a/src/ocrd_models/__init__.py +++ b/src/ocrd_models/__init__.py @@ -5,5 +5,6 @@ from .ocrd_exif import OcrdExif from .ocrd_file import OcrdFile, ClientSideOcrdFile from .ocrd_mets import OcrdMets +from .ocrd_page import OcrdPage from .ocrd_xml_base import OcrdXmlDocument from .report import ValidationReport From d2f92d1e4814d810d10b5d31a63f730568c11e29 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 16 Aug 2024 15:13:58 +0200 Subject: [PATCH 08/46] ocrd_utils: forgot to export scale_coordinates at toplvl --- src/ocrd_utils/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/ocrd_utils/__init__.py b/src/ocrd_utils/__init__.py index b5bbcae121..836f01dce4 100644 --- a/src/ocrd_utils/__init__.py +++ b/src/ocrd_utils/__init__.py @@ -13,6 +13,7 @@ :py:meth:`ocrd.workspace.Workspace.image_from_segment`.) 
* :py:func:`rotate_coordinates`, + :py:func:`scale_coordinates`, :py:func:`shift_coordinates`, :py:func:`transpose_coordinates`, :py:func:`transform_coordinates` @@ -148,6 +149,7 @@ polygon_mask, rotate_coordinates, rotate_image, + scale_coordinates, shift_coordinates, transform_coordinates, transpose_coordinates, From c6c5c42a1d37478a6c8a4c43b5fd61c69249f7b5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 16 Aug 2024 15:01:19 +0200 Subject: [PATCH 09/46] fix imports --- src/ocrd/decorators/parameter_option.py | 2 +- src/ocrd/workspace.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ocrd/decorators/parameter_option.py b/src/ocrd/decorators/parameter_option.py index 0fbe3e0577..55abbc2a53 100644 --- a/src/ocrd/decorators/parameter_option.py +++ b/src/ocrd/decorators/parameter_option.py @@ -1,10 +1,10 @@ from click import option -#from ocrd_utils import parse_json_string_or_file __all__ = ['parameter_option', 'parameter_override_option'] def _handle_param_option(ctx, param, value): + from ocrd_utils import parse_json_string_or_file return parse_json_string_or_file(*list(value)) parameter_option = option('-p', '--parameter', diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index ff856011be..b4795f3e89 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -24,6 +24,7 @@ coordinates_of_segment, adjust_canvas_to_rotation, adjust_canvas_to_transposition, + scale_coordinates, shift_coordinates, rotate_coordinates, transform_coordinates, From 245778c74a373c07a007d5deb982197d0b22d569 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Tue, 20 Aug 2024 08:05:24 +0200 Subject: [PATCH 10/46] Processor.zip_input_files: warning instead of exception for missing input files --- src/ocrd/processor/base.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 8303413933..5113faf3da 100644 --- a/src/ocrd/processor/base.py +++ 
b/src/ocrd/processor/base.py @@ -377,16 +377,9 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): pageId=self.page_id, fileGrp=ifg, mimetype=mimetype), # sort by MIME type so PAGE comes before images key=lambda file_: file_.mimetype) - # Warn if no files found but pageId was specified because that - # might be because of invalid page_id (range) - if self.page_id and not files_: - msg = (f"Could not find any files for --page-id {self.page_id} - " - f"compare '{self.page_id}' with the output of 'orcd workspace list-page'.") - if on_error == 'abort': - raise ValueError(msg) - LOG.warning(msg) for file_ in files_: if not file_.pageId: + # ignore document-global files continue ift = pages.setdefault(file_.pageId, [None]*len(ifgs)) if ift[i]: @@ -431,13 +424,15 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): else: LOG.debug("adding file %s for page %s to input file group %s", file_.ID, file_.pageId, ifg) ift[i] = file_ + # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range) + if self.page_id and not any(pages): + LOG.critical(f"Could not find any files for selected pageId {self.page_id}") ifts = list() for page, ifiles in pages.items(): for i, ifg in enumerate(ifgs): if not ifiles[i]: # other fallback options? 
- LOG.error('found no page %s in file group %s', - page, ifg) + LOG.error(f'Found no page {page} in file group {ifg}') if ifiles[0] or not require_first: ifts.append(tuple(ifiles)) return ifts From 1f7b57fc70fe26cb5399db54edb4a4748184327d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Wed, 21 Aug 2024 18:05:38 +0200 Subject: [PATCH 11/46] Processor.zip_input_files: more verbose log msg Co-authored-by: Konstantin Baierer --- src/ocrd/processor/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/processor/base.py b/src/ocrd/processor/base.py index 5113faf3da..9e5f5aead6 100644 --- a/src/ocrd/processor/base.py +++ b/src/ocrd/processor/base.py @@ -426,7 +426,7 @@ def zip_input_files(self, require_first=True, mimetype=None, on_error='skip'): ift[i] = file_ # Warn if no files found but pageId was specified, because that might be due to invalid page_id (range) if self.page_id and not any(pages): - LOG.critical(f"Could not find any files for selected pageId {self.page_id}") + LOG.critical(f"Could not find any files for selected pageId {self.page_id}.\ncompare '{self.page_id}' with the output of 'orcd workspace list-page'.") ifts = list() for page, ifiles in pages.items(): for i, ifg in enumerate(ifgs): From 35bdb39773dd26d238d00c00f9d3f7c2c711ac4a Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 21 Aug 2024 22:28:29 +0200 Subject: [PATCH 12/46] tests report.is_valid: improve output on failure --- tests/cli/test_validate.py | 23 +++++++++---------- tests/validator/test_json_validator.py | 6 ++--- tests/validator/test_ocrd_tool_validator.py | 4 ++-- tests/validator/test_parameter_validator.py | 2 +- .../validator/test_resource_list_validator.py | 3 +-- tests/validator/test_xsd_validator.py | 8 +++---- 6 files changed, 22 insertions(+), 24 deletions(-) diff --git a/tests/cli/test_validate.py b/tests/cli/test_validate.py index bf74a84c59..cc58df6540 100644 --- 
a/tests/cli/test_validate.py +++ b/tests/cli/test_validate.py @@ -57,24 +57,24 @@ def test_validate_ocrd_tool(self): json_path.write_text(OCRD_TOOL) # normal call - code, _, err = self.invoke_cli(validate_cli, ['tool-json', str(json_path)]) - self.assertEqual(code, 0, err) + code, out, err = self.invoke_cli(validate_cli, ['tool-json', str(json_path)]) + self.assertEqual(code, 0, out + err) # relative path with pushd_popd(tempdir): - code, _, err = self.invoke_cli(validate_cli, ['tool-json', 'ocrd-tool.json']) - self.assertEqual(code, 0, err) + code, out, err = self.invoke_cli(validate_cli, ['tool-json', 'ocrd-tool.json']) + self.assertEqual(code, 0, out + err) # default path with pushd_popd(tempdir): - code, _, err = self.invoke_cli(validate_cli, ['tool-json']) - self.assertEqual(code, 0, err) + code, out, err = self.invoke_cli(validate_cli, ['tool-json']) + self.assertEqual(code, 0, out + err) def test_validate_parameter(self): with TemporaryDirectory() as tempdir: json_path = Path(tempdir, 'ocrd-tool.json') json_path.write_text(OCRD_TOOL) with pushd_popd(tempdir): - code, _, err = self.invoke_cli(validate_cli, ['parameters', 'ocrd-tool.json', 'ocrd-xyz', dumps({"baz": "foo"})]) - self.assertEqual(code, 0, err) + code, out, err = self.invoke_cli(validate_cli, ['parameters', 'ocrd-tool.json', 'ocrd-xyz', dumps({"baz": "foo"})]) + self.assertEqual(code, 0, out + err) def test_validate_page(self): page_path = assets.path_to('glyph-consistency/data/OCR-D-GT-PAGE/FAULTY_GLYPHS.xml') @@ -84,19 +84,18 @@ def test_validate_page(self): def test_validate_tasks(self): # simple - code, _, err = self.invoke_cli(validate_cli, ['tasks', + code, out, err = self.invoke_cli(validate_cli, ['tasks', "sample-processor-required-param -I FOO -O OUT1 -p '{\"param1\": true}'", "sample-processor-required-param -I FOO -O OUT2 -p '{\"param1\": true}'", ]) - self.assertEqual(code, 0, err) + self.assertEqual(code, 0, out + err) # with workspace code, out, err = self.invoke_cli(validate_cli, 
['tasks', '--workspace', assets.path_to('kant_aufklaerung_1784/data'), "sample-processor-required-param -I OCR-D-IMG,OCR-D-GT-PAGE -O OUT1 -p '{\"param1\": true}'", "sample-processor-required-param -I OCR-D-IMG,OCR-D-GT-PAGE -O OUT2 -p '{\"param1\": true}'", ]) - print('code=%s out=%s err=%s' % (code, out, err)) - self.assertEqual(code, 0, err) + self.assertEqual(code, 0, out + err) if __name__ == '__main__': diff --git a/tests/validator/test_json_validator.py b/tests/validator/test_json_validator.py index 8a8387d4b6..bd756879bc 100644 --- a/tests/validator/test_json_validator.py +++ b/tests/validator/test_json_validator.py @@ -20,18 +20,18 @@ def setUp(self): def test_validate_string(self): report = JsonValidator.validate('{}', {}) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) def test_defaults_set(self): obj = {'bar': 2000} report = self.defaults_validator._validate(obj) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) self.assertEqual(obj, {'foo': 3000, 'bar': 2000}) def test_properr(self): obj = {'bar': 100, 'quux': {}} report = self.defaults_validator._validate(obj) - self.assertFalse(report.is_valid) + self.assertFalse(report.is_valid, str(report.to_xml())) self.assertEqual(len(report.errors), 1) diff --git a/tests/validator/test_ocrd_tool_validator.py b/tests/validator/test_ocrd_tool_validator.py index 6d4616c2db..70d40c2f2a 100644 --- a/tests/validator/test_ocrd_tool_validator.py +++ b/tests/validator/test_ocrd_tool_validator.py @@ -29,7 +29,7 @@ def setUp(self): def test_smoke(self): report = OcrdToolValidator.validate(self.ocrd_tool) - self.assertTrue(report.is_valid, report) + self.assertTrue(report.is_valid, str(report.to_xml())) def test_additional_props(self): self.ocrd_tool['not-allowed'] = 'YUP' @@ -48,7 +48,7 @@ def test_file_param_ok(self): ocrd_tool = json.loads(skeleton) ocrd_tool['tools']['ocrd-xyz']['parameters'] = {"file-param": {"description": "...", 
"type": "string", "content-type": 'application/rdf+xml'}} report = OcrdToolValidator.validate(ocrd_tool) - self.assertTrue(report.is_valid, report) + self.assertTrue(report.is_valid, str(report.to_xml())) # Not restricted anymore since spec 3.3.0 # def test_file_param_bad_content_types(self): diff --git a/tests/validator/test_parameter_validator.py b/tests/validator/test_parameter_validator.py index f0d9d41d2c..297a149064 100644 --- a/tests/validator/test_parameter_validator.py +++ b/tests/validator/test_parameter_validator.py @@ -42,7 +42,7 @@ def test_default_assignment(self): }) obj = {'baz': '23'} report = validator.validate(obj) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) self.assertEqual(obj, {'baz': '23', "num-param": 1}) def test_min_max(): diff --git a/tests/validator/test_resource_list_validator.py b/tests/validator/test_resource_list_validator.py index eb95d9b1ea..cc63c30ea7 100644 --- a/tests/validator/test_resource_list_validator.py +++ b/tests/validator/test_resource_list_validator.py @@ -22,8 +22,7 @@ def reslist(): def test_resource_list_validator(reslist): report = OcrdResourceListValidator.validate(reslist) - print(report.errors) - assert report.is_valid == True + assert report.is_valid, str(report.to_xml()) if __name__ == '__main__': main(__file__) diff --git a/tests/validator/test_xsd_validator.py b/tests/validator/test_xsd_validator.py index d0150338dd..50b3851ffc 100644 --- a/tests/validator/test_xsd_validator.py +++ b/tests/validator/test_xsd_validator.py @@ -37,22 +37,22 @@ def test_mets_empty(self): def test_validate_simple_protected_str(self): val = XsdValidator(XSD_METS_URL) report = val._validate(self.ws.mets.to_xml()) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) def test_validate_simple_protected_doc(self): val = XsdValidator(XSD_METS_URL) report = val._validate(self.ws.mets._tree) - self.assertTrue(report.is_valid) + 
self.assertTrue(report.is_valid, str(report.to_xml())) def test_validate_simple_static_doc(self): report = XsdValidator.validate(XSD_METS_URL, self.ws.mets._tree) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) class TestXsdPageValidator(TestCase): def test_validate_page_simple_static_doc(self): report = XsdPageValidator.validate(simple_page) - self.assertTrue(report.is_valid) + self.assertTrue(report.is_valid, str(report.to_xml())) if __name__ == '__main__': main(__file__) From e595996d91ae05577cbd3bc133c2f2429d462ff2 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 11:49:08 +0200 Subject: [PATCH 13/46] fix --log-filename (6fc606027a): apply in ocrd_cli_wrap_processor --- src/ocrd/decorators/__init__.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/ocrd/decorators/__init__.py b/src/ocrd/decorators/__init__.py index 7c2dd9717c..464bb67ed8 100644 --- a/src/ocrd/decorators/__init__.py +++ b/src/ocrd/decorators/__init__.py @@ -1,4 +1,5 @@ import sys +from contextlib import nullcontext from ocrd_utils import ( config, @@ -9,6 +10,7 @@ parse_json_string_with_comments, set_json_key_value_overrides, parse_json_string_or_file, + redirect_stderr_and_stdout_to_file, ) from ocrd_validators import WorkspaceValidator from ocrd_network import ProcessingWorker, ProcessorServer, AgentType @@ -141,7 +143,7 @@ def resolve(name): print("Profiling...") pr = cProfile.Profile() pr.enable() - def exit(): + def goexit(): pr.disable() print("Profiling completed") if profile_file: @@ -150,8 +152,13 @@ def exit(): s = io.StringIO() pstats.Stats(pr, stream=s).sort_stats("cumulative").print_stats() print(s.getvalue()) - atexit.register(exit) - run_processor(processorClass, mets_url=mets, workspace=workspace, **kwargs) + atexit.register(goexit) + if log_filename: + log_ctx = redirect_stderr_and_stdout_to_file(log_filename) + else: + log_ctx = nullcontext() + with log_ctx: + 
run_processor(processorClass, mets_url=mets, workspace=workspace, **kwargs) def check_and_run_network_agent(ProcessorClass, subcommand: str, address: str, database: str, queue: str): From f21b8d24eaa8320b2ff1c405355ce0b40f116256 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 11:54:07 +0200 Subject: [PATCH 14/46] fix exception --- src/ocrd/resource_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index 44bbd081bc..e63c5fd015 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -248,7 +248,7 @@ def _download_impl(url, filename, progress_cb=None, size=None): if "Content-Disposition" not in r.headers: url = get_url_from_gdrive_confirmation(r.text) except RuntimeError as e: - log.warning("Cannot unwrap Google Drive URL: ", e) + log.warning("Cannot unwrap Google Drive URL: %s", e) with open(filename, 'wb') as f: with requests.get(url, stream=True) as r: r.raise_for_status() From 0cbd3ea906e8c93f940e012f3f7383a1a372c135 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 12:27:33 +0200 Subject: [PATCH 15/46] adapt to PIL.Image moved constants --- src/ocrd/workspace.py | 8 +++---- src/ocrd_utils/image.py | 50 ++++++++++++++++++++--------------------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index b4795f3e89..8b8e89bfca 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -1151,9 +1151,9 @@ def _reflect(log, name, orientation, segment_image, segment_coords, segment_xywh # Transpose in affine coordinate transform: # (consistent with image transposition or AlternativeImage below) transposition = { - 90: Image.ROTATE_90, - 180: Image.ROTATE_180, - 270: Image.ROTATE_270 + 90: Image.Transpose.ROTATE_90, + 180: Image.Transpose.ROTATE_180, + 270: Image.Transpose.ROTATE_270 }.get(orientation) # no default segment_coords['transform'] = 
transpose_coordinates( segment_coords['transform'], transposition, @@ -1221,5 +1221,5 @@ def _scale(log, name, factor, segment_image, segment_coords, segment_xywh, **kwa segment_image = segment_image.resize((int(segment_image.width * factor), int(segment_image.height * factor)), # slowest, but highest quality: - Image.BICUBIC) + Image.Resampling.BICUBIC) return segment_image, segment_coords, segment_xywh diff --git a/src/ocrd_utils/image.py b/src/ocrd_utils/image.py index 3bc14e6612..6f2524608c 100644 --- a/src/ocrd_utils/image.py +++ b/src/ocrd_utils/image.py @@ -65,10 +65,10 @@ def adjust_canvas_to_transposition(size, method): Return a numpy array of the enlarged width and height. """ - if method in [Image.ROTATE_90, - Image.ROTATE_270, - Image.TRANSPOSE, - Image.TRANSVERSE]: + if method in [Image.Transpose.ROTATE_90, + Image.Transpose.ROTATE_270, + Image.Transpose.TRANSPOSE, + Image.Transpose.TRANSVERSE]: size = size[::-1] return size @@ -348,26 +348,26 @@ def transpose_coordinates(transform, method, orig=np.array([0, 0])): calculate the affine coordinate transform corresponding to the composition of both transformations, which is respectively: - - ``PIL.Image.FLIP_LEFT_RIGHT``: + - ``PIL.Image.Transpose.FLIP_LEFT_RIGHT``: entails translation to the center, followed by pure reflection about the y-axis, and subsequent translation back - - ``PIL.Image.FLIP_TOP_BOTTOM``: + - ``PIL.Image.Transpose.FLIP_TOP_BOTTOM``: entails translation to the center, followed by pure reflection about the x-axis, and subsequent translation back - - ``PIL.Image.ROTATE_180``: + - ``PIL.Image.Transpose.ROTATE_180``: entails translation to the center, followed by pure reflection about the origin, and subsequent translation back - - ``PIL.Image.ROTATE_90``: + - ``PIL.Image.Transpose.ROTATE_90``: entails translation to the center, followed by pure rotation by 90° counter-clockwise, and subsequent translation back - - ``PIL.Image.ROTATE_270``: + - ``PIL.Image.Transpose.ROTATE_270``: entails 
translation to the center, followed by pure rotation by 270° counter-clockwise, and subsequent translation back - - ``PIL.Image.TRANSPOSE``: + - ``PIL.Image.Transpose.TRANSPOSE``: entails translation to the center, followed by pure rotation by 90° counter-clockwise and pure reflection about the x-axis, and subsequent translation back - - ``PIL.Image.TRANSVERSE``: + - ``PIL.Image.Transpose.TRANSVERSE``: entails translation to the center, followed by pure rotation by 90° counter-clockwise and pure reflection about the y-axis, and subsequent translation back @@ -388,13 +388,13 @@ def transpose_coordinates(transform, method, orig=np.array([0, 0])): [0, 0, 1]]) transform = shift_coordinates(transform, -orig) operations = { - Image.FLIP_LEFT_RIGHT: [refly], - Image.FLIP_TOP_BOTTOM: [reflx], - Image.ROTATE_180: [reflx, refly], - Image.ROTATE_90: [rot90], - Image.ROTATE_270: [rot90, reflx, refly], - Image.TRANSPOSE: [rot90, reflx], - Image.TRANSVERSE: [rot90, refly] + Image.Transpose.FLIP_LEFT_RIGHT: [refly], + Image.Transpose.FLIP_TOP_BOTTOM: [reflx], + Image.Transpose.ROTATE_180: [reflx, refly], + Image.Transpose.ROTATE_90: [rot90], + Image.Transpose.ROTATE_270: [rot90, reflx, refly], + Image.Transpose.TRANSPOSE: [rot90, reflx], + Image.Transpose.TRANSVERSE: [rot90, refly] }.get(method) # no default for operation in operations: transform = np.dot(operation, transform) @@ -411,29 +411,29 @@ def transpose_image(image, method): Given a PIL.Image ``image`` and a transposition mode ``method``, apply the respective operation: - - ``PIL.Image.FLIP_LEFT_RIGHT``: + - ``PIL.Image.Transpose.FLIP_LEFT_RIGHT``: all pixels get mirrored at half the width of the image - - ``PIL.Image.FLIP_TOP_BOTTOM``: + - ``PIL.Image.Transpose.FLIP_TOP_BOTTOM``: all pixels get mirrored at half the height of the image - - ``PIL.Image.ROTATE_180``: + - ``PIL.Image.Transpose.ROTATE_180``: all pixels get mirrored at both, the width and half the height of the image, i.e. 
the image gets rotated by 180° counter-clockwise - - ``PIL.Image.ROTATE_90``: + - ``PIL.Image.Transpose.ROTATE_90``: rows become columns (but counted from the right) and columns become rows, i.e. the image gets rotated by 90° counter-clockwise; width becomes height and vice versa - - ``PIL.Image.ROTATE_270``: + - ``PIL.Image.Transpose.ROTATE_270``: rows become columns and columns become rows (but counted from the bottom), i.e. the image gets rotated by 270° counter-clockwise; width becomes height and vice versa - - ``PIL.Image.TRANSPOSE``: + - ``PIL.Image.Transpose.TRANSPOSE``: rows become columns and vice versa, i.e. all pixels get mirrored at the main diagonal; width becomes height and vice versa - - ``PIL.Image.TRANSVERSE``: + - ``PIL.Image.Transpose.TRANSVERSE``: rows become columns (but counted from the right) and columns become rows (but counted from the bottom), i.e. all pixels get mirrored at the opposite diagonal; From 8f8912c14dcccdc485d03e94efe33d9097fcdb78 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 12:31:35 +0200 Subject: [PATCH 16/46] cli.workspace: pass fileGrp as well, improve description --- src/ocrd/cli/workspace.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 0c70fd3a36..062a373608 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -118,7 +118,7 @@ def workspace_validate(ctx, mets_url, download, skip, page_textequiv_consistency @workspace_cli.command('clone', cls=command_with_replaced_help( (r' \[WORKSPACE_DIR\]', ''))) # XXX deprecated argument @click.option('-f', '--clobber-mets', help="Overwrite existing METS file", default=False, is_flag=True) -@click.option('-a', '--download', is_flag=True, help="Download all files and change location in METS file after cloning") +@click.option('-a', '--download', is_flag=True, help="Download all selected files and add local path references in METS file afterwards") 
@click.argument('mets_url') @mets_find_options # XXX deprecated @@ -129,8 +129,10 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim Create a workspace from METS_URL and return the directory METS_URL can be a URL, an absolute path or a path relative to $PWD. - If METS_URL is not provided, use --mets accordingly. METS_URL can also be an OAI-PMH GetRecord URL wrapping a METS file. + + Additional options pertain to the selection of files / fileGrps / pages + to be downloaded, if --download is used. """ LOG = getLogger('ocrd.cli.workspace.clone') if workspace_dir: @@ -143,6 +145,7 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim mets_basename=ctx.mets_basename, clobber_mets=clobber_mets, download=download, + fileGrp=file_grp, ID=file_id, pageId=page_id, mimetype=mimetype, @@ -407,7 +410,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi if dry_run: log.info('workspace.add_file(%s)' % file_dict) else: - workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict) + workspace.add_file(fileGrp, ignore=ignore, force=force, **file_dict) # pylint: disable=redundant-keyword-arg # save changes to disk workspace.save_mets() From 6dccfb388209a7e14b61a46e139ad07e72926c3f Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 12:35:37 +0200 Subject: [PATCH 17/46] OcrdMets.add_agent: does not have positional args --- src/ocrd/mets_server.py | 2 +- src/ocrd_models/ocrd_mets.py | 4 ++-- tests/model/test_ocrd_mets.py | 2 +- tests/test_workspace.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index da6e873c06..7c22da278d 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -236,7 +236,7 @@ def agents(self): agent_dict["_type"] = agent_dict.pop("type") return [ClientSideOcrdAgent(None, **agent_dict) for agent_dict in agent_dicts] - def add_agent(self, *args, **kwargs): + 
def add_agent(self, **kwargs): if not self.multiplexing_mode: return self.session.request("POST", f"{self.url}/agent", json=OcrdAgentModel.create(**kwargs).dict()) else: diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index d6da3e1cda..66251a54dc 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -198,7 +198,7 @@ def agents(self) -> List[OcrdAgent]: """ return [OcrdAgent(el_agent) for el_agent in self._tree.getroot().findall('mets:metsHdr/mets:agent', NS)] - def add_agent(self, *args, **kwargs) -> OcrdAgent: + def add_agent(self, **kwargs) -> OcrdAgent: """ Add an :py:class:`ocrd_models.ocrd_agent.OcrdAgent` to the list of agents in the ``metsHdr``. """ @@ -213,7 +213,7 @@ def add_agent(self, *args, **kwargs) -> OcrdAgent: el_agent_last.addnext(el_agent) except StopIteration: el_metsHdr.insert(0, el_agent) - return OcrdAgent(el_agent, *args, **kwargs) + return OcrdAgent(el_agent, **kwargs) @property def file_groups(self) -> List[str]: diff --git a/tests/model/test_ocrd_mets.py b/tests/model/test_ocrd_mets.py index 739db7625a..89742a507e 100644 --- a/tests/model/test_ocrd_mets.py +++ b/tests/model/test_ocrd_mets.py @@ -248,7 +248,7 @@ def test_file_pageid(sbb_sample_01): def test_agent(sbb_sample_01): beforelen = len(sbb_sample_01.agents) - sbb_sample_01.add_agent('foo bar v0.0.1', 'OTHER', 'OTHER', 'YETOTHERSTILL') + sbb_sample_01.add_agent(name='foo bar v0.0.1', _type='OTHER', othertype='OTHER', role='YETOTHERSTILL') assert len(sbb_sample_01.agents) == beforelen + 1 def test_metshdr(): diff --git a/tests/test_workspace.py b/tests/test_workspace.py index c8df9b444b..75e9b6886f 100644 --- a/tests/test_workspace.py +++ b/tests/test_workspace.py @@ -734,7 +734,7 @@ def _fixture_metsDocumentID(tmp_path): def test_agent_before_metsDocumentID(workspace_metsDocumentID): report = WorkspaceValidator.validate(Resolver(), mets_url=workspace_metsDocumentID.mets_target) assert report.is_valid - 
workspace_metsDocumentID.mets.add_agent('foo bar v0.0.1', 'OTHER', 'OTHER', 'OTHER') + workspace_metsDocumentID.mets.add_agent(name='foo bar v0.0.1', _type='OTHER', othertype='OTHER', role='OTHER') workspace_metsDocumentID.save_mets() report = WorkspaceValidator.validate(Resolver(), mets_url=workspace_metsDocumentID.mets_target) print(report.errors) From 2d85f14d00bd112553e6ee4a0751436e8d1131f7 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 24 Aug 2024 13:15:13 +0200 Subject: [PATCH 18/46] update pylintrc --- .pylintrc | 18 ++++++++---------- src/ocrd/resource_manager.py | 4 ++++ 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/.pylintrc b/.pylintrc index b2125d824c..a4106a1bb7 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,19 +1,21 @@ [MASTER] -extension-pkg-whitelist=lxml -ignored-modules=cv2,tesserocr,ocrd.model +extension-pkg-whitelist=lxml,pydantic +ignored-modules=cv2,tesserocr,ocrd_models.ocrd_page_generateds +ignore-patterns=.*generateds.* [MESSAGES CONTROL] -ignore-patterns='.*generateds.*' disable = fixme, - E501, + line-too-long, + consider-using-f-string, + logging-fstring-interpolation, trailing-whitespace, logging-not-lazy, inconsistent-return-statements, + disallowed-name, invalid-name, line-too-long, missing-docstring, - no-self-use, wrong-import-order, too-many-nested-blocks, superfluous-parens, @@ -25,13 +27,9 @@ disable = ungrouped-imports, useless-object-inheritance, useless-import-alias, - bad-continuation, no-else-return, logging-not-lazy -[FORMAT] -no-space-check=empty-line - [DESIGN] # Maximum number of arguments for function / method max-args=12 @@ -40,7 +38,7 @@ max-locals=30 # Maximum number of return / yield for function / method body max-returns=12 # Maximum number of branch for function / method body -max-branchs=30 +max-branches=30 # Maximum number of statements in function / method body max-statements=60 # Maximum number of parents for a class (see R0901). 
diff --git a/src/ocrd/resource_manager.py b/src/ocrd/resource_manager.py index e63c5fd015..1fc0409250 100644 --- a/src/ocrd/resource_manager.py +++ b/src/ocrd/resource_manager.py @@ -13,12 +13,16 @@ from gdown.download import get_url_from_gdrive_confirmation from yaml import safe_load, safe_dump +# pylint: disable=wrong-import-position + # https://github.com/OCR-D/core/issues/867 # https://stackoverflow.com/questions/50900727/skip-converting-entities-while-loading-a-yaml-string-using-pyyaml import yaml.constructor yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:timestamp'] = \ yaml.constructor.SafeConstructor.yaml_constructors[u'tag:yaml.org,2002:str'] +# pylint: enable=wrong-import-position + from ocrd_validators import OcrdResourceListValidator from ocrd_utils import getLogger, directory_size, get_moduledir, EXT_TO_MIME, nth_url_segment, guess_media_type, config from ocrd_utils.os import get_processor_resource_types, list_all_resources, pushd_popd, get_ocrd_tool_json From ea68370e223a7b8af2843ca16c0ebd8f223b6574 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 25 Aug 2024 02:18:53 +0200 Subject: [PATCH 19/46] pylint: try ignoring generateds (again) --- .pylintrc | 1 + src/ocrd/cli/ocrd_tool.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/.pylintrc b/.pylintrc index a4106a1bb7..2e3af4288b 100644 --- a/.pylintrc +++ b/.pylintrc @@ -1,6 +1,7 @@ [MASTER] extension-pkg-whitelist=lxml,pydantic ignored-modules=cv2,tesserocr,ocrd_models.ocrd_page_generateds +ignore-paths=ocrd_page_generateds.py ignore-patterns=.*generateds.* [MESSAGES CONTROL] diff --git a/src/ocrd/cli/ocrd_tool.py b/src/ocrd/cli/ocrd_tool.py index 2a7fa99ec9..3c024ec668 100644 --- a/src/ocrd/cli/ocrd_tool.py +++ b/src/ocrd/cli/ocrd_tool.py @@ -29,6 +29,8 @@ def __init__(self, filename): self.filename = filename with codecs.open(filename, encoding='utf-8') as f: self.content = f.read() + # perhaps the validator should _always_ run (for default expansion) + # so 
validate command only for the report? self.json = loads(self.content) pass_ocrd_tool = click.make_pass_decorator(OcrdToolCtx) From 18ac2c0ab954268811a2ed8654cafc44924e01a4 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Aug 2024 15:11:49 +0200 Subject: [PATCH 20/46] ClientSideOcrdMets: use same logger name prefix as server --- src/ocrd/mets_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 7c22da278d..9b66871349 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -120,7 +120,7 @@ class ClientSideOcrdMets: def __init__(self, url, workspace_path: Optional[str] = None): self.protocol = "tcp" if url.startswith("http://") else "uds" - self.log = getLogger(f"ocrd.mets_client[{url}]") + self.log = getLogger(f"ocrd.models.ocrd_mets.client.{url}") self.url = url if self.protocol == "tcp" else f'http+unix://{url.replace("/", "%2F")}' self.ws_dir_path = workspace_path if workspace_path else None From da37967357f4d1bf9076498342319fddc35db070 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Aug 2024 15:15:03 +0200 Subject: [PATCH 21/46] test_mets_server: use tmpdir to avoid side effects between suites --- tests/test_mets_server.py | 48 +++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 19 deletions(-) diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index 1487617a71..8f94b95645 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -22,13 +22,16 @@ from requests.exceptions import ConnectionError from ocrd import Resolver, OcrdMetsServer, Workspace -from ocrd_utils import pushd_popd, MIMETYPE_PAGE +from ocrd_utils import pushd_popd, MIMETYPE_PAGE, initLogging, setOverrideLogLevel -WORKSPACE_DIR = '/tmp/ocrd-mets-server' TRANSPORTS = ['/tmp/ocrd-mets-server.sock', 'http://127.0.0.1:12345'] +initLogging() +setOverrideLogLevel(10) + @fixture(scope='function', name='start_mets_server', 
params=TRANSPORTS) -def fixture_start_mets_server(request) -> Iterable[Tuple[str, Workspace]]: +def fixture_start_mets_server(request, tmpdir) -> Iterable[Tuple[str, Workspace]]: + tmpdir = str(tmpdir) def _start_mets_server(*args, **kwargs): mets_server = OcrdMetsServer(*args, **kwargs) mets_server.startup() @@ -39,21 +42,22 @@ def _start_mets_server(*args, **kwargs): if exists(mets_server_url): remove(mets_server_url) - if exists(WORKSPACE_DIR): - rmtree(WORKSPACE_DIR, ignore_errors=True) + if exists(tmpdir): + rmtree(tmpdir, ignore_errors=True) - copytree(assets.path_to('SBB0000F29300010000/data'), WORKSPACE_DIR) - workspace = Workspace(Resolver(), WORKSPACE_DIR) + copytree(assets.path_to('SBB0000F29300010000/data'), tmpdir) + workspace = Workspace(Resolver(), tmpdir) p = Process(target=_start_mets_server, kwargs={'workspace': workspace, 'url': request.param}) p.start() sleep(1) # sleep to start up server - yield mets_server_url, Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url) + workspace_server = Workspace(Resolver(), tmpdir, mets_server_url=mets_server_url) + yield mets_server_url, workspace_server p.terminate() - rmtree(WORKSPACE_DIR, ignore_errors=True) + rmtree(tmpdir, ignore_errors=True) def add_file_server(x): - mets_server_url, i = x - workspace_server = Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url) + mets_server_url, directory, i = x + workspace_server = Workspace(Resolver(), directory, mets_server_url=mets_server_url) workspace_server.add_file( 'FOO', local_filename=f'local_filename{i}', @@ -64,8 +68,8 @@ def add_file_server(x): ) def add_agent_server(x): - mets_server_url, i = x - workspace_server = Workspace(resolver=Resolver(), directory=WORKSPACE_DIR, mets_server_url=mets_server_url) + mets_server_url, directory, i = x + workspace_server = Workspace(Resolver(), directory, mets_server_url=mets_server_url) workspace_server.mets.add_agent( name=f'proc{i}', _type='baz', 
@@ -82,7 +86,10 @@ def test_mets_server_add_file(start_mets_server): # add NO_FILES files in parallel with Pool() as pool: - pool.map(add_file_server, zip(repeat(mets_server_url), range(NO_FILES))) + pool.map(add_file_server, zip( + repeat(mets_server_url), + repeat(workspace_server.directory), + range(NO_FILES))) assert set(workspace_server.mets.file_groups) == set( [ 'OCR-D-IMG', @@ -107,7 +114,7 @@ def test_mets_server_add_file(start_mets_server): assert len(workspace_server.mets.find_all_files(fileGrp='FOO')) == NO_FILES # not yet synced - workspace_file = Workspace(Resolver(), WORKSPACE_DIR) + workspace_file = Workspace(Resolver(), workspace_server.directory) assert len(workspace_file.mets.find_all_files(fileGrp='FOO')) == 0 # sync @@ -125,13 +132,16 @@ def test_mets_server_add_agents(start_mets_server): # add NO_AGENTS agents in parallel with Pool() as pool: - pool.map(add_agent_server, zip(repeat(mets_server_url), list(range(NO_AGENTS)))) + pool.map(add_agent_server, zip( + repeat(mets_server_url), + repeat(workspace_server.directory), + list(range(NO_AGENTS)))) assert len(workspace_server.mets.agents) == NO_AGENTS + no_agents_before # XXX not a tuple assert workspace_server.mets.agents[-1].notes[0][0] == {'{https://ocr-d.de}foo': 'bar'} - workspace_file = Workspace(Resolver(), WORKSPACE_DIR) + workspace_file = Workspace(Resolver(), workspace_server.directory) assert len(workspace_file.mets.agents) == no_agents_before # sync @@ -142,7 +152,7 @@ def test_mets_server_add_agents(start_mets_server): def test_mets_server_str(start_mets_server): mets_server_url, workspace_server = start_mets_server - workspace_server = Workspace(Resolver(), WORKSPACE_DIR, mets_server_url=mets_server_url) + workspace_server = Workspace(Resolver(), workspace_server.directory, mets_server_url=mets_server_url) f = next(workspace_server.find_files()) assert str(f) == '' a = workspace_server.mets.agents[0] @@ -182,7 +192,7 @@ def test_mets_server_socket_stop(start_mets_server): assert 
True, 'No stop conditions to test for TCP server' else: assert Path(mets_server_url).exists() - assert workspace_server.mets.workspace_path == WORKSPACE_DIR + assert workspace_server.mets.workspace_path == workspace_server.directory workspace_server.mets.stop() with raises(ConnectionError): workspace_server.mets.file_groups From ccb416b13e7f91781568fda8e60ad8182bfea88c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Aug 2024 17:04:04 +0200 Subject: [PATCH 22/46] disableLogging: re-instate root logger, to --- src/ocrd_utils/logging.py | 4 +++- tests/test_decorators.py | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index bb771fc0ce..8f45f9c7fc 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -212,11 +212,13 @@ def disableLogging(silent=not config.OCRD_LOGGING_DEBUG): # logging.basicConfig(level=logging.CRITICAL) # logging.disable(logging.ERROR) # remove all handlers for the ocrd logger - for logger_name in ROOT_OCRD_LOGGERS: + for logger_name in ROOT_OCRD_LOGGERS + ['']: for handler in logging.getLogger(logger_name).handlers[:]: logging.getLogger(logger_name).removeHandler(handler) for logger_name in LOGGING_DEFAULTS: logging.getLogger(logger_name).setLevel(logging.NOTSET) + # Python default log level is WARNING + logging.root.setLevel(logging.WARNING) # Initializing stream handlers at module level # would cause message output in all runtime contexts, diff --git a/tests/test_decorators.py b/tests/test_decorators.py index 5ab2880053..df8d6422be 100644 --- a/tests/test_decorators.py +++ b/tests/test_decorators.py @@ -64,6 +64,7 @@ def test_loglevel_override(self): pytest.skip(f"ocrd_logging.conf found at {get_logging_config_files()}, skipping logging test") import logging disableLogging() + assert logging.getLogger('').getEffectiveLevel() == logging.WARNING assert logging.getLogger('ocrd').getEffectiveLevel() == logging.WARNING initLogging() assert 
logging.getLogger('ocrd').getEffectiveLevel() == logging.INFO From 7e3cdf4ec014efe5b4cddb8d9554981f9181a6d5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Wed, 28 Aug 2024 17:15:56 +0200 Subject: [PATCH 23/46] test-logging: also remove ocrd.log from tempdir --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4997066d1b..b5cd2f276e 100644 --- a/Makefile +++ b/Makefile @@ -273,7 +273,7 @@ test-logging: assets cp src/ocrd_utils/ocrd_logging.conf $$tempdir; \ cd $$tempdir; \ $(PYTHON) -m pytest --continue-on-collection-errors -k TestLogging -k TestDecorators $(TESTDIR); \ - rm -r $$tempdir/ocrd_logging.conf $$tempdir/.benchmarks; \ + rm -r $$tempdir/ocrd_logging.conf $$tempdir/ocrd.log $$tempdir/.benchmarks; \ rm -rf $$tempdir/.coverage; \ rmdir $$tempdir From 4f45b12027fb0d53301dbbf17e2dcfa5637a1497 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 00:50:34 +0200 Subject: [PATCH 24/46] bashlib: re-add --log-filename, implement as stderr redirect --- src/ocrd/lib.bash | 1 + 1 file changed, 1 insertion(+) diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash index 1e3ecfc6eb..febaf92ae6 100644 --- a/src/ocrd/lib.bash +++ b/src/ocrd/lib.bash @@ -141,6 +141,7 @@ ocrd__parse_argv () { while [[ "${1:-}" = -* ]];do case "$1" in -l|--log-level) ocrd__argv[log_level]=$2 ; shift ;; + --log-filename) exec 2> "$2" ; shift ;; -h|--help|--usage) ocrd__usage; exit ;; -J|--dump-json) ocrd__dumpjson; exit ;; -D|--dump-module-dir) echo $(dirname "$OCRD_TOOL_JSON"); exit ;; From 7b70c90957bd8fe4ccfa78328ff860cff69cc87b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 22:13:01 +0200 Subject: [PATCH 25/46] ocrd_utils.config: add reset_defaults() --- src/ocrd_utils/config.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/ocrd_utils/config.py b/src/ocrd_utils/config.py index 063af930c8..4182456435 100644 --- a/src/ocrd_utils/config.py +++ 
b/src/ocrd_utils/config.py @@ -68,14 +68,26 @@ def has_default(self, name): raise ValueError(f"Unregistered env variable {name}") return self._variables[name].has_default + def reset_defaults(self): + for name in self._variables: + try: + # we cannot use hasattr, because that delegates to getattr, + # which we override and provide defaults for (which of course + # cannot be removed) + if self.__getattribute__(name): + delattr(self, name) + except AttributeError: + pass + def describe(self, name, *args, **kwargs): if not name in self._variables: raise ValueError(f"Unregistered env variable {name}") return self._variables[name].describe(*args, **kwargs) def __getattr__(self, name): + # will be called if name is not accessible (has not been added directly yet) if not name in self._variables: - raise ValueError(f"Unregistered env variable {name}") + raise AttributeError(f"Unregistered env variable {name}") var_obj = self._variables[name] try: raw_value = self.raw_value(name) From 48bb3c2316e6838ff235a2badc985da14ee8b1b5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 29 Aug 2024 22:13:31 +0200 Subject: [PATCH 26/46] add test for OcrdEnvConfig.reset_defaults() --- tests/utils/test_config.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/utils/test_config.py b/tests/utils/test_config.py index 99595a864c..a94eb5d3cc 100644 --- a/tests/utils/test_config.py +++ b/tests/utils/test_config.py @@ -57,3 +57,11 @@ def test_OCRD_PROFILE(): with temp_env_var('OCRD_PROFILE', 'some other value'): with raises(ValueError, match="'OCRD_PROFILE' set to invalid value 'some other value'"): config.OCRD_PROFILE + +def test_defaults(): + default = config.OCRD_MAX_PROCESSOR_CACHE + print(type(default)) + config.OCRD_MAX_PROCESSOR_CACHE = 2 + assert config.OCRD_MAX_PROCESSOR_CACHE == 2 + config.reset_defaults() + assert config.OCRD_MAX_PROCESSOR_CACHE == default From ed924032cc959c15f5f6fdd5a2cb34efa4d925a6 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 1 
Sep 2024 10:14:13 +0200 Subject: [PATCH 27/46] Workspace.reload_mets: fix for METS server case --- src/ocrd/workspace.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/ocrd/workspace.py b/src/ocrd/workspace.py index 8b8e89bfca..4ef59252a0 100644 --- a/src/ocrd/workspace.py +++ b/src/ocrd/workspace.py @@ -123,7 +123,10 @@ def reload_mets(self): """ Reload METS from the filesystem. """ - self.mets = OcrdMets(filename=self.mets_target) + if self.is_remote: + self.mets.reload() + else: + self.mets = OcrdMets(filename=self.mets_target) @deprecated_alias(pageId="page_id") @deprecated_alias(ID="file_id") From 9c3c3997b5039ca68192d7046808aa5d1cfb83cf Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 2 Sep 2024 14:59:42 +0200 Subject: [PATCH 28/46] OcrdMetsServer.add_file: pass on 'force' kwarg, too --- src/ocrd/mets_server.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 9b66871349..8a18f01682 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -284,15 +284,17 @@ def add_file( file_id=ID, page_id=pageId, mimetype=mimetype, url=url, local_filename=local_filename ) + # add force+ignore + kwargs = {**kwargs, **data.dict()} if not self.multiplexing_mode: - r = self.session.request("POST", f"{self.url}/file", data=data.dict()) - if not r: - raise RuntimeError("Add file failed. 
Please check provided parameters") + r = self.session.request("POST", f"{self.url}/file", data=kwargs) + if not r.ok: + raise RuntimeError(f"Failed to add file ({str(data)}): {r.json()}") else: - r = self.session.request("POST", self.url, json=MpxReq.add_file(self.ws_dir_path, data.dict())) - if "error" in r: - raise RuntimeError(f"Add file failed: Msg: {r['error']}") + r = self.session.request("POST", self.url, json=MpxReq.add_file(self.ws_dir_path, kwargs)) + if not r.ok: + raise RuntimeError(f"Failed to add file ({str(data)}): {r.json()[errors]}") return ClientSideOcrdFile( None, fileGrp=file_grp, @@ -506,7 +508,8 @@ async def add_file( page_id: Optional[str] = Form(), mimetype: str = Form(), url: Optional[str] = Form(None), - local_filename: Optional[str] = Form(None) + local_filename: Optional[str] = Form(None), + force: bool = Form(False), ): """ Add a file @@ -518,7 +521,7 @@ async def add_file( ) # Add to workspace kwargs = file_resource.dict() - workspace.add_file(**kwargs) + workspace.add_file(**kwargs, force=force) return file_resource # ------------- # From c077e957f256c21ec46c2b18cf5881e815a55fac Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 2 Sep 2024 15:00:38 +0200 Subject: [PATCH 29/46] test_mets_server: add test for force (overwrite) --- tests/test_mets_server.py | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/tests/test_mets_server.py b/tests/test_mets_server.py index 8f94b95645..dc94d6c560 100644 --- a/tests/test_mets_server.py +++ b/tests/test_mets_server.py @@ -55,7 +55,7 @@ def _start_mets_server(*args, **kwargs): p.terminate() rmtree(tmpdir, ignore_errors=True) -def add_file_server(x): +def add_file_server(x, force=False): mets_server_url, directory, i = x workspace_server = Workspace(Resolver(), directory, mets_server_url=mets_server_url) workspace_server.add_file( @@ -65,6 +65,7 @@ def add_file_server(x): page_id=f'page{i}', file_id=f'FOO_page{i}_foo{i}', # url=f'url{i}' + force=force ) def 
add_agent_server(x): @@ -123,6 +124,19 @@ def test_mets_server_add_file(start_mets_server): assert len(workspace_file.mets.find_all_files(fileGrp='FOO')) == NO_FILES +def test_mets_server_add_file_overwrite(start_mets_server): + mets_server_url, workspace_server = start_mets_server + + add_file_server((mets_server_url, workspace_server.directory, 5)) + + assert len(workspace_server.mets.find_all_files(fileGrp='FOO')) == 1 + + with raises(RuntimeError, match="already exists"): + add_file_server((mets_server_url, workspace_server.directory, 5)) + + add_file_server((mets_server_url, workspace_server.directory, 5), force=True) + assert len(workspace_server.mets.find_all_files(fileGrp='FOO')) == 1 + def test_mets_server_add_agents(start_mets_server): NO_AGENTS = 30 From 4492168ddabaf835b70c91602f905469c4ce6f3d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 00:59:51 +0200 Subject: [PATCH 30/46] PcGts.Page.id / make_xml_id: replace '/' with '_' --- src/ocrd_utils/str.py | 3 ++- tests/model/test_ocrd_page.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/ocrd_utils/str.py b/src/ocrd_utils/str.py index dea3715bf4..b3d3ef496f 100644 --- a/src/ocrd_utils/str.py +++ b/src/ocrd_utils/str.py @@ -105,10 +105,11 @@ def make_xml_id(idstr: str) -> str: ret = idstr if not REGEX_FILE_ID.fullmatch(ret): ret = ret.replace(':', '_') + ret = ret.replace('/', '_') ret = re.sub(r'^([^a-zA-Z_])', r'id_\1', ret) ret = re.sub(r'[^\w.-]', r'', ret) return ret - + def nth_url_segment(url, n=-1): """ Return the last /-delimited segment of a URL-like string diff --git a/tests/model/test_ocrd_page.py b/tests/model/test_ocrd_page.py index 7dc130809f..97335775d6 100644 --- a/tests/model/test_ocrd_page.py +++ b/tests/model/test_ocrd_page.py @@ -460,7 +460,7 @@ def test_id(): # TODO: is this *really* desired? 
# I would expect for a single Page-Element the ID is like from the top-level-Pgts-Container, not like a fileName - assert pcgts.get_Page().id == 'OCR-D-IMG/INPUT_0017.tif' + assert pcgts.get_Page().id == 'OCR-D-IMG_INPUT_0017.tif' if __name__ == '__main__': From 83d52d888a4d403c3ce35a7db50c90db83253f7e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 15 Sep 2024 16:32:55 +0200 Subject: [PATCH 31/46] METS Server: also export+delegate physical_pages --- src/ocrd/mets_server.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/src/ocrd/mets_server.py b/src/ocrd/mets_server.py index 8a18f01682..c85368e305 100644 --- a/src/ocrd/mets_server.py +++ b/src/ocrd/mets_server.py @@ -88,6 +88,14 @@ def create(file_groups: List[str]): return OcrdFileGroupListModel(file_groups=file_groups) +class OcrdPageListModel(BaseModel): + physical_pages: List[str] = Field() + + @staticmethod + def create(physical_pages: List[str]): + return OcrdPageListModel(physical_pages=physical_pages) + + class OcrdAgentListModel(BaseModel): agents: List[OcrdAgentModel] = Field() @@ -210,6 +218,17 @@ def workspace_path(self): ).json()["text"] return self.ws_dir_path + @property + def physical_pages(self) -> List[str]: + if not self.multiplexing_mode: + return self.session.request("GET", f"{self.url}/physical_pages").json()["physical_pages"] + else: + return self.session.request( + "POST", + self.url, + json=MpxReq.physical_pages(self.ws_dir_path) + ).json()["physical_pages"] + @property def file_groups(self): if not self.multiplexing_mode: @@ -349,6 +368,11 @@ def workspace_path(ws_dir_path: str) -> Dict: return MpxReq.__args_wrapper( ws_dir_path, method_type="GET", response_type="text", request_url="workspace_path", request_data={}) + @staticmethod + def physical_pages(ws_dir_path: str) -> Dict: + return MpxReq.__args_wrapper( + ws_dir_path, method_type="GET", response_type="dict", request_url="physical_pages", request_data={}) + @staticmethod def 
file_groups(ws_dir_path: str) -> Dict: return MpxReq.__args_wrapper( @@ -469,6 +493,10 @@ async def unique_identifier(): async def workspace_path(): return Response(content=workspace.directory, media_type="text/plain") + @app.get(path='/physical_pages', response_model=OcrdPageListModel) + async def physical_pages(): + return {'physical_pages': workspace.mets.physical_pages} + @app.get(path='/file_groups', response_model=OcrdFileGroupListModel) async def file_groups(): return {'file_groups': workspace.mets.file_groups} From 4eccefc43b39e26337d0542e633fda077097d079 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 00:57:08 +0200 Subject: [PATCH 32/46] ocrd.cli.workspace: consistently pass on --mets-server-url and --backup (also, simplify) --- src/ocrd/cli/workspace.py | 87 ++++++++++++++++++--------------------- 1 file changed, 39 insertions(+), 48 deletions(-) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 062a373608..6add3f839f 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -37,6 +37,17 @@ def __init__(self, directory, mets_url, mets_basename=DEFAULT_METS_BASENAME, met = self.resolver.resolve_mets_arguments(directory, mets_url, mets_basename, mets_server_url) self.automatic_backup = automatic_backup + def workspace(self): + return Workspace( + self.resolver, + directory=self.directory, + mets_basename=self.mets_basename, + automatic_backup=self.automatic_backup, + mets_server_url=self.mets_server_url, + ) + def backup_manager(self): + return WorkspaceBackupManager(self.workspace()) + pass_workspace = click.make_pass_decorator(WorkspaceCtx) @@ -139,6 +150,7 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir)) ctx.directory = workspace_dir + assert not ctx.mets_server_url workspace = ctx.resolver.workspace_from_url( mets_url, 
dst_dir=ctx.directory, @@ -174,10 +186,11 @@ def workspace_init(ctx, clobber_mets, directory): if directory: LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory)) ctx.directory = directory + assert not ctx.mets_server_url workspace = ctx.resolver.workspace_from_nothing( directory=ctx.directory, mets_basename=ctx.mets_basename, - clobber_mets=clobber_mets + clobber_mets=clobber_mets, ) workspace.save_mets() print(workspace.directory) @@ -201,13 +214,7 @@ def workspace_add_file(ctx, file_grp, file_id, mimetype, page_id, ignore, check_ Add a file or http(s) URL FNAME to METS in a workspace. If FNAME is not an http(s) URL and is not a workspace-local existing file, try to copy to workspace. """ - workspace = Workspace( - ctx.resolver, - directory=ctx.directory, - mets_basename=ctx.mets_basename, - automatic_backup=ctx.automatic_backup, - mets_server_url=ctx.mets_server_url, - ) + workspace = ctx.workspace() log = getLogger('ocrd.cli.workspace.add') if not mimetype: @@ -313,13 +320,7 @@ def workspace_cli_bulk_add(ctx, regex, mimetype, page_id, file_id, url, local_fi -G '{{ filegrp }}' -g '{{ pageid }}' -i '{{ fileid }}' -S '{{ local_filename }}' - """ log = getLogger('ocrd.cli.workspace.bulk-add') # pylint: disable=redefined-outer-name - workspace = Workspace( - ctx.resolver, - directory=ctx.directory, - mets_basename=ctx.mets_basename, - automatic_backup=ctx.automatic_backup, - mets_server_url=ctx.mets_server_url, - ) + workspace = ctx.workspace() try: pat = re.compile(regex) @@ -454,13 +455,8 @@ def workspace_find(ctx, file_grp, mimetype, page_id, file_id, output_field, incl snake_to_camel = {"file_id": "ID", "page_id": "pageId", "file_grp": "fileGrp"} output_field = [snake_to_camel.get(x, x) for x in output_field] modified_mets = False - ret = list() - workspace = Workspace( - ctx.resolver, - directory=ctx.directory, - mets_basename=ctx.mets_basename, - mets_server_url=ctx.mets_server_url, - ) 
+ ret = [] + workspace = ctx.workspace() with pushd_popd(workspace.directory): for f in workspace.find_files( file_id=file_id, @@ -510,7 +506,7 @@ def workspace_remove_file(ctx, id, force, keep_file): # pylint: disable=redefin (If any ``ID`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() for i in id: workspace.remove_file(i, force=force, keep_file=keep_file) workspace.save_mets() @@ -528,7 +524,7 @@ def rename_group(ctx, old, new): """ Rename fileGrp (USE attribute ``NEW`` to ``OLD``). """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) + workspace = ctx.workspace() workspace.rename_file_group(old, new) workspace.save_mets() @@ -549,7 +545,7 @@ def remove_group(ctx, group, recursive, force, keep_files): (If any ``GROUP`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) + workspace = ctx.workspace() for g in group: workspace.remove_file_group(g, recursive=recursive, force=force, keep_files=keep_files) workspace.save_mets() @@ -571,7 +567,7 @@ def prune_files(ctx, file_grp, mimetype, page_id, file_id): (If any ``FILTER`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() with pushd_popd(workspace.directory): for f in workspace.find_files( file_id=file_id, @@ -608,8 +604,7 @@ def clean(ctx, dry_run, directories, path_glob): If no PATH_GLOB are specified, then all files and directories may match. 
""" - log = getLogger('ocrd.cli.workspace.clean') - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() allowed_files = [normpath(f.local_filename) for f in workspace.find_files(local_only=True)] allowed_files.append(relpath(workspace.mets_target, start=workspace.directory)) allowed_dirs = set(dirname(path) for path in allowed_files) @@ -627,7 +622,7 @@ def clean(ctx, dry_run, directories, path_glob): if normpath(path) in allowed_files: continue if dry_run: - log.info('unlink(%s)' % path) + ctx.log.info('unlink(%s)' % path) else: unlink(path) if not directories: @@ -637,7 +632,7 @@ def clean(ctx, dry_run, directories, path_glob): if normpath(path) in allowed_dirs: continue if dry_run: - log.info('rmdir(%s)' % path) + ctx.log.info('rmdir(%s)' % path) else: rmdir(path) @@ -651,7 +646,7 @@ def list_groups(ctx): """ List fileGrp USE attributes """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) + workspace = ctx.workspace() print("\n".join(workspace.mets.file_groups)) # ---------------------------------------------------------------------- @@ -677,7 +672,7 @@ def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page (If any ``FILTER`` starts with ``//``, then its remainder will be interpreted as a regular expression.) 
""" - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) + workspace = ctx.workspace() find_kwargs = {} if page_id_range and 'ID' in output_field: find_kwargs['pageId'] = page_id_range @@ -724,7 +719,7 @@ def get_id(ctx): """ Get METS id if any """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename) + workspace = ctx.workspace() ID = workspace.mets.unique_identifier if ID: print(ID) @@ -744,7 +739,7 @@ def set_id(ctx, id): # pylint: disable=redefined-builtin Otherwise will create a new {{ ID }}. """ - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() workspace.mets.unique_identifier = id workspace.save_mets() @@ -767,7 +762,7 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id): if contentids: update_kwargs['CONTENTIDS'] = contentids try: - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() workspace.mets.update_physical_page_attributes(page_id, **update_kwargs) workspace.save_mets() except Exception as err: @@ -805,7 +800,7 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa mets_path = Path(mets_path) if filegrp_mapping: filegrp_mapping = loads(filegrp_mapping) - workspace = Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup) + workspace = ctx.workspace() other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name)) workspace.merge( other_workspace, @@ -829,11 +824,12 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa # ---------------------------------------------------------------------- @workspace_cli.group('backup') -@click.pass_context 
+@pass_workspace def workspace_backup_cli(ctx): # pylint: disable=unused-argument """ Backing and restoring workspaces - dev edition """ + assert not ctx.mets_server_url, "Workspace backups currently not interoperable with METS Server" @workspace_backup_cli.command('add') @pass_workspace @@ -841,7 +837,7 @@ def workspace_backup_add(ctx): """ Create a new backup """ - backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)) + backup_manager = ctx.backup_manager() backup_manager.add() @workspace_backup_cli.command('list') @@ -850,7 +846,7 @@ def workspace_backup_list(ctx): """ List backups """ - backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)) + backup_manager = ctx.backup_manager() for b in backup_manager.list(): print(b) @@ -862,7 +858,7 @@ def workspace_backup_restore(ctx, choose_first, bak): """ Restore backup BAK """ - backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)) + backup_manager = ctx.backup_manager() backup_manager.restore(bak, choose_first) @workspace_backup_cli.command('undo') @@ -871,7 +867,7 @@ def workspace_backup_undo(ctx): """ Restore the last backup """ - backup_manager = WorkspaceBackupManager(Workspace(ctx.resolver, directory=ctx.directory, mets_basename=ctx.mets_basename, automatic_backup=ctx.automatic_backup)) + backup_manager = ctx.backup_manager() backup_manager.undo() @@ -888,13 +884,8 @@ def workspace_serve_cli(ctx): # pylint: disable=unused-argument @workspace_serve_cli.command('stop') @pass_workspace def workspace_serve_stop(ctx): # pylint: disable=unused-argument - """Stop the METS server""" - workspace = Workspace( - ctx.resolver, - directory=ctx.directory, - mets_basename=ctx.mets_basename, - 
mets_server_url=ctx.mets_server_url, - ) + """Stop the METS server (saving changes to disk)""" + workspace = ctx.workspace() workspace.mets.stop() @workspace_serve_cli.command('start') From 083df27664f4a40eb2d2baddcbb6bf0fd214df5d Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 00:57:32 +0200 Subject: [PATCH 33/46] ocrd.cli.workspace server: add 'reload' and 'save' --- src/ocrd/cli/workspace.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 6add3f839f..ff4aeef7c5 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -888,6 +888,20 @@ def workspace_serve_stop(ctx): # pylint: disable=unused-argument workspace = ctx.workspace() workspace.mets.stop() +@workspace_serve_cli.command('reload') +@pass_workspace +def workspace_serve_reload(ctx): # pylint: disable=unused-argument + """Reload the METS server from disk""" + workspace = ctx.workspace() + workspace.mets.reload() + +@workspace_serve_cli.command('save') +@pass_workspace +def workspace_serve_save(ctx): # pylint: disable=unused-argument + """Save the METS changes to disk""" + workspace = ctx.workspace() + workspace.mets.save() + @workspace_serve_cli.command('start') @pass_workspace def workspace_serve_start(ctx): # pylint: disable=unused-argument From b2c01610bffd277ef7a3345427ff016280efc3a4 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:36:03 +0200 Subject: [PATCH 34/46] ocrd.cli.validate tasks: pass on --mets-server-url, too --- src/ocrd/cli/validate.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/ocrd/cli/validate.py b/src/ocrd/cli/validate.py index b26803d053..9d0cafd064 100644 --- a/src/ocrd/cli/validate.py +++ b/src/ocrd/cli/validate.py @@ -102,16 +102,19 @@ def validate_page(page, **kwargs): @validate_cli.command('tasks') @click.option('--workspace', nargs=1, required=False, help='Workspace directory these tasks are to be run. 
If omitted, only validate syntax') @click.option('-M', '--mets-basename', nargs=1, default=DEFAULT_METS_BASENAME, help='Basename of the METS file, used in conjunction with --workspace') +@click.option('-U', '--mets-server-url', help='TCP host URI or UDS path of METS server') @click.option('--overwrite', is_flag=True, default=False, help='When checking against a concrete workspace, simulate overwriting output or page range.') @click.option('-g', '--page-id', help="ID(s) of the pages to process") @click.argument('tasks', nargs=-1, required=True) -def validate_process(tasks, workspace, mets_basename, overwrite, page_id): +def validate_process(tasks, workspace, mets_basename, mets_server_url, overwrite, page_id): ''' Validate a sequence of tasks passable to 'ocrd process' ''' if workspace: - _inform_of_result(validate_tasks([ProcessorTask.parse(t) for t in tasks], - Workspace(Resolver(), directory=workspace, mets_basename=mets_basename), page_id=page_id, overwrite=overwrite)) + _inform_of_result(validate_tasks( + [ProcessorTask.parse(t) for t in tasks], + Workspace(Resolver(), directory=workspace, mets_basename=mets_basename, mets_server_url=mets_server_url), + page_id=page_id, overwrite=overwrite)) else: for t in [ProcessorTask.parse(t) for t in tasks]: _inform_of_result(t.validate()) From 203a06a2a36ac5a74a5ab73ba9c693902e89fc38 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:47:14 +0200 Subject: [PATCH 35/46] run_processor: be robust if ocrd_tool is missing steps --- src/ocrd/processor/helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd/processor/helpers.py b/src/ocrd/processor/helpers.py index 921cfeac80..fb5ca1bb0f 100644 --- a/src/ocrd/processor/helpers.py +++ b/src/ocrd/processor/helpers.py @@ -98,7 +98,7 @@ def run_processor( ocrd_tool = processor.ocrd_tool name = '%s v%s' % (ocrd_tool['executable'], processor.version) - otherrole = ocrd_tool['steps'][0] + otherrole = ocrd_tool.get('steps', [''])[0] 
logProfile = getLogger('ocrd.process.profile') log.debug("Processor instance %s (%s doing %s)", processor, name, otherrole) t0_wall = perf_counter() From 4fbdd00439b9121dd5f01dd6b4ba2d5f24c251ae Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 13 Sep 2024 00:38:11 +0200 Subject: [PATCH 36/46] lib.bash: fix errexit --- src/ocrd/lib.bash | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/ocrd/lib.bash b/src/ocrd/lib.bash index febaf92ae6..745bc52fe4 100644 --- a/src/ocrd/lib.bash +++ b/src/ocrd/lib.bash @@ -27,6 +27,7 @@ ocrd__log () { ## Ensure minimum version # ht https://stackoverflow.com/posts/4025065 ocrd__minversion () { + set -e local minversion="$1" local version=$(ocrd --version|sed 's/ocrd, version //') #echo "$minversion < $version?" @@ -108,6 +109,7 @@ ocrd__usage () { ## declare -A ocrd__argv=() ## ``` ocrd__parse_argv () { + set -e # if [[ -n "$ZSH_VERSION" ]];then # print -r -- ${+ocrd__argv} ${(t)ocrd__argv} @@ -250,6 +252,7 @@ $params_parsed" } ocrd__wrap () { + set -e declare -gx OCRD_TOOL_JSON="$1" declare -gx OCRD_TOOL_NAME="$2" From c86507951e85ab13412cb6264841272f809ba07e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 14 Sep 2024 01:03:43 +0200 Subject: [PATCH 37/46] tests: make sure ocrd_utils.config gets reset whenever changing it globally --- tests/processor/test_processor.py | 31 +++++++++++++++++++++++++++++-- tests/test_decorators.py | 6 +++++- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/tests/processor/test_processor.py b/tests/processor/test_processor.py index 3a47d2c23f..f2261d0ffb 100644 --- a/tests/processor/test_processor.py +++ b/tests/processor/test_processor.py @@ -6,8 +6,9 @@ from os import environ from tests.base import CapturingTestCase as TestCase, assets, main, copy_of_directory # pylint: disable=import-error, no-name-in-module from tests.data import DummyProcessor, DummyProcessorWithRequiredParameters, DummyProcessorWithOutput, IncompleteProcessor +from tests.test_mets_server 
import fixture_start_mets_server -from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging +from ocrd_utils import MIMETYPE_PAGE, pushd_popd, initLogging, disableLogging, config from ocrd.resolver import Resolver from ocrd.processor.base import Processor, run_processor, run_cli @@ -28,6 +29,10 @@ def setUp(self): self.workspace = self.resolver.workspace_from_url('mets.xml') self.addCleanup(stack.pop_all().close) + def tearDown(self): + super().tearDown() + config.reset_defaults() + def test_incomplete_processor(self): proc = IncompleteProcessor(None) with self.assertRaises(NotImplementedError): @@ -242,7 +247,29 @@ class ZipTestProcessor(Processor): pass proc = ZipTestProcessor(workspace=ws, input_file_grp='GRP1,GRP2', page_id=page_id) assert [(one, two.ID) for one, two in proc.zip_input_files(require_first=False)] == [(None, 'foobar2')] r = self.capture_out_err() - assert 'ERROR ocrd.processor.base - found no page phys_0001 in file group GRP1' in r.err + assert 'ERROR ocrd.processor.base - Found no page phys_0001 in file group GRP1' in r.err + +def test_run_output_metsserver(start_mets_server): + mets_server_url, ws = start_mets_server + run_processor(DummyProcessorWithOutput, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + mets_server_url=mets_server_url) + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + ws.overwrite_mode = True + run_processor(DummyProcessorWithOutput, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + mets_server_url=mets_server_url) + assert len(ws.mets.find_all_files(fileGrp="OCR-D-OUT")) == len(ws.mets.find_all_files(fileGrp="OCR-D-IMG")) + ws.overwrite_mode = False + with pytest.raises(Exception) as exc: + run_processor(DummyProcessorWithOutput, workspace=ws, + input_file_grp="OCR-D-IMG", + output_file_grp="OCR-D-OUT", + mets_server_url=mets_server_url) + assert "already exists" in str(exc.value) + if 
__name__ == "__main__": main(__file__) diff --git a/tests/test_decorators.py b/tests/test_decorators.py index df8d6422be..c36577020a 100644 --- a/tests/test_decorators.py +++ b/tests/test_decorators.py @@ -15,7 +15,7 @@ ocrd_loglevel, ocrd_cli_wrap_processor, ) # pylint: disable=protected-access -from ocrd_utils import pushd_popd, VERSION as OCRD_VERSION, disableLogging, initLogging, get_logging_config_files +from ocrd_utils import pushd_popd, VERSION as OCRD_VERSION, disableLogging, initLogging, get_logging_config_files, config @click.command() @ocrd_cli_options @@ -45,6 +45,10 @@ def setUp(self): super().setUp() disableLogging() + def tearDown(self): + super().tearDown() + config.reset_defaults() + def test_minimal(self): exit_code, out, err = self.invoke_cli(cli_with_ocrd_cli_options, ['-l', 'DEBUG']) print(out, err) From 1a13cd394fd7f8a0a12259f7aefc0c3e1b1c8acc Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 16 Sep 2024 16:55:41 +0200 Subject: [PATCH 38/46] ocrd.cli.workspace: assert non-server in cmds mutating METS --- src/ocrd/cli/workspace.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index ff4aeef7c5..415b8e6e2f 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -150,7 +150,8 @@ def workspace_clone(ctx, clobber_mets, download, file_grp, file_id, page_id, mim LOG.warning(DeprecationWarning("Use 'ocrd workspace --directory DIR clone' instead of argument 'WORKSPACE_DIR' ('%s')" % workspace_dir)) ctx.directory = workspace_dir - assert not ctx.mets_server_url + assert not ctx.mets_server_url, \ + f"clone cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.resolver.workspace_from_url( mets_url, dst_dir=ctx.directory, @@ -186,7 +187,8 @@ def workspace_init(ctx, clobber_mets, directory): if directory: LOG.warning(DeprecationWarning("Use 
'ocrd workspace --directory DIR init' instead of argument 'DIRECTORY' ('%s')" % directory)) ctx.directory = directory - assert not ctx.mets_server_url + assert not ctx.mets_server_url, \ + f"init cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.resolver.workspace_from_nothing( directory=ctx.directory, mets_basename=ctx.mets_basename, @@ -506,6 +508,8 @@ def workspace_remove_file(ctx, id, force, keep_file): # pylint: disable=redefin (If any ``ID`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ + assert not ctx.mets_server_url, \ + f"remove cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() for i in id: workspace.remove_file(i, force=force, keep_file=keep_file) @@ -524,6 +528,8 @@ def rename_group(ctx, old, new): """ Rename fileGrp (USE attribute ``NEW`` to ``OLD``). """ + assert not ctx.mets_server_url, \ + f"rename-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() workspace.rename_file_group(old, new) workspace.save_mets() @@ -545,6 +551,8 @@ def remove_group(ctx, group, recursive, force, keep_files): (If any ``GROUP`` starts with ``//``, then its remainder will be interpreted as a regular expression.) """ + assert not ctx.mets_server_url, \ + f"remove-group cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() for g in group: workspace.remove_file_group(g, recursive=recursive, force=force, keep_files=keep_files) @@ -567,6 +575,8 @@ def prune_files(ctx, file_grp, mimetype, page_id, file_id): (If any ``FILTER`` starts with ``//``, then its remainder will be interpreted as a regular expression.) 
""" + assert not ctx.mets_server_url, \ + f"prune-files cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() with pushd_popd(workspace.directory): for f in workspace.find_files( @@ -762,6 +772,8 @@ def update_page(ctx, attr_value_pairs, order, orderlabel, contentids, page_id): if contentids: update_kwargs['CONTENTIDS'] = contentids try: + assert not ctx.mets_server_url, \ + f"update-page cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() workspace.mets.update_physical_page_attributes(page_id, **update_kwargs) workspace.save_mets() @@ -800,6 +812,8 @@ def merge(ctx, overwrite, force, copy_files, filegrp_mapping, fileid_mapping, pa mets_path = Path(mets_path) if filegrp_mapping: filegrp_mapping = loads(filegrp_mapping) + assert not ctx.mets_server_url, \ + f"merge cannot be performed with METS Server - stop server, rerun without -U {ctx.mets_server_url}" workspace = ctx.workspace() other_workspace = Workspace(ctx.resolver, directory=str(mets_path.parent), mets_basename=str(mets_path.name)) workspace.merge( From bba597e1d5d4fe72044fb1024de548906cd599d8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 7 Sep 2024 14:25:37 +0200 Subject: [PATCH 39/46] OcrdPage: add PageType.get_ReadingOrderGroups() --- src/ocrd_page_user_methods.py | 1 + .../get_ReadingOrderGroups.py | 33 +++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 src/ocrd_page_user_methods/get_ReadingOrderGroups.py diff --git a/src/ocrd_page_user_methods.py b/src/ocrd_page_user_methods.py index 8a2332e6e5..fe22dd89ab 100644 --- a/src/ocrd_page_user_methods.py +++ b/src/ocrd_page_user_methods.py @@ -116,6 +116,7 @@ def _add_method(class_re, method_name, file_name=None): _add_method(r'^(PageType)$', 'set_Border'), _add_method(r'^(CoordsType)$', 'set_points'), _add_method(r'^(PageType)$', 'get_AllTextLines'), + _add_method(r'^(PageType)$', 
'get_ReadingOrderGroups'), # for some reason, pagecontent.xsd does not declare @orientation at the abstract/base RegionType: _add_method(r'^(PageType|AdvertRegionType|MusicRegionType|MapRegionType|ChemRegionType|MathsRegionType|SeparatorRegionType|ChartRegionType|TableRegionType|GraphicRegionType|LineDrawingRegionType|ImageRegionType|TextRegionType)$', 'set_orientation'), ) diff --git a/src/ocrd_page_user_methods/get_ReadingOrderGroups.py b/src/ocrd_page_user_methods/get_ReadingOrderGroups.py new file mode 100644 index 0000000000..e7d6c02b77 --- /dev/null +++ b/src/ocrd_page_user_methods/get_ReadingOrderGroups.py @@ -0,0 +1,33 @@ +def get_ReadingOrderGroups(self) -> dict: + """ + Aggregate recursive ReadingOrder into a dictionary, mapping each regionRef + (i.e. segment `@id`) to its referring group object (i.e one of + + \b + - :py:class:`.RegionRefType` + - :py:class:`.RegionRefIndexedType` + - :py:class:`.OrderedGroupType` + - :py:class:`.OrderedGroupIndexedType` + - :py:class:`.UnoderedGroupType` + - :py:class:`.UnoderedGroupIndexedType` + """ + def get_groupdict(group): + regionrefs = list() + if isinstance(group, (OrderedGroupType, OrderedGroupIndexedType)): + regionrefs = (group.get_RegionRefIndexed() + + group.get_OrderedGroupIndexed() + + group.get_UnorderedGroupIndexed()) + if isinstance(group, (UnorderedGroupType, UnorderedGroupIndexedType)): + regionrefs = (group.get_RegionRef() + + group.get_OrderedGroup() + + group.get_UnorderedGroup()) + refdict = {} + for elem in regionrefs: + refdict[elem.get_regionRef()] = elem + if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): + refdict = {**refdict, **get_groupdict(elem)} + return refdict + ro = self.get_ReadingOrder() + if ro is None: + return {} + return get_groupdict(ro.get_OrderedGroup() or ro.get_UnorderedGroup()) From fa0fadaa536c0daed62abb136dad9a0af15d2e5c Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sat, 7 Sep 2024 14:25:58 +0200 Subject: [PATCH 40/46] update OcrdPage from 
generateds --- src/ocrd_models/ocrd_page_generateds.py | 55 ++++++++++++++++++++----- 1 file changed, 44 insertions(+), 11 deletions(-) diff --git a/src/ocrd_models/ocrd_page_generateds.py b/src/ocrd_models/ocrd_page_generateds.py index 6fef4c8635..f2b7c0551e 100644 --- a/src/ocrd_models/ocrd_page_generateds.py +++ b/src/ocrd_models/ocrd_page_generateds.py @@ -2,30 +2,28 @@ # -*- coding: utf-8 -*- # -# Generated Wed Nov 3 12:30:32 2021 by generateDS.py version 2.35.20. -# Python 3.6.9 (default, Jan 26 2021, 15:33:00) [GCC 8.4.0] +# Generated Sat Sep 7 14:17:39 2024 by generateDS.py version 2.35.20. +# Python 3.8.17+ (heads/3.8-dirty:1663f8ba84, Aug 15 2023, 18:13:01) [GCC 8.3.0] # # Command line options: # ('-f', '') # ('--root-element', 'PcGts') -# ('-o', 'ocrd_models/ocrd_models/ocrd_page_generateds.py') +# ('-o', 'src/ocrd_models/ocrd_page_generateds.py') # ('--silence', '') # ('--export', 'write etree') # ('--disable-generatedssuper-lookup', '') -# ('--user-methods', 'ocrd_models/ocrd_page_user_methods.py') +# ('--user-methods', 'src/ocrd_page_user_methods.py') # # Command line arguments: -# ocrd_validators/ocrd_validators/page.xsd +# src/ocrd_validators/page.xsd # # Command line: -# /home/kba/monorepo/ocrd_all/venv/bin/generateDS -f --root-element="PcGts" -o "ocrd_models/ocrd_models/ocrd_page_generateds.py" --silence --export="write etree" --disable-generatedssuper-lookup --user-methods="ocrd_models/ocrd_page_user_methods.py" ocrd_validators/ocrd_validators/page.xsd +# /data/ocr-d/ocrd_all/venv38/bin/generateDS -f --root-element="PcGts" -o "src/ocrd_models/ocrd_page_generateds.py" --silence --export="write etree" --disable-generatedssuper-lookup --user-methods="src/ocrd_page_user_methods.py" src/ocrd_validators/page.xsd # # Current working directory (os.getcwd()): # core # -# type: ignore - from itertools import zip_longest import os import sys @@ -223,7 +221,7 @@ def gds_validate_integer_list( try: int(value) except (TypeError, ValueError): - 
raise_parse_error(node, 'Requires sequence of integer values') + raise_parse_error(node, 'Requires sequence of integer valuess') return values def gds_format_float(self, input_data, input_name=''): return ('%.15f' % input_data).rstrip('0') @@ -1230,9 +1228,10 @@ def __hash__(self): return hash(self.id) @property def id(self): + from ocrd_utils import make_xml_id if hasattr(self, 'pcGtsId'): return self.pcGtsId or '' - return self.imageFilename + return make_xml_id(self.imageFilename) def get_AllAlternativeImagePaths(self, page=True, region=True, line=True, word=True, glyph=True): """ Get all the ``pc:AlternativeImage/@filename`` paths referenced in the PAGE-XML document. @@ -3116,9 +3115,10 @@ def __hash__(self): return hash(self.id) @property def id(self): + from ocrd_utils import make_xml_id if hasattr(self, 'pcGtsId'): return self.pcGtsId or '' - return self.imageFilename + return make_xml_id(self.imageFilename) # pylint: disable=line-too-long,invalid-name,protected-access,missing-module-docstring def _region_class(self, x): # pylint: disable=unused-argument return x.__class__.__name__.replace('RegionType', '') @@ -3314,6 +3314,39 @@ def get_AllTextLines(self, region_order='document', respect_textline_order=True) ret += lines if lo in ['top-to-bottom', 'left-to-right'] else list(reversed(lines)) return ret + def get_ReadingOrderGroups(self) -> dict: + """ + Aggregate recursive ReadingOrder into a dictionary, mapping each regionRef + (i.e. 
segment `@id`) to its referring group object (i.e. one of + + \b + - :py:class:`.RegionRefType` + - :py:class:`.RegionRefIndexedType` + - :py:class:`.OrderedGroupType` + - :py:class:`.OrderedGroupIndexedType` + - :py:class:`.UnorderedGroupType` + - :py:class:`.UnorderedGroupIndexedType` + """ + def get_groupdict(group): + regionrefs = list() + if isinstance(group, (OrderedGroupType, OrderedGroupIndexedType)): + regionrefs = (group.get_RegionRefIndexed() + + group.get_OrderedGroupIndexed() + + group.get_UnorderedGroupIndexed()) + if isinstance(group, (UnorderedGroupType, UnorderedGroupIndexedType)): + regionrefs = (group.get_RegionRef() + + group.get_OrderedGroup() + + group.get_UnorderedGroup()) + refdict = {} + for elem in regionrefs: + refdict[elem.get_regionRef()] = elem + if not isinstance(elem, (RegionRefType, RegionRefIndexedType)): + refdict = {**refdict, **get_groupdict(elem)} + return refdict + ro = self.get_ReadingOrder() + if ro is None: + return {} + return get_groupdict(ro.get_OrderedGroup() or ro.get_UnorderedGroup()) def set_orientation(self, orientation): """ Set deskewing angle to given `orientation` number. 
From 9641d4abc5436fb2925bc288790984cd0239f80b Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Sep 2024 04:35:07 +0200 Subject: [PATCH 41/46] OcrdMets.get_physical_pages: cover return_divs w/o for_fileIds for_pageIds --- src/ocrd_models/ocrd_mets.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/ocrd_models/ocrd_mets.py b/src/ocrd_models/ocrd_mets.py index 66251a54dc..9eedf9fa34 100644 --- a/src/ocrd_models/ocrd_mets.py +++ b/src/ocrd_models/ocrd_mets.py @@ -598,7 +598,16 @@ def get_physical_pages(self, for_fileIds : Optional[List[str]] = None, for_pageI If return_divs is set, returns div memory objects instead of strings of ids """ if for_fileIds is None and for_pageIds is None: + if return_divs: + if self._cache_flag: + return list(self._page_cache[METS_PAGE_DIV_ATTRIBUTE.ID].values()) + + return [x for x in self._tree.getroot().xpath( + 'mets:structMap[@TYPE="PHYSICAL"]/mets:div[@TYPE="physSequence"]/mets:div[@TYPE="page"]', + namespaces=NS)] + return self.physical_pages + # log = getLogger('ocrd.models.ocrd_mets.get_physical_pages') if for_pageIds is not None: ret = [] From 19ce7d992f567129af74f858e9f0f1ccd8482fce Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Fri, 27 Sep 2024 04:37:03 +0200 Subject: [PATCH 42/46] ocrd.cli.workspace: use physical_pages if possible, fix default output_field --- src/ocrd/cli/workspace.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/ocrd/cli/workspace.py b/src/ocrd/cli/workspace.py index 415b8e6e2f..f66a1e3360 100644 --- a/src/ocrd/cli/workspace.py +++ b/src/ocrd/cli/workspace.py @@ -683,19 +683,15 @@ def list_pages(ctx, output_field, output_format, chunk_number, chunk_index, page will be interpreted as a regular expression.) 
""" workspace = ctx.workspace() - find_kwargs = {} - if page_id_range and 'ID' in output_field: - find_kwargs['pageId'] = page_id_range - page_ids = sorted({x.pageId for x in workspace.mets.find_files(**find_kwargs) if x.pageId}) ret = [] - - if output_field == ['ID']: - ret = [[x] for x in page_ids] - else: - for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=','.join(page_ids), return_divs=True)): + if page_id_range or list(output_field) != ['ID']: + for i, page_div in enumerate(workspace.mets.get_physical_pages(for_pageIds=page_id_range, return_divs=True)): ret.append([]) for k in output_field: ret[i].append(page_div.get(k, 'None')) + else: + for page_id in workspace.mets.physical_pages: + ret.append([page_id]) if numeric_range: start, end = map(int, numeric_range.split('..')) From 606915ba9e796b7e5642ac8f6cdf86ac8bcccbf3 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 30 Sep 2024 16:02:56 +0200 Subject: [PATCH 43/46] disableLogging: clearer comment Co-authored-by: Konstantin Baierer --- src/ocrd_utils/logging.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrd_utils/logging.py b/src/ocrd_utils/logging.py index 8f45f9c7fc..ac2b3416a4 100644 --- a/src/ocrd_utils/logging.py +++ b/src/ocrd_utils/logging.py @@ -211,7 +211,7 @@ def disableLogging(silent=not config.OCRD_LOGGING_DEBUG): _initialized_flag = False # logging.basicConfig(level=logging.CRITICAL) # logging.disable(logging.ERROR) - # remove all handlers for the ocrd logger + # remove all handlers for the 'ocrd.' 
and root logger for logger_name in ROOT_OCRD_LOGGERS + ['']: for handler in logging.getLogger(logger_name).handlers[:]: logging.getLogger(logger_name).removeHandler(handler) From 3b908a678f524b37d406022bb05b76515d8303f6 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 30 Sep 2024 17:02:44 +0200 Subject: [PATCH 44/46] :memo: changelog --- CHANGELOG.md | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 351f5a56aa..0d759cb03d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,36 @@ Versioned according to [Semantic Versioning](http://semver.org/). ## Unreleased +## [2.69.0] - 2024-09-30 + +Fixed: + - tests: ensure `ocrd_utils.config` gets reset whenever changing it globally + - `OcrdMetsServer.add_file`: pass on `force` kwarg + - `ocrd.cli.workspace`: consistently pass on `--mets-server-url` and `--backup` + - `ocrd.cli.validate "tasks"`: pass on `--mets-server-url` + - `ocrd.cli.bashlib "input-files"`: pass on `--mets-server-url` + - `lib.bash input-files`: pass on `--mets-server-url`, `--overwrite`, and parameters + - `lib.bash`: fix `errexit` handling + - `ocrd.cli.ocrd-tool "resolve-resource"`: forgot to actually print result + - `Workspace.reload_mets`: handle ClientSideOcrdMets as well + - `disableLogging`: also re-instate root logger to Python defaults + - actually apply CLI `--log-filename`, and show in `--help` + - adapt to Pillow changes + - `ocrd workspace clone`: do pass on `--file-grp` (for download filtering) + - :fire: `OcrdMets.add_agent` without positional arguments + +Changed: + - lib.bash `input-files`: do not try to validate tasks here (now covered by `Processor.verify()`) + - `run_processor`: be robust if `ocrd_tool` is missing `steps` + - `PcGtsType.PageType.id` via `make_xml_id`: replace `/` with `_` + +Added: + - `OcrdPage`: new `PageType.get_ReadingOrderGroups()` to retrieve recursive RO as dict + - ocrd.cli.workspace `server`: add subcommands `reload` and `save` + - METS 
Server: export and delegate `physical_pages` + - processor CLI: delegate `--resolve-resource`, too + * `OcrdConfig.reset_defaults` to reset config variables to their defaults + ## [2.68.0] - 2024-08-23 Changed: @@ -2164,6 +2194,7 @@ Fixed Initial Release +[2.69.0]: ../../compare/v2.69.0..v2.68.0 [2.68.0]: ../../compare/v2.68.0..v2.67.2 [2.67.2]: ../../compare/v2.67.2..v2.67.1 [2.67.1]: ../../compare/v2.67.1..v2.67.0 From 343a66afcb259d0cafaffdff3e050547f9f8d314 Mon Sep 17 00:00:00 2001 From: kba Date: Mon, 30 Sep 2024 17:16:54 +0200 Subject: [PATCH 45/46] :memo: changelog: remove spurious entries --- CHANGELOG.md | 3 --- 1 file changed, 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d759cb03d..88f6b6cadc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,8 +12,6 @@ Fixed: - `OcrdMetsServer.add_file`: pass on `force` kwarg - `ocrd.cli.workspace`: consistently pass on `--mets-server-url` and `--backup` - `ocrd.cli.validate "tasks"`: pass on `--mets-server-url` - - `ocrd.cli.bashlib "input-files"`: pass on `--mets-server-url` - - `lib.bash input-files`: pass on `--mets-server-url`, `--overwrite`, and parameters - `lib.bash`: fix `errexit` handling - `ocrd.cli.ocrd-tool "resolve-resource"`: forgot to actually print result - `Workspace.reload_mets`: handle ClientSideOcrdMets as well @@ -24,7 +22,6 @@ Fixed: - :fire: `OcrdMets.add_agent` without positional arguments Changed: - - lib.bash `input-files`: do not try to validate tasks here (now covered by `Processor.verify()`) - `run_processor`: be robust if `ocrd_tool` is missing `steps` - `PcGtsType.PageType.id` via `make_xml_id`: replace `/` with `_` From f808b726227d5502426b29dd7ab3a97af83a75e8 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 30 Sep 2024 17:46:34 +0200 Subject: [PATCH 46/46] :memo: update changelog again --- CHANGELOG.md | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md 
index 88f6b6cadc..d058ebce96 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,28 +9,31 @@ Versioned according to [Semantic Versioning](http://semver.org/). Fixed: - tests: ensure `ocrd_utils.config` gets reset whenever changing it globally - - `OcrdMetsServer.add_file`: pass on `force` kwarg - `ocrd.cli.workspace`: consistently pass on `--mets-server-url` and `--backup` + - `ocrd.cli.workspace`: make `list-page` work w/ METS Server - `ocrd.cli.validate "tasks"`: pass on `--mets-server-url` - `lib.bash`: fix `errexit` handling - - `ocrd.cli.ocrd-tool "resolve-resource"`: forgot to actually print result - - `Workspace.reload_mets`: handle ClientSideOcrdMets as well - - `disableLogging`: also re-instate root logger to Python defaults - actually apply CLI `--log-filename`, and show in `--help` - adapt to Pillow changes - `ocrd workspace clone`: do pass on `--file-grp` (for download filtering) - - :fire: `OcrdMets.add_agent` without positional arguments + - `OcrdMetsServer.add_file`: pass on `force` kwarg + - `Workspace.reload_mets`: handle ClientSideOcrdMets as well + - `OcrdMets.get_physical_pages`: cover `return_divs` w/o `for_fileIds` and `for_pageIds` + - `disableLogging`: also re-instate root logger to Python defaults Changed: - `run_processor`: be robust if `ocrd_tool` is missing `steps` - `PcGtsType.PageType.id` via `make_xml_id`: replace `/` with `_` + - `ClientSideOcrdMets`: use same logger name prefix as METS Server + - `Processor.zip_input_files`: when `--page-id` yields empty list, just log instead of raise Added: - `OcrdPage`: new `PageType.get_ReadingOrderGroups()` to retrieve recursive RO as dict - - ocrd.cli.workspace `server`: add subcommands `reload` and `save` - METS Server: export and delegate `physical_pages` + - ocrd.cli.workspace `server`: add subcommands `reload` and `save` - processor CLI: delegate `--resolve-resource`, too - * `OcrdConfig.reset_defaults` to reset config variables to their defaults + - `OcrdConfig.reset_defaults` to reset 
config variables to their defaults + - `ocrd_utils.scale_coordinates` for resizing images ## [2.68.0] - 2024-08-23