diff --git a/test/test_segment_line.py b/test/test_segment_line.py index 99602f2..f63bc6a 100644 --- a/test/test_segment_line.py +++ b/test/test_segment_line.py @@ -1,6 +1,8 @@ from ocrd_tesserocr import TesserocrSegmentRegion from ocrd_tesserocr import TesserocrSegmentLine from ocrd_tesserocr import TesserocrSegment +from ocrd_modelfactory import page_from_file +from ocrd_utils import MIMETYPE_PAGE def test_run_modular(workspace_herold_small): TesserocrSegmentRegion( @@ -14,6 +16,13 @@ def test_run_modular(workspace_herold_small): input_file_grp="OCR-D-SEG-BLOCK", output_file_grp="OCR-D-SEG-LINE" ).process() + out_files = list(workspace_herold_small.find_files( + fileGrp="OCR-D-SEG-LINE", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE)) + assert len(out_files) + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_lines = out_pcgts.get_Page().get_AllTextLines() + assert len(out_lines) workspace_herold_small.save_mets() def test_run_allinone(workspace_herold_small): @@ -22,4 +31,11 @@ def test_run_allinone(workspace_herold_small): input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-SEG" ).process() + out_files = list(workspace_herold_small.find_files( + fileGrp="OCR-D-SEG", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE)) + assert len(out_files) + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_lines = out_pcgts.get_Page().get_AllTextLines() + assert len(out_lines) workspace_herold_small.save_mets() diff --git a/test/test_segment_region.py b/test/test_segment_region.py index c250ebd..50f7ed4 100644 --- a/test/test_segment_region.py +++ b/test/test_segment_region.py @@ -1,4 +1,6 @@ from ocrd_tesserocr import TesserocrSegmentRegion +from ocrd_modelfactory import page_from_file +from ocrd_utils import MIMETYPE_PAGE def test_run(workspace_herold_small): TesserocrSegmentRegion( @@ -6,6 +8,13 @@ def test_run(workspace_herold_small): input_file_grp="OCR-D-IMG", output_file_grp="OCR-D-SEG-BLOCK" ).process() + out_files = list(workspace_herold_small.find_files( + fileGrp="OCR-D-SEG-BLOCK", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE)) + assert len(out_files) + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_blocks = out_pcgts.get_Page().get_AllRegions(classes=['Text']) + assert len(out_blocks) workspace_herold_small.save_mets() def test_run_shrink(workspace_herold_small): @@ -15,6 +24,13 @@ def test_run_shrink(workspace_herold_small): output_file_grp="OCR-D-SEG-BLOCK", parameter={'shrink_polygons': True} ).process() + out_files = list(workspace_herold_small.find_files( + fileGrp="OCR-D-SEG-BLOCK", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE)) + assert len(out_files) + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_blocks = out_pcgts.get_Page().get_AllRegions(classes=['Text']) + assert len(out_blocks) workspace_herold_small.save_mets() def test_run_sparse(workspace_herold_small): @@ -24,6 +40,13 @@ def test_run_sparse(workspace_herold_small): output_file_grp="OCR-D-SEG-BLOCK", parameter={'sparse_text': True} ).process() + out_files = list(workspace_herold_small.find_files( + fileGrp="OCR-D-SEG-BLOCK", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE)) + assert len(out_files) + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_blocks = out_pcgts.get_Page().get_AllRegions(classes=['Text']) + assert len(out_blocks) workspace_herold_small.save_mets() def test_run_staves(workspace_herold_small): @@ -33,4 +56,11 @@ def test_run_staves(workspace_herold_small): output_file_grp="OCR-D-SEG-BLOCK", parameter={'find_staves': True, 'find_tables': False} ).process() + out_files = list(workspace_herold_small.find_files( + fileGrp="OCR-D-SEG-BLOCK", pageId="PHYS_0001", mimetype=MIMETYPE_PAGE)) + assert len(out_files) + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_blocks = out_pcgts.get_Page().get_AllRegions(classes=['Text']) + assert len(out_blocks) workspace_herold_small.save_mets() diff --git a/test/test_segment_word.py b/test/test_segment_word.py index 86fcc6b..86fc28d 100644 --- a/test/test_segment_word.py +++ b/test/test_segment_word.py @@ -1,6 +1,8 @@ from ocrd_tesserocr import TesserocrSegmentRegion from ocrd_tesserocr import TesserocrSegmentLine from ocrd_tesserocr import TesserocrSegmentWord +from ocrd_modelfactory import page_from_file +from ocrd_utils import MIMETYPE_PAGE def test_run_modular(workspace_kant_binarized): TesserocrSegmentRegion( @@ -18,4 +20,12 @@ def test_run_modular(workspace_kant_binarized): input_file_grp="OCR-D-SEG-LINE", output_file_grp="OCR-D-SEG-WORD" ).process() + out_files = list(workspace_kant_binarized.find_files( + fileGrp="OCR-D-SEG-WORD", pageId="P_0017", mimetype=MIMETYPE_PAGE)) + assert len(out_files) + out_pcgts = page_from_file(out_files[0]) + assert out_pcgts is not None + out_lines = out_pcgts.get_Page().get_AllTextLines() + assert len(out_lines) + assert all(len(line.get_Word()) for line in out_lines) workspace_kant_binarized.save_mets()