From 3f6cd52a2740f7a87e76ce3495ed2736fef2de40 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 31 Jan 2022 16:00:03 +0100 Subject: [PATCH 1/3] fix Margin calculations --- ocrd_page_to_alto/convert.py | 93 +++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 44 deletions(-) diff --git a/ocrd_page_to_alto/convert.py b/ocrd_page_to_alto/convert.py index bc71163..6e9b34c 100644 --- a/ocrd_page_to_alto/convert.py +++ b/ocrd_page_to_alto/convert.py @@ -10,7 +10,7 @@ parseString, to_xml) from ocrd_models.constants import NAMESPACES as NAMESPACES_ -from ocrd_utils import getLogger, xywh_from_points +from ocrd_utils import getLogger, bbox_from_points from .utils import ( set_alto_id_from_page_id, @@ -183,55 +183,60 @@ def convert_border(self): page_height = self.page_page.imageHeight setxml(self.alto_page, 'WIDTH', page_width) setxml(self.alto_page, 'HEIGHT', page_height) - page_printspace = self.page_page.get_PrintSpace() - dummy_printspace = False - if page_printspace is None: - self.logger.warning("PAGE-XML has no PrintSpace, trying to fall back to Border") - page_printspace = self.page_page.get_Border() - if page_printspace is None: - dummy_printspace = True + page_border = self.page_page.get_Border() + page_pspace = self.page_page.get_PrintSpace() + if page_pspace is None and not page_border is None: + self.logger.warning("PAGE-XML has Border but no PrintSpace - Margins will be empty") + page_pspace = page_border + elif page_border is None and not page_pspace is None: + self.logger.warning("PAGE-XML has PrintSpace but no Border - Margins will be empty") + page_border = page_pspace + elif page_border is None and page_pspace is None: + self.logger.warning("PAGE-XML has neither Border nor PrintSpace - PrintSpace will fill the image") + alto_pspace = ET.SubElement(self.alto_page, 'PrintSpace') + setxml(alto_pspace, 'VPOS', 0) + setxml(alto_pspace, 'HPOS', 0) + setxml(alto_pspace, 'HEIGHT', page_height) + setxml(alto_pspace, 'WIDTH', page_width) + return alto_pspace - if dummy_printspace: - self.logger.warning("PAGE-XML has neither Border nor PrintSpace") - for pos in ('Top', 'Left', 'Right', 'Bottom'): - margin = ET.SubElement(self.alto_page, '%sMargin' % pos) - for att in ('VPOS', 'HPOS', 'HEIGHT', 'WIDTH'): - setxml(margin, att, 0) - else: - xywh = xywh_from_points(page_printspace.get_Coords().points) + alto_pspace = ET.SubElement(self.alto_page, 'PrintSpace') + set_alto_xywh_from_coords(alto_pspace, page_pspace) + set_alto_shape_from_coords(alto_pspace, page_pspace) + + if not page_border is page_pspace: + bmin_x, bmin_y, bmax_x, bmax_y = bbox_from_points(page_border.get_Coords().points) + pmin_x, pmin_y, pmax_x, pmax_y = bbox_from_points(page_pspace.get_Coords().points) + # + # ╔═══════╗ ╔═══════╗ ╔╗ ╔══╗ + # ║┌───┐ ║ ╚═══════╝ ║║ ║ ║ ┌───┐ + # ║│ │ ║ → + ║║ ║ ║ (margins) + │ │ (pspace) + # ║└───┘ ║ ╔═══════╗ ║║ ║ ║ └───┘ + # ║ ║ ║ ║ ║║ ║ ║ + # ╚═══════╝ ╚═══════╝ ╚╝ ╚══╝ + # alto_topmargin = ET.SubElement(self.alto_page, 'TopMargin') - setxml(alto_topmargin, 'VPOS', 0) - setxml(alto_topmargin, 'HPOS', 0) - setxml(alto_topmargin, 'HEIGHT', xywh['x']) - setxml(alto_topmargin, 'WIDTH', page_width) + setxml(alto_topmargin, 'VPOS', bmin_y) + setxml(alto_topmargin, 'HPOS', bmin_x) + setxml(alto_topmargin, 'HEIGHT', pmin_y - bmin_y) + setxml(alto_topmargin, 'WIDTH', bmax_x - bmin_x) alto_leftmargin = ET.SubElement(self.alto_page, 'LeftMargin') - setxml(alto_leftmargin, 'VPOS', 0) - setxml(alto_leftmargin, 'HPOS', 0) - setxml(alto_leftmargin, 'HEIGHT', page_height) - setxml(alto_leftmargin, 'WIDTH', xywh['x']) + setxml(alto_leftmargin, 'VPOS', bmin_y) + setxml(alto_leftmargin, 'HPOS', bmin_x) + setxml(alto_leftmargin, 'HEIGHT', bmax_y - bmin_y) + setxml(alto_leftmargin, 'WIDTH', pmin_x - bmin_x) alto_rightmargin = ET.SubElement(self.alto_page, 'RightMargin') - setxml(alto_rightmargin, 'VPOS', 0) - setxml(alto_rightmargin, 'HPOS', xywh['x'] + xywh['w']) - setxml(alto_rightmargin, 'HEIGHT', page_height) - setxml(alto_rightmargin, 'WIDTH', page_width - (xywh['x'] + xywh['w'])) + setxml(alto_rightmargin, 'VPOS', bmin_y) + setxml(alto_rightmargin, 'HPOS', pmax_x) + setxml(alto_rightmargin, 'HEIGHT', bmax_y - bmin_y) + setxml(alto_rightmargin, 'WIDTH', bmax_x - pmax_x) alto_bottommargin = ET.SubElement(self.alto_page, 'BottomMargin') - setxml(alto_bottommargin, 'VPOS', xywh['y'] + xywh['h']) - setxml(alto_bottommargin, 'HPOS', 0) - setxml(alto_bottommargin, 'HEIGHT', page_height - (xywh['y'] + xywh['h'])) - setxml(alto_bottommargin, 'WIDTH', page_width) - alto_printspace = ET.SubElement(self.alto_page, 'PrintSpace') - set_alto_xywh_from_coords(alto_printspace, page_printspace) - if version.parse(self.alto_version) >= version.parse('3.1'): - set_alto_shape_from_coords(alto_printspace, page_printspace) - - if dummy_printspace: - alto_printspace = ET.SubElement(self.alto_page, 'PrintSpace') - setxml(alto_printspace, 'VPOS', 0) - setxml(alto_printspace, 'HPOS', 0) - setxml(alto_printspace, 'HEIGHT', page_height) - setxml(alto_printspace, 'WIDTH', page_width) + setxml(alto_bottommargin, 'VPOS', pmax_y) + setxml(alto_bottommargin, 'HPOS', bmin_x) + setxml(alto_bottommargin, 'HEIGHT', bmax_y - pmax_y) + setxml(alto_bottommargin, 'WIDTH', bmax_x - bmin_x) - return alto_printspace + return alto_pspace def convert_metadata(self): alto_measurementunit = ET.SubElement(self.alto_description, 'MeasurementUnit') From ba7d0fcd4ea77507f0fafd219b6567a555acd6c5 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Mon, 31 Jan 2022 16:54:56 +0100 Subject: [PATCH 2/3] convert_text: assign regions to PrintSpace or Margins by bbox --- ocrd_page_to_alto/convert.py | 19 ++++++++++++++++++- ocrd_page_to_alto/utils.py | 15 +++++++++++++++ 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/ocrd_page_to_alto/convert.py b/ocrd_page_to_alto/convert.py index 6e9b34c..990777a 100644 --- a/ocrd_page_to_alto/convert.py +++ b/ocrd_page_to_alto/convert.py @@ -18,6 +18,7 @@ set_alto_shape_from_coords, set_alto_xywh_from_coords, setxml, + contains, get_nth_textequiv) from .styles import TextStylesManager, ParagraphStyleManager, LayoutTagManager @@ -338,7 +339,23 @@ def convert_text(self): reg_alto_type = REGION_PAGE_TO_ALTO[reg_page_type] if not reg_alto_type: raise ValueError("Cannot handle PAGE-XML %sRegion" % reg_page_type) - reg_alto = ET.SubElement(self.alto_printspace, reg_alto_type) + # determine if the region belongs to PrintSpace or to any of the Margins + reg_bbox = bbox_from_points(reg_page.get_Coords().points) + if contains(self.alto_printspace, reg_bbox): + parent = self.alto_printspace + else: + parent = None + for margin in ['LeftMargin', 'RightMargin', 'TopMargin', 'BottomMargin']: + if not hasattr(self.alto_page, margin): + continue + margin = getattr(self.alto_page, margin) + if contains(margin, reg_bbox): + parent = margin + break # pick first match only + if not parent: + parent = self.alto_printspace + self.logger.warning("region '%s' not properly contained in PrintSpace or Margins", reg_page.id) + reg_alto = ET.SubElement(parent, reg_alto_type) set_alto_id_from_page_id(reg_alto, reg_page) set_alto_xywh_from_coords(reg_alto, reg_page) if version.parse(self.alto_version) >= version.parse('3.1'): diff --git a/ocrd_page_to_alto/utils.py b/ocrd_page_to_alto/utils.py index d099dcb..cb07d46 100644 --- a/ocrd_page_to_alto/utils.py +++ b/ocrd_page_to_alto/utils.py @@ -49,3 +49,18 @@ def get_nth_textequiv(reg_page, textequiv_index, textequiv_fallback_strategy): else: return textequivs[textequiv_index].Unicode +def contains(el, bbox): + minx1, miny1, maxx1, maxy1 = bbox + minx2 = int(el.get('HPOS')) + miny2 = int(el.get('VPOS')) + maxx2 = minx2 + int(el.get('WIDTH')) + maxy2 = miny2 + int(el.get('HEIGHT')) + if minx1 < minx2: + return False + if maxx1 > maxx2: + return False + if miny1 < miny2: + return False + if maxy1 > maxy2: + return False + return True From 6f6394cf21653042f743fb60c43fae9235a67f76 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky <38561704+bertsky@users.noreply.github.com> Date: Mon, 31 Jan 2022 21:54:30 +0100 Subject: [PATCH 3/3] =?UTF-8?q?not=20is=20=E2=86=92=20is=20not?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrd_page_to_alto/convert.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ocrd_page_to_alto/convert.py b/ocrd_page_to_alto/convert.py index 990777a..e5183fc 100644 --- a/ocrd_page_to_alto/convert.py +++ b/ocrd_page_to_alto/convert.py @@ -186,10 +186,10 @@ def convert_border(self): setxml(self.alto_page, 'HEIGHT', page_height) page_border = self.page_page.get_Border() page_pspace = self.page_page.get_PrintSpace() - if page_pspace is None and not page_border is None: + if page_pspace is None and page_border is not None: self.logger.warning("PAGE-XML has Border but no PrintSpace - Margins will be empty") page_pspace = page_border - elif page_border is None and not page_pspace is None: + elif page_border is None and page_pspace is not None: self.logger.warning("PAGE-XML has PrintSpace but no Border - Margins will be empty") page_border = page_pspace elif page_border is None and page_pspace is None: @@ -205,7 +205,7 @@ def convert_border(self): set_alto_xywh_from_coords(alto_pspace, page_pspace) set_alto_shape_from_coords(alto_pspace, page_pspace) - if not page_border is page_pspace: + if page_border is not page_pspace: bmin_x, bmin_y, bmax_x, bmax_y = bbox_from_points(page_border.get_Coords().points) pmin_x, pmin_y, pmax_x, pmax_y = bbox_from_points(page_pspace.get_Coords().points) #