Coherency (#112)

* Make segmentation stricter
* Add text wrap when exporting to Excel
* Add coherency checks on tables
* Update column detection
* Update column detection
* Update tests
* Update table creation
* Update polars requirements
xavctn · Sep 27, 2023 · fcbc3fa · fcbc3fa
1 parent 94934d7
commit fcbc3fa
Show file tree

Hide file tree

Showing 31 changed files with 667 additions and 518 deletions.
diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,4 @@
-polars[pandas]>=0.18.7
+polars[pandas]>=0.18.7,<0.19.4
 pyarrow>=7
 numpy
 pymupdf>=1.19.1

diff --git a/src/img2table/document/base/__init__.py b/src/img2table/document/base/__init__.py
@@ -102,7 +102,7 @@ def to_xlsx(self, dest: Union[str, Path, io.BytesIO], ocr: "OCRInstance" = None,
         workbook = xlsxwriter.Workbook(dest, {'in_memory': True})
 
         # Create generic cell format
-        cell_format = workbook.add_format({'align': 'center', 'valign': 'vcenter'})
+        cell_format = workbook.add_format({'align': 'center', 'valign': 'vcenter', 'text_wrap': True})
         cell_format.set_border()
 
         # For each extracted table, create a corresponding worksheet and populate it

diff --git a/src/img2table/document/base/rotation.py b/src/img2table/document/base/rotation.py
@@ -90,7 +90,7 @@ def get_relevant_angles(centroids: np.ndarray, ref_height: float, n_max: int = 5
                  )
 
     # Get n most represented angles
-    most_likely_angles = (df_angles.groupby('angle')
+    most_likely_angles = (df_angles.group_by('angle')
                           .count()
                           .sort(by=['count', pl.col('angle').abs()], descending=[True, False])
                           .limit(n_max)

diff --git a/src/img2table/ocr/data.py b/src/img2table/ocr/data.py
@@ -65,7 +65,7 @@ def get_text_cell(self, cell: Cell, margin: int = 0, page_number: int = None, mi
 
         # Group text by parents
         df_text_parent = (df_words_contained
-                          .groupby('parent')
+                          .group_by('parent')
                           .agg([pl.col('x1').min(),
                                 pl.col('x2').max(),
                                 pl.col('y1').min(),
@@ -131,15 +131,15 @@ def get_text_table(self, table: Table, page_number: int = None, min_confidence:
 
         # Group text by parent
         df_text_parent = (df_words_contained
-                          .groupby(['row', 'col', 'parent'])
+                          .group_by(['row', 'col', 'parent'])
                           .agg([pl.col('x1').min(),
                                 pl.col('x2').max(),
                                 pl.col('y1').min(),
                                 pl.col('y2').max(),
-                                pl.col('value').apply(lambda x: ' '.join(x), return_dtype=str).alias('value')])
+                                pl.col('value').map_elements(lambda x: ' '.join(x), return_dtype=str).alias('value')])
                           .sort([pl.col("row"), pl.col("col"), pl.col('y1'), pl.col('x1')])
-                          .groupby(['row', 'col'])
-                          .agg(pl.col('value').apply(lambda x: '\n'.join(x).strip(), return_dtype=str).alias('text'))
+                          .group_by(['row', 'col'])
+                          .agg(pl.col('value').map_elements(lambda x: '\n'.join(x).strip(), return_dtype=str).alias('text'))
                           )
 
         # Implement found values to table cells content

diff --git a/src/img2table/tables/processing/bordered_tables/cells/identification.py b/src/img2table/tables/processing/bordered_tables/cells/identification.py
@@ -102,7 +102,7 @@ def get_cells_dataframe(horizontal_lines: List[Line], vertical_lines: List[Line]
 
     # Get all vertical delimiters by bbox
     df_bbox_delimiters = (df_bbox_v.sort(['idx', "x1_bbox", "x2_bbox", "y1_bbox", "y2_bbox", "x1"])
-                          .groupby(['idx', "x1_bbox", "x2_bbox", "y1_bbox", "y2_bbox"])
+                          .group_by(['idx', "x1_bbox", "x2_bbox", "y1_bbox", "y2_bbox"])
                           .agg(pl.col('x1').alias('dels'))
                           .filter(pl.col("dels").list.lengths() >= 2)
                           )

diff --git a/src/img2table/tables/processing/bordered_tables/lines.py b/src/img2table/tables/processing/bordered_tables/lines.py
@@ -105,7 +105,7 @@ def overlapping_filter(lines: List[Line], max_gap: int = 5) -> List[Line]:
         return []
 
     # Identify if rows are horizontal
-    horizontal = np.mean([l.horizontal for l in lines]) > 0.5
+    horizontal = np.average([l.horizontal for l in lines], weights=[l.length for l in lines]) > 0.5
 
     # If not horizontal, transpose all rows
     if not horizontal:
@@ -278,7 +278,7 @@ def remove_word_lines(lines: List[Line], contours: List[Cell]) -> List[Line]:
 
     # Get lines together with elements that intersect the line
     line_elements = (df_words_lines.filter(pl.col('intersection'))
-                     .groupby(["x1_line", "y1_line", "x2_line", "y2_line", "vertical", "thickness"])
+                     .group_by(["x1_line", "y1_line", "x2_line", "y2_line", "vertical", "thickness"])
                      .agg(pl.struct("x1", "y1", "x2", "y2").alias('intersecting'))
                      .join(df_words_lines.select(["x1_line", "y1_line", "x2_line", "y2_line", "vertical", "thickness"]),
                            on=["x1_line", "y1_line", "x2_line", "y2_line", "vertical", "thickness"],

diff --git a/src/img2table/tables/processing/borderless_tables/__init__.py b/src/img2table/tables/processing/borderless_tables/__init__.py
@@ -50,8 +50,7 @@ def identify_borderless_tables(img: np.ndarray, lines: List[Line], char_length:
     table_segments = segment_image(img=img,
                                    lines=lines,
                                    char_length=char_length,
-                                   median_line_sep=median_line_sep,
-                                   contours=contours)
+                                   median_line_sep=median_line_sep)
 
     # In each segment, create groups of rows and identify tables
     tables = list()
@@ -70,7 +69,9 @@ def identify_borderless_tables(img: np.ndarray, lines: List[Line], char_length:
                 borderless_table = identify_table(columns=column_group,
                                                   table_rows=table_rows,
                                                   contours=contours,
-                                                  lines=lines)
+                                                  lines=lines,
+                                                  median_line_sep=median_line_sep,
+                                                  char_length=char_length)
 
                 if borderless_table:
                     tables.append(borderless_table)

diff --git a/src/img2table/tables/processing/borderless_tables/column_delimiters/columns.py b/src/img2table/tables/processing/borderless_tables/column_delimiters/columns.py
@@ -17,7 +17,7 @@ def get_coherent_ws_height(vertical_ws: List[Cell], unused_ws: List[Cell],
     :return: tuple containing list of vertical whitespaces and list of unused whitespaces resized
     """
     # Define relevant ws
-    relevant_ws = [ws for ws in unused_ws if ws.height >= 0.75 * max([w.height for w in vertical_ws])]
+    relevant_ws = [ws for ws in unused_ws if ws.height >= 0.66 * max([w.height for w in vertical_ws])]
     relevant_ws += vertical_ws
 
     # Group elements in rows
@@ -84,7 +84,7 @@ def identify_missing_vertical_whitespaces(unused_ws: List[Cell], char_length: fl
     new_ws = list()
     # Check if clusters can create a new vertical whitespace
     for cl in ws_clusters:
-        if max([ws.y2 for ws in cl]) - min([ws.y1 for ws in cl]) >= 0.75 * ref_height:
+        if max([ws.y2 for ws in cl]) - min([ws.y1 for ws in cl]) >= 0.66 * ref_height:
             v_ws = Cell(x1=min([ws.x1 for ws in cl]),
                         y1=min([ws.y1 for ws in cl]),
                         x2=max([ws.x2 for ws in cl]),

diff --git a/src/img2table/tables/processing/borderless_tables/column_delimiters/vertical_whitespaces.py b/src/img2table/tables/processing/borderless_tables/column_delimiters/vertical_whitespaces.py
@@ -133,7 +133,7 @@ def get_vertical_whitespaces(table_segment: TableSegment, char_length: float) ->
     # Filter whitespaces by height
     max_height = max([ws.height for ws in vertical_ws])
     vertical_ws = [ws for ws in vertical_ws
-                   if (ws.height >= 0.75 * max_height and ws.width >= 0.5 * char_length) or ws.height == max_height]
+                   if (ws.height >= 0.66 * max_height and ws.width >= 0.5 * char_length) or ws.height == max_height]
 
     # Identify segment whitespaces that are unused
     unused_ws = [ws for ws in table_segment.whitespaces

diff --git a/src/img2table/tables/processing/borderless_tables/layout/__init__.py b/src/img2table/tables/processing/borderless_tables/layout/__init__.py
@@ -3,41 +3,38 @@
 
 import numpy as np
 
-from img2table.tables.objects.cell import Cell
 from img2table.tables.objects.line import Line
-from img2table.tables.processing.borderless_tables.layout.column_segmentation import segment_image_columns
-from img2table.tables.processing.borderless_tables.layout.segment_elements import get_segment_elements
+from img2table.tables.processing.borderless_tables.layout.column_segments import segment_image_columns
+from img2table.tables.processing.borderless_tables.layout.image_elements import get_image_elements
 from img2table.tables.processing.borderless_tables.layout.table_segments import get_table_segments
-from img2table.tables.processing.borderless_tables.model import TableSegment
+from img2table.tables.processing.borderless_tables.model import TableSegment, ImageSegment
 
 
-def segment_image(img: np.ndarray, lines: List[Line], char_length: float, median_line_sep: float,
-                  contours: List[Cell]) -> List[TableSegment]:
+def segment_image(img: np.ndarray, lines: List[Line], char_length: float, median_line_sep: float) -> List[TableSegment]:
     """
     Segment image and its elements
     :param img: image array
     :param lines: list of Line objects of the image
     :param char_length: average character length
     :param median_line_sep: median line separation
-    :param contours: list of image contours
     :return: list of ImageSegment objects with corresponding elements
     """
-    # Segment image using columns
-    column_segments = segment_image_columns(img=img,
-                                            median_line_sep=median_line_sep,
-                                            char_length=char_length,
-                                            contours=contours)
-
-    # Set segment elements
-    column_segments = get_segment_elements(img=img,
-                                           lines=lines,
-                                           img_segments=column_segments,
-                                           char_length=char_length,
-                                           median_line_sep=median_line_sep,
-                                           blur_size=3)
+    # Identify image elements
+    img_elements = get_image_elements(img=img,
+                                      lines=lines,
+                                      char_length=char_length,
+                                      median_line_sep=median_line_sep)
+
+    # Identify column segments
+    y_min, y_max = min([el.y1 for el in img_elements]), max([el.y2 for el in img_elements])
+    image_segment = ImageSegment(x1=0, y1=y_min, x2=img.shape[1], y2=y_max, elements=img_elements)
+
+    col_segments = segment_image_columns(image_segment=image_segment,
+                                         char_length=char_length,
+                                         lines=lines)
 
     # Within each column, identify segments that can correspond to tables
-    tb_segments = [table_segment for col_segment in column_segments
+    tb_segments = [table_segment for col_segment in col_segments
                    for table_segment in get_table_segments(segment=col_segment,
                                                            char_length=char_length,
                                                            median_line_sep=median_line_sep)