Skip to content

Commit

Permalink
Coherency (#112)
Browse files (browse the repository at this point in the history)
* Make segmentation stricter

* Add text wrap when exporting to Excel

* Add coherency checks on tables

* Update column detection

* Update column detection

* Update tests

* Update table creation

* Update polars requirements
  • Loading branch information
xavctn authored Sep 27, 2023
1 parent 94934d7 commit fcbc3fa
Show file tree
Hide file tree
Showing 31 changed files with 667 additions and 518 deletions.
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
polars[pandas]>=0.18.7
polars[pandas]>=0.18.7,<0.19.4
pyarrow>=7
numpy
pymupdf>=1.19.1
Expand Down
2 changes: 1 addition & 1 deletion src/img2table/document/base/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def to_xlsx(self, dest: Union[str, Path, io.BytesIO], ocr: "OCRInstance" = None,
workbook = xlsxwriter.Workbook(dest, {'in_memory': True})

# Create generic cell format
cell_format = workbook.add_format({'align': 'center', 'valign': 'vcenter'})
cell_format = workbook.add_format({'align': 'center', 'valign': 'vcenter', 'text_wrap': True})
cell_format.set_border()

# For each extracted table, create a corresponding worksheet and populate it
Expand Down
2 changes: 1 addition & 1 deletion src/img2table/document/base/rotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ def get_relevant_angles(centroids: np.ndarray, ref_height: float, n_max: int = 5
)

# Get n most represented angles
most_likely_angles = (df_angles.groupby('angle')
most_likely_angles = (df_angles.group_by('angle')
.count()
.sort(by=['count', pl.col('angle').abs()], descending=[True, False])
.limit(n_max)
Expand Down
10 changes: 5 additions & 5 deletions src/img2table/ocr/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ def get_text_cell(self, cell: Cell, margin: int = 0, page_number: int = None, mi

# Group text by parents
df_text_parent = (df_words_contained
.groupby('parent')
.group_by('parent')
.agg([pl.col('x1').min(),
pl.col('x2').max(),
pl.col('y1').min(),
Expand Down Expand Up @@ -131,15 +131,15 @@ def get_text_table(self, table: Table, page_number: int = None, min_confidence:

# Group text by parent
df_text_parent = (df_words_contained
.groupby(['row', 'col', 'parent'])
.group_by(['row', 'col', 'parent'])
.agg([pl.col('x1').min(),
pl.col('x2').max(),
pl.col('y1').min(),
pl.col('y2').max(),
pl.col('value').apply(lambda x: ' '.join(x), return_dtype=str).alias('value')])
pl.col('value').map_elements(lambda x: ' '.join(x), return_dtype=str).alias('value')])
.sort([pl.col("row"), pl.col("col"), pl.col('y1'), pl.col('x1')])
.groupby(['row', 'col'])
.agg(pl.col('value').apply(lambda x: '\n'.join(x).strip(), return_dtype=str).alias('text'))
.group_by(['row', 'col'])
.agg(pl.col('value').map_elements(lambda x: '\n'.join(x).strip(), return_dtype=str).alias('text'))
)

# Implement found values to table cells content
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def get_cells_dataframe(horizontal_lines: List[Line], vertical_lines: List[Line]

# Get all vertical delimiters by bbox
df_bbox_delimiters = (df_bbox_v.sort(['idx', "x1_bbox", "x2_bbox", "y1_bbox", "y2_bbox", "x1"])
.groupby(['idx', "x1_bbox", "x2_bbox", "y1_bbox", "y2_bbox"])
.group_by(['idx', "x1_bbox", "x2_bbox", "y1_bbox", "y2_bbox"])
.agg(pl.col('x1').alias('dels'))
.filter(pl.col("dels").list.lengths() >= 2)
)
Expand Down
4 changes: 2 additions & 2 deletions src/img2table/tables/processing/bordered_tables/lines.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ def overlapping_filter(lines: List[Line], max_gap: int = 5) -> List[Line]:
return []

# Identify if rows are horizontal
horizontal = np.mean([l.horizontal for l in lines]) > 0.5
horizontal = np.average([l.horizontal for l in lines], weights=[l.length for l in lines]) > 0.5

# If not horizontal, transpose all rows
if not horizontal:
Expand Down Expand Up @@ -278,7 +278,7 @@ def remove_word_lines(lines: List[Line], contours: List[Cell]) -> List[Line]:

# Get lines together with elements that intersect the line
line_elements = (df_words_lines.filter(pl.col('intersection'))
.groupby(["x1_line", "y1_line", "x2_line", "y2_line", "vertical", "thickness"])
.group_by(["x1_line", "y1_line", "x2_line", "y2_line", "vertical", "thickness"])
.agg(pl.struct("x1", "y1", "x2", "y2").alias('intersecting'))
.join(df_words_lines.select(["x1_line", "y1_line", "x2_line", "y2_line", "vertical", "thickness"]),
on=["x1_line", "y1_line", "x2_line", "y2_line", "vertical", "thickness"],
Expand Down
7 changes: 4 additions & 3 deletions src/img2table/tables/processing/borderless_tables/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,7 @@ def identify_borderless_tables(img: np.ndarray, lines: List[Line], char_length:
table_segments = segment_image(img=img,
lines=lines,
char_length=char_length,
median_line_sep=median_line_sep,
contours=contours)
median_line_sep=median_line_sep)

# In each segment, create groups of rows and identify tables
tables = list()
Expand All @@ -70,7 +69,9 @@ def identify_borderless_tables(img: np.ndarray, lines: List[Line], char_length:
borderless_table = identify_table(columns=column_group,
table_rows=table_rows,
contours=contours,
lines=lines)
lines=lines,
median_line_sep=median_line_sep,
char_length=char_length)

if borderless_table:
tables.append(borderless_table)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def get_coherent_ws_height(vertical_ws: List[Cell], unused_ws: List[Cell],
:return: tuple containing list of vertical whitespaces and list of unused whitespaces resized
"""
# Define relevant ws
relevant_ws = [ws for ws in unused_ws if ws.height >= 0.75 * max([w.height for w in vertical_ws])]
relevant_ws = [ws for ws in unused_ws if ws.height >= 0.66 * max([w.height for w in vertical_ws])]
relevant_ws += vertical_ws

# Group elements in rows
Expand Down Expand Up @@ -84,7 +84,7 @@ def identify_missing_vertical_whitespaces(unused_ws: List[Cell], char_length: fl
new_ws = list()
# Check if clusters can create a new vertical whitespace
for cl in ws_clusters:
if max([ws.y2 for ws in cl]) - min([ws.y1 for ws in cl]) >= 0.75 * ref_height:
if max([ws.y2 for ws in cl]) - min([ws.y1 for ws in cl]) >= 0.66 * ref_height:
v_ws = Cell(x1=min([ws.x1 for ws in cl]),
y1=min([ws.y1 for ws in cl]),
x2=max([ws.x2 for ws in cl]),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def get_vertical_whitespaces(table_segment: TableSegment, char_length: float) ->
# Filter whitespaces by height
max_height = max([ws.height for ws in vertical_ws])
vertical_ws = [ws for ws in vertical_ws
if (ws.height >= 0.75 * max_height and ws.width >= 0.5 * char_length) or ws.height == max_height]
if (ws.height >= 0.66 * max_height and ws.width >= 0.5 * char_length) or ws.height == max_height]

# Identify segment whitespaces that are unused
unused_ws = [ws for ws in table_segment.whitespaces
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,41 +3,38 @@

import numpy as np

from img2table.tables.objects.cell import Cell
from img2table.tables.objects.line import Line
from img2table.tables.processing.borderless_tables.layout.column_segmentation import segment_image_columns
from img2table.tables.processing.borderless_tables.layout.segment_elements import get_segment_elements
from img2table.tables.processing.borderless_tables.layout.column_segments import segment_image_columns
from img2table.tables.processing.borderless_tables.layout.image_elements import get_image_elements
from img2table.tables.processing.borderless_tables.layout.table_segments import get_table_segments
from img2table.tables.processing.borderless_tables.model import TableSegment
from img2table.tables.processing.borderless_tables.model import TableSegment, ImageSegment


def segment_image(img: np.ndarray, lines: List[Line], char_length: float, median_line_sep: float,
contours: List[Cell]) -> List[TableSegment]:
def segment_image(img: np.ndarray, lines: List[Line], char_length: float, median_line_sep: float) -> List[TableSegment]:
"""
Segment image and its elements
:param img: image array
:param lines: list of Line objects of the image
:param char_length: average character length
:param median_line_sep: median line separation
:param contours: list of image contours
:return: list of ImageSegment objects with corresponding elements
"""
# Segment image using columns
column_segments = segment_image_columns(img=img,
median_line_sep=median_line_sep,
char_length=char_length,
contours=contours)

# Set segment elements
column_segments = get_segment_elements(img=img,
lines=lines,
img_segments=column_segments,
char_length=char_length,
median_line_sep=median_line_sep,
blur_size=3)
# Identify image elements
img_elements = get_image_elements(img=img,
lines=lines,
char_length=char_length,
median_line_sep=median_line_sep)

# Identify column segments
y_min, y_max = min([el.y1 for el in img_elements]), max([el.y2 for el in img_elements])
image_segment = ImageSegment(x1=0, y1=y_min, x2=img.shape[1], y2=y_max, elements=img_elements)

col_segments = segment_image_columns(image_segment=image_segment,
char_length=char_length,
lines=lines)

# Within each column, identify segments that can correspond to tables
tb_segments = [table_segment for col_segment in column_segments
tb_segments = [table_segment for col_segment in col_segments
for table_segment in get_table_segments(segment=col_segment,
char_length=char_length,
median_line_sep=median_line_sep)
Expand Down
Loading

0 comments on commit fcbc3fa

Please sign in to comment.