From edc9c8053c46db9b786a3283fb442b5a1f15f8b9 Mon Sep 17 00:00:00 2001 From: Xavier C Date: Wed, 22 Nov 2023 20:51:22 +0100 Subject: [PATCH] Update table creation --- src/img2table/document/base/__init__.py | 5 ++++- src/img2table/tables/objects/table.py | 3 ++- .../processing/bordered_tables/tables/table_creation.py | 5 +++-- .../processing/borderless_tables/table/table_creation.py | 4 ++-- 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/img2table/document/base/__init__.py b/src/img2table/document/base/__init__.py index e8129b9..248f410 100644 --- a/src/img2table/document/base/__init__.py +++ b/src/img2table/document/base/__init__.py @@ -103,7 +103,10 @@ def get_table_content(self, tables: Dict[int, List["Table"]], ocr: "OCRInstance" # Reset OCR self.ocr_df = None - return {k: [tb.extracted_table for tb in v] for k, v in tables.items()} + return {k: [tb.extracted_table for tb in v + if (max(tb.nb_rows, tb.nb_columns) >= 2 and not tb._borderless) + or (tb.nb_rows >= 2 and tb.nb_columns >= 3)] + for k, v in tables.items()} def extract_tables(self, ocr: "OCRInstance" = None, implicit_rows: bool = False, borderless_tables: bool = False, min_confidence: int = 50) -> Dict[int, List[ExtractedTable]]: diff --git a/src/img2table/tables/objects/table.py b/src/img2table/tables/objects/table.py index 05b5199..49acfb7 100644 --- a/src/img2table/tables/objects/table.py +++ b/src/img2table/tables/objects/table.py @@ -11,7 +11,7 @@ class Table(TableObject): - def __init__(self, rows: Union[Row, List[Row]]): + def __init__(self, rows: Union[Row, List[Row]], borderless: bool = False): if rows is None: self._items = [] elif isinstance(rows, Row): @@ -19,6 +19,7 @@ def __init__(self, rows: Union[Row, List[Row]]): else: self._items = rows self._title = None + self._borderless = borderless @property def items(self) -> List[Row]: diff --git a/src/img2table/tables/processing/bordered_tables/tables/table_creation.py b/src/img2table/tables/processing/bordered_tables/tables/table_creation.py index 120e6e9..41ff303 100644 --- a/src/img2table/tables/processing/bordered_tables/tables/table_creation.py +++ b/src/img2table/tables/processing/bordered_tables/tables/table_creation.py @@ -85,11 +85,12 @@ def remove_unwanted_elements(table: Table, elements: List[Cell]) -> Table: return table -def cluster_to_table(cluster_cells: List[Cell], elements: List[Cell]) -> Table: +def cluster_to_table(cluster_cells: List[Cell], elements: List[Cell], borderless: bool = False) -> Table: """ Convert a cell cluster to a Table object :param cluster_cells: list of cells that form a table :param elements: list of image elements + :param borderless: boolean indicating if the created table is borderless :return: table with rows inferred from table cells """ # Get list of vertical delimiters @@ -126,7 +127,7 @@ def cluster_to_table(cluster_cells: List[Cell], elements: List[Cell]) -> Table: list_rows.append(Row(cells=list_cells)) # Create table - table = Table(rows=list_rows) + table = Table(rows=list_rows, borderless=borderless) # Remove empty/unnecessary rows and columns from the table, based on elements processed_table = remove_unwanted_elements(table=table, elements=elements) diff --git a/src/img2table/tables/processing/borderless_tables/table/table_creation.py b/src/img2table/tables/processing/borderless_tables/table/table_creation.py index 56e0093..75861cc 100644 --- a/src/img2table/tables/processing/borderless_tables/table/table_creation.py +++ b/src/img2table/tables/processing/borderless_tables/table/table_creation.py @@ -69,6 +69,6 @@ def get_table(columns: DelimiterGroup, table_rows: List[TableRow], contours: Lis list_cells.append(cell) # Create table object - table = cluster_to_table(cluster_cells=list_cells, elements=contours) + table = cluster_to_table(cluster_cells=list_cells, elements=contours, borderless=True) - return table if table.nb_columns >= 3 and table.nb_rows >= 3 else None + return table if table.nb_columns >= 3 and table.nb_rows >= 2 else None