Merge pull request #34 from RapidAI/more_teds_compare

More teds compare
RapidAI · Sep 28, 2024 · 22c96a8 · 22c96a8
2 parents 7257c7a + e72fd56
commit 22c96a8
Show file tree

Hide file tree

Showing 6 changed files with 37 additions and 20 deletions.
diff --git a/README.md b/README.md
@@ -12,8 +12,12 @@
   <a href="https://github.com/RapidAI/TableStructureRec/blob/c41bbd23898cb27a957ed962b0ffee3c74dfeff1/LICENSE"><img alt="GitHub" src="https://img.shields.io/badge/license-Apache 2.0-blue"></a>
 </div>
 
-### 简介
+### 最近更新
+- **2024.9.26**
+  - 修正RapidTable默认英文模型导致的测评结果错误。
+  - 补充测评数据集，补充开源社区更多模型的测评结果。
 
+### 简介
 💖该仓库是用来对文档中表格做结构化识别的推理库，包括来自paddle的表格识别模型，
 阿里读光有线和无线表格识别模型，llaipython(微信)贡献的有线表格模型，网易Qanything内置表格分类模型等。
 
@@ -33,15 +37,20 @@
 
 ### 指标结果
 
-[TableRecognitionMetric 评测工具](https://github.com/SWHL/TableRecognitionMetric) [评测数据集](https://huggingface.co/datasets/SWHL/table_rec_test_dataset) [Rapid OCR](https://github.com/RapidAI/RapidOCR)
+[TableRecognitionMetric 评测工具](https://github.com/SWHL/TableRecognitionMetric) [huggingface数据集](https://huggingface.co/datasets/SWHL/table_rec_test_dataset) [modelscope 数据集](https://www.modelscope.cn/datasets/jockerK/TEDS_TEST/files) [Rapid OCR](https://github.com/RapidAI/RapidOCR)
+
+注：StructEqTable 输出为 LaTeX，仅对能成功转换为 HTML 的结果去除样式标签后进行测评。
+
+| 方法                                                                                                                        |    TEDS     | TEDS-only-structure |
+|:---------------------------------------------------------------------------------------------------------------------------|:-----------:|:-------------------:|
+| [deepdoctection(rag-flow)](https://github.com/deepdoctection/deepdoctection?tab=readme-ov-file)                            |   0.59975   |       0.69918       |
+| [ppstructure_table_master](https://github.com/PaddlePaddle/PaddleOCR/tree/main/ppstructure)                                |   0.61606   |       0.73892       |
+| [ppstructure_table_engine](https://github.com/PaddlePaddle/PaddleOCR/tree/main/ppstructure)                                |   0.67924   |       0.78653       |
+| table_cls + wired_table_rec v1 + lineless_table_rec                                                                        |   0.68507   |       0.75140       |
+| [StructEqTable](https://github.com/UniModal4Reasoning/StructEqTable-Deploy)                                                |   0.67310   |     **0.81210**     |
+| [RapidTable](https://github.com/RapidAI/RapidStructure/blob/b800b156015bf5cd6f5429295cdf48be682fd97e/docs/README_Table.md) |   0.71654   |       0.81067       |
+| table_cls + wired_table_rec v2 + lineless_table_rec                                                                        | **0.73702** |       0.80210       |
 
-| 方法                                                                                                                         |  TEDS   | TEDS-only-structure |
-|:---------------------------------------------------------------------------------------------------------------------------|:-------:|:-------------------:|
-| [RapidTable](https://github.com/RapidAI/RapidStructure/blob/b800b156015bf5cd6f5429295cdf48be682fd97e/docs/README_Table.md) | 0.59765 |       0.68996       |
-| ppstructure_table_master                                                                                                   | 0.59835 |       0.68996       |
-| table_cls + wired_table_rec v1 + lineless_table_rec                                                                        | 0.74692 |       0.83049       |
-| ppsturcture_table_engine                                                                                                   | 0.76835 |       0.83296       |
-| table_cls + wired_table_rec v2 + lineless_table_rec                                                                        | 0.80890 |       0.88011       |
 
 ### 安装
 
@@ -69,9 +78,15 @@ if cls == 'wired':
     table_engine = wired_engine
 else:
     table_engine = lineless_engine
+
 html, elasp, polygons, logic_points, ocr_res = table_engine(img_path)
 print(f"elasp: {elasp}")
 
+# 使用其他ocr模型
+#ocr_engine =RapidOCR(det_model_dir="xxx/det_server_infer.onnx",rec_model_dir="xxx/rec_server_infer.onnx")
+#ocr_res, _ = ocr_engine(img_path)
+#html, elasp, polygons, logic_points, ocr_res = table_engine(img_path, ocr_result=ocr_res)  
+
 # output_dir = f'outputs'
 # complete_html = format_html(html)
 # os.makedirs(os.path.dirname(f"{output_dir}/table.html"), exist_ok=True)
@@ -105,8 +120,7 @@ cv2.imwrite(f'img_rotated.jpg', img)
     - 答：该项目暂时不支持偏移图片识别，请先修正图片，也欢迎提pr来解决这个问题。
 
 2. **问：识别框丢失了内部文字信息**
-   -
-   答：默认使用的rapidocr小模型，如果需要更高精度的效果，可以从 [模型列表](https://rapidai.github.io/RapidOCRDocs/model_list/#_1)
+   - 答：默认使用的rapidocr小模型，如果需要更高精度的效果，可以从 [模型列表](https://rapidai.github.io/RapidOCRDocs/model_list/#_1)
   下载更高精度的ocr模型，在执行时传入 ocr_result 即可。
 
 3. **问：模型支持 gpu 加速吗？**
@@ -116,8 +130,9 @@ cv2.imwrite(f'img_rotated.jpg', img)
 
 ### TODO List
 
-- [ ] 识别前图片偏移修正(完成有线表格小角度偏移修正)
-- [ ] 增加数据集数量，增加更多评测对比
+- [x] 图片小角度偏移修正方法补充
+- [x] 增加数据集数量，增加更多评测对比
+- [ ] 补充复杂场景表格检测和提取，解决旋转和透视导致的低识别率
 - [ ] 优化无线表格模型
 
 ### 处理流程

diff --git a/lineless_table_rec/utils_table_recover.py b/lineless_table_rec/utils_table_recover.py
@@ -421,7 +421,6 @@ def plot_html_table(
     grid = [[None] * max_col for _ in range(max_row)]
 
     valid_start_row = (1 << 16) - 1
-    valid_end_row = 0
     valid_start_col = (1 << 16) - 1
     valid_end_col = 0
     # 将 sorted_logi_points 中的元素填充到 grid 中
@@ -436,7 +435,6 @@ def plot_html_table(
         if ocr_rec_text_list and "".join(ocr_rec_text_list):
             valid_start_row = min(row_start, valid_start_row)
             valid_start_col = min(col_start, valid_start_col)
-            valid_end_row = max(row_end, valid_end_row)
             valid_end_col = max(col_end, valid_end_col)
         for row in range(row_start, row_end + 1):
             for col in range(col_start, col_end + 1):
@@ -447,7 +445,7 @@ def plot_html_table(
 
     # 遍历每行
     for row in range(max_row):
-        if row < valid_start_row or row > valid_end_row:
+        if row < valid_start_row:
             continue
         temp = "<tr>"
         # 遍历每一列

diff --git a/tests/test_files/wired/no_table.jpg b/tests/test_files/wired/no_table.jpg
diff --git a/tests/test_wired_table_rec.py b/tests/test_wired_table_rec.py
@@ -32,6 +32,8 @@
 
 def get_td_nums(html: str) -> int:
     soup = BeautifulSoup(html, "html.parser")
+    if not soup.table:
+        return 0
     tds = soup.table.find_all("td")
     return len(tds)
 
@@ -41,7 +43,7 @@ def test_squeeze_bug():
     ocr_result, _ = ocr_engine(img_path)
     table_str, *_ = table_recog(str(img_path), ocr_result)
     td_nums = get_td_nums(table_str)
-    assert td_nums == 291
+    assert td_nums == 192
 
 
 @pytest.mark.parametrize(
@@ -50,6 +52,7 @@ def test_squeeze_bug():
         ("table_recognition.jpg", 35, "d colsp"),
         ("table2.jpg", 23, "td><td "),
         ("row_span.png", 17, "></td><"),
+        ("no_table.jpg", 1, "d colsp"),
     ],
 )
 def test_input_normal(img_path, gt_td_nums, gt2):

diff --git a/wired_table_rec/table_recover.py b/wired_table_rec/table_recover.py
@@ -43,6 +43,9 @@ def get_rows(polygons: np.array) -> Dict[int, List[int]]:
         result = {}
         thresh = 10.0
         split_idxs = np.argwhere(abs(minus_res) > thresh).squeeze()
+        # 如果都在一行，则将所有下标设置为同一行
+        if split_idxs.size == 0:
+            return {0: [i for i in range(len(y_axis))]}
         if split_idxs.ndim == 0:
             split_idxs = split_idxs[None, ...]
 

diff --git a/wired_table_rec/utils_table_recover.py b/wired_table_rec/utils_table_recover.py
@@ -572,7 +572,6 @@ def plot_html_table(
     grid = [[None] * max_col for _ in range(max_row)]
 
     valid_start_row = (1 << 16) - 1
-    valid_end_row = 0
     valid_start_col = (1 << 16) - 1
     valid_end_col = 0
     # 将 sorted_logi_points 中的元素填充到 grid 中
@@ -587,7 +586,6 @@ def plot_html_table(
         if ocr_rec_text_list and "".join(ocr_rec_text_list):
             valid_start_row = min(row_start, valid_start_row)
             valid_start_col = min(col_start, valid_start_col)
-            valid_end_row = max(row_end, valid_end_row)
             valid_end_col = max(col_end, valid_end_col)
         for row in range(row_start, row_end + 1):
             for col in range(col_start, col_end + 1):
@@ -598,7 +596,7 @@ def plot_html_table(
 
     # 遍历每行
     for row in range(max_row):
-        if row < valid_start_row or row > valid_end_row:
+        if row < valid_start_row:
             continue
         temp = "<tr>"
         # 遍历每一列