Fixes for DoclingDocument and aligned methods on legacy doc

Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
DS4SD · Oct 14, 2024 · 7322553 · 7322553
1 parent f2b3afa
commit 7322553
Show file tree

Hide file tree

Showing 2 changed files with 16 additions and 5 deletions.
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
@@ -6,7 +6,7 @@
 """Models for the Docling Document data type."""
 
 from datetime import datetime
-from typing import Generic, Optional, Union
+from typing import Dict, Generic, Optional, Union
 
 from pydantic import (
     AnyHttpUrl,
@@ -434,6 +434,10 @@ def get_map_to_page_dimensions(self):
 
         return pagedims
 
+    def export_to_dict(self) -> Dict:
+        """Export the document as a JSON-serializable dictionary."""
+        return self.model_dump(mode="json", by_alias=True, exclude_none=True)
+
     def export_to_markdown(  # noqa: C901
         self,
         delim: str = "\n\n",

diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py
@@ -97,7 +97,12 @@ class TableCell(BaseModel):
     def from_dict_format(cls, data: Any) -> Any:
         """from_dict_format."""
         if isinstance(data, Dict):
-            if "bbox" not in data or data["bbox"] is None:
+            # Check if this is a native BoundingBox or a bbox from docling-ibm-models
+            if (
+                "bbox" not in data
+                or data["bbox"] is None
+                or isinstance(data["bbox"], BoundingBox)
+            ):
                 return data
             text = data["bbox"].get("token", "")
             if not len(text):
@@ -403,7 +408,7 @@ def caption_text(self, doc: "DoclingDocument") -> str:
         text = ""
         for cap in self.captions:
             text += cap.resolve(doc).text
-        return ""
+        return text
 
 
 class PictureItem(FloatingItem):
@@ -469,7 +474,7 @@ class TableItem(FloatingItem):
 
     def export_to_dataframe(self) -> pd.DataFrame:
         """Export the table as a Pandas DataFrame."""
-        if self.data is None or self.data.num_rows == 0 or self.data.num_cols == 0:
+        if self.data.num_rows == 0 or self.data.num_cols == 0:
             return pd.DataFrame()
 
         # Count how many rows are column headers
@@ -815,7 +820,7 @@ def add_list_item(
 
     def add_text(
         self,
-        label: str,
+        label: DocItemLabel,
         text: str,
         orig: Optional[str] = None,
         prov: Optional[ProvenanceItem] = None,
@@ -1167,6 +1172,7 @@ def export_to_markdown(  # noqa: C901
                     # Compute the caption
                     if caption := item.caption_text(self):
                         parts.append(caption)
+                        parts.append("\n")
 
                     # Rendered the item
                     if not strict_text:
@@ -1183,6 +1189,7 @@ def export_to_markdown(  # noqa: C901
                     # Compute the caption
                     if caption := item.caption_text(self):
                         parts.append(caption)
+                        parts.append("\n")
 
                     # Rendered the item
                     if not strict_text: