Skip to content

Commit

Permalink
Fixes for DoclingDocument and aligned methods on legacy doc
Browse files Browse the repository at this point in the history
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
  • Loading branch information
cau-git committed Oct 14, 2024
1 parent f2b3afa commit 7322553
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 5 deletions.
6 changes: 5 additions & 1 deletion docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"""Models for the Docling Document data type."""

from datetime import datetime
from typing import Generic, Optional, Union
from typing import Dict, Generic, Optional, Union

from pydantic import (
AnyHttpUrl,
Expand Down Expand Up @@ -434,6 +434,10 @@ def get_map_to_page_dimensions(self):

return pagedims

def export_to_dict(self) -> Dict:
    """Export this document as a dictionary.

    Delegates to ``self.model_dump`` with ``mode="json"``, ``by_alias=True``
    and ``exclude_none=True``. Assuming this is pydantic's ``model_dump``
    (the module imports from pydantic), the result contains only
    JSON-serializable values, is keyed by field aliases where defined, and
    omits fields whose value is ``None``.

    Returns:
        The dictionary produced by ``model_dump``.
    """
    return self.model_dump(mode="json", by_alias=True, exclude_none=True)

def export_to_markdown( # noqa: C901
self,
delim: str = "\n\n",
Expand Down
15 changes: 11 additions & 4 deletions docling_core/types/experimental/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,12 @@ class TableCell(BaseModel):
def from_dict_format(cls, data: Any) -> Any:
"""from_dict_format."""
if isinstance(data, Dict):
if "bbox" not in data or data["bbox"] is None:
# Check if this is a native BoundingBox or a bbox from docling-ibm-models
if (
"bbox" not in data
or data["bbox"] is None
or isinstance(data["bbox"], BoundingBox)
):
return data
text = data["bbox"].get("token", "")
if not len(text):
Expand Down Expand Up @@ -403,7 +408,7 @@ def caption_text(self, doc: "DoclingDocument") -> str:
text = ""
for cap in self.captions:
text += cap.resolve(doc).text
return ""
return text


class PictureItem(FloatingItem):
Expand Down Expand Up @@ -469,7 +474,7 @@ class TableItem(FloatingItem):

def export_to_dataframe(self) -> pd.DataFrame:
"""Export the table as a Pandas DataFrame."""
if self.data is None or self.data.num_rows == 0 or self.data.num_cols == 0:
if self.data.num_rows == 0 or self.data.num_cols == 0:
return pd.DataFrame()

# Count how many rows are column headers
Expand Down Expand Up @@ -815,7 +820,7 @@ def add_list_item(

def add_text(
self,
label: str,
label: DocItemLabel,
text: str,
orig: Optional[str] = None,
prov: Optional[ProvenanceItem] = None,
Expand Down Expand Up @@ -1167,6 +1172,7 @@ def export_to_markdown( # noqa: C901
# Compute the caption
if caption := item.caption_text(self):
parts.append(caption)
parts.append("\n")

# Render the item
if not strict_text:
Expand All @@ -1183,6 +1189,7 @@ def export_to_markdown( # noqa: C901
# Compute the caption
if caption := item.caption_text(self):
parts.append(caption)
parts.append("\n")

# Render the item
if not strict_text:
Expand Down

0 comments on commit 7322553

Please sign in to comment.