Skip to content

Commit

Permalink
Introduce ListItem API, with marker and enumerated properties
Browse files Browse the repository at this point in the history
Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
  • Loading branch information
cau-git committed Oct 11, 2024
1 parent e42a1dd commit baceeae
Show file tree
Hide file tree
Showing 4 changed files with 96 additions and 6 deletions.
73 changes: 71 additions & 2 deletions docling_core/types/experimental/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,14 @@ class DocumentOrigin(BaseModel):
# from any file handler protocol (e.g. https://, file://, s3://)
)

# MIME types accepted in addition to the values of ``mimetypes.types_map``
# (OOXML word-processing and presentation formats).
#
# The ClassVar annotation is required: inside a pydantic model an
# un-annotated, underscore-prefixed name is converted into a private
# attribute, so ``cls._extra_mimetypes`` in a classmethod validator would not
# be this list and the membership test would fail with a TypeError.
_extra_mimetypes: typing.ClassVar[List[str]] = [
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    "application/vnd.openxmlformats-officedocument.wordprocessingml.template",
    "application/vnd.openxmlformats-officedocument.presentationml.template",
    "application/vnd.openxmlformats-officedocument.presentationml.slideshow",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
]

@field_validator("binary_hash", mode="before")
@classmethod
def parse_hex_string(cls, value):
Expand All @@ -164,7 +172,7 @@ def parse_hex_string(cls, value):
def validate_mimetype(cls, v):
"""validate_mimetype."""
# Check if the provided MIME type is valid using mimetypes module
if v not in mimetypes.types_map.values():
if v not in mimetypes.types_map.values() and v not in cls._extra_mimetypes:
raise ValueError(f"'{v}' is not a valid MIME type")
return v

Expand Down Expand Up @@ -347,6 +355,14 @@ class SectionHeaderItem(TextItem):
level: LevelNumber


class ListItem(TextItem):
    """ListItem.

    A text item that is one entry of a list, carrying the marker that
    prefixes it.
    """

    label: typing.Literal[DocItemLabel.LIST_ITEM] = DocItemLabel.LIST_ITEM
    # Whether the item is enumerated (numbered) rather than bulleted.
    enumerated: bool = False
    marker: str  # The bullet or number symbol that prefixes this list item


class FloatingItem(DocItem):
"""FloatingItem."""

Expand Down Expand Up @@ -689,7 +705,7 @@ class DoclingDocument(BaseModel):
body: GroupItem = GroupItem(name="_root_", self_ref="#/body") # List[RefItem] = []

groups: List[GroupItem] = []
texts: List[Union[SectionHeaderItem, TextItem]] = []
texts: List[Union[SectionHeaderItem, ListItem, TextItem]] = []
pictures: List[PictureItem] = []
tables: List[TableItem] = []
key_value_items: List[KeyValueItem] = []
Expand Down Expand Up @@ -726,6 +742,50 @@ def add_group(

return group

def add_list_item(
    self,
    text: str,
    enumerated: bool = False,
    marker: Optional[str] = None,
    orig: Optional[str] = None,
    prov: Optional[ProvenanceItem] = None,
    parent: Optional[GroupItem] = None,
) -> ListItem:
    """add_list_item.

    Create a ListItem, append it to ``self.texts`` and register it as a
    child of ``parent``.

    :param text: str: Text of the list item.
    :param enumerated: bool: Stored on the item as ``enumerated``.
        (Default value = False)
    :param marker: Optional[str]: Symbol prefixing the item; falls back to
        "-" when not given or empty. (Default value = None)
    :param orig: Optional[str]: Original text; falls back to ``text`` when
        not given or empty. (Default value = None)
    :param prov: Optional[ProvenanceItem]: Provenance appended to the new
        item when given. (Default value = None)
    :param parent: Optional[GroupItem]: Group receiving the item; falls back
        to ``self.body``. (Default value = None)
    :returns: The newly created ListItem.
    """
    parent = parent or self.body
    orig = orig or text
    marker = marker or "-"

    # The new item's reference is its future position in self.texts, so it
    # must be computed before the item is appended.
    cref = f"#/texts/{len(self.texts)}"
    list_item = ListItem(
        text=text,
        orig=orig,
        self_ref=cref,
        parent=parent.get_ref(),
        enumerated=enumerated,
        marker=marker,
    )
    if prov:
        list_item.prov.append(prov)

    self.texts.append(list_item)
    parent.children.append(RefItem(cref=cref))

    return list_item

def add_text(
self,
label: str,
Expand Down Expand Up @@ -1061,6 +1121,15 @@ def export_to_markdown( # noqa: C901
else:
markdown_text = f"## {text}"

# list items
elif isinstance(item, ListItem):
if item.enumerated:
marker = item.marker
else:
marker = "-"

markdown_text = f"{marker} {text}"

# normal text
else:
markdown_text = text
Expand Down
5 changes: 4 additions & 1 deletion docling_core/types/experimental/labels.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,10 @@ class GroupLabel(str, Enum):
"""GroupLabel."""

UNSPECIFIED = "unspecified"
LIST = "list" # group label for list container (not the list-items)
LIST = (
"list" # group label for list container (not the list-items) (e.g. HTML <ul/>)
)
ORDERED_LIST = "ordered_list" # List with enumeration (e.g. HTML <ol/>)
CHAPTER = "chapter"
SECTION = "section"
SHEET = "sheet"
Expand Down
9 changes: 9 additions & 0 deletions test/data/docling_document/unit/ListItem.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
children: []
enumerated: true
label: list_item
marker: (1)
orig: whatever
parent: null
prov: []
self_ref: '#'
text: whatever
15 changes: 12 additions & 3 deletions test/test_docling_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
DoclingDocument,
FloatingItem,
KeyValueItem,
ListItem,
PictureItem,
SectionHeaderItem,
TableCell,
Expand Down Expand Up @@ -52,9 +53,9 @@ def read(name: str):
return gold

def verify(dc, obj):
# Assert that the serialised form of `obj` equals what read() returns for
# the class name of `dc`.
pred = serialise(obj)
# Leading/trailing whitespace is stripped from both values before comparing.
pred = serialise(obj).strip()
# print(f"\t{dc.__name__}:\n {pred}")
gold = read(dc.__name__)
gold = read(dc.__name__).strip()

assert pred == gold, f"pred!=gold for {dc.__name__}"

Expand All @@ -70,7 +71,15 @@ def verify(dc, obj):
self_ref="#",
)
verify(dc, obj)

elif dc is ListItem:
obj = dc(
text="whatever",
orig="whatever",
marker="(1)",
enumerated=True,
self_ref="#",
)
verify(dc, obj)
elif dc is FloatingItem:
obj = dc(
label=DocItemLabel.TEXT,
Expand Down

0 comments on commit baceeae

Please sign in to comment.