feat: support heading as chunk metadata #36

Merged (1 commit) on Sep 30, 2024
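This PR adds an optional heading field to ChunkWithMetadata and a heading_as_metadata switch on HierarchicalChunker: when enabled, the enclosing level-1 subtitle is emitted as chunk metadata instead of being prepended to the chunk text. A minimal usage sketch, modeled on the new test below (the import paths and the DLDocument alias are assumptions based on the surrounding code, not part of this diff):

```python
from docling_core.transforms.chunker import HierarchicalChunker
from docling_core.types import Document as DLDocument  # assumed import, aliased as in the tests

# Load a docling document (fixture path taken from the tests below).
with open("test/data/chunker/0_inp_dl_doc.json") as f:
    dl_doc = DLDocument.model_validate_json(f.read())

# With heading_as_metadata=True the enclosing level-1 subtitle is reported
# via chunk.heading rather than prepended to chunk.text.
chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=True)
for chunk in chunker.chunk(dl_doc=dl_doc):
    print(chunk.path, chunk.heading, chunk.text[:40])
```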
5 changes: 3 additions & 2 deletions docling_core/transforms/chunker/base.py
@@ -22,8 +22,9 @@ class Chunk(BaseModel):
 class ChunkWithMetadata(Chunk):
     """Data model for Chunk including metadata."""

-    page: Optional[int]
-    bbox: Optional[BoundingBox]
+    page: Optional[int] = None
+    bbox: Optional[BoundingBox] = None
+    heading: Optional[str] = None


 class BaseChunker(BaseModel, ABC):
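The new None defaults keep the model backward compatible: page, bbox, and heading can all be omitted, and model_dump(exclude_none=True), as used by the updated tests, drops whatever was not set. A small sketch, assuming path and text are the fields inherited from Chunk, as the test fixtures suggest:

```python
from docling_core.transforms.chunker.base import ChunkWithMetadata

# heading, like page and bbox, is optional and defaults to None
chunk = ChunkWithMetadata(
    path="$.main-text[4]",
    text="This one should also include the subtitle above since it is long enough.",
    page=3,
    heading="Some subtitle",
)

# exclude_none=True drops the unset bbox, matching the updated fixtures
print(chunk.model_dump(exclude_none=True))
```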
41 changes: 29 additions & 12 deletions docling_core/transforms/chunker/hierarchical_chunker.py
@@ -26,6 +26,7 @@ class HierarchicalChunker(BaseChunker):
     """Chunker implementation leveraging the document layout."""

     include_metadata: bool = True
+    heading_as_metadata: bool = False
     min_chunk_len: PositiveInt = 64

     class _NodeType(str, Enum):
@@ -184,7 +185,7 @@ class _TextEntry(BaseModel):

     def _build_chunk_impl(
         self, doc: DLDocument, doc_map: _DocContext, idx: int, rec: bool = False
-    ) -> list[_TextEntry]:
+    ) -> tuple[list[_TextEntry], Optional[str]]:
         if doc.main_text:
             item = doc.main_text[idx]
             item_type = _HC._norm(item.obj_type)
@@ -193,7 +194,7 @@ def _build_chunk_impl(
                 item_type not in self._allowed_types
                 or item_name in self._disallowed_names_by_type.get(item_type, [])
             ):
-                return []
+                return [], None

             c2p = doc_map.dmap

@@ -219,7 +220,7 @@ def _build_chunk_impl(
                     else []
                 )
             else:
-                return []
+                return [], None
         elif isinstance(item, BaseText):
             text_entries = [
                 self._TextEntry(
@@ -248,21 +249,29 @@ def _build_chunk_impl(
                 _HC._NodeName.LIST_ITEM,
                 _HC._NodeName.SUBTITLE_LEVEL_1,
             ]:
-                return []
+                return [], None

             if (parent := c2p[idx].parent) is not None:
                 # prepend with ancestors
+                parent_res = self._build_chunk_impl(
+                    doc=doc, doc_map=doc_map, idx=parent, rec=True
+                )
                 return (
-                    self._build_chunk_impl(
-                        doc=doc, doc_map=doc_map, idx=parent, rec=True
-                    )
-                    + text_entries
+                    parent_res[0] + text_entries,  # expanded text
+                    parent_res[1],  # heading
                 )
             else:
                 # if root, augment with title (if available and different)
-                return text_entries
+                if (
+                    self.heading_as_metadata
+                    and isinstance(item, BaseText)
+                    and _HC._norm(item.obj_type) == _HC._NodeType.SUBTITLE_LEVEL_1
+                ):
+                    return [], text_entries[0].text
+                else:
+                    return text_entries, None
         else:
-            return []
+            return [], None

     def _build_chunk(
         self,
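The recursion now returns a (texts, heading) pair: every level prepends its ancestors' text and passes the ancestor heading through unchanged, and only the root of the ancestor chain, when it is a level-1 subtitle and heading_as_metadata is set, is diverted into the heading slot instead of the text. A stripped-down, hypothetical illustration of that propagation pattern (not the library code; names and structures are invented):

```python
from typing import Optional


def build(node: dict, as_metadata: bool = True) -> tuple[list[str], Optional[str]]:
    """Collect ancestor texts and propagate the root heading, if any."""
    texts = [node["text"]]
    parent = node.get("parent")
    if parent is not None:
        parent_texts, heading = build(parent, as_metadata)
        return parent_texts + texts, heading  # pass the heading through
    if as_metadata and node.get("is_subtitle"):
        return [], node["text"]  # root subtitle becomes metadata
    return texts, None  # root plain text stays in the body


doc = {"text": "Body paragraph",
       "parent": {"text": "Some subtitle", "is_subtitle": True}}
print(build(doc))                     # (['Body paragraph'], 'Some subtitle')
print(build(doc, as_metadata=False))  # (['Some subtitle', 'Body paragraph'], None)
```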
@@ -272,7 +281,9 @@ def _build_chunk(
         delim: str,
         rec: bool = False,
     ) -> Optional[Chunk]:
-        texts = self._build_chunk_impl(doc=doc, doc_map=doc_map, idx=idx, rec=rec)
+        res = self._build_chunk_impl(doc=doc, doc_map=doc_map, idx=idx, rec=rec)
+        texts = res[0]
+        heading = res[1]
         concat = delim.join([t.text for t in texts if t.text])
         assert doc.main_text is not None
         if len(concat) >= self.min_chunk_len:
@@ -295,6 +306,7 @@ def _build_chunk(
                     path=path,
                     page=item.prov[0].page if item.prov else None,
                     bbox=item.prov[0].bbox if item.prov else None,
+                    heading=heading,
                 )
             else:
                 return Chunk(
@@ -315,6 +327,11 @@ def chunk(self, dl_doc: DLDocument, delim="\n", **kwargs: Any) -> Iterator[Chunk]:
         Yields:
             Iterator[Chunk]: iterator over extracted chunks
         """
+        if (not self.include_metadata) and self.heading_as_metadata:
+            raise RuntimeError(
+                "To enable `heading_as_metadata`, also `include_metadata` must be True."
+            )
+
         if dl_doc.main_text:
             # extract doc structure incl. metadata for
             # each item (e.g. parent, children)
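Because the heading can only be carried by ChunkWithMetadata, enabling heading_as_metadata without include_metadata is rejected at chunking time. A sketch of the expected behavior (imports and fixture path assumed as in the earlier sketch):

```python
import pytest

from docling_core.transforms.chunker import HierarchicalChunker
from docling_core.types import Document as DLDocument

with open("test/data/chunker/0_inp_dl_doc.json") as f:
    dl_doc = DLDocument.model_validate_json(f.read())

# Invalid combination: the heading needs the metadata-carrying chunk model.
chunker = HierarchicalChunker(include_metadata=False, heading_as_metadata=True)

with pytest.raises(RuntimeError):
    list(chunker.chunk(dl_doc=dl_doc))
```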
51 changes: 51 additions & 0 deletions test/data/chunker/0_out_chunks_with_meta_incl_heading.json
@@ -0,0 +1,51 @@
{
  "root": [
    {
      "path": "$.main-text[0]",
      "text": "This paragraph is marginally long enough for getting accepted as a chunk.",
      "page": 1,
      "bbox": [
        0.0,
        1.0,
        2.0,
        3.0
      ]
    },
    {
      "path": "$.main-text[4]",
      "text": "This one should also include the subtitle above since it is long enough.",
      "page": 3,
      "bbox": [
        5.0,
        6.0,
        7.0,
        8.0
      ],
      "heading": "Some subtitle"
    },
    {
      "path": "$.tables[0]",
      "text": "Atomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany",
      "page": 4,
      "bbox": [
        8.0,
        9.0,
        10.0,
        11.0
      ],
      "heading": "Acquisitions"
    },
    {
      "path": "$.main-text[8]",
      "text": "This paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.",
      "page": 4,
      "bbox": [
        8.0,
        9.0,
        10.0,
        11.0
      ],
      "heading": "Acquisitions"
    }
  ]
}
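Each entry in this fixture is a ChunkWithMetadata serialized with exclude_none=True, which is why the first chunk (no enclosing subtitle) has no heading key at all. A single entry can be validated back into the model; a sketch, assuming bbox round-trips from the plain 4-number list shown in the fixture:

```python
from docling_core.transforms.chunker.base import ChunkWithMetadata

entry = {
    "path": "$.tables[0]",
    "text": "Atomic Vision, Business = Website design. ...",  # shortened
    "page": 4,
    "bbox": [8.0, 9.0, 10.0, 11.0],
    "heading": "Acquisitions",
}

chunk = ChunkWithMetadata.model_validate(entry)
print(chunk.heading)  # Acquisitions
```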
20 changes: 16 additions & 4 deletions test/test_hierarchical_chunker.py
@@ -21,13 +21,25 @@ def test_chunk_without_metadata():
     assert exp_data == act_data


-def test_chunk_with_metadata():
+def test_chunk_with_metadata_heading_in_text():
     with open("test/data/chunker/0_inp_dl_doc.json") as f:
         data_json = f.read()
     dl_doc = DLDocument.model_validate_json(data_json)
-    chunker = HierarchicalChunker(include_metadata=True)
+    chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=False)
     chunks = chunker.chunk(dl_doc=dl_doc)
-    act_data = dict(root=[n.model_dump() for n in chunks])
-    with open("test/data/chunker/0_out_chunks_with_meta.json") as f:
+    act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
+    with open("test/data/chunker/0_out_chunks_with_meta_heading_in_text.json") as f:
         exp_data = json.load(fp=f)
     assert exp_data == act_data
+
+
+def test_chunk_with_metadata_incl_heading():
+    with open("test/data/chunker/0_inp_dl_doc.json") as f:
+        data_json = f.read()
+    dl_doc = DLDocument.model_validate_json(data_json)
+    chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=True)
+    chunks = chunker.chunk(dl_doc=dl_doc)
+    act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
+    with open("test/data/chunker/0_out_chunks_with_meta_incl_heading.json") as f:
+        exp_data = json.load(fp=f)
+    assert exp_data == act_data