feat: support heading as chunk metadata
Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
vagenas committed Sep 30, 2024
1 parent 17af1fc commit bcd2f8c
Showing 5 changed files with 99 additions and 18 deletions.
5 changes: 3 additions & 2 deletions docling_core/transforms/chunker/base.py
@@ -22,8 +22,9 @@ class Chunk(BaseModel):
 class ChunkWithMetadata(Chunk):
     """Data model for Chunk including metadata."""
 
-    page: Optional[int]
-    bbox: Optional[BoundingBox]
+    page: Optional[int] = None
+    bbox: Optional[BoundingBox] = None
+    heading: Optional[str] = None
 
 
 class BaseChunker(BaseModel, ABC):
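For context, a minimal sketch of how the updated ChunkWithMetadata model can be used after this change (Python; the import path follows the file shown above, and the sample values are taken from the test fixture further down). Since all three metadata fields now default to None, a chunk without positional info or heading validates without extra arguments:

from docling_core.transforms.chunker.base import ChunkWithMetadata

# `page`, `bbox`, and `heading` are all optional and default to None.
chunk = ChunkWithMetadata(
    path="$.main-text[4]",
    text="This one should also include the subtitle above since it is long enough.",
    page=3,
    heading="Some subtitle",
)

# Dumping with exclude_none=True drops unset metadata, matching the test fixtures below.
print(chunk.model_dump(exclude_none=True))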
41 changes: 29 additions & 12 deletions docling_core/transforms/chunker/hierarchical_chunker.py
Expand Up @@ -26,6 +26,7 @@ class HierarchicalChunker(BaseChunker):
"""Chunker implementation leveraging the document layout."""

include_metadata: bool = True
heading_as_metadata: bool = False
min_chunk_len: PositiveInt = 64

class _NodeType(str, Enum):
@@ -184,7 +185,7 @@ class _TextEntry(BaseModel):
 
     def _build_chunk_impl(
         self, doc: DLDocument, doc_map: _DocContext, idx: int, rec: bool = False
-    ) -> list[_TextEntry]:
+    ) -> tuple[list[_TextEntry], Optional[str]]:
         if doc.main_text:
             item = doc.main_text[idx]
             item_type = _HC._norm(item.obj_type)
@@ -193,7 +194,7 @@ def _build_chunk_impl(
                 item_type not in self._allowed_types
                 or item_name in self._disallowed_names_by_type.get(item_type, [])
             ):
-                return []
+                return [], None
 
             c2p = doc_map.dmap
 
@@ -219,7 +220,7 @@ def _build_chunk_impl(
                         else []
                     )
                 else:
-                    return []
+                    return [], None
             elif isinstance(item, BaseText):
                 text_entries = [
                     self._TextEntry(
@@ -248,21 +249,29 @@ def _build_chunk_impl(
                     _HC._NodeName.LIST_ITEM,
                     _HC._NodeName.SUBTITLE_LEVEL_1,
                 ]:
-                    return []
+                    return [], None
 
             if (parent := c2p[idx].parent) is not None:
                 # prepend with ancestors
+
+                parent_res = self._build_chunk_impl(
+                    doc=doc, doc_map=doc_map, idx=parent, rec=True
+                )
                 return (
-                    self._build_chunk_impl(
-                        doc=doc, doc_map=doc_map, idx=parent, rec=True
-                    )
-                    + text_entries
+                    parent_res[0] + text_entries,  # expanded text
+                    parent_res[1],  # heading
                 )
             else:
                 # if root, augment with title (if available and different)
-                return text_entries
+                if (
+                    self.heading_as_metadata
+                    and isinstance(item, BaseText)
+                    and _HC._norm(item.obj_type) == _HC._NodeType.SUBTITLE_LEVEL_1
+                ):
+                    return [], text_entries[0].text
+                else:
+                    return text_entries, None
         else:
-            return []
+            return [], None
 
     def _build_chunk(
         self,
@@ -272,7 +281,9 @@ def _build_chunk(
         delim: str,
         rec: bool = False,
     ) -> Optional[Chunk]:
-        texts = self._build_chunk_impl(doc=doc, doc_map=doc_map, idx=idx, rec=rec)
+        res = self._build_chunk_impl(doc=doc, doc_map=doc_map, idx=idx, rec=rec)
+        texts = res[0]
+        heading = res[1]
         concat = delim.join([t.text for t in texts if t.text])
         assert doc.main_text is not None
         if len(concat) >= self.min_chunk_len:
@@ -295,6 +306,7 @@ def _build_chunk(
                     path=path,
                     page=item.prov[0].page if item.prov else None,
                     bbox=item.prov[0].bbox if item.prov else None,
+                    heading=heading,
                 )
             else:
                 return Chunk(
@@ -315,6 +327,11 @@ def chunk(self, dl_doc: DLDocument, delim="\n", **kwargs: Any) -> Iterator[Chunk
         Yields:
             Iterator[Chunk]: iterator over extracted chunks
         """
+        if (not self.include_metadata) and self.heading_as_metadata:
+            raise RuntimeError(
+                "To enable `heading_as_metadata`, also `include_metadata` must be True."
+            )
+
         if dl_doc.main_text:
             # extract doc structure incl. metadata for
             # each item (e.g. parent, children)
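A hedged end-to-end sketch of the new option (Python; the HierarchicalChunker import follows the file path shown above, while the DLDocument import path is an assumption mirroring the tests below). With heading_as_metadata=True, the subtitle under which a chunk appears is emitted in the chunk's heading field rather than being prepended to its text, and combining it with include_metadata=False makes chunk() raise a RuntimeError:

import json

from docling_core.transforms.chunker.hierarchical_chunker import HierarchicalChunker
from docling_core.types import Document as DLDocument  # assumed alias, as used in the tests

with open("test/data/chunker/0_inp_dl_doc.json") as f:
    dl_doc = DLDocument.model_validate_json(f.read())

chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=True)
for chunk in chunker.chunk(dl_doc=dl_doc):
    # `heading` is only present for chunks that sit under a subtitle; it is None otherwise.
    print(json.dumps(chunk.model_dump(exclude_none=True), indent=2))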
51 changes: 51 additions & 0 deletions test/data/chunker/0_out_chunks_with_meta_incl_heading.json
@@ -0,0 +1,51 @@
{
"root": [
{
"path": "$.main-text[0]",
"text": "This paragraph is marginally long enough for getting accepted as a chunk.",
"page": 1,
"bbox": [
0.0,
1.0,
2.0,
3.0
]
},
{
"path": "$.main-text[4]",
"text": "This one should also include the subtitle above since it is long enough.",
"page": 3,
"bbox": [
5.0,
6.0,
7.0,
8.0
],
"heading": "Some subtitle"
},
{
"path": "$.tables[0]",
"text": "Atomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany",
"page": 4,
"bbox": [
8.0,
9.0,
10.0,
11.0
],
"heading": "Acquisitions"
},
{
"path": "$.main-text[8]",
"text": "This paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.",
"page": 4,
"bbox": [
8.0,
9.0,
10.0,
11.0
],
"heading": "Acquisitions"
}
]
}
20 changes: 16 additions & 4 deletions test/test_hierarchical_chunker.py
@@ -21,13 +21,25 @@ def test_chunk_without_metadata():
     assert exp_data == act_data
 
 
-def test_chunk_with_metadata():
+def test_chunk_with_metadata_heading_in_text():
     with open("test/data/chunker/0_inp_dl_doc.json") as f:
         data_json = f.read()
     dl_doc = DLDocument.model_validate_json(data_json)
-    chunker = HierarchicalChunker(include_metadata=True)
+    chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=False)
     chunks = chunker.chunk(dl_doc=dl_doc)
-    act_data = dict(root=[n.model_dump() for n in chunks])
-    with open("test/data/chunker/0_out_chunks_with_meta.json") as f:
+    act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
+    with open("test/data/chunker/0_out_chunks_with_meta_heading_in_text.json") as f:
         exp_data = json.load(fp=f)
     assert exp_data == act_data
+
+
+def test_chunk_with_metadata_incl_heading():
+    with open("test/data/chunker/0_inp_dl_doc.json") as f:
+        data_json = f.read()
+    dl_doc = DLDocument.model_validate_json(data_json)
+    chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=True)
+    chunks = chunker.chunk(dl_doc=dl_doc)
+    act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks])
+    with open("test/data/chunker/0_out_chunks_with_meta_incl_heading.json") as f:
+        exp_data = json.load(fp=f)
+    assert exp_data == act_data
