From bcd2f8ce00b7103ccb0e58f64b8f2042330438b1 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Mon, 30 Sep 2024 09:48:35 +0200 Subject: [PATCH] feat: support heading as chunk metadata Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docling_core/transforms/chunker/base.py | 5 +- .../chunker/hierarchical_chunker.py | 41 ++++++++++----- ...out_chunks_with_meta_heading_in_text.json} | 0 .../0_out_chunks_with_meta_incl_heading.json | 51 +++++++++++++++++++ test/test_hierarchical_chunker.py | 20 ++++++-- 5 files changed, 99 insertions(+), 18 deletions(-) rename test/data/chunker/{0_out_chunks_with_meta.json => 0_out_chunks_with_meta_heading_in_text.json} (100%) create mode 100644 test/data/chunker/0_out_chunks_with_meta_incl_heading.json diff --git a/docling_core/transforms/chunker/base.py b/docling_core/transforms/chunker/base.py index 9324f83..17650e7 100644 --- a/docling_core/transforms/chunker/base.py +++ b/docling_core/transforms/chunker/base.py @@ -22,8 +22,9 @@ class Chunk(BaseModel): class ChunkWithMetadata(Chunk): """Data model for Chunk including metadata.""" - page: Optional[int] - bbox: Optional[BoundingBox] + page: Optional[int] = None + bbox: Optional[BoundingBox] = None + heading: Optional[str] = None class BaseChunker(BaseModel, ABC): diff --git a/docling_core/transforms/chunker/hierarchical_chunker.py b/docling_core/transforms/chunker/hierarchical_chunker.py index 7b36a17..5b1831d 100644 --- a/docling_core/transforms/chunker/hierarchical_chunker.py +++ b/docling_core/transforms/chunker/hierarchical_chunker.py @@ -26,6 +26,7 @@ class HierarchicalChunker(BaseChunker): """Chunker implementation leveraging the document layout.""" include_metadata: bool = True + heading_as_metadata: bool = False min_chunk_len: PositiveInt = 64 class _NodeType(str, Enum): @@ -184,7 +185,7 @@ class _TextEntry(BaseModel): def _build_chunk_impl( self, doc: DLDocument, doc_map: _DocContext, idx: int, rec: bool = False - ) -> list[_TextEntry]: + ) -> tuple[list[_TextEntry], Optional[str]]: if doc.main_text: item = doc.main_text[idx] item_type = _HC._norm(item.obj_type) @@ -193,7 +194,7 @@ def _build_chunk_impl( item_type not in self._allowed_types or item_name in self._disallowed_names_by_type.get(item_type, []) ): - return [] + return [], None c2p = doc_map.dmap @@ -219,7 +220,7 @@ def _build_chunk_impl( else [] ) else: - return [] + return [], None elif isinstance(item, BaseText): text_entries = [ self._TextEntry( @@ -248,21 +249,29 @@ def _build_chunk_impl( _HC._NodeName.LIST_ITEM, _HC._NodeName.SUBTITLE_LEVEL_1, ]: - return [] + return [], None if (parent := c2p[idx].parent) is not None: # prepend with ancestors + + parent_res = self._build_chunk_impl( + doc=doc, doc_map=doc_map, idx=parent, rec=True + ) return ( - self._build_chunk_impl( - doc=doc, doc_map=doc_map, idx=parent, rec=True - ) - + text_entries + parent_res[0] + text_entries, # expanded text + parent_res[1], # heading ) else: - # if root, augment with title (if available and different) - return text_entries + if ( + self.heading_as_metadata + and isinstance(item, BaseText) + and _HC._norm(item.obj_type) == _HC._NodeType.SUBTITLE_LEVEL_1 + ): + return [], text_entries[0].text + else: + return text_entries, None else: - return [] + return [], None def _build_chunk( self, @@ -272,7 +281,9 @@ def _build_chunk( delim: str, rec: bool = False, ) -> Optional[Chunk]: - texts = self._build_chunk_impl(doc=doc, doc_map=doc_map, idx=idx, rec=rec) + res = self._build_chunk_impl(doc=doc, doc_map=doc_map, idx=idx, rec=rec) + texts = res[0] + heading = res[1] concat = delim.join([t.text for t in texts if t.text]) assert doc.main_text is not None if len(concat) >= self.min_chunk_len: @@ -295,6 +306,7 @@ def _build_chunk( path=path, page=item.prov[0].page if item.prov else None, bbox=item.prov[0].bbox if item.prov else None, + heading=heading, ) else: return Chunk( @@ -315,6 +327,11 @@ def chunk(self, dl_doc: DLDocument, delim="\n", **kwargs: Any) -> Iterator[Chunk Yields: Iterator[Chunk]: iterator over extracted chunks """ + if (not self.include_metadata) and self.heading_as_metadata: + raise RuntimeError( + "To enable `heading_as_metadata`, also `include_metadata` must be True." + ) + if dl_doc.main_text: # extract doc structure incl. metadata for # each item (e.g. parent, children) diff --git a/test/data/chunker/0_out_chunks_with_meta.json b/test/data/chunker/0_out_chunks_with_meta_heading_in_text.json similarity index 100% rename from test/data/chunker/0_out_chunks_with_meta.json rename to test/data/chunker/0_out_chunks_with_meta_heading_in_text.json diff --git a/test/data/chunker/0_out_chunks_with_meta_incl_heading.json b/test/data/chunker/0_out_chunks_with_meta_incl_heading.json new file mode 100644 index 0000000..6c44109 --- /dev/null +++ b/test/data/chunker/0_out_chunks_with_meta_incl_heading.json @@ -0,0 +1,51 @@ +{ + "root": [ + { + "path": "$.main-text[0]", + "text": "This paragraph is marginally long enough for getting accepted as a chunk.", + "page": 1, + "bbox": [ + 0.0, + 1.0, + 2.0, + 3.0 + ] + }, + { + "path": "$.main-text[4]", + "text": "This one should also include the subtitle above since it is long enough.", + "page": 3, + "bbox": [ + 5.0, + 6.0, + 7.0, + 8.0 + ], + "heading": "Some subtitle" + }, + { + "path": "$.tables[0]", + "text": "Atomic Vision, Business = Website design. Atomic Vision, Country = United States. Delix Computer GmbH, Business = Computers and software. Delix Computer GmbH, Country = Germany", + "page": 4, + "bbox": [ + 8.0, + 9.0, + 10.0, + 11.0 + ], + "heading": "Acquisitions" + }, + { + "path": "$.main-text[8]", + "text": "This paragraph is right before the list.\nSome first bullet content here.\nAnd then some second bullet content here.", + "page": 4, + "bbox": [ + 8.0, + 9.0, + 10.0, + 11.0 + ], + "heading": "Acquisitions" + } + ] +} diff --git a/test/test_hierarchical_chunker.py b/test/test_hierarchical_chunker.py index b83a5d6..00ef9d5 100644 --- a/test/test_hierarchical_chunker.py +++ b/test/test_hierarchical_chunker.py @@ -21,13 +21,25 @@ def test_chunk_without_metadata(): assert exp_data == act_data -def test_chunk_with_metadata(): +def test_chunk_with_metadata_heading_in_text(): with open("test/data/chunker/0_inp_dl_doc.json") as f: data_json = f.read() dl_doc = DLDocument.model_validate_json(data_json) - chunker = HierarchicalChunker(include_metadata=True) + chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=False) chunks = chunker.chunk(dl_doc=dl_doc) - act_data = dict(root=[n.model_dump() for n in chunks]) - with open("test/data/chunker/0_out_chunks_with_meta.json") as f: + act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks]) + with open("test/data/chunker/0_out_chunks_with_meta_heading_in_text.json") as f: + exp_data = json.load(fp=f) + assert exp_data == act_data + + +def test_chunk_with_metadata_incl_heading(): + with open("test/data/chunker/0_inp_dl_doc.json") as f: + data_json = f.read() + dl_doc = DLDocument.model_validate_json(data_json) + chunker = HierarchicalChunker(include_metadata=True, heading_as_metadata=True) + chunks = chunker.chunk(dl_doc=dl_doc) + act_data = dict(root=[n.model_dump(exclude_none=True) for n in chunks]) + with open("test/data/chunker/0_out_chunks_with_meta_incl_heading.json") as f: exp_data = json.load(fp=f) assert exp_data == act_data