diff --git a/docling_core/types/experimental/document.py b/docling_core/types/experimental/document.py index bd4fd55..d188435 100644 --- a/docling_core/types/experimental/document.py +++ b/docling_core/types/experimental/document.py @@ -211,6 +211,8 @@ class NodeItem(BaseModel): parent: Optional[RefItem] = None children: List[RefItem] = [] + model_config = ConfigDict(extra="forbid") + def get_ref(self): """get_ref.""" return RefItem(cref=self.self_ref) diff --git a/test/data/docling_document/unit/SectionItem.yaml b/test/data/docling_document/unit/SectionItem.yaml index c31847e..7429499 100644 --- a/test/data/docling_document/unit/SectionItem.yaml +++ b/test/data/docling_document/unit/SectionItem.yaml @@ -1,6 +1,6 @@ children: [] label: text -level: 1 +level: 2 orig: whatever parent: null prov: [] diff --git a/test/data/experimental/dummy_doc.yaml b/test/data/experimental/dummy_doc.yaml index 2bd50a0..d72f454 100644 --- a/test/data/experimental/dummy_doc.yaml +++ b/test/data/experimental/dummy_doc.yaml @@ -36,7 +36,6 @@ texts: - orig: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022" text: "arXiv:2206.01062v1 [cs.CV] 2 Jun 2022" self_ref: "#/texts/0" - hash: 132103230 label: "page_header" parent: $ref: "#/furniture" @@ -52,7 +51,6 @@ texts: - orig: "DocLayNet: A Large Human-Annotated Dataset for\nDocument-Layout Analysis" text: "DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis" self_ref: "#/texts/1" - hash: 2349732 # uint64 hash of self_ref label: "title" parent: $ref: "#/body" @@ -68,7 +66,6 @@ texts: - orig: "OPERATION (cont.)" # nested inside the figure text: "OPERATION (cont.)" self_ref: "#/texts/2" - hash: 6978483 label: "section_header" parent: $ref: "/pictures/0" @@ -85,7 +82,6 @@ texts: - orig: "Figure 1: Four examples of complex page layouts across dif-\nferent document categories" # nested inside the figure text: "Figure 1: Four examples of complex page layouts across different document categories" self_ref: "#/texts/3" - hash: 6978483 label: "caption" parent: $ref: "#/body" @@ -103,7 +99,6 @@ texts: tables: # All tables... - self_ref: "#/table/0" - hash: 98574 label: "table" parent: $ref: "#/body" @@ -133,7 +128,6 @@ tables: # All tables... pictures: # All pictures... - self_ref: "#/pictures/0" - hash: 7782482 label: "picture" parent: $ref: "#/body" @@ -168,7 +162,6 @@ key_value_items: [ ] # All KV-items # We should consider this for pages pages: # Optional, for layout documents 1: - hash: 6203680922337857390 size: width: 768.23 height: 583.15 diff --git a/test/test_docling_doc.py b/test/test_docling_doc.py index eea3f1f..73f644d 100644 --- a/test/test_docling_doc.py +++ b/test/test_docling_doc.py @@ -1,32 +1,32 @@ +from collections import deque + import pytest import yaml -from collections import deque - from docling_core.types.experimental.document import ( BasePictureData, BaseTableData, DescriptionItem, - DoclingDocument, - TableCell, - NodeItem, DocItem, - TextItem, + DoclingDocument, FloatingItem, KeyValueItem, - SectionItem, PictureItem, + SectionItem, + TableCell, TableItem, - BasePictureData, - BaseTableData + TextItem, ) from docling_core.types.experimental.labels import DocItemLabel, GroupLabel + def test_docitems(): # Iterative function to find all subclasses def find_all_subclasses_iterative(base_class): - subclasses = deque([base_class]) # Use a deque for efficient popping from the front + subclasses = deque( + [base_class] + ) # Use a deque for efficient popping from the front all_subclasses = [] while subclasses: @@ -40,57 +40,91 @@ def find_all_subclasses_iterative(base_class): def serialise(obj): return yaml.safe_dump(obj.model_dump(mode="json", by_alias=True)) - def write(name:str, serialisation:str): + def write(name: str, serialisation: str): with open(f"./test/data/docling_document/unit/{name}.yaml", "w") as fw: fw.write(serialisation) - def read(name:str): + def read(name: str): with open(f"./test/data/docling_document/unit/{name}.yaml", "r") as fr: gold = fr.read() return gold - def generate(dc, obj): - write(dc.__name__, pred) - def verify(dc, obj): - pred = serialise(obj) - #print(f"\t{dc.__name__}:\n {pred}") + pred = serialise(obj) + # print(f"\t{dc.__name__}:\n {pred}") gold = read(dc.__name__) - assert pred==gold, f"pred!=gold for {dc.__name__}" - + assert pred == gold, f"pred!=gold for {dc.__name__}" + # Iterate over the derived classes of the BaseClass derived_classes = find_all_subclasses_iterative(DocItem) for dc in derived_classes: if dc is TextItem: - obj = dc(text="whatever", orig="whatever", dloc="sdvsd", label=DocItemLabel.TEXT, self_ref="#") + obj = dc( + text="whatever", + orig="whatever", + dloc="sdvsd", + label=DocItemLabel.TEXT, + self_ref="#", + ) verify(dc, obj) - + elif dc is FloatingItem: - obj = dc(text="whatever", orig="whatever", dloc="sdvsd", label=DocItemLabel.TEXT, self_ref="#") + obj = dc( + text="whatever", + orig="whatever", + dloc="sdvsd", + label=DocItemLabel.TEXT, + self_ref="#", + ) verify(dc, obj) - + elif dc is KeyValueItem: - obj = dc(text="whatever", orig="whatever", dloc="sdvsd", label=DocItemLabel.TEXT, self_ref="#") + obj = dc( + text="whatever", + orig="whatever", + dloc="sdvsd", + label=DocItemLabel.TEXT, + self_ref="#", + ) verify(dc, obj) - + elif dc is SectionItem: - obj = dc(text="whatever", orig="whatever", dloc="sdvsd", label=DocItemLabel.TEXT, self_ref="#") + obj = dc( + text="whatever", + orig="whatever", + dloc="sdvsd", + label=DocItemLabel.TEXT, + self_ref="#", + level=2, + ) verify(dc, obj) - + elif dc is PictureItem: - obj = dc(text="whatever", orig="whatever", dloc="sdvsd", label=DocItemLabel.TEXT, self_ref="#", - data=BasePictureData()) + obj = dc( + text="whatever", + orig="whatever", + dloc="sdvsd", + label=DocItemLabel.TEXT, + self_ref="#", + data=BasePictureData(), + ) verify(dc, obj) - + elif dc is TableItem: - obj = dc(text="whatever", orig="whatever", dloc="sdvsd", label=DocItemLabel.TEXT, self_ref="#", - data=BaseTableData(num_rows=3, num_cols=5, cells=[])) + obj = dc( + text="whatever", + orig="whatever", + dloc="sdvsd", + label=DocItemLabel.TEXT, + self_ref="#", + data=BaseTableData(num_rows=3, num_cols=5, cells=[]), + ) verify(dc, obj) - + else: - print(f"{dc.__name__} is not known") + print(f"{dc.__name__} is not known") assert False, "new derived class detected {dc.__name__}: {e}" @@ -315,4 +349,3 @@ def _construct_doc() -> DoclingDocument: fig_item = doc.add_picture(data=BasePictureData(), caption=fig_caption) return doc -