Skip to content

Commit

Permalink
feat: set DoclingDocument version as SemanticVersion with default
Browse files Browse the repository at this point in the history
Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
  • Loading branch information
ceberam committed Sep 27, 2024
1 parent adc16f3 commit 34ce64b
Show file tree
Hide file tree
Showing 8 changed files with 82 additions and 648 deletions.
13 changes: 12 additions & 1 deletion docling_core/types/experimental/document.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Models for the Docling Document data type."""

import hashlib
import importlib
import mimetypes
import typing
from typing import Any, Dict, List, Optional, Tuple, Union
Expand All @@ -15,6 +16,7 @@
field_validator,
model_validator,
)
from pydantic_extra_types.semantic_version import SemanticVersion
from tabulate import tabulate

from docling_core.types.doc.tokens import DocumentToken
Expand Down Expand Up @@ -619,7 +621,7 @@ class DescriptionItem(BaseModel):
class DoclingDocument(BaseModel):
"""DoclingDocument."""

version: str = "0.1.0" # TODO use SemanticVersion type instead
version: Optional[SemanticVersion] = Field(default=None, validate_default=True)
description: DescriptionItem
name: str # The working name of this document, without extensions
# (could be taken from originating doc, or just "Untitled 1")
Expand All @@ -642,6 +644,15 @@ class DoclingDocument(BaseModel):

pages: Dict[int, PageItem] = {} # empty as default

@field_validator("version")
@classmethod
def check_version_omitted(cls, v: Any) -> Any:
    """Set the version field to this library version by default.

    Args:
        v: The value of the ``version`` field, or ``None`` when the field
            was omitted.

    Returns:
        ``v`` unchanged when it is not ``None``; otherwise the version string
        of the installed ``docling-core`` distribution.

    Raises:
        importlib.metadata.PackageNotFoundError: If the ``docling-core``
            distribution metadata cannot be found.
    """
    # A bare `import importlib` does not guarantee that the `metadata`
    # submodule is loaded, so import it explicitly before using it.
    import importlib.metadata

    if v is not None:
        return v
    # NOTE(review): with the default validator mode this string is presumably
    # stored as-is rather than parsed into a SemanticVersion -- confirm.
    return importlib.metadata.version("docling-core")

def _compute_hash(self, obj):
hash_object = hashlib.sha256(obj.encode("utf-8"))
# Convert the hash to an integer
Expand Down
6 changes: 1 addition & 5 deletions docs/Document.json
Original file line number Diff line number Diff line change
Expand Up @@ -323,11 +323,7 @@
"type": "string"
},
"bounding_box": {
"allOf": [
{
"$ref": "#/$defs/BoundingBoxContainer"
}
],
"$ref": "#/$defs/BoundingBoxContainer",
"x-es-suppress": true
},
"prov": {
Expand Down
2 changes: 1 addition & 1 deletion docs/Document.md
Original file line number Diff line number Diff line change
Expand Up @@ -6052,7 +6052,7 @@ Must be one of:
| **Type** | `object` |
| **Required** | Yes |
| **Additional properties** | [[Any type: allowed]](# "Additional Properties of any type are allowed.") |
| **Defined in** | |
| **Defined in** | #/$defs/BoundingBoxContainer |

**Description:** Bounding box container.

Expand Down
6 changes: 1 addition & 5 deletions docs/Generic.json
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,7 @@
"x-es-type": "text"
},
"file-info": {
"allOf": [
{
"$ref": "#/$defs/FileInfoObject"
}
],
"$ref": "#/$defs/FileInfoObject",
"description": "Minimal identification information of the document within a collection.",
"title": "Document information"
}
Expand Down
2 changes: 1 addition & 1 deletion docs/Generic.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@
| **Type** | `object` |
| **Required** | Yes |
| **Additional properties** | [[Any type: allowed]](# "Additional Properties of any type are allowed.") |
| **Defined in** | |
| **Defined in** | #/$defs/FileInfoObject |

**Description:** Minimal identification information of the document within a collection.

Expand Down
665 changes: 31 additions & 634 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ jsonref = "^1.1.0"
json-schema-for-humans = "^1.0.0"
tabulate = "^0.9.0"
pandas = "^2.2.2"
pydantic-extra-types = "^2.9.0"
semver = "^3.0.2"

[tool.poetry.group.dev.dependencies]
black = "^24.4.2"
Expand All @@ -67,7 +69,6 @@ flake8-docstrings = "^1.6.0"
pep8-naming = "^0.13.2"
jsondiff = "^2.0.0"
types-setuptools = "^70.3.0"
python-semantic-release = "^7.32.2"

[tool.setuptools.packages.find]
where = ["docling_core/resources/schemas"]
Expand Down
33 changes: 33 additions & 0 deletions test/test_docling_doc.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import importlib
from collections import deque

import pytest
import yaml
from pydantic import ValidationError

from docling_core.types.experimental.document import (
BasePictureData,
Expand Down Expand Up @@ -349,3 +351,34 @@ def _construct_doc() -> DoclingDocument:
fig_item = doc.add_picture(data=BasePictureData(), caption=fig_caption)

return doc


def test_version_doc():
    """Check default, explicit and invalid values of ``DoclingDocument.version``."""
    # A bare `import importlib` does not guarantee that the `metadata`
    # submodule is loaded, so import it explicitly before using it.
    import importlib.metadata

    # Default version: an omitted field falls back to the library version.
    version = importlib.metadata.version("docling-core")
    doc = DoclingDocument(description=DescriptionItem(), name="Untitled 1")
    assert doc.version == version

    # Same expectation when the document is validated from the YAML fixture.
    # The encoding is pinned so the fixture is read identically on every platform.
    with open("test/data/experimental/dummy_doc.yaml", "r", encoding="utf-8") as fp:
        dict_from_yaml = yaml.safe_load(fp)
    doc = DoclingDocument.model_validate(dict_from_yaml)
    assert doc.version == version

    # Custom version at construction: every SemVer component is exposed.
    doc = DoclingDocument(
        description=DescriptionItem(),
        name="Untitled 1",
        version="2.1.0-post.8+96354bda",
    )
    assert doc.version.major == 2
    assert doc.version.minor == 1
    assert doc.version.patch == 0
    assert doc.version.prerelease == "post.8"
    assert doc.version.build == "96354bda"
    # Dumping the model turns the version back into its string form.
    dumped = doc.model_dump()
    assert dumped["version"] == "2.1.0-post.8+96354bda"

    # Invalid version: a non-SemVer string is rejected at validation time.
    with pytest.raises(ValidationError, match="SemVer"):
        DoclingDocument(description=DescriptionItem(), name="Untitled 1", version="abc")

0 comments on commit 34ce64b

Please sign in to comment.