def add_dmdsec(self, md, mdtype, mode="mdwrap", **kwargs):
    """Add a dmdSec and mark earlier dmdSecs of the same type as superseded.

    The section is created through ``_add_metadata_element`` (all ``kwargs``
    are forwarded to it unchanged) and then registered in
    ``self.dmdsecs_by_mdtype`` under the key built from ``mdtype`` and the
    optional ``othermdtype`` keyword.

    When sections with the same key already exist:

    * the new section and every earlier one are given the same ``group_id``:
      the first earlier section's ``group_id``, or a fresh UUID4 string when
      that value is falsy;
    * every earlier section whose status does not already end in
      ``"-superseded"`` gets that suffix appended (an empty status is first
      set to ``"original"``).

    :param md: metadata payload handed to ``_add_metadata_element``.
    :param mdtype: MDTYPE value of the section.
    :param mode: forwarded to ``_add_metadata_element``.
    :param kwargs: forwarded to ``_add_metadata_element``; ``status`` and
        ``othermdtype`` are additionally read here.
    :return: the object returned by ``_add_metadata_element``.
    """
    new_section = self._add_metadata_element(md, "dmdSec", mdtype, mode, **kwargs)
    # NOTE(review): with no explicit ``status`` the new section is "original"
    # even when it supersedes earlier ones; callers presumably pass
    # status="update" for replacements -- confirm.
    new_section.status = kwargs.get("status") or "original"
    key = utils.generate_mdtype_key(mdtype, kwargs.get("othermdtype", ""))

    if key not in self.dmdsecs_by_mdtype:
        # First section of this type: nothing to supersede, no group needed.
        self.dmdsecs_by_mdtype[key] = [new_section]
        return new_section

    earlier_sections = self.dmdsecs_by_mdtype[key]
    shared_group_id = earlier_sections[0].group_id or str(uuid4())
    new_section.group_id = shared_group_id
    for section in earlier_sections:
        section.group_id = shared_group_id
        if not section.status:
            section.status = "original"
        # NOTE(review): a section previously marked "deleted" also receives
        # the suffix and becomes "deleted-superseded" -- confirm intended.
        if not section.status.endswith("-superseded"):
            section.status = section.status + "-superseded"
    earlier_sections.append(new_section)
    return new_section


def delete_dmdsec(self, mdtype, othermdtype=""):
    """Flag the last registered dmdSec of the given type as deleted.

    Nothing is removed from ``self.dmdsecs`` or from the mapping; only the
    ``status`` attribute of the last section stored under the
    ``mdtype``/``othermdtype`` key is set to ``"deleted"``. When no section
    is registered under that key the call does nothing.
    """
    key = utils.generate_mdtype_key(mdtype, othermdtype)
    if key not in self.dmdsecs_by_mdtype:
        return
    newest = self.dmdsecs_by_mdtype[key][-1]
    newest.status = "deleted"


def has_dmdsec(self, mdtype, othermdtype=""):
    """Return True when a dmdSec is registered under mdtype/othermdtype."""
    return utils.generate_mdtype_key(mdtype, othermdtype) in self.dmdsecs_by_mdtype


def add_dublin_core(self, md, mode="mdwrap", **kwargs):
    """Add a dmdSec with MDTYPE "DC"; extra keywords go to ``add_dmdsec``."""
    # TODO create DC object here
    dublin_core_section = self.add_dmdsec(md, "DC", mode, **kwargs)
    return dublin_core_section
@staticmethod
def _add_dmdsecs_to_fs_entry(elem, fs_entry, tree):
    """Parse the dmdSecs referenced by ``elem`` and attach them to ``fs_entry``.

    Every ID listed in the whitespace-separated ``DMDID`` attribute of
    ``elem`` is looked up in ``tree``, parsed with
    ``metadata.SubSection.parse`` and appended to ``fs_entry.dmdsecs``. The
    list is then sorted by each section's ``created`` attribute and every
    section is also filed in ``fs_entry.dmdsecs_by_mdtype`` under its
    mdtype/othermdtype key, preserving that sorted order.

    An element without a ``DMDID`` attribute adds no sections; the sort and
    the mapping pass still run over whatever ``fs_entry.dmdsecs`` holds.
    """
    for section_id in elem.get("DMDID", "").split():
        xpath = 'mets:dmdSec[@ID="' + section_id + '"]'
        # NOTE(review): presumably ``find`` yields None when no dmdSec carries
        # this ID; the result is handed to ``parse`` unchecked -- confirm.
        section_elem = tree.find(xpath, namespaces=utils.NAMESPACES)
        fs_entry.dmdsecs.append(metadata.SubSection.parse(section_elem))

    # Order by creation date, then build the mapping by mdtype_othermdtype so
    # each per-type list is in creation order as well.
    fs_entry.dmdsecs.sort(key=lambda section: section.created)
    for section in fs_entry.dmdsecs:
        other_type = getattr(section.contents, "othermdtype", "")
        key = utils.generate_mdtype_key(section.contents.mdtype, other_type)
        if key not in fs_entry.dmdsecs_by_mdtype:
            fs_entry.dmdsecs_by_mdtype[key] = []
        fs_entry.dmdsecs_by_mdtype[key].append(section)
def generate_mdtype_key(mdtype, othermdtype=""):
    """Build the key used in an FSEntry's ``dmdsecs_by_mdtype`` dict.

    Returns ``mdtype`` unchanged when ``othermdtype`` is empty (or otherwise
    falsy), and ``mdtype + "_" + othermdtype`` when it is set, e.g.
    ``"OTHER"`` and ``"CUSTOM"`` give ``"OTHER_CUSTOM"``.
    """
    if not othermdtype:
        return mdtype
    suffix = "_" + othermdtype
    return mdtype + suffix
"deleted" + file.add_dmdsec("", "OTHER", othermdtype="CUSTOM") + assert len(file.dmdsecs) == 4 + assert len(file.dmdsecs_by_mdtype["DC"]) == 3 + assert len(file.dmdsecs_by_mdtype["OTHER_CUSTOM"]) == 1 + assert file.has_dmdsec("OTHER", othermdtype="CUSTOM") is True + assert file.dmdsecs[3].status == "original" + file.add_dmdsec("", "OTHER", othermdtype="CUSTOM", status="update") + assert len(file.dmdsecs) == 5 + assert len(file.dmdsecs_by_mdtype["DC"]) == 3 + assert len(file.dmdsecs_by_mdtype["OTHER_CUSTOM"]) == 2 + assert file.dmdsecs[3].status == "original-superseded" + assert file.dmdsecs[4].status == "update" + assert file.dmdsecs[3].group_id == file.dmdsecs[4].group_id + file.add_dmdsec("", "OTHER", othermdtype="CUSTOM", status="update") + assert len(file.dmdsecs) == 6 + assert len(file.dmdsecs_by_mdtype["DC"]) == 3 + assert len(file.dmdsecs_by_mdtype["OTHER_CUSTOM"]) == 3 + assert file.dmdsecs[4].status == "update-superseded" + assert file.dmdsecs[5].status == "update" + assert file.dmdsecs[3].group_id == file.dmdsecs[4].group_id + assert file.dmdsecs[3].group_id == file.dmdsecs[5].group_id + file.delete_dmdsec("OTHER", othermdtype="CUSTOM") + assert len(file.dmdsecs) == 6 + assert len(file.dmdsecs_by_mdtype["DC"]) == 3 + assert len(file.dmdsecs_by_mdtype["OTHER_CUSTOM"]) == 3 + assert file.dmdsecs[5].status == "deleted" + def test_add_child(self): """ It should add a new entry to the children list. 
def test_parse_dmdsecs(self):
    """It should create FSEntry's ordered dmdsecs and mapping by type."""
    # Expected IDs and counts below mirror fixtures/dmdsecs_mets.xml.
    mets_doc = metsrw.METSDocument()
    xml_parser = etree.XMLParser(remove_blank_text=True)
    mets_doc.tree = etree.parse("fixtures/dmdsecs_mets.xml", parser=xml_parser)
    mets_doc._parse_tree()

    # Directory entry: four dmdSecs, one per metadata type.
    objects_dir = mets_doc.get_file(type="Directory", label="objects")
    assert len(objects_dir.dmdsecs) == 4
    assert objects_dir.dmdsecs[0].id_string == "dmdSec_1"
    assert objects_dir.dmdsecs[-1].id_string == "dmdSec_4"
    for mdtype_key in ("DC", "OTHER_CUSTOM", "OTHER_MD_A", "OTHER_MD_B"):
        assert len(objects_dir.dmdsecs_by_mdtype[mdtype_key]) == 1

    # File entry: five dmdSecs spread over two metadata types.
    image_file = mets_doc.get_file(type="Item", label="Landing_zone.jpg")
    assert len(image_file.dmdsecs) == 5
    assert image_file.dmdsecs[0].id_string == "dmdSec_5"
    assert image_file.dmdsecs[-1].id_string == "dmdSec_9"
    for mdtype_key, expected_count in (("DC", 2), ("OTHER_CUSTOM", 3)):
        assert len(image_file.dmdsecs_by_mdtype[mdtype_key]) == expected_count