Skip to content

Commit

Permalink
Update FSEntry dmdSecs management
Browse files Browse the repository at this point in the history
- Generate a dmdSecs mapping based on metadata type on parse.
- Add `generate_mdtype_key` util.
- Look at that mapping when adding and deleting dmdSecs to handle
  versioning and deletion based on type.
- Modify status attribute values.
  • Loading branch information
jraddaoui committed Feb 28, 2022
1 parent cf564a5 commit 6093527
Show file tree
Hide file tree
Showing 10 changed files with 245 additions and 25 deletions.
85 changes: 85 additions & 0 deletions fixtures/dmdsecs_mets.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
<?xml version='1.0' encoding='UTF-8'?>
<mets:mets xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:mets="http://www.loc.gov/METS/" xsi:schemaLocation="http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version1121/mets.xsd">
<mets:metsHdr CREATEDATE="2022-02-08T12:48:16"/>
<mets:dmdSec ID="dmdSec_1">
<mets:mdWrap MDTYPE="DC">
<mets:xmlData>
<fake/>
</mets:xmlData>
</mets:mdWrap>
</mets:dmdSec>
<mets:dmdSec ID="dmdSec_2" CREATED="2022-02-08T12:48:16" STATUS="original">
<mets:mdWrap MDTYPE="OTHER" OTHERMDTYPE="CUSTOM">
<mets:xmlData>
<fake/>
</mets:xmlData>
</mets:mdWrap>
</mets:dmdSec>
<mets:dmdSec ID="dmdSec_3" CREATED="2022-02-08T12:48:16" STATUS="original">
<mets:mdWrap MDTYPE="OTHER" OTHERMDTYPE="MD_A">
<mets:xmlData>
<fake/>
</mets:xmlData>
</mets:mdWrap>
</mets:dmdSec>
<mets:dmdSec ID="dmdSec_4" CREATED="2022-02-08T12:48:17" STATUS="original">
<mets:mdWrap MDTYPE="OTHER" OTHERMDTYPE="MD_B">
<mets:xmlData>
<fake/>
</mets:xmlData>
</mets:mdWrap>
</mets:dmdSec>
<mets:dmdSec ID="dmdSec_5" CREATED="2022-02-08T12:48:16" STATUS="original-superseded">
<mets:mdWrap MDTYPE="DC">
<mets:xmlData>
<fake/>
</mets:xmlData>
</mets:mdWrap>
</mets:dmdSec>
<mets:dmdSec ID="dmdSec_6" CREATED="2022-02-08T12:48:16" STATUS="original-superseded">
<mets:mdWrap MDTYPE="OTHER" OTHERMDTYPE="CUSTOM">
<mets:xmlData>
<fake/>
</mets:xmlData>
</mets:mdWrap>
</mets:dmdSec>
<mets:dmdSec ID="dmdSec_7" CREATED="2022-02-08T13:50:23" STATUS="update">
<mets:mdWrap MDTYPE="DC">
<mets:xmlData>
<fake/>
</mets:xmlData>
</mets:mdWrap>
</mets:dmdSec>
<mets:dmdSec ID="dmdSec_8" CREATED="2022-02-08T13:50:23" STATUS="update-superseded">
<mets:mdWrap MDTYPE="OTHER" OTHERMDTYPE="CUSTOM">
<mets:xmlData>
<fake/>
</mets:xmlData>
</mets:mdWrap>
</mets:dmdSec>
<mets:dmdSec ID="dmdSec_9" CREATED="2022-02-08T14:20:34" STATUS="deleted">
<mets:mdWrap MDTYPE="OTHER" OTHERMDTYPE="CUSTOM">
<mets:xmlData>
<fake/>
</mets:xmlData>
</mets:mdWrap>
</mets:dmdSec>
<mets:amdSec ID="amdSec_1"/>
<mets:amdSec ID="amdSec_2"/>
<mets:fileSec>
<mets:fileGrp USE="original">
<mets:file ID="file-7ac3a28c-3bff-4db6-b370-14ed08d911c4" GROUPID="Group-7ac3a28c-3bff-4db6-b370-14ed08d911c4" ADMID="amdSec_2">
<mets:FLocat xlink:href="objects/Landing_zone.jpg" LOCTYPE="OTHER" OTHERLOCTYPE="SYSTEM"/>
</mets:file>
</mets:fileGrp>
</mets:fileSec>
<mets:structMap TYPE="physical" ID="structMap_1" LABEL="Archivematica default">
<mets:div TYPE="Directory" LABEL="small_5-a8d845ec-7501-4847-8737-1ae60ed4c689">
<mets:div TYPE="Directory" LABEL="objects" DMDID="dmdSec_1 dmdSec_2 dmdSec_3 dmdSec_4" ADMID="amdSec_1">
<mets:div TYPE="Item" LABEL="Landing_zone.jpg" DMDID="dmdSec_5 dmdSec_6 dmdSec_7 dmdSec_8 dmdSec_9">
<mets:fptr FILEID="file-7ac3a28c-3bff-4db6-b370-14ed08d911c4"/>
</mets:div>
</mets:div>
</mets:div>
</mets:structMap>
</mets:mets>
2 changes: 2 additions & 0 deletions metsrw/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
GROUP_ID_PREFIX,
urlencode,
urldecode,
generate_mdtype_key,
)
from .validate import (
METS_XSD_PATH,
Expand Down Expand Up @@ -67,6 +68,7 @@
"SubSection",
"__version__",
"feature_broker",
"generate_mdtype_key",
"get_schematron",
"get_xmlschema",
"has_class_methods",
Expand Down
46 changes: 42 additions & 4 deletions metsrw/fsentry.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from itertools import chain
import logging
import os
from uuid import uuid4

from lxml import etree
import six
Expand Down Expand Up @@ -173,6 +174,7 @@ def __init__(
self.checksumtype = checksumtype
self.amdsecs = []
self.dmdsecs = []
self.dmdsecs_by_mdtype = {}

@classmethod
def dir(cls, label, children):
Expand Down Expand Up @@ -298,7 +300,43 @@ def add_rightsmd(self, md, mdtype, mode="mdwrap", **kwargs):
return self._add_metadata_element(md, "rightsMD", mdtype, mode, **kwargs)

def add_dmdsec(self, md, mdtype, mode="mdwrap", **kwargs):
    """Add a dmdSec and supersede earlier dmdSecs of the same type.

    Extension of ``_add_metadata_element`` that adds a dmdSec and updates
    the previous dmdSecs with the same MDTYPE and OTHERMDTYPE attribute
    values, marking them as "superseded" and using the same group_id for
    all of them.

    :param md: metadata payload, handed unchanged to
        ``_add_metadata_element``.
    :param mdtype: MDTYPE value of the new dmdSec.
    :param mode: handed unchanged to ``_add_metadata_element``.
    :param kwargs: forwarded to ``_add_metadata_element``. Two keys are
        also read here: ``status`` (status given to the new dmdSec,
        "original" when missing or empty) and ``othermdtype`` (second
        half of the type key).
    :returns: the object returned by ``_add_metadata_element``.
    """
    dmdsec = self._add_metadata_element(md, "dmdSec", mdtype, mode, **kwargs)
    dmdsec.status = kwargs.get("status") or "original"
    mdtype_key = utils.generate_mdtype_key(mdtype, kwargs.get("othermdtype", ""))
    # Single lookup: fetch (or create) the version list for this type.
    previous_dmdsecs = self.dmdsecs_by_mdtype.setdefault(mdtype_key, [])
    if previous_dmdsecs:
        # Every version of one type shares the group_id of the oldest
        # version; mint one when the oldest does not carry any.
        group_id = previous_dmdsecs[0].group_id or str(uuid4())
        dmdsec.group_id = group_id
        for previous_dmdsec in previous_dmdsecs:
            previous_dmdsec.group_id = group_id
            if not previous_dmdsec.status:
                previous_dmdsec.status = "original"
            # NOTE(review): any status lacking the suffix is extended, so a
            # "deleted" version becomes "deleted-superseded" - confirm that
            # value is intended.
            if not previous_dmdsec.status.endswith("-superseded"):
                previous_dmdsec.status += "-superseded"
    previous_dmdsecs.append(dmdsec)
    return dmdsec

def delete_dmdsec(self, mdtype, othermdtype=""):
    """Flag the most recent dmdSec of the given type as deleted.

    Nothing is removed: the last dmdSec registered under the
    mdtype/othermdtype key only gets its ``status`` set to "deleted".
    When no dmdSec is registered under that key the call does nothing.

    :param mdtype: MDTYPE value to look up.
    :param othermdtype: OTHERMDTYPE value to look up, if any.
    """
    key = utils.generate_mdtype_key(mdtype, othermdtype)
    versions = self.dmdsecs_by_mdtype.get(key)
    if versions is None:
        return
    versions[-1].status = "deleted"

def has_dmdsec(self, mdtype, othermdtype=""):
    """Tell whether a dmdSec of the given type is registered for this entry.

    Only the presence of the mdtype/othermdtype key in
    ``dmdsecs_by_mdtype`` is checked; the status of the stored dmdSecs is
    not inspected.

    :param mdtype: MDTYPE value to look up.
    :param othermdtype: OTHERMDTYPE value to look up, if any.
    :returns: True when the key is present, False otherwise.
    """
    return utils.generate_mdtype_key(mdtype, othermdtype) in self.dmdsecs_by_mdtype

def serialize_md_inst(self, md_inst, md_class):
"""Serialize object ``md_inst`` by transforming it into an
Expand Down Expand Up @@ -344,9 +382,9 @@ def add_premis_rights(self, md, mode="mdwrap"):
mode,
)

def add_dublin_core(self, md, mode="mdwrap", **kwargs):
    """Add ``md`` as a dmdSec of type "DC".

    Thin wrapper around ``add_dmdsec`` that fixes the metadata type.

    :param md: metadata payload, handed unchanged to ``add_dmdsec``.
    :param mode: handed unchanged to ``add_dmdsec``.
    :param kwargs: extra keyword arguments forwarded to ``add_dmdsec``
        (for example ``status``).
    :returns: whatever ``add_dmdsec`` returns.
    """
    # TODO create DC object here
    return self.add_dmdsec(md, "DC", mode, **kwargs)

def add_child(self, child):
"""Add a child FSEntry to this FSEntry.
Expand Down
9 changes: 7 additions & 2 deletions metsrw/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,9 +335,14 @@ def get_status(self):
return self.status
if self.subsection == "dmdSec":
if self.older is None:
return "original"
status = "original"
if self.newer is not None:
status += "-superseded"
else:
return "updated"
status = "update"
if self.newer is not None:
status += "-superseded"
return status
if self.subsection in ("techMD", "rightsMD"):
# TODO how to handle ones where newer has been deleted?
if self.newer is None:
Expand Down
27 changes: 13 additions & 14 deletions metsrw/mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,20 +518,19 @@ def _analyze_fptr(fptr_elem, tree, entry_type):

@staticmethod
def _add_dmdsecs_to_fs_entry(elem, fs_entry, tree):
    """Attach the dmdSecs referenced by ``elem`` to ``fs_entry``.

    Each ID listed in the element's whitespace-separated ``DMDID``
    attribute is looked up in ``tree``, parsed and appended to
    ``fs_entry.dmdsecs``. The list is then ordered by ``created`` and
    every dmdSec is registered in ``fs_entry.dmdsecs_by_mdtype`` under its
    mdtype/othermdtype key.

    :param elem: element carrying the optional ``DMDID`` attribute.
    :param fs_entry: entry whose ``dmdsecs`` and ``dmdsecs_by_mdtype`` are
        filled in place.
    :param tree: tree searched for the ``mets:dmdSec`` elements.
    """
    # A missing DMDID yields "", whose split() is empty: the loop is a no-op.
    for dmdid in elem.get("DMDID", "").split():
        # NOTE(review): find() yields None when no dmdSec carries this ID
        # and that None is handed to SubSection.parse - confirm how parse
        # copes with it.
        dmdsec_elem = tree.find(
            'mets:dmdSec[@ID="' + dmdid + '"]', namespaces=utils.NAMESPACES
        )
        dmdsec = metadata.SubSection.parse(dmdsec_elem)
        fs_entry.dmdsecs.append(dmdsec)
    # Order by creation date and generate mapping by mdtype_othermdtype.
    # list.sort is stable, so dmdSecs with equal ``created`` values keep
    # their DMDID order.
    fs_entry.dmdsecs.sort(key=lambda x: x.created)
    for dmdsec in fs_entry.dmdsecs:
        # Not every contents object is assumed to expose ``othermdtype``,
        # hence the getattr default.
        mdtype_key = utils.generate_mdtype_key(
            dmdsec.contents.mdtype, getattr(dmdsec.contents, "othermdtype", "")
        )
        fs_entry.dmdsecs_by_mdtype.setdefault(mdtype_key, []).append(dmdsec)

@staticmethod
def _add_amdsecs_to_fs_entry(amdids, fs_entry, tree):
Expand Down
8 changes: 8 additions & 0 deletions metsrw/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,11 @@ def urldecode(url):
https://tools.ietf.org/html/rfc3986#section-2.1.
"""
return _urlendecode(url, unquote_plus)


def generate_mdtype_key(mdtype, othermdtype=""):
    """Build the key used in an FSEntry's ``dmdsecs_by_mdtype`` dict.

    :param mdtype: MDTYPE value; returned unchanged when there is no
        ``othermdtype``.
    :param othermdtype: OTHERMDTYPE value; any falsy value ("" or None)
        means "no other type".
    :returns: ``mdtype`` alone, or ``mdtype`` and ``othermdtype`` joined
        by an underscore.
    """
    if not othermdtype:
        return mdtype
    suffix = "_" + othermdtype
    return mdtype + suffix
56 changes: 56 additions & 0 deletions tests/test_fsentry.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,62 @@ def test_add_metadata_to_fsentry(self):

assert len(f1.amdsecs[0].subsections) == 4

def test_dmdsec_management(self):
    """Test addition, check and deletion of dmdSecs."""
    entry = metsrw.FSEntry("file[1].txt", file_uuid=str(uuid.uuid4()))

    # A fresh entry holds no dmdSecs of any type.
    assert entry.dmdsecs == []
    assert entry.dmdsecs_by_mdtype == {}
    assert entry.has_dmdsec("DC") is False
    # Go through the (mdtype, othermdtype) API rather than a pre-joined key
    # so the check does not depend on the key format.
    assert entry.has_dmdsec("OTHER", othermdtype="CUSTOM") is False

    # First DC dmdSec: status defaults to "original".
    entry.add_dmdsec("<dc/>", "DC")
    assert len(entry.dmdsecs) == 1
    assert len(entry.dmdsecs_by_mdtype["DC"]) == 1
    assert entry.has_dmdsec("DC") is True
    assert entry.dmdsecs[0].status == "original"

    # Second DC dmdSec supersedes the first and shares its group_id.
    entry.add_dmdsec("<dc/>", "DC", status="update")
    assert len(entry.dmdsecs) == 2
    assert len(entry.dmdsecs_by_mdtype["DC"]) == 2
    assert entry.dmdsecs[0].status == "original-superseded"
    assert entry.dmdsecs[1].status == "update"
    # Guard against the equality below passing on two empty group_ids.
    assert entry.dmdsecs[0].group_id
    assert entry.dmdsecs[0].group_id == entry.dmdsecs[1].group_id

    # Third DC dmdSec supersedes the second; the group_id is kept.
    entry.add_dmdsec("<dc/>", "DC", status="update")
    assert len(entry.dmdsecs) == 3
    assert len(entry.dmdsecs_by_mdtype["DC"]) == 3
    assert entry.dmdsecs[1].status == "update-superseded"
    assert entry.dmdsecs[2].status == "update"
    assert entry.dmdsecs[0].group_id == entry.dmdsecs[1].group_id
    assert entry.dmdsecs[0].group_id == entry.dmdsecs[2].group_id

    # Deleting only flags the latest DC dmdSec; nothing is removed.
    entry.delete_dmdsec("DC")
    assert len(entry.dmdsecs) == 3
    assert len(entry.dmdsecs_by_mdtype["DC"]) == 3
    assert entry.dmdsecs[2].status == "deleted"

    # Same sequence for an OTHER/CUSTOM type, tracked apart from DC.
    entry.add_dmdsec("<custom/>", "OTHER", othermdtype="CUSTOM")
    assert len(entry.dmdsecs) == 4
    assert len(entry.dmdsecs_by_mdtype["DC"]) == 3
    assert len(entry.dmdsecs_by_mdtype["OTHER_CUSTOM"]) == 1
    assert entry.has_dmdsec("OTHER", othermdtype="CUSTOM") is True
    assert entry.dmdsecs[3].status == "original"

    entry.add_dmdsec("<custom/>", "OTHER", othermdtype="CUSTOM", status="update")
    assert len(entry.dmdsecs) == 5
    assert len(entry.dmdsecs_by_mdtype["DC"]) == 3
    assert len(entry.dmdsecs_by_mdtype["OTHER_CUSTOM"]) == 2
    assert entry.dmdsecs[3].status == "original-superseded"
    assert entry.dmdsecs[4].status == "update"
    assert entry.dmdsecs[3].group_id
    assert entry.dmdsecs[3].group_id == entry.dmdsecs[4].group_id

    entry.add_dmdsec("<custom/>", "OTHER", othermdtype="CUSTOM", status="update")
    assert len(entry.dmdsecs) == 6
    assert len(entry.dmdsecs_by_mdtype["DC"]) == 3
    assert len(entry.dmdsecs_by_mdtype["OTHER_CUSTOM"]) == 3
    assert entry.dmdsecs[4].status == "update-superseded"
    assert entry.dmdsecs[5].status == "update"
    assert entry.dmdsecs[3].group_id == entry.dmdsecs[4].group_id
    assert entry.dmdsecs[3].group_id == entry.dmdsecs[5].group_id

    entry.delete_dmdsec("OTHER", othermdtype="CUSTOM")
    assert len(entry.dmdsecs) == 6
    assert len(entry.dmdsecs_by_mdtype["DC"]) == 3
    assert len(entry.dmdsecs_by_mdtype["OTHER_CUSTOM"]) == 3
    assert entry.dmdsecs[5].status == "deleted"

def test_add_child(self):
"""
It should add a new entry to the children list.
Expand Down
10 changes: 5 additions & 5 deletions tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -226,13 +226,13 @@ def test_replacement_dmdsec(self):
assert dmdsec_old.get_status() == "original"
dmdsec_new = metsrw.SubSection("dmdSec", self.STUB_MDWRAP)
dmdsec_old.replace_with(dmdsec_new)
assert dmdsec_old.get_status() == "original"
assert dmdsec_new.get_status() == "updated"
assert dmdsec_old.get_status() == "original-superseded"
assert dmdsec_new.get_status() == "update"
dmdsec_newer = metsrw.SubSection("dmdSec", self.STUB_MDWRAP)
dmdsec_new.replace_with(dmdsec_newer)
assert dmdsec_old.get_status() == "original"
assert dmdsec_new.get_status() == "updated"
assert dmdsec_newer.get_status() == "updated"
assert dmdsec_old.get_status() == "original-superseded"
assert dmdsec_new.get_status() == "update-superseded"
assert dmdsec_newer.get_status() == "update"

def test_subsection_serialize(self):
content = metsrw.MDWrap("<foo/>", None)
Expand Down
22 changes: 22 additions & 0 deletions tests/test_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,28 @@ def test_parse(self):
is not None
)

def test_parse_dmdsecs(self):
    """It should create FSEntry's ordered dmdsecs and mapping by type."""
    document = metsrw.METSDocument()
    document.tree = etree.parse(
        "fixtures/dmdsecs_mets.xml",
        parser=etree.XMLParser(remove_blank_text=True),
    )
    document._parse_tree()

    # The "objects" directory references dmdSec_1 to dmdSec_4, one per type.
    directory = document.get_file(type="Directory", label="objects")
    assert len(directory.dmdsecs) == 4
    assert directory.dmdsecs[0].id_string == "dmdSec_1"
    assert directory.dmdsecs[-1].id_string == "dmdSec_4"
    for key in ("DC", "OTHER_CUSTOM", "OTHER_MD_A", "OTHER_MD_B"):
        assert len(directory.dmdsecs_by_mdtype[key]) == 1

    # The file references dmdSec_5 to dmdSec_9: two DC and three CUSTOM.
    item = document.get_file(type="Item", label="Landing_zone.jpg")
    assert len(item.dmdsecs) == 5
    assert item.dmdsecs[0].id_string == "dmdSec_5"
    assert item.dmdsecs[-1].id_string == "dmdSec_9"
    assert len(item.dmdsecs_by_mdtype["DC"]) == 2
    assert len(item.dmdsecs_by_mdtype["OTHER_CUSTOM"]) == 3

def test_parse_tree_createdate_too_new(self):
mw = metsrw.METSDocument()
root = etree.parse("fixtures/createdate_too_new.xml")
Expand Down
5 changes: 5 additions & 0 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,8 @@ def test_url_encoding():
for url in BAD_URLS:
with pytest.raises(ValueError):
metsrw.urlencode(url)


def test_generate_mdtype_key():
    """The key is the MDTYPE alone, or MDTYPE_OTHERMDTYPE when both are given."""
    cases = (
        (("MDTYPE",), "MDTYPE"),
        (("MDTYPE", "OTHERMDTYPE"), "MDTYPE_OTHERMDTYPE"),
    )
    for args, expected in cases:
        assert metsrw.generate_mdtype_key(*args) == expected

0 comments on commit 6093527

Please sign in to comment.