Skip to content

Commit

Permalink
Support METS of DSpace transfer
Browse files Browse the repository at this point in the history
  • Loading branch information
replaceafill authored Jul 17, 2023
1 parent 25e9783 commit e96f0ff
Show file tree
Hide file tree
Showing 6 changed files with 3,662 additions and 11 deletions.
3,468 changes: 3,468 additions & 0 deletions fixtures/mets_with_dmdsecs_in_filesec.xml

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion metsrw/fsentry.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,9 @@ def _add_metadata_element(self, md, subsection, mdtype, mode="mdwrap", **kwargs)
loctype = kwargs.get("loctype")
label = kwargs.get("label")
otherloctype = kwargs.get("otherloctype")
mdsec = MDRef(md, mdtype, loctype, label, otherloctype)
xptr = kwargs.get("xptr")
othermdtype = kwargs.get("othermdtype")
mdsec = MDRef(md, mdtype, loctype, label, otherloctype, xptr, othermdtype)
subsection = SubSection(subsection, mdsec)
if subsection.subsection == "dmdSec":
self.dmdsecs.append(subsection)
Expand Down Expand Up @@ -425,6 +427,8 @@ def serialize_filesec(self):
el.attrib["GROUPID"] = self.group_id()
if self.admids:
el.set("ADMID", " ".join(self.admids))
if self.dmdids and self.use == "original":
el.set("DMDID", " ".join(self.dmdids))
if self.checksum and self.checksumtype:
el.attrib["CHECKSUM"] = self.checksum
el.attrib["CHECKSUMTYPE"] = self.checksumtype
Expand Down
23 changes: 20 additions & 3 deletions metsrw/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,7 +433,16 @@ class MDRef:

VALID_LOCTYPE = ("ARK", "URN", "URL", "PURL", "HANDLE", "DOI", "OTHER")

def __init__(self, target, mdtype, loctype, label=None, otherloctype=None):
def __init__(
self,
target,
mdtype,
loctype,
label=None,
otherloctype=None,
xptr=None,
othermdtype=None,
):
self.target = target
self.mdtype = mdtype
self.loctype = loctype
Expand All @@ -443,6 +452,8 @@ def __init__(self, target, mdtype, loctype, label=None, otherloctype=None):
)
self.label = label
self.otherloctype = otherloctype
self.xptr = xptr
self.othermdtype = othermdtype

@classmethod
def parse(cls, root):
Expand Down Expand Up @@ -475,8 +486,10 @@ def parse(cls, root):
# Optional attributes
label = root.get("LABEL")
otherloctype = root.get("OTHERLOCTYPE")
xptr = root.get("XPTR")
othermdtype = root.get("OTHERMDTYPE")

return cls(target, mdtype, loctype, label, otherloctype)
return cls(target, mdtype, loctype, label, otherloctype, xptr, othermdtype)

def serialize(self):
# If the source document is a METS document, the XPTR attribute of
Expand All @@ -491,7 +504,9 @@ def serialize(self):
]
XPTR = "xpointer(id('{}'))".format(" ".join(dmdsecs))
except Exception:
pass
# Otherwise use the Xpointer passed to the constructor.
if self.xptr is not None:
XPTR = self.xptr

el = etree.Element(utils.lxmlns("mets") + "mdRef")
if self.label:
Expand All @@ -510,6 +525,8 @@ def serialize(self):
el.attrib["OTHERLOCTYPE"] = self.otherloctype
if XPTR:
el.attrib["XPTR"] = XPTR
if self.othermdtype:
el.attrib["OTHERMDTYPE"] = self.othermdtype
return el


Expand Down
46 changes: 39 additions & 7 deletions metsrw/mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
AIP_ENTRY_TYPE = "archival information package"
FPtr = namedtuple(
"FPtr",
"file_uuid derived_from use path amdids checksum checksumtype fileid transform_files",
"file_uuid derived_from use path amdids dmdids checksum checksumtype fileid transform_files",
)
TRANSFORM_PREFIX = "TRANSFORM"
TRANSFORM_PREFIX_LEN = len(TRANSFORM_PREFIX)
Expand Down Expand Up @@ -300,17 +300,41 @@ def _filesec(self, files=None):
# Get fileGrp, or create if not exist
filegrp = filegrps.get(file_.use)
if filegrp is None:
filegrp = etree.SubElement(
filesec, utils.lxmlns("mets") + "fileGrp", USE=file_.use
)
filegrp = etree.Element(utils.lxmlns("mets") + "fileGrp", USE=file_.use)
filegrps[file_.use] = filegrp

file_el = file_.serialize_filesec()
if file_el is not None:
filegrp.append(file_el)
for filegrp in self._sort_filegrps(filegrps):
filesec.append(filegrp)

return filesec

def _sort_filegrps(self, filegrps):
    """Return the fileGrp elements ordered by their USE.

    Groups whose USE appears in the known list below come first, in the
    order of that list. Groups with any other USE follow, keeping the
    order in which they appear in ``filegrps``.

    :param filegrps: Mapping of USE string to its fileGrp element.
    :return: List of the mapping's values in serialization order.
    """
    uses_order = [
        "original",
        "submissionDocumentation",
        "preservation",
        "service",
        "access",
        "license",
        "text/ocr",
        "metadata",
        "derivative",
    ]
    rank_by_use = {use: rank for rank, use in enumerate(uses_order)}
    # Unknown uses are ranked past the end of the *known* list. Offsetting
    # by ``len(filegrps)`` instead lets an unknown use collide with, or
    # sort before, a known one whenever fewer groups than known uses exist.
    first_unknown_rank = len(uses_order)
    ranked = [
        (rank_by_use.get(use, first_unknown_rank + position), filegrp)
        for position, (use, filegrp) in enumerate(filegrps.items())
    ]
    # Sort on the rank only so the elements themselves are never compared.
    ranked.sort(key=lambda pair: pair[0])
    return [filegrp for _, filegrp in ranked]

def serialize(self, fully_qualified=True, normative_structmap=True):
"""
Returns this document serialized to an xml Element.
Expand Down Expand Up @@ -401,6 +425,7 @@ def _parse_tree_structmap(self, tree, parent_elem, normative_parent_elem=None):
fs_entry = fsentry.FSEntry.from_fptr(
label=None, type_="Item", fptr=fptr
)
self._add_dmdsecs_to_fs_entry(elem, fs_entry, fptr.dmdids)
self._add_amdsecs_to_fs_entry(fptr.amdids, fs_entry, tree)
siblings.append(fs_entry)
continue
Expand All @@ -409,7 +434,7 @@ def _parse_tree_structmap(self, tree, parent_elem, normative_parent_elem=None):
continue
fptr = self._analyze_fptr(fptr_elems[0], tree, entry_type)
fs_entry = fsentry.FSEntry.from_fptr(label, entry_type, fptr)
self._add_dmdsecs_to_fs_entry(elem, fs_entry, tree)
self._add_dmdsecs_to_fs_entry(elem, fs_entry, tree, fptr.dmdids)
self._add_amdsecs_to_fs_entry(fptr.amdids, fs_entry, tree)
siblings.append(fs_entry)
return siblings
Expand Down Expand Up @@ -466,6 +491,7 @@ def _analyze_fptr(fptr_elem, tree, entry_type):
" URL.".format(path)
)
amdids = file_elem.get("ADMID")
dmdids = file_elem.get("DMDID")
checksum = file_elem.get("CHECKSUM")
checksumtype = file_elem.get("CHECKSUMTYPE")
file_id_prefix = utils.FILE_ID_PREFIX
Expand Down Expand Up @@ -510,15 +536,21 @@ def _analyze_fptr(fptr_elem, tree, entry_type):
use,
path,
amdids,
dmdids,
checksum,
checksumtype,
file_id,
transform_files,
)

@staticmethod
def _add_dmdsecs_to_fs_entry(elem, fs_entry, tree):
for dmdid in elem.get("DMDID", "").split():
def _add_dmdsecs_to_fs_entry(elem, fs_entry, tree, dmdids=None):
dmdids_to_add = elem.get("DMDID", "").split()
if dmdids is not None:
dmdids_to_add.extend(
[dmdid for dmdid in dmdids.split() if dmdid not in dmdids_to_add]
)
for dmdid in dmdids_to_add:
dmdsec_elem = tree.find(
'mets:dmdSec[@ID="' + dmdid + '"]', namespaces=utils.NAMESPACES
)
Expand Down
4 changes: 4 additions & 0 deletions tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,13 +398,17 @@ def test_create_extra_params(self):
label="Label",
loctype="OTHER",
otherloctype="OUTSIDE",
xptr="xpointer(id('dmdSec_366 dmdSec_367'))",
othermdtype="METSRIGHTS",
)
mdreffed = mdref.serialize()

assert mdreffed.get("LOCTYPE") == "OTHER"
assert mdreffed.get("OTHERLOCTYPE") == "OUTSIDE"
assert mdreffed.get(metsrw.lxmlns("xlink") + "href") == "path/to/file.txt"
assert mdreffed.get("MDTYPE") == "OTHER"
assert mdreffed.get("XPTR") == "xpointer(id('dmdSec_366 dmdSec_367'))"
assert mdreffed.get("OTHERMDTYPE") == "METSRIGHTS"

def test_create_bad_loctype(self):
metsrw.MDRef(None, None, loctype="ARK")
Expand Down
126 changes: 126 additions & 0 deletions tests/test_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -350,6 +350,31 @@ def test_analyze_fptr(self):
use="original",
path="objects/AM68.csv",
amdids="amdSec_3",
dmdids=None,
checksum=None,
checksumtype=None,
transform_files=[],
)

def test_analyze_fptr_with_dmdsecs_in_filesec(self):
parser = etree.XMLParser(remove_blank_text=True)
tree = etree.parse("fixtures/mets_with_dmdsecs_in_filesec.xml", parser=parser)
fptr_elem = tree.find(
'/mets:structMap[@TYPE="physical"]//mets:div[@LABEL="bitstream_8266.pdf"]/mets:fptr',
namespaces=metsrw.utils.NAMESPACES,
)

# Test the integrity of the ``FPtr`` object returned.
mw = metsrw.METSDocument()
fptr = mw._analyze_fptr(fptr_elem, tree, "Item")
assert fptr == metsrw.mets.FPtr(
fileid="file-33f5f35a-8bde-4b94-b7cd-3d2c8b8f7a23",
file_uuid="33f5f35a-8bde-4b94-b7cd-3d2c8b8f7a23",
derived_from=None,
use="original",
path="objects/ITEM_2429-2700.zip-2023-07-07T23_05_58.201656_00_00/bitstream_8266.pdf",
amdids="amdSec_3",
dmdids="dmdSec_3 dmdSec_4",
checksum=None,
checksumtype=None,
transform_files=[],
Expand Down Expand Up @@ -1001,6 +1026,107 @@ def test_serialize_normative_structmap(self):
tree = mw.serialize(normative_structmap=False)
assert tree.find(xpath, namespaces=metsrw.NAMESPACES) is None

def test_dspace_mets_dmdsecs(self):
    """Check the dmdSecs parsed for the original object of a DSpace transfer."""
    document = metsrw.METSDocument.fromfile(
        "fixtures/mets_with_dmdsecs_in_filesec.xml"
    )
    entry = document.get_file(
        type="Item",
        path="objects/ITEM_2429-2700.zip-2023-07-07T23_05_58.201656_00_00/bitstream_8266.pdf",
    )

    # Two dmdSecs are expected on the original object.
    assert len(entry.dmdsecs) == 2

    # The dmdSec holding an mdRef carries Xpointers to the descriptive
    # metadata in the original mets.xml files exported from DSpace.
    mdref_sections = [
        section
        for section in entry.dmdsecs
        if isinstance(section.contents, metsrw.MDRef)
    ]
    mdref_dmdsec_el = mdref_sections[0].serialize()
    assert mdref_dmdsec_el.attrib.get("STATUS") == "original"
    assert mdref_dmdsec_el.attrib.get("CREATED") == "2023-07-07T23:06:15"

    mdref_el = mdref_dmdsec_el.find(
        "mets:mdRef",
        namespaces=metsrw.utils.NAMESPACES,
    )
    assert mdref_el is not None
    expected_mdref_attrs = (
        ("LABEL", "mets.xml-Group-33f5f35a-8bde-4b94-b7cd-3d2c8b8f7a23"),
        ("MDTYPE", "OTHER"),
        ("LOCTYPE", "OTHER"),
        ("OTHERLOCTYPE", "SYSTEM"),
        ("XPTR", "xpointer(id('dmdSec_366 dmdSec_367'))"),
    )
    for attr_name, attr_value in expected_mdref_attrs:
        assert mdref_el.attrib.get(attr_name) == attr_value

    # The dmdSec holding an mdWrap records the parent-child relationship
    # between the DSpace object and its collection, identified by handles.
    mdwrap_sections = [
        section
        for section in entry.dmdsecs
        if isinstance(section.contents, metsrw.MDWrap)
    ]
    dc_dmdsec_el = mdwrap_sections[0].serialize()
    assert dc_dmdsec_el.attrib.get("STATUS") == "original"
    assert dc_dmdsec_el.attrib.get("CREATED") == "2023-07-07T23:06:15"

    identifier_el = dc_dmdsec_el.find(
        'mets:mdWrap[@MDTYPE="DC"]/mets:xmlData/dcterms:dublincore/dc:identifier',
        namespaces=metsrw.utils.NAMESPACES,
    )
    assert identifier_el is not None
    assert identifier_el.text == "hdl:2429/2700"

    is_part_of_el = dc_dmdsec_el.find(
        'mets:mdWrap[@MDTYPE="DC"]/mets:xmlData/dcterms:dublincore/dcterms:isPartOf',
        namespaces=metsrw.utils.NAMESPACES,
    )
    assert is_part_of_el is not None
    assert is_part_of_el.text == "hdl:2429/1314"

def test_dspace_mets_amdsec(self):
    """Check the rights mdRef parsed from the amdSec of a DSpace original."""
    document = metsrw.METSDocument.fromfile(
        "fixtures/mets_with_dmdsecs_in_filesec.xml"
    )
    entry = document.get_file(
        type="Item",
        path="objects/ITEM_2429-2700.zip-2023-07-07T23_05_58.201656_00_00/bitstream_8266.pdf",
    )

    # The first amdSec of the original object should hold an mdRef whose
    # Xpointers target the rights metadata in the original mets.xml files
    # exported from DSpace.
    mdrefs = [
        subsection.contents
        for subsection in entry.amdsecs[0].subsections
        if isinstance(subsection.contents, metsrw.MDRef)
    ]
    rights_el = mdrefs[0].serialize()

    expected_attrs = (
        ("LABEL", "mets.xml-988d7030-3cde-43f2-ac1f-2b8cf9d5a70b"),
        ("MDTYPE", "OTHER"),
        ("OTHERMDTYPE", "METSRIGHTS"),
        ("LOCTYPE", "OTHER"),
        ("OTHERLOCTYPE", "SYSTEM"),
        (
            "XPTR",
            "xpointer(id('rightsMD_371 rightsMD_374 rightsMD_384 rightsMD_393 rightsMD_401 rightsMD_409 rightsMD_417 rightsMD_425'))",
        ),
    )
    for attr_name, attr_value in expected_attrs:
        assert rights_el.attrib.get(attr_name) == attr_value

def test_dspace_filegrp_sorting_in_filesec(self):
    """Check the order of fileGrp USE values in the generated fileSec."""
    document = metsrw.METSDocument.fromfile(
        "fixtures/mets_with_dmdsecs_in_filesec.xml"
    )
    expected_uses = [
        "original",
        "submissionDocumentation",
        "preservation",
        "license",
        "text/ocr",
    ]

    actual_uses = []
    for group in document._filesec():
        actual_uses.append(group.attrib["USE"])

    assert actual_uses == expected_uses


@pytest.mark.parametrize(
"mets_path, expected_counts",
Expand Down

0 comments on commit e96f0ff

Please sign in to comment.