Skip to content

Commit

Permalink
Fix file_uuid attribute for AIPs
Browse files Browse the repository at this point in the history
A few cases were missing handling for the new `file-` prefix.
Also increments version to 0.3.11.
  • Loading branch information
cole authored Jul 15, 2019
1 parent ed7293b commit db761ef
Show file tree
Hide file tree
Showing 5 changed files with 50 additions and 5 deletions.
2 changes: 1 addition & 1 deletion metsrw/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@

LOGGER = logging.getLogger(__name__)
LOGGER.addHandler(logging.NullHandler())
__version__ = "0.3.10"
__version__ = "0.3.11"

__all__ = [
"Agent",
Expand Down
4 changes: 3 additions & 1 deletion metsrw/fsentry.py
Original file line number Diff line number Diff line change
Expand Up @@ -215,7 +215,9 @@ def file_id(self):
if self.is_aip:
if self._fileid:
return self._fileid
return os.path.splitext(os.path.basename(self.path))[0]
return (
utils.FILE_ID_PREFIX + os.path.splitext(os.path.basename(self.path))[0]
)
return utils.FILE_ID_PREFIX + self.file_uuid

def group_id(self):
Expand Down
10 changes: 7 additions & 3 deletions metsrw/mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,12 +439,16 @@ def _analyze_fptr(fptr_elem, tree, entry_type):
checksum = file_elem.get("CHECKSUM")
checksumtype = file_elem.get("CHECKSUMTYPE")
file_id_prefix = utils.FILE_ID_PREFIX
# If the file is an AIP, then its prefix is not "file-" but the
# name of the AIP. Therefore we need to get the extension-less
# If the file is an AIP, then its prefix is the name of the AIP,
# plus `file-` on 1.10+. Therefore we need to get the extension-less
# basename of the AIP's path and remove its UUID suffix to get
# the prefix to remove from the FILEID attribute value.
if entry_type.lower() == "archival information package":
file_id_prefix = os.path.splitext(os.path.basename(path))[0][:-36]
aip_name = os.path.splitext(os.path.basename(path))[0][:-36]
if file_id.startswith(file_id_prefix):
file_id_prefix = file_id_prefix + aip_name
else:
file_id_prefix = aip_name
# If the file is part of a directory (with no intermediate item), then
# its prefix *may not* be "file-" but the name of the file. This
# pattern is found in old Archivematica METS files, e.g. see
Expand Down
10 changes: 10 additions & 0 deletions tests/test_fsentry.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,16 @@ def test_file_id_success(self):
f = metsrw.FSEntry("level1.txt", file_uuid=file_uuid)
assert f.file_id() == "file-" + file_uuid

def test_aip_file_id(self):
# Expect the file ID of an AIP entry to be the "file-" prefix followed by
# the extension-less basename of its path (here the ".7z" is dropped).
fsentry = metsrw.FSEntry(
file_uuid="9b9f129c-8062-471b-a009-9ee0ad655f08",
type="Archival Information Package",
path="/tmp/example-1-9b9f129c-8062-471b-a009-9ee0ad655f08.7z",
)
assert (
fsentry.file_id() == "file-example-1-9b9f129c-8062-471b-a009-9ee0ad655f08"
)

def test_group_id_no_uuid(self):
""" It should return no group ID. """
f = metsrw.FSEntry("level1.txt")
Expand Down
29 changes: 29 additions & 0 deletions tests/test_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,35 @@ def test_analyze_fptr_from_aip(self):
assert fptr.file_uuid == "7327b00f-d83a-4ae8-bb89-84fce994e827"
assert fptr.use == "Archival Information Package"

def test_analyze_fptr_sets_uuid_from_aip_with_file_id_prefix(self):
"""
Test that AIP FILEIDs with a leading `file-` are parsed properly.
"""
# Minimal METS document: one AIP file whose ID and matching fptr FILEID
# both carry the "file-" prefix ahead of the AIP name and UUID.
tree = etree.fromstring(
b"""<?xml version='1.0' encoding='utf-8'?>
<mets:mets xmlns:mets="http://www.loc.gov/METS/" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version111/mets.xsd">
<mets:fileSec>
<mets:fileGrp USE="Archival Information Package">
<mets:file ID="file-example-1-9b9f129c-8062-471b-a009-9ee0ad655f08">
<mets:FLocat OTHERLOCTYPE="SYSTEM" LOCTYPE="OTHER" xlink:href="/tmp/example-1-9b9f129c-8062-471b-a009-9ee0ad655f08.7z"/>
</mets:file>
</mets:fileGrp>
</mets:fileSec>
<mets:structMap TYPE="physical">
<mets:div ADMID="amdSec_1" TYPE="Archival Information Package">
<mets:fptr FILEID="file-example-1-9b9f129c-8062-471b-a009-9ee0ad655f08"/>
</mets:div>
</mets:structMap>
</mets:mets>
"""
)
# Pick the fptr element from the structMap and analyze it as an AIP entry.
fptr_elem = tree.find(".//mets:fptr[1]", namespaces=metsrw.utils.NAMESPACES)
fptr = metsrw.METSDocument()._analyze_fptr(
fptr_elem, tree, "Archival Information Package"
)

# Both the "file-" prefix and the AIP name ("example-1-") must be removed
# from the FILEID, leaving only the trailing UUID.
assert fptr.file_uuid == "9b9f129c-8062-471b-a009-9ee0ad655f08"

def test_duplicate_ids(self):
"""
We don't want duplicate ids to be generated, but if specified, they shouldn't break
Expand Down

0 comments on commit db761ef

Please sign in to comment.