From db761efce914888203c1913ee04e068fe9c473e9 Mon Sep 17 00:00:00 2001 From: Cole Maclean Date: Mon, 15 Jul 2019 14:38:09 -0700 Subject: [PATCH] Fix file_uuid attribute for AIPs A few cases were missing handling for the new `file-` prefix. Also increments version to 0.3.11. --- metsrw/__init__.py | 2 +- metsrw/fsentry.py | 4 +++- metsrw/mets.py | 10 +++++++--- tests/test_fsentry.py | 10 ++++++++++ tests/test_mets.py | 29 +++++++++++++++++++++++++++++ 5 files changed, 50 insertions(+), 5 deletions(-) diff --git a/metsrw/__init__.py b/metsrw/__init__.py index 10e03dc..56f0b95 100644 --- a/metsrw/__init__.py +++ b/metsrw/__init__.py @@ -43,7 +43,7 @@ LOGGER = logging.getLogger(__name__) LOGGER.addHandler(logging.NullHandler()) -__version__ = "0.3.10" +__version__ = "0.3.11" __all__ = [ "Agent", diff --git a/metsrw/fsentry.py b/metsrw/fsentry.py index 6761a73..c479c1c 100644 --- a/metsrw/fsentry.py +++ b/metsrw/fsentry.py @@ -215,7 +215,9 @@ def file_id(self): if self.is_aip: if self._fileid: return self._fileid - return os.path.splitext(os.path.basename(self.path))[0] + return ( + utils.FILE_ID_PREFIX + os.path.splitext(os.path.basename(self.path))[0] + ) return utils.FILE_ID_PREFIX + self.file_uuid def group_id(self): diff --git a/metsrw/mets.py b/metsrw/mets.py index 48b6747..56fe481 100755 --- a/metsrw/mets.py +++ b/metsrw/mets.py @@ -439,12 +439,16 @@ def _analyze_fptr(fptr_elem, tree, entry_type): checksum = file_elem.get("CHECKSUM") checksumtype = file_elem.get("CHECKSUMTYPE") file_id_prefix = utils.FILE_ID_PREFIX - # If the file is an AIP, then its prefix is not "file-" but the - # name of the AIP. Therefore we need to get the extension-less + # If the file is an AIP, then its prefix is the name of the AIP, + # plus `file-` on 1.10+. Therefore we need to get the extension-less # basename of the AIP's path and remove its UUID suffix to ge # the prefix to remove from the FILEID attribute value. 
if entry_type.lower() == "archival information package": - file_id_prefix = os.path.splitext(os.path.basename(path))[0][:-36] + aip_name = os.path.splitext(os.path.basename(path))[0][:-36] + if file_id.startswith(file_id_prefix): + file_id_prefix = file_id_prefix + aip_name + else: + file_id_prefix = aip_name # If the file is part of a directory (with no intermediate item), then # its prefix *may not* be "file-" but the name of the file. This # pattern is found in old Archivematica METS files, e.g. see diff --git a/tests/test_fsentry.py b/tests/test_fsentry.py index 0e08d5c..7feb522 100644 --- a/tests/test_fsentry.py +++ b/tests/test_fsentry.py @@ -65,6 +65,16 @@ def test_file_id_success(self): f = metsrw.FSEntry("level1.txt", file_uuid=file_uuid) assert f.file_id() == "file-" + file_uuid + def test_aip_file_id(self): + fsentry = metsrw.FSEntry( + file_uuid="9b9f129c-8062-471b-a009-9ee0ad655f08", + type="Archival Information Package", + path="/tmp/example-1-9b9f129c-8062-471b-a009-9ee0ad655f08.7z", + ) + assert ( + fsentry.file_id() == "file-example-1-9b9f129c-8062-471b-a009-9ee0ad655f08" + ) + def test_group_id_no_uuid(self): """ It should return no group ID. """ f = metsrw.FSEntry("level1.txt") diff --git a/tests/test_mets.py b/tests/test_mets.py index a25e00d..2de8a50 100644 --- a/tests/test_mets.py +++ b/tests/test_mets.py @@ -343,6 +343,35 @@ def test_analyze_fptr_from_aip(self): assert fptr.file_uuid == "7327b00f-d83a-4ae8-bb89-84fce994e827" assert fptr.use == "Archival Information Package" + def test_analyze_fptr_sets_uuid_from_aip_with_file_id_prefix(self): + """ + Test that AIP FILEIDs with a leading `file-` are parsed properly. 
+ """ + tree = etree.fromstring( + b""" + + + + + + + + + + + + + + + """ + ) + fptr_elem = tree.find(".//mets:fptr[1]", namespaces=metsrw.utils.NAMESPACES) + fptr = metsrw.METSDocument()._analyze_fptr( + fptr_elem, tree, "Archival Information Package" + ) + + assert fptr.file_uuid == "9b9f129c-8062-471b-a009-9ee0ad655f08" + def test_duplicate_ids(self): """ We don't want duplicate ids to be generated, but if specified, they shouldn't break