Skip to content

Commit

Permalink
Don't generate duplicated ids
Browse files Browse the repository at this point in the history
When some ids, but not all, are autogenerated, duplicates can easily
occur. Changes metsrw to take in to account when manual ids of the same
format are specified.

Also changes id_string into a regular attribute and increments version.
  • Loading branch information
cole authored and sevein committed Apr 17, 2019
1 parent 27c59fa commit d95939c
Show file tree
Hide file tree
Showing 7 changed files with 111 additions and 49 deletions.
4 changes: 2 additions & 2 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,9 @@
# built documents.
#
# The short X.Y version.
version = "0.3.7"
version = "0.3.8"
# The full version, including alpha/beta/rc tags.
release = "0.3.7"
release = "0.3.8"

# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
Expand Down
2 changes: 1 addition & 1 deletion metsrw/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@

LOGGER = logging.getLogger(__name__)
LOGGER.addHandler(logging.NullHandler())
__version__ = "0.3.7"
__version__ = "0.3.8"

__all__ = [
"Agent",
Expand Down
4 changes: 2 additions & 2 deletions metsrw/fsentry.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,12 +222,12 @@ def group_id(self):
@property
def admids(self):
""" Returns a list of ADMIDs for this entry. """
return [a.id_string() for a in self.amdsecs]
return [a.id_string for a in self.amdsecs]

@property
def dmdids(self):
""" Returns a list of DMDIDs for this entry. """
return [d.id_string() for d in self.dmdsecs]
return [d.id_string for d in self.dmdsecs]

@property
def children(self):
Expand Down
81 changes: 54 additions & 27 deletions metsrw/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
"""
from __future__ import absolute_import

import collections
import copy
import logging
from lxml import etree
Expand All @@ -16,17 +15,35 @@


LOGGER = logging.getLogger(__name__)
_id_counter = collections.Counter()


def _generate_id(prefix):
class IdGenerator(six.Iterator):
"""Helper class to generate unique, sequential ids.
"""
Generate a counter based id for the prefix given.
"""
count = _id_counter[prefix]
_id_counter.update([prefix])

return "{}_{}".format(prefix, count)
def __init__(self, prefix):
self.counter = 0
self.prefix = prefix

def __next__(self):
self.counter += 1
return u"{}_{}".format(self.prefix, self.counter)

def clear(self):
self.counter = 0

def register_id(self, id_string):
"""Register a manually assigned id as used, to avoid collisions.
"""
try:
prefix, count = id_string.rsplit("_", 1)
count = int(count)
except ValueError:
# We don't need to worry about ids that don't match our pattern
pass
else:
if prefix == self.prefix:
self.counter = max(count, self.counter)


class AMDSec(object):
Expand All @@ -43,26 +60,29 @@ class AMDSec(object):
"""

tag = "amdSec"
_id_generator = IdGenerator(tag)

def __init__(self, section_id=None, subsections=None, tree=None):
if subsections is None:
subsections = []
self.subsections = subsections
self._id = section_id
self._tree = tree
if tree is not None and not section_id:
raise ValueError("If tree is provided, section_id must also be provided")

def id_string(self, force_generate=False):
"""
Returns the ID string for the amdSec.
if section_id is None:
self.id_string = next(self._id_generator)
else:
self._id_generator.register_id(section_id)
self.id_string = section_id

:param bool force_generate: If True, will generate a new ID from 'amdSec' and a random number.
@classmethod
def get_current_id_count(cls):
"""
Returns the current count of AMDSec objects, for id generation
purposes.
"""
# e.g., amdSec_1
if force_generate or not self._id:
self._id = _generate_id(self.tag)
return self._id
return cls._id_generator.counter

@classmethod
def parse(cls, root):
Expand Down Expand Up @@ -91,7 +111,7 @@ def serialize(self, now=None):
"""
if self._tree is not None:
return self._tree
el = etree.Element(utils.lxmlns("mets") + self.tag, ID=self.id_string())
el = etree.Element(utils.lxmlns("mets") + self.tag, ID=self.id_string)
self.subsections.sort()
for child in self.subsections:
el.append(child.serialize(now))
Expand Down Expand Up @@ -266,6 +286,10 @@ class SubSection(object):
"""

ALLOWED_SUBSECTIONS = ("techMD", "rightsMD", "sourceMD", "digiprovMD", "dmdSec")
_id_generators = {
subsection_type: IdGenerator(subsection_type)
for subsection_type in ALLOWED_SUBSECTIONS
}

def __init__(self, subsection, contents, section_id=None):
if subsection not in self.ALLOWED_SUBSECTIONS:
Expand All @@ -274,28 +298,31 @@ def __init__(self, subsection, contents, section_id=None):
)
self.subsection = subsection
self.contents = contents
self._id = section_id
self.status = None
self.older = None
self.newer = None
self.created = None

if section_id is None:
self.id_string = next(self._id_generators[self.subsection])
else:
self.id_string = section_id
self._id_generators[self.subsection].register_id(section_id)

def __lt__(self, other):
# Sort based on the subsection's order in ALLOWED_SUBSECTIONS
# techMDs < rightsMD < sourceMD < digiprovMD < dmdSec
return self.ALLOWED_SUBSECTIONS.index(
self.subsection
) < self.ALLOWED_SUBSECTIONS.index(other.subsection)

def id_string(self, force_generate=False):
@classmethod
def get_current_id_count(cls, subsection_type):
"""
Returns the ID string for this SubSection.
:param bool force_generate: If True, will generate a new ID from the subsection tag and a random number.
Returns the current count of SubSection objects of the type provided,
for id generation purposes.
"""
if force_generate or not self._id:
self._id = _generate_id(self.subsection)
return self._id
return cls._id_generators[subsection_type].counter

def get_status(self):
"""
Expand Down Expand Up @@ -380,7 +407,7 @@ def serialize(self, now=None):
:return: dmdSec/techMD/rightsMD/sourceMD/digiprovMD Element with all children
"""
created = self.created if self.created is not None else now
el = etree.Element(utils.lxmlns("mets") + self.subsection, ID=self.id_string())
el = etree.Element(utils.lxmlns("mets") + self.subsection, ID=self.id_string)
if created: # Don't add CREATED if none was parsed
el.set("CREATED", created)
status = self.get_status()
Expand Down
4 changes: 2 additions & 2 deletions metsrw/mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -219,8 +219,8 @@ def _collect_mdsec_elements(files):
for a in f.amdsecs:
amdsecs.append(a)

dmdsecs.sort(key=lambda x: x.id_string())
amdsecs.sort(key=lambda x: x.id_string())
dmdsecs.sort(key=lambda x: x.id_string)
amdsecs.sort(key=lambda x: x.id_string)
return dmdsecs + amdsecs

def _structmap(self):
Expand Down
48 changes: 36 additions & 12 deletions tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,16 +104,16 @@ class TestAMDSec(TestCase):

def setUp(self):
# Reset the id generation counter per test
metsrw.metadata._id_counter.clear()
metsrw.metadata.AMDSec._id_generator.clear()

def test_identifier(self):
amdsec_ids = [metsrw.AMDSec().id_string() for _ in range(10)]
amdsec_ids = [metsrw.AMDSec().id_string for _ in range(10)]
# Generate a SubSection in between to make sure our count
# doesn't jump
metsrw.SubSection("techMD", []).id_string()
amdsec_ids.append(metsrw.AMDSec().id_string())
metsrw.SubSection("techMD", []).id_string
amdsec_ids.append(metsrw.AMDSec().id_string)

for index, amdsec_id in enumerate(amdsec_ids):
for index, amdsec_id in enumerate(amdsec_ids, 1):
assert amdsec_id == "amdSec_{}".format(index)

def test_tree_no_id(self):
Expand All @@ -126,24 +126,37 @@ def test_tree_overwrites_serialize(self):
amdsec = metsrw.AMDSec(tree=elem, section_id="id1")
assert amdsec.serialize() == elem

def test_generate_id_skips_existing_id(self):
element = etree.Element("amdSec")
amdsec1 = metsrw.AMDSec(tree=element, section_id="amdSec_1")
amdsec2 = metsrw.AMDSec()
amdsec3 = metsrw.AMDSec()

assert amdsec1.id_string != amdsec2.id_string
assert amdsec1.id_string != amdsec3.id_string
assert amdsec2.id_string != amdsec3.id_string

assert metsrw.AMDSec.get_current_id_count() == 3


class TestSubSection(TestCase):
""" Test SubSection class. """

STUB_MDWRAP = metsrw.MDWrap("<foo/>", "PREMIS:DUMMY")

def setUp(self):
# Reset the id generation counter per test
metsrw.metadata._id_counter.clear()
# Reset the id generation counters per test
for counter in metsrw.metadata.SubSection._id_generators.values():
counter.clear()

def test_identifier(self):
tech_md_ids = [metsrw.SubSection("techMD", []).id_string() for _ in range(10)]
dmdsec_ids = [metsrw.SubSection("dmdSec", []).id_string() for _ in range(10)]
tech_md_ids = [metsrw.SubSection("techMD", []).id_string for _ in range(10)]
dmdsec_ids = [metsrw.SubSection("dmdSec", []).id_string for _ in range(10)]

for index, tech_md_id in enumerate(tech_md_ids):
for index, tech_md_id in enumerate(tech_md_ids, 1):
assert tech_md_id == "techMD_{}".format(index)

for index, dmdsec_id in enumerate(dmdsec_ids):
for index, dmdsec_id in enumerate(dmdsec_ids, 1):
assert dmdsec_id == "dmdSec_{}".format(index)

def test_allowed_tags(self):
Expand Down Expand Up @@ -282,7 +295,7 @@ def test_parse(self):
assert obj.subsection == "techMD"
assert obj.status == "original"
assert obj.created == "2016-01-02T03:04:05"
assert obj.id_string() == "techMD_42"
assert obj.id_string == "techMD_42"
assert obj.newer is obj.older is None
assert isinstance(obj.contents, metsrw.MDRef)

Expand Down Expand Up @@ -340,6 +353,17 @@ def test_roundtrip_bare(self):
assert new.attrib["ID"] == "techMD_42"
assert len(new.attrib) == 1

def test_generate_id_skips_existing_id(self):
techmd1 = metsrw.SubSection("techMD", [], section_id="techMD_1")
techmd2 = metsrw.SubSection("techMD", [])
techmd3 = metsrw.SubSection("techMD", [])

assert techmd1.id_string != techmd2.id_string
assert techmd1.id_string != techmd3.id_string
assert techmd2.id_string != techmd3.id_string

assert metsrw.SubSection.get_current_id_count("techMD") == 3


class TestMDRef(TestCase):
""" Test MDRef class. """
Expand Down
17 changes: 14 additions & 3 deletions tests/test_mets.py
Original file line number Diff line number Diff line change
Expand Up @@ -342,19 +342,30 @@ def test_analyze_fptr_from_aip(self):
assert fptr.file_uuid == "7327b00f-d83a-4ae8-bb89-84fce994e827"
assert fptr.use == "Archival Information Package"

@mock.patch("metsrw.metadata._generate_id", return_value="id_1")
def test_duplicate_ids(self, mock_generate_id):
def test_duplicate_ids(self):
"""
We don't want duplicate ids to be generated, but if specified, they shouldn't break
serialization.
"""
document = metsrw.METSDocument()
fsentry1 = metsrw.FSEntry("file[1].txt", file_uuid=str(uuid.uuid4()))
fsentry1.add_premis_object("<premis>object</premis>")
fsentry1.amdsecs[0].id_string = "id_1"
fsentry2 = metsrw.FSEntry("file[2].txt", file_uuid=str(uuid.uuid4()))
fsentry2.add_premis_object("<premis>object</premis>")
fsentry2.amdsecs[0].id_string = "id_1"

document.append_file(fsentry1)
document.append_file(fsentry2)

reloaded_document = metsrw.METSDocument.fromtree(document.serialize())
# Third time's the charm - previously this failed
metsrw.METSDocument.fromtree(reloaded_document.serialize())
reloaded_document = metsrw.METSDocument.fromtree(reloaded_document.serialize())
amdsecs = reloaded_document.tree.findall("{http://www.loc.gov/METS/}amdSec")

assert len(amdsecs) == 2
assert amdsecs[0].get("ID") == "id_1"
assert amdsecs[1].get("ID") == "id_1"


class TestWholeMETS(TestCase):
Expand Down

0 comments on commit d95939c

Please sign in to comment.