Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make code for storing large annotation elements clearer #961

Merged
merged 1 commit into from
Sep 13, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
# Some annotation elements can be very large. If they pass a size threshold,
# store part of them in an associated file. This is slower, so don't do it for
# small ones.
# Only chunks of at most this many entries are checked for large elements.
MAX_ELEMENT_CHECK = 100
# An element with more points or values than this is treated as large.
MAX_ELEMENT_DOCUMENT = 10000
# An element with a 'user' key whose pickled size in bytes exceeds this is
# treated as large.
MAX_ELEMENT_USER_DOCUMENT = 1000000

Expand Down Expand Up @@ -501,6 +502,22 @@ def _boundingBox(self, element):
# simplify to points
return bbox

def _entryIsLarge(self, entry):
    """
    Return True if an entry is large enough that it might not fit in a
    mongo document.

    An entry is treated as large when its element's ``points`` list (or,
    when there is no ``points`` key, its ``values`` list) has more than
    MAX_ELEMENT_DOCUMENT items, or when the element has a ``user`` key and
    the pickled element is longer than MAX_ELEMENT_USER_DOCUMENT bytes.

    :param entry: the entry to check.  It must contain an ``element``
        dictionary.
    :returns: True if the entry is large.
    """
    element = entry['element']
    # 'points' takes precedence; 'values' is only consulted when the
    # element has no 'points' key at all.
    coordinates = element.get('points', element.get('values', []))
    if len(coordinates) > MAX_ELEMENT_DOCUMENT:
        return True
    # Only elements with a 'user' key are measured by pickled size; the
    # whole element is pickled, not just the user data.
    if ('user' in element and
            len(pickle.dumps(element, protocol=4)) > MAX_ELEMENT_USER_DOCUMENT):
        return True
    return False

def saveElementAsFile(self, annotation, entries):
"""
If an element has a large points or values array, save that array to an
Expand All @@ -510,28 +527,33 @@ def saveElementAsFile(self, annotation, entries):
:param entries: the database entries document. Modified.
"""
item = Item().load(annotation['itemId'], force=True)
element = entries[0]['element'].copy()
entries[0]['element'] = element
key = 'points' if 'points' in element else 'values'
# Use the highest protocol supported by all python versions we support
data = pickle.dumps(element.pop(key), protocol=4)
elementFile = Upload().uploadFromFile(
io.BytesIO(data), size=len(data), name='_annotationElementData',
parentType='item', parent=item, user=None,
mimeType='application/json', attachParent=True)
userdata = None
if 'user' in element:
userdata = pickle.dumps(element.pop('user'), protocol=4)
userFile = Upload().uploadFromFile(
io.BytesIO(userdata), size=len(userdata), name='_annotationElementUserData',
for idx, entry in enumerate(entries[:MAX_ELEMENT_CHECK]):
if not self._entryIsLarge(entry):
continue
element = entry['element'].copy()
entries[idx]['element'] = element
key = 'points' if 'points' in element else 'values'
# Use the highest protocol supported by all python versions we
# support
data = pickle.dumps(element.pop(key), protocol=4)
elementFile = Upload().uploadFromFile(
io.BytesIO(data), size=len(data), name='_annotationElementData',
parentType='item', parent=item, user=None,
mimeType='application/json', attachParent=True)
entries[0]['datafile'] = {
'key': key,
'fileId': elementFile['_id'],
}
if userdata:
entries[0]['datafile']['userFileId'] = userFile['_id']
userdata = None
if 'user' in element:
userdata = pickle.dumps(element.pop('user'), protocol=4)
userFile = Upload().uploadFromFile(
io.BytesIO(userdata), size=len(userdata), name='_annotationElementUserData',
parentType='item', parent=item, user=None,
mimeType='application/json', attachParent=True)
entry['datafile'] = {
'key': key,
'fileId': elementFile['_id'],
}
if userdata:
entry['datafile']['userFileId'] = userFile['_id']
logger.debug('Storing element as file (%r)', entry)

def updateElementChunk(self, elements, chunk, chunkSize, annotation, now):
"""
Expand All @@ -548,10 +570,8 @@ def updateElementChunk(self, elements, chunk, chunkSize, annotation, now):
'element': element
} for element in elements[chunk:chunk + chunkSize]]
prepTime = time.time() - chunkStartTime
if (len(entries) == 1 and (len(entries[0]['element'].get(
'points', entries[0]['element'].get('values', []))) > MAX_ELEMENT_DOCUMENT or (
'user' in entries[0]['element'] and
len(pickle.dumps(entries[0]['element'], protocol=4)) > MAX_ELEMENT_USER_DOCUMENT))):
if (len(entries) <= MAX_ELEMENT_CHECK and any(
self._entryIsLarge(entry) for entry in entries[:MAX_ELEMENT_CHECK])):
self.saveElementAsFile(annotation, entries)
res = self.collection.insert_many(entries, ordered=False)
for pos, entry in enumerate(entries):
Expand Down