Skip to content

Commit

Permalink
Merge pull request #15 from NaturalHistoryMuseum/josh/swap_mss_irn_fo…
Browse files Browse the repository at this point in the history
…r_guid

Switch MSS name from EMu IRN to GUID
  • Loading branch information
jrdh authored Jan 11, 2022
2 parents 4b20d89 + f4d9acc commit 1ba9de4
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 86 deletions.
49 changes: 28 additions & 21 deletions iiif/profiles/mss.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,12 @@ class MSSImageInfo(ImageInfo):
def __init__(self, profile_name: str, name: str, doc: dict):
"""
:param profile_name: the name of the profile
:param name: the name of the image, this will be EMu IRN
:param name: the name of the image, this will be a GUID
:param doc: the image's doc from the elasticsearch MSS index
"""
super().__init__(profile_name, name, doc.get('width', None), doc.get('height', None))
self.doc = doc
self.emu_irn = doc['id']
# the name of the original file as it appears on SCALE
self.original = doc['file']
# a list of the EMu generated derivatives of the original file. The list should already be
Expand Down Expand Up @@ -170,12 +171,12 @@ async def clean_up(self) -> int:

async def get_info(self, name: str) -> Optional[MSSImageInfo]:
"""
Given an image name (an EMu IRN) returns a MSSImageInfo object or None if the image can't be
Given an image name (a GUID), returns an MSSImageInfo object or None if the image can't be
found/isn't allowed to be accessed. If the image doesn't have width and height stored in the
elasticsearch index for whatever reason then the image will be retrieved and the size
extracted.
:param name: the EMu IRN of the image
:param name: the GUID of the image
:return: an MSSImageInfo instance or None
"""
doc = await self.get_mss_doc(name)
Expand Down Expand Up @@ -220,7 +221,7 @@ async def fetch_source(self, info: MSSImageInfo,

async def download():
with tempfile.NamedTemporaryFile() as f:
async for chunk in self._fetch_file(info.name, file, is_original):
async for chunk in self._fetch_file(info.emu_irn, file, is_original):
f.write(chunk)
f.flush()

Expand All @@ -245,46 +246,52 @@ async def download():

async def get_mss_doc(self, name: str, refresh: bool = False) -> Optional[dict]:
"""
Retrieves a MSS doc and ensures it's should be accessible. For a doc to be returned instead
Retrieves an MSS doc and ensures it should be accessible. For a doc to be returned instead
of None:
- the EMu IRN must be valid according the to MSS (i.e. the APS)
- the doc must exist in the mss index in elasticsearch
- the EMu IRN (i.e. the name) must be found in either the specimen, index lot or
- the GUID (i.e. the name) must be unique and exist in the mss elasticsearch index
- the EMu IRN that the GUID maps to must be valid according to the MSS (specifically
the APS)
- the GUID (i.e. the name) must be found in either the specimen, index lot or
artefacts indices as an associated media item
:param name: the image name (the EMu IRN)
:param name: the image name (a GUID)
:param refresh: whether to enforce a refresh of the doc from elasticsearch rather than using
the cache
:return: the mss doc as a dict or None
"""

async def get_doc() -> Optional[dict]:
# first, check that we have a document in the mss index
doc_url = f'{next(self.es_hosts)}/{self.mss_index}/_doc/{name}'
async with self.es_session.get(doc_url) as response:
search_url = f'{next(self.es_hosts)}/{self.mss_index}/_search'
search = Search().filter('term', **{'guid.keyword': name})
async with self.es_session.post(search_url, json=search.to_dict()) as response:
text = await response.text(encoding='utf-8')
info = json.loads(text)
if not info['found']:
result = json.loads(text)
if result['hits']['total'] == 1:
doc = result['hits']['hits'][0]['_source']
emu_irn = doc['id']
else:
# TODO: might be nice to indicate if the GUID is not unique?
return None

# next, check that the irn is associated with a record in the collection datasets
count_url = f'{next(self.es_hosts)}/{self.collection_indices}/_count'
search = Search() \
.filter('term', **{'data.associatedMedia._id': name}) \
.filter('term', **{'data.associatedMedia._id': emu_irn}) \
.filter('term', **{'meta.versions': int(time.time() * 1000)})
async with self.es_session.post(count_url, json=search.to_dict()) as response:
text = await response.text(encoding='utf-8')
if json.loads(text)['count'] == 0:
return None

# finally, check with mss that the irn is valid
async with self.mss_session.get(f'{self.mss_url}/{name}') as response:
async with self.mss_session.get(f'{self.mss_url}/{emu_irn}') as response:
if not response.ok:
return None

# if we get here then all 3 checks have passed
return info['_source']
return doc

if refresh:
self.doc_runner.expire(name)
Expand Down Expand Up @@ -314,33 +321,33 @@ async def stream_original(self, name: str, chunk_size: int = 4096, raise_errors=
doc = await self.get_mss_doc(name)
if doc is not None:
try:
async for chunk in self._fetch_file(name, doc['file'], True, chunk_size):
async for chunk in self._fetch_file(doc['id'], doc['file'], True, chunk_size):
yield chunk
except Exception as e:
if raise_errors:
raise e

async def _fetch_file(self, name: str, file: str, is_original: bool, chunk_size: int = 4096):
async def _fetch_file(self, emu_irn: str, file: str, is_original: bool, chunk_size: int = 4096):
"""
Fetches a file from MSS or, if the file is the original and doesn't exist in MSS, the old
dams servers. Once a source for the requested file is located, the bytes are yielded in
chunks of chunk_size.
:param name: the name of the file (the EMu IRN)
:param emu_irn: the EMu IRN of the multimedia record for the file
:param file: the name of the file to retrieve
:param is_original: whether the file is an original image (if it is and the file doesn't
exist in MSS we'll try looking for a damsurl file)
"""
async with AsyncExitStack() as stack:
file_url = f'{self.mss_url}/{name}/{quote(file)}'
file_url = f'{self.mss_url}/{emu_irn}/{quote(file)}'
response = await stack.enter_async_context(self.mss_session.get(file_url))

if response.status == 401:
raise HTTPException(status_code=401, detail=f'Access denied')

if response.status == 404 and is_original:
# check for a damsurl file
damsurl_file = f'{self.mss_url}/{name}/damsurl'
damsurl_file = f'{self.mss_url}/{emu_irn}/damsurl'
response = await stack.enter_async_context(self.mss_session.get(damsurl_file))
response.raise_for_status()

Expand Down
87 changes: 22 additions & 65 deletions tests/profiles/test_mss.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
#!/usr/bin/env python3
# encoding: utf-8

from unittest.mock import MagicMock, AsyncMock
from unittest.mock import AsyncMock, patch

import json
import pytest
from contextlib import asynccontextmanager

from iiif.profiles import MSSProfile
from iiif.profiles.mss import MSSImageInfo
Expand All @@ -14,6 +13,7 @@

def test_mss_choose_file_no_derivatives():
doc = {
'id': 23,
'file': 'original.tif',
'width': 1000,
'height': 2000,
Expand All @@ -25,6 +25,7 @@ def test_mss_choose_file_no_derivatives():

def test_mss_choose_file_with_derivatives():
doc = {
'id': 23,
'file': 'original.tif',
'width': 1000,
'height': 2000,
Expand Down Expand Up @@ -57,93 +58,49 @@ def create_es_mss_doc(doc):
return json.dumps(es_doc)


@asynccontextmanager
async def mock_mss_profile(config, assoc_media_count, mss_doc, aps_is_ok):
profile = MSSProfile('test', config, 'http://creativecommons.org/licenses/by/4.0/', [''], '', 1,
1, ['collections'])

count_doc = {'count': assoc_media_count}
es_post_mock_response = AsyncMock(text=AsyncMock(return_value=json.dumps(count_doc)))
es_post_mock = AsyncMock(return_value=es_post_mock_response)

es_get_mock_response = AsyncMock(text=AsyncMock(return_value=create_es_mss_doc(mss_doc)))
es_get_mock = AsyncMock(return_value=es_get_mock_response)

mss_get_mock = AsyncMock(return_value=MagicMock(ok=aps_is_ok))

original_sessions = (profile.es_session, profile.mss_session, profile.dm_session)
profile.es_session = MagicMock(
get=MagicMock(return_value=MagicMock(__aenter__=es_get_mock)),
post=MagicMock(return_value=MagicMock(__aenter__=es_post_mock))
)
profile.mss_session = MagicMock(
get=MagicMock(return_value=MagicMock(__aenter__=mss_get_mock)),
)

yield profile

profile.es_session, profile.mss_session, profile.dm_session = original_sessions
await profile.close()
@pytest.fixture
def mss_profile(config):
return MSSProfile('test', config, 'http://creativecommons.org/licenses/by/4.0/', [''], '', 1,
1, ['collections'])


# TODO: write more and better mss profile tests

class TestMSSProfileGetInfo:

@pytest.mark.asyncio
async def test_allowed(self, config):
async def test_allowed(self, mss_profile):
mss_doc = {
'id': 1234,
'file': 'beans.tiff',
'width': 4000,
'height': 1600,
}
async with mock_mss_profile(config, 1, mss_doc, True) as profile:
info = await profile.get_info('1234')
mock_get_mss_doc = AsyncMock(return_value=mss_doc)
with patch.object(mss_profile, 'get_mss_doc', mock_get_mss_doc):
info = await mss_profile.get_info('testname')
assert info is not None
assert info.name == '1234'
assert info.name == 'testname'
assert info.size == (4000, 1600)
assert info.original == 'beans.tiff'

@pytest.mark.asyncio
async def test_missing_collections_doc(self, config):
mss_doc = {
'id': 1234,
'file': 'beans.tiff',
'width': 4000,
'height': 1600,
}
async with mock_mss_profile(config, 0, mss_doc, True) as profile:
info = await profile.get_info('1234')
assert info is None

@pytest.mark.asyncio
async def test_aps_denies(self, config):
mss_doc = {
'id': 1234,
'file': 'beans.tiff',
'width': 4000,
'height': 1600,
}
async with mock_mss_profile(config, 1, mss_doc, False) as profile:
info = await profile.get_info('1234')
assert info is None

@pytest.mark.asyncio
async def test_missing_mss_doc(self, config):
async with mock_mss_profile(config, 1, None, True) as profile:
info = await profile.get_info('1234')
async def test_missing_collections_doc(self, mss_profile):
mock_get_mss_doc = AsyncMock(return_value=None)
with patch.object(mss_profile, 'get_mss_doc', mock_get_mss_doc):
info = await mss_profile.get_info('1234')
assert info is None

@pytest.mark.asyncio
async def test_missing_size(self, config):
async def test_missing_size(self, config, mss_profile):
mss_doc = {
'id': 1234,
'file': 'beans.tiff',
}
async with mock_mss_profile(config, 1, mss_doc, True) as profile:
source_path = create_image(config, 140, 504, 'mss', '1234')
profile.fetch_source = AsyncMock(return_value=source_path)
info = await profile.get_info('1234')
mock_get_mss_doc = AsyncMock(return_value=mss_doc)
with patch.object(mss_profile, 'get_mss_doc', mock_get_mss_doc):
source_path = create_image(config, 140, 504, 'mss', 'test')
mss_profile.fetch_source = AsyncMock(return_value=source_path)
info = await mss_profile.get_info('test')
assert info is not None
assert info.size == (140, 504)

0 comments on commit 1ba9de4

Please sign in to comment.