diff --git a/iiif/profiles/mss.py b/iiif/profiles/mss.py index fe72989..d63c14e 100644 --- a/iiif/profiles/mss.py +++ b/iiif/profiles/mss.py @@ -28,11 +28,12 @@ class MSSImageInfo(ImageInfo): def __init__(self, profile_name: str, name: str, doc: dict): """ :param profile_name: the name of the profile - :param name: the name of the image, this will be EMu IRN + :param name: the name of the image, this will be a GUID :param doc: the image's doc from the elasticsearch MSS index """ super().__init__(profile_name, name, doc.get('width', None), doc.get('height', None)) self.doc = doc + self.emu_irn = doc['id'] # the name of the original file as it appears on SCALE self.original = doc['file'] # a list of the EMu generated derivatives of the original file. The list should already be @@ -170,12 +171,12 @@ async def clean_up(self) -> int: async def get_info(self, name: str) -> Optional[MSSImageInfo]: """ - Given an image name (an EMu IRN) returns a MSSImageInfo object or None if the image can't be + Given an image name (a GUID) returns a MSSImageInfo object or None if the image can't be found/isn't allowed to be accessed. If the image doesn't have width and height stored in the elasticsearch index for whatever reason then the image will be retrieved and the size extracted. - :param name: the EMu IRN of the image + :param name: the GUID of the image :return: an MSSImageInfo instance or None """ doc = await self.get_mss_doc(name) @@ -220,7 +221,7 @@ async def fetch_source(self, info: MSSImageInfo, async def download(): with tempfile.NamedTemporaryFile() as f: - async for chunk in self._fetch_file(info.name, file, is_original): + async for chunk in self._fetch_file(info.emu_irn, file, is_original): f.write(chunk) f.flush() @@ -245,15 +246,16 @@ async def download(): async def get_mss_doc(self, name: str, refresh: bool = False) -> Optional[dict]: """ - Retrieves a MSS doc and ensures it's should be accessible. 
For a doc to be returned instead + Retrieves an MSS doc and ensures it should be accessible. For a doc to be returned instead of None: - - the EMu IRN must be valid according the to MSS (i.e. the APS) - - the doc must exist in the mss index in elasticsearch - - the EMu IRN (i.e. the name) must be found in either the specimen, index lot or + - the GUID (i.e. the name) must be unique and exist in the mss elasticsearch index + - the EMu IRN that the GUID maps to must be valid according to the MSS (specifically + the APS) + - the EMu IRN must be found in either the specimen, index lot or artefacts indices as an associated media item - :param name: the image name (the EMu IRN) + :param name: the image name (a GUID) :param refresh: whether to enforce a refresh of the doc from elasticsearch rather than using the cache :return: the mss doc as a dict or None @@ -261,17 +263,22 @@ async def get_mss_doc(self, name: str, refresh: bool = False) -> Optional[dict]: async def get_doc() -> Optional[dict]: # first, check that we have a document in the mss index - doc_url = f'{next(self.es_hosts)}/{self.mss_index}/_doc/{name}' - async with self.es_session.get(doc_url) as response: + search_url = f'{next(self.es_hosts)}/{self.mss_index}/_search' + search = Search().filter('term', **{'guid.keyword': name}) + async with self.es_session.post(search_url, json=search.to_dict()) as response: text = await response.text(encoding='utf-8') - info = json.loads(text) - if not info['found']: + result = json.loads(text) + if result['hits']['total'] == 1: + doc = result['hits']['hits'][0]['_source'] + emu_irn = doc['id'] + else: + # TODO: might be nice to indicate if the GUID is not unique? 
return None # next, check that the irn is associated with a record in the collection datasets count_url = f'{next(self.es_hosts)}/{self.collection_indices}/_count' search = Search() \ - .filter('term', **{'data.associatedMedia._id': name}) \ + .filter('term', **{'data.associatedMedia._id': emu_irn}) \ .filter('term', **{'meta.versions': int(time.time() * 1000)}) async with self.es_session.post(count_url, json=search.to_dict()) as response: text = await response.text(encoding='utf-8') @@ -279,12 +286,12 @@ async def get_doc() -> Optional[dict]: return None # finally, check with mss that the irn is valid - async with self.mss_session.get(f'{self.mss_url}/{name}') as response: + async with self.mss_session.get(f'{self.mss_url}/{emu_irn}') as response: if not response.ok: return None # if we get here then all 3 checks have passed - return info['_source'] + return doc if refresh: self.doc_runner.expire(name) @@ -314,25 +321,25 @@ async def stream_original(self, name: str, chunk_size: int = 4096, raise_errors= doc = await self.get_mss_doc(name) if doc is not None: try: - async for chunk in self._fetch_file(name, doc['file'], True, chunk_size): + async for chunk in self._fetch_file(doc['id'], doc['file'], True, chunk_size): yield chunk except Exception as e: if raise_errors: raise e - async def _fetch_file(self, name: str, file: str, is_original: bool, chunk_size: int = 4096): + async def _fetch_file(self, emu_irn: str, file: str, is_original: bool, chunk_size: int = 4096): """ Fetches a file from MSS or, if the file is the original and doesn't exist in MSS, the old dams servers. Once a source for the requested file is located, the bytes are yielded in chunks of chunk_size. 
- :param name: the name of the file (the EMu IRN) + :param emu_irn: the EMu IRN of the multimedia record for the file :param file: the name of the file to retrieve :param is_original: whether the file is an original image (if it is and the file doesn't exist in MSS we'll try looking for a damsurl file) """ async with AsyncExitStack() as stack: - file_url = f'{self.mss_url}/{name}/{quote(file)}' + file_url = f'{self.mss_url}/{emu_irn}/{quote(file)}' response = await stack.enter_async_context(self.mss_session.get(file_url)) if response.status == 401: @@ -340,7 +347,7 @@ async def _fetch_file(self, name: str, file: str, is_original: bool, chunk_size: if response.status == 404 and is_original: # check for a damsurl file - damsurl_file = f'{self.mss_url}/{name}/damsurl' + damsurl_file = f'{self.mss_url}/{emu_irn}/damsurl' response = await stack.enter_async_context(self.mss_session.get(damsurl_file)) response.raise_for_status() diff --git a/tests/profiles/test_mss.py b/tests/profiles/test_mss.py index 2a618c9..1ac2921 100644 --- a/tests/profiles/test_mss.py +++ b/tests/profiles/test_mss.py @@ -1,11 +1,10 @@ #!/usr/bin/env python3 # encoding: utf-8 -from unittest.mock import MagicMock, AsyncMock +from unittest.mock import AsyncMock, patch import json import pytest -from contextlib import asynccontextmanager from iiif.profiles import MSSProfile from iiif.profiles.mss import MSSImageInfo @@ -14,6 +13,7 @@ def test_mss_choose_file_no_derivatives(): doc = { + 'id': 23, 'file': 'original.tif', 'width': 1000, 'height': 2000, @@ -25,6 +25,7 @@ def test_mss_choose_file_no_derivatives(): def test_mss_choose_file_with_derivatives(): doc = { + 'id': 23, 'file': 'original.tif', 'width': 1000, 'height': 2000, @@ -57,33 +58,10 @@ def create_es_mss_doc(doc): return json.dumps(es_doc) -@asynccontextmanager -async def mock_mss_profile(config, assoc_media_count, mss_doc, aps_is_ok): - profile = MSSProfile('test', config, 'http://creativecommons.org/licenses/by/4.0/', [''], '', 1, - 1, 
['collections']) - - count_doc = {'count': assoc_media_count} - es_post_mock_response = AsyncMock(text=AsyncMock(return_value=json.dumps(count_doc))) - es_post_mock = AsyncMock(return_value=es_post_mock_response) - - es_get_mock_response = AsyncMock(text=AsyncMock(return_value=create_es_mss_doc(mss_doc))) - es_get_mock = AsyncMock(return_value=es_get_mock_response) - - mss_get_mock = AsyncMock(return_value=MagicMock(ok=aps_is_ok)) - - original_sessions = (profile.es_session, profile.mss_session, profile.dm_session) - profile.es_session = MagicMock( - get=MagicMock(return_value=MagicMock(__aenter__=es_get_mock)), - post=MagicMock(return_value=MagicMock(__aenter__=es_post_mock)) - ) - profile.mss_session = MagicMock( - get=MagicMock(return_value=MagicMock(__aenter__=mss_get_mock)), - ) - - yield profile - - profile.es_session, profile.mss_session, profile.dm_session = original_sessions - await profile.close() +@pytest.fixture +def mss_profile(config): + return MSSProfile('test', config, 'http://creativecommons.org/licenses/by/4.0/', [''], '', 1, + 1, ['collections']) # TODO: write more and better mss profile tests @@ -91,59 +69,38 @@ async def mock_mss_profile(config, assoc_media_count, mss_doc, aps_is_ok): class TestMSSProfileGetInfo: @pytest.mark.asyncio - async def test_allowed(self, config): + async def test_allowed(self, mss_profile): mss_doc = { 'id': 1234, 'file': 'beans.tiff', 'width': 4000, 'height': 1600, } - async with mock_mss_profile(config, 1, mss_doc, True) as profile: - info = await profile.get_info('1234') + mock_get_mss_doc = AsyncMock(return_value=mss_doc) + with patch.object(mss_profile, 'get_mss_doc', mock_get_mss_doc): + info = await mss_profile.get_info('testname') assert info is not None - assert info.name == '1234' + assert info.name == 'testname' assert info.size == (4000, 1600) assert info.original == 'beans.tiff' @pytest.mark.asyncio - async def test_missing_collections_doc(self, config): - mss_doc = { - 'id': 1234, - 'file': 'beans.tiff', 
- 'width': 4000, - 'height': 1600, - } - async with mock_mss_profile(config, 0, mss_doc, True) as profile: - info = await profile.get_info('1234') - assert info is None - - @pytest.mark.asyncio - async def test_aps_denies(self, config): - mss_doc = { - 'id': 1234, - 'file': 'beans.tiff', - 'width': 4000, - 'height': 1600, - } - async with mock_mss_profile(config, 1, mss_doc, False) as profile: - info = await profile.get_info('1234') - assert info is None - - @pytest.mark.asyncio - async def test_missing_mss_doc(self, config): - async with mock_mss_profile(config, 1, None, True) as profile: - info = await profile.get_info('1234') + async def test_missing_collections_doc(self, mss_profile): + mock_get_mss_doc = AsyncMock(return_value=None) + with patch.object(mss_profile, 'get_mss_doc', mock_get_mss_doc): + info = await mss_profile.get_info('1234') assert info is None @pytest.mark.asyncio - async def test_missing_size(self, config): + async def test_missing_size(self, config, mss_profile): mss_doc = { 'id': 1234, 'file': 'beans.tiff', } - async with mock_mss_profile(config, 1, mss_doc, True) as profile: - source_path = create_image(config, 140, 504, 'mss', '1234') - profile.fetch_source = AsyncMock(return_value=source_path) - info = await profile.get_info('1234') + mock_get_mss_doc = AsyncMock(return_value=mss_doc) + with patch.object(mss_profile, 'get_mss_doc', mock_get_mss_doc): + source_path = create_image(config, 140, 504, 'mss', 'test') + mss_profile.fetch_source = AsyncMock(return_value=source_path) + info = await mss_profile.get_info('test') assert info is not None assert info.size == (140, 504)