From 129dfe4e51f5ab693899ec04daf88561dc1208a1 Mon Sep 17 00:00:00 2001 From: Peter Kerpedjiev Date: Tue, 19 Nov 2019 19:44:29 -0800 Subject: [PATCH 01/14] Added a version to the CHANGELOG --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c407ee70..5cc11e4e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -Future version +v1.14.0 - Added support for bigBed files From ee20110504c1dff686156129d2e9bd624829ce58 Mon Sep 17 00:00:00 2001 From: Peter Kerpedjiev Date: Tue, 7 Apr 2020 19:33:41 -0700 Subject: [PATCH 02/14] Updated the CHANGELOG --- CHANGELOG.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d01b1735..a7a1b864 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,11 @@ +v1.14.1 + +- Always consider proxy headers (X-Forwarded-Host, X-Forwarded-Proto) for redirect URL construction + v1.14.0 - Added support for bigBed files - Update readme installation instructions and troubleshooting instructions for macOS 10.15 -- Always consider proxy headers (X-Forwarded-Host, X-Forwarded-Proto) for redirect URL construction v1.13.0 From 9824bcc5832ac90a1e6cc722596e8ac3f41a9ef1 Mon Sep 17 00:00:00 2001 From: mkeller <7525285+keller-mark@users.noreply.github.com> Date: Wed, 27 May 2020 14:27:16 -0400 Subject: [PATCH 03/14] POST request for aggregation --- environment.yml | 5 ++--- requirements.txt | 2 +- tilesets/generate_tiles.py | 24 +++++++++++++++++++++--- tilesets/views.py | 36 +++++++++++++++++++++++++++++++----- 4 files changed, 55 insertions(+), 12 deletions(-) diff --git a/environment.yml b/environment.yml index 18b27754..40519071 100644 --- a/environment.yml +++ b/environment.yml @@ -1,14 +1,13 @@ -name: higlass-server +name: cistrome-explorer-higlass-server channels: - conda-forge - bioconda - defaults - dependencies: - python>=3.6 - pip - pip: - - pybbi==0.2.0 + - pybbi==0.2.2 - bumpversion==0.5.3 - CacheControl==0.12.4 - cooler==0.8.6 diff --git a/requirements.txt b/requirements.txt index 0f98f24a..317a2380 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -pybbi==0.2.0 +pybbi==0.2.2 bumpversion==0.5.3 CacheControl==0.12.4 cooler==0.8.6 diff --git a/tilesets/generate_tiles.py b/tilesets/generate_tiles.py index a14a645c..8694380d 100644 --- a/tilesets/generate_tiles.py +++ b/tilesets/generate_tiles.py @@ -109,7 +109,7 @@ def extract_tileset_uid(tile_id): def get_tileset_filetype(tileset): return tileset.filetype -def generate_1d_tiles(filename, tile_ids, get_data_function): +def generate_1d_tiles(filename, tile_ids, get_data_function, agg_info): ''' Generate a set of tiles for the given tile_ids. @@ -122,12 +122,23 @@ def generate_1d_tiles(filename, tile_ids, get_data_function): to be retrieved get_data_function: lambda A function which retrieves the data for this tile + agg_info: `dict` + A dict containing keys `agg_groups` (2D array), `agg_func` (str). Returns ------- tile_list: [(tile_id, tile_data),...] 
A list of tile_id, tile_data tuples ''' + + agg_func_map = { + "sum": lambda x: np.sum(x, axis=0), + "mean": lambda x: np.mean(x, axis=0), + "median": lambda x: np.median(x, axis=0), + "std": lambda x: np.std(x, axis=0), + "var": lambda x: np.var(x, axis=0) + } + generated_tiles = [] for tile_id in tile_ids: @@ -136,6 +147,12 @@ def generate_1d_tiles(filename, tile_ids, get_data_function): dense = get_data_function(filename, tile_position) + if agg_info != None: + agg_func_name = agg_info["agg_func"] + agg_group_arr = agg_info["agg_groups"] + assert(agg_func_name in agg_func_map) + dense = np.array(list(map(agg_func_map[agg_func_name], [ dense[arr] for arr in agg_group_arr ]))) + if len(dense): max_dense = max(dense.reshape(-1,)) min_dense = min(dense.reshape(-1,)) @@ -488,7 +505,7 @@ def generate_tiles(tileset_tile_ids): tile_list: [(tile_id, tile_data),...] A list of tile_id, tile_data tuples ''' - tileset, tile_ids, raw = tileset_tile_ids + tileset, tile_ids, raw, agg_info = tileset_tile_ids if tileset.filetype == 'hitile': return generate_hitile_tiles(tileset, tile_ids) @@ -512,7 +529,8 @@ def generate_tiles(tileset_tile_ids): return generate_1d_tiles( tileset.datafile.path, tile_ids, - ctmu.get_single_tile) + ctmu.get_single_tile, + agg_info) elif tileset.filetype == 'imtiles': return hgim.get_tiles(tileset.datafile.path, tile_ids, raw) elif tileset.filetype == 'bam': diff --git a/tilesets/views.py b/tilesets/views.py index f30b3de9..01d4af86 100644 --- a/tilesets/views.py +++ b/tilesets/views.py @@ -51,6 +51,7 @@ import slugid import urllib +import hashlib try: import cPickle as pickle @@ -376,7 +377,7 @@ def add_transform_type(tile_id): return new_tile_id -@api_view(['GET']) +@api_view(['GET', 'POST']) def tiles(request): '''Retrieve a set of tiles @@ -393,6 +394,23 @@ def tiles(request): (tile_id, tile_data) items. ''' + tileset_to_agg_info = dict() + if request.method == 'POST': + # If this is a POST request, get the aggregation groups from the request body. + try: + tileset_to_agg_info = json.loads(request.body.decode('utf-8')) + for tileset_id, agg_info in tileset_to_agg_info.items(): + assert("agg_groups" in agg_info.keys()) + assert("agg_func" in agg_info.keys()) + agg_groups = agg_info["agg_groups"] + agg_func_name = agg_info["agg_func"] + agg_hash = '.' + hashlib.md5(json.dumps(agg_groups).encode('utf-8')).hexdigest() + '.' 
+ agg_func_name + tileset_to_agg_info[tileset_id]["agg_hash"] = agg_hash + except: + return JsonResponse({ + 'error': 'Unable to parse request body as JSON' + }, status=rfs.HTTP_400_BAD_REQUEST) + # create a set so that we don't fetch the same tile multiple times tileids_to_fetch = set(request.GET.getlist("d")) # with ProcessPoolExecutor() as executor: @@ -435,7 +453,11 @@ def tiles(request): # see if the tile is cached tile_value = None try: - tile_value = rdb.get(tile_id) + if tileset_uuid in tileset_to_agg_info: + agg_info = tileset_to_agg_info[tileset_uuid] + tile_value = rdb.get(tile_id + agg_info["agg_func"]) + else: + tile_value = rdb.get(tile_id) except Exception as ex: # there was an error accessing the cache server # log the error and carry forward fetching the tile @@ -454,7 +476,7 @@ def tiles(request): # fetch the tiles tilesets = [tilesets[tu] for tu in tileids_by_tileset] - accessible_tilesets = [(t, tileids_by_tileset[t.uuid], raw) for t in tilesets if ((not t.private) or request.user == t.owner)] + accessible_tilesets = [(t, tileids_by_tileset[t.uuid], raw, tileset_to_agg_info.get(t.uuid, None)) for t in tilesets if ((not t.private) or request.user == t.owner)] #pool = mp.Pool(6) @@ -477,9 +499,13 @@ def tiles(request): tiles_to_return = {} for (tile_id, tile_value) in generated_tiles: - + tileset_uuid = tgt.extract_tileset_uid(tile_id) try: - rdb.set(tile_id, pickle.dumps(tile_value)) + if tileset_uuid in tileset_to_agg_info: + agg_info = tileset_to_agg_info[tileset_uuid] + rdb.set(tile_id + agg_info["agg_func"], pickle.dumps(tile_value)) + else: + rdb.set(tile_id, pickle.dumps(tile_value)) except Exception as ex: # error caching a tile # log the error and carry forward, this isn't critical From 7b265eaa746dd53d346d55e270f131bcd5bb3408 Mon Sep 17 00:00:00 2001 From: mkeller <7525285+keller-mark@users.noreply.github.com> Date: Wed, 27 May 2020 18:25:09 -0400 Subject: [PATCH 04/14] Changelog --- CHANGELOG.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a7a1b864..f82fc898 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,11 +1,8 @@ -v1.14.1 - -- Always consider proxy headers (X-Forwarded-Host, X-Forwarded-Proto) for redirect URL construction - -v1.14.0 +Future version - Added support for bigBed files - Update readme installation instructions and troubleshooting instructions for macOS 10.15 +- Always consider proxy headers (X-Forwarded-Host, X-Forwarded-Proto) for redirect URL construction v1.13.0 From d953b092946440496906a55bba3cfebd034be9c0 Mon Sep 17 00:00:00 2001 From: mkeller <7525285+keller-mark@users.noreply.github.com> Date: Thu, 28 May 2020 16:12:45 -0400 Subject: [PATCH 05/14] Revert environment name --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 40519071..d1e32669 100644 --- a/environment.yml +++ b/environment.yml @@ -1,4 +1,4 @@ -name: cistrome-explorer-higlass-server +name: higlass-server channels: - conda-forge - bioconda From 2917ea96c0bb9ef72a850c6877595f377eded7ec Mon Sep 17 00:00:00 2001 From: mkeller <7525285+keller-mark@users.noreply.github.com> Date: Thu, 28 May 2020 16:16:29 -0400 Subject: [PATCH 06/14] Changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f82fc898..d2768362 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ Future version - Added support for bigBed files - Update readme installation instructions and troubleshooting instructions for 
macOS 10.15 - Always consider proxy headers (X-Forwarded-Host, X-Forwarded-Proto) for redirect URL construction +- Added support for server-side aggregation of multivec tiles by sending a `POST` request to the `/tiles` endpoint, where the body contains a JSON object mapping tileset UIDs to objects with properties `agg_groups` (a 2D array where each subarray is a group of rows to aggregate) and `agg_func` (the name of an aggregation function). v1.13.0 From 2f7d3527b7f5720f429b2158d0854456826d7f3c Mon Sep 17 00:00:00 2001 From: mkeller <7525285+keller-mark@users.noreply.github.com> Date: Thu, 28 May 2020 17:02:28 -0400 Subject: [PATCH 07/14] Test --- .gitignore | 1 + tilesets/tests.py | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/.gitignore b/.gitignore index c4a15651..1989832f 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ static/ hgs-static/ config.json +.envrc \ No newline at end of file diff --git a/tilesets/tests.py b/tilesets/tests.py index a895f6cd..14c284f8 100644 --- a/tilesets/tests.py +++ b/tilesets/tests.py @@ -497,6 +497,39 @@ def test_get_tile(self): r = base64.decodestring(content['a.11.0']['dense'].encode('utf-8')) q = np.frombuffer(r, dtype=np.float16) + assert q.shape[0] == 512 + + def test_get_tile_with_aggregation(self): + self.user1 = dcam.User.objects.create_user( + username='user1', password='pass' + ) + upload_file = open('data/chr21.KL.bed.multires.mv5', 'rb') + mv = tm.Tileset.objects.create( + datafile=dcfu.SimpleUploadedFile( + upload_file.name, upload_file.read() + ), + filetype='multivec', + datatype='multivec', + coordSystem="hg38", + owner=self.user1, + uuid='chr21_KL' + ) + + body = { + 'chr21_KL': { + 'agg_groups': [[0, 1], [2, 3, 4], [5, 6]], + 'agg_func': 'sum' + } + } + + ret = self.client.post('/api/v1/tiles/?d=chr21_KL.0.0', json.dumps(body), content_type="application/json") + assert ret.status_code == 200 + content = json.loads(ret.content.decode('utf-8')) + r = base64.decodestring(content['chr21_KL.0.0']['dense'].encode('utf-8')) + q = np.frombuffer(r, dtype=np.float16) + + assert q.shape[0] == 768 + class ChromosomeSizes(dt.TestCase): def test_list_chromsizes(self): From 218540a094601e099e83575d45aa5887e1d3c431 Mon Sep 17 00:00:00 2001 From: mkeller <7525285+keller-mark@users.noreply.github.com> Date: Fri, 29 May 2020 10:39:43 -0400 Subject: [PATCH 08/14] Respond to feedback --- tilesets/generate_tiles.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tilesets/generate_tiles.py b/tilesets/generate_tiles.py index 8694380d..95fac778 100644 --- a/tilesets/generate_tiles.py +++ b/tilesets/generate_tiles.py @@ -136,7 +136,9 @@ def generate_1d_tiles(filename, tile_ids, get_data_function, agg_info): "mean": lambda x: np.mean(x, axis=0), "median": lambda x: np.median(x, axis=0), "std": lambda x: np.std(x, axis=0), - "var": lambda x: np.var(x, axis=0) + "var": lambda x: np.var(x, axis=0), + "max": lambda x: np.amax(x, axis=0), + "min": lambda x: np.amin(x, axis=0), } generated_tiles = [] @@ -149,7 +151,7 @@ def generate_1d_tiles(filename, tile_ids, get_data_function, agg_info): if agg_info != None: agg_func_name = agg_info["agg_func"] - agg_group_arr = agg_info["agg_groups"] + agg_group_arr = [ x if type(x) == list else [x] for x in agg_info["agg_groups"] ] assert(agg_func_name in agg_func_map) dense = np.array(list(map(agg_func_map[agg_func_name], [ dense[arr] for arr in agg_group_arr ]))) @@ -494,11 +496,19 @@ def generate_tiles(tileset_tile_ids): Parameters ---------- + 
tileset_tile_ids: tuple + A four-tuple containing the following parameters. tileset: tilesets.models.Tileset object The tileset that the tile ids should be retrieved from tile_ids: [str,...] A list of tile_ids (e.g. xyx.0.0.1) identifying the tiles to be retrieved + raw: str or False + The value of the GET request parameter `raw`. + agg_info: dict or None + A dict containing the keys `agg_groups` and `agg_func`, + where `agg_groups` is a 2D array of integers + and `agg_func` is a string name of an aggregation function. Returns ------- From 43ec962c2dc54029b63b1e8959da90e953897c79 Mon Sep 17 00:00:00 2001 From: mkeller <7525285+keller-mark@users.noreply.github.com> Date: Fri, 29 May 2020 16:27:40 -0400 Subject: [PATCH 09/14] WIP: generalize POST body --- environment.yml | 1 + tilesets/generate_tiles.py | 25 +++++------- tilesets/views.py | 82 ++++++++++++++++++++++++++++---------- 3 files changed, 73 insertions(+), 35 deletions(-) diff --git a/environment.yml b/environment.yml index d1e32669..a77c55fe 100644 --- a/environment.yml +++ b/environment.yml @@ -18,6 +18,7 @@ dependencies: - djangorestframework==3.9.1 - h5py==2.6.0 - higlass-python==0.2.1 + - jsonschema==3.2.0 - numba==0.46.0 - numpy==1.17.3 - pandas==0.23.4 diff --git a/tilesets/generate_tiles.py b/tilesets/generate_tiles.py index 95fac778..a6cbcdbb 100644 --- a/tilesets/generate_tiles.py +++ b/tilesets/generate_tiles.py @@ -109,7 +109,7 @@ def extract_tileset_uid(tile_id): def get_tileset_filetype(tileset): return tileset.filetype -def generate_1d_tiles(filename, tile_ids, get_data_function, agg_info): +def generate_1d_tiles(filename, tile_ids, get_data_function, tileset_options): ''' Generate a set of tiles for the given tile_ids. @@ -122,8 +122,8 @@ def generate_1d_tiles(filename, tile_ids, get_data_function, agg_info): to be retrieved get_data_function: lambda A function which retrieves the data for this tile - agg_info: `dict` - A dict containing keys `agg_groups` (2D array), `agg_func` (str). + tileset_options: dict or None + An optional dict containing options, including aggregation options. Returns ------- @@ -149,12 +149,11 @@ def generate_1d_tiles(filename, tile_ids, get_data_function, agg_info): dense = get_data_function(filename, tile_position) - if agg_info != None: - agg_func_name = agg_info["agg_func"] - agg_group_arr = [ x if type(x) == list else [x] for x in agg_info["agg_groups"] ] - assert(agg_func_name in agg_func_map) + if tileset_options != None and "aggGroups" in tileset_options and "aggFunc" in tileset_options: + agg_func_name = tileset_options["aggFunc"] + agg_group_arr = [ x if type(x) == list else [x] for x in tileset_options["aggGroups"] ] dense = np.array(list(map(agg_func_map[agg_func_name], [ dense[arr] for arr in agg_group_arr ]))) - + if len(dense): max_dense = max(dense.reshape(-1,)) min_dense = min(dense.reshape(-1,)) @@ -505,17 +504,15 @@ def generate_tiles(tileset_tile_ids): to be retrieved raw: str or False The value of the GET request parameter `raw`. - agg_info: dict or None - A dict containing the keys `agg_groups` and `agg_func`, - where `agg_groups` is a 2D array of integers - and `agg_func` is a string name of an aggregation function. + tileset_options: dict or None + An optional dict containing tileset options, including aggregation options. Returns ------- tile_list: [(tile_id, tile_data),...] 
A list of tile_id, tile_data tuples ''' - tileset, tile_ids, raw, agg_info = tileset_tile_ids + tileset, tile_ids, raw, tileset_options = tileset_tile_ids if tileset.filetype == 'hitile': return generate_hitile_tiles(tileset, tile_ids) @@ -540,7 +537,7 @@ def generate_tiles(tileset_tile_ids): tileset.datafile.path, tile_ids, ctmu.get_single_tile, - agg_info) + tileset_options) elif tileset.filetype == 'imtiles': return hgim.get_tiles(tileset.datafile.path, tile_ids, raw) elif tileset.filetype == 'bam': diff --git a/tilesets/views.py b/tilesets/views.py index 01d4af86..355900b0 100644 --- a/tilesets/views.py +++ b/tilesets/views.py @@ -394,25 +394,65 @@ def tiles(request): (tile_id, tile_data) items. ''' - tileset_to_agg_info = dict() + tileids_to_fetch = set() + tileset_to_options = dict() + if request.method == 'POST': - # If this is a POST request, get the aggregation groups from the request body. + # If this is a POST request, parse the request body. try: - tileset_to_agg_info = json.loads(request.body.decode('utf-8')) - for tileset_id, agg_info in tileset_to_agg_info.items(): - assert("agg_groups" in agg_info.keys()) - assert("agg_func" in agg_info.keys()) - agg_groups = agg_info["agg_groups"] - agg_func_name = agg_info["agg_func"] - agg_hash = '.' + hashlib.md5(json.dumps(agg_groups).encode('utf-8')).hexdigest() + '.' + agg_func_name - tileset_to_agg_info[tileset_id]["agg_hash"] = agg_hash + body = json.loads(request.body.decode('utf-8')) except: return JsonResponse({ - 'error': 'Unable to parse request body as JSON' + 'error': 'Unable to parse request body as JSON.' }, status=rfs.HTTP_400_BAD_REQUEST) - - # create a set so that we don't fetch the same tile multiple times - tileids_to_fetch = set(request.GET.getlist("d")) + # Validate the contents of the JSON request body. + if type(body) is not list: + return JsonResponse({ + 'error': 'Expected request body to be a JSON array.' + }, status=rfs.HTTP_400_BAD_REQUEST) + # Iterate over each tileset in the request body array. + for tileset_info in body: + # Ensure that the array consists of JSON objects with the expected properties. + if type(tileset_info) is not dict: + return JsonResponse({ + 'error': 'Expected request body array items to be objects.' + }, status=rfs.HTTP_400_BAD_REQUEST) + if "tilesetUid" not in tileset_info: + return JsonResponse({ + 'error': "Expected tileset info object to have property 'tilesetUid'." + }, status=rfs.HTTP_400_BAD_REQUEST) + if type(tileset_info["tilesetUid"]) is not str: + return JsonResponse({ + 'error': "Expected tileset property 'tilesetUid' type to be string." + }, status=rfs.HTTP_400_BAD_REQUEST) + if "tileIds" not in tileset_info: + return JsonResponse({ + 'error': "Expected tileset info object to have property 'tileIds'." + }, status=rfs.HTTP_400_BAD_REQUEST) + if type(tileset_info["tileIds"]) is not list: + return JsonResponse({ + 'error': "Expected tileset property 'tileIds' type to be array." + }, status=rfs.HTTP_400_BAD_REQUEST) + if "options" not in tileset_info: + return JsonResponse({ + 'error': "Expected tileset info object to have property 'options'." + }, status=rfs.HTTP_400_BAD_REQUEST) + if type(tileset_info["options"]) is not dict: + return JsonResponse({ + 'error': "Expected tileset info property 'options' type to be object." 
+ }, status=rfs.HTTP_400_BAD_REQUEST) + + tileset_uid = tileset_info["tilesetUid"] # can assume it exists and is str + tile_ids = tileset_info["tileIds"] # can assume it exists and is list + tileset_options = tileset_info["options"] # can assume it exists and is dict + tileids_to_fetch.update(tile_ids) + tileset_to_options[tileset_uid] = tileset_options + tileset_to_options[tileset_uid]["options_hash"] = hashlib.md5(json.dumps(tileset_options).encode('utf-8')).hexdigest() + + elif request.method == 'GET': + # create a set so that we don't fetch the same tile multiple times + tileids_to_fetch = set(request.GET.getlist("d")) + # with ProcessPoolExecutor() as executor: # res = executor.map(parallelize, hargs) ''' @@ -453,9 +493,9 @@ def tiles(request): # see if the tile is cached tile_value = None try: - if tileset_uuid in tileset_to_agg_info: - agg_info = tileset_to_agg_info[tileset_uuid] - tile_value = rdb.get(tile_id + agg_info["agg_func"]) + if tileset_uuid in tileset_to_options: + tileset_options = tileset_to_options[tileset_uuid] + tile_value = rdb.get(tile_id + tileset_options["options_hash"]) else: tile_value = rdb.get(tile_id) except Exception as ex: @@ -476,7 +516,7 @@ def tiles(request): # fetch the tiles tilesets = [tilesets[tu] for tu in tileids_by_tileset] - accessible_tilesets = [(t, tileids_by_tileset[t.uuid], raw, tileset_to_agg_info.get(t.uuid, None)) for t in tilesets if ((not t.private) or request.user == t.owner)] + accessible_tilesets = [(t, tileids_by_tileset[t.uuid], raw, tileset_to_options.get(t.uuid, None)) for t in tilesets if ((not t.private) or request.user == t.owner)] #pool = mp.Pool(6) @@ -501,9 +541,9 @@ def tiles(request): for (tile_id, tile_value) in generated_tiles: tileset_uuid = tgt.extract_tileset_uid(tile_id) try: - if tileset_uuid in tileset_to_agg_info: - agg_info = tileset_to_agg_info[tileset_uuid] - rdb.set(tile_id + agg_info["agg_func"], pickle.dumps(tile_value)) + if tileset_uuid in tileset_to_options: + tileset_options = tileset_to_options[tileset_uuid] + rdb.set(tile_id + tileset_options["options_hash"], pickle.dumps(tile_value)) else: rdb.set(tile_id, pickle.dumps(tile_value)) except Exception as ex: From 6bbffab0ccf858bf15f006522d65fc1376548261 Mon Sep 17 00:00:00 2001 From: mkeller <7525285+keller-mark@users.noreply.github.com> Date: Fri, 29 May 2020 17:00:44 -0400 Subject: [PATCH 10/14] Validate POST body with JSON schema --- tilesets/tests.py | 16 ++++++----- tilesets/validate_json.py | 38 ++++++++++++++++++++++++++ tilesets/views.py | 56 +++++++++++++-------------------------- 3 files changed, 66 insertions(+), 44 deletions(-) create mode 100644 tilesets/validate_json.py diff --git a/tilesets/tests.py b/tilesets/tests.py index 14c284f8..7755ae14 100644 --- a/tilesets/tests.py +++ b/tilesets/tests.py @@ -515,14 +515,18 @@ def test_get_tile_with_aggregation(self): uuid='chr21_KL' ) - body = { - 'chr21_KL': { - 'agg_groups': [[0, 1], [2, 3, 4], [5, 6]], - 'agg_func': 'sum' + body = [ + { + "tilesetUid": "chr21_KL", + "tileIds": ["0.0", "0.1"], + "options": { + "aggGroups": [[0, 1], [2, 3, 4], [5, 6]], + "aggFunc": "sum" + } } - } + ] - ret = self.client.post('/api/v1/tiles/?d=chr21_KL.0.0', json.dumps(body), content_type="application/json") + ret = self.client.post('/api/v1/tiles/', json.dumps(body), content_type="application/json") assert ret.status_code == 200 content = json.loads(ret.content.decode('utf-8')) r = base64.decodestring(content['chr21_KL.0.0']['dense'].encode('utf-8')) diff --git a/tilesets/validate_json.py 
b/tilesets/validate_json.py new file mode 100644 index 00000000..88af7f2a --- /dev/null +++ b/tilesets/validate_json.py @@ -0,0 +1,38 @@ +tiles_post_schema = { + "$schema": "http://json-schema.org/draft-07/schema#", + "definitions": { + "multivecRowAggregationOptions": { + "type": "object", + "required": ["aggGroups", "aggFunc"], + "additionalProperties": False, + "properties": { + "aggGroups": { + "type": "array", + "items": { + "oneOf": [ + { "type": "integer" }, + { "type": "array", "items": { "type": "integer" }} + ] + } + }, + "aggFunc": { + "type": "string", + "enum": ["sum", "mean", "median", "std", "var", "min", "max"] + } + } + } + }, + "type": "array", + "items": { + "type": "object", + "properties": { + "tilesetUid": { "type": "string" }, + "tileIds": { "type": "array", "items": { "type": "string" }}, + "options": { + "oneOf": [ + { "$ref": "#/definitions/multivecRowAggregationOptions" } + ] + } + } + } +} \ No newline at end of file diff --git a/tilesets/views.py b/tilesets/views.py index 355900b0..07792471 100644 --- a/tilesets/views.py +++ b/tilesets/views.py @@ -24,6 +24,7 @@ import tilesets.chromsizes as tcs import tilesets.generate_tiles as tgt +import tilesets.validate_json as tvj import clodius.tiles.bam as ctb import clodius.tiles.cooler as hgco @@ -52,6 +53,8 @@ import slugid import urllib import hashlib +from jsonschema import validate as json_validate +from jsonschema.exceptions import ValidationError as JsonValidationError try: import cPickle as pickle @@ -398,55 +401,32 @@ def tiles(request): tileset_to_options = dict() if request.method == 'POST': - # If this is a POST request, parse the request body. + # This is a POST request, so try to parse the request body as JSON. try: body = json.loads(request.body.decode('utf-8')) except: return JsonResponse({ 'error': 'Unable to parse request body as JSON.' }, status=rfs.HTTP_400_BAD_REQUEST) - # Validate the contents of the JSON request body. - if type(body) is not list: + + # Validate against the JSON schema. + try: + json_validate(instance=body, schema=tvj.tiles_post_schema) + except JsonValidationError as e: return JsonResponse({ - 'error': 'Expected request body to be a JSON array.' + 'error': f"Invalid request body: {e.message}.", }, status=rfs.HTTP_400_BAD_REQUEST) - # Iterate over each tileset in the request body array. + + # Iterate over tilesets to obtain the associated tile IDs and options. for tileset_info in body: - # Ensure that the array consists of JSON objects with the expected properties. - if type(tileset_info) is not dict: - return JsonResponse({ - 'error': 'Expected request body array items to be objects.' - }, status=rfs.HTTP_400_BAD_REQUEST) - if "tilesetUid" not in tileset_info: - return JsonResponse({ - 'error': "Expected tileset info object to have property 'tilesetUid'." - }, status=rfs.HTTP_400_BAD_REQUEST) - if type(tileset_info["tilesetUid"]) is not str: - return JsonResponse({ - 'error': "Expected tileset property 'tilesetUid' type to be string." - }, status=rfs.HTTP_400_BAD_REQUEST) - if "tileIds" not in tileset_info: - return JsonResponse({ - 'error': "Expected tileset info object to have property 'tileIds'." - }, status=rfs.HTTP_400_BAD_REQUEST) - if type(tileset_info["tileIds"]) is not list: - return JsonResponse({ - 'error': "Expected tileset property 'tileIds' type to be array." - }, status=rfs.HTTP_400_BAD_REQUEST) - if "options" not in tileset_info: - return JsonResponse({ - 'error': "Expected tileset info object to have property 'options'." 
- }, status=rfs.HTTP_400_BAD_REQUEST) - if type(tileset_info["options"]) is not dict: - return JsonResponse({ - 'error': "Expected tileset info property 'options' type to be object." - }, status=rfs.HTTP_400_BAD_REQUEST) - - tileset_uid = tileset_info["tilesetUid"] # can assume it exists and is str - tile_ids = tileset_info["tileIds"] # can assume it exists and is list - tileset_options = tileset_info["options"] # can assume it exists and is dict + tileset_uid = tileset_info["tilesetUid"] + # Prepend the tileset UID to each tile ID suffix. + tile_ids = [ f"{tileset_uid}.{tile_id}" for tile_id in tileset_info["tileIds"] ] tileids_to_fetch.update(tile_ids) + + tileset_options = tileset_info["options"] tileset_to_options[tileset_uid] = tileset_options + # Hash the options object so that the tile can be cached. tileset_to_options[tileset_uid]["options_hash"] = hashlib.md5(json.dumps(tileset_options).encode('utf-8')).hexdigest() elif request.method == 'GET': From f852cc65529f5c01a05220f081a70bf96ff05a8a Mon Sep 17 00:00:00 2001 From: mkeller <7525285+keller-mark@users.noreply.github.com> Date: Fri, 29 May 2020 17:01:52 -0400 Subject: [PATCH 11/14] Requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 317a2380..80fe678d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,6 +9,7 @@ django==2.1.11 djangorestframework==3.9.1 h5py==2.6.0 higlass-python==0.2.1 +jsonschema==3.2.0 numba==0.46.0 numpy==1.17.3 pandas==0.23.4 From 2dd1a963c096dd989ed75ee65947ebc592258211 Mon Sep 17 00:00:00 2001 From: mkeller <7525285+keller-mark@users.noreply.github.com> Date: Fri, 29 May 2020 17:03:51 -0400 Subject: [PATCH 12/14] Rename file with json schemas --- tilesets/{validate_json.py => json_schemas.py} | 0 tilesets/views.py | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename tilesets/{validate_json.py => json_schemas.py} (100%) diff --git a/tilesets/validate_json.py b/tilesets/json_schemas.py similarity index 100% rename from tilesets/validate_json.py rename to tilesets/json_schemas.py diff --git a/tilesets/views.py b/tilesets/views.py index 07792471..87393691 100644 --- a/tilesets/views.py +++ b/tilesets/views.py @@ -24,7 +24,7 @@ import tilesets.chromsizes as tcs import tilesets.generate_tiles as tgt -import tilesets.validate_json as tvj +import tilesets.json_schemas as tjs import clodius.tiles.bam as ctb import clodius.tiles.cooler as hgco @@ -411,7 +411,7 @@ def tiles(request): # Validate against the JSON schema. try: - json_validate(instance=body, schema=tvj.tiles_post_schema) + json_validate(instance=body, schema=tjs.tiles_post_schema) except JsonValidationError as e: return JsonResponse({ 'error': f"Invalid request body: {e.message}.", From b5718ae35fa745beaa2b6f94560668ad82a1693f Mon Sep 17 00:00:00 2001 From: mkeller <7525285+keller-mark@users.noreply.github.com> Date: Fri, 29 May 2020 17:07:57 -0400 Subject: [PATCH 13/14] Tile limit --- tilesets/views.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tilesets/views.py b/tilesets/views.py index 87393691..28fe1999 100644 --- a/tilesets/views.py +++ b/tilesets/views.py @@ -400,6 +400,8 @@ def tiles(request): tileids_to_fetch = set() tileset_to_options = dict() + TILE_LIMIT = 1000 + if request.method == 'POST': # This is a POST request, so try to parse the request body as JSON. 
try: @@ -433,6 +435,11 @@ def tiles(request): # create a set so that we don't fetch the same tile multiple times tileids_to_fetch = set(request.GET.getlist("d")) + if len(tileids_to_fetch) > TILE_LIMIT: + return JsonResponse({ + 'error': "Too many tiles were requested.", + }, status=rfs.HTTP_400_BAD_REQUEST) + # with ProcessPoolExecutor() as executor: # res = executor.map(parallelize, hargs) ''' From dec93a907f918a8444941d6f263a761d7e3d100c Mon Sep 17 00:00:00 2001 From: mkeller <7525285+keller-mark@users.noreply.github.com> Date: Wed, 17 Jun 2020 19:57:12 -0400 Subject: [PATCH 14/14] Add test for additional tile type --- tilesets/json_schemas.py | 1 + tilesets/tests.py | 20 +++++++++++++++++++- tilesets/views.py | 10 ++++++---- 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/tilesets/json_schemas.py b/tilesets/json_schemas.py index 88af7f2a..e5284864 100644 --- a/tilesets/json_schemas.py +++ b/tilesets/json_schemas.py @@ -25,6 +25,7 @@ "type": "array", "items": { "type": "object", + "required": ["tilesetUid", "tileIds"], "properties": { "tilesetUid": { "type": "string" }, "tileIds": { "type": "array", "items": { "type": "string" }}, diff --git a/tilesets/tests.py b/tilesets/tests.py index 7755ae14..26bfd687 100644 --- a/tilesets/tests.py +++ b/tilesets/tests.py @@ -499,7 +499,7 @@ def test_get_tile(self): assert q.shape[0] == 512 - def test_get_tile_with_aggregation(self): + def test_get_tiles_via_post_with_aggregation(self): self.user1 = dcam.User.objects.create_user( username='user1', password='pass' ) @@ -1005,6 +1005,24 @@ def test_get_tiles(self): except OSError: pass + def test_get_tiles_via_post(self): + c1 = dt.Client() + c1.login(username='user1', password='pass') + + body = [ + { + "tilesetUid": "bb", + "tileIds": ["14.12"] + } + ] + + ret = c1.post('/api/v1/tiles/', json.dumps(body), content_type="application/json") + assert ret.status_code == 200 + content = json.loads(ret.content.decode('utf-8')) + content_len = len(content['bb.14.12']) + + assert content_len == 200 + class CoolerTest(dt.TestCase): def setUp(self): diff --git a/tilesets/views.py b/tilesets/views.py index 28fe1999..4a766b80 100644 --- a/tilesets/views.py +++ b/tilesets/views.py @@ -426,10 +426,12 @@ def tiles(request): tile_ids = [ f"{tileset_uid}.{tile_id}" for tile_id in tileset_info["tileIds"] ] tileids_to_fetch.update(tile_ids) - tileset_options = tileset_info["options"] - tileset_to_options[tileset_uid] = tileset_options - # Hash the options object so that the tile can be cached. - tileset_to_options[tileset_uid]["options_hash"] = hashlib.md5(json.dumps(tileset_options).encode('utf-8')).hexdigest() + tileset_options = tileset_info.get("options", None) + # The "options" property is optional. + if type(tileset_options) == dict: + tileset_to_options[tileset_uid] = tileset_options + # Hash the options object so that the tile can be cached. + tileset_to_options[tileset_uid]["options_hash"] = hashlib.md5(json.dumps(tileset_options).encode('utf-8')).hexdigest() elif request.method == 'GET': # create a set so that we don't fetch the same tile multiple times
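
---

The patches above introduce the new POST interface in three places: the `tiles` view in `tilesets/views.py`, the request-body schema in `tilesets/json_schemas.py`, and the tests in `tilesets/tests.py`. As a usage sketch only — the server URL and the tileset UID below are hypothetical placeholders, and the `requests` library is an assumption of this example rather than anything required by the patches — a client could exercise the endpoint like this:

```python
# Sketch of a client-side call to the POST /api/v1/tiles/ endpoint added above.
# SERVER_URL and "my_multivec" are placeholders, not values from the patches.
import requests

SERVER_URL = "http://localhost:8000"  # hypothetical higlass-server instance

body = [
    {
        "tilesetUid": "my_multivec",   # hypothetical multivec tileset UID
        "tileIds": ["0.0", "0.1"],     # tile ID suffixes; the view prepends the tileset UID
        "options": {
            # Each sub-array (or bare integer) lists the rows collapsed into one output row,
            # matching the multivecRowAggregationOptions schema.
            "aggGroups": [[0, 1], [2, 3, 4], [5, 6]],
            # One of: sum, mean, median, std, var, min, max
            "aggFunc": "sum",
        },
    }
]

resp = requests.post(f"{SERVER_URL}/api/v1/tiles/", json=body)
resp.raise_for_status()
tiles = resp.json()
# Aggregated tile data is keyed by "<tilesetUid>.<tileId>", e.g. "my_multivec.0.0",
# just as the tests read content['chr21_KL.0.0']['dense'].
print(list(tiles.keys()))
```

One design point worth noting from the final view code: the `options` object is hashed with MD5 and the digest is appended to the Redis cache key, so the same tile requested with different aggregation groupings or functions is cached under distinct keys and never collides with the plain GET-path cache entry for that tile.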