diff --git a/tardis/apps/search/api.py b/tardis/apps/search/api.py index 8ae266712..3e3235d12 100644 --- a/tardis/apps/search/api.py +++ b/tardis/apps/search/api.py @@ -1,4 +1,4 @@ -# pylint: disable=C0302,R1702 +# pylint: disable=C0302 """ RESTful API for MyTardis search. Implemented with Tastypie. @@ -6,15 +6,13 @@ .. moduleauthor:: Manish Kumar .. moduleauthor:: Mike Laverick """ -from datetime import datetime import json from django.conf import settings -from django.template.defaultfilters import filesizeformat import pytz from django_elasticsearch_dsl.search import Search -from elasticsearch_dsl import MultiSearch, Q +from elasticsearch_dsl import MultiSearch from tastypie import fields from tastypie.exceptions import ImmediateHttpResponse from tastypie.http import HttpUnauthorized @@ -30,7 +28,17 @@ ParameterName, ) from tardis.apps.projects.models import Project - +from .utils.api import ( + create_user_and_group_query, + query_keywords_and_metadata, + query_apply_filters, + query_add_sorting, + cleaning_acls, + cleaning_ids, + cleaning_preload, + cleaning_parent_filter, + cleaning_results, +) LOCAL_TZ = pytz.timezone(settings.TIME_ZONE) RESULTS_PER_PAGE = settings.RESULTS_PER_PAGE @@ -62,6 +70,8 @@ def to_json(self, data, options=None): class SearchObject(object): + """Basic TastyPie API object to hold search results""" + def __init__(self, hits=None, total_hits=None, id=None): self.hits = hits self.total_hits = total_hits @@ -69,6 +79,8 @@ def __init__(self, hits=None, total_hits=None, id=None): class SchemasObject(object): + """Basic TastyPie API object to hold schemas for filter bar population""" + def __init__(self, schemas=None, id=None): self.schemas = schemas self.id = id @@ -96,6 +108,13 @@ def detail_uri_kwargs(self, bundle_or_obj): return kwargs def get_object_list(self, request): + """ + Populates the API response with schemas and metadata fields that + a user can access. 
+ TODO: Probably separate out PUBLIC_DATA schemas + """ + + # if a user is not logged in, return empty for their schemas if not request.user.is_authenticated: result_dict = { "project": None, @@ -104,40 +123,30 @@ def get_object_list(self, request): "datafile": None, } return [SchemasObject(id=1, schemas=result_dict)] - result_dict = { - "project": [ - *{ - *Project.safe.all(user=request.user) - .prefetch_related("projectparameterset") - .values_list("projectparameterset__schema__id", flat=True) - } - ], - "experiment": [ - *{ - *Experiment.safe.all(user=request.user) - .prefetch_related("experimentparameterset") - .values_list("experimentparameterset__schema__id", flat=True) - } - ], - "dataset": [ - *{ - *Dataset.safe.all(user=request.user) - .prefetch_related("datasetparameterset") - .values_list("datasetparameterset__schema__id", flat=True) - } - ], - "datafile": [ + + # pull out schema IDs for all accessible objects for a user + result_dict = {} + for string, model in { + "project": Project, + "experiment": Experiment, + "dataset": Dataset, + "datafile": DataFile, + }.items(): + result_dict[string] = [ *{ - *DataFile.safe.all(user=request.user) - .prefetch_related("datafileparameterset") - .values_list("datafileparameterset__schema__id", flat=True) + *model.safe.all(user=request.user) + .prefetch_related(string + "parameterset") + .values_list(string + "parameterset__schema__id", flat=True) } - ], - } + ] + + # create a return dictionary of schemas and their non-sensitive metadata fields safe_dict = {} + # iterate over accessible schemas for key, val in result_dict.items(): safe_dict[key] = {} for value in val: + # if object type has schemas, add them to safe_dict if value is not None: schema_id = str(value) schema_dict = { @@ -146,6 +155,7 @@ def get_object_list(self, request): "schema_name": Schema.objects.get(id=value).name, "parameters": {}, } + # get parameter_names associated with schema param_names = ParameterName.objects.filter( schema__id=value, sensitive=False ) @@ -166,7 +176,9 @@ def get_object_list(self, request): "full_name": param.full_name, "data_type": type_dict[param.data_type], } + # append parameter info to relevant schema schema_dict["parameters"][param_id] = param_dict + # add completed schema to schema_dict ready for return safe_dict[key][schema_id] = schema_dict return [SchemasObject(id=1, schemas=safe_dict)] @@ -220,24 +232,35 @@ def create_search_results(self, bundle): # return [SearchObject(id=1, hits=result_dict)] groups = user.groups.all() + # This holds the "text" from all the object specific "keyword" search bars, + # which may ALL have been populated with the same text via the menubar search bar query_text = bundle.data.get("query", None) + # This holds all of the intrinsic and schema-specific metadata filters per object filters = bundle.data.get("filters", None) + # result specific bundles for pagination and sorting request_sorting = bundle.data.get("sort", None) request_size = bundle.data.get("size", 20) request_offset = bundle.data.get("offset", 0) + # result specific bundle to trigger a object specific search update request_type = bundle.data.get("type", None) - # Mock input - # request_for_pag = True - # if request_for_pag: - # request_offset = 37 - # request_size = 50 - # request_sorting = [#{ 'field': ["title"], 'order': "desc" }, - # #{ 'field': ["experiments","title"], 'order': "desc" }, - # { 'field': ["size"], 'order': "desc" }] - # request_type = 'datafile' + + """Mock input + request_for_pag = True + if request_for_pag: + request_offset = 
37 + request_size = 50 + request_sorting = [#{ 'field': ["title"], 'order': "desc" }, + #{ 'field': ["experiments","title"], 'order': "desc" }, + { 'field': ["size"], 'order': "desc" }] + request_type = 'datafile' + """ + + # if API request object type isn't specified default to all object types if request_type is None: index_list = ["project", "experiment", "dataset", "datafile"] - match_list = ["name", "title", "description", "filename"] + title_list = ["name", "title", "description", "filename"] + # If API request object type is specified then specify object type + parent + # heirarchy object types, and their intrinsic "title" field names else: # probably some nicer structure/way to do this type_2_list = { @@ -256,505 +279,41 @@ def create_search_results(self, bundle): }, } index_list = type_2_list[request_type]["index"] - match_list = type_2_list[request_type]["match"] + title_list = type_2_list[request_type]["match"] + # Numerically specify the order of heirarchy hierarchy = {"project": 4, "experiment": 3, "dataset": 2, "datafile": 1} + # Define a numerical filter_level for objects, below which we enforce a + # parent-must-be-in-results criteria filter_level = 0 + + # create multisearch object to search all 4 objects in parallel ms = MultiSearch(index=index_list) + + # iterate over object types required in this search request for idx, obj in enumerate(index_list): - # (1) add user/group criteria to searchers - query_obj = Q( - { - "nested": { - "path": "acls", - "query": Q( - { - "bool": { - "must": [ - Q({"match": {"acls.entityId": user.id}}), - Q({"term": {"acls.pluginId": "django_user"}}), - ] - } - } - ), - } - } - ) - for group in groups: - query_obj_group = Q( - { - "nested": { - "path": "acls", - "query": Q( - { - "bool": { - "must": [ - Q({"match": {"acls.entityId": group.id}}), - Q( - { - "term": { - "acls.pluginId": "django_group" - } - } - ), - ] - } - } - ), - } - } - ) - query_obj = query_obj | query_obj_group - # (2) Search on title/keywords + on non-sensitive metadata + # add user/group criteria to searchers + query_obj = create_user_and_group_query(user=user, groups=groups) + + # Search on title/keywords + on non-sensitive metadata if query_text is not None: + # parent-child filter isn't enforced here right now # if filter_level < hierarchy[obj]: # filter_level = hierarchy[obj] if obj in query_text.keys(): - query_obj_text = Q({"match": {match_list[idx]: query_text[obj]}}) - query_obj_text_meta = Q( - { - "nested": { - "path": "parameters.string", - "query": Q( - { - "bool": { - "must": [ - Q( - { - "match": { - "parameters.string.value": query_text[ - obj - ] - } - } - ), - Q( - { - "term": { - "parameters.string.sensitive": False - } - } - ), - ] - } - } - ), - } - } + query_obj = query_keywords_and_metadata( + query_obj, query_text, obj, idx, title_list ) - query_obj_text_meta = query_obj_text | query_obj_text_meta - query_obj = query_obj & query_obj_text_meta - # (3) Apply intrinsic filters + metadata filters to search + # Apply intrinsic filters + metadata filters to search if filters is not None: # filter_op = filters['op'] This isn't used for now - filterlist = filters["content"] - operator_dict = { - "is": "term", - "contains": "match", - ">=": "gte", - "<=": "lte", - } - num_2_type = { - 1: "experiment", - 2: "dataset", - 3: "datafile", - 6: "project", - } - for filter in filterlist: - oper = operator_dict[filter["op"]] - - # (3.1) Apply Schema-parameter / metadata filters to search - if filter["kind"] == "schemaParameter": - schema_id, param_id = 
filter["target"][0], filter["target"][1] - # check filter is applied to correct object type - if num_2_type[Schema.objects.get(id=schema_id).type] == obj: - if filter_level < hierarchy[obj]: - filter_level = hierarchy[obj] - if filter["type"] == "STRING": - # check if filter query is list of options, or single value - # (elasticsearch can actually handle delimiters in a single string...) - if isinstance(filter["content"], list): - Qdict = {"should": []} - for option in filter["content"]: - qry = Q( - { - "nested": { - "path": "parameters.string", - "query": Q( - { - "bool": { - "must": [ - Q( - { - "match": { - "parameters.string.pn_id": str( - param_id - ) - } - } - ), - Q( - { - oper: { - "parameters.string.value": option - } - } - ), - Q( - { - "term": { - "parameters.string.sensitive": False - } - } - ), - ] - } - } - ), - } - } - ) - Qdict["should"].append(qry) - query_obj_filt = Q({"bool": Qdict}) - else: - query_obj_filt = Q( - { - "nested": { - "path": "parameters.string", - "query": Q( - { - "bool": { - "must": [ - Q( - { - "match": { - "parameters.string.pn_id": str( - param_id - ) - } - } - ), - Q( - { - oper: { - "parameters.string.value": filter[ - "content" - ] - } - } - ), - Q( - { - "term": { - "parameters.string.sensitive": False - } - } - ), - ] - } - } - ), - } - } - ) - elif filter["type"] == "NUMERIC": - query_obj_filt = Q( - { - "nested": { - "path": "parameters.numerical", - "query": Q( - { - "bool": { - "must": [ - Q( - { - "match": { - "parameters.numerical.pn_id": str( - param_id - ) - } - } - ), - Q( - { - "range": { - "parameters.numerical.value": { - oper: filter[ - "content" - ] - } - } - } - ), - Q( - { - "term": { - "parameters.string.sensitive": False - } - } - ), - ] - } - } - ), - } - } - ) - elif filter["type"] == "DATETIME": - query_obj_filt = Q( - { - "nested": { - "path": "parameters.datetime", - "query": Q( - { - "bool": { - "must": [ - Q( - { - "match": { - "parameters.datetime.pn_id": str( - param_id - ) - } - } - ), - Q( - { - "range": { - "parameters.datetime.value": { - oper: filter[ - "content" - ] - } - } - } - ), - Q( - { - "term": { - "parameters.string.sensitive": False - } - } - ), - ] - } - } - ), - } - } - ) - query_obj = query_obj & query_obj_filt - - # (3.2) Apply intrinsic object filters to search - if filter["kind"] == "typeAttribute": - target_objtype, target_fieldtype = ( - filter["target"][0], - filter["target"][1], - ) - if target_objtype == obj: - # Update the heirarchy level at which the - # "parent-in-results" criteria must be applied - if filter_level < hierarchy[obj]: - filter_level = hierarchy[obj] - - # (3.2.1) Apply "Selected Schema" filter - if target_fieldtype == "schema": - # check if filter query is list of options, or single value - if isinstance(filter["content"], list): - Qdict = {"should": []} - for option in filter["content"]: - qry = Q( - { - "nested": { - "path": "parameters.schemas", - "query": Q( - { - oper: { - "parameters.schemas.schema_id": option - } - } - ), - } - } - ) - Qdict["should"].append(qry) - query_obj_filt = Q({"bool": Qdict}) - else: - query_obj_filt = Q( - { - "nested": { - "path": "parameters.schemas", - "query": Q( - { - oper: { - "parameters.schemas.schema_id": filter[ - "content" - ] - } - } - ), - } - } - ) - query_obj = query_obj & query_obj_filt - - # (3.2.2) Apply filters that act on fields which are - # intrinsic to the object (Proj,exp,set,file) - if target_fieldtype in { - "name", - "description", - "title", - "tags", - "filename", - "file_extension", - "created_time", - 
"start_time", - "end_time", - }: - if filter["type"] == "STRING": - if isinstance(filter["content"], list): - Qdict = {"should": []} - for option in filter["content"]: - if target_fieldtype == "file_extension": - if option[0] == ".": - option = option[1:] - qry = Q({oper: {target_fieldtype: option}}) - Qdict["should"].append(qry) - query_obj_filt = Q({"bool": Qdict}) - else: - if target_fieldtype == "file_extension": - if filter["content"][0] == ".": - filter["content"] = filter["content"][ - 1: - ] - query_obj_filt = Q( - { - oper: { - target_fieldtype: filter["content"] - } - } - ) - elif filter["type"] == "DATETIME": - query_obj_filt = Q( - { - "range": { - target_fieldtype: { - oper: filter["content"] - } - } - } - ) - query_obj = query_obj & query_obj_filt - - # (3.2.3) Apply filters that act on fields which are - # intrinsic to related objects (instruments, users, etc) - if target_fieldtype in { - "principal_investigator", - "projects", - "instrument", - "institution", - "experiments", - "dataset", - }: - nested_fieldtype = filter["target"][2] - if isinstance(filter["content"], list): - Qdict = {"should": []} - for option in filter["content"]: - qry = Q( - { - "nested": { - "path": target_fieldtype, - "query": Q( - { - oper: { - ".".join( - [ - target_fieldtype, - nested_fieldtype, - ] - ): option - } - } - ), - } - } - ) - Qdict["should"].append(qry) - query_obj_filt = Q({"bool": Qdict}) - else: - query_obj_filt = Q( - { - "nested": { - "path": target_fieldtype, - "query": Q( - { - oper: { - ".".join( - [ - target_fieldtype, - nested_fieldtype, - ] - ): filter["content"] - } - } - ), - } - } - ) - # Special handling for list of principal investigators - if target_fieldtype == "principal_investigator": - Qdict_lr = {"should": [query_obj_filt]} - if isinstance(filter["content"], list): - Qdict = {"should": []} - for option in filter["content"]: - qry = Q( - { - "nested": { - "path": target_fieldtype, - "query": Q( - { - "term": { - ".".join( - [ - target_fieldtype, - "username", - ] - ): option - } - } - ), - } - } - ) - Qdict["should"].append(qry) - query_obj_filt = Q({"bool": Qdict}) - else: - query_obj_filt = Q( - { - "nested": { - "path": target_fieldtype, - "query": Q( - { - "term": { - ".".join( - [ - target_fieldtype, - "username", - ] - ): filter["content"] - } - } - ), - } - } - ) - Qdict_lr["should"].append(query_obj_filt) - query_obj_filt = Q({"bool": Qdict_lr}) - query_obj = query_obj & query_obj_filt - - # (4) Define fields not to return in the search results (for brevity) + query_obj, filter_level = query_apply_filters( + query_obj, filters, obj, filter_level, hierarchy + ) + + # Define fields not to return in the search results excluded_fields_list = [ "end_time", "institution", @@ -770,58 +329,19 @@ def create_search_results(self, bundle): "parameters.datetime.pn_id", "acls", ] + # "description" field is crucial for datasets, but too verbose for experiments if obj != "dataset": excluded_fields_list.append("description") - ######TODO (5) Do some sorting - # Default sorting + # Apply sorting filters based upon request and defaults sort_dict = {} - if request_sorting is not None: - if obj in request_sorting: - for sort in request_sorting[obj]: - if len(sort["field"]) > 1: - if sort["field"][-1] in { - "fullname", - "name", - "title", - "description", - "filename", - }: - search_field = ".".join(sort["field"]) + ".raw" - else: - search_field = ".".join(sort["field"]) - sort_dict[search_field] = { - "order": sort["order"], - "nested_path": ".".join(sort["field"][:-1]), - } 
- - if len(sort["field"]) == 1: - if sort["field"][0] in { - "principal_investigator", - "name", - "title", - "description", - "filename", - }: - sort_dict[sort["field"][0] + ".raw"] = { - "order": sort["order"] - } - elif sort["field"][0] == "size": - if obj == "datafile": - sort_dict[sort["field"][0]] = { - "order": sort["order"] - } - else: - # DO SOME SORTING AFTER ELASTICSEARCH - pass - else: - sort_dict[sort["field"][0]] = {"order": sort["order"]} + sort_dict = query_add_sorting(request_sorting, obj, sort_dict) # If sort dict is still empty even after filters, add in the defaults if not sort_dict: - sort_dict = {match_list[idx] + ".raw": {"order": "asc"}} + sort_dict = {title_list[idx] + ".raw": {"order": "asc"}} - # (6) Add the search to the multi-search object, ready for execution + # Finally, add the search to the multi-search object, ready for execution ms = ms.add( Search(index=obj) .sort(sort_dict) @@ -830,6 +350,7 @@ def create_search_results(self, bundle): .source(excludes=excluded_fields_list) ) + # execute the multi-search object and return results results = ms.execute() # -------------------- @@ -837,112 +358,13 @@ def create_search_results(self, bundle): # -------------------- # load in object IDs for all objects a user has sensitive access to - # projects_sens = {*Project.safe.all(user, viewsensitive=True).values_list("id", flat=True)} - projects_sens_query = ( - user.projectacls.select_related("project") - .filter(canSensitive=True) - .exclude( - effectiveDate__gte=datetime.today(), expiryDate__lte=datetime.today() - ) - .values_list("project__id", flat=True) - ) - for group in groups: - projects_sens_query |= ( - group.projectacls.select_related("project") - .filter(canSensitive=True) - .exclude( - effectiveDate__gte=datetime.today(), - expiryDate__lte=datetime.today(), - ) - .values_list("project__id", flat=True) - ) - projects_sens = [*projects_sens_query.distinct()] - - # experiments_sens = {*Experiment.safe.all(user, viewsensitive=True).values_list("id", flat=True)} - experiments_sens_query = ( - user.experimentacls.select_related("experiment") - .filter(canSensitive=True) - .exclude( - effectiveDate__gte=datetime.today(), expiryDate__lte=datetime.today() - ) - .values_list("experiment__id", flat=True) - ) - for group in groups: - experiments_sens_query |= ( - group.experimentacls.select_related("experiment") - .filter(canSensitive=True) - .exclude( - effectiveDate__gte=datetime.today(), - expiryDate__lte=datetime.today(), - ) - .values_list("experiment__id", flat=True) - ) - experiments_sens = [*experiments_sens_query.distinct()] - - # datasets_sens = {*Dataset.safe.all(user, viewsensitive=True).values_list("id", flat=True)} - datasets_sens_query = ( - user.datasetacls.select_related("dataset") - .filter(canSensitive=True) - .exclude( - effectiveDate__gte=datetime.today(), expiryDate__lte=datetime.today() - ) - .values_list("dataset__id", flat=True) - ) - for group in groups: - datasets_sens_query |= ( - group.datasetacls.select_related("dataset") - .filter(canSensitive=True) - .exclude( - effectiveDate__gte=datetime.today(), - expiryDate__lte=datetime.today(), - ) - .values_list("dataset__id", flat=True) - ) - datasets_sens = [*datasets_sens_query.distinct()] - - # datafiles_sens = {*DataFile.safe.all(user, viewsensitive=True).values_list("id", flat=True)} - datafiles_sens_query = ( - user.datafileacls.select_related("datafile") - .filter(canSensitive=True) - .exclude( - effectiveDate__gte=datetime.today(), expiryDate__lte=datetime.today() - ) - 
.values_list("datafile__id", flat=True) - ) - for group in groups: - datafiles_sens_query |= ( - group.datafileacls.select_related("datafile") - .filter(canSensitive=True) - .exclude( - effectiveDate__gte=datetime.today(), - expiryDate__lte=datetime.today(), - ) - .values_list("datafile__id", flat=True) - ) - datafiles_sens = [*datafiles_sens_query.distinct()] + projects_sens = cleaning_acls(user, groups, "project", canSensitive=True) + experiments_sens = cleaning_acls(user, groups, "experiment", canSensitive=True) + datasets_sens = cleaning_acls(user, groups, "dataset", canSensitive=True) + datafiles_sens = cleaning_acls(user, groups, "datafile", canSensitive=True) # load in datafile IDs for all datafiles a user has download access to - # datafiles_dl = {*DataFile.safe.all(user, downloadable=True).values_list("id", flat=True)} - - datafiles_dl_query = ( - user.datafileacls.select_related("datafile") - .filter(canDownload=True) - .exclude( - effectiveDate__gte=datetime.today(), expiryDate__lte=datetime.today() - ) - .values_list("datafile__id", flat=True) - ) - for group in groups: - datafiles_dl_query |= ( - group.datafileacls.select_related("datafile") - .filter(canDownload=True) - .exclude( - effectiveDate__gte=datetime.today(), - expiryDate__lte=datetime.today(), - ) - .values_list("datafile__id", flat=True) - ) - datafiles_dl = [*datafiles_dl_query.distinct()] + datafiles_dl = cleaning_acls(user, groups, "datafile", canDownload=True) # re-structure into convenient dictionary preloaded = { @@ -951,318 +373,42 @@ def create_search_results(self, bundle): "dataset": {"sens_list": datasets_sens, "objects": {}}, "datafile": {"sens_list": datafiles_sens, "objects": {}}, } + # load in object IDs for all objects a user has read access to, # and IDs for all of the object's nested-children - regardless of user # access to these child objects (the access check come later) - # projects_values = ["id", "experiment__id", "experiment__datasets__id", - # "experiment__datasets__datafile__id"] - # projects = [*Project.safe.all(user).values_list(*projects_values)] - - projects_query = ( - user.projectacls.select_related("project") - .prefetch_related( - "project__experiments", - "project__experiments__datasets", - "project__experiments__datasets__datafile", - ) - .exclude( - effectiveDate__gte=datetime.today(), expiryDate__lte=datetime.today() - ) - .values_list( - "project__id", - "project__experiments__id", - "project__experiments__datasets__id", - "project__experiments__datasets__datafile__id", - ) + projects = cleaning_ids(user, groups, "project") + experiments = cleaning_ids(user, groups, "experiment") + datasets = cleaning_ids(user, groups, "dataset") + datafiles = cleaning_ids(user, groups, "datafile") + + # add data to preloaded["objects"] dictionary with ID as key + # and nested items as value - key/values. 
+ preloaded = cleaning_preload( + preloaded, projects, experiments, datasets, datafiles ) - for group in groups: - projects_query |= ( - group.projectacls.select_related("project") - .prefetch_related( - "project__experiments", - "project__experiments__datasets", - "project__experiments__datasets__datafile", - ) - .exclude( - effectiveDate__gte=datetime.today(), - expiryDate__lte=datetime.today(), - ) - .values_list( - "project__id", - "project__experiments__id", - "project__experiments__datasets__id", - "project__experiments__datasets__datafile__id", - ) - ) - projects = [*projects_query.distinct()] - - # experiments_values = ["id", "datasets__id", "datasets__datafile__id"] - # experiments = [*Experiment.safe.all(user).values_list(*experiments_values)] - - experiments_query = ( - user.experimentacls.select_related("experiment") - .prefetch_related("experiment__datasets", "experiment__datasets__datafile") - .exclude( - effectiveDate__gte=datetime.today(), expiryDate__lte=datetime.today() - ) - .values_list( - "experiment__id", - "experiment__datasets__id", - "experiment__datasets__datafile__id", - ) - ) - for group in groups: - experiments_query |= ( - group.experimentacls.select_related("experiment") - .prefetch_related( - "experiment__datasets", "experiment__datasets__datafile" - ) - .exclude( - effectiveDate__gte=datetime.today(), - expiryDate__lte=datetime.today(), - ) - .values_list( - "experiment__id", - "experiment__datasets__id", - "experiment__datasets__datafile__id", - ) - ) - experiments = [*experiments_query.distinct()] - - # datasets = [*Dataset.safe.all(user).prefetch_related("datafile").values_list("id", "datafile__id")] - datasets_query = ( - user.datasetacls.select_related("dataset") - .prefetch_related("dataset__datafile") - .exclude( - effectiveDate__gte=datetime.today(), expiryDate__lte=datetime.today() - ) - .values_list("dataset__id", "dataset__datafile__id") - ) - for group in groups: - datasets_query |= ( - group.datasetacls.select_related("dataset") - .exclude( - effectiveDate__gte=datetime.today(), - expiryDate__lte=datetime.today(), - ) - .values_list("dataset__id", "dataset__datafile__id") - ) - datasets = [*datasets_query.distinct()] - - # datafiles = [*DataFile.safe.all(user).values_list("id", "size")] - datafiles_query = ( - user.datafileacls.select_related("datafile") - .exclude( - effectiveDate__gte=datetime.today(), expiryDate__lte=datetime.today() - ) - .values_list("datafile__id", "datafile__size") - ) - for group in groups: - datafiles_query |= ( - group.datafileacls.select_related("datafile") - .exclude( - effectiveDate__gte=datetime.today(), - expiryDate__lte=datetime.today(), - ) - .values_list("datafile__id", "datafile__size") - ) - datafiles = [*datafiles_query.distinct()] - - # add data to preloaded["objects"] dictionary with ID as key and nested items as value - key/values. - # Probably a cleaner/simpler way to do this, but hey ho! 
- for key, value in { - "project": projects, - "experiment": experiments, - "dataset": datasets, - "datafile": datafiles, - }.items(): - for item in value: - name = item[0] - if name in preloaded[key]["objects"]: - if key == "dataset": - preloaded[key]["objects"][name]["dfs"].add(item[1]) - elif key == "experiment": - preloaded[key]["objects"][name]["sets"].add(item[1]) - preloaded[key]["objects"][name]["dfs"].add(item[2]) - elif key == "project": - preloaded[key]["objects"][name]["exps"].add(item[1]) - preloaded[key]["objects"][name]["sets"].add(item[2]) - preloaded[key]["objects"][name]["dfs"].add(item[3]) - else: - new_dict = {} - if key == "datafile": - new_dict["size"] = item[1] - elif key == "dataset": - new_dict["dfs"] = {item[1]} - elif key == "experiment": - new_dict["sets"] = {item[1]} - new_dict["dfs"] = {item[2]} - elif key == "project": - new_dict["exps"] = {item[1]} - new_dict["sets"] = {item[2]} - new_dict["dfs"] = {item[3]} - preloaded[key]["objects"][name] = new_dict # Create the result object which will be returned to the front-end result_dict = {k: [] for k in ["project", "experiment", "dataset", "datafile"]} # If filters are active, enforce the "parent in results" criteria on relevant objects if filter_level: - # Define parent_type for experiment/datafile (N/A for project, hardcoded for dataset) - parent_child = {"experiment": "projects", "dataset": "experiments"} - # Define hierarchy of types for filter levels - hierarch = [3, 2, 1] # {"experiments":3, "datasets":2, "datafiles":1} - for idx, item in enumerate(results[1:]): - # if active filter level higher than current object type: apply "parent-in-result" filter - if hierarch[idx] < filter_level: - parent_ids = [ - objj["_source"]["id"] for objj in results[idx].hits.hits - ] - parent_ids_set = {*parent_ids} - - for obj_idx, obj in reversed([*enumerate(item.hits.hits)]): - if obj["_index"] != "datafile": - parent_es_ids = [ - parent["id"] - for parent in obj["_source"][ - parent_child[obj["_index"]] - ] - ] - if not any(itemm in parent_es_ids for itemm in parent_ids): - results[idx + 1].hits.hits.pop(obj_idx) - else: - if ( - obj["_source"]["dataset"]["id"] not in parent_ids_set - ): # parent object is idx-1, but idx in enumerate is already shifted by -1, so straight idx - results[idx + 1].hits.hits.pop(obj_idx) + results = cleaning_parent_filter(results, filter_level) + # Count the number of search results after elasticsearch + parent filtering total_hits = { index_list[idx]: len(type.hits.hits) for idx, type in enumerate(results) } + # Pagination done before final cleaning to reduce "clean_parent_ids" duration + # Default Pagination handled by response.get if key isn't specified for item in results: item.hits.hits = item.hits.hits[ request_offset : (request_offset + request_size) ] - # Pagination done before final cleaning to reduce "clean_parent_ids" duration - # Default Pagination handled by response.get if key isn't specified - # result_dict = {k:v[request_offset:(request_offset+request_size)] for k,v in result_dict.items()} - # Clean and prepare the results "hit" objects and append them to the results_dict - for item in results: - for hit_attrdict in item.hits.hits: - hit = hit_attrdict.to_dict() - - # Check to see if indexed object actually exists in DB, if not then skip - if int(hit["_source"]["id"]) not in preloaded[hit["_index"]]["objects"]: - continue - - # Default sensitive permission and size of object - sensitive_bool = False - size = 0 - # If user/group has sensitive permission, update flag - if 
hit["_source"]["id"] in preloaded[hit["_index"]]["sens_list"]: - sensitive_bool = True - # Re-package parameters into single parameter list - param_list = [] - if "string" in hit["_source"]["parameters"]: - param_list.extend(hit["_source"]["parameters"]["string"]) - if "numerical" in hit["_source"]["parameters"]: - param_list.extend(hit["_source"]["parameters"]["numerical"]) - if "datetime" in hit["_source"]["parameters"]: - param_list.extend(hit["_source"]["parameters"]["datetime"]) - hit["_source"]["parameters"] = param_list - # Remove unused fields to reduce data sent to front-end - hit.pop("_score") - hit.pop("_id") - # hit.pop("_type") - hit.pop("sort") - - # Get count of all nested objects and download status - if hit["_index"] == "datafile": - if hit["_source"]["id"] in datafiles_dl: - hit["_source"]["userDownloadRights"] = "full" - size = hit["_source"]["size"] - else: - hit["_source"]["userDownloadRights"] = "none" - - else: - safe_nested_dfs_set = { - *preloaded["datafile"]["objects"] - }.intersection( - preloaded[hit["_index"]]["objects"][hit["_source"]["id"]]["dfs"] - ) - safe_nested_dfs_count = len(safe_nested_dfs_set) - if hit["_index"] in {"project", "experiment"}: - safe_nested_set = len( - {*preloaded["dataset"]["objects"]}.intersection( - preloaded[hit["_index"]]["objects"][ - hit["_source"]["id"] - ]["sets"] - ) - ) - # Ugly hack, should do a nicer, less verbose loop+type detection - if hit["_index"] == "project": - safe_nested_exp = len( - {*preloaded["experiment"]["objects"]}.intersection( - preloaded[hit["_index"]]["objects"][ - hit["_source"]["id"] - ]["exps"] - ) - ) - hit["_source"]["counts"] = { - "experiments": safe_nested_exp, - "datasets": safe_nested_set, - "datafiles": (safe_nested_dfs_count), - } - if hit["_index"] == "experiment": - hit["_source"]["counts"] = { - "datasets": safe_nested_set, - "datafiles": safe_nested_dfs_count, - } - if hit["_index"] == "dataset": - hit["_source"]["counts"] = {"datafiles": safe_nested_dfs_count} - # Get downloadable datafiles ultimately belonging to this "hit" object - # and calculate the total size of these files - safe_nested_dfs_dl = [ - *safe_nested_dfs_set.intersection(datafiles_dl) - ] - size = sum( - ( - preloaded["datafile"]["objects"][id]["size"] - for id in safe_nested_dfs_dl - ) - ) - # Determine the download state of the "hit" object - # safe_nested_dfs_dl_bool = [id in datafiles_dl for id in safe_nested_dfs] - if safe_nested_dfs_set.issubset(datafiles_dl): - hit["_source"]["userDownloadRights"] = "full" - elif safe_nested_dfs_set.intersection(datafiles_dl): - hit["_source"]["userDownloadRights"] = "partial" - else: - hit["_source"]["userDownloadRights"] = "none" - - hit["_source"]["size"] = filesizeformat(size) - - # if no sensitive access, remove sensitive metadata from response - for idxx, parameter in reversed( - [*enumerate(hit["_source"]["parameters"])] - ): - if not sensitive_bool: - if parameter["sensitive"]: - hit["_source"]["parameters"].pop(idxx) - else: - hit["_source"]["parameters"][idxx].pop("sensitive") - else: - if not parameter["sensitive"]: - hit["_source"]["parameters"][idxx].pop("sensitive") - - # Append hit to results if not already in results. 
- # Due to non-identical scores in hits for non-sensitive vs sensitive search, - # we require a more complex comparison than just 'is in' as hits are not identical - # if hit["_source"]['id'] not in [objj["_source"]['id'] for objj in result_dict[hit["_index"]+"s"]]: - result_dict[hit["_index"]].append(hit) + result_dict = cleaning_results(results, result_dict, preloaded, datafiles_dl) # Removes parent IDs from hits once parent-filtering applied # Removed for tidiness in returned response to front-end diff --git a/tardis/apps/search/documents.py b/tardis/apps/search/documents.py index 6657c3cc5..9382b2b66 100644 --- a/tardis/apps/search/documents.py +++ b/tardis/apps/search/documents.py @@ -3,10 +3,10 @@ from django.conf import settings from django.contrib.auth.models import User from django.db.models.signals import post_delete - -from elasticsearch_dsl import analyzer, token_filter from django_elasticsearch_dsl import Document, fields from django_elasticsearch_dsl.registries import registry +from elasticsearch_dsl import analyzer, token_filter + from tardis.tardis_portal.models import ( Experiment, @@ -24,14 +24,18 @@ DatafileParameter, DatafileParameterSet, ) - from tardis.apps.projects.models import ( Project, ProjectParameter, ProjectParameterSet, ProjectACL, ) - +from .utils.documents import ( + generic_acl_structure, + generic_parameter_structure, + prepare_generic_acls, + prepare_generic_parameters, +) logger = logging.getLogger(__name__) @@ -58,196 +62,6 @@ ) -def generic_acl_structure(): - """ - Return the ES structure of an ACL. - - - pluginId = type of ACL owner: user/group/token - - entityId = ID of the owner - """ - return fields.NestedField( - properties={ - "pluginId": fields.KeywordField(), - "entityId": fields.KeywordField(), - "canDownload": fields.BooleanField(), - "canSensitive": fields.BooleanField(), - } - ) - - -def generic_parameter_structure(): - """ - Return the ES structure of object parameters and schema. - The parameter structure splits out string/numerical/datetime - parameters so that ES can specifically handle each of their - datatypes. - - - Schemas: - - schema_id: Id of the object schemas - - string/numerical/datetime: - - pn_id: Id of parameter name - - pn_name: Name of parameter name - - value: value of parameter - - sensitive: whether parameter name is sensitive - """ - return fields.NestedField( - properties={ - "string": fields.NestedField( - properties={ - "pn_id": fields.KeywordField(), - "pn_name": fields.KeywordField(), - "value": fields.TextField(), - "sensitive": fields.BooleanField(), - } - ), - "numerical": fields.NestedField( - properties={ - "pn_id": fields.KeywordField(), - "pn_name": fields.KeywordField(), - "value": fields.FloatField(), - "sensitive": fields.BooleanField(), - } - ), - "datetime": fields.NestedField( - properties={ - "pn_id": fields.KeywordField(), - "pn_name": fields.KeywordField(), - "value": fields.DateField(), - "sensitive": fields.BooleanField(), - } - ), - "schemas": fields.NestedField( - properties={"schema_id": fields.KeywordField()} - ), - }, - ) - - -def prepare_generic_acls_build(INSTANCE_ACL_SET, return_list): - """Returns the ACLs associated with this - object, formatted for elasticsearch. 
- """ - for acl in INSTANCE_ACL_SET: - acl_dict = {} - if acl["user__id"] is not None: - acl_dict["pluginId"] = "django_user" - acl_dict["entityId"] = acl["user__id"] - if acl["group__id"] is not None: - acl_dict["pluginId"] = "django_group" - acl_dict["entityId"] = acl["group__id"] - if acl["token__id"] is not None: - # token access shouldn't be added to search - # unless search is given a way of checking token expiry - continue - # add in permission booleans - acl_dict["canDownload"] = acl["canDownload"] - acl_dict["canSensitive"] = acl["canSensitive"] - if acl_dict not in return_list: - return_list.append(acl_dict) - - -def prepare_generic_acls(type, INSTANCE_ACL_SET, INSTANCE_EXPS=None): - """Returns the ACLs associated with this - object, formatted for elasticsearch. - - This function is mostly just a wrapper around "prepare_generic_acls_build" - to account for current macro/micro behaviour. - """ - return_list = [] - if settings.ONLY_EXPERIMENT_ACLS and type != "experiment": - for exp in INSTANCE_EXPS.all(): - prepare_generic_acls_build( - exp.experimentacl_set.select_related("user", "group", "token") - .all() - .exclude(user__id=settings.PUBLIC_USER_ID) - .values( - "user__id", - "group__id", - "token__id", - "canDownload", - "canSensitive", - ), - return_list, - ) - else: - prepare_generic_acls_build( - INSTANCE_ACL_SET.select_related("user", "group", "token") - .all() - .exclude(user__id=settings.PUBLIC_USER_ID) - .values( - "user__id", - "group__id", - "token__id", - "canDownload", - "canSensitive", - ), - return_list, - ) - return return_list - - -def prepare_generic_parameters(instance, type): - """Returns the parameters associated with the provided instance, - formatted for elasticsearch.""" - - type_dict = { - "project": ProjectParameter, - "experiment": ExperimentParameter, - "dataset": DatasetParameter, - "datafile": DatafileParameter, - } - OBJPARAMETERS = type_dict[type] - - # get list of object parametersets - paramsets = list(instance.getParameterSets()) - parameter_groups = { - "string": [], - "numerical": [], - "datetime": [], - "schemas": [], - } - # iterate over parametersets of an object - for paramset in paramsets: - param_type = {1: "datetime", 2: "string", 3: "numerical"} - # query parameters from parameterset - param_glob = OBJPARAMETERS.objects.filter(parameterset=paramset).values_list( - "name", - "datetime_value", - "string_value", - "numerical_value", - ) - # add schema information to dict - parameter_groups["schemas"].append({"schema_id": paramset.schema_id}) - # iterate over parameter info "name/datetime/string/numerical" - for sublist in param_glob: - # query parametername info using "name" - PN = ParameterName.objects.get(id=sublist[0]) - # build dict for param - param_dict = {} - type_idx = 0 - # iterate over datetime/string/numerical info - for idx, value in enumerate(sublist[1:]): - # if datetime/string/numerical atually contains info - if value not in [None, ""]: - # add parametername info to dict - param_dict["pn_id"] = str(PN.id) - param_dict["pn_name"] = str(PN.full_name) - param_dict["sensitive"] = PN.sensitive - type_idx = idx + 1 - # detect type of param, and add value to dict - if type_idx == 1: - param_dict["value"] = value - elif type_idx == 2: - param_dict["value"] = str(value) - elif type_idx == 3: - param_dict["value"] = float(value) - # if parameter with a value is added, add param_dict to - # parameters_dict - if type_idx: - parameter_groups[param_type[type_idx]].append(param_dict) - return parameter_groups - - class 
MyTardisDocument(Document): """ Generalised class for MyTardis objects @@ -605,7 +419,8 @@ def update_es_relations(instance, **kwargs): in the django_elasticsearch_dsl package. This function simply re-indexes relevant documents a second time on post_delete. - Probably clashes with the Async CelerySignalProcessor. + Probably clashes with the Async CelerySignalProcessor, so have forced + non-compatability between this function and CelerySignalProcessor=True. """ if isinstance(instance, ProjectACL): parent = instance.project diff --git a/tardis/apps/search/utils/__init__.py b/tardis/apps/search/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tardis/apps/search/utils/api.py b/tardis/apps/search/utils/api.py new file mode 100644 index 000000000..7114f7108 --- /dev/null +++ b/tardis/apps/search/utils/api.py @@ -0,0 +1,803 @@ +""" +helper functions used in api.py +""" + +from datetime import datetime + +from django.template.defaultfilters import filesizeformat +from elasticsearch_dsl import Q + +from tardis.tardis_portal.models import ( + Schema, +) + + +def cleaning_results(results, result_dict, preloaded, datafiles_dl): + """ + Filter out and clean search result hits based on a number of critera. + """ + + for item in results: + for hit_attrdict in item.hits.hits: + hit = hit_attrdict.to_dict() + + # Check to see if indexed object actually exists in DB, if not then skip + if int(hit["_source"]["id"]) not in preloaded[hit["_index"]]["objects"]: + continue + + # Default sensitive permission and size of object + sensitive_bool = False + size = 0 + # If user/group has sensitive permission, update flag + if hit["_source"]["id"] in preloaded[hit["_index"]]["sens_list"]: + sensitive_bool = True + # Re-package parameters into single parameter list + param_list = [] + if "string" in hit["_source"]["parameters"]: + param_list.extend(hit["_source"]["parameters"]["string"]) + if "numerical" in hit["_source"]["parameters"]: + param_list.extend(hit["_source"]["parameters"]["numerical"]) + if "datetime" in hit["_source"]["parameters"]: + param_list.extend(hit["_source"]["parameters"]["datetime"]) + hit["_source"]["parameters"] = param_list + # Remove unused fields to reduce data sent to front-end + hit.pop("_score") + hit.pop("_id") + # hit.pop("_type") + hit.pop("sort") + + # Get count of all nested objects and download status + if hit["_index"] == "datafile": + if hit["_source"]["id"] in datafiles_dl: + hit["_source"]["userDownloadRights"] = "full" + size = hit["_source"]["size"] + else: + hit["_source"]["userDownloadRights"] = "none" + + else: + safe_nested_dfs_set = {*preloaded["datafile"]["objects"]}.intersection( + preloaded[hit["_index"]]["objects"][hit["_source"]["id"]]["dfs"] + ) + safe_nested_dfs_count = len(safe_nested_dfs_set) + if hit["_index"] in {"project", "experiment"}: + safe_nested_set = len( + {*preloaded["dataset"]["objects"]}.intersection( + preloaded[hit["_index"]]["objects"][hit["_source"]["id"]][ + "sets" + ] + ) + ) + # Ugly hack, should do a nicer, less verbose loop+type detection + if hit["_index"] == "project": + safe_nested_exp = len( + {*preloaded["experiment"]["objects"]}.intersection( + preloaded[hit["_index"]]["objects"][hit["_source"]["id"]][ + "exps" + ] + ) + ) + hit["_source"]["counts"] = { + "experiments": safe_nested_exp, + "datasets": safe_nested_set, + "datafiles": (safe_nested_dfs_count), + } + if hit["_index"] == "experiment": + hit["_source"]["counts"] = { + "datasets": safe_nested_set, + "datafiles": safe_nested_dfs_count, + } + if 
hit["_index"] == "dataset": + hit["_source"]["counts"] = {"datafiles": safe_nested_dfs_count} + # Get downloadable datafiles ultimately belonging to this "hit" object + # and calculate the total size of these files + safe_nested_dfs_dl = [*safe_nested_dfs_set.intersection(datafiles_dl)] + size = sum( + ( + preloaded["datafile"]["objects"][id]["size"] + for id in safe_nested_dfs_dl + ) + ) + # Determine the download state of the "hit" object + # safe_nested_dfs_dl_bool = [id in datafiles_dl for id in safe_nested_dfs] + if safe_nested_dfs_set.issubset(datafiles_dl): + hit["_source"]["userDownloadRights"] = "full" + elif safe_nested_dfs_set.intersection(datafiles_dl): + hit["_source"]["userDownloadRights"] = "partial" + else: + hit["_source"]["userDownloadRights"] = "none" + + hit["_source"]["size"] = filesizeformat(size) + + # if no sensitive access, remove sensitive metadata from response + for idxx, parameter in reversed([*enumerate(hit["_source"]["parameters"])]): + if not sensitive_bool: + if parameter["sensitive"]: + hit["_source"]["parameters"].pop(idxx) + else: + hit["_source"]["parameters"][idxx].pop("sensitive") + else: + if not parameter["sensitive"]: + hit["_source"]["parameters"][idxx].pop("sensitive") + + # Append hit to results if not already in results. + # Due to non-identical scores in hits for non-sensitive vs sensitive search, + # we require a more complex comparison than just 'is in' as hits are not identical + result_dict[hit["_index"]].append(hit) + return result_dict + + +def cleaning_parent_filter(results, filter_level): + """ + filter out hits results based upon level of parent filtering + """ + + # Define parent_type for experiment/datafile + # (N/A for project, hardcoded for dataset) + parent_child = {"experiment": "projects", "dataset": "experiments"} + # Define hierarchy of types for filter levels + hierarch = [3, 2, 1] # {"experiments":3, "datasets":2, "datafiles":1} + for idx, item in enumerate(results[1:]): + # if active filter level higher than current object type: + # apply "parent-in-result" filter + if hierarch[idx] < filter_level: + parent_ids = [objj["_source"]["id"] for objj in results[idx].hits.hits] + parent_ids_set = {*parent_ids} + + for obj_idx, obj in reversed([*enumerate(item.hits.hits)]): + if obj["_index"] != "datafile": + parent_es_ids = [ + parent["id"] + for parent in obj["_source"][parent_child[obj["_index"]]] + ] + if not any(itemm in parent_es_ids for itemm in parent_ids): + results[idx + 1].hits.hits.pop(obj_idx) + else: + if ( + obj["_source"]["dataset"]["id"] not in parent_ids_set + ): # parent object is idx-1, but idx in enumerate + # is already shifted by -1, so straight idx + results[idx + 1].hits.hits.pop(obj_idx) + return results + + +def cleaning_preload(preloaded, projects, experiments, datasets, datafiles): + """ + Populate the preload dictionary with IDs and child IDs of search results. + """ + + # Probably a cleaner/simpler way to do this, but hey ho! 
+ for otype, id_list in { + "project": projects, + "experiment": experiments, + "dataset": datasets, + "datafile": datafiles, + }.items(): + # iterate over objects in list - each list element has the obj_id + # followed by separated lists of any children objects + for ids in id_list: + # extract object ID + obj_id = ids[0] + # Check if the ID already exists in preloaded dict + if obj_id in preloaded[otype]["objects"]: + # add nested children to the dict + # Note: Datafiles don't have children so only in the else clause + if otype == "dataset": + preloaded[otype]["objects"][obj_id]["dfs"].add(ids[1]) + elif otype == "experiment": + preloaded[otype]["objects"][obj_id]["sets"].add(ids[1]) + preloaded[otype]["objects"][obj_id]["dfs"].add(ids[2]) + elif otype == "project": + preloaded[otype]["objects"][obj_id]["exps"].add(ids[1]) + preloaded[otype]["objects"][obj_id]["sets"].add(ids[2]) + preloaded[otype]["objects"][obj_id]["dfs"].add(ids[3]) + else: + # create the new dict for the object and populate with + # children (proj/exp/set) or datafile size (datafile) + new_dict = {} + if otype == "datafile": + new_dict["size"] = ids[1] + elif otype == "dataset": + new_dict["dfs"] = {ids[1]} + elif otype == "experiment": + new_dict["sets"] = {ids[1]} + new_dict["dfs"] = {ids[2]} + elif otype == "project": + new_dict["exps"] = {ids[1]} + new_dict["sets"] = {ids[2]} + new_dict["dfs"] = {ids[3]} + # assign new dict to preloaded dict + preloaded[otype]["objects"][obj_id] = new_dict + return preloaded + + +def cleaning_ids(user, groups, objtype): + """ + Function to build up generic object queries to get ID information + on objects, and specifically also the size of Datafiles. + """ + + if objtype == "project": + prefetch_fields = [ + "project__experiments", + "project__experiments__datasets", + "project__experiments__datasets__datafile", + ] + value_fields = [field + "__id" for field in prefetch_fields] + + if objtype == "experiment": + prefetch_fields = [ + "experiment__datasets", + "experiment__datasets__datafile", + ] + value_fields = [field + "__id" for field in prefetch_fields] + + if objtype == "dataset": + prefetch_fields = [ + "dataset__datafile", + ] + value_fields = [field + "__id" for field in prefetch_fields] + + if objtype == "datafile": + prefetch_fields = None + value_fields = ["datafile__size"] + + id_list = cleaning_acls( + user, + groups, + objtype, + prefetch_fields=prefetch_fields, + value_fields=value_fields, + flat=False, + ) + return id_list + + +def cleaning_acls( + user, + groups, + objtype, + canSensitive=False, + canDownload=False, + prefetch_fields=None, + value_fields=None, + flat=True, +): + """ + Function to build up generic object queries to get ACL or ID information + on objects. 
+ """ + if objtype == "project": + entity = user.projectacls + if objtype == "experiment": + entity = user.experimentacls + if objtype == "dataset": + entity = user.datasetacls + if objtype == "datafile": + entity = user.datafileacls + + query = cleaning_acl_query( + entity, + objtype, + canSensitive=canSensitive, + canDownload=canDownload, + prefetch_fields=prefetch_fields, + value_fields=value_fields, + flat=flat, + ) + for group in groups: + + if objtype == "project": + entity = group.projectacls + if objtype == "experiment": + entity = group.experimentacls + if objtype == "dataset": + entity = group.datasetacls + if objtype == "datafile": + entity = group.datafileacls + + query |= cleaning_acl_query( + entity, + objtype, + canSensitive=canSensitive, + canDownload=canDownload, + prefetch_fields=prefetch_fields, + value_fields=value_fields, + flat=flat, + ) + return [*query.distinct()] + + +def cleaning_acl_query( + entity_acls, + objtype, + canSensitive=False, + canDownload=False, + prefetch_fields=None, + value_fields=None, + flat=True, +): + """ + Function to build up generic object queries to get ACL or ID information + on objects. + """ + # build query on object and related ACLs + query = entity_acls.select_related(objtype) + + # apply specific ACL perm filter + if canSensitive is True: + query = query.filter(canSensitive=True) + if canDownload is True: + query = query.filter(canSensitive=True) + + # if prefetch_fields are specified, add prefetch to query + if prefetch_fields is not None: + query = query.prefetch_related(*prefetch_fields) + + # exclude too-new/expired ACLs + query = query.exclude( + effectiveDate__gte=datetime.today(), + expiryDate__lte=datetime.today(), + ) + + # add OBJ__id to values_list return + value_list_to_add = [objtype + "__id"] + # if list of extra values_list specified, add them to query + if value_fields is not None: + value_list_to_add.extend(value_fields) + + return query.values_list(*value_list_to_add, flat=flat) + + +def query_add_sorting(request_sorting, obj, sort_dict): + """ + Function to build up sorting filters that need to be applied to + a search query + """ + # make sure sorting request contains info and current object/model is in sorting request + if (request_sorting is not None) and (obj in request_sorting): + # check if + # iterate over sort options + for sort in request_sorting[obj]: + + # process nested sort filters + if len(sort["field"]) > 1: + # if in this dict then field adds .raw to end of sort + if sort["field"][-1] in { + "fullname", + "name", + "title", + "description", + "filename", + }: + search_field = ".".join(sort["field"]) + ".raw" + else: + search_field = ".".join(sort["field"]) + # build up sorting dict options + sort_dict[search_field] = { + "order": sort["order"], + "nested_path": ".".join(sort["field"][:-1]), + } + # process non-nested sort filters + if len(sort["field"]) == 1: + # these fields need to have .raw added to them + if sort["field"][0] in { + "principal_investigator", + "name", + "title", + "description", + "filename", + }: + sort_dict[sort["field"][0] + ".raw"] = {"order": sort["order"]} + # size field needs specific handling + elif sort["field"][0] == "size": + # for datafile size is easy to calculate + if obj == "datafile": + sort_dict[sort["field"][0]] = {"order": sort["order"]} + # for parent models we need ACL context for this, + # which is currently not available in ES. 
+ else: + # DO THIS SORTING AFTER ELASTICSEARCH + pass + else: + sort_dict[sort["field"][0]] = {"order": sort["order"]} + return sort_dict + + +def Q_nested(path, query): + """wrapper function for readability of nested ES Queries""" + query = Q({"nested": {"path": path, "query": query}}) + return query + + +def Q_must(query_list): + """wrapper function for readability of must ES Queries""" + query = Q({"bool": {"must": query_list}}) + return query + + +def create_user_and_group_query(user, groups): + """ + This function creates an initial search query object and requires that + any results must have an appropriate User OR Group ACL for a user + any + of their groups. + """ + # query where ACL must match entityId=User.id and pluginId=django_user + query_obj = Q_nested( + path="acls", + query=Q_must( + query_list=[ + Q({"match": {"acls.entityId": user.id}}), + Q({"term": {"acls.pluginId": "django_user"}}), + ] + ), + ) + # queries where ACL must match entityId=group.id and pluginId=django_group + for group in groups: + query_obj_group = Q_nested( + path="acls", + query=Q_must( + query_list=[ + Q({"match": {"acls.entityId": group.id}}), + Q({"term": {"acls.pluginId": "django_group"}}), + ] + ), + ) + # add each group query as an OR to existing query + query_obj = query_obj | query_obj_group + return query_obj + + +def query_keywords_and_metadata(query_obj, query_text, object_type, idx, title_list): + """ + This function takes an existing search query and adds matches for + keywords or non-sensitive metadata fields. + """ + # query where object "title" must match query_text for given object + query_obj_text = Q({"match": {title_list[idx]: query_text[object_type]}}) + # query where non-sensitive parameters of object must match query_text for given object + query_obj_text_meta = Q_nested( + path="parameters.string", + query=Q_must( + query_list=[ + Q({"match": {"parameters.string.value": query_text[object_type]}}), + Q({"term": {"parameters.string.sensitive": False}}), + ] + ), + ) + # stack up query by matching "title" OR "non-sensitive metadata" + query_obj_text_meta = query_obj_text | query_obj_text_meta + # stack keyword+metadata query with existing query via AND (we expect it's the user/group query) + query_obj = query_obj & query_obj_text_meta + return query_obj + + +def _query_filter_on_parameters_type(ptype, param_id, oper, value): + """ + This function is a generic handler to build an individual parameter query. It deals with + the following types of metadata: "string", "numerical", "datetime". + """ + + # cast the type field into the correct spelling for object document structure + type_mapping = {"STRING": "string", "NUMERIC": "numerical", "DATETIME": "datetime"} + ptype = type_mapping[ptype] + + # tweak the operators and value structure based up parameter type + if ptype == "string": + base_oper = oper + query_val = value + + elif ptype in ["numerical", "datetime"]: + base_oper = "range" + query_val = {oper: value} + + # create and return query on parameter using provided inputs + query_obj = Q_nested( + path="parameters." + ptype, + query=Q_must( + query_list=[ + Q({"match": {"parameters." + ptype + ".pn_id": str(param_id)}}), + Q({base_oper: {"parameters." + ptype + ".value": query_val}}), + Q({"term": {"parameters." + ptype + ".sensitive": False}}), + ] + ), + ) + return query_obj + + +def _query_filter_on_parameters(query_obj, filter, obj, filter_level, hierarchy, oper): + """ + This function is a component of the more general "query_apply_filters" function. 
+ Here we build up queries on Schema-parameter / metadata filters. + """ + + # Hardcode of Schema types+numbers here + num_2_type = { + 1: "experiment", + 2: "dataset", + 3: "datafile", + 6: "project", + } + + # extract schema + parameter id from API request + schema_id, param_id = filter["target"][0], filter["target"][1] + + # check filter is applied to correct object type + if num_2_type[Schema.objects.get(id=schema_id).type] == obj: + # redefine "parent-in-results" threshold if required + if filter_level < hierarchy[obj]: + filter_level = hierarchy[obj] + + # check if filter query is list of options or a single value + # (elasticsearch can actually handle delimiters in a single string...) + if isinstance(filter["content"], list): + # if filter is a list, create an "OR" operator list of search queries + # and pass each value individually to build "OR" query + Qdict = {"should": []} + for option in filter["content"]: + query = _query_filter_on_parameters_type( + filter["type"], param_id, oper, option + ) + Qdict["should"].append(query) + # final "OR" query is built up from the individual ones + query_obj_filt = Q({"bool": Qdict}) + + else: + # if filter content not a list of options then pass filter content + # directly as the value to search on + query_obj_filt = _query_filter_on_parameters_type( + filter["type"], param_id, oper, filter["content"] + ) + + # as before, combine filter query with existing query + query_obj = query_obj & query_obj_filt + + # return both the query_obj and the filter level variable + return query_obj, filter_level + + +def _query_filter_on_intrinsic_schemas(query_obj, filter, oper): + """ + This function is applies "filter by schema" to a search query. + """ + # check if filter query is list of options, or single value + if isinstance(filter["content"], list): + # If its a list of filters, build up several queries and combine them with OR + Qdict = {"should": []} + for option in filter["content"]: + qry = Q_nested( + path="parameters.schemas", + query=Q({oper: {"parameters.schemas.schema_id": option}}), + ) + Qdict["should"].append(qry) + query_obj_filt = Q({"bool": Qdict}) + else: + # if its a single filter, just create one query + query_obj_filt = Q_nested( + path="parameters.schemas", + query=Q({oper: {"parameters.schemas.schema_id": filter["content"]}}), + ) + + # combine this filter with the existing query and return it + query_obj = query_obj & query_obj_filt + return query_obj + + +def _query_filter_on_intrinsic_fields(query_obj, filter, oper, target_fieldtype): + """ + This function is applies "filter by intrinsic object fields" to a search query. + It separates the fields into two groups: "string" fields and "datetime" fields. 
+    String fields: name, description, title, tags, filename, file_extension
+    Datetime fields: created_time, start_time, end_time
+    """
+
+    # if the filter type is on a string field
+    if filter["type"] == "STRING":
+        # string filters can be comma-separated lists, so check for list or single
+        if isinstance(filter["content"], list):
+            # if filter is a list, build individual queries for each filter and combine
+            # using an "OR"
+            Qdict = {"should": []}
+            for option in filter["content"]:
+                # Special treatment for file extension to remove leading full stops
+                if target_fieldtype == "file_extension":
+                    if option[0] == ".":
+                        option = option[1:]
+                # add individual query to list of query components
+                qry = Q({oper: {target_fieldtype: option}})
+                Qdict["should"].append(qry)
+            # build final query from individual components
+            query_obj_filt = Q({"bool": Qdict})
+        else:
+            # Special treatment for file extension to remove leading full stops
+            if target_fieldtype == "file_extension":
+                if filter["content"][0] == ".":
+                    filter["content"] = filter["content"][1:]
+            # simply build the filter query
+            query_obj_filt = Q({oper: {target_fieldtype: filter["content"]}})
+    # if the filter type is a datetime field
+    elif filter["type"] == "DATETIME":
+        # simply build the single query
+        query_obj_filt = Q({"range": {target_fieldtype: {oper: filter["content"]}}})
+    # combine the filter query with existing search query and return it
+    query_obj = query_obj & query_obj_filt
+    return query_obj
+
+
+def _query_filter_relations_builder(target_fieldtype, oper, nested_fieldtype, option):
+    """
+    This function is a generic query builder for the relational filter queries.
+    """
+    # build a nested query on the related object's sub-field,
+    # i.e. "<target_fieldtype>.<nested_fieldtype>"
+    query_obj = Q_nested(
+        path=target_fieldtype,
+        query=Q(
+            {
+                oper: {
+                    ".".join(
+                        [
+                            target_fieldtype,
+                            nested_fieldtype,
+                        ]
+                    ): option
+                }
+            }
+        ),
+    )
+
+    return query_obj
+
+
+def _query_filter_on_intrinsic_relations(query_obj, filter, oper, target_fieldtype):
+    """
+    This function applies "filter by intrinsic relations" to a search query.
+    This includes the following relational fields (and the object type each acts on):
+     - principal_investigator (Project),
+     - projects (Experiment),
+     - instrument (Dataset),
+     - institution (Project),
+     - experiments (Dataset),
+     - dataset (Datafile),
+    """
+
+    # extract the nested field type from filter
+    nested_fieldtype = filter["target"][2]
+    # determine if filter is a list of filters or individual search term
+    if isinstance(filter["content"], list):
+        Qdict = {"should": []}
+        # iterate over options and build individual queries to combine with "OR"
+        for option in filter["content"]:
+            qry = _query_filter_relations_builder(
+                target_fieldtype, oper, nested_fieldtype, option
+            )
+            Qdict["should"].append(qry)
+        # build final query out of individual components
+        query_obj_filt = Q({"bool": Qdict})
+    else:
+        # if individual search term, simply build it
+        query_obj_filt = _query_filter_relations_builder(
+            target_fieldtype, oper, nested_fieldtype, filter["content"]
+        )
+    # Special handling for list of principal investigators
+    if target_fieldtype == "principal_investigator":
+        # principal investigator builds on existing intrinsic relation query!
+        Qdict_lr = {"should": [query_obj_filt]}
+        # determine if filter is a list of filters or individual search term
+        if isinstance(filter["content"], list):
+            Qdict = {"should": []}
+            # iterate over options and build individual queries to combine with "OR"
+            for option in filter["content"]:
+                qry = _query_filter_relations_builder(
+                    target_fieldtype, "term", "username", option
+                )
+                Qdict["should"].append(qry)
+            # build final query out of individual components
+            query_obj_filt = Q({"bool": Qdict})
+        else:
+            # if individual search term, simply build it
+            query_obj_filt = _query_filter_relations_builder(
+                target_fieldtype, "term", "username", filter["content"]
+            )
+        # build special "OR" query for principal_investigator search
+        Qdict_lr["should"].append(query_obj_filt)
+        query_obj_filt = Q({"bool": Qdict_lr})
+    # combine intrinsic relation filter with existing search query and return
+    query_obj = query_obj & query_obj_filt
+    return query_obj
+
+
+def _query_filter_on_intrinsics(query_obj, filter, obj, filter_level, hierarchy, oper):
+    """
+    This function is a component of the more general "query_apply_filters" function.
+    Here we build up queries on intrinsic filters such as filter by Schema, filter by object
+    properties, filter by related object properties.
+    """
+
+    # Extract out the filter object type and the filter field/variable name
+    target_objtype, target_fieldtype = (
+        filter["target"][0],
+        filter["target"][1],
+    )
+
+    # Only process the filter if it targets the object type currently being searched
+    if target_objtype == obj:
+        # Update the hierarchy level at which the
+        # "parent-in-results" criteria must be applied
+        if filter_level < hierarchy[obj]:
+            filter_level = hierarchy[obj]
+
+        # Apply "Selected Schema" filter
+        if target_fieldtype == "schema":
+            query_obj = _query_filter_on_intrinsic_schemas(query_obj, filter, oper)
+
+        # Apply filters that act on fields which are intrinsic to the object (Proj,exp,set,file)
+        if target_fieldtype in {
+            "name",
+            "description",
+            "title",
+            "tags",
+            "filename",
+            "file_extension",
+            "created_time",
+            "start_time",
+            "end_time",
+        }:
+            query_obj = _query_filter_on_intrinsic_fields(
+                query_obj, filter, oper, target_fieldtype
+            )
+
+        # Apply filters that act on fields which are intrinsic to related objects (instruments, users, etc)
+        if target_fieldtype in {
+            "principal_investigator",
+            "projects",
+            "instrument",
+            "institution",
+            "experiments",
+            "dataset",
+        }:
+
+            query_obj = _query_filter_on_intrinsic_relations(
+                query_obj, filter, oper, target_fieldtype
+            )
+
+    # return the updated query_obj and filter_level
+    return query_obj, filter_level
+
+
+def query_apply_filters(query_obj, filters, obj, filter_level, hierarchy):
+    """
+    This function takes an existing search query and adds matches for
+    the various filters on metadata that can be applied.
+    """
+
+    # combination logic of filters can theoretically be applied, but not supported yet.
+    # filter_op = filters["op"]  (not used for now)
+
+    filterlist = filters["content"]
+
+    # Define operator translations between API and ES
+    operator_dict = {
+        "is": "term",
+        "contains": "match",
+        ">=": "gte",
+        "<=": "lte",
+    }
+
+    # Iterate over filters in request
+    for filter in filterlist:
+
+        # Extract required operator for filter
+        oper = operator_dict[filter["op"]]
+
+        # Apply Schema-parameter / metadata filters to search
+        if filter["kind"] == "schemaParameter":
+            query_obj, filter_level = _query_filter_on_parameters(
+                query_obj, filter, obj, filter_level, hierarchy, oper
+            )
+
+        # Apply intrinsic object filters to search
+        if filter["kind"] == "typeAttribute":
+            query_obj, filter_level = _query_filter_on_intrinsics(
+                query_obj, filter, obj, filter_level, hierarchy, oper
+            )
+
+    return query_obj, filter_level
diff --git a/tardis/apps/search/utils/documents.py b/tardis/apps/search/utils/documents.py
new file mode 100644
index 000000000..cd9c6fc5e
--- /dev/null
+++ b/tardis/apps/search/utils/documents.py
@@ -0,0 +1,207 @@
+"""
+Helper functions used by the search app's documents.py (Elasticsearch document definitions).
+"""
+
+from django.conf import settings
+from django_elasticsearch_dsl import fields
+
+from tardis.tardis_portal.models import (
+    ParameterName,
+    ExperimentParameter,
+    DatasetParameter,
+    DatafileParameter,
+)
+
+from tardis.apps.projects.models import (
+    ProjectParameter,
+)
+
+
+def generic_acl_structure():
+    """
+    Return the ES structure of an ACL.
+
+    - pluginId = type of ACL owner: user/group/token
+    - entityId = ID of the owner
+    """
+    return fields.NestedField(
+        properties={
+            "pluginId": fields.KeywordField(),
+            "entityId": fields.KeywordField(),
+            "canDownload": fields.BooleanField(),
+            "canSensitive": fields.BooleanField(),
+        }
+    )
+
+
+def generic_parameter_structure():
+    """
+    Return the ES structure of object parameters and schema.
+    The parameter structure splits out string/numerical/datetime
+    parameters so that ES can specifically handle each of their
+    datatypes.
+
+    - schemas:
+        - schema_id: ID of an object schema
+    - string/numerical/datetime:
+        - pn_id: ID of the parameter name
+        - pn_name: full name of the parameter name
+        - value: value of the parameter
+        - sensitive: whether the parameter name is sensitive
+    """
+    return fields.NestedField(
+        properties={
+            "string": fields.NestedField(
+                properties={
+                    "pn_id": fields.KeywordField(),
+                    "pn_name": fields.KeywordField(),
+                    "value": fields.TextField(),
+                    "sensitive": fields.BooleanField(),
+                }
+            ),
+            "numerical": fields.NestedField(
+                properties={
+                    "pn_id": fields.KeywordField(),
+                    "pn_name": fields.KeywordField(),
+                    "value": fields.FloatField(),
+                    "sensitive": fields.BooleanField(),
+                }
+            ),
+            "datetime": fields.NestedField(
+                properties={
+                    "pn_id": fields.KeywordField(),
+                    "pn_name": fields.KeywordField(),
+                    "value": fields.DateField(),
+                    "sensitive": fields.BooleanField(),
+                }
+            ),
+            "schemas": fields.NestedField(
+                properties={"schema_id": fields.KeywordField()}
+            ),
+        },
+    )
+
+
+def prepare_generic_acls_build(INSTANCE_ACL_SET, return_list):
+    """Build the ACLs associated with this object, formatted for
+    elasticsearch, and append them to return_list.
+    """
+    for acl in INSTANCE_ACL_SET:
+        acl_dict = {}
+        if acl["user__id"] is not None:
+            acl_dict["pluginId"] = "django_user"
+            acl_dict["entityId"] = acl["user__id"]
+        if acl["group__id"] is not None:
+            acl_dict["pluginId"] = "django_group"
+            acl_dict["entityId"] = acl["group__id"]
+        if acl["token__id"] is not None:
+            # token access shouldn't be added to search
+            # unless search is given a way of checking token expiry
+            continue
+        # add in permission booleans
+        acl_dict["canDownload"] = acl["canDownload"]
+        acl_dict["canSensitive"] = acl["canSensitive"]
+        if acl_dict not in return_list:
+            return_list.append(acl_dict)
+
+
+def prepare_generic_acls(type, INSTANCE_ACL_SET, INSTANCE_EXPS=None):
+    """Returns the ACLs associated with this
+    object, formatted for elasticsearch.
+
+    This function is mostly just a wrapper around "prepare_generic_acls_build"
+    to account for current macro/micro behaviour.
+    """
+    return_list = []
+    if settings.ONLY_EXPERIMENT_ACLS and type != "experiment":
+        for exp in INSTANCE_EXPS.all():
+            prepare_generic_acls_build(
+                exp.experimentacl_set.select_related("user", "group", "token")
+                .all()
+                .exclude(user__id=settings.PUBLIC_USER_ID)
+                .values(
+                    "user__id",
+                    "group__id",
+                    "token__id",
+                    "canDownload",
+                    "canSensitive",
+                ),
+                return_list,
+            )
+    else:
+        prepare_generic_acls_build(
+            INSTANCE_ACL_SET.select_related("user", "group", "token")
+            .all()
+            .exclude(user__id=settings.PUBLIC_USER_ID)
+            .values(
+                "user__id",
+                "group__id",
+                "token__id",
+                "canDownload",
+                "canSensitive",
+            ),
+            return_list,
+        )
+    return return_list
+
+
+def prepare_generic_parameters(instance, type):
+    """Returns the parameters associated with the provided instance,
+    formatted for elasticsearch."""
+
+    type_dict = {
+        "project": ProjectParameter,
+        "experiment": ExperimentParameter,
+        "dataset": DatasetParameter,
+        "datafile": DatafileParameter,
+    }
+    OBJPARAMETERS = type_dict[type]
+
+    # get list of object parametersets
+    paramsets = list(instance.getParameterSets())
+    parameter_groups = {
+        "string": [],
+        "numerical": [],
+        "datetime": [],
+        "schemas": [],
+    }
+    # iterate over parametersets of an object
+    for paramset in paramsets:
+        param_type = {1: "datetime", 2: "string", 3: "numerical"}
+        # query parameters from parameterset
+        param_glob = OBJPARAMETERS.objects.filter(parameterset=paramset).values_list(
+            "name",
+            "datetime_value",
+            "string_value",
+            "numerical_value",
+        )
+        # add schema information to dict
+        parameter_groups["schemas"].append({"schema_id": paramset.schema_id})
+        # iterate over parameter info "name/datetime/string/numerical"
+        for sublist in param_glob:
+            # query parametername info using "name"
+            PN = ParameterName.objects.get(id=sublist[0])
+            # build dict for param
+            param_dict = {}
+            type_idx = 0
+            # iterate over datetime/string/numerical info
+            for idx, value in enumerate(sublist[1:]):
+                # if datetime/string/numerical actually contains info
+                if value not in [None, ""]:
+                    # add parametername info to dict
+                    param_dict["pn_id"] = str(PN.id)
+                    param_dict["pn_name"] = str(PN.full_name)
+                    param_dict["sensitive"] = PN.sensitive
+                    type_idx = idx + 1
+                    # detect type of param, and add value to dict
+                    if type_idx == 1:
+                        param_dict["value"] = value
+                    elif type_idx == 2:
+                        param_dict["value"] = str(value)
+                    elif type_idx == 3:
+                        param_dict["value"] = float(value)
+            # if a parameter with a value was found, add param_dict
+            # to parameter_groups
+            if type_idx:
+                parameter_groups[param_type[type_idx]].append(param_dict)
+    return parameter_groups
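A minimal sketch (not part of this diff) of how the utils/api.py helpers above might be chained together by the search endpoint. The real call site lives in api.py and is not shown in this hunk, so the wrapper function, the index name passed to Search, and the TITLE_LIST/HIERARCHY constants below are illustrative assumptions only:

    from elasticsearch_dsl import Search

    # Assumed per-object "title" fields and "parent-in-results" weights;
    # the real values are defined in api.py and may differ.
    TITLE_LIST = ["name", "title", "description", "filename"]
    HIERARCHY = {"project": 4, "experiment": 3, "dataset": 2, "datafile": 1}

    def build_search(user, groups, query_text, filters, obj="dataset", idx=2):
        # every hit must carry a matching User OR Group ACL
        query_obj = create_user_and_group_query(user, groups)
        # add keyword / non-sensitive metadata matching if text was submitted
        # for this object type (query_text is assumed to be keyed by object type,
        # since query_keywords_and_metadata indexes it that way)
        if query_text is not None and query_text.get(obj):
            query_obj = query_keywords_and_metadata(
                query_obj, query_text, obj, idx, TITLE_LIST
            )
        # apply intrinsic and schema-parameter filters, tracking the
        # "parent-in-results" threshold
        filter_level = 0
        if filters is not None:
            query_obj, filter_level = query_apply_filters(
                query_obj, filters, obj, filter_level, HIERARCHY
            )
        # hand the combined bool query to elasticsearch-dsl (index name assumed)
        return Search(index=obj).query(query_obj), filter_level

Each helper combines its criterion with the incoming query via & (AND) or | (OR) internally, so the caller only decides which criteria apply.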