diff --git a/enterprise_catalog/apps/api_client/discovery.py b/enterprise_catalog/apps/api_client/discovery.py index 3e4d5d644..4b2697112 100644 --- a/enterprise_catalog/apps/api_client/discovery.py +++ b/enterprise_catalog/apps/api_client/discovery.py @@ -49,13 +49,14 @@ def _calculate_backoff(self, attempt_count): """ return (self.BACKOFF_FACTOR * (2 ** (attempt_count - 1))) - def _retrieve_metadata_for_content_filter(self, content_filter, page, request_params): + def _retrieve_metadata_page_for_content_filter(self, content_filter, page, request_params): """ Makes a request to discovery's /search/all/ endpoint with the specified content_filter, page, and request_params """ LOGGER.info(f'Retrieving results from course-discovery for page {page}...') attempts = 0 + request_params_with_page = request_params | {'page': page} while True: attempts = attempts + 1 successful = True @@ -64,7 +65,7 @@ def _retrieve_metadata_for_content_filter(self, content_filter, page, request_pa response = self.client.post( DISCOVERY_SEARCH_ALL_ENDPOINT, json=content_filter, - params=request_params, + params=request_params_with_page, timeout=self.HTTP_TIMEOUT, ) successful = response.status_code < 400 @@ -99,6 +100,34 @@ def _retrieve_metadata_for_content_filter(self, content_filter, page, request_pa ) raise err + def retrieve_metadata_for_content_filter(self, content_filter, request_params): + """ + """ + request_params_customized = request_params | { + # Increase number of results per page for the course-discovery response + 'page_size': 100, + # Ensure paginated results are consistently ordered by `aggregation_key` and `start` + 'ordering': 'aggregation_key,start', + } + page = 1 + results = [] + try: + response = self._retrieve_metadata_page_for_content_filter(content_filter, page, request_params_customized) + results += response.get('results', []) + # Traverse all pages and concatenate results + while response.get('next'): + page += 1 + response = self._retrieve_metadata_page_for_content_filter(content_filter, page, request_params_customized) + results += response.get('results', []) + except Exception as exc: + LOGGER.exception( + 'Could not retrieve content items from course-discovery (page %s): %s', + page, + exc, + ) + raise exc + return results + def _retrieve_course_reviews(self, request_params): """ Makes a request to discovery's /api/v1/course_review/ paginated endpoint @@ -251,7 +280,7 @@ def get_video_skills(self, video_usage_key): return video_skills - def get_metadata_by_query(self, catalog_query): + def get_metadata_by_query(self, catalog_query, extra_query_params=None): """ Return results from the discovery service's search/all endpoint. @@ -264,30 +293,17 @@ def get_metadata_by_query(self, catalog_query): request_params = { # Omit non-active course runs from the course-discovery results 'exclude_expired_course_run': True, - # Increase number of results per page for the course-discovery response - 'page_size': 100, - # Ensure paginated results are consistently ordered by `aggregation_key` and `start` - 'ordering': 'aggregation_key,start', # Ensure to fetch learner pathways as part of search/all endpoint response. 'include_learner_pathways': True, - } - - page = 1 + } | extra_query_params results = [] + try: content_filter = catalog_query.content_filter - response = self._retrieve_metadata_for_content_filter(content_filter, page, request_params) - results += response.get('results', []) - # Traverse all pages and concatenate results - while response.get('next'): - page += 1 - request_params.update({'page': page}) - response = self._retrieve_metadata_for_content_filter(content_filter, page, request_params) - results += response.get('results', []) + results.extend(self.retrieve_metadata_for_content_filter(content_filter, request_params)) except Exception as exc: LOGGER.exception( - 'Could not retrieve content items from course-discovery (page %s) for catalog query %s: %s', - page, + 'Could not retrieve content items for catalog query %s: %s', catalog_query, exc, ) diff --git a/enterprise_catalog/apps/catalog/migrations/0040_contentmetadata_catalog_queries_for_restricted_course_run.py b/enterprise_catalog/apps/catalog/migrations/0040_contentmetadata_catalog_queries_for_restricted_course_run.py new file mode 100644 index 000000000..26e4ac294 --- /dev/null +++ b/enterprise_catalog/apps/catalog/migrations/0040_contentmetadata_catalog_queries_for_restricted_course_run.py @@ -0,0 +1,26 @@ +# Generated by Django 4.2.16 on 2024-09-19 23:34 + +import collections +from django.db import migrations, models +import jsonfield.encoder +import jsonfield.fields + + +class Migration(migrations.Migration): + + dependencies = [ + ('catalog', '0039_alter_catalogquery_unique_together_and_more'), + ] + + operations = [ + migrations.AddField( + model_name='contentmetadata', + name='catalog_queries_for_restricted_course_run', + field=models.ManyToManyField(related_name='restricted_course_runs', to='catalog.catalogquery'), + ), + migrations.AlterField( + model_name='catalogquery', + name='content_filter', + field=jsonfield.fields.JSONField(default=dict, dump_kwargs={'cls': jsonfield.encoder.JSONEncoder, 'ensure_ascii': False, 'indent': 4, 'separators': (',', ':')}, help_text="Query parameters which will be used to filter the discovery service's search/all endpoint results, specified as a JSON object.", load_kwargs={'object_pairs_hook': collections.OrderedDict}), + ), + ] diff --git a/enterprise_catalog/apps/catalog/models.py b/enterprise_catalog/apps/catalog/models.py index ce4d163d9..39f3b4331 100644 --- a/enterprise_catalog/apps/catalog/models.py +++ b/enterprise_catalog/apps/catalog/models.py @@ -20,7 +20,7 @@ get_most_recent_modified_time, update_query_parameters, ) -from enterprise_catalog.apps.api_client.discovery import CatalogQueryMetadata +from enterprise_catalog.apps.api_client.discovery import CatalogQueryMetadata, DiscoveryApiClient from enterprise_catalog.apps.api_client.enterprise_cache import ( EnterpriseCustomerDetails, ) @@ -46,6 +46,7 @@ get_content_type, get_content_uuid, get_parent_content_key, + is_content_restricted, localized_utcnow, ) @@ -629,6 +630,22 @@ class ContentMetadata(TimeStampedModel): ) catalog_queries = models.ManyToManyField(CatalogQuery) + # Restricted runs receive their own parallel M2M mapping with CatalogQuery + # primarily because we don't want restricted runs to impact any existing + # logic as we roll out support. + # + # The queries we write generally don't include runs directly, so the + # `catalog_queries` field above generally doesn't relate course runs + # directly to CatalogQuery objects. As a result, downstream code broadly + # assumes that a run belongs to a CatalogQuery to which its parent course + # also belongs. Relating runs directly to CatalogQueries is a fundamental + # departure from the original architecture and base assumptions, hence the + # new field just to be on the safe side. + catalog_queries_for_restricted_course_run = models.ManyToManyField( + CatalogQuery, + related_name='restricted_course_runs', + ) + history = HistoricalRecords() objects = ContentMetadataManager() @@ -994,6 +1011,69 @@ def _check_content_association_threshold(catalog_query, metadata_list): return False +def get_restricted_runs_from_discovery(metadata_including_restricted, catalog_query, dry_run=False): + """ + """ + # Fast exit if the catalog query's content filter doesn't even specify restricted runs. + restricted_runs_allowed = catalog_query.restricted_runs_allowed + if not restricted_runs_allowed: + return [] + + course_keys_matching_query = { + get_content_key(metadata) for metadata in metadata_including_restricted + if get_content_type(metadata) == COURSE + } + # Collect all run keys explicitly marked as "allowed" in the content filter, and also a child of a course that + # matches the content filter. + restricted_run_keys_to_fetch = [] + for course_key_with_restricted_runs, restricted_run_keys_allowed in restricted_runs_allowed.items(): + if not course_key_with_restricted_runs in course_keys_matching_query: + LOGGER.warning( + ( + "Course key %s requested by content-filter for CatalogQuery %s to allow restricted runs, but the " + "course did not match the rest of the filter, so its restricted runs will not be included in this " + "CatalogQuery." + ), + course_key_with_restricted_runs, + catalog_query, + ) + continue + restricted_run_keys_to_fetch.extend(restricted_run_keys_allowed) + + # Fast exit if none of the courses requested actually match the content filter. + if not restricted_run_keys_to_fetch: + return [] + + # Prepare and make the API call to discovery to actually fetch the restricted runs. + content_filter = { + 'key': restricted_run_keys_to_fetch, + } + request_params = { + # Omit non-active course runs from the course-discovery results + 'exclude_expired_course_run': True, + # Make sure to include restricted runs. + 'include_restricted': 'custom-b2b-enterprise', + } + discovery_client = DiscoveryApiClient() + return discovery_client.retrieve_metadata_for_content_filter(content_filter, request_params) + + +def associate_restricted_runs_with_query(metadata, catalog_query, dry_run=False): + """ + """ + metadata_list = create_content_metadata(metadata, catalog_query, dry_run) + if dry_run: + old_restricted_runs_count = catalog_query.restricted_course_runs.count() + new_restricted_runs_count = len(metadata_list) + if old_restricted_runs_count != new_restricted_runs_count: + LOGGER.info('[Dry Run] Updated restricted_course_runs count ({} -> {}) for {}'.format( + old_restricted_runs_count, new_restricted_runs_count, catalog_query)) + else: + catalog_query.restricted_course_runs.set(metadata_list, clear=True) + associated_content_keys = [metadata.content_key for metadata in metadata_list] + return associated_content_keys + + def associate_content_metadata_with_query(metadata, catalog_query, dry_run=False): """ Creates or updates a ContentMetadata object for each entry in `metadata`, @@ -1129,37 +1209,93 @@ def update_contentmetadata_from_discovery(catalog_query, dry_run=False): Returns: list of str: Returns the content keys that were associated from the query results. """ - + # Generate a content filter without the `restricted_runs_allowed` key which + # discovery doesn't understand anyway. + content_filter_clean = {k: v for k, v in catalog_query.content_filter.items() if k != 'restricted_runs_allowed'} + discovery_client = DiscoveryApiClient() try: - # metadata will be an empty dict if unavailable from cache or API. - metadata = CatalogQueryMetadata(catalog_query).metadata + # Fetch all content metadata matching the content filter, restricted or not. + # (metadata will be an empty dict if unavailable from cache or API.) + metadata = discovery_client.get_metadata_by_query( + content_filter_clean, + extra_query_params={"include_restricted": "custom-b2b-enterprise"}, + ) except Exception as exc: LOGGER.exception(f'update_contentmetadata_from_discovery failed {catalog_query}') raise exc - # associate content metadata with a catalog query only when we get valid results - # back from the discovery service. if metadata is `None`, an error occurred while - # calling discovery and we should not proceed with the below association logic. - if metadata: - metadata_content_keys = [get_content_key(entry) for entry in metadata] - LOGGER.info( - 'Retrieved %d content items (%d unique) from course-discovery for catalog query %s', - len(metadata_content_keys), - len(set(metadata_content_keys)), - catalog_query, - ) + # `None` likely means nothing matched the content filter, so exit early. + if not metadata: + return [] - associated_content_keys = associate_content_metadata_with_query(metadata, catalog_query, dry_run) - LOGGER.info( - 'Associated %d content items (%d unique) with catalog query %s', - len(associated_content_keys), - len(set(associated_content_keys)), - catalog_query, - ) + # Partition metadata dicts by unrestricted vs. restricted. + restricted_metadata = [] + unrestricted_metadata = [] + for metadata_dict in metadata: + if is_content_restricted(metadata_dict): + restricted_metadata += metadata_dict + else: + unrestricted_metadata += metadata_dict - return associated_content_keys + # First pass, associate unrestricted content metadata with this CatalogQuery. + unrestricted_metadata_content_keys = [get_content_key(entry) for entry in unrestricted_metadata] + LOGGER.info( + 'Retrieved %d unrestricted content items (%d unique) from course-discovery for catalog query %s', + len(unrestricted_metadata_content_keys), + len(set(unrestricted_metadata_content_keys)), + catalog_query, + ) + associated_content_keys = associate_content_metadata_with_query(metadata, catalog_query, dry_run) + LOGGER.info( + 'Associated %d content items (%d unique) with catalog query %s', + len(associated_content_keys), + len(set(associated_content_keys)), + catalog_query, + ) - return [] + # At this point, we know the `unrestricted_metadata` partition contains + # courses and runs that should be related to the CatalogQuery as restricted + # content, but it's still only a subset of restricted content that needs to + # be related. What's missing are any restricted runs that didn't match the + # original content filter, but whose parent (course) key DID match. + restricted_metadata_content_keys = [get_content_key(entry) for entry in restricted_metadata] + LOGGER.info( + 'Retrieved %d restricted content items (%d unique) from course-discovery for catalog query %s', + len(restricted_metadata_content_keys), + len(set(restricted_metadata_content_keys)), + catalog_query, + ) + resticted_run_metadata = get_restricted_runs_from_discovery( + # Intentionally pass the un-partitioned metadata list so that we can + # discover all restricted runs matching any course that matches the + # content_filter, restricted-only or not. + metadata, + catalog_query, + dry_run, + ) + associated_restricted_run_keys = associate_restricted_runs_with_query( + # Union the following metadata lists: + # * `restricted_metadata`: + # Includes any restricted-only courses, or restricted runs directly + # matching the content filter, BUT not necessarily all restricted + # runs beneath matching courses. + # * `restricted_run_metadata`: + # Includes all restricted runs part of any course matching the + # content filter, BUT no restricted-only courses. + restricted_metadata | resticted_run_metadata, + catalog_query, + dry_run + ) + LOGGER.info( + 'Associated %d restricted runs (%d unique) with catalog query %s', + len(associated_restricted_run_keys), + len(set(associated_restricted_run_keys)), + catalog_query, + ) + + # Consumers expect only the keys associated via the ContentMetadata.catalog_queries M2M field, not including + # restricted runs associated via the ContentMetadata.catalog_queries_for_restricted_course_run M2M field. + return associated_content_keys class CatalogUpdateCommandConfig(ConfigurationModel): diff --git a/enterprise_catalog/apps/catalog/utils.py b/enterprise_catalog/apps/catalog/utils.py index 7a046424d..503d19612 100644 --- a/enterprise_catalog/apps/catalog/utils.py +++ b/enterprise_catalog/apps/catalog/utils.py @@ -17,7 +17,7 @@ get_decoded_jwt as get_decoded_jwt_from_cookie from pytz import UTC -from enterprise_catalog.apps.catalog.constants import COURSE_RUN +from enterprise_catalog.apps.catalog.constants import COURSE, COURSE_RUN LOGGER = getLogger(__name__) @@ -176,3 +176,19 @@ def get_course_run_by_uuid(course, course_run_uuid): except IndexError: return None return course_run + +def is_run_restricted(run_metadata_dict): + return run_metadata_dict.get('restriction_type') == 'restricted-b2b-enterprise' + +def is_content_restricted(metadata_dict): + """ + The given course metadata contains ONLY restricted runs, or the given run is restricted. + """ + content_type = get_content_type(metadata_dict) + if content_type == COURSE: + run_dicts = metadata_dict.get('course_runs', []) + return all(is_run_restricted(run) for run in run_dicts) + elif content_type == COURSE_RUN: + return is_run_restricted(metadata_dict) + # Programs, Learner Pathways, and other content types are never considered "restricted". + return False