Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Relate restricted runs to CatalogQuery objects (ENT-9405) #947

Draft
wants to merge 9 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 36 additions & 20 deletions enterprise_catalog/apps/api_client/discovery.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,14 @@ def _calculate_backoff(self, attempt_count):
"""
return (self.BACKOFF_FACTOR * (2 ** (attempt_count - 1)))

def _retrieve_metadata_for_content_filter(self, content_filter, page, request_params):
def _retrieve_metadata_page_for_content_filter(self, content_filter, page, request_params):
"""
Makes a request to discovery's /search/all/ endpoint with the specified
content_filter, page, and request_params
"""
LOGGER.info(f'Retrieving results from course-discovery for page {page}...')
attempts = 0
request_params_with_page = request_params | {'page': page}
while True:
attempts = attempts + 1
successful = True
Expand All @@ -64,7 +65,7 @@ def _retrieve_metadata_for_content_filter(self, content_filter, page, request_pa
response = self.client.post(
DISCOVERY_SEARCH_ALL_ENDPOINT,
json=content_filter,
params=request_params,
params=request_params_with_page,
timeout=self.HTTP_TIMEOUT,
)
successful = response.status_code < 400
Expand Down Expand Up @@ -99,6 +100,34 @@ def _retrieve_metadata_for_content_filter(self, content_filter, page, request_pa
)
raise err

def retrieve_metadata_for_content_filter(self, content_filter, request_params):
"""
"""
request_params_customized = request_params | {
# Increase number of results per page for the course-discovery response
'page_size': 100,
# Ensure paginated results are consistently ordered by `aggregation_key` and `start`
'ordering': 'aggregation_key,start',
}
page = 1
results = []
try:
response = self._retrieve_metadata_page_for_content_filter(content_filter, page, request_params_customized)
results += response.get('results', [])
# Traverse all pages and concatenate results
while response.get('next'):
page += 1
response = self._retrieve_metadata_page_for_content_filter(content_filter, page, request_params_customized)
results += response.get('results', [])
except Exception as exc:
LOGGER.exception(
'Could not retrieve content items from course-discovery (page %s): %s',
page,
exc,
)
raise exc
return results

def _retrieve_course_reviews(self, request_params):
"""
Makes a request to discovery's /api/v1/course_review/ paginated endpoint
Expand Down Expand Up @@ -251,7 +280,7 @@ def get_video_skills(self, video_usage_key):

return video_skills

def get_metadata_by_query(self, catalog_query):
def get_metadata_by_query(self, catalog_query, extra_query_params=None):
"""
Return results from the discovery service's search/all endpoint.

Expand All @@ -264,30 +293,17 @@ def get_metadata_by_query(self, catalog_query):
request_params = {
# Omit non-active course runs from the course-discovery results
'exclude_expired_course_run': True,
# Increase number of results per page for the course-discovery response
'page_size': 100,
# Ensure paginated results are consistently ordered by `aggregation_key` and `start`
'ordering': 'aggregation_key,start',
# Ensure to fetch learner pathways as part of search/all endpoint response.
'include_learner_pathways': True,
}

page = 1
} | extra_query_params
results = []

try:
content_filter = catalog_query.content_filter
response = self._retrieve_metadata_for_content_filter(content_filter, page, request_params)
results += response.get('results', [])
# Traverse all pages and concatenate results
while response.get('next'):
page += 1
request_params.update({'page': page})
response = self._retrieve_metadata_for_content_filter(content_filter, page, request_params)
results += response.get('results', [])
results.extend(self.retrieve_metadata_for_content_filter(content_filter, request_params))
except Exception as exc:
LOGGER.exception(
'Could not retrieve content items from course-discovery (page %s) for catalog query %s: %s',
page,
'Could not retrieve content items for catalog query %s: %s',
catalog_query,
exc,
)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Generated by Django 4.2.16 on 2024-09-19 23:34

import collections
from django.db import migrations, models
import jsonfield.encoder
import jsonfield.fields


class Migration(migrations.Migration):

dependencies = [
('catalog', '0039_alter_catalogquery_unique_together_and_more'),
]

operations = [
migrations.AddField(
model_name='contentmetadata',
name='catalog_queries_for_restricted_course_run',
field=models.ManyToManyField(related_name='restricted_course_runs', to='catalog.catalogquery'),
),
migrations.AlterField(
model_name='catalogquery',
name='content_filter',
field=jsonfield.fields.JSONField(default=dict, dump_kwargs={'cls': jsonfield.encoder.JSONEncoder, 'ensure_ascii': False, 'indent': 4, 'separators': (',', ':')}, help_text="Query parameters which will be used to filter the discovery service's search/all endpoint results, specified as a JSON object.", load_kwargs={'object_pairs_hook': collections.OrderedDict}),
),
]
184 changes: 160 additions & 24 deletions enterprise_catalog/apps/catalog/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
get_most_recent_modified_time,
update_query_parameters,
)
from enterprise_catalog.apps.api_client.discovery import CatalogQueryMetadata
from enterprise_catalog.apps.api_client.discovery import CatalogQueryMetadata, DiscoveryApiClient
from enterprise_catalog.apps.api_client.enterprise_cache import (
EnterpriseCustomerDetails,
)
Expand All @@ -46,6 +46,7 @@
get_content_type,
get_content_uuid,
get_parent_content_key,
is_content_restricted,
localized_utcnow,
)

Expand Down Expand Up @@ -629,6 +630,22 @@ class ContentMetadata(TimeStampedModel):
)
catalog_queries = models.ManyToManyField(CatalogQuery)

# Restricted runs receive their own parallel M2M mapping with CatalogQuery
# primarily because we don't want restricted runs to impact any existing
# logic as we roll out support.
#
# The queries we write generally don't include runs directly, so the
# `catalog_queries` field above generally doesn't relate course runs
# directly to CatalogQuery objects. As a result, downstream code broadly
# assumes that a run belongs to a CatalogQuery to which its parent course
# also belongs. Relating runs directly to CatalogQueries is a fundamental
# departure from the original architecture and base assumptions, hence the
# new field just to be on the safe side.
catalog_queries_for_restricted_course_run = models.ManyToManyField(
CatalogQuery,
related_name='restricted_course_runs',
)
Comment on lines +644 to +647
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we create an explicit M2M model to represent this and use through?


history = HistoricalRecords()

objects = ContentMetadataManager()
Expand Down Expand Up @@ -994,6 +1011,69 @@ def _check_content_association_threshold(catalog_query, metadata_list):
return False


def get_restricted_runs_from_discovery(metadata_including_restricted, catalog_query, dry_run=False):
"""
"""
# Fast exit if the catalog query's content filter doesn't even specify restricted runs.
restricted_runs_allowed = catalog_query.restricted_runs_allowed
if not restricted_runs_allowed:
return []

course_keys_matching_query = {
get_content_key(metadata) for metadata in metadata_including_restricted
if get_content_type(metadata) == COURSE
}
# Collect all run keys explicitly marked as "allowed" in the content filter, and also a child of a course that
# matches the content filter.
restricted_run_keys_to_fetch = []
for course_key_with_restricted_runs, restricted_run_keys_allowed in restricted_runs_allowed.items():
if not course_key_with_restricted_runs in course_keys_matching_query:
LOGGER.warning(
(
"Course key %s requested by content-filter for CatalogQuery %s to allow restricted runs, but the "
"course did not match the rest of the filter, so its restricted runs will not be included in this "
"CatalogQuery."
),
course_key_with_restricted_runs,
catalog_query,
)
continue
restricted_run_keys_to_fetch.extend(restricted_run_keys_allowed)

# Fast exit if none of the courses requested actually match the content filter.
if not restricted_run_keys_to_fetch:
return []

# Prepare and make the API call to discovery to actually fetch the restricted runs.
content_filter = {
'key': restricted_run_keys_to_fetch,
}
request_params = {
# Omit non-active course runs from the course-discovery results
'exclude_expired_course_run': True,
# Make sure to include restricted runs.
'include_restricted': 'custom-b2b-enterprise',
}
discovery_client = DiscoveryApiClient()
return discovery_client.retrieve_metadata_for_content_filter(content_filter, request_params)


def associate_restricted_runs_with_query(metadata, catalog_query, dry_run=False):
"""
"""
metadata_list = create_content_metadata(metadata, catalog_query, dry_run)
if dry_run:
old_restricted_runs_count = catalog_query.restricted_course_runs.count()
new_restricted_runs_count = len(metadata_list)
if old_restricted_runs_count != new_restricted_runs_count:
LOGGER.info('[Dry Run] Updated restricted_course_runs count ({} -> {}) for {}'.format(
old_restricted_runs_count, new_restricted_runs_count, catalog_query))
else:
catalog_query.restricted_course_runs.set(metadata_list, clear=True)
associated_content_keys = [metadata.content_key for metadata in metadata_list]
return associated_content_keys


def associate_content_metadata_with_query(metadata, catalog_query, dry_run=False):
"""
Creates or updates a ContentMetadata object for each entry in `metadata`,
Expand Down Expand Up @@ -1129,37 +1209,93 @@ def update_contentmetadata_from_discovery(catalog_query, dry_run=False):
Returns:
list of str: Returns the content keys that were associated from the query results.
"""

# Generate a content filter without the `restricted_runs_allowed` key which
# discovery doesn't understand anyway.
content_filter_clean = {k: v for k, v in catalog_query.content_filter.items() if k != 'restricted_runs_allowed'}
discovery_client = DiscoveryApiClient()
try:
# metadata will be an empty dict if unavailable from cache or API.
metadata = CatalogQueryMetadata(catalog_query).metadata
# Fetch all content metadata matching the content filter, restricted or not.
# (metadata will be an empty dict if unavailable from cache or API.)
metadata = discovery_client.get_metadata_by_query(
content_filter_clean,
extra_query_params={"include_restricted": "custom-b2b-enterprise"},
)
except Exception as exc:
LOGGER.exception(f'update_contentmetadata_from_discovery failed {catalog_query}')
raise exc

# associate content metadata with a catalog query only when we get valid results
# back from the discovery service. if metadata is `None`, an error occurred while
# calling discovery and we should not proceed with the below association logic.
if metadata:
metadata_content_keys = [get_content_key(entry) for entry in metadata]
LOGGER.info(
'Retrieved %d content items (%d unique) from course-discovery for catalog query %s',
len(metadata_content_keys),
len(set(metadata_content_keys)),
catalog_query,
)
# `None` likely means nothing matched the content filter, so exit early.
if not metadata:
return []

associated_content_keys = associate_content_metadata_with_query(metadata, catalog_query, dry_run)
LOGGER.info(
'Associated %d content items (%d unique) with catalog query %s',
len(associated_content_keys),
len(set(associated_content_keys)),
catalog_query,
)
# Partition metadata dicts by unrestricted vs. restricted.
restricted_metadata = []
unrestricted_metadata = []
for metadata_dict in metadata:
if is_content_restricted(metadata_dict):
restricted_metadata += metadata_dict
else:
unrestricted_metadata += metadata_dict
Comment on lines +1231 to +1238
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it necessary to mix the restricted run fetching in with the unrestricted fetching? Since we're going to make a second set of requests to discovery anyway, can we do as just two completely separate chunks of work?

  1. Request the content filter without requesting restricted runs.
  2. Request only the restricted runs defined in the content filter.


return associated_content_keys
# First pass, associate unrestricted content metadata with this CatalogQuery.
unrestricted_metadata_content_keys = [get_content_key(entry) for entry in unrestricted_metadata]
LOGGER.info(
'Retrieved %d unrestricted content items (%d unique) from course-discovery for catalog query %s',
len(unrestricted_metadata_content_keys),
len(set(unrestricted_metadata_content_keys)),
catalog_query,
)
associated_content_keys = associate_content_metadata_with_query(metadata, catalog_query, dry_run)
LOGGER.info(
'Associated %d content items (%d unique) with catalog query %s',
len(associated_content_keys),
len(set(associated_content_keys)),
catalog_query,
)

return []
# At this point, we know the `unrestricted_metadata` partition contains
# courses and runs that should be related to the CatalogQuery as restricted
# content, but it's still only a subset of restricted content that needs to
# be related. What's missing are any restricted runs that didn't match the
# original content filter, but whose parent (course) key DID match.
restricted_metadata_content_keys = [get_content_key(entry) for entry in restricted_metadata]
LOGGER.info(
'Retrieved %d restricted content items (%d unique) from course-discovery for catalog query %s',
len(restricted_metadata_content_keys),
len(set(restricted_metadata_content_keys)),
catalog_query,
)
resticted_run_metadata = get_restricted_runs_from_discovery(
# Intentionally pass the un-partitioned metadata list so that we can
# discover all restricted runs matching any course that matches the
# content_filter, restricted-only or not.
metadata,
catalog_query,
dry_run,
)
associated_restricted_run_keys = associate_restricted_runs_with_query(
# Union the following metadata lists:
# * `restricted_metadata`:
# Includes any restricted-only courses, or restricted runs directly
# matching the content filter, BUT not necessarily all restricted
# runs beneath matching courses.
# * `restricted_run_metadata`:
# Includes all restricted runs part of any course matching the
# content filter, BUT no restricted-only courses.
restricted_metadata | resticted_run_metadata,
catalog_query,
dry_run
)
LOGGER.info(
'Associated %d restricted runs (%d unique) with catalog query %s',
len(associated_restricted_run_keys),
len(set(associated_restricted_run_keys)),
catalog_query,
)
Comment on lines +1256 to +1294
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe throw this whole chunk into a new function?


# Consumers expect only the keys associated via the ContentMetadata.catalog_queries M2M field, not including
# restricted runs associated via the ContentMetadata.catalog_queries_for_restricted_course_run M2M field.
return associated_content_keys


class CatalogUpdateCommandConfig(ConfigurationModel):
Expand Down
18 changes: 17 additions & 1 deletion enterprise_catalog/apps/catalog/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
get_decoded_jwt as get_decoded_jwt_from_cookie
from pytz import UTC

from enterprise_catalog.apps.catalog.constants import COURSE_RUN
from enterprise_catalog.apps.catalog.constants import COURSE, COURSE_RUN


LOGGER = getLogger(__name__)
Expand Down Expand Up @@ -176,3 +176,19 @@ def get_course_run_by_uuid(course, course_run_uuid):
except IndexError:
return None
return course_run

def is_run_restricted(run_metadata_dict):
return run_metadata_dict.get('restriction_type') == 'restricted-b2b-enterprise'

def is_content_restricted(metadata_dict):
"""
The given course metadata contains ONLY restricted runs, or the given run is restricted.
"""
content_type = get_content_type(metadata_dict)
if content_type == COURSE:
run_dicts = metadata_dict.get('course_runs', [])
return all(is_run_restricted(run) for run in run_dicts)
elif content_type == COURSE_RUN:
return is_run_restricted(metadata_dict)
# Programs, Learner Pathways, and other content types are never considered "restricted".
return False
Loading