From 1a680a975361dcf4613e35d37887ec6648f3083a Mon Sep 17 00:00:00 2001 From: Troy Sankey Date: Fri, 30 Jun 2023 01:03:37 -0400 Subject: [PATCH] fix: reindex-algolia - include UUIDs from ALL related objects --- enterprise_catalog/apps/api/tasks.py | 215 ++++++++++++------ .../apps/api/tests/test_tasks.py | 56 ++--- 2 files changed, 156 insertions(+), 115 deletions(-) diff --git a/enterprise_catalog/apps/api/tasks.py b/enterprise_catalog/apps/api/tasks.py index 21b62e527..ef10a4606 100644 --- a/enterprise_catalog/apps/api/tasks.py +++ b/enterprise_catalog/apps/api/tasks.py @@ -33,6 +33,7 @@ ) from enterprise_catalog.apps.catalog.constants import ( COURSE, + COURSE_RUN, DISCOVERY_COURSE_KEY_BATCH_SIZE, DISCOVERY_PROGRAM_KEY_BATCH_SIZE, LEARNER_PATHWAY, @@ -500,20 +501,40 @@ def index_enterprise_catalog_in_algolia_task(self, force=False, dry_run=False): raise exep -def get_pathways_by_associated_content(): - """ Prefetch course id and program id to pathway id mapping. """ - pathway_membership_by_course_key = defaultdict(set) - pathway_membership_by_program_key = defaultdict(set) - pathways = ContentMetadata.objects.filter(content_type=LEARNER_PATHWAY).prefetch_related( +def _precalculate_content_mappings(): + """ + Precalculate various mappings between different types of related content. + + Returns: + 2-tuple(dict): + - First element: Mapping of program content_key to list of course run and course ContentMetadata objects. + - Second element: Mapping of learner pathway content_key to list of program and course ContentMetadata objects. + """ + program_to_courses_courseruns_mapping = defaultdict(set) + pathway_to_programs_courses_mapping = defaultdict(set) + courseruns_courses_programs = ContentMetadata.objects.filter( + content_type__in=[COURSE_RUN, COURSE, PROGRAM], + ).prefetch_related( 'associated_content_metadata' ) - for pathway in pathways: - for associated_content in pathway.associated_content_metadata.all(): - if associated_content.content_type == COURSE: - pathway_membership_by_course_key[associated_content.content_key].add(pathway) - elif associated_content.content_type == PROGRAM: - pathway_membership_by_program_key[associated_content.content_key].add(pathway) - return pathway_membership_by_course_key, pathway_membership_by_program_key + for metadata in courseruns_courses_programs: + if metadata.content_type == COURSE_RUN: + for associated_content in metadata.associated_content_metadata.all(): + if associated_content.content_type == PROGRAM: + program_to_courses_courseruns_mapping[associated_content.content_key].add(metadata) + # TODO: Question: Can pathways map directly to course runs? + if metadata.content_type == COURSE: + for associated_content in metadata.associated_content_metadata.all(): + if associated_content.content_type == PROGRAM: + program_to_courses_courseruns_mapping[associated_content.content_key].add(metadata) + elif associated_content.content_type == LEARNER_PATHWAY: + pathway_to_programs_courses_mapping[associated_content.content_key].add(metadata) + else: + for associated_content in metadata.associated_content_metadata.all(): + if associated_content.content_type == LEARNER_PATHWAY: + pathway_to_programs_courses_mapping[associated_content.content_key].add(metadata) + + return program_to_courses_courseruns_mapping, pathway_to_programs_courses_mapping def add_metadata_to_algolia_objects( @@ -579,8 +600,8 @@ def add_metadata_to_algolia_objects( def _get_algolia_products_for_batch( batch_num, content_keys_batch, - course_to_pathway_mapping, - program_to_pathway_mapping, + program_to_courses_courseruns_mapping, + pathway_to_programs_courses_mapping, context_accumulator, ): """ @@ -598,75 +619,120 @@ def _get_algolia_products_for_batch( catalog_uuids_by_key = defaultdict(set) customer_uuids_by_key = defaultdict(set) catalog_queries_by_key = defaultdict(set) - # Retrieve ContentMetadata records for courses, course runs, programs and learner pathways that match the specified - # content_keys in the content_key, parent_content_key or associated content metadata's content_key. - query = ( + + # Create a shared convenience queryset to prefetch catalogs for all metadata lookups below. + all_catalog_queries = CatalogQuery.objects.prefetch_related('enterprise_catalogs') + + # Retrieve ContentMetadata records for: + # * Course runs, courses, programs and learner pathways that are directly requested, and + # * Courses and programs indirectly related to something directly requested. + # - e.g. A course that was not directly requested, but is a member of a program which was requested. + # - e.g. A program that was not directly requested, but is a member of a pathway which was requested. + content_metadata_no_courseruns = ContentMetadata.objects.filter( + # All content (courses, course runs, programs, pathways) directly requested. Q(content_key__in=content_keys_batch) - | Q(parent_content_key__in=content_keys_batch) + # All course runs, courses, or programs contained in programs or pathways requested. | Q( + content_type__in=[COURSE_RUN, COURSE, PROGRAM], + associated_content_metadata__content_type__in=[PROGRAM, LEARNER_PATHWAY], associated_content_metadata__content_key__in=content_keys_batch, - content_type__in=[PROGRAM, LEARNER_PATHWAY] ) + # All programs, pathways to which any requested course belongs. + | Q( + content_type__in=[PROGRAM, LEARNER_PATHWAY], + associated_content_metadata__content_type__in=[COURSE, PROGRAM], + associated_content_metadata__content_key__in=content_keys_batch, + ) + ).prefetch_related( + Prefetch('catalog_queries', queryset=all_catalog_queries), ) - catalog_queries = CatalogQuery.objects.prefetch_related( - 'enterprise_catalogs', - ) - associated_programs_query = ContentMetadata.objects.filter(content_type__in=[PROGRAM, LEARNER_PATHWAY]) - content_metadata = ContentMetadata.objects.filter(query).prefetch_related( - Prefetch('catalog_queries', queryset=catalog_queries), - Prefetch('associated_content_metadata', queryset=associated_programs_query, to_attr='associated_programs'), + # Retrieve ContentMetadata records for any course run which is part of any course found in the previous query. + course_content_keys = content_metadata_no_courseruns.filter( + content_type=COURSE, + ).values_list('content_key', flat=True) + content_metadata_courseruns = ContentMetadata.objects.filter( + parent_content_key__in=course_content_keys + ).prefetch_related( + Prefetch('catalog_queries', queryset=all_catalog_queries), ) - # Retrieve the associated enterprise_catalog_uuids and enterprise_customer_uuids for each selected ContentMetadata - # record, storing them for later retrieval. - for metadata in content_metadata: + # Combine both querysets to represent all the ContentMetadata needed to process this batch. + # + # DEFICIENCY: This final set does not guarantee inclusion of courses (or course runs) indirectly related to a + # requested to a pathway via an association chain of course->program->pathway. This maybe should be added! When it + # is added, a related change must be made in the third pass (below) to chain + # `pathway_to_programs_courses_mapping` and `program_to_courses_courseruns_mapping` to actually collect the UUIDs. + content_metadata_to_process = content_metadata_no_courseruns.union(content_metadata_courseruns) + + # First pass over the batch of content. The goal for this pass is to collect all the UUIDs directly associated with + # each content. This DOES NOT capture any UUIDs indirectly related to programs or pathways via associated courses + # or programs. + for metadata in content_metadata_to_process: if metadata.content_type in (COURSE, PROGRAM, LEARNER_PATHWAY): content_key = metadata.content_key else: + # Course runs should contribute their UUIDs to the parent course. content_key = metadata.parent_content_key - - associated_queries = metadata.catalog_queries.all() - enterprise_catalog_uuids = set() - enterprise_customer_uuids = set() - enterprise_catalog_queries = set() - for query in associated_queries: - enterprise_catalog_queries.add((str(query.uuid), query.title)) - associated_catalogs = query.enterprise_catalogs.all() + associated_catalog_queries = metadata.catalog_queries.all() + for catalog_query in associated_catalog_queries: + catalog_queries_by_key[content_key].add((str(catalog_query.uuid), catalog_query.title)) + # This line is possible thanks to `all_catalog_queries` with the prefectch_related() above. + associated_catalogs = catalog_query.enterprise_catalogs.all() for catalog in associated_catalogs: - enterprise_catalog_uuids.add(str(catalog.uuid)) - enterprise_customer_uuids.add(str(catalog.enterprise_uuid)) - - # add to any existing enterprise catalog uuids, enterprise customer uuids or catalog query uuids - catalog_uuids_by_key[content_key].update(enterprise_catalog_uuids) - customer_uuids_by_key[content_key].update(enterprise_customer_uuids) - catalog_queries_by_key[content_key].update(enterprise_catalog_queries) - - if metadata.content_type == COURSE: - for program_metadata in metadata.associated_programs: - content_key = program_metadata.content_key - catalog_uuids_by_key[content_key].update(enterprise_catalog_uuids) - customer_uuids_by_key[content_key].update(enterprise_customer_uuids) - catalog_queries_by_key[content_key].update(enterprise_catalog_queries) - - for pathway in course_to_pathway_mapping[metadata.content_key]: - catalog_queries_by_key[pathway.content_key].update(enterprise_catalog_queries) - catalog_uuids_by_key[pathway.content_key].update(enterprise_catalog_uuids) - customer_uuids_by_key[pathway.content_key].update(enterprise_customer_uuids) - - if metadata.content_type == PROGRAM: - for pathway in program_to_pathway_mapping[content_key]: - catalog_queries_by_key[pathway.content_key].update(enterprise_catalog_queries) - catalog_uuids_by_key[pathway.content_key].update(enterprise_catalog_uuids) - customer_uuids_by_key[pathway.content_key].update(enterprise_customer_uuids) + catalog_uuids_by_key[content_key].add(str(catalog.uuid)) + customer_uuids_by_key[content_key].add(str(catalog.enterprise_uuid)) + + # Second pass. This time the goal is to capture indirect relationships on programs: + # * For each program: + # - Absorb all UUIDs associated with every associated course. + for metadata in content_metadata_to_process: + if metadata.content_type != PROGRAM: + continue + program_content_key = metadata.content_key + for metadata in program_to_courses_courseruns_mapping[program_content_key]: + catalog_queries_by_key[program_content_key].update(catalog_queries_by_key[metadata.content_key]) + catalog_uuids_by_key[program_content_key].update(catalog_uuids_by_key[metadata.content_key]) + customer_uuids_by_key[program_content_key].update(customer_uuids_by_key[metadata.content_key]) + + # Third pass. This time the goal is to capture indirect relationships on pathways: + # * For each pathway: + # - Absorb all UUIDs associated with every associated course. + # - Absorb all UUIDs associated with every associated program. + for metadata in content_metadata_to_process: + if metadata.content_type != LEARNER_PATHWAY: + continue + pathway_content_key = metadata.content_key + + # TODO: Question: Can pathways map to course runs directly? + for metadata in pathway_to_programs_courses_mapping[pathway_content_key]: + catalog_queries_by_key[pathway_content_key].update(catalog_queries_by_key[metadata.content_key]) + catalog_uuids_by_key[pathway_content_key].update(catalog_uuids_by_key[metadata.content_key]) + customer_uuids_by_key[pathway_content_key].update(customer_uuids_by_key[metadata.content_key]) + + # Extra disabled logic to additionally absorb UUIDs from courses linked to this pathway indirectly via a + # program (chain of association is course -> program -> pathway). This doesn't work because + # content_metadata_to_process queryset for this batch has insuficcient records to support this feature. + # + # if metadata.content_type == PROGRAM: + # for course_metadata in program_to_courses_mapping[program_metadata.content_key]: + # catalog_queries_by_key[pathway_content_key].update( + # catalog_queries_by_key[course_metadata.content_key] + # ) + # catalog_uuids_by_key[pathway_content_key].update( + # catalog_uuids_by_key[course_metadata.content_key] + # ) + # customer_uuids_by_key[pathway_content_key].update( + # customer_uuids_by_key[course_metadata.content_key] + # ) # iterate over courses, programs and pathways and add their metadata to the list of objects to be indexed - filtered_content_metadata = content_metadata.filter( - Q(content_type=COURSE) - | Q(content_type=PROGRAM) - | Q(content_type=LEARNER_PATHWAY) + content_metadata_to_index = ( + metadata for metadata in content_metadata_to_process + if metadata.content_type in [COURSE, PROGRAM, LEARNER_PATHWAY] ) - for metadata in filtered_content_metadata: + num_content_metadata_indexed = 0 + for metadata in content_metadata_to_index: # Check if we've indexed the course recently # (programs/pathways are indexed every time regardless of last indexing) if _was_recently_indexed(metadata.content_key) and metadata.content_type not in [PROGRAM, LEARNER_PATHWAY]: @@ -684,6 +750,8 @@ def _get_algolia_products_for_batch( catalog_queries_by_key[metadata.content_key], ) + num_content_metadata_indexed += 1 + # In case there are multiple CourseMetadata records that share the exact same content_uuid (which would cause an # algolia objectID collision), do not send more than one. Note that selection of duplicate content is # non-deterministic because we do not use order_by() on the queryset. @@ -703,8 +771,8 @@ def _get_algolia_products_for_batch( logger.info( f'[ENTERPRISE_CATALOG_ALGOLIA_REINDEX] batch#{batch_num}: ' f'{len(content_keys_batch)} content keys, ' - f'{content_metadata.count()} content metadata found, ' - f'{filtered_content_metadata.count()} content metadata processed, ' + f'{content_metadata_to_process.count()} content metadata found, ' + f'{num_content_metadata_indexed} content metadata indexed, ' f'{len(algolia_products_by_object_id)} generated algolia products kept, ' f'{duplicate_algolia_records_discarded} generated algolia products discarded.' ) @@ -733,7 +801,10 @@ def _index_content_keys_in_algolia(content_keys, algolia_client): f'[ENTERPRISE_CATALOG_ALGOLIA_REINDEX] There are {len(content_keys)} total content keys to include in the' f' Algolia index.' ) - course_to_pathway_mapping, program_to_pathway_mapping = get_pathways_by_associated_content() + ( + program_to_courses_courseruns_mapping, + pathway_to_programs_courses_mapping, + ) = _precalculate_content_mappings() context_accumulator = { 'total_algolia_products_count': 0, 'discarded_algolia_object_ids': defaultdict(int), @@ -744,8 +815,8 @@ def _index_content_keys_in_algolia(content_keys, algolia_client): _get_algolia_products_for_batch( batch_num, content_keys_batch, - course_to_pathway_mapping, - program_to_pathway_mapping, + program_to_courses_courseruns_mapping, + pathway_to_programs_courses_mapping, context_accumulator, ) for batch_num, content_keys_batch diff --git a/enterprise_catalog/apps/api/tests/test_tasks.py b/enterprise_catalog/apps/api/tests/test_tasks.py index 2243dcf3a..2fb2fda7c 100644 --- a/enterprise_catalog/apps/api/tests/test_tasks.py +++ b/enterprise_catalog/apps/api/tests/test_tasks.py @@ -638,8 +638,8 @@ def mock_replace_all_objects(products_iterable): def mock_get_algolia_products_for_batch( batch_num, content_keys_batch, - course_to_pathway_mapping, - program_to_pathway_mapping, + program_to_courses_courseruns_mapping, + pathway_to_programs_courses_mapping, context_accumulator, ): return [{'key': content_key, 'foo': 'bar'} for content_key in content_keys_batch] @@ -660,17 +660,14 @@ def mock_get_algolia_products_for_batch( {'key': 'course-v1:edX+testX+4', 'foo': 'bar'}, ] - # pylint: disable=too-many-statements - @mock.patch('enterprise_catalog.apps.api.tasks._was_recently_indexed', - side_effect=[ - False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True - ]) + @mock.patch('enterprise_catalog.apps.api.tasks._was_recently_indexed') @mock.patch('enterprise_catalog.apps.api.tasks.get_initialized_algolia_client', return_value=mock.MagicMock()) def test_index_algolia_with_all_uuids(self, mock_search_client, mock_was_recently_indexed): """ Assert that the correct data is sent to Algolia index, with the expected enterprise catalog and enterprise customer associations. """ + mock_was_recently_indexed.return_value = False algolia_data = self._set_up_factory_data_for_algolia() course_associated_program_metadata = ContentMetadataFactory(content_type=PROGRAM, content_key='program-1') pathway_program_metadata = ContentMetadataFactory(content_type=PROGRAM, content_key='program-2') @@ -702,12 +699,9 @@ def mock_replace_all_objects(products_iterable): with mock.patch('enterprise_catalog.apps.api.tasks.ALGOLIA_FIELDS', self.ALGOLIA_FIELDS): with self.assertLogs(level='INFO') as info_logs: tasks.index_enterprise_catalog_in_algolia_task() # pylint: disable=no-value-for-parameter - # call it a second time, make assertions that only one thing happened below - tasks.index_enterprise_catalog_in_algolia_task() # pylint: disable=no-value-for-parameter products_found_log_records = [record for record in info_logs.output if ' products found.' in record] assert '[ENTERPRISE_CATALOG_ALGOLIA_REINDEX] 15 products found.' in products_found_log_records[0] - assert '[ENTERPRISE_CATALOG_ALGOLIA_REINDEX] 12 products found.' in products_found_log_records[1] # create expected data to be added/updated in the Algolia index. expected_algolia_objects_to_index = [] @@ -733,15 +727,15 @@ def mock_replace_all_objects(products_iterable): program_uuid = course_associated_program_metadata.json_metadata.get('uuid') expected_algolia_program_objects.append({ 'objectID': f'program-{program_uuid}-catalog-uuids-0', - 'enterprise_catalog_uuids': [str(self.enterprise_catalog_courses.uuid)], + 'enterprise_catalog_uuids': algolia_data['catalog_uuids'], }) expected_algolia_program_objects.append({ 'objectID': f'program-{program_uuid}-customer-uuids-0', - 'enterprise_customer_uuids': [str(self.enterprise_catalog_courses.enterprise_uuid)], + 'enterprise_customer_uuids': algolia_data['customer_uuids'], }) expected_algolia_program_objects.append({ 'objectID': f'program-{program_uuid}-catalog-query-uuids-0', - 'enterprise_catalog_query_uuids': [str(self.enterprise_catalog_courses.catalog_query.uuid)], + 'enterprise_catalog_query_uuids': sorted(algolia_data['query_uuids']), 'enterprise_catalog_query_titles': [self.enterprise_catalog_courses.catalog_query.title], }) @@ -766,17 +760,17 @@ def mock_replace_all_objects(products_iterable): expected_algolia_pathway_objects.append({ 'key': pathway_metadata.content_key, 'objectID': f'learnerpathway-{pathway_uuid}-catalog-uuids-0', - 'enterprise_catalog_uuids': [str(self.enterprise_catalog_courses.uuid)], + 'enterprise_catalog_uuids': algolia_data['catalog_uuids'], }) expected_algolia_pathway_objects.append({ 'key': pathway_metadata.content_key, 'objectID': f'learnerpathway-{pathway_uuid}-customer-uuids-0', - 'enterprise_customer_uuids': [str(self.enterprise_catalog_courses.enterprise_uuid)], + 'enterprise_customer_uuids': algolia_data['customer_uuids'], }) expected_algolia_pathway_objects.append({ 'key': pathway_metadata.content_key, 'objectID': f'learnerpathway-{pathway_uuid}-catalog-query-uuids-0', - 'enterprise_catalog_query_uuids': [str(self.enterprise_catalog_courses.catalog_query.uuid)], + 'enterprise_catalog_query_uuids': sorted(algolia_data['query_uuids']), 'enterprise_catalog_query_titles': [self.enterprise_catalog_courses.catalog_query.title], }) @@ -811,40 +805,16 @@ def mock_replace_all_objects(products_iterable): actual_algolia_products_sent_sequence[0], key=itemgetter('objectID') ) - unsorted_expected_calls_args = expected_algolia_program_objects + expected_algolia_pathway_objects + \ - expected_algolia_program_objects2 + expected_algolia_pathway_objects2 - expected_second_call_args = sorted(unsorted_expected_calls_args, key=itemgetter('objectID')) - actual_second_call_args = sorted( - actual_algolia_products_sent_sequence[1], key=itemgetter('objectID') - ) self.assertEqual(expected_first_call_args, actual_first_call_args) - self.assertEqual(expected_second_call_args, actual_second_call_args) - - # Verify that we checked the cache twice, though - mock_was_recently_indexed.assert_has_calls([ - mock.call(self.course_metadata_published.content_key), - mock.call(self.course_metadata_published.content_key), - mock.call(course_associated_program_metadata.content_key), - mock.call(pathway_program_metadata.content_key), - mock.call(pathway_metadata.content_key), - mock.call(pathway_metadata2.content_key), - mock.call(pathway_metadata2.content_key), - mock.call(self.course_metadata_published.content_key), - mock.call(self.course_metadata_published.content_key), - mock.call(course_associated_program_metadata.content_key), - mock.call(pathway_program_metadata.content_key), - mock.call(pathway_metadata.content_key), - mock.call(pathway_metadata2.content_key), - mock.call(pathway_metadata2.content_key), - ]) - @mock.patch('enterprise_catalog.apps.api.tasks._was_recently_indexed', return_value=False) + @mock.patch('enterprise_catalog.apps.api.tasks._was_recently_indexed') @mock.patch('enterprise_catalog.apps.api.tasks.get_initialized_algolia_client', return_value=mock.MagicMock()) def test_index_algolia_with_batched_uuids(self, mock_search_client, mock_was_recently_indexed): """ Assert that the correct data is sent to Algolia index, with the expected enterprise catalog, enterprise customer, and catalog query associations. """ + mock_was_recently_indexed.return_value = False algolia_data = self._set_up_factory_data_for_algolia() actual_algolia_products_sent = None @@ -1023,7 +993,7 @@ def test_index_algolia_dry_run(self, mock_search_client, mock_was_recently_index Make sure the dry_run argument functions correctly and does not call replace_all_objects(). """ with mock.patch('enterprise_catalog.apps.api.tasks.ALGOLIA_UUID_BATCH_SIZE', 1), \ - mock.patch('enterprise_catalog.apps.api.tasks.REINDEX_TASK_BATCH_SIZE', 1), \ + mock.patch('enterprise_catalog.apps.api.tasks.REINDEX_TASK_BATCH_SIZE', 10), \ mock.patch('enterprise_catalog.apps.api.tasks.ALGOLIA_FIELDS', self.ALGOLIA_FIELDS): with self.assertLogs(level='INFO') as info_logs: # For some reason in order to call a celery task in-memory you must pass kwargs as args.