From 1a680a975361dcf4613e35d37887ec6648f3083a Mon Sep 17 00:00:00 2001
From: Troy Sankey <tsankey@edx.org>
Date: Fri, 30 Jun 2023 01:03:37 -0400
Subject: [PATCH] fix: reindex-algolia - include UUIDs from ALL related objects

---
 enterprise_catalog/apps/api/tasks.py          | 215 ++++++++++++------
 .../apps/api/tests/test_tasks.py              |  56 ++---
 2 files changed, 156 insertions(+), 115 deletions(-)

diff --git a/enterprise_catalog/apps/api/tasks.py b/enterprise_catalog/apps/api/tasks.py
index 21b62e527..ef10a4606 100644
--- a/enterprise_catalog/apps/api/tasks.py
+++ b/enterprise_catalog/apps/api/tasks.py
@@ -33,6 +33,7 @@
 )
 from enterprise_catalog.apps.catalog.constants import (
     COURSE,
+    COURSE_RUN,
     DISCOVERY_COURSE_KEY_BATCH_SIZE,
     DISCOVERY_PROGRAM_KEY_BATCH_SIZE,
     LEARNER_PATHWAY,
@@ -500,20 +501,40 @@ def index_enterprise_catalog_in_algolia_task(self, force=False, dry_run=False):
         raise exep
 
 
-def get_pathways_by_associated_content():
-    """ Prefetch course id and program id to pathway id mapping. """
-    pathway_membership_by_course_key = defaultdict(set)
-    pathway_membership_by_program_key = defaultdict(set)
-    pathways = ContentMetadata.objects.filter(content_type=LEARNER_PATHWAY).prefetch_related(
+def _precalculate_content_mappings():
+    """
+    Precalculate various mappings between different types of related content.
+
+    Returns:
+        2-tuple(dict):
+            - First element: Mapping of program content_key to list of course run and course ContentMetadata objects.
+            - Second element: Mapping of learner pathway content_key to list of program and course ContentMetadata objects.
+    """
+    program_to_courses_courseruns_mapping = defaultdict(set)
+    pathway_to_programs_courses_mapping = defaultdict(set)
+    courseruns_courses_programs = ContentMetadata.objects.filter(
+        content_type__in=[COURSE_RUN, COURSE, PROGRAM],
+    ).prefetch_related(
         'associated_content_metadata'
     )
-    for pathway in pathways:
-        for associated_content in pathway.associated_content_metadata.all():
-            if associated_content.content_type == COURSE:
-                pathway_membership_by_course_key[associated_content.content_key].add(pathway)
-            elif associated_content.content_type == PROGRAM:
-                pathway_membership_by_program_key[associated_content.content_key].add(pathway)
-    return pathway_membership_by_course_key, pathway_membership_by_program_key
+    for metadata in courseruns_courses_programs:
+        if metadata.content_type == COURSE_RUN:
+            for associated_content in metadata.associated_content_metadata.all():
+                if associated_content.content_type == PROGRAM:
+                    program_to_courses_courseruns_mapping[associated_content.content_key].add(metadata)
+                # TODO: Question: Can pathways map directly to course runs?
+        if metadata.content_type == COURSE:
+            for associated_content in metadata.associated_content_metadata.all():
+                if associated_content.content_type == PROGRAM:
+                    program_to_courses_courseruns_mapping[associated_content.content_key].add(metadata)
+                elif associated_content.content_type == LEARNER_PATHWAY:
+                    pathway_to_programs_courses_mapping[associated_content.content_key].add(metadata)
+        else:
+            for associated_content in metadata.associated_content_metadata.all():
+                if associated_content.content_type == LEARNER_PATHWAY:
+                    pathway_to_programs_courses_mapping[associated_content.content_key].add(metadata)
+
+    return program_to_courses_courseruns_mapping, pathway_to_programs_courses_mapping
 
 
 def add_metadata_to_algolia_objects(
@@ -579,8 +600,8 @@ def add_metadata_to_algolia_objects(
 def _get_algolia_products_for_batch(
     batch_num,
     content_keys_batch,
-    course_to_pathway_mapping,
-    program_to_pathway_mapping,
+    program_to_courses_courseruns_mapping,
+    pathway_to_programs_courses_mapping,
     context_accumulator,
 ):
     """
@@ -598,75 +619,120 @@ def _get_algolia_products_for_batch(
     catalog_uuids_by_key = defaultdict(set)
     customer_uuids_by_key = defaultdict(set)
     catalog_queries_by_key = defaultdict(set)
-    # Retrieve ContentMetadata records for courses, course runs, programs and learner pathways that match the specified
-    # content_keys in the content_key, parent_content_key or associated content metadata's content_key.
-    query = (
+
+    # Create a shared convenience queryset to prefetch catalogs for all metadata lookups below.
+    all_catalog_queries = CatalogQuery.objects.prefetch_related('enterprise_catalogs')
+
+    # Retrieve ContentMetadata records for:
+    # * Course runs, courses, programs and learner pathways that are directly requested, and
+    # * Courses and programs indirectly related to something directly requested.
+    #   - e.g. A course that was not directly requested, but is a member of a program which was requested.
+    #   - e.g. A program that was not directly requested, but is a member of a pathway which was requested.
+    content_metadata_no_courseruns = ContentMetadata.objects.filter(
+        # All content (courses, course runs, programs, pathways) directly requested.
         Q(content_key__in=content_keys_batch)
-        | Q(parent_content_key__in=content_keys_batch)
+        # All course runs, courses, or programs contained in programs or pathways requested.
         | Q(
+            content_type__in=[COURSE_RUN, COURSE, PROGRAM],
+            associated_content_metadata__content_type__in=[PROGRAM, LEARNER_PATHWAY],
             associated_content_metadata__content_key__in=content_keys_batch,
-            content_type__in=[PROGRAM, LEARNER_PATHWAY]
         )
+        # All programs, pathways to which any requested course belongs.
+        | Q(
+            content_type__in=[PROGRAM, LEARNER_PATHWAY],
+            associated_content_metadata__content_type__in=[COURSE, PROGRAM],
+            associated_content_metadata__content_key__in=content_keys_batch,
+        )
+    ).prefetch_related(
+        Prefetch('catalog_queries', queryset=all_catalog_queries),
     )
 
-    catalog_queries = CatalogQuery.objects.prefetch_related(
-        'enterprise_catalogs',
-    )
-    associated_programs_query = ContentMetadata.objects.filter(content_type__in=[PROGRAM, LEARNER_PATHWAY])
-    content_metadata = ContentMetadata.objects.filter(query).prefetch_related(
-        Prefetch('catalog_queries', queryset=catalog_queries),
-        Prefetch('associated_content_metadata', queryset=associated_programs_query, to_attr='associated_programs'),
+    # Retrieve ContentMetadata records for any course run which is part of any course found in the previous query.
+    course_content_keys = content_metadata_no_courseruns.filter(
+        content_type=COURSE,
+    ).values_list('content_key', flat=True)
+    content_metadata_courseruns = ContentMetadata.objects.filter(
+        parent_content_key__in=course_content_keys
+    ).prefetch_related(
+        Prefetch('catalog_queries', queryset=all_catalog_queries),
     )
 
-    # Retrieve the associated enterprise_catalog_uuids and enterprise_customer_uuids for each selected ContentMetadata
-    # record, storing them for later retrieval.
-    for metadata in content_metadata:
+    # Combine both querysets to represent all the ContentMetadata needed to process this batch.
+    #
+    # DEFICIENCY: This final set does not guarantee inclusion of courses (or course runs) indirectly related to a
+    # requested to a pathway via an association chain of course->program->pathway.  This maybe should be added!  When it
+    # is added, a related change must be made in the third pass (below) to chain
+    # `pathway_to_programs_courses_mapping` and `program_to_courses_courseruns_mapping` to actually collect the UUIDs.
+    content_metadata_to_process = content_metadata_no_courseruns.union(content_metadata_courseruns)
+
+    # First pass over the batch of content.  The goal for this pass is to collect all the UUIDs directly associated with
+    # each content.  This DOES NOT capture any UUIDs indirectly related to programs or pathways via associated courses
+    # or programs.
+    for metadata in content_metadata_to_process:
         if metadata.content_type in (COURSE, PROGRAM, LEARNER_PATHWAY):
             content_key = metadata.content_key
         else:
+            # Course runs should contribute their UUIDs to the parent course.
             content_key = metadata.parent_content_key
-
-        associated_queries = metadata.catalog_queries.all()
-        enterprise_catalog_uuids = set()
-        enterprise_customer_uuids = set()
-        enterprise_catalog_queries = set()
-        for query in associated_queries:
-            enterprise_catalog_queries.add((str(query.uuid), query.title))
-            associated_catalogs = query.enterprise_catalogs.all()
+        associated_catalog_queries = metadata.catalog_queries.all()
+        for catalog_query in associated_catalog_queries:
+            catalog_queries_by_key[content_key].add((str(catalog_query.uuid), catalog_query.title))
+            # This line is possible thanks to `all_catalog_queries` with the prefectch_related() above.
+            associated_catalogs = catalog_query.enterprise_catalogs.all()
             for catalog in associated_catalogs:
-                enterprise_catalog_uuids.add(str(catalog.uuid))
-                enterprise_customer_uuids.add(str(catalog.enterprise_uuid))
-
-        # add to any existing enterprise catalog uuids, enterprise customer uuids or catalog query uuids
-        catalog_uuids_by_key[content_key].update(enterprise_catalog_uuids)
-        customer_uuids_by_key[content_key].update(enterprise_customer_uuids)
-        catalog_queries_by_key[content_key].update(enterprise_catalog_queries)
-
-        if metadata.content_type == COURSE:
-            for program_metadata in metadata.associated_programs:
-                content_key = program_metadata.content_key
-                catalog_uuids_by_key[content_key].update(enterprise_catalog_uuids)
-                customer_uuids_by_key[content_key].update(enterprise_customer_uuids)
-                catalog_queries_by_key[content_key].update(enterprise_catalog_queries)
-
-            for pathway in course_to_pathway_mapping[metadata.content_key]:
-                catalog_queries_by_key[pathway.content_key].update(enterprise_catalog_queries)
-                catalog_uuids_by_key[pathway.content_key].update(enterprise_catalog_uuids)
-                customer_uuids_by_key[pathway.content_key].update(enterprise_customer_uuids)
-
-        if metadata.content_type == PROGRAM:
-            for pathway in program_to_pathway_mapping[content_key]:
-                catalog_queries_by_key[pathway.content_key].update(enterprise_catalog_queries)
-                catalog_uuids_by_key[pathway.content_key].update(enterprise_catalog_uuids)
-                customer_uuids_by_key[pathway.content_key].update(enterprise_customer_uuids)
+                catalog_uuids_by_key[content_key].add(str(catalog.uuid))
+                customer_uuids_by_key[content_key].add(str(catalog.enterprise_uuid))
+
+    # Second pass.  This time the goal is to capture indirect relationships on programs:
+    #  * For each program:
+    #    - Absorb all UUIDs associated with every associated course.
+    for metadata in content_metadata_to_process:
+        if metadata.content_type != PROGRAM:
+            continue
+        program_content_key = metadata.content_key
+        for metadata in program_to_courses_courseruns_mapping[program_content_key]:
+            catalog_queries_by_key[program_content_key].update(catalog_queries_by_key[metadata.content_key])
+            catalog_uuids_by_key[program_content_key].update(catalog_uuids_by_key[metadata.content_key])
+            customer_uuids_by_key[program_content_key].update(customer_uuids_by_key[metadata.content_key])
+
+    # Third pass.  This time the goal is to capture indirect relationships on pathways:
+    #  * For each pathway:
+    #    - Absorb all UUIDs associated with every associated course.
+    #    - Absorb all UUIDs associated with every associated program.
+    for metadata in content_metadata_to_process:
+        if metadata.content_type != LEARNER_PATHWAY:
+            continue
+        pathway_content_key = metadata.content_key
+
+        # TODO: Question: Can pathways map to course runs directly?
+        for metadata in pathway_to_programs_courses_mapping[pathway_content_key]:
+            catalog_queries_by_key[pathway_content_key].update(catalog_queries_by_key[metadata.content_key])
+            catalog_uuids_by_key[pathway_content_key].update(catalog_uuids_by_key[metadata.content_key])
+            customer_uuids_by_key[pathway_content_key].update(customer_uuids_by_key[metadata.content_key])
+
+            # Extra disabled logic to additionally absorb UUIDs from courses linked to this pathway indirectly via a
+            # program (chain of association is course -> program -> pathway).  This doesn't work because
+            # content_metadata_to_process queryset for this batch has insuficcient records to support this feature.
+            #
+            # if metadata.content_type == PROGRAM:
+            #     for course_metadata in program_to_courses_mapping[program_metadata.content_key]:
+            #         catalog_queries_by_key[pathway_content_key].update(
+            #             catalog_queries_by_key[course_metadata.content_key]
+            #         )
+            #         catalog_uuids_by_key[pathway_content_key].update(
+            #             catalog_uuids_by_key[course_metadata.content_key]
+            #         )
+            #         customer_uuids_by_key[pathway_content_key].update(
+            #             customer_uuids_by_key[course_metadata.content_key]
+            #         )
 
     # iterate over courses, programs and pathways and add their metadata to the list of objects to be indexed
-    filtered_content_metadata = content_metadata.filter(
-        Q(content_type=COURSE)
-        | Q(content_type=PROGRAM)
-        | Q(content_type=LEARNER_PATHWAY)
+    content_metadata_to_index = (
+        metadata for metadata in content_metadata_to_process
+        if metadata.content_type in [COURSE, PROGRAM, LEARNER_PATHWAY]
     )
-    for metadata in filtered_content_metadata:
+    num_content_metadata_indexed = 0
+    for metadata in content_metadata_to_index:
         # Check if we've indexed the course recently
         # (programs/pathways are indexed every time regardless of last indexing)
         if _was_recently_indexed(metadata.content_key) and metadata.content_type not in [PROGRAM, LEARNER_PATHWAY]:
@@ -684,6 +750,8 @@ def _get_algolia_products_for_batch(
             catalog_queries_by_key[metadata.content_key],
         )
 
+        num_content_metadata_indexed += 1
+
     # In case there are multiple CourseMetadata records that share the exact same content_uuid (which would cause an
     # algolia objectID collision), do not send more than one.  Note that selection of duplicate content is
     # non-deterministic because we do not use order_by() on the queryset.
@@ -703,8 +771,8 @@ def _get_algolia_products_for_batch(
     logger.info(
         f'[ENTERPRISE_CATALOG_ALGOLIA_REINDEX] batch#{batch_num}: '
         f'{len(content_keys_batch)} content keys, '
-        f'{content_metadata.count()} content metadata found, '
-        f'{filtered_content_metadata.count()} content metadata processed, '
+        f'{content_metadata_to_process.count()} content metadata found, '
+        f'{num_content_metadata_indexed} content metadata indexed, '
         f'{len(algolia_products_by_object_id)} generated algolia products kept, '
         f'{duplicate_algolia_records_discarded} generated algolia products discarded.'
     )
@@ -733,7 +801,10 @@ def _index_content_keys_in_algolia(content_keys, algolia_client):
         f'[ENTERPRISE_CATALOG_ALGOLIA_REINDEX] There are {len(content_keys)} total content keys to include in the'
         f' Algolia index.'
     )
-    course_to_pathway_mapping, program_to_pathway_mapping = get_pathways_by_associated_content()
+    (
+        program_to_courses_courseruns_mapping,
+        pathway_to_programs_courses_mapping,
+    ) = _precalculate_content_mappings()
     context_accumulator = {
         'total_algolia_products_count': 0,
         'discarded_algolia_object_ids': defaultdict(int),
@@ -744,8 +815,8 @@ def _index_content_keys_in_algolia(content_keys, algolia_client):
         _get_algolia_products_for_batch(
             batch_num,
             content_keys_batch,
-            course_to_pathway_mapping,
-            program_to_pathway_mapping,
+            program_to_courses_courseruns_mapping,
+            pathway_to_programs_courses_mapping,
             context_accumulator,
         )
         for batch_num, content_keys_batch
diff --git a/enterprise_catalog/apps/api/tests/test_tasks.py b/enterprise_catalog/apps/api/tests/test_tasks.py
index 2243dcf3a..2fb2fda7c 100644
--- a/enterprise_catalog/apps/api/tests/test_tasks.py
+++ b/enterprise_catalog/apps/api/tests/test_tasks.py
@@ -638,8 +638,8 @@ def mock_replace_all_objects(products_iterable):
         def mock_get_algolia_products_for_batch(
             batch_num,
             content_keys_batch,
-            course_to_pathway_mapping,
-            program_to_pathway_mapping,
+            program_to_courses_courseruns_mapping,
+            pathway_to_programs_courses_mapping,
             context_accumulator,
         ):
             return [{'key': content_key, 'foo': 'bar'} for content_key in content_keys_batch]
@@ -660,17 +660,14 @@ def mock_get_algolia_products_for_batch(
             {'key': 'course-v1:edX+testX+4', 'foo': 'bar'},
         ]
 
-    # pylint: disable=too-many-statements
-    @mock.patch('enterprise_catalog.apps.api.tasks._was_recently_indexed',
-                side_effect=[
-                    False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True
-                ])
+    @mock.patch('enterprise_catalog.apps.api.tasks._was_recently_indexed')
     @mock.patch('enterprise_catalog.apps.api.tasks.get_initialized_algolia_client', return_value=mock.MagicMock())
     def test_index_algolia_with_all_uuids(self, mock_search_client, mock_was_recently_indexed):
         """
         Assert that the correct data is sent to Algolia index, with the expected enterprise
         catalog and enterprise customer associations.
         """
+        mock_was_recently_indexed.return_value = False
         algolia_data = self._set_up_factory_data_for_algolia()
         course_associated_program_metadata = ContentMetadataFactory(content_type=PROGRAM, content_key='program-1')
         pathway_program_metadata = ContentMetadataFactory(content_type=PROGRAM, content_key='program-2')
@@ -702,12 +699,9 @@ def mock_replace_all_objects(products_iterable):
         with mock.patch('enterprise_catalog.apps.api.tasks.ALGOLIA_FIELDS', self.ALGOLIA_FIELDS):
             with self.assertLogs(level='INFO') as info_logs:
                 tasks.index_enterprise_catalog_in_algolia_task()  # pylint: disable=no-value-for-parameter
-                # call it a second time, make assertions that only one thing happened below
-                tasks.index_enterprise_catalog_in_algolia_task()  # pylint: disable=no-value-for-parameter
 
         products_found_log_records = [record for record in info_logs.output if ' products found.' in record]
         assert '[ENTERPRISE_CATALOG_ALGOLIA_REINDEX] 15 products found.' in products_found_log_records[0]
-        assert '[ENTERPRISE_CATALOG_ALGOLIA_REINDEX] 12 products found.' in products_found_log_records[1]
 
         # create expected data to be added/updated in the Algolia index.
         expected_algolia_objects_to_index = []
@@ -733,15 +727,15 @@ def mock_replace_all_objects(products_iterable):
         program_uuid = course_associated_program_metadata.json_metadata.get('uuid')
         expected_algolia_program_objects.append({
             'objectID': f'program-{program_uuid}-catalog-uuids-0',
-            'enterprise_catalog_uuids': [str(self.enterprise_catalog_courses.uuid)],
+            'enterprise_catalog_uuids': algolia_data['catalog_uuids'],
         })
         expected_algolia_program_objects.append({
             'objectID': f'program-{program_uuid}-customer-uuids-0',
-            'enterprise_customer_uuids': [str(self.enterprise_catalog_courses.enterprise_uuid)],
+            'enterprise_customer_uuids': algolia_data['customer_uuids'],
         })
         expected_algolia_program_objects.append({
             'objectID': f'program-{program_uuid}-catalog-query-uuids-0',
-            'enterprise_catalog_query_uuids': [str(self.enterprise_catalog_courses.catalog_query.uuid)],
+            'enterprise_catalog_query_uuids': sorted(algolia_data['query_uuids']),
             'enterprise_catalog_query_titles': [self.enterprise_catalog_courses.catalog_query.title],
         })
 
@@ -766,17 +760,17 @@ def mock_replace_all_objects(products_iterable):
         expected_algolia_pathway_objects.append({
             'key': pathway_metadata.content_key,
             'objectID': f'learnerpathway-{pathway_uuid}-catalog-uuids-0',
-            'enterprise_catalog_uuids': [str(self.enterprise_catalog_courses.uuid)],
+            'enterprise_catalog_uuids': algolia_data['catalog_uuids'],
         })
         expected_algolia_pathway_objects.append({
             'key': pathway_metadata.content_key,
             'objectID': f'learnerpathway-{pathway_uuid}-customer-uuids-0',
-            'enterprise_customer_uuids': [str(self.enterprise_catalog_courses.enterprise_uuid)],
+            'enterprise_customer_uuids': algolia_data['customer_uuids'],
         })
         expected_algolia_pathway_objects.append({
             'key': pathway_metadata.content_key,
             'objectID': f'learnerpathway-{pathway_uuid}-catalog-query-uuids-0',
-            'enterprise_catalog_query_uuids': [str(self.enterprise_catalog_courses.catalog_query.uuid)],
+            'enterprise_catalog_query_uuids': sorted(algolia_data['query_uuids']),
             'enterprise_catalog_query_titles': [self.enterprise_catalog_courses.catalog_query.title],
         })
 
@@ -811,40 +805,16 @@ def mock_replace_all_objects(products_iterable):
             actual_algolia_products_sent_sequence[0], key=itemgetter('objectID')
         )
 
-        unsorted_expected_calls_args = expected_algolia_program_objects + expected_algolia_pathway_objects + \
-            expected_algolia_program_objects2 + expected_algolia_pathway_objects2
-        expected_second_call_args = sorted(unsorted_expected_calls_args, key=itemgetter('objectID'))
-        actual_second_call_args = sorted(
-            actual_algolia_products_sent_sequence[1], key=itemgetter('objectID')
-        )
         self.assertEqual(expected_first_call_args, actual_first_call_args)
-        self.assertEqual(expected_second_call_args, actual_second_call_args)
-
-        # Verify that we checked the cache twice, though
-        mock_was_recently_indexed.assert_has_calls([
-            mock.call(self.course_metadata_published.content_key),
-            mock.call(self.course_metadata_published.content_key),
-            mock.call(course_associated_program_metadata.content_key),
-            mock.call(pathway_program_metadata.content_key),
-            mock.call(pathway_metadata.content_key),
-            mock.call(pathway_metadata2.content_key),
-            mock.call(pathway_metadata2.content_key),
-            mock.call(self.course_metadata_published.content_key),
-            mock.call(self.course_metadata_published.content_key),
-            mock.call(course_associated_program_metadata.content_key),
-            mock.call(pathway_program_metadata.content_key),
-            mock.call(pathway_metadata.content_key),
-            mock.call(pathway_metadata2.content_key),
-            mock.call(pathway_metadata2.content_key),
-        ])
 
-    @mock.patch('enterprise_catalog.apps.api.tasks._was_recently_indexed', return_value=False)
+    @mock.patch('enterprise_catalog.apps.api.tasks._was_recently_indexed')
     @mock.patch('enterprise_catalog.apps.api.tasks.get_initialized_algolia_client', return_value=mock.MagicMock())
     def test_index_algolia_with_batched_uuids(self, mock_search_client, mock_was_recently_indexed):
         """
         Assert that the correct data is sent to Algolia index, with the expected enterprise
         catalog, enterprise customer, and catalog query associations.
         """
+        mock_was_recently_indexed.return_value = False
         algolia_data = self._set_up_factory_data_for_algolia()
 
         actual_algolia_products_sent = None
@@ -1023,7 +993,7 @@ def test_index_algolia_dry_run(self, mock_search_client, mock_was_recently_index
         Make sure the dry_run argument functions correctly and does not call replace_all_objects().
         """
         with mock.patch('enterprise_catalog.apps.api.tasks.ALGOLIA_UUID_BATCH_SIZE', 1), \
-             mock.patch('enterprise_catalog.apps.api.tasks.REINDEX_TASK_BATCH_SIZE', 1), \
+             mock.patch('enterprise_catalog.apps.api.tasks.REINDEX_TASK_BATCH_SIZE', 10), \
              mock.patch('enterprise_catalog.apps.api.tasks.ALGOLIA_FIELDS', self.ALGOLIA_FIELDS):
             with self.assertLogs(level='INFO') as info_logs:
                 # For some reason in order to call a celery task in-memory you must pass kwargs as args.