feat: do not index objects that are not indexable

This commit represents a change of behavior from the original implementation. Previously, programs would be indexed just by touching an indexable object, regardless of whether the program passed the `_should_index_program()` test. After this change, ONLY indexable programs are actually indexed. Also add a ton of tests for specific scenarios to validate that claim. ENT-7311
openedx · Jul 31, 2023 · 6fa565d · 6fa565d
1 parent 181d2dd
commit 6fa565d
Show file tree

Hide file tree

Showing 2 changed files with 607 additions and 179 deletions.
diff --git a/enterprise_catalog/apps/api/tasks.py b/enterprise_catalog/apps/api/tasks.py
@@ -609,6 +609,7 @@ def add_metadata_to_algolia_objects(
 def _get_algolia_products_for_batch(
     batch_num,
     content_keys_batch,
+    all_indexable_content_keys,
     program_to_courses_mapping,
     pathway_to_programs_courses_mapping,
     context_accumulator,
@@ -621,6 +622,16 @@ def _get_algolia_products_for_batch(
     Callers can also maintain a fixed memory cap by only keeping a fixed number of output objects in-memory at any given
     time.
 
+    ONLY objects that are indexable are indexed.  Past versions of this function caused programs to be indexed just by
+    touching an indexable course or pathway, but now they will only be indexed if they pass the
+    `_should_index_program()` test.
+
+    Note: course runs are not indexed despite having their UUIDs inherited.
+
+    UUIDs are inherited all the way from course runs to courses to programs to pathways.  Specifically, a given object
+    in the tree will be indexed with union of all UUIDs from every node in the sub-tree.  E.g. a pathway object will be
+    indexed with the UUIDs found on the pathway + program + course + course run beneath.
+
     Returns:
         list of dict: Algolia products to index.
     """
@@ -641,26 +652,26 @@ def _get_algolia_products_for_batch(
     content_metadata_no_courseruns = ContentMetadata.objects.filter(
         # All content (courses, course runs, programs, pathways) directly requested.
         Q(content_key__in=content_keys_batch)
-        # All course runs, courses, or programs contained in programs or pathways requested.
+        # All course runs, courses, or programs contained in programs or pathways requested.  In order to collect all
+        # UUIDs for a given program or pathway, all containing objects are needed too, but those may not happen to be
+        # part of the current batch.
+        # This could include non-indexable content, so they will need to be filtered out next.
         | Q(
             content_type__in=[COURSE_RUN, COURSE, PROGRAM],
             associated_content_metadata__content_type__in=[PROGRAM, LEARNER_PATHWAY],
             associated_content_metadata__content_key__in=content_keys_batch,
         )
-        # All programs, pathways to which any requested course belongs.
-        | Q(
-            content_type__in=[PROGRAM, LEARNER_PATHWAY],
-            associated_content_metadata__content_type__in=[COURSE, PROGRAM],
-            associated_content_metadata__content_key__in=content_keys_batch,
-        )
     ).prefetch_related(
         Prefetch('catalog_queries', queryset=all_catalog_queries),
     )
+    # Perform filtering of non-indexable objects in-memory because the list may be too long to shove into a SQL query.
+    content_metadata_no_courseruns = [
+        cm for cm in content_metadata_no_courseruns
+        if cm.content_key in all_indexable_content_keys
+    ]
 
     # Retrieve ContentMetadata records for any course run which is part of any course found in the previous query.
-    course_content_keys = content_metadata_no_courseruns.filter(
-        content_type=COURSE,
-    ).values_list('content_key', flat=True)
+    course_content_keys = [cm.content_key for cm in content_metadata_no_courseruns]
     content_metadata_courseruns = ContentMetadata.objects.filter(
         parent_content_key__in=course_content_keys
     ).prefetch_related(
@@ -673,7 +684,7 @@ def _get_algolia_products_for_batch(
     # requested to a pathway via an association chain of course->program->pathway.  This maybe should be added!  When it
     # is added, a related change must be made in the third pass (below) to chain
     # `pathway_to_programs_courses_mapping` and `program_to_courses_mapping` to actually collect the UUIDs.
-    content_metadata_to_process = content_metadata_no_courseruns.union(content_metadata_courseruns)
+    content_metadata_to_process = content_metadata_no_courseruns + list(content_metadata_courseruns)
 
     # First pass over the batch of content.  The goal for this pass is to collect all the UUIDs directly associated with
     # each content.  This DOES NOT capture any UUIDs indirectly related to programs or pathways via associated courses
@@ -786,7 +797,7 @@ def _get_algolia_products_for_batch(
         f'{_reindex_algolia_prefix(dry_run)} '
         f'batch#{batch_num}: '
         f'{len(content_keys_batch)} content keys, '
-        f'{content_metadata_to_process.count()} content metadata found, '
+        f'{len(content_metadata_to_process)} content metadata found, '
         f'{num_content_metadata_indexed} content metadata indexed, '
         f'{len(algolia_products_by_object_id)} generated algolia products kept, '
         f'{duplicate_algolia_records_discarded} generated algolia products discarded.'
@@ -830,6 +841,7 @@ def _index_content_keys_in_algolia(content_keys, algolia_client, dry_run=False):
         _get_algolia_products_for_batch(
             batch_num,
             content_keys_batch,
+            content_keys,
             program_to_courses_mapping,
             pathway_to_programs_courses_mapping,
             context_accumulator,