From 285e0e287363b0d8a921c0b0d0aa5ecdb1f2a7fd Mon Sep 17 00:00:00 2001
From: Alie Langston <alangsto@wellesley.edu>
Date: Tue, 28 Nov 2023 10:19:15 -0500
Subject: [PATCH] feat: expose functionality to extract unit content

---
 learning_assistant/api.py              |  75 ++++++++++++
 learning_assistant/constants.py        |   6 +
 learning_assistant/platform_imports.py |  47 ++++++++
 learning_assistant/text_utils.py       |  62 ++++++++++
 tests/test_api.py                      | 155 ++++++++++++++++++++++++-
 tests/test_text_utils.py               |  57 +++++++++
 6 files changed, 401 insertions(+), 1 deletion(-)
 create mode 100644 learning_assistant/platform_imports.py
 create mode 100644 learning_assistant/text_utils.py
 create mode 100644 tests/test_text_utils.py

diff --git a/learning_assistant/api.py b/learning_assistant/api.py
index fbc9bf3..17bb9f4 100644
--- a/learning_assistant/api.py
+++ b/learning_assistant/api.py
@@ -1,7 +1,16 @@
 """
 Library for the learning_assistant app.
 """
+from learning_assistant.constants import ACCEPTED_CATEGORY_TYPES, CATEGORY_TYPE_MAP
 from learning_assistant.models import CoursePrompt
+from learning_assistant.platform_imports import (
+    block_get_children,
+    block_leaf_filter,
+    get_single_block,
+    get_text_transcript,
+    traverse_block_pre_order,
+)
+from learning_assistant.text_utils import html_to_text
 
 
 def get_deserialized_prompt_content_by_course_id(course_id):
@@ -23,3 +32,69 @@ def get_setup_messages(course_id):
         setup_messages = [{'role': 'system', 'content': x} for x in message_content]
         return setup_messages
     return None
+
+
+def _extract_block_contents(child, category):
+    """
+    Process the child contents based on its category.
+
+    Returns a string or None if there are no contents available.
+    """
+    if category == 'html':
+        content_html = child.get_html()
+        text = html_to_text(content_html)
+        return text
+
+    if category == 'video':
+        transcript = get_text_transcript(child)  # may be None
+        return transcript
+
+    return None
+
+
+def _leaf_filter(block):
+    """
+    Return only leaf nodes of a particular category.
+    """
+    is_leaf = block_leaf_filter(block)
+    category = block.category
+
+    return is_leaf and category in ACCEPTED_CATEGORY_TYPES
+
+
+def _get_children_contents(block):
+    """
+    Given a specific block, return the content type and text of a pre-order traversal of the blocks children.
+    """
+    leaf_nodes = traverse_block_pre_order(block, block_get_children, _leaf_filter)
+
+    length = 0
+    items = []
+
+    for node in leaf_nodes:
+        category = node.category
+        content = _extract_block_contents(node, category)
+
+        if content:
+            length += len(content)
+            items.append({
+                'content_type': CATEGORY_TYPE_MAP.get(category),
+                'content_text': content,
+            })
+
+    return length, items
+
+
+def get_block_content(request, user_id, course_id, unit_usage_key):
+    """
+    Public wrapper for retrieving the content of a given block's children.
+
+    Returns
+        length - the cummulative length of a block's children's content
+        items - a list of dictionaries containing the content type and text for each child
+    """
+    block = get_single_block(request, user_id, course_id, unit_usage_key)
+
+    length, items = _get_children_contents(block)
+
+    return length, items
diff --git a/learning_assistant/constants.py b/learning_assistant/constants.py
index 580616c..06097ff 100644
--- a/learning_assistant/constants.py
+++ b/learning_assistant/constants.py
@@ -8,3 +8,9 @@
 EXTERNAL_COURSE_KEY_PATTERN = r'([A-Za-z0-9-_:]+)'
 
 COURSE_ID_PATTERN = rf'(?P<course_id>({INTERNAL_COURSE_KEY_PATTERN}|{EXTERNAL_COURSE_KEY_PATTERN}))'
+
+ACCEPTED_CATEGORY_TYPES = ['html', 'video']
+CATEGORY_TYPE_MAP = {
+    "html": "TEXT",
+    "video": "VIDEO",
+}
diff --git a/learning_assistant/platform_imports.py b/learning_assistant/platform_imports.py
new file mode 100644
index 0000000..d29de31
--- /dev/null
+++ b/learning_assistant/platform_imports.py
@@ -0,0 +1,47 @@
+"""
+Contain all imported functions coming out of the platform.
+
+We know these functions will be available at run time, but they
+cannot be imported normally.
+"""
+
+
+def get_text_transcript(video_block):
+    """Get the transcript for a video block in text format, or None."""
+    # pylint: disable=import-error, import-outside-toplevel
+    from xmodule.exceptions import NotFoundError
+    from xmodule.video_block.transcripts_utils import get_transcript
+    try:
+        transcript, _, _ = get_transcript(video_block, output_format='txt')
+    except NotFoundError:
+        # some old videos have no transcripts, just accept that reality
+        return None
+    return transcript
+
+
+def get_single_block(request, user_id, course_id, usage_key_string, course=None):
+    """Load a single xblock."""
+    # pylint: disable=import-error, import-outside-toplevel
+    from lms.djangoapps.courseware.block_renderer import load_single_xblock
+    return load_single_xblock(request, user_id, course_id, usage_key_string, course)
+
+
+def traverse_block_pre_order(start_node, get_children, filter_func=None):
+    """Traverse a DAG or tree in pre-order."""
+    # pylint: disable=import-error, import-outside-toplevel
+    from openedx.core.lib.graph_traversals import traverse_pre_order
+    return traverse_pre_order(start_node, get_children, filter_func)
+
+
+def block_leaf_filter(block):
+    """Return only leaf nodes."""
+    # pylint: disable=import-error, import-outside-toplevel
+    from openedx.core.lib.graph_traversals import leaf_filter
+    return leaf_filter(block)
+
+
+def block_get_children(block):
+    """Return children of a given block."""
+    # pylint: disable=import-error, import-outside-toplevel
+    from openedx.core.lib.graph_traversals import get_children
+    return get_children(block)
diff --git a/learning_assistant/text_utils.py b/learning_assistant/text_utils.py
new file mode 100644
index 0000000..728fd14
--- /dev/null
+++ b/learning_assistant/text_utils.py
@@ -0,0 +1,62 @@
+"""
+Text manipulation utils. This has been copied from the ai-aside repository.
+"""
+
+from html.parser import HTMLParser
+from re import sub
+
+from django.conf import settings
+
+
+def cleanup_text(text):
+    """
+    Remove litter from replacing or manipulating text.
+    """
+    stripped = sub(r'[^\S\r\n]+', ' ', text)  # Removing extra spaces
+    stripped = sub(r'\n{2,}', '\n', stripped)  # Removing extra new lines
+    stripped = sub(r'(\s+)?\n(\s+)?', '\n', stripped)  # Removing starting extra spacesbetween new lines
+    stripped = sub(r'(^(\s+)\n?)|(\n(\s+)?$)', '', stripped)  # Trim
+
+    return stripped
+
+
+class _HTMLToTextHelper(HTMLParser):  # lint-amnesty, pylint: disable=abstract-method
+    """
+    Helper function for html_to_text below.
+    """
+
+    _is_content = True
+
+    def __init__(self):
+        HTMLParser.__init__(self)
+        self.reset()
+        self.fed = []
+
+    def handle_starttag(self, tag, _):
+        """On each tag, check whether this is a tag we think is content."""
+        tags_to_filter = getattr(settings, 'HTML_TAGS_TO_REMOVE', None)
+        self._is_content = not (tags_to_filter and tag in tags_to_filter)
+
+    def handle_data(self, data):
+        """Handle tag data by appending text we think is content."""
+        if self._is_content:
+            self.fed.append(data)
+
+    def handle_entityref(self, name):
+        """If there is an entity, append the reference to the text."""
+        if self._is_content:
+            self.fed.append('&%s;' % name)
+
+    def get_data(self):
+        """Join together the separate data chunks into one cohesive string."""
+        return ''.join(self.fed)
+
+
+def html_to_text(html):
+    """Strip the html tags off of the text to return plaintext."""
+    htmlstripper = _HTMLToTextHelper()
+    htmlstripper.feed(html)
+    text = htmlstripper.get_data()
+    text = cleanup_text(text)
+
+    return text
diff --git a/tests/test_api.py b/tests/test_api.py
index 3316f98..bf198d7 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -1,11 +1,55 @@
 """
 Test cases for the learning-assistant api module.
 """
+from unittest.mock import MagicMock, patch
+
+import ddt
 from django.test import TestCase
+from opaque_keys.edx.keys import UsageKey
 
-from learning_assistant.api import get_deserialized_prompt_content_by_course_id, get_setup_messages
+from learning_assistant.api import (
+    _extract_block_contents,
+    _get_children_contents,
+    _leaf_filter,
+    get_block_content,
+    get_deserialized_prompt_content_by_course_id,
+    get_setup_messages,
+)
 from learning_assistant.models import CoursePrompt
 
+fake_transcript = 'This is the text version from the transcript'
+
+
+class FakeChild:
+    """Fake child block for testing"""
+    transcript_download_format = 'txt'
+
+    def __init__(self, category, test_id='test-id', test_html='<div>This is a test</div>'):
+        self.category = category
+        self.published_on = 'published-on-{}'.format(test_id)
+        self.edited_on = 'edited-on-{}'.format(test_id)
+        self.scope_ids = lambda: None
+        self.scope_ids.def_id = 'def-id-{}'.format(test_id)
+        self.html = test_html
+        self.transcript = fake_transcript
+
+    def get_html(self):
+        if self.category == 'html':
+            return self.html
+
+        return None
+
+
+class FakeBlock:
+    "Fake block for testing, returns given children"
+    def __init__(self, children):
+        self.children = children
+        self.scope_ids = lambda: None
+        self.scope_ids.usage_id = UsageKey.from_string('block-v1:edX+A+B+type@vertical+block@verticalD')
+
+    def get_children(self):
+        return self.children
+
 
 class LearningAssistantAPITests(TestCase):
     """
@@ -38,3 +82,112 @@ def test_get_setup_messages(self):
     def test_get_setup_messages_invalid_course_id(self):
         setup_messages = get_setup_messages('course-v1:edx+fake+19')
         self.assertIsNone(setup_messages)
+
+
+@ddt.ddt
+class GetBlockContentAPITests(TestCase):
+    """
+    Test suite for the get_block_content api function.
+    """
+
+    def setUp(self):
+        self.children = [
+            FakeChild('html', '01', '''
+                        <p>
+                            Lorem ipsum dolor sit amet, consectetur adipiscing elit.
+                            Vivamus dapibus elit lacus, at vehicula arcu vehicula in.
+                            In id felis arcu. Maecenas elit quam, volutpat cursus pharetra vel, tempor at lorem.
+                            Fusce luctus orci quis tempor aliquet.
+                        </p>'''),
+            FakeChild('html', '02', '''
+                        <Everything on the content on this child is inside a tag, so parsing it would return almost>
+                            Nothing
+                        </The quick brown fox jumps over the lazy dog>'''),
+            FakeChild('video', '03'),
+            FakeChild('unknown', '04')
+        ]
+        self.block = FakeBlock(self.children)
+
+        self.course_id = 'course-v1:edx+test+23'
+
+    @ddt.data(
+        ('video', True),
+        ('html', True),
+        ('unknown', False)
+    )
+    @ddt.unpack
+    @patch('learning_assistant.api.block_leaf_filter')
+    def test_block_leaf_filter(self, category, expected_value, mock_leaf_filter):
+        mock_leaf_filter.return_value = True
+
+        block = FakeChild(category)
+
+        is_leaf = _leaf_filter(block)
+        self.assertEqual(is_leaf, expected_value)
+
+    @ddt.data(
+        'video',
+        'html',
+        'unknown'
+    )
+    @patch('learning_assistant.api.html_to_text')
+    @patch('learning_assistant.api.get_text_transcript')
+    def test_extract_block_contents(self, category, mock_html, mock_transcript):
+        mock_return = 'This is the block content'
+        mock_html.return_value = mock_return
+        mock_transcript.return_value = mock_return
+
+        block = FakeChild(category)
+
+        block_content = _extract_block_contents(block, category)
+
+        if category in ['html', 'video']:
+            self.assertEqual(block_content, mock_return)
+        else:
+            self.assertIsNone(block_content)
+
+    @patch('learning_assistant.api.traverse_block_pre_order')
+    @patch('learning_assistant.api.html_to_text')
+    @patch('learning_assistant.api.get_text_transcript')
+    def test_get_children_contents(self, mock_transcript, mock_html, mock_traversal):
+        mock_traversal.return_value = self.children
+        block_content = 'This is the block content'
+        mock_html.return_value = block_content
+        mock_transcript.return_value = block_content
+
+        length, items = _get_children_contents(self.block)
+
+        expected_items = [
+            {'content_type': 'TEXT', 'content_text': block_content},
+            {'content_type': 'TEXT', 'content_text': block_content},
+            {'content_type': 'VIDEO', 'content_text': block_content}
+        ]
+
+        # expected length should be equivalent to the sum of the content length in each of the 3 child blocks
+        # that are either video or html
+        self.assertEqual(length, len(block_content) * 3)
+        self.assertEqual(len(items), 3)
+        self.assertEqual(items, expected_items)
+
+    @patch('learning_assistant.api.get_single_block')
+    @patch('learning_assistant.api._get_children_contents')
+    def test_get_block_content(self, mock_get_children_contents, mock_get_single_block):
+        mock_get_single_block.return_value = self.block
+
+        block_content = 'This is the block content'
+        content_items = [{'content_type': 'TEXT', 'content_text': block_content}]
+        mock_get_children_contents.return_value = (len(block_content), content_items)
+
+        # mock arguments that are passed through to `get_single_block` function. the value of these
+        # args does not matter for this test right now, as the `get_single_block` function is entirely mocked.
+        request = MagicMock()
+        user_id = 1
+        course_id = self.course_id
+        unit_usage_key = 'block-v1:edX+A+B+type@vertical+block@verticalD'
+
+        length, items = get_block_content(request, user_id, course_id, unit_usage_key)
+
+        mock_get_children_contents.assert_called_with(self.block)
+
+        self.assertEqual(length, len(block_content))
+        self.assertEqual(items, content_items)
diff --git a/tests/test_text_utils.py b/tests/test_text_utils.py
new file mode 100644
index 0000000..3fbb244
--- /dev/null
+++ b/tests/test_text_utils.py
@@ -0,0 +1,57 @@
+"""Tests for text utils used by the blocks"""
+import unittest
+from textwrap import dedent
+
+from learning_assistant.text_utils import html_to_text
+
+
+class TestSummaryHookAside(unittest.TestCase):
+    """Tests of text utils as used by the summary hook"""
+    def test_html_to_text(self):
+        html_content = '''\
+            <div>
+                <h1>Lorem Ipsum</h1>
+                <p>Lorem ipsum dolor <em style="font-size: 99pt;">sit amet</em>, consectetur adipiscing elit.</p>
+                <!-- The next paragraph will have an intentional mismatching close tag -->
+                <p>Sed volutpat velit sed dui <i class="something">fringilla</i> fermentum.</div>
+                <p>Nullam quis velit at turpis lacinia convallis.</p>
+            </div>'''
+        expected_text = dedent('''\
+            Lorem Ipsum
+            Lorem ipsum dolor sit amet, consectetur adipiscing elit.
+            Sed volutpat velit sed dui fringilla fermentum.
+            Nullam quis velit at turpis lacinia convallis.''')
+        text = html_to_text(html_content)
+        self.assertEqual(text, expected_text)
+
+    def test_html_to_text_messy(self):
+        html_content = '''\
+            <custom>
+                <mismatching>Lorem Ipsum</tags>
+                > Lorem ipsum dolor sit amet, consectetur adipiscing elit.</p>
+                > Sed volutpat velit sed dui fringilla fermentum.</42>
+                <p>> Nullam quis velit at turpis lacinia convallis.</p>'''
+        expected_text = dedent('''\
+            Lorem Ipsum
+            > Lorem ipsum dolor sit amet, consectetur adipiscing elit.
+            > Sed volutpat velit sed dui fringilla fermentum.
+            > Nullam quis velit at turpis lacinia convallis.''')
+        text = html_to_text(html_content)
+        self.assertEqual(text, expected_text)
+
+    def test_html_to_text_iframe(self):
+        html_content = '''\
+            <iframe
+                src="https://mitx.qualtrics.com/jfe/form/SV_8ocnXlmaDwMRFgF?user_id=%%USER_ID%%"
+                height="1200"
+                width="100%"
+                title="Nothing useful can be parsed from this, maybe fetching the src in the future?"
+            ></iframe>
+            '''
+        expected_text = dedent('')
+        text = html_to_text(html_content)
+        self.assertEqual(text, expected_text)
+
+
+if __name__ == '__main__':
+    unittest.main()