From 285e0e287363b0d8a921c0b0d0aa5ecdb1f2a7fd Mon Sep 17 00:00:00 2001 From: Alie Langston Date: Tue, 28 Nov 2023 10:19:15 -0500 Subject: [PATCH] feat: expose functionality to extract unit content --- learning_assistant/api.py | 75 ++++++++++++ learning_assistant/constants.py | 6 + learning_assistant/platform_imports.py | 47 ++++++++ learning_assistant/text_utils.py | 62 ++++++++++ tests/test_api.py | 155 ++++++++++++++++++++++++- tests/test_text_utils.py | 57 +++++++++ 6 files changed, 401 insertions(+), 1 deletion(-) create mode 100644 learning_assistant/platform_imports.py create mode 100644 learning_assistant/text_utils.py create mode 100644 tests/test_text_utils.py diff --git a/learning_assistant/api.py b/learning_assistant/api.py index fbc9bf3..17bb9f4 100644 --- a/learning_assistant/api.py +++ b/learning_assistant/api.py @@ -1,7 +1,16 @@ """ Library for the learning_assistant app. """ +from learning_assistant.constants import ACCEPTED_CATEGORY_TYPES, CATEGORY_TYPE_MAP from learning_assistant.models import CoursePrompt +from learning_assistant.platform_imports import ( + block_get_children, + block_leaf_filter, + get_single_block, + get_text_transcript, + traverse_block_pre_order, +) +from learning_assistant.text_utils import html_to_text def get_deserialized_prompt_content_by_course_id(course_id): @@ -23,3 +32,69 @@ def get_setup_messages(course_id): setup_messages = [{'role': 'system', 'content': x} for x in message_content] return setup_messages return None + + +def _extract_block_contents(child, category): + """ + Process the child contents based on its category. + + Returns a string or None if there are no contents available. + """ + if category == 'html': + content_html = child.get_html() + text = html_to_text(content_html) + return text + + if category == 'video': + transcript = get_text_transcript(child) # may be None + return transcript + + return None + + +def _leaf_filter(block): + """ + Return only leaf nodes of a particular category. + """ + is_leaf = block_leaf_filter(block) + category = block.category + + return is_leaf and category in ACCEPTED_CATEGORY_TYPES + + +def _get_children_contents(block): + """ + Given a specific block, return the content type and text of a pre-order traversal of the blocks children. + """ + leaf_nodes = traverse_block_pre_order(block, block_get_children, _leaf_filter) + + length = 0 + items = [] + + for node in leaf_nodes: + category = node.category + content = _extract_block_contents(node, category) + + if content: + length += len(content) + items.append({ + 'content_type': CATEGORY_TYPE_MAP.get(category), + 'content_text': content, + }) + + return length, items + + +def get_block_content(request, user_id, course_id, unit_usage_key): + """ + Public wrapper for retrieving the content of a given block's children. + + Returns + length - the cummulative length of a block's children's content + items - a list of dictionaries containing the content type and text for each child + """ + block = get_single_block(request, user_id, course_id, unit_usage_key) + + length, items = _get_children_contents(block) + + return length, items diff --git a/learning_assistant/constants.py b/learning_assistant/constants.py index 580616c..06097ff 100644 --- a/learning_assistant/constants.py +++ b/learning_assistant/constants.py @@ -8,3 +8,9 @@ EXTERNAL_COURSE_KEY_PATTERN = r'([A-Za-z0-9-_:]+)' COURSE_ID_PATTERN = rf'(?P({INTERNAL_COURSE_KEY_PATTERN}|{EXTERNAL_COURSE_KEY_PATTERN}))' + +ACCEPTED_CATEGORY_TYPES = ['html', 'video'] +CATEGORY_TYPE_MAP = { + "html": "TEXT", + "video": "VIDEO", +} diff --git a/learning_assistant/platform_imports.py b/learning_assistant/platform_imports.py new file mode 100644 index 0000000..d29de31 --- /dev/null +++ b/learning_assistant/platform_imports.py @@ -0,0 +1,47 @@ +""" +Contain all imported functions coming out of the platform. + +We know these functions will be available at run time, but they +cannot be imported normally. +""" + + +def get_text_transcript(video_block): + """Get the transcript for a video block in text format, or None.""" + # pylint: disable=import-error, import-outside-toplevel + from xmodule.exceptions import NotFoundError + from xmodule.video_block.transcripts_utils import get_transcript + try: + transcript, _, _ = get_transcript(video_block, output_format='txt') + except NotFoundError: + # some old videos have no transcripts, just accept that reality + return None + return transcript + + +def get_single_block(request, user_id, course_id, usage_key_string, course=None): + """Load a single xblock.""" + # pylint: disable=import-error, import-outside-toplevel + from lms.djangoapps.courseware.block_renderer import load_single_xblock + return load_single_xblock(request, user_id, course_id, usage_key_string, course) + + +def traverse_block_pre_order(start_node, get_children, filter_func=None): + """Traverse a DAG or tree in pre-order.""" + # pylint: disable=import-error, import-outside-toplevel + from openedx.core.lib.graph_traversals import traverse_pre_order + return traverse_pre_order(start_node, get_children, filter_func) + + +def block_leaf_filter(block): + """Return only leaf nodes.""" + # pylint: disable=import-error, import-outside-toplevel + from openedx.core.lib.graph_traversals import leaf_filter + return leaf_filter(block) + + +def block_get_children(block): + """Return children of a given block.""" + # pylint: disable=import-error, import-outside-toplevel + from openedx.core.lib.graph_traversals import get_children + return get_children(block) diff --git a/learning_assistant/text_utils.py b/learning_assistant/text_utils.py new file mode 100644 index 0000000..728fd14 --- /dev/null +++ b/learning_assistant/text_utils.py @@ -0,0 +1,62 @@ +""" +Text manipulation utils. This has been copied from the ai-aside repository. +""" + +from html.parser import HTMLParser +from re import sub + +from django.conf import settings + + +def cleanup_text(text): + """ + Remove litter from replacing or manipulating text. + """ + stripped = sub(r'[^\S\r\n]+', ' ', text) # Removing extra spaces + stripped = sub(r'\n{2,}', '\n', stripped) # Removing extra new lines + stripped = sub(r'(\s+)?\n(\s+)?', '\n', stripped) # Removing starting extra spacesbetween new lines + stripped = sub(r'(^(\s+)\n?)|(\n(\s+)?$)', '', stripped) # Trim + + return stripped + + +class _HTMLToTextHelper(HTMLParser): # lint-amnesty, pylint: disable=abstract-method + """ + Helper function for html_to_text below. + """ + + _is_content = True + + def __init__(self): + HTMLParser.__init__(self) + self.reset() + self.fed = [] + + def handle_starttag(self, tag, _): + """On each tag, check whether this is a tag we think is content.""" + tags_to_filter = getattr(settings, 'HTML_TAGS_TO_REMOVE', None) + self._is_content = not (tags_to_filter and tag in tags_to_filter) + + def handle_data(self, data): + """Handle tag data by appending text we think is content.""" + if self._is_content: + self.fed.append(data) + + def handle_entityref(self, name): + """If there is an entity, append the reference to the text.""" + if self._is_content: + self.fed.append('&%s;' % name) + + def get_data(self): + """Join together the separate data chunks into one cohesive string.""" + return ''.join(self.fed) + + +def html_to_text(html): + """Strip the html tags off of the text to return plaintext.""" + htmlstripper = _HTMLToTextHelper() + htmlstripper.feed(html) + text = htmlstripper.get_data() + text = cleanup_text(text) + + return text diff --git a/tests/test_api.py b/tests/test_api.py index 3316f98..bf198d7 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1,11 +1,55 @@ """ Test cases for the learning-assistant api module. """ +from unittest.mock import MagicMock, patch + +import ddt from django.test import TestCase +from opaque_keys.edx.keys import UsageKey -from learning_assistant.api import get_deserialized_prompt_content_by_course_id, get_setup_messages +from learning_assistant.api import ( + _extract_block_contents, + _get_children_contents, + _leaf_filter, + get_block_content, + get_deserialized_prompt_content_by_course_id, + get_setup_messages, +) from learning_assistant.models import CoursePrompt +fake_transcript = 'This is the text version from the transcript' + + +class FakeChild: + """Fake child block for testing""" + transcript_download_format = 'txt' + + def __init__(self, category, test_id='test-id', test_html='
This is a test
'): + self.category = category + self.published_on = 'published-on-{}'.format(test_id) + self.edited_on = 'edited-on-{}'.format(test_id) + self.scope_ids = lambda: None + self.scope_ids.def_id = 'def-id-{}'.format(test_id) + self.html = test_html + self.transcript = fake_transcript + + def get_html(self): + if self.category == 'html': + return self.html + + return None + + +class FakeBlock: + "Fake block for testing, returns given children" + def __init__(self, children): + self.children = children + self.scope_ids = lambda: None + self.scope_ids.usage_id = UsageKey.from_string('block-v1:edX+A+B+type@vertical+block@verticalD') + + def get_children(self): + return self.children + class LearningAssistantAPITests(TestCase): """ @@ -38,3 +82,112 @@ def test_get_setup_messages(self): def test_get_setup_messages_invalid_course_id(self): setup_messages = get_setup_messages('course-v1:edx+fake+19') self.assertIsNone(setup_messages) + + +@ddt.ddt +class GetBlockContentAPITests(TestCase): + """ + Test suite for the get_block_content api function. + """ + + def setUp(self): + self.children = [ + FakeChild('html', '01', ''' +

+ Lorem ipsum dolor sit amet, consectetur adipiscing elit. + Vivamus dapibus elit lacus, at vehicula arcu vehicula in. + In id felis arcu. Maecenas elit quam, volutpat cursus pharetra vel, tempor at lorem. + Fusce luctus orci quis tempor aliquet. +

'''), + FakeChild('html', '02', ''' + + Nothing + '''), + FakeChild('video', '03'), + FakeChild('unknown', '04') + ] + self.block = FakeBlock(self.children) + + self.course_id = 'course-v1:edx+test+23' + + @ddt.data( + ('video', True), + ('html', True), + ('unknown', False) + ) + @ddt.unpack + @patch('learning_assistant.api.block_leaf_filter') + def test_block_leaf_filter(self, category, expected_value, mock_leaf_filter): + mock_leaf_filter.return_value = True + + block = FakeChild(category) + + is_leaf = _leaf_filter(block) + self.assertEqual(is_leaf, expected_value) + + @ddt.data( + 'video', + 'html', + 'unknown' + ) + @patch('learning_assistant.api.html_to_text') + @patch('learning_assistant.api.get_text_transcript') + def test_extract_block_contents(self, category, mock_html, mock_transcript): + mock_return = 'This is the block content' + mock_html.return_value = mock_return + mock_transcript.return_value = mock_return + + block = FakeChild(category) + + block_content = _extract_block_contents(block, category) + + if category in ['html', 'video']: + self.assertEqual(block_content, mock_return) + else: + self.assertIsNone(block_content) + + @patch('learning_assistant.api.traverse_block_pre_order') + @patch('learning_assistant.api.html_to_text') + @patch('learning_assistant.api.get_text_transcript') + def test_get_children_contents(self, mock_transcript, mock_html, mock_traversal): + mock_traversal.return_value = self.children + block_content = 'This is the block content' + mock_html.return_value = block_content + mock_transcript.return_value = block_content + + length, items = _get_children_contents(self.block) + + expected_items = [ + {'content_type': 'TEXT', 'content_text': block_content}, + {'content_type': 'TEXT', 'content_text': block_content}, + {'content_type': 'VIDEO', 'content_text': block_content} + ] + + # expected length should be equivalent to the sum of the content length in each of the 3 child blocks + # that are either video or html + self.assertEqual(length, len(block_content) * 3) + self.assertEqual(len(items), 3) + self.assertEqual(items, expected_items) + + @patch('learning_assistant.api.get_single_block') + @patch('learning_assistant.api._get_children_contents') + def test_get_block_content(self, mock_get_children_contents, mock_get_single_block): + mock_get_single_block.return_value = self.block + + block_content = 'This is the block content' + content_items = [{'content_type': 'TEXT', 'content_text': block_content}] + mock_get_children_contents.return_value = (len(block_content), content_items) + + # mock arguments that are passed through to `get_single_block` function. the value of these + # args does not matter for this test right now, as the `get_single_block` function is entirely mocked. + request = MagicMock() + user_id = 1 + course_id = self.course_id + unit_usage_key = 'block-v1:edX+A+B+type@vertical+block@verticalD' + + length, items = get_block_content(request, user_id, course_id, unit_usage_key) + + mock_get_children_contents.assert_called_with(self.block) + + self.assertEqual(length, len(block_content)) + self.assertEqual(items, content_items) diff --git a/tests/test_text_utils.py b/tests/test_text_utils.py new file mode 100644 index 0000000..3fbb244 --- /dev/null +++ b/tests/test_text_utils.py @@ -0,0 +1,57 @@ +"""Tests for text utils used by the blocks""" +import unittest +from textwrap import dedent + +from learning_assistant.text_utils import html_to_text + + +class TestSummaryHookAside(unittest.TestCase): + """Tests of text utils as used by the summary hook""" + def test_html_to_text(self): + html_content = '''\ +
+

Lorem Ipsum

+

Lorem ipsum dolor sit amet, consectetur adipiscing elit.

+ +

Sed volutpat velit sed dui fringilla fermentum.

+

Nullam quis velit at turpis lacinia convallis.

+ ''' + expected_text = dedent('''\ + Lorem Ipsum + Lorem ipsum dolor sit amet, consectetur adipiscing elit. + Sed volutpat velit sed dui fringilla fermentum. + Nullam quis velit at turpis lacinia convallis.''') + text = html_to_text(html_content) + self.assertEqual(text, expected_text) + + def test_html_to_text_messy(self): + html_content = '''\ + + Lorem Ipsum + > Lorem ipsum dolor sit amet, consectetur adipiscing elit.

+ > Sed volutpat velit sed dui fringilla fermentum. +

> Nullam quis velit at turpis lacinia convallis.

''' + expected_text = dedent('''\ + Lorem Ipsum + > Lorem ipsum dolor sit amet, consectetur adipiscing elit. + > Sed volutpat velit sed dui fringilla fermentum. + > Nullam quis velit at turpis lacinia convallis.''') + text = html_to_text(html_content) + self.assertEqual(text, expected_text) + + def test_html_to_text_iframe(self): + html_content = '''\ + + ''' + expected_text = dedent('') + text = html_to_text(html_content) + self.assertEqual(text, expected_text) + + +if __name__ == '__main__': + unittest.main()