Skip to content

Commit

Permalink
feat: expose functionality to extract unit content
Browse files Browse the repository at this point in the history
  • Loading branch information
alangsto committed Nov 29, 2023
1 parent aafb65c commit 285e0e2
Show file tree
Hide file tree
Showing 6 changed files with 401 additions and 1 deletion.
75 changes: 75 additions & 0 deletions learning_assistant/api.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,16 @@
"""
Library for the learning_assistant app.
"""
from learning_assistant.constants import ACCEPTED_CATEGORY_TYPES, CATEGORY_TYPE_MAP
from learning_assistant.models import CoursePrompt
from learning_assistant.platform_imports import (
block_get_children,
block_leaf_filter,
get_single_block,
get_text_transcript,
traverse_block_pre_order,
)
from learning_assistant.text_utils import html_to_text


def get_deserialized_prompt_content_by_course_id(course_id):
Expand All @@ -23,3 +32,69 @@ def get_setup_messages(course_id):
setup_messages = [{'role': 'system', 'content': x} for x in message_content]
return setup_messages
return None


def _extract_block_contents(child, category):
"""
Process the child contents based on its category.
Returns a string or None if there are no contents available.
"""
if category == 'html':
content_html = child.get_html()
text = html_to_text(content_html)
return text

if category == 'video':
transcript = get_text_transcript(child) # may be None
return transcript

return None


def _leaf_filter(block):
"""
Return only leaf nodes of a particular category.
"""
is_leaf = block_leaf_filter(block)
category = block.category

return is_leaf and category in ACCEPTED_CATEGORY_TYPES


def _get_children_contents(block):
"""
Given a specific block, return the content type and text of a pre-order traversal of the blocks children.
"""
leaf_nodes = traverse_block_pre_order(block, block_get_children, _leaf_filter)

length = 0
items = []

for node in leaf_nodes:
category = node.category
content = _extract_block_contents(node, category)

if content:
length += len(content)
items.append({
'content_type': CATEGORY_TYPE_MAP.get(category),
'content_text': content,
})

return length, items


def get_block_content(request, user_id, course_id, unit_usage_key):
"""
Public wrapper for retrieving the content of a given block's children.
Returns
length - the cummulative length of a block's children's content
items - a list of dictionaries containing the content type and text for each child
"""
block = get_single_block(request, user_id, course_id, unit_usage_key)

length, items = _get_children_contents(block)

return length, items
6 changes: 6 additions & 0 deletions learning_assistant/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,9 @@
EXTERNAL_COURSE_KEY_PATTERN = r'([A-Za-z0-9-_:]+)'

COURSE_ID_PATTERN = rf'(?P<course_id>({INTERNAL_COURSE_KEY_PATTERN}|{EXTERNAL_COURSE_KEY_PATTERN}))'

ACCEPTED_CATEGORY_TYPES = ['html', 'video']
CATEGORY_TYPE_MAP = {
"html": "TEXT",
"video": "VIDEO",
}
47 changes: 47 additions & 0 deletions learning_assistant/platform_imports.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
"""
Contain all imported functions coming out of the platform.
We know these functions will be available at run time, but they
cannot be imported normally.
"""


def get_text_transcript(video_block):
"""Get the transcript for a video block in text format, or None."""
# pylint: disable=import-error, import-outside-toplevel
from xmodule.exceptions import NotFoundError
from xmodule.video_block.transcripts_utils import get_transcript
try:
transcript, _, _ = get_transcript(video_block, output_format='txt')
except NotFoundError:

Check warning on line 16 in learning_assistant/platform_imports.py

View check run for this annotation

Codecov / codecov/patch

learning_assistant/platform_imports.py#L12-L16

Added lines #L12 - L16 were not covered by tests
# some old videos have no transcripts, just accept that reality
return None
return transcript

Check warning on line 19 in learning_assistant/platform_imports.py

View check run for this annotation

Codecov / codecov/patch

learning_assistant/platform_imports.py#L18-L19

Added lines #L18 - L19 were not covered by tests


def get_single_block(request, user_id, course_id, usage_key_string, course=None):
"""Load a single xblock."""
# pylint: disable=import-error, import-outside-toplevel
from lms.djangoapps.courseware.block_renderer import load_single_xblock
return load_single_xblock(request, user_id, course_id, usage_key_string, course)

Check warning on line 26 in learning_assistant/platform_imports.py

View check run for this annotation

Codecov / codecov/patch

learning_assistant/platform_imports.py#L25-L26

Added lines #L25 - L26 were not covered by tests


def traverse_block_pre_order(start_node, get_children, filter_func=None):
"""Traverse a DAG or tree in pre-order."""
# pylint: disable=import-error, import-outside-toplevel
from openedx.core.lib.graph_traversals import traverse_pre_order
return traverse_pre_order(start_node, get_children, filter_func)

Check warning on line 33 in learning_assistant/platform_imports.py

View check run for this annotation

Codecov / codecov/patch

learning_assistant/platform_imports.py#L32-L33

Added lines #L32 - L33 were not covered by tests


def block_leaf_filter(block):
"""Return only leaf nodes."""
# pylint: disable=import-error, import-outside-toplevel
from openedx.core.lib.graph_traversals import leaf_filter
return leaf_filter(block)

Check warning on line 40 in learning_assistant/platform_imports.py

View check run for this annotation

Codecov / codecov/patch

learning_assistant/platform_imports.py#L39-L40

Added lines #L39 - L40 were not covered by tests


def block_get_children(block):
"""Return children of a given block."""
# pylint: disable=import-error, import-outside-toplevel
from openedx.core.lib.graph_traversals import get_children
return get_children(block)
62 changes: 62 additions & 0 deletions learning_assistant/text_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
"""
Text manipulation utils. This has been copied from the ai-aside repository.
"""

from html.parser import HTMLParser
from re import sub

from django.conf import settings


def cleanup_text(text):
"""
Remove litter from replacing or manipulating text.
"""
stripped = sub(r'[^\S\r\n]+', ' ', text) # Removing extra spaces
stripped = sub(r'\n{2,}', '\n', stripped) # Removing extra new lines
stripped = sub(r'(\s+)?\n(\s+)?', '\n', stripped) # Removing starting extra spacesbetween new lines
stripped = sub(r'(^(\s+)\n?)|(\n(\s+)?$)', '', stripped) # Trim

return stripped


class _HTMLToTextHelper(HTMLParser): # lint-amnesty, pylint: disable=abstract-method
"""
Helper function for html_to_text below.
"""

_is_content = True

def __init__(self):
HTMLParser.__init__(self)
self.reset()
self.fed = []

def handle_starttag(self, tag, _):
"""On each tag, check whether this is a tag we think is content."""
tags_to_filter = getattr(settings, 'HTML_TAGS_TO_REMOVE', None)
self._is_content = not (tags_to_filter and tag in tags_to_filter)

def handle_data(self, data):
"""Handle tag data by appending text we think is content."""
if self._is_content:
self.fed.append(data)

def handle_entityref(self, name):
"""If there is an entity, append the reference to the text."""
if self._is_content:
self.fed.append('&%s;' % name)

def get_data(self):
"""Join together the separate data chunks into one cohesive string."""
return ''.join(self.fed)


def html_to_text(html):
"""Strip the html tags off of the text to return plaintext."""
htmlstripper = _HTMLToTextHelper()
htmlstripper.feed(html)
text = htmlstripper.get_data()
text = cleanup_text(text)

return text
155 changes: 154 additions & 1 deletion tests/test_api.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,55 @@
"""
Test cases for the learning-assistant api module.
"""
from unittest.mock import MagicMock, patch

import ddt
from django.test import TestCase
from opaque_keys.edx.keys import UsageKey

from learning_assistant.api import get_deserialized_prompt_content_by_course_id, get_setup_messages
from learning_assistant.api import (
_extract_block_contents,
_get_children_contents,
_leaf_filter,
get_block_content,
get_deserialized_prompt_content_by_course_id,
get_setup_messages,
)
from learning_assistant.models import CoursePrompt

fake_transcript = 'This is the text version from the transcript'


class FakeChild:
"""Fake child block for testing"""
transcript_download_format = 'txt'

def __init__(self, category, test_id='test-id', test_html='<div>This is a test</div>'):
self.category = category
self.published_on = 'published-on-{}'.format(test_id)
self.edited_on = 'edited-on-{}'.format(test_id)
self.scope_ids = lambda: None
self.scope_ids.def_id = 'def-id-{}'.format(test_id)
self.html = test_html
self.transcript = fake_transcript

def get_html(self):
if self.category == 'html':
return self.html

return None


class FakeBlock:
"Fake block for testing, returns given children"
def __init__(self, children):
self.children = children
self.scope_ids = lambda: None
self.scope_ids.usage_id = UsageKey.from_string('block-v1:edX+A+B+type@vertical+block@verticalD')

def get_children(self):
return self.children


class LearningAssistantAPITests(TestCase):
"""
Expand Down Expand Up @@ -38,3 +82,112 @@ def test_get_setup_messages(self):
def test_get_setup_messages_invalid_course_id(self):
setup_messages = get_setup_messages('course-v1:edx+fake+19')
self.assertIsNone(setup_messages)


@ddt.ddt
class GetBlockContentAPITests(TestCase):
"""
Test suite for the get_block_content api function.
"""

def setUp(self):
self.children = [
FakeChild('html', '01', '''
<p>
Lorem ipsum dolor sit amet, consectetur adipiscing elit.
Vivamus dapibus elit lacus, at vehicula arcu vehicula in.
In id felis arcu. Maecenas elit quam, volutpat cursus pharetra vel, tempor at lorem.
Fusce luctus orci quis tempor aliquet.
</p>'''),
FakeChild('html', '02', '''
<Everything on the content on this child is inside a tag, so parsing it would return almost>
Nothing
</The quick brown fox jumps over the lazy dog>'''),
FakeChild('video', '03'),
FakeChild('unknown', '04')
]
self.block = FakeBlock(self.children)

self.course_id = 'course-v1:edx+test+23'

@ddt.data(
('video', True),
('html', True),
('unknown', False)
)
@ddt.unpack
@patch('learning_assistant.api.block_leaf_filter')
def test_block_leaf_filter(self, category, expected_value, mock_leaf_filter):
mock_leaf_filter.return_value = True

block = FakeChild(category)

is_leaf = _leaf_filter(block)
self.assertEqual(is_leaf, expected_value)

@ddt.data(
'video',
'html',
'unknown'
)
@patch('learning_assistant.api.html_to_text')
@patch('learning_assistant.api.get_text_transcript')
def test_extract_block_contents(self, category, mock_html, mock_transcript):
mock_return = 'This is the block content'
mock_html.return_value = mock_return
mock_transcript.return_value = mock_return

block = FakeChild(category)

block_content = _extract_block_contents(block, category)

if category in ['html', 'video']:
self.assertEqual(block_content, mock_return)
else:
self.assertIsNone(block_content)

@patch('learning_assistant.api.traverse_block_pre_order')
@patch('learning_assistant.api.html_to_text')
@patch('learning_assistant.api.get_text_transcript')
def test_get_children_contents(self, mock_transcript, mock_html, mock_traversal):
mock_traversal.return_value = self.children
block_content = 'This is the block content'
mock_html.return_value = block_content
mock_transcript.return_value = block_content

length, items = _get_children_contents(self.block)

expected_items = [
{'content_type': 'TEXT', 'content_text': block_content},
{'content_type': 'TEXT', 'content_text': block_content},
{'content_type': 'VIDEO', 'content_text': block_content}
]

# expected length should be equivalent to the sum of the content length in each of the 3 child blocks
# that are either video or html
self.assertEqual(length, len(block_content) * 3)
self.assertEqual(len(items), 3)
self.assertEqual(items, expected_items)

@patch('learning_assistant.api.get_single_block')
@patch('learning_assistant.api._get_children_contents')
def test_get_block_content(self, mock_get_children_contents, mock_get_single_block):
mock_get_single_block.return_value = self.block

block_content = 'This is the block content'
content_items = [{'content_type': 'TEXT', 'content_text': block_content}]
mock_get_children_contents.return_value = (len(block_content), content_items)

# mock arguments that are passed through to `get_single_block` function. the value of these
# args does not matter for this test right now, as the `get_single_block` function is entirely mocked.
request = MagicMock()
user_id = 1
course_id = self.course_id
unit_usage_key = 'block-v1:edX+A+B+type@vertical+block@verticalD'

length, items = get_block_content(request, user_id, course_id, unit_usage_key)

mock_get_children_contents.assert_called_with(self.block)

self.assertEqual(length, len(block_content))
self.assertEqual(items, content_items)
Loading

0 comments on commit 285e0e2

Please sign in to comment.