Skip to content

Commit

Permalink
feat: update youtube transcript fetch to allow all languages (#34436)
Browse files Browse the repository at this point in the history
* feat: allow all languages

* feat: add youtube transcript import functions as drf
  • Loading branch information
KristinAoki authored Mar 29, 2024
1 parent 25409de commit 9f734a7
Show file tree
Hide file tree
Showing 11 changed files with 158 additions and 38 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,5 @@
from .advanced_settings import AdvancedSettingsFieldSerializer, CourseAdvancedSettingsSerializer
from .assets import AssetSerializer
from .tabs import CourseTabSerializer, CourseTabUpdateSerializer, TabIDLocatorSerializer
from .transcripts import TranscriptSerializer
from .transcripts import TranscriptSerializer, YoutubeTranscriptCheckSerializer, YoutubeTranscriptUploadSerializer
from .xblock import XblockSerializer
25 changes: 25 additions & 0 deletions cms/djangoapps/contentstore/rest_api/v0/serializers/transcripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,28 @@ class TranscriptSerializer(StrictSerializer):
edx_video_id = serializers.CharField()
language_code = serializers.CharField(required=False, allow_null=True)
new_language_code = serializers.CharField(required=False, allow_null=True)


class YoutubeTranscriptCheckSerializer(StrictSerializer):
    """
    Strict serializer describing the payload of a YouTube transcripts check.

    NOTE(review): the field names appear to mirror the keys of the
    ``transcripts_presence`` dict built by ``check_transcripts`` (e.g.
    ``youtube_local``, ``youtube_server``, ``youtube_diff``) -- confirm
    against that handler before relying on it.
    """
    # List of strings; every other presence flag below is a plain boolean.
    html5_local = serializers.ListField(child=serializers.CharField())
    html5_equal = serializers.BooleanField()
    is_youtube_mode = serializers.BooleanField()
    youtube_local = serializers.BooleanField()
    youtube_server = serializers.BooleanField()
    youtube_diff = serializers.BooleanField()
    # The only optional field: it may be omitted or sent as null.
    current_item_subs = serializers.ListField(required=False, allow_null=True)
    status = serializers.CharField()
    command = serializers.CharField()


class YoutubeTranscriptUploadSerializer(StrictSerializer):
    """
    Strict serializer describing the payload of a YouTube transcripts upload.

    Both fields are required strings.
    NOTE(review): they appear to match the keys of the success response
    returned by ``replace_transcripts`` (``edx_video_id`` and ``status``)
    -- confirm against that handler.
    """
    edx_video_id = serializers.CharField()
    status = serializers.CharField()
21 changes: 18 additions & 3 deletions cms/djangoapps/contentstore/rest_api/v0/urls.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,16 @@

from openedx.core.constants import COURSE_ID_PATTERN

from .views import AdvancedCourseSettingsView, CourseTabSettingsView, CourseTabListView, CourseTabReorderView
from .views import (
AdvancedCourseSettingsView,
CourseTabSettingsView,
CourseTabListView,
CourseTabReorderView,
TranscriptView,
YoutubeTranscriptCheckView,
YoutubeTranscriptUploadView,
)
from .views import assets
from .views import transcripts
from .views import authoring_videos
from .views import xblock

Expand Down Expand Up @@ -68,7 +75,7 @@
),
re_path(
fr'^video_transcripts/{settings.COURSE_ID_PATTERN}$',
transcripts.TranscriptView.as_view(), name='cms_api_video_transcripts'
TranscriptView.as_view(), name='cms_api_video_transcripts'
),
re_path(
fr'^xblock/{settings.COURSE_ID_PATTERN}$',
Expand All @@ -78,4 +85,12 @@
fr'^xblock/{settings.COURSE_ID_PATTERN}/{settings.USAGE_KEY_PATTERN}$',
xblock.XblockView.as_view(), name='cms_api_xblock'
),
re_path(
fr'^youtube_transcripts/{settings.COURSE_ID_PATTERN}/check?$',
YoutubeTranscriptCheckView.as_view(), name='cms_api_youtube_transcripts_check'
),
re_path(
fr'^youtube_transcripts/{settings.COURSE_ID_PATTERN}/upload?$',
YoutubeTranscriptUploadView.as_view(), name='cms_api_youtube_transcripts_upload'
),
]
1 change: 1 addition & 0 deletions cms/djangoapps/contentstore/rest_api/v0/views/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
"""
from .advanced_settings import AdvancedCourseSettingsView
from .tabs import CourseTabSettingsView, CourseTabListView, CourseTabReorderView
from .transcripts import TranscriptView, YoutubeTranscriptCheckView, YoutubeTranscriptUploadView
51 changes: 49 additions & 2 deletions cms/djangoapps/contentstore/rest_api/v0/views/transcripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,14 @@
from common.djangoapps.util.json_request import expect_json_in_class_view

from cms.djangoapps.contentstore.api import course_author_access_required

from cms.djangoapps.contentstore.views.transcripts_ajax import check_transcripts, replace_transcripts
from cms.djangoapps.contentstore.transcript_storage_handlers import (
upload_transcript,
delete_video_transcript_or_404,
handle_transcript_download,
)
import cms.djangoapps.contentstore.toggles as contentstore_toggles
from ..serializers import TranscriptSerializer
from ..serializers import TranscriptSerializer, YoutubeTranscriptCheckSerializer, YoutubeTranscriptUploadSerializer
from rest_framework.parsers import (MultiPartParser, FormParser)
from openedx.core.lib.api.parsers import TypedFileUploadParser

Expand Down Expand Up @@ -68,3 +68,50 @@ def destroy(self, request, course_key_string): # pylint: disable=arguments-diff
"""

return delete_video_transcript_or_404(request)


@view_auth_classes()
class YoutubeTranscriptCheckView(DeveloperErrorViewMixin, RetrieveAPIView):
    """
    Public REST API endpoint of the CMS API for checking YouTube transcripts.

    Request arguments, as listed by the original author (they are not
    validated in this view; the request is handed to ``check_transcripts``):
    youtube_id: required argument, needed to identify the video.
    edx_video_id: required argument, needed to identify the transcript.
    xblock_id: required argument, needed to identify the transcript.
    """
    serializer_class = YoutubeTranscriptCheckSerializer
    parser_classes = (MultiPartParser, FormParser, TypedFileUploadParser)

    def dispatch(self, request, *args, **kwargs):
        """
        Serve the request only while the studio content API toggle is on;
        otherwise respond as if the endpoint did not exist (404).
        """
        if toggles.use_studio_content_api():
            return super().dispatch(request, *args, **kwargs)
        raise Http404

    @course_author_access_required
    def retrieve(self, request, course_key_string):  # pylint: disable=arguments-differ
        """
        Get the status of YouTube transcripts for a given YouTube video.

        The whole request is delegated to ``check_transcripts`` and its
        response is returned unchanged.
        """
        transcripts_status = check_transcripts(request)
        return transcripts_status


@view_auth_classes()
class YoutubeTranscriptUploadView(DeveloperErrorViewMixin, RetrieveAPIView):
    """
    Public REST API endpoint of the CMS API for importing YouTube transcripts.

    Request arguments, as listed by the original author (they are not
    validated in this view; the request is handed to ``replace_transcripts``):
    youtube_id: required argument, needed to identify the video.
    xblock_id: required argument, needed to identify the transcript.
    """
    serializer_class = YoutubeTranscriptUploadSerializer
    parser_classes = (MultiPartParser, FormParser, TypedFileUploadParser)

    def dispatch(self, request, *args, **kwargs):
        """
        Serve the request only while the studio content API toggle is on;
        otherwise respond as if the endpoint did not exist (404).
        """
        if toggles.use_studio_content_api():
            return super().dispatch(request, *args, **kwargs)
        raise Http404

    @course_author_access_required
    def retrieve(self, request, course_key_string):  # pylint: disable=arguments-differ
        """
        Get the YouTube transcripts for a given YouTube video and add them
        to the video block.

        The whole request is delegated to ``replace_transcripts`` and its
        response is returned unchanged.
        """
        upload_result = replace_transcripts(request)
        return upload_result
9 changes: 6 additions & 3 deletions cms/djangoapps/contentstore/tests/test_transcripts_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -464,14 +464,16 @@ def test_youtube_bad_status_code(self, mock_get):
setup_caption_responses(mock_get, 'en', 'test', track_status_code)
youtube_id = 'bad_youtube_id'
with self.assertRaises(transcripts_utils.GetTranscriptsFromYouTubeException):
transcripts_utils.get_transcripts_from_youtube(youtube_id, settings, translation)
link = transcripts_utils.get_transcript_links_from_youtube(youtube_id, settings, translation)
transcripts_utils.get_transcript_from_youtube(link, youtube_id, translation)

@patch('xmodule.video_block.transcripts_utils.requests.get')
def test_youtube_empty_text(self, mock_get):
setup_caption_responses(mock_get, 'en', '')
youtube_id = 'bad_youtube_id'
with self.assertRaises(transcripts_utils.GetTranscriptsFromYouTubeException):
transcripts_utils.get_transcripts_from_youtube(youtube_id, settings, translation)
link = transcripts_utils.get_transcript_links_from_youtube(youtube_id, settings, translation)
transcripts_utils.get_transcript_from_youtube(link, youtube_id, translation)

def test_youtube_good_result(self):
caption_response_string = textwrap.dedent("""<?xml version="1.0" encoding="utf-8" ?>
Expand All @@ -491,7 +493,8 @@ def test_youtube_good_result(self):
language_code = 'en'
with patch('xmodule.video_block.transcripts_utils.requests.get') as mock_get:
setup_caption_responses(mock_get, language_code, caption_response_string)
transcripts = transcripts_utils.get_transcripts_from_youtube(youtube_id, settings, translation)
link = transcripts_utils.get_transcript_links_from_youtube(youtube_id, settings, translation)
transcripts = transcripts_utils.get_transcript_from_youtube(link['en'], youtube_id, translation)

self.assertEqual(transcripts, expected_transcripts)
self.assertEqual(2, len(mock_get.mock_calls))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -628,7 +628,7 @@ def test_rename_transcript_fails_on_unknown_category(self):
@ddt.ddt
@patch(
'cms.djangoapps.contentstore.views.transcripts_ajax.download_youtube_subs',
Mock(return_value=SJSON_TRANSCRIPT_CONTENT)
Mock(return_value=[['en', SJSON_TRANSCRIPT_CONTENT]])
)
class TestReplaceTranscripts(BaseTranscripts):
"""
Expand Down
24 changes: 16 additions & 8 deletions cms/djangoapps/contentstore/views/transcripts_ajax.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,9 @@
get_transcript,
get_transcript_for_video,
get_transcript_from_val,
get_transcripts_from_youtube,
get_transcript_link_from_youtube
get_transcript_from_youtube,
get_transcript_link_from_youtube,
get_transcript_links_from_youtube,
)

__all__ = [
Expand Down Expand Up @@ -345,13 +346,17 @@ def check_transcripts(request): # lint-amnesty, pylint: disable=too-many-statem
#check youtube local and server transcripts for equality
if transcripts_presence['youtube_server'] and transcripts_presence['youtube_local']:
try:
youtube_server_subs = get_transcripts_from_youtube(
transcript_links = get_transcript_links_from_youtube(
youtube_id,
settings,
item.runtime.service(item, "i18n")
)
if json.loads(local_transcripts) == youtube_server_subs: # check transcripts for equality
transcripts_presence['youtube_diff'] = False
for (_, link) in transcript_links.items():
youtube_server_subs = get_transcript_from_youtube(
link, youtube_id, item.runtime.service(item, "i18n")
)
if json.loads(local_transcripts) == youtube_server_subs: # check transcripts for equality
transcripts_presence['youtube_diff'] = False
except GetTranscriptsFromYouTubeException:
pass

Expand Down Expand Up @@ -450,7 +455,6 @@ def _validate_transcripts_data(request):
data = json.loads(request.GET.get('data', '{}'))
if not data:
raise TranscriptsRequestValidationException(_('Incoming video data is empty.'))

try:
item = _get_item(request, data)
except (InvalidKeyError, ItemNotFoundError):
Expand Down Expand Up @@ -512,7 +516,6 @@ def validate_transcripts_request(request, include_yt=False, include_html5=False)
for video in videos
if video['type'] != 'youtube'
}

return error, validated_data


Expand Down Expand Up @@ -622,8 +625,13 @@ def replace_transcripts(request):
# 2. Link a video to video component if its not already linked to one.
edx_video_id = link_video_to_component(video, request.user)

# for transcript in transcript_links:

# 3. Upload YT transcript to DS for the linked video ID.
success = save_video_transcript(edx_video_id, Transcript.SJSON, transcript_content, language_code='en')
success = True
for transcript in transcript_content:
[language_code, json_content] = transcript
success = save_video_transcript(edx_video_id, Transcript.SJSON, json_content, language_code)
if success:
response = JsonResponse({'edx_video_id': edx_video_id, 'status': 'Success'}, status=200)
else:
Expand Down
3 changes: 2 additions & 1 deletion cms/lib/spectacular.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ def cms_api_filter(endpoints):
path.startswith("/api/contentstore/v0/xblock") or
path.startswith("/api/contentstore/v0/videos") or
path.startswith("/api/contentstore/v0/video_transcripts") or
path.startswith("/api/contentstore/v0/file_assets")
path.startswith("/api/contentstore/v0/file_assets") or
path.startswith("/api/contentstore/v0/youtube_transcripts")
):
filtered.append((path, path_regex, method, callback))
return filtered
15 changes: 9 additions & 6 deletions xmodule/tests/test_transcripts_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,21 +110,24 @@ class TranscriptsUtilsTest(TestCase):
"""

@mock.patch('requests.get')
@ddt.data("en", "en-US", "en-GB")
@ddt.data("en", "en-US", "en-GB", 'fr')
def test_get_transcript_link_from_youtube(self, language_code, mock_get):
"""
Happy path test: english caption link returned when video page HTML has one english caption
Happy path test: dict of caption links returned when video page HTML has at least one caption
"""
mock_get.return_value = YoutubeVideoHTMLResponse.with_caption_track(language_code)

language_specific_caption_link = get_transcript_link_from_youtube(YOUTUBE_VIDEO_ID)
self.assertEqual(language_specific_caption_link, CAPTION_URL_UTF8_DECODED_TEMPLATE.format(language_code))
self.assertEqual(
language_specific_caption_link,
{language_code: CAPTION_URL_UTF8_DECODED_TEMPLATE.format(language_code)}
)

@ mock.patch('requests.get')
@ddt.data("fr", None)
def test_get_caption_no_english_caption(self, language_code, mock_get):
@ddt.data(None)
def test_get_caption_no_caption(self, language_code, mock_get):
"""
No caption link returned when video page HTML contains no caption in English
No caption link returned when video page HTML contains no caption
"""
mock_get.return_value = YoutubeVideoHTMLResponse.with_caption_track(language_code)

Expand Down
43 changes: 30 additions & 13 deletions xmodule/video_block/transcripts_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,19 +180,22 @@ def get_transcript_link_from_youtube(youtube_id):
try:
youtube_html = requests.get(f"{youtube_url_base}{youtube_id}")
caption_re = settings.YOUTUBE['TRANSCRIPTS']['CAPTION_TRACKS_REGEX']
allowed_language_codes = settings.YOUTUBE['TRANSCRIPTS']['ALLOWED_LANGUAGE_CODES']
caption_matched = re.search(caption_re, youtube_html.content.decode("utf-8"))
if caption_matched:
caption_tracks = json.loads(f'[{caption_matched.group("caption_tracks")}]')
caption_links = {}
for caption in caption_tracks:
if "languageCode" in caption.keys() and caption["languageCode"] in allowed_language_codes:
return caption.get("baseUrl")
language_code = caption.get('languageCode', None)
if language_code and not language_code == 'None':
link = caption.get("baseUrl")
caption_links[language_code] = link
return None if not caption_links else caption_links
return None
except ConnectionError:
return None


def get_transcripts_from_youtube(youtube_id, settings, i18n, youtube_transcript_name=''): # lint-amnesty, pylint: disable=redefined-outer-name
def get_transcript_links_from_youtube(youtube_id, settings, i18n, youtube_transcript_name=''): # lint-amnesty, pylint: disable=redefined-outer-name
"""
Gets transcript links from youtube for youtube_id.
Expand All @@ -202,18 +205,29 @@ def get_transcripts_from_youtube(youtube_id, settings, i18n, youtube_transcript_
Returns a dict mapping each caption language code to its transcript link; raises GetTranscriptsFromYouTubeException when no link is found.
"""
_ = i18n.gettext
transcript_links = get_transcript_link_from_youtube(youtube_id)

utf8_parser = etree.XMLParser(encoding='utf-8')

transcript_link = get_transcript_link_from_youtube(youtube_id)

if not transcript_link:
if not transcript_links:
msg = _("Can't get transcript link from Youtube for {youtube_id}.").format(
youtube_id=youtube_id,
)
raise GetTranscriptsFromYouTubeException(msg)

data = requests.get(transcript_link)
return transcript_links


def get_transcript_from_youtube(link, youtube_id, i18n):
"""
Gets the transcript located at `link` from youtube for youtube_id.
Parses only utf-8 encoded transcripts.
Other encodings are not supported at the moment.
Returns (status, transcripts): bool, dict.
"""
_ = i18n.gettext
utf8_parser = etree.XMLParser(encoding='utf-8')
data = requests.get(link)

if data.status_code != 200 or not data.text:
msg = _("Can't receive transcripts from Youtube for {youtube_id}. Status code: {status_code}.").format(
Expand Down Expand Up @@ -258,9 +272,12 @@ def download_youtube_subs(youtube_id, video_block, settings): # lint-amnesty, p
"""
i18n = video_block.runtime.service(video_block, "i18n")
_ = i18n.gettext

subs = get_transcripts_from_youtube(youtube_id, settings, i18n)
return json.dumps(subs, indent=2)
transcript_links = get_transcript_links_from_youtube(youtube_id, settings, i18n)
subs = []
for (language_code, link) in transcript_links.items():
sub = get_transcript_from_youtube(link, youtube_id, i18n)
subs.append([language_code, json.dumps(sub, indent=2)])
return subs


def remove_subs_from_store(subs_id, item, lang='en'):
Expand Down

0 comments on commit 9f734a7

Please sign in to comment.