diff --git a/audio/speech/getting-started/get_started_with_chirp_2_sdk_features.ipynb b/audio/speech/getting-started/get_started_with_chirp_2_sdk_features.ipynb index 8b303a6cb7..d86baa61ce 100644 --- a/audio/speech/getting-started/get_started_with_chirp_2_sdk_features.ipynb +++ b/audio/speech/getting-started/get_started_with_chirp_2_sdk_features.ipynb @@ -1,1032 +1,1013 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ur8xi4C7S06n" - }, - "outputs": [], - "source": [ - "# Copyright 2024 Google LLC\n", - "#\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "JAPoU8Sm5E6e" - }, - "source": [ - "# Get started with Chirp 2 - Advanced features\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Google
Open in Colab\n", - "
\n", - "
\n", - " \n", - " \"Google
Open in Colab Enterprise\n", - "
\n", - "
\n", - " \n", - " \"Vertex
Open in Vertex AI Workbench\n", - "
\n", - "
\n", - " \n", - " \"GitHub
View on GitHub\n", - "
\n", - "
" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "84f0f73a0f76" - }, - "source": [ - "| | |\n", - "|-|-|\n", - "| Author(s) | [Ivan Nardini](https://github.com/inardini) |" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "tvgnzT1CKxrO" - }, - "source": [ - "## Overview\n", - "\n", - "In this tutorial, you learn about how to use [Chirp 2](https://cloud.google.com/speech-to-text/v2/docs/chirp_2-model), the latest generation of Google's multilingual ASR-specific models, and its new features, including word-level timestamps, model adaptation, and speech translation." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "61RBz8LLbxCR" - }, - "source": [ - "## Get started" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "No17Cw5hgx12" - }, - "source": [ - "### Install Speech-to-Text SDK and other required packages\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "e73_ZgKWYedz" - }, - "outputs": [], - "source": [ - "! apt update -y -qq\n", - "! apt install ffmpeg -y -qq" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tFy3H3aPgx12" - }, - "outputs": [], - "source": [ - "%pip install --quiet 'google-cloud-speech' 'protobuf<4.21' 'google-auth==2.27.0' 'pydub' 'etils' 'jiwer' 'ffmpeg-python' 'plotly' 'gradio'" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "R5Xep4W9lq-Z" - }, - "source": [ - "### Restart runtime\n", - "\n", - "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", - "\n", - "The restart might take a minute or longer. After it's restarted, continue to the next step." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XRvKdaPDTznN" - }, - "outputs": [], - "source": [ - "import IPython\n", - "\n", - "app = IPython.Application.instance()\n", - "app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SbmM4z7FOBpM" - }, - "source": [ - "
\n", - "⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️\n", - "
\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "dmWOrTJ3gx13" - }, - "source": [ - "### Authenticate your notebook environment (Colab only)\n", - "\n", - "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NyKGtVQjgx13" - }, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "if \"google.colab\" in sys.modules:\n", - " from google.colab import auth\n", - "\n", - " auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DF4l8DTdWgPY" - }, - "source": [ - "### Set Google Cloud project information and initialize Speech-to-Text V2 SDK\n", - "\n", - "To get started using the Speech-to-Text API, you must have an existing Google Cloud project and [enable the Speech-to-Text API](https://console.cloud.google.com/flows/enableapi?apiid=speech.googleapis.com).\n", - "\n", - "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WIQyBhAn_9tK" - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "PROJECT_ID = \"[your-project-id]\" # @param {type:\"string\", isTemplate: true}\n", - "\n", - "if PROJECT_ID == \"[your-project-id]\":\n", - " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", - "\n", - "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nqwi-5ufWp_B" - }, - "outputs": [], - "source": [ - "from google.api_core.client_options import ClientOptions\n", - "from google.cloud.speech_v2 import SpeechClient\n", - "\n", - "API_ENDPOINT = f\"{LOCATION}-speech.googleapis.com\"\n", - "\n", - "client = SpeechClient(\n", - " client_options=ClientOptions(\n", - " api_endpoint=API_ENDPOINT,\n", - " )\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5303c05f7aa6" - }, - "source": [ - "### Import libraries" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6fc324893334" - }, - "outputs": [], - "source": [ - "from google.cloud.speech_v2.types import cloud_speech\n", - "import gradio as gr" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "qqm0OQpAYCph" - }, - "outputs": [], - "source": [ - "import io\n", - "import os\n", - "\n", - "import IPython.display as ipd\n", - "from etils import epath as ep\n", - "from pydub import AudioSegment" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sP8GBj3tBAC1" - }, - "source": [ - "### Set constants" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rXTVeU1uBBqY" - }, - "outputs": [], - "source": [ - "INPUT_AUDIO_SAMPLE_FILE_URI = (\n", - " \"gs://github-repo/audio_ai/speech_recognition/attention_is_all_you_need_podcast.wav\"\n", - ")\n", - "\n", - "RECOGNIZER = client.recognizer_path(PROJECT_ID, LOCATION, \"_\")\n", - "\n", - "MAX_CHUNK_SIZE = 25600" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "djgFxrGC_Ykd" - }, - "source": [ - "### Helpers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Zih8W_wC_caW" - }, - "outputs": [], - "source": [ - "def read_audio_file(audio_file_path: str) -> bytes:\n", - " \"\"\"\n", - " Read audio file as bytes.\n", - " \"\"\"\n", - " if 
audio_file_path.startswith(\"gs://\"):\n", - " with ep.Path(audio_file_path).open(\"rb\") as f:\n", - " audio_bytes = f.read()\n", - " else:\n", - " with open(audio_file_path, \"rb\") as f:\n", - " audio_bytes = f.read()\n", - " return audio_bytes\n", - "\n", - "\n", - "def save_audio_sample(audio_bytes: bytes, output_file_uri: str) -> None:\n", - " \"\"\"\n", - " Save audio sample as a file in Google Cloud Storage.\n", - " \"\"\"\n", - "\n", - " output_file_path = ep.Path(output_file_uri)\n", - " if not output_file_path.parent.exists():\n", - " output_file_path.parent.mkdir(parents=True, exist_ok=True)\n", - "\n", - " with output_file_path.open(\"wb\") as f:\n", - " f.write(audio_bytes)\n", - "\n", - "\n", - "def extract_audio_sample(audio_bytes: bytes, duration: int) -> bytes:\n", - " \"\"\"\n", - " Extracts a random audio sample of a given duration from an audio file.\n", - " \"\"\"\n", - " audio = AudioSegment.from_file(io.BytesIO(audio_bytes))\n", - " start_time = 0\n", - " audio_sample = audio[start_time : start_time + duration * 1000]\n", - "\n", - " audio_bytes = io.BytesIO()\n", - " audio_sample.export(audio_bytes, format=\"wav\")\n", - " audio_bytes.seek(0)\n", - "\n", - " return audio_bytes.read()\n", - "\n", - "\n", - "def play_audio_sample(audio_bytes: bytes) -> None:\n", - " \"\"\"\n", - " Plays the audio sample in a notebook.\n", - " \"\"\"\n", - " audio_file = io.BytesIO(audio_bytes)\n", - " ipd.display(ipd.Audio(audio_file.read(), rate=44100))\n", - "\n", - "\n", - "def parse_real_time_recognize_response(response) -> list[tuple[str, int]]:\n", - " \"\"\"Parse real-time responses from the Speech-to-Text API\"\"\"\n", - " real_time_recognize_results = []\n", - " for result in response.results:\n", - " real_time_recognize_results.append(\n", - " (result.alternatives[0].transcript, result.result_end_offset)\n", - " )\n", - " return real_time_recognize_results\n", - "\n", - "\n", - "def parse_words_real_time_recognize_response(response):\n", - " \"\"\"\n", - " Parse the word-level results from a real-time speech recognition response.\n", - " \"\"\"\n", - " real_time_recognize_results = []\n", - " for result in response.results:\n", - " for word_info in result.alternatives[0].words:\n", - " word = word_info.word\n", - " start_time = word_info.start_offset.seconds\n", - " end_time = word_info.end_offset.seconds\n", - " real_time_recognize_results.append(\n", - " {\"word\": word, \"start\": start_time, \"end\": end_time}\n", - " )\n", - " return real_time_recognize_results\n", - "\n", - "\n", - "def print_transcription(\n", - " audio_sample_bytes: bytes, transcriptions: str, play_audio=True\n", - ") -> None:\n", - " \"\"\"Prettify the play of the audio and the associated print of the transcription text in a notebook\"\"\"\n", - "\n", - " if play_audio:\n", - " # Play the audio sample\n", - " display(ipd.HTML(\"Audio:\"))\n", - " play_audio_sample(audio_sample_bytes)\n", - " display(ipd.HTML(\"
\"))\n", - "\n", - " # Display the transcription text\n", - " display(ipd.HTML(\"Transcription:\"))\n", - " for transcription, _ in transcriptions:\n", - " formatted_text = f\"
{transcription}
\"\n", - " display(ipd.HTML(formatted_text))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "y4MO5i9X4yq3" - }, - "source": [ - "### Prepare audio samples\n", - "\n", - "The podcast audio is ~ 8 mins. Depending on the audio length, you can use different transcribe API methods. To learn more, check out the official documentation. " - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4uTeBXo6dZlS" - }, - "source": [ - "#### Read the audio file\n", - "\n", - "Let's start reading the input audio sample you want to transcribe.\n", - "\n", - "In this case, it is a podcast generated with NotebookLM about the \"Attention is all you need\" [paper](https://arxiv.org/abs/1706.03762)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pjzwMWqpdldM" - }, - "outputs": [], - "source": [ - "input_audio_bytes = read_audio_file(INPUT_AUDIO_SAMPLE_FILE_URI)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "cIAl9Lyd4niN" - }, - "source": [ - "#### Prepare a short audio sample (< 1 min)\n", - "\n", - "Extract a short audio sample from the original one for streaming and real-time audio processing." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "MofmWRSH4niO" - }, - "outputs": [], - "source": [ - "short_audio_sample_bytes = extract_audio_sample(input_audio_bytes, 30)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "AC2YeY7v4niO" - }, - "outputs": [], - "source": [ - "play_audio_sample(short_audio_sample_bytes)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "VPVDNRyVxquo" - }, - "source": [ - "## Improve transcription using Chirp 2's word-timing and speech adaptation features\n", - "\n", - "Chirp 2 supports word-level timestamps for each transcribed word and speech adaptation to help the model improving recognition accuracy for specific terms or proper nouns." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "oYCgDay2hAgB" - }, - "source": [ - "### Perform real-time speech recognition with word-timing" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "F83r9aiNhAgD" - }, - "source": [ - "#### Define real-time recognition configuration with `enable_word_time_offsets` parameter.\n", - "\n", - "You define the real-time recognition configuration which allows you to set the model to use, language code of the audio and more.\n", - "\n", - "In this case, you enable word timing feature. When True, the top result includes a list of words and the start and end time offsets (timestamps) for those words." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "j0WprZ-phAgD" - }, - "outputs": [], - "source": [ - "wt_real_time_config = cloud_speech.RecognitionConfig(\n", - " auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),\n", - " language_codes=[\"en-US\"],\n", - " model=\"chirp_2\",\n", - " features=cloud_speech.RecognitionFeatures(\n", - " enable_word_time_offsets=True,\n", - " enable_automatic_punctuation=True,\n", - " ),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "r2TqksAqhAgD" - }, - "source": [ - "#### Define the real-time request configuration\n", - "\n", - "Next, you define the real-time request passing the configuration and the audio sample you want to transcribe.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nh55mSzXhAgD" - }, - "outputs": [], - "source": [ - "wt_real_time_request = cloud_speech.RecognizeRequest(\n", - " config=wt_real_time_config, content=short_audio_sample_bytes, recognizer=RECOGNIZER\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "817YXVBli0aY" - }, - "source": [ - "#### Run the real-time recognition request\n", - "\n", - "Finally you submit the real-time recognition request." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rc0cBrVsi7UG" - }, - "outputs": [], - "source": [ - "wt_response = client.recognize(request=wt_real_time_request)\n", - "wt_real_time_recognize_results = parse_real_time_recognize_response(wt_response)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "J2vpMSv7CZ_2" - }, - "source": [ - "And you use a helper function to visualize transcriptions and the associated streams." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ezH51rLH4CBR" - }, - "outputs": [], - "source": [ - "for transcription, _ in wt_real_time_recognize_results:\n", - " print_transcription(short_audio_sample_bytes, transcription)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "iFhUcPcO-Zeh" - }, - "source": [ - "#### Visualize word timings" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mhH42sab-3Tg" - }, - "outputs": [], - "source": [ - "n = 10\n", - "word_timings = parse_words_real_time_recognize_response(wt_response)\n", - "for word_info in word_timings[:n]:\n", - " print(\n", - " f\"Word: {word_info['word']} - Start: {word_info['start']} sec - End: {word_info['end']} sec\"\n", - " )" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IFOq3SK0qOT_" - }, - "source": [ - "### Improve real-time speech recognition accuracy with model adaptation\n", - "\n", - "So far, Chirp 2 transcribes the podcast correctly. That's in part because podcasts are recorded in ideal enviroments like a recording studio. But that's not always the case. For example, suppose that your audio data is recorded in noisy environment or the recording has strong accents or someone speaks quickly.\n", - "\n", - "To handle this and many other scenarios and improve real-time speech recognition accuracy, you can use model adaptation. To enable model adaptation with Chirp 2, you use the `adaptation` parameter.\n", - "\n", - "With `adaptation` parameter, you provide \"hints\" to the speech recognizer to favor specific words and phrases (`AdaptationPhraseSet` class) in the results. 
And for each hint you can define a hint boost which is the probability that a specific word or phrase will be recognized over other similar sounding phrases. Be careful to use higher boost. Higher the boost, higher is the chance of false positive recognition as well. We recommend using a binary search approach to finding the optimal value for your use case as well as adding phrases both with and without boost to your requests.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1--AAmsYq-vG" - }, - "source": [ - "#### Define real-time recognition configuration with `adaptation` parameter\n", - "\n", - "You define a new real-time recognition configuration which includes the `adaptation` configuration.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "qUr76NRcq-vH" - }, - "outputs": [], - "source": [ - "adaptation_real_time_config = cloud_speech.RecognitionConfig(\n", - " auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),\n", - " language_codes=[\"en-US\"],\n", - " model=\"chirp_2\",\n", - " features=cloud_speech.RecognitionFeatures(\n", - " enable_automatic_punctuation=True,\n", - " ),\n", - " adaptation=cloud_speech.SpeechAdaptation(\n", - " phrase_sets=[\n", - " cloud_speech.SpeechAdaptation.AdaptationPhraseSet(\n", - " inline_phrase_set=cloud_speech.PhraseSet(\n", - " phrases=[\n", - " {\n", - " \"value\": \"you know\", # often mumbled or spoken quickly\n", - " \"boost\": 10.0,\n", - " },\n", - " {\n", - " \"value\": \"what are they called again?\" # hesitations and changes in intonation\n", - " },\n", - " {\n", - " \"value\": \"Yeah, it's wild.\" # short interjections have brevity and the emotional inflection\n", - " },\n", - " ]\n", - " )\n", - " )\n", - " ]\n", - " ),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2Lm8e-2Cq-vH" - }, - "source": [ - "#### Define the real-time request configuration" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pH9ZxWFIq-vH" - }, - "outputs": [], - "source": [ - "adaptation_real_time_request = cloud_speech.RecognizeRequest(\n", - " config=adaptation_real_time_config,\n", - " content=short_audio_sample_bytes,\n", - " recognizer=RECOGNIZER,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xaQSQMZZq-vH" - }, - "source": [ - "#### Run the real-time recognition request" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "KYiCZjDWq-vH" - }, - "outputs": [], - "source": [ - "adapted_response = client.recognize(request=adaptation_real_time_request)\n", - "adapted_real_time_recognize_results = parse_real_time_recognize_response(\n", - " adapted_response\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "xOjvJHHmq-vH" - }, - "source": [ - "And you use a helper function to visualize transcriptions and the associated streams." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7Xipn8H4q-vH" - }, - "outputs": [], - "source": [ - "for transcription, _ in adapted_real_time_recognize_results:\n", - " print_transcription(short_audio_sample_bytes, transcription)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "s0sIpQmJr40s" - }, - "source": [ - "## Transcript and translate using language-agnostic transcription and language translation\n", - "\n", - "Chirp 2 supports language-agnostic audio transcription and language translation. 
This means that Chirp 2 is capable of recognizing the language of the input audio and, at the same time, translate the outcome transcription in many different language.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "5QpacKEDRStJ" - }, - "source": [ - "#### Define real-time recognition configuration with `language_code` and `translationConfig` parameters.\n", - "\n", - "You define a real-time recognition configuration by setting language codes in both `language_codes` and `translationConfig` parameters :\n", - "\n", - "* When `language_codes=[\"auto\"]`, you enable language-agnostic transcription to auto to detect language.\n", - "\n", - "* When `target_language=language_code` where `language_code` is one of the language in this list but different from the original language, you enable language translation." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "9vaW49XqUD2v" - }, - "outputs": [], - "source": [ - "target_language_code = \"ca-ES\" # @param {type:\"string\", isTemplate: true}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y3Z_vaKpRStK" - }, - "outputs": [], - "source": [ - "ts_real_time_config = cloud_speech.RecognitionConfig(\n", - " auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),\n", - " language_codes=[\"en-US\"],\n", - " translation_config=cloud_speech.TranslationConfig(\n", - " target_language=target_language_code\n", - " ),\n", - " model=\"chirp_2\",\n", - " features=cloud_speech.RecognitionFeatures(\n", - " enable_automatic_punctuation=True,\n", - " ),\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nPGCDLWARStK" - }, - "source": [ - "#### Define the real-time request configuration" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5_spCiHDRStK" - }, - "outputs": [], - "source": [ - "ts_real_time_request = cloud_speech.RecognizeRequest(\n", - " config=ts_real_time_config, content=short_audio_sample_bytes, recognizer=RECOGNIZER\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Mzr69sLjRStK" - }, - "source": [ - "#### Run the real-time recognition request" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "NvcPOWLkRStK" - }, - "outputs": [], - "source": [ - "ts_response = client.recognize(request=ts_real_time_request)\n", - "ts_real_time_recognize_results = parse_real_time_recognize_response(ts_response)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "STjng1ZiRStK" - }, - "source": [ - "And you use a helper function to visualize transcriptions and the associated streams." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "EhTgg3qwRStL" - }, - "outputs": [], - "source": [ - "print_transcription(short_audio_sample_bytes, transcription, play_audio=False)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_MkA144iQrAn" - }, - "source": [ - "## Chirp 2 playground\n", - "\n", - "To play with Chirp 2, you can create a simple Gradio application where you enable several Chirp 2 features.\n", - "\n", - "Below you have an example for language-agnostic transcription and language translation with Chirp 2.\n", - "\n", - "To know more, check out the official documentation [here](https://cloud.google.com/speech-to-text/v2/docs/chirp_2-model).\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WjuuZHTbQwkF" - }, - "outputs": [], - "source": [ - "def transcribe_audio(audio, enable_translation, target_language_code):\n", - " \"\"\"Transcribe the given audio file with optional features.\"\"\"\n", - "\n", - " # Set variables\n", - " project_id = os.environ.get(\"GOOGLE_CLOUD_PROJECT\", PROJECT_ID)\n", - " location = os.environ.get(\"GOOGLE_CLOUD_REGION\", LOCATION)\n", - " api_endpoint = f\"{location}-speech.googleapis.com\"\n", - "\n", - " # initiate client\n", - " client = SpeechClient(\n", - " client_options=ClientOptions(\n", - " api_endpoint=api_endpoint,\n", - " )\n", - " )\n", - "\n", - " # read the audio\n", - " with open(audio, \"rb\") as audio_file:\n", - " content = audio_file.read()\n", - "\n", - " # define language agnostic real time recognition configuration\n", - " real_time_config = cloud_speech.RecognitionConfig(\n", - " model=\"chirp_2\",\n", - " language_codes=[\"auto\"],\n", - " features=cloud_speech.RecognitionFeatures(\n", - " enable_automatic_punctuation=True,\n", - " ),\n", - " auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),\n", - " )\n", - "\n", - " if enable_translation:\n", - " real_time_config.language_codes = [\"en-US\"]\n", - " real_time_config.translation_config = cloud_speech.TranslationConfig(\n", - " target_language=target_language_code\n", - " )\n", - "\n", - " # define real-time recognition request\n", - " recognizer = client.recognizer_path(project_id, location, \"_\")\n", - "\n", - " real_time_request = cloud_speech.RecognizeRequest(\n", - " config=real_time_config,\n", - " content=content,\n", - " recognizer=recognizer,\n", - " )\n", - "\n", - " response = client.recognize(request=real_time_request)\n", - "\n", - " full_transcript = \"\"\n", - " for result in response.results:\n", - " full_transcript += result.alternatives[0].transcript + \" \"\n", - " return full_transcript.strip()\n", - "\n", - "\n", - "def speech_to_text(audio, enable_translation=False, target_language_code=None):\n", - " if audio is None:\n", - " return \"\"\n", - "\n", - " text = transcribe_audio(audio, enable_translation, target_language_code)\n", - " return text" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "HQzUnSkErDTN" - }, - "outputs": [], - "source": [ - "# Create Gradio interface\n", - "demo = gr.Interface(\n", - " fn=speech_to_text,\n", - " inputs=[\n", - " gr.Audio(type=\"filepath\", label=\"Audio input\"),\n", - " gr.Checkbox(label=\"🧠 Enable Translation\"),\n", - " gr.Dropdown(\n", - " label=\"Select language to translate\",\n", - " choices=[\"ca-ES\", \"cy-GB\", \"de-DE\", \"ja-JP\", \"zh-Hans-CN\"],\n", - " interactive=True,\n", - " multiselect=False,\n", - " ),\n", - " ],\n", - " 
outputs=[gr.Textbox(label=\"📄 Transcription\")],\n", - " title=\"Chirp 2 Playground\",\n", - " description=\"

Speak or upload an audio file and get the transcription!

\",\n", - ")\n", - "\n", - "# Launch the app\n", - "demo.launch()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "XqTpn06QrEiZ" - }, - "outputs": [], - "source": [ - "demo.close()" - ] - } - ], - "metadata": { - "colab": { - "name": "get_started_with_chirp_2_sdk_features.ipynb", - "toc_visible": true - }, - "environment": { - "kernel": "python3", - "name": "tf2-cpu.2-11.m125", - "type": "gcloud", - "uri": "us-docker.pkg.dev/deeplearning-platform-release/gcr.io/tf2-cpu.2-11:m125" - }, - "kernelspec": { - "display_name": "Python 3 (Local)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.15" - } - }, - "nbformat": 4, - "nbformat_minor": 4 + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ur8xi4C7S06n" + }, + "outputs": [], + "source": [ + "# Copyright 2024 Google LLC\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "JAPoU8Sm5E6e" + }, + "source": [ + "# Get started with Chirp 2 - Advanced features\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Google
Open in Colab\n", + "
\n", + "
\n", + " \n", + " \"Google
Open in Colab Enterprise\n", + "
\n", + "
\n", + " \n", + " \"Vertex
Open in Vertex AI Workbench\n", + "
\n", + "
\n", + " \n", + " \"GitHub
View on GitHub\n", + "
\n", + "
" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "84f0f73a0f76" + }, + "source": [ + "| | |\n", + "|-|-|\n", + "| Author(s) | [Ivan Nardini](https://github.com/inardini) |" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "tvgnzT1CKxrO" + }, + "source": [ + "## Overview\n", + "\n", + "In this tutorial, you learn about how to use [Chirp 2](https://cloud.google.com/speech-to-text/v2/docs/chirp_2-model), the latest generation of Google's multilingual ASR-specific models, and its new features, including word-level timestamps, model adaptation, and speech translation." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "61RBz8LLbxCR" + }, + "source": [ + "## Get started" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "No17Cw5hgx12" + }, + "source": [ + "### Install Speech-to-Text SDK and other required packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "e73_ZgKWYedz" + }, + "outputs": [], + "source": [ + "! apt update -y -qq\n", + "! apt install ffmpeg -y -qq" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tFy3H3aPgx12" + }, + "outputs": [], + "source": [ + "%pip install --quiet 'google-cloud-speech' 'protobuf<4.21' 'google-auth==2.27.0' 'pydub' 'etils' 'jiwer' 'ffmpeg-python' 'plotly' 'gradio'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "R5Xep4W9lq-Z" + }, + "source": [ + "### Restart runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which restarts the current kernel.\n", + "\n", + "The restart might take a minute or longer. After it's restarted, continue to the next step." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XRvKdaPDTznN" + }, + "outputs": [], + "source": [ + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SbmM4z7FOBpM" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. Wait until it's finished before continuing to the next step. ⚠️\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dmWOrTJ3gx13" + }, + "source": [ + "### Authenticate your notebook environment (Colab only)\n", + "\n", + "If you're running this notebook on Google Colab, run the cell below to authenticate your environment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NyKGtVQjgx13" + }, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "if \"google.colab\" in sys.modules:\n", + " from google.colab import auth\n", + "\n", + " auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DF4l8DTdWgPY" + }, + "source": [ + "### Set Google Cloud project information and initialize Speech-to-Text V2 SDK\n", + "\n", + "To get started using the Speech-to-Text API, you must have an existing Google Cloud project and [enable the Speech-to-Text API](https://console.cloud.google.com/flows/enableapi?apiid=speech.googleapis.com).\n", + "\n", + "Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WIQyBhAn_9tK" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "PROJECT_ID = \"[your-project-id]\" # @param {type:\"string\", isTemplate: true}\n", + "\n", + "if PROJECT_ID == \"[your-project-id]\":\n", + " PROJECT_ID = str(os.environ.get(\"GOOGLE_CLOUD_PROJECT\"))\n", + "\n", + "LOCATION = os.environ.get(\"GOOGLE_CLOUD_REGION\", \"us-central1\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nqwi-5ufWp_B" + }, + "outputs": [], + "source": [ + "from google.api_core.client_options import ClientOptions\n", + "from google.cloud.speech_v2 import SpeechClient\n", + "\n", + "API_ENDPOINT = f\"{LOCATION}-speech.googleapis.com\"\n", + "\n", + "client = SpeechClient(\n", + " client_options=ClientOptions(\n", + " api_endpoint=API_ENDPOINT,\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5303c05f7aa6" + }, + "source": [ + "### Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "6fc324893334" + }, + "outputs": [], + "source": [ + "from google.cloud.speech_v2.types import cloud_speech\n", + "import gradio as gr" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qqm0OQpAYCph" + }, + "outputs": [], + "source": [ + "import io\n", + "import os\n", + "\n", + "import IPython.display as ipd\n", + "from etils import epath as ep\n", + "from pydub import AudioSegment" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sP8GBj3tBAC1" + }, + "source": [ + "### Set constants" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rXTVeU1uBBqY" + }, + "outputs": [], + "source": [ + "INPUT_AUDIO_SAMPLE_FILE_URI = (\n", + " \"gs://github-repo/audio_ai/speech_recognition/attention_is_all_you_need_podcast.wav\"\n", + ")\n", + "\n", + "RECOGNIZER = client.recognizer_path(PROJECT_ID, LOCATION, \"_\")\n", + "\n", + "MAX_CHUNK_SIZE = 25600" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "djgFxrGC_Ykd" + }, + "source": [ + "### Helpers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Zih8W_wC_caW" + }, + "outputs": [], + "source": [ + "def read_audio_file(audio_file_path: str) -> bytes:\n", + " \"\"\"\n", + " Read audio file as bytes.\n", + " \"\"\"\n", + " if 
audio_file_path.startswith(\"gs://\"):\n", + " with ep.Path(audio_file_path).open(\"rb\") as f:\n", + " audio_bytes = f.read()\n", + " else:\n", + " with open(audio_file_path, \"rb\") as f:\n", + " audio_bytes = f.read()\n", + " return audio_bytes\n", + "\n", + "\n", + "def save_audio_sample(audio_bytes: bytes, output_file_uri: str) -> None:\n", + " \"\"\"\n", + " Save audio sample as a file in Google Cloud Storage.\n", + " \"\"\"\n", + "\n", + " output_file_path = ep.Path(output_file_uri)\n", + " if not output_file_path.parent.exists():\n", + " output_file_path.parent.mkdir(parents=True, exist_ok=True)\n", + "\n", + " with output_file_path.open(\"wb\") as f:\n", + " f.write(audio_bytes)\n", + "\n", + "\n", + "def extract_audio_sample(audio_bytes: bytes, duration: int) -> bytes:\n", + " \"\"\"\n", + " Extracts a random audio sample of a given duration from an audio file.\n", + " \"\"\"\n", + " audio = AudioSegment.from_file(io.BytesIO(audio_bytes))\n", + " start_time = 0\n", + " audio_sample = audio[start_time : start_time + duration * 1000]\n", + "\n", + " audio_bytes = io.BytesIO()\n", + " audio_sample.export(audio_bytes, format=\"wav\")\n", + " audio_bytes.seek(0)\n", + "\n", + " return audio_bytes.read()\n", + "\n", + "\n", + "def play_audio_sample(audio_bytes: bytes) -> None:\n", + " \"\"\"\n", + " Plays the audio sample in a notebook.\n", + " \"\"\"\n", + " audio_file = io.BytesIO(audio_bytes)\n", + " ipd.display(ipd.Audio(audio_file.read(), rate=44100))\n", + "\n", + "\n", + "def parse_real_time_recognize_response(response) -> list[tuple[str, int]]:\n", + " \"\"\"Parse real-time responses from the Speech-to-Text API\"\"\"\n", + " real_time_recognize_results = []\n", + " for result in response.results:\n", + " real_time_recognize_results.append(\n", + " (result.alternatives[0].transcript, result.result_end_offset)\n", + " )\n", + " return real_time_recognize_results\n", + "\n", + "\n", + "def parse_words_real_time_recognize_response(response):\n", + " \"\"\"\n", + " Parse the word-level results from a real-time speech recognition response.\n", + " \"\"\"\n", + " real_time_recognize_results = []\n", + " for result in response.results:\n", + " for word_info in result.alternatives[0].words:\n", + " word = word_info.word\n", + " start_time = word_info.start_offset.seconds\n", + " end_time = word_info.end_offset.seconds\n", + " real_time_recognize_results.append(\n", + " {\"word\": word, \"start\": start_time, \"end\": end_time}\n", + " )\n", + " return real_time_recognize_results\n", + "\n", + "\n", + "def print_transcription(\n", + " audio_sample_bytes: bytes, transcriptions: str, play_audio=True\n", + ") -> None:\n", + " \"\"\"Prettify the play of the audio and the associated print of the transcription text in a notebook\"\"\"\n", + "\n", + " if play_audio:\n", + " # Play the audio sample\n", + " display(ipd.HTML(\"Audio:\"))\n", + " play_audio_sample(audio_sample_bytes)\n", + " display(ipd.HTML(\"
\"))\n", + "\n", + " # Display the transcription text\n", + " display(ipd.HTML(\"Transcription:\"))\n", + " for transcription, _ in transcriptions:\n", + " formatted_text = f\"
{transcription}
\"\n", + " display(ipd.HTML(formatted_text))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "y4MO5i9X4yq3" + }, + "source": [ + "### Prepare audio samples\n", + "\n", + "The podcast audio is ~ 8 mins. Depending on the audio length, you can use different transcribe API methods. To learn more, check out the official documentation. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4uTeBXo6dZlS" + }, + "source": [ + "#### Read the audio file\n", + "\n", + "Let's start reading the input audio sample you want to transcribe.\n", + "\n", + "In this case, it is a podcast generated with NotebookLM about the \"Attention is all you need\" [paper](https://arxiv.org/abs/1706.03762)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pjzwMWqpdldM" + }, + "outputs": [], + "source": [ + "input_audio_bytes = read_audio_file(INPUT_AUDIO_SAMPLE_FILE_URI)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "cIAl9Lyd4niN" + }, + "source": [ + "#### Prepare a short audio sample (< 1 min)\n", + "\n", + "Extract a short audio sample from the original one for streaming and real-time audio processing." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "MofmWRSH4niO" + }, + "outputs": [], + "source": [ + "short_audio_sample_bytes = extract_audio_sample(input_audio_bytes, 30)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "AC2YeY7v4niO" + }, + "outputs": [], + "source": [ + "play_audio_sample(short_audio_sample_bytes)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VPVDNRyVxquo" + }, + "source": [ + "## Improve transcription using Chirp 2's word-timing and speech adaptation features\n", + "\n", + "Chirp 2 supports word-level timestamps for each transcribed word and speech adaptation to help the model improving recognition accuracy for specific terms or proper nouns." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "oYCgDay2hAgB" + }, + "source": [ + "### Perform real-time speech recognition with word-timing" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "F83r9aiNhAgD" + }, + "source": [ + "#### Define real-time recognition configuration with `enable_word_time_offsets` parameter.\n", + "\n", + "You define the real-time recognition configuration which allows you to set the model to use, language code of the audio and more.\n", + "\n", + "In this case, you enable word timing feature. When True, the top result includes a list of words and the start and end time offsets (timestamps) for those words." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "j0WprZ-phAgD" + }, + "outputs": [], + "source": [ + "wt_real_time_config = cloud_speech.RecognitionConfig(\n", + " auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),\n", + " language_codes=[\"en-US\"],\n", + " model=\"chirp_2\",\n", + " features=cloud_speech.RecognitionFeatures(\n", + " enable_word_time_offsets=True,\n", + " enable_automatic_punctuation=True,\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "r2TqksAqhAgD" + }, + "source": [ + "#### Define the real-time request configuration\n", + "\n", + "Next, you define the real-time request passing the configuration and the audio sample you want to transcribe.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nh55mSzXhAgD" + }, + "outputs": [], + "source": [ + "wt_real_time_request = cloud_speech.RecognizeRequest(\n", + " config=wt_real_time_config, content=short_audio_sample_bytes, recognizer=RECOGNIZER\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "817YXVBli0aY" + }, + "source": [ + "#### Run the real-time recognition request\n", + "\n", + "Finally you submit the real-time recognition request." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rc0cBrVsi7UG" + }, + "outputs": [], + "source": [ + "wt_response = client.recognize(request=wt_real_time_request)\n", + "wt_real_time_recognize_results = parse_real_time_recognize_response(wt_response)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "J2vpMSv7CZ_2" + }, + "source": [ + "And you use a helper function to visualize transcriptions and the associated streams." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ezH51rLH4CBR" + }, + "outputs": [], + "source": [ + "for transcription, _ in wt_real_time_recognize_results:\n", + " print_transcription(short_audio_sample_bytes, transcription)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "iFhUcPcO-Zeh" + }, + "source": [ + "#### Visualize word timings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mhH42sab-3Tg" + }, + "outputs": [], + "source": [ + "n = 10\n", + "word_timings = parse_words_real_time_recognize_response(wt_response)\n", + "for word_info in word_timings[:n]:\n", + " print(\n", + " f\"Word: {word_info['word']} - Start: {word_info['start']} sec - End: {word_info['end']} sec\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IFOq3SK0qOT_" + }, + "source": [ + "### Improve real-time speech recognition accuracy with model adaptation\n", + "\n", + "So far, Chirp 2 transcribes the podcast correctly. That's in part because podcasts are recorded in ideal enviroments like a recording studio. But that's not always the case. For example, suppose that your audio data is recorded in noisy environment or the recording has strong accents or someone speaks quickly.\n", + "\n", + "To handle this and many other scenarios and improve real-time speech recognition accuracy, you can use model adaptation. To enable model adaptation with Chirp 2, you use the `adaptation` parameter.\n", + "\n", + "With `adaptation` parameter, you provide \"hints\" to the speech recognizer to favor specific words and phrases (`AdaptationPhraseSet` class) in the results. 
And for each hint you can define a hint boost which is the probability that a specific word or phrase will be recognized over other similar sounding phrases. Be careful to use higher boost. Higher the boost, higher is the chance of false positive recognition as well. We recommend using a binary search approach to finding the optimal value for your use case as well as adding phrases both with and without boost to your requests.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1--AAmsYq-vG" + }, + "source": [ + "#### Define real-time recognition configuration with `adaptation` parameter\n", + "\n", + "You define a new real-time recognition configuration which includes the `adaptation` configuration.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qUr76NRcq-vH" + }, + "outputs": [], + "source": [ + "adaptation_real_time_config = cloud_speech.RecognitionConfig(\n", + " auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),\n", + " language_codes=[\"en-US\"],\n", + " model=\"chirp_2\",\n", + " features=cloud_speech.RecognitionFeatures(\n", + " enable_automatic_punctuation=True,\n", + " ),\n", + " adaptation=cloud_speech.SpeechAdaptation(\n", + " phrase_sets=[\n", + " cloud_speech.SpeechAdaptation.AdaptationPhraseSet(\n", + " inline_phrase_set=cloud_speech.PhraseSet(\n", + " phrases=[\n", + " {\n", + " \"value\": \"you know\", # often mumbled or spoken quickly\n", + " \"boost\": 10.0,\n", + " },\n", + " {\n", + " \"value\": \"what are they called again?\" # hesitations and changes in intonation\n", + " },\n", + " {\n", + " \"value\": \"Yeah, it's wild.\" # short interjections have brevity and the emotional inflection\n", + " },\n", + " ]\n", + " )\n", + " )\n", + " ]\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2Lm8e-2Cq-vH" + }, + "source": [ + "#### Define the real-time request configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pH9ZxWFIq-vH" + }, + "outputs": [], + "source": [ + "adaptation_real_time_request = cloud_speech.RecognizeRequest(\n", + " config=adaptation_real_time_config,\n", + " content=short_audio_sample_bytes,\n", + " recognizer=RECOGNIZER,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xaQSQMZZq-vH" + }, + "source": [ + "#### Run the real-time recognition request" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KYiCZjDWq-vH" + }, + "outputs": [], + "source": [ + "adapted_response = client.recognize(request=adaptation_real_time_request)\n", + "adapted_real_time_recognize_results = parse_real_time_recognize_response(\n", + " adapted_response\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "xOjvJHHmq-vH" + }, + "source": [ + "And you use a helper function to visualize transcriptions and the associated streams." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7Xipn8H4q-vH" + }, + "outputs": [], + "source": [ + "for transcription, _ in adapted_real_time_recognize_results:\n", + " print_transcription(short_audio_sample_bytes, transcription)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "s0sIpQmJr40s" + }, + "source": [ + "## Transcript and translate using language-agnostic transcription and language translation\n", + "\n", + "Chirp 2 supports language-agnostic audio transcription and language translation. 
This means that Chirp 2 is capable of recognizing the language of the input audio and, at the same time, translate the outcome transcription in many different language.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "5QpacKEDRStJ" + }, + "source": [ + "#### Define real-time recognition configuration with `language_code` and `translationConfig` parameters.\n", + "\n", + "You define a real-time recognition configuration by setting language codes in both `language_codes` and `translationConfig` parameters :\n", + "\n", + "* When `language_codes=[\"auto\"]`, you enable language-agnostic transcription to auto to detect language.\n", + "\n", + "* When `target_language=language_code` where `language_code` is one of the language in this list but different from the original language, you enable language translation." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9vaW49XqUD2v" + }, + "outputs": [], + "source": [ + "target_language_code = \"ca-ES\" # @param {type:\"string\", isTemplate: true}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y3Z_vaKpRStK" + }, + "outputs": [], + "source": [ + "ts_real_time_config = cloud_speech.RecognitionConfig(\n", + " auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),\n", + " language_codes=[\"en-US\"],\n", + " translation_config=cloud_speech.TranslationConfig(\n", + " target_language=target_language_code\n", + " ),\n", + " model=\"chirp_2\",\n", + " features=cloud_speech.RecognitionFeatures(\n", + " enable_automatic_punctuation=True,\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nPGCDLWARStK" + }, + "source": [ + "#### Define the real-time request configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5_spCiHDRStK" + }, + "outputs": [], + "source": [ + "ts_real_time_request = cloud_speech.RecognizeRequest(\n", + " config=ts_real_time_config, content=short_audio_sample_bytes, recognizer=RECOGNIZER\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Mzr69sLjRStK" + }, + "source": [ + "#### Run the real-time recognition request" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "NvcPOWLkRStK" + }, + "outputs": [], + "source": [ + "ts_response = client.recognize(request=ts_real_time_request)\n", + "ts_real_time_recognize_results = parse_real_time_recognize_response(ts_response)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "STjng1ZiRStK" + }, + "source": [ + "And you use a helper function to visualize transcriptions and the associated streams." 
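Since `print_transcription` iterates over `(transcript, offset)` pairs, one way to display the translated output is to pass the parsed results list directly; a minimal sketch using the `ts_real_time_recognize_results` list produced above:

```python
# Sketch: display the translated transcription parsed from ts_response.
# ts_real_time_recognize_results is a list of (transcript, result_end_offset)
# tuples, which matches the loop inside print_transcription.
print_transcription(
    short_audio_sample_bytes, ts_real_time_recognize_results, play_audio=False
)
```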
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "EhTgg3qwRStL" + }, + "outputs": [], + "source": [ + "print_transcription(short_audio_sample_bytes, transcription, play_audio=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_MkA144iQrAn" + }, + "source": [ + "## Chirp 2 playground\n", + "\n", + "To play with Chirp 2, you can create a simple Gradio application where you enable several Chirp 2 features.\n", + "\n", + "Below you have an example for language-agnostic transcription and language translation with Chirp 2.\n", + "\n", + "To know more, check out the official documentation [here](https://cloud.google.com/speech-to-text/v2/docs/chirp_2-model).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WjuuZHTbQwkF" + }, + "outputs": [], + "source": [ + "def transcribe_audio(audio, enable_translation, target_language_code):\n", + " \"\"\"Transcribe the given audio file with optional features.\"\"\"\n", + "\n", + " # Set variables\n", + " project_id = os.environ.get(\"GOOGLE_CLOUD_PROJECT\", PROJECT_ID)\n", + " location = os.environ.get(\"GOOGLE_CLOUD_REGION\", LOCATION)\n", + " api_endpoint = f\"{location}-speech.googleapis.com\"\n", + "\n", + " # initiate client\n", + " client = SpeechClient(\n", + " client_options=ClientOptions(\n", + " api_endpoint=api_endpoint,\n", + " )\n", + " )\n", + "\n", + " # read the audio\n", + " with open(audio, \"rb\") as audio_file:\n", + " content = audio_file.read()\n", + "\n", + " # define language agnostic real time recognition configuration\n", + " real_time_config = cloud_speech.RecognitionConfig(\n", + " model=\"chirp_2\",\n", + " language_codes=[\"auto\"],\n", + " features=cloud_speech.RecognitionFeatures(\n", + " enable_automatic_punctuation=True,\n", + " ),\n", + " auto_decoding_config=cloud_speech.AutoDetectDecodingConfig(),\n", + " )\n", + "\n", + " if enable_translation:\n", + " real_time_config.language_codes = [\"en-US\"]\n", + " real_time_config.translation_config = cloud_speech.TranslationConfig(\n", + " target_language=target_language_code\n", + " )\n", + "\n", + " # define real-time recognition request\n", + " recognizer = client.recognizer_path(project_id, location, \"_\")\n", + "\n", + " real_time_request = cloud_speech.RecognizeRequest(\n", + " config=real_time_config,\n", + " content=content,\n", + " recognizer=recognizer,\n", + " )\n", + "\n", + " response = client.recognize(request=real_time_request)\n", + "\n", + " full_transcript = \"\"\n", + " for result in response.results:\n", + " full_transcript += result.alternatives[0].transcript + \" \"\n", + " return full_transcript.strip()\n", + "\n", + "\n", + "def speech_to_text(audio, enable_translation=False, target_language_code=None):\n", + " if audio is None:\n", + " return \"\"\n", + "\n", + " text = transcribe_audio(audio, enable_translation, target_language_code)\n", + " return text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "HQzUnSkErDTN" + }, + "outputs": [], + "source": [ + "# Create Gradio interface\n", + "demo = gr.Interface(\n", + " fn=speech_to_text,\n", + " inputs=[\n", + " gr.Audio(type=\"filepath\", label=\"Audio input\"),\n", + " gr.Checkbox(label=\"🧠 Enable Translation\"),\n", + " gr.Dropdown(\n", + " label=\"Select language to translate\",\n", + " choices=[\"ca-ES\", \"cy-GB\", \"de-DE\", \"ja-JP\", \"zh-Hans-CN\"],\n", + " interactive=True,\n", + " multiselect=False,\n", + " ),\n", + " ],\n", + " 
outputs=[gr.Textbox(label=\"📄 Transcription\")],\n", + " title=\"Chirp 2 Playground\",\n", + " description=\"

Speak or upload an audio file and get the transcription!

\",\n", + ")\n", + "\n", + "# Launch the app\n", + "demo.launch()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "XqTpn06QrEiZ" + }, + "outputs": [], + "source": [ + "demo.close()" + ] + } + ], + "metadata": { + "colab": { + "name": "get_started_with_chirp_2_sdk_features.ipynb", + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 }