Add genai notebook for whisper scenario #2406

Merged
merged 3 commits on Oct 1, 2024
1 change: 1 addition & 0 deletions .ci/spellcheck/.pyspelling.wordlist.txt
@@ -976,6 +976,7 @@ wikitext
WIKITQ
Wofk
WTQ
WhisperPipeline
wuerstchen
WuerstchenDiffNeXt
Würstchen
28 changes: 28 additions & 0 deletions notebooks/whisper-asr-genai/README.md
@@ -0,0 +1,28 @@
# Automatic speech recognition using Whisper and OpenVINO with Generate API

[Whisper](https://openai.com/index/whisper/) is an automatic speech recognition (ASR) system trained on 680,000 hours of multilingual and multitask supervised data collected from the web.

In this tutorial, we consider how to run Whisper using OpenVINO with the Generate API. We will use the pre-trained model from the [Hugging Face Transformers](https://github.com/huggingface/transformers) library. The [Hugging Face Optimum Intel](https://huggingface.co/docs/optimum/intel/index) library converts the model to OpenVINO™ IR format. To simplify the user experience, we will use the [OpenVINO Generate API](https://github.com/openvinotoolkit/openvino.genai) for [Whisper automatic speech recognition scenarios](https://github.com/openvinotoolkit/openvino.genai/blob/master/samples/python/whisper_speech_recognition/README.md).
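
As a rough sketch of this flow (the model id `openai/whisper-base`, the output folder `whisper-base`, and the CPU device are illustrative choices, not values fixed by this notebook), the model can be exported with Optimum Intel and then transcribed through `openvino_genai.WhisperPipeline`:

```python
# Export the model to OpenVINO IR once, e.g. from a shell:
#   optimum-cli export openvino --model openai/whisper-base whisper-base
import requests
from transformers.pipelines.audio_utils import ffmpeg_read
import openvino_genai

# Load the exported IR folder with the Generate API Whisper pipeline
pipe = openvino_genai.WhisperPipeline("whisper-base", "CPU")

# Whisper expects 16 kHz mono audio as a float waveform
audio_bytes = requests.get(
    "https://huggingface.co/spaces/distil-whisper/whisper-vs-distil-whisper/resolve/main/assets/example_1.wav"
).content
raw_speech = ffmpeg_read(audio_bytes, 16000)

print(pipe.generate(raw_speech))
```

The same `WhisperPipeline` object is what the Gradio helper in this notebook wraps.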

## Notebook Contents

This notebook demonstrates how to perform automatic speech recognition (ASR) using the Whisper model and OpenVINO.

The tutorial consists of the following steps:
1. Download the PyTorch model.
2. Run PyTorch model inference.
3. Convert the model using OpenVINO Integration with Hugging Face Optimum.
4. Run the model using the Generate API.
5. Compare the performance of the PyTorch and OpenVINO models (a minimal timing sketch follows this list).
6. Quantize the OpenVINO model with NNCF.
7. Check the quantized model result on the demo video.
8. Compare the model size, performance, and accuracy of the original and quantized models.
9. Launch an interactive demo for speech recognition.
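
For step 5, a minimal timing sketch under stated assumptions: `pt_pipeline` (a `transformers` ASR pipeline) and `ov_pipe` (an `openvino_genai.WhisperPipeline`) are assumed to have been created earlier in the notebook, and the helper below is purely illustrative:

```python
import time

def mean_generation_time(run, raw_speech, n_runs=3):
    # Average wall-clock time over several runs to smooth out warm-up effects
    times = []
    for _ in range(n_runs):
        start = time.perf_counter()
        run(raw_speech)
        times.append(time.perf_counter() - start)
    return sum(times) / len(times)

# pt_time = mean_generation_time(lambda audio: pt_pipeline(audio), raw_speech)
# ov_time = mean_generation_time(lambda audio: ov_pipe.generate(audio), raw_speech)
# print(f"PyTorch: {pt_time:.2f} s, OpenVINO: {ov_time:.2f} s")
```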


## Installation Instructions

This is a self-contained example that relies solely on its code.<br/>
We recommend running the notebook in a virtual environment. You only need a Jupyter server to start.
For details, please refer to [Installation Guide](../../README.md).
<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=5b5a4db0-7875-4bfb-bdbd-01698b5b1a77&file=notebooks/whisper-asr-genai/README.md" />
118 changes: 118 additions & 0 deletions notebooks/whisper-asr-genai/gradio_helper.py
@@ -0,0 +1,118 @@
import time
from pathlib import Path

import gradio as gr
import requests
from transformers.pipelines.audio_utils import ffmpeg_read

audio_en_example_path = Path("en_example.wav")
audio_ml_example_path = Path("ml_example.wav")

if not audio_en_example_path.exists():
r = requests.get("https://huggingface.co/spaces/distil-whisper/whisper-vs-distil-whisper/resolve/main/assets/example_1.wav")
with open(audio_en_example_path, "wb") as f:
f.write(r.content)


if not audio_ml_example_path.exists():
r = requests.get("https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/jeanNL.wav")
with open(audio_ml_example_path, "wb") as f:
f.write(r.content)


MAX_AUDIO_MINS = 30 # maximum audio input in minutes


class GradioPipeline:
def __init__(self, ov_pipe, multilingual=False, quantized=False) -> None:
self.pipe = ov_pipe
self.multilingual = multilingual
self.quantized = quantized

    def forward(self, inputs, task="transcribe", language="auto"):
        # `task` may come from the default or from a Gradio button label ("Transcribe"/"Translate"),
        # so normalize it before comparing
        task = (task or "transcribe").lower()
        generate_kwargs = {}
        if not self.multilingual and task != "transcribe":
            raise gr.Error("The model only supports English. The task 'translate' cannot be applied.")
        elif task == "translate":
            generate_kwargs = {"task": "translate"}
        if language and language != "auto":
            generate_kwargs["language"] = language

if inputs is None:
raise gr.Error("No audio file submitted! Please record or upload an audio file before submitting your request.")

with open(inputs, "rb") as f:
inputs = f.read()

inputs = ffmpeg_read(inputs, 16000)
audio_length_mins = len(inputs) / 16000 / 60

if audio_length_mins > MAX_AUDIO_MINS:
raise gr.Error(
f"To ensure fair usage of the Space, the maximum audio length permitted is {MAX_AUDIO_MINS} minutes."
f"Got an audio of length {round(audio_length_mins, 3)} minutes."
)

start_time = time.time()
ov_text = self.pipe.generate(inputs.copy(), **generate_kwargs)
ov_time = time.time() - start_time
ov_time = round(ov_time, 2)

return ov_text, ov_time


def make_demo(gr_pipeline):
examples = [[str(audio_en_example_path), ""]]
if gr_pipeline.multilingual:
examples.append([str(audio_ml_example_path), "<|fr|>"])

with gr.Blocks() as demo:
gr.HTML(
f"""
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
<div
style="
display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;
"
>
<h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
OpenVINO Generate API Whisper demo {'with quantized model.' if gr_pipeline.quantized else ''}
</h1>
</div>
</div>
"""
)
audio = gr.components.Audio(type="filepath", label="Audio input")
language = gr.components.Textbox(
label="Language.",
info="List of avalible language you can find in generation_config.lang_to_id dictionary. Example: <|en|>. 'auto' or empty string will mean autodetection",
value="auto",
)
with gr.Row():
button_transcribe = gr.Button("Transcribe")
button_translate = gr.Button("Translate", visible=gr_pipeline.multilingual)
with gr.Row():
infer_time = gr.components.Textbox(label="OpenVINO Whisper Generation Time (s)")
with gr.Row():
result = gr.components.Textbox(label="OpenVINO Whisper Result", show_copy_button=True)
button_transcribe.click(
fn=gr_pipeline.forward,
inputs=[audio, button_transcribe, language],
outputs=[result, infer_time],
)
button_translate.click(
fn=gr_pipeline.forward,
inputs=[audio, button_translate, language],
outputs=[result, infer_time],
)
gr.Markdown("## Examples")
gr.Examples(
examples,
inputs=[audio, language],
outputs=[result, infer_time],
fn=gr_pipeline.forward,
cache_examples=False,
)

return demo
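
A hedged usage sketch for this helper, assuming the notebook has already exported the model (the folder name, device, and `multilingual` flag below are illustrative):

```python
import openvino_genai
from gradio_helper import GradioPipeline, make_demo

# Paths and device are illustrative; use the folder produced by the export step
ov_pipe = openvino_genai.WhisperPipeline("whisper-base", "CPU")
gr_pipeline = GradioPipeline(ov_pipe, multilingual=True)

demo = make_demo(gr_pipeline)
demo.launch()
```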