diff --git a/notebooks/whisper-asr-genai/whisper-asr-genai.ipynb b/notebooks/whisper-asr-genai/whisper-asr-genai.ipynb index f83573f4bcc..51c7ab47891 100644 --- a/notebooks/whisper-asr-genai/whisper-asr-genai.ipynb +++ b/notebooks/whisper-asr-genai/whisper-asr-genai.ipynb @@ -942,8 +942,7 @@ "+model = OVModelForSpeechSeq2Seq.from_pretrained(model_id, export=True)\n", "```\n", "\n", - "Like the original PyTorch model, the OpenVINO model is also compatible with HuggingFace [pipeline](https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline) interface for `automatic-speech-recognition`. \n", - "Pipeline can be used for long audio transcription. Distil-Whisper uses a chunked algorithm to transcribe long-form audio files. In practice, this chunked long-form algorithm is 9x faster than the sequential algorithm proposed by OpenAI in the Whisper paper. To enable chunking, pass the chunk_length_s parameter to the pipeline. For Distil-Whisper, a chunk length of 15 seconds is optimal. To activate batching, pass the argument batch_size." + "Like the original PyTorch model, the OpenVINO model is also compatible with HuggingFace [pipeline](https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline) interface for `automatic-speech-recognition`. " ] }, { @@ -1049,14 +1048,6 @@ "from datasets import load_dataset\n", "from tqdm.notebook import tqdm\n", "\n", - "def extract_input_features(sample):\n", - " input_features = processor(\n", - " sample[\"audio\"][\"array\"],\n", - " sampling_rate=sample[\"audio\"][\"sampling_rate\"],\n", - " return_tensors=\"pt\",\n", - " ).input_features\n", - " return input_features\n", - "\n", "\n", "\n", "CALIBRATION_DATASET_SIZE = 30\n", diff --git a/notebooks/whisper-subtitles-generation/whisper-subtitles-generation.ipynb b/notebooks/whisper-subtitles-generation/whisper-subtitles-generation.ipynb index 0ed0c8ac67d..29b9223efe2 100644 --- a/notebooks/whisper-subtitles-generation/whisper-subtitles-generation.ipynb +++ b/notebooks/whisper-subtitles-generation/whisper-subtitles-generation.ipynb @@ -13,7 +13,7 @@ "\n", "You can find more information about this model in the [research paper](https://cdn.openai.com/papers/whisper.pdf), [OpenAI blog](https://openai.com/blog/whisper/), [model card](https://github.com/openai/whisper/blob/main/model-card.md) and GitHub [repository](https://github.com/openai/whisper).\n", "\n", - "In this notebook, we will use Whisper with OpenVINO to generate subtitles in a sample video. Additionally, we will use [NNCF](https://github.com/openvinotoolkit/nncf) improving model performance by INT8 quantization.\n", + "In this notebook, we will use the Whisper model with the [OpenVINO Generate API](https://github.com/openvinotoolkit/openvino.genai) for [Whisper automatic speech recognition scenarios](https://github.com/openvinotoolkit/openvino.genai/blob/master/samples/python/whisper_speech_recognition/README.md) to generate subtitles in a sample video. Additionally, we will use [NNCF](https://github.com/openvinotoolkit/nncf) to improve model performance with INT8 quantization.\n", "Notebook contains the following steps:\n", "1. Download the model.\n", "2. 
Instantiate the PyTorch model pipeline.\n", @@ -80,15 +80,24 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 14, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ - "%pip install -q \"openvino>=2024.1.0\" \"nncf>=2.10.0\"\n", + "%pip install -q \"nncf>=2.13.0\"\n", + "%pip install -q --pre -U \"openvino\" \"openvino-tokenizers\" \"openvino-genai\" --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly\n", "%pip install -q \"python-ffmpeg<=1.0.16\" moviepy \"onnx!=1.16.2\" \"git+https://github.com/huggingface/optimum-intel.git\" \"torch>=2.1\" --extra-index-url https://download.pytorch.org/whl/cpu\n", - "%pip install -q \"yt_dlp>=2024.8.6\" soundfile librosa jiwer\n", + "%pip install -q -U \"yt_dlp>=2024.8.6\" soundfile librosa jiwer\n", "%pip install -q \"gradio>=4.19\"" ] }, @@ -123,12 +132,12 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "6f1b0c699e444e7fa375cf5cc59d9a7c", + "model_id": "efb7aa798bc14370be0d6610e3266aff", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Dropdown(description='Model:', index=6, options=('openai/whisper-large-v3', 'openai/whisper-large-v2', 'openai…" + "Dropdown(description='Model:', index=7, options=('openai/whisper-large-v3-turbo', 'openai/whisper-large-v3', '…" ] }, "execution_count": 2, @@ -168,30 +177,50 @@ "### Convert model to OpenVINO Intermediate Representation (IR) format using Optimum-Intel.\n", "[back to top ⬆️](#Table-of-contents:)\n", "\n", - "The Hugging Face Optimum API is a high-level API that enables us to convert and quantize models from the Hugging Face Transformers library to the OpenVINO™ IR format. For more details, refer to the [Hugging Face Optimum documentation](https://huggingface.co/docs/optimum/intel/inference).\n", - "\n", - "Optimum Intel can be used to load optimized models from the [Hugging Face Hub](https://huggingface.co/docs/optimum/intel/hf.co/models) and create pipelines to run an inference with OpenVINO Runtime using Hugging Face APIs. The Optimum Inference models are API compatible with Hugging Face Transformers models. This means we just need to replace the `AutoModelForXxx` class with the corresponding `OVModelForXxx` class.\n", + "The listed Whisper models are available for download via the [HuggingFace hub](https://huggingface.co/openai). We will use the optimum-cli interface to export them into OpenVINO Intermediate Representation (IR) format.\n", "\n", - "Below is an example of the whisper-tiny model\n", - "\n", - "```diff\n", - "-from transformers import AutoModelForSpeechSeq2Seq\n", - "+from optimum.intel.openvino import OVModelForSpeechSeq2Seq\n", - "from transformers import AutoTokenizer, pipeline\n", + "The Optimum CLI interface for converting models supports export to OpenVINO (supported starting from optimum-intel version 1.12).\n", + "General command format:\n", "\n", - "model_id = \"openai/whisper-tiny\"\n", - "-model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)\n", - "+model = OVModelForSpeechSeq2Seq.from_pretrained(model_id, export=True)\n", + "```bash\n", + "optimum-cli export openvino --model <model_id_or_path> --task <task> <output_dir>\n", "```\n", "\n", - "Model class initialization starts with calling the `from_pretrained` method. When downloading and converting the Transformers model, the parameter `export=True` should be added. 
We can save the converted model for the next usage with the `save_pretrained` method. Alternatively, model conversion can be performed using Optimum-CLI interface. You can find more details about Optimum-Intel and Optimum CLI usage in this [tutorial](../hugging-face-hub/hugging-face-hub.ipynb). The command bellow illustrates how to convert whisper using optimum cli.\n" + "where the `--model` argument is the model id from the HuggingFace Hub or a local directory with the model (saved using the `.save_pretrained` method), and `--task <task>` is one of the [supported tasks](https://huggingface.co/docs/optimum/exporters/task_manager) that the exported model should solve. For Whisper models it will be `automatic-speech-recognition-with-past`. If model initialization requires using remote code, the `--trust-remote-code` flag should additionally be passed. The full list of supported arguments is available via `--help`. For more details and examples of usage, please check the [optimum documentation](https://huggingface.co/docs/optimum/intel/inference#export).\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Framework not specified. Using pt to export the model.\n", + "Automatic task detection to automatic-speech-recognition-with-past (possible synonyms are: speech2seq-lm-with-past).\n", + "Some non-default generation parameters are set in the model config. These should go into a GenerationConfig file (https://huggingface.co/docs/transformers/generation_strategies#save-a-custom-decoding-strategy-with-your-model) instead. This warning will be raised to an exception in v4.41.\n", + "Non-default generation parameters: {'max_length': 448, 'suppress_tokens': [1, 2, 7, 8, 9, 10, 14, 25, 26, 27, 28, 29, 31, 58, 59, 60, 61, 62, 63, 90, 91, 92, 93, 359, 503, 522, 542, 873, 893, 902, 918, 922, 931, 1350, 1853, 1982, 2460, 2627, 3246, 3253, 3268, 3536, 3846, 3961, 4183, 4667, 6585, 6647, 7273, 9061, 9383, 10428, 10929, 11938, 12033, 12331, 12562, 13793, 14157, 14635, 15265, 15618, 16553, 16604, 18362, 18956, 20075, 21675, 22520, 26130, 26161, 26435, 28279, 29464, 31650, 32302, 32470, 36865, 42863, 47425, 49870, 50254, 50258, 50358, 50359, 50360, 50361, 50362], 'begin_suppress_tokens': [220, 50257]}\n", + "Using framework PyTorch: 2.3.1+cpu\n", + "Overriding 1 configuration item(s)\n", + "\t- use_cache -> False\n", + "/home/labuser/work/notebook/genai_whisper/lib/python3.10/site-packages/transformers/models/whisper/modeling_whisper.py:1070: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if input_features.shape[-1] != expected_seq_length:\n", + "/home/labuser/work/notebook/genai_whisper/lib/python3.10/site-packages/transformers/models/whisper/modeling_whisper.py:387: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):\n", + "Using framework PyTorch: 2.3.1+cpu\n", + "Overriding 1 configuration item(s)\n", + "\t- use_cache -> True\n", + "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. 
You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.\n", + "/home/labuser/work/notebook/genai_whisper/lib/python3.10/site-packages/transformers/models/whisper/modeling_whisper.py:100: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " if sequence_length != 1:\n", + "Using framework PyTorch: 2.3.1+cpu\n", + "Overriding 1 configuration item(s)\n", + "\t- use_cache -> True\n" + ] + } + ], "source": [ "from pathlib import Path\n", "\n", @@ -214,10 +243,8 @@ "![whisper_pipeline.png](https://user-images.githubusercontent.com/29454499/204536733-1f4342f7-2328-476a-a431-cb596df69854.png)\n", "\n", "\n", - "Preprocessing and post-processing are important in this model use. `transformers.AutoProcessor` class used for initialization `WhisperProcessor` is responsible for preparing audio input data for the PyTorch model, converting it to Mel-spectrogram and decoding predicted output token_ids into string using tokenizer. Tokenizers and Processors are distributed with models also compatible with the OpenVINO model.\n", "\n", - "Like the original PyTorch model, the OpenVINO model is also compatible with HuggingFace [pipeline](https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline) interface for `automatic-speech-recognition`. \n", - "Pipeline can be used for long audio transcription. Distil-Whisper uses a chunked algorithm to transcribe long-form audio files. In practice, this chunked long-form algorithm is 9x faster than the sequential algorithm proposed by OpenAI in the Whisper paper. To enable chunking, pass the chunk_length_s parameter to the pipeline. For Distil-Whisper, a chunk length of 15 seconds is optimal. To activate batching, pass the argument batch_size." + "To simplify the user experience, we will use the [OpenVINO Generate API](https://github.com/openvinotoolkit/openvino.genai/blob/master/samples/python/whisper_speech_recognition/README.md). First, we will create a pipeline with `WhisperPipeline`. You can construct it straight away from the folder with the converted model. It will automatically load the `model`, `tokenizer`, `detokenizer` and default `generation configuration`. 
" ] }, { @@ -234,36 +261,20 @@ { "cell_type": "code", "execution_count": 4, - "metadata": { - "collapsed": false, - "jupyter": { - "outputs_hidden": false - } - }, - "outputs": [], - "source": [ - "import openvino as ov\n", - "\n", - "core = ov.Core()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a37643c49958440285805210af50b2c2", + "model_id": "321f15ce77bc4fde9426b62cb07592b3", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Dropdown(description='Device:', index=3, options=('CPU', 'GPU.0', 'GPU.1', 'AUTO'), value='AUTO')" + "Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO')" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -285,24 +296,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ - "from optimum.intel.openvino import OVModelForSpeechSeq2Seq\n", - "from transformers import AutoProcessor, pipeline\n", + "import openvino_genai\n", "\n", - "ov_model = OVModelForSpeechSeq2Seq.from_pretrained(model_dir, device=device.value)\n", - "\n", - "processor = AutoProcessor.from_pretrained(model_dir)\n", - "\n", - "pipe = pipeline(\n", - " \"automatic-speech-recognition\",\n", - " model=ov_model,\n", - " chunk_length_s=30,\n", - " tokenizer=processor.tokenizer,\n", - " feature_extractor=processor.feature_extractor,\n", - ")" + "ov_pipe = openvino_genai.WhisperPipeline(str(model_dir), device=device.value)" ] }, { @@ -318,13 +318,13 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f493dd18def948e5a50f5950b5e5c4ca", + "model_id": "0d6b12ccdf2544979b46d944c7ee3567", "version_major": 2, "version_minor": 0 }, @@ -332,7 +332,7 @@ "Text(value='https://youtu.be/kgL5LBM-hFI', description='Video:', placeholder='Type link for video')" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -353,7 +353,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -361,6 +361,19 @@ "output_type": "stream", "text": [ "Downloading video https://youtu.be/kgL5LBM-hFI started\n", + "[youtube] Extracting URL: https://youtu.be/kgL5LBM-hFI\n", + "[youtube] kgL5LBM-hFI: Downloading webpage\n", + "[youtube] kgL5LBM-hFI: Downloading ios player API JSON\n", + "[youtube] kgL5LBM-hFI: Downloading web creator player API JSON\n", + "[youtube] kgL5LBM-hFI: Downloading m3u8 information\n", + "[info] kgL5LBM-hFI: Downloading 1 format(s): 136+140\n", + "[download] Destination: downloaded_video.f136.mp4\n", + "[download] 100% of 2.91MiB in 00:00:01 at 2.43MiB/s \n", + "[download] Destination: downloaded_video.f140.m4a\n", + "[download] 100% of 477.52KiB in 00:00:00 at 2.47MiB/s \n", + "[Merger] Merging formats into \"downloaded_video.mp4\"\n", + "Deleting original file downloaded_video.f136.mp4 (pass -k to keep)\n", + "Deleting original file downloaded_video.f140.m4a (pass -k to keep)\n", "Video saved to downloaded_video.mp4\n" ] } @@ -368,15 +381,19 @@ "source": [ "from pathlib import Path\n", "import yt_dlp\n", - "\n", - "print(f\"Downloading video {link.value} started\")\n", + "import os\n", "\n", "output_file = Path(\"downloaded_video.mp4\")\n", - "ydl_ops = {\"format\": \"best[ext=mp4]\", \"outtmpl\": 
output_file.as_posix()}\n", - "with yt_dlp.YoutubeDL(ydl_ops) as ydl:\n", - " ydl.download(link.value)\n", "\n", - "print(f\"Video saved to {output_file}\")" + "if not output_file.exists():\n", + " print(f\"Downloading video {link.value} started\")\n", + " export_command = f\"yt-dlp -S vcodec:h264,res,acodec:aac -o {str(output_file)} {link.value}\"\n", + "\n", + " exit_code = os.system(export_command)\n", + " if exit_code != 0:\n", + " raise Exception(\"Failed to load video!\")\n", + "\n", + " print(f\"Video saved to {output_file}\")" ] }, { @@ -392,13 +409,13 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "692ad019edab4ffc81a9cf2808e50d36", + "model_id": "9a165a4bdab54aad8cef86dcb38f22f0", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Select(description='Select task:', index=1, options=('transcribe', 'translate'), value='translate')" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], @@ -423,7 +440,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -449,27 +466,36 @@ " input_video.audio.write_audiofile(audio_file, verbose=False, logger=None)\n", " with open(audio_file, \"rb\") as f:\n", " inputs = f.read()\n", - " audio = ffmpeg_read(inputs, pipe.feature_extractor.sampling_rate)\n", + " audio = ffmpeg_read(inputs, 16000)\n", " return {\n", " \"raw\": audio,\n", - " \"sampling_rate\": pipe.feature_extractor.sampling_rate,\n", + " \"sampling_rate\": 16000,\n", " }, duration" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's run the generation method. We will pass the input data as an `np.array`. We will also specify the `task` and `return_timestamps=True` options. If the task is `translate`, you can set the `language` option, for example `<|fr|>` for French, or it will be detected automatically. We can set up generation parameters in different ways: we can get the default config with `get_generation_config()`, set up its parameters and pass the config directly to `generate()`, or specify the needed options directly as inputs to the `generate()` method, which is the approach we will use here. 
Then we just run the `generate` method and get the output in text format.\n", + "\n", + "The `generate` method with `return_timestamps` set to `True` will return `chunks`, which contain the attributes `text`, `start_ts` and `end_ts`" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "inputs, duration = get_audio(output_file)\n", "\n", - "transcription = pipe(inputs, generate_kwargs={\"task\": task.value}, return_timestamps=True)[\"chunks\"]" + "transcription = ov_pipe.generate(inputs[\"raw\"], task=task.value, return_timestamps=True).chunks" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -501,18 +527,19 @@ " \"\"\"\n", " segment_lines = []\n", " for idx, segment in enumerate(transcription):\n", + " timestamp = [segment.start_ts, segment.end_ts]\n", " # for the case where the model could not predict an ending timestamp, which can happen if audio is cut off in the middle of a word.\n", - " if segment[\"timestamp\"][1] is None:\n", - " segment[\"timestamp\"] = (segment[\"timestamp\"][0], filter_duration)\n", + " if segment.end_ts is None:\n", + " timestamp[1] = filter_duration\n", "\n", - " if filter_duration is not None and (segment[\"timestamp\"][0] >= math.floor(filter_duration) or segment[\"timestamp\"][1] > math.ceil(filter_duration) + 1):\n", + " if filter_duration is not None and (timestamp[0] >= math.floor(filter_duration) or timestamp[1] > math.ceil(filter_duration) + 1):\n", " break\n", " segment_lines.append(str(idx + 1) + \"\\n\")\n", - " time_start = format_timestamp(segment[\"timestamp\"][0])\n", - " time_end = format_timestamp(segment[\"timestamp\"][1])\n", + " time_start = format_timestamp(timestamp[0])\n", + " time_end = format_timestamp(timestamp[1])\n", " time_str = f\"{time_start} --> {time_end}\\n\"\n", " segment_lines.append(time_str)\n", - " segment_lines.append(segment[\"text\"] + \"\\n\\n\")\n", + " segment_lines.append(segment.text + \"\\n\\n\")\n", " return segment_lines" ] }, @@ -526,7 +553,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -546,21 +573,21 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "8bcb6af81fc14c9a9e3e003b9a2a6e0f", + "model_id": "e0dd9fd0fc304f07a3b41bf91893e271", "version_major": 2, "version_minor": 0 }, "text/plain": [ - "Video(value=b\"\\x00\\x00\\x00\\x18ftypmp42\\x00\\x00\\x00\\x00isommp42\\x00\\x00:'moov\\x00\\x00\\x00lmvhd...\", height='800…" + "Video(value=b'\\x00\\x00\\x00 ftypisom\\x00\\x00\\x02\\x00isomiso2avc1mp41\\x00\\x00\\\\\\x9bmoov...', height='800', loop=…" ] }, - "execution_count": 14, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], @@ -571,7 +598,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": { "tags": [] }, @@ -647,13 +674,13 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a21bfe2e9278413a9d1adaf94ac7388a", + "model_id": "41add0d492c449b49ac72a773e264dcb", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Checkbox(value=True, description='Quantization')" ] }, - "execution_count": 16, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ @@ 
-678,7 +705,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -695,6 +722,52 @@ "%load_ext skip_kernel_extension" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's load converted OpenVINO model format using Optimum-Intel to easily quantize it.\n", + "\n", + "Optimum Intel can be used to load optimized models from the [Hugging Face Hub](https://huggingface.co/docs/optimum/intel/hf.co/models) or local folder to create pipelines to run an inference with OpenVINO Runtime using Hugging Face APIs. The Optimum Inference models are API compatible with Hugging Face Transformers models. This means we just need to replace the `AutoModelForXxx` class with the corresponding `OVModelForXxx` class.\n", + "\n", + "Below is an example of the whisper-tiny model\n", + "\n", + "```diff\n", + "-from transformers import AutoModelForSpeechSeq2Seq\n", + "+from optimum.intel.openvino import OVModelForSpeechSeq2Seq\n", + "from transformers import AutoTokenizer, pipeline\n", + "\n", + "model_id = \"openai/whisper-tiny\"\n", + "-model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id)\n", + "+model = OVModelForSpeechSeq2Seq.from_pretrained(model_id, export=True)\n", + "```\n", + "\n", + "Like the original PyTorch model, the OpenVINO model is also compatible with HuggingFace [pipeline](https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.AutomaticSpeechRecognitionPipeline) interface for `automatic-speech-recognition`." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Compiling the encoder to AUTO ...\n", + "Compiling the decoder to AUTO ...\n", + "Compiling the decoder to AUTO ...\n" + ] + } + ], + "source": [ + "from transformers import AutoProcessor, pipeline\n", + "from optimum.intel.openvino import OVModelForSpeechSeq2Seq\n", + "\n", + "ov_model = OVModelForSpeechSeq2Seq.from_pretrained(model_dir, device=device.value)\n", + "processor = AutoProcessor.from_pretrained(model_dir)" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -758,387 +831,27 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "148b4131e11f4363bcebfc0c78ed13df", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Collecting calibration data: 0%| | 0/50 [00:00\n" - ], - "text/plain": [] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n",
-       "
\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "f30e8952c50e4f6ca374dc6972b95fca", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "
\n",
-       "
\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:nncf:12 ignored nodes were found by name in the NNCFGraph\n", - "INFO:nncf:16 ignored nodes were found by name in the NNCFGraph\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "520c7840eb5440859d2b9ba2123049a7", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "
\n",
-       "
\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "5ddc5b2c750d4a4cbe67f0b8f7be4faf", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "
\n",
-       "
\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "69846e3229834f7992738d50a33b354b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Quantizing decoder with past\n" - ] - }, - { - "data": { - "text/html": [ - "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "
\n",
-       "
\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "4c48f03790324ee99afdb4031429a09a", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "
\n",
-       "
\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "INFO:nncf:24 ignored nodes were found by name in the NNCFGraph\n", - "INFO:nncf:24 ignored nodes were found by name in the NNCFGraph\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "03249855c07f4b83bfb4289608bca05b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "
\n",
-       "
\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "fea73d3c378a442bacb43bf1ab11b4ec", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Output()" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
\n"
-      ],
-      "text/plain": []
-     },
-     "metadata": {},
-     "output_type": "display_data"
-    },
-    {
-     "data": {
-      "text/html": [
-       "
\n",
-       "
\n" - ], - "text/plain": [ - "\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Compiling the encoder to AUTO ...\n", - "Compiling the decoder to AUTO ...\n", - "Compiling the decoder to AUTO ...\n" - ] - } - ], + "outputs": [], "source": [ - "%%skip not $to_quantize.value\n", + "# %%skip not $to_quantize.value\n", "\n", "import gc\n", "import shutil\n", "import nncf\n", + "import openvino as ov\n", "from datasets import load_dataset\n", "from tqdm.notebook import tqdm\n", "\n", - "def extract_input_features(sample):\n", - " input_features = processor(\n", - " sample[\"audio\"][\"array\"],\n", - " sampling_rate=sample[\"audio\"][\"sampling_rate\"],\n", - " return_tensors=\"pt\",\n", - " ).input_features\n", - " return input_features\n", - "\n", - "\n", "\n", - "CALIBRATION_DATASET_SIZE = 50\n", + "CALIBRATION_DATASET_SIZE = 30\n", "quantized_model_path = Path(f\"{model_dir}_quantized\")\n", "\n", "\n", "def quantize(ov_model: OVModelForSpeechSeq2Seq, calibration_dataset_size: int):\n", " if not quantized_model_path.exists():\n", - " encoder_calibration_data, decoder_calibration_data = collect_calibration_dataset(\n", - " ov_model, calibration_dataset_size\n", - " )\n", + " encoder_calibration_data, decoder_calibration_data = collect_calibration_dataset(ov_model, calibration_dataset_size)\n", " print(\"Quantizing encoder\")\n", " quantized_encoder = nncf.quantize(\n", " ov_model.encoder.model,\n", @@ -1146,7 +859,7 @@ " subset_size=len(encoder_calibration_data),\n", " model_type=nncf.ModelType.TRANSFORMER,\n", " # Smooth Quant algorithm reduces activation quantization error; optimal alpha value was obtained through grid search\n", - " advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.50)\n", + " advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.80),\n", " )\n", " ov.save_model(quantized_encoder, quantized_model_path / \"openvino_encoder_model.xml\")\n", " del quantized_encoder\n", @@ -1160,7 +873,7 @@ " subset_size=len(decoder_calibration_data),\n", " model_type=nncf.ModelType.TRANSFORMER,\n", " # Smooth Quant algorithm reduces activation quantization error; optimal alpha value was obtained through grid search\n", - " advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.96)\n", + " advanced_parameters=nncf.AdvancedQuantizationParameters(smooth_quant_alpha=0.96),\n", " )\n", " ov.save_model(quantized_decoder_with_past, quantized_model_path / \"openvino_decoder_with_past_model.xml\")\n", " del quantized_decoder_with_past\n", @@ -1173,14 +886,24 @@ " shutil.copy(model_path / \"generation_config.json\", quantized_model_path / \"generation_config.json\")\n", " shutil.copy(model_path / \"openvino_decoder_model.xml\", quantized_model_path / \"openvino_decoder_model.xml\")\n", " shutil.copy(model_path / \"openvino_decoder_model.bin\", quantized_model_path / \"openvino_decoder_model.bin\")\n", - "\n", - " quantized_ov_model = OVModelForSpeechSeq2Seq.from_pretrained(quantized_model_path, compile=False)\n", - " quantized_ov_model.to(device.value)\n", - " quantized_ov_model.compile()\n", - " return quantized_ov_model\n", - "\n", - "\n", - "ov_quantized_model = quantize(ov_model, CALIBRATION_DATASET_SIZE)" + " shutil.copy(model_path / \"openvino_tokenizer.xml\", quantized_model_path / \"openvino_tokenizer.xml\")\n", + " shutil.copy(model_path / \"openvino_tokenizer.bin\", quantized_model_path / \"openvino_tokenizer.bin\")\n", + " 
shutil.copy(model_path / \"openvino_detokenizer.xml\", quantized_model_path / \"openvino_detokenizer.xml\")\n", + " shutil.copy(model_path / \"openvino_detokenizer.bin\", quantized_model_path / \"openvino_detokenizer.bin\")\n", + " shutil.copy(model_path / \"tokenizer_config.json\", quantized_model_path / \"tokenizer_config.json\")\n", + " shutil.copy(model_path / \"tokenizer.json\", quantized_model_path / \"tokenizer.json\")\n", + " shutil.copy(model_path / \"vocab.json\", quantized_model_path / \"vocab.json\")\n", + " shutil.copy(model_path / \"preprocessor_config.json\", quantized_model_path / \"preprocessor_config.json\")\n", + " shutil.copy(model_path / \"special_tokens_map.json\", quantized_model_path / \"special_tokens_map.json\")\n", + " shutil.copy(model_path / \"normalizer.json\", quantized_model_path / \"normalizer.json\")\n", + " shutil.copy(model_path / \"merges.txt\", quantized_model_path / \"merges.txt\")\n", + " shutil.copy(model_path / \"added_tokens.json\", quantized_model_path / \"added_tokens.json\")\n", + "\n", + " quantized_ov_pipe = openvino_genai.WhisperPipeline(str(quantized_model_path), device=device.value)\n", + " return quantized_ov_pipe\n", + "\n", + "\n", + "quantized_ov_pipe = quantize(ov_model, CALIBRATION_DATASET_SIZE)" ] }, { @@ -1198,62 +921,11 @@ "cell_type": "code", "execution_count": 20, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1\n", - "00:00:00,000 --> 00:00:05,000\n", - " What's that?\n", - "\n", - "2\n", - "00:00:05,000 --> 00:00:07,000\n", - " Oh, wow.\n", - "\n", - "3\n", - "00:00:09,000 --> 00:00:11,000\n", - " Hello humans.\n", - "\n", - "4\n", - "00:00:14,000 --> 00:00:15,000\n", - " Focus on me.\n", - "\n", - "5\n", - "00:00:15,000 --> 00:00:16,000\n", - " Focus on the guard.\n", - "\n", - "6\n", - "00:00:18,000 --> 00:00:20,000\n", - " Don't tell anyone what you're seen in here.\n", - "\n", - "7\n", - "00:00:22,000 --> 00:00:24,000\n", - " Have you seen what's in there?\n", - "\n", - "8\n", - "00:00:24,000 --> 00:00:25,000\n", - " They have intel.\n", - "\n", - "9\n", - "00:00:25,000 --> 00:00:27,000\n", - " This is where it all changes.\n", - "\n", - "\n" - ] - } - ], + "outputs": [], "source": [ "if ov_quantized_model is not None:\n", - " int8_pipe = pipeline(\n", - " \"automatic-speech-recognition\",\n", - " model=ov_quantized_model,\n", - " chunk_length_s=30,\n", - " tokenizer=processor.tokenizer,\n", - " feature_extractor=processor.feature_extractor,\n", - " )\n", " inputs, duration = get_audio(output_file)\n", - " transcription = int8_pipe(inputs, generate_kwargs={\"task\": task.value}, return_timestamps=True)[\"chunks\"]\n", + " transcription = quantized_ov_pipe.generate(inputs[\"raw\"], task=task.value, return_timestamps=True).chunks\n", " srt_lines = prepare_srt(transcription, filter_duration=duration)\n", " print(\"\".join(srt_lines))\n", " widgets.Video.from_file(output_file, loop=False, width=800, height=800)" @@ -1269,20 +941,18 @@ "\n", "Finally, we compare original and quantized Whisper models from accuracy and performance stand-points.\n", "\n", - "To measure accuracy, we use `1 - WER` as a metric, where WER stands for Word Error Rate.\n", - "\n", - "When measuring inference time, we do it separately for encoder and decoder-with-past model forwards, and for the whole model inference too." + "To measure accuracy, we use `1 - WER` as a metric, where WER stands for Word Error Rate." 
] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a2822b0dcd584fd2aa28e01c607926d0", + "model_id": "cb7e92abfce646dbb131e9d6a82ba7cc", "version_major": 2, "version_minor": 0 }, @@ -1296,7 +966,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "3dc0e232e81c4de0ad82737f98f69d2a", + "model_id": "dd753b9493614269bd599870e519fac8", "version_major": 2, "version_minor": 0 }, @@ -1311,11 +981,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "Encoder performance speedup: 1.352\n", - "Decoder with past performance speedup: 1.342\n", "Whole pipeline performance speedup: 1.350\n", - "Whisper transcription word accuracy. Original model: 81.67%. Quantized model: 83.67%.\n", - "Accuracy drop: -1.99%.\n" + "Whisper transcription word accuracy. Original model: 81.77%. Quantized model: 82.97%.\n", + "Accuracy drop: -1.20%.\n" ] } ], @@ -1328,67 +996,33 @@ "\n", "\n", "TEST_DATASET_SIZE = 50\n", - "MEASURE_TIME = False\n", - "\n", - "@contextmanager\n", - "def time_measurement():\n", - " global MEASURE_TIME\n", - " try:\n", - " MEASURE_TIME = True\n", - " yield\n", - " finally:\n", - " MEASURE_TIME = False\n", - "\n", - "def time_fn(obj, fn_name, time_list):\n", - " original_fn = getattr(obj, fn_name)\n", - "\n", - " def wrapper(*args, **kwargs):\n", - " if not MEASURE_TIME:\n", - " return original_fn(*args, **kwargs)\n", - " start_time = time.perf_counter()\n", - " result = original_fn(*args, **kwargs)\n", - " end_time = time.perf_counter()\n", - " time_list.append(end_time - start_time)\n", - " return result\n", - "\n", - " setattr(obj, fn_name, wrapper)\n", "\n", "def calculate_transcription_time_and_accuracy(ov_model, test_samples):\n", - " encoder_infer_times = []\n", - " decoder_with_past_infer_times = []\n", " whole_infer_times = []\n", - " time_fn(ov_model, \"generate\", whole_infer_times)\n", - " time_fn(ov_model.encoder, \"forward\", encoder_infer_times)\n", - " time_fn(ov_model.decoder_with_past, \"forward\", decoder_with_past_infer_times)\n", "\n", " ground_truths = []\n", " predictions = []\n", " for data_item in tqdm(test_samples, desc=\"Measuring performance and accuracy\"):\n", - " input_features = extract_input_features(data_item)\n", - "\n", - " with time_measurement():\n", - " predicted_ids = ov_model.generate(input_features)\n", - " transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)\n", + " start_time = time.perf_counter()\n", + " transcription = ov_model.generate(data_item[\"audio\"][\"array\"], return_timestamps=True)\n", + " end_time = time.perf_counter()\n", + " whole_infer_times.append(end_time - start_time)\n", "\n", " ground_truths.append(data_item[\"text\"])\n", - " predictions.append(transcription[0])\n", + " predictions.append(transcription.texts[0])\n", "\n", " word_accuracy = (1 - wer(ground_truths, predictions, reference_transform=wer_standardize,\n", " hypothesis_transform=wer_standardize)) * 100\n", " mean_whole_infer_time = sum(whole_infer_times)\n", - " mean_encoder_infer_time = sum(encoder_infer_times)\n", - " mean_decoder_with_time_infer_time = sum(decoder_with_past_infer_times)\n", - " return word_accuracy, (mean_whole_infer_time, mean_encoder_infer_time, mean_decoder_with_time_infer_time)\n", + " return word_accuracy, mean_whole_infer_time\n", "\n", "test_dataset = load_dataset(\"openslr/librispeech_asr\", \"clean\", split=\"validation\", streaming=True, 
trust_remote_code=True)\n", "test_dataset = test_dataset.shuffle(seed=42).take(TEST_DATASET_SIZE)\n", "test_samples = [sample for sample in test_dataset]\n", "\n", - "accuracy_original, times_original = calculate_transcription_time_and_accuracy(ov_model, test_samples)\n", - "accuracy_quantized, times_quantized = calculate_transcription_time_and_accuracy(ov_quantized_model, test_samples)\n", - "print(f\"Encoder performance speedup: {times_original[1] / times_quantized[1]:.3f}\")\n", - "print(f\"Decoder with past performance speedup: {times_original[2] / times_quantized[2]:.3f}\")\n", - "print(f\"Whole pipeline performance speedup: {times_original[0] / times_quantized[0]:.3f}\")\n", + "accuracy_original, times_original = calculate_transcription_time_and_accuracy(ov_pipe, test_samples)\n", + "accuracy_quantized, times_quantized = calculate_transcription_time_and_accuracy(quantized_ov_pipe, test_samples)\n", + "print(f\"Whole pipeline performance speedup: {times_original / times_quantized:.3f}\")\n", "print(f\"Whisper transcription word accuracy. Original model: {accuracy_original:.2f}%. Quantized model: {accuracy_quantized:.2f}%.\")\n", "print(f\"Accuracy drop: {accuracy_original - accuracy_quantized:.2f}%.\")" ] @@ -1415,12 +1049,17 @@ "source": [ "def transcribe(url, task, use_int8):\n", " output_file = Path(\"downloaded_video.mp4\")\n", - " ydl_ops = {\"format\": \"best[ext=mp4]\", \"outtmpl\": output_file.as_posix()}\n", - " with yt_dlp.YoutubeDL(ydl_ops) as ydl:\n", - " ydl.download(link.value)\n", + " export_command = f\"yt-dlp -S vcodec:h264,res,acodec:aac -o {str(output_file)} {url}\"\n", + " exit_code = os.system(export_command)\n", + " if exit_code != 0:\n", + " raise Exception(\"Failed to load video!\")\n", + "\n", + " # ydl_ops = {\"format\": \"best[ext=mp4]\", \"outtmpl\": output_file.as_posix()}\n", + " # with yt_dlp.YoutubeDL(ydl_ops) as ydl:\n", + " # ydl.download(url)\n", " inputs, duration = get_audio(output_file)\n", - " m_pipe = int8_pipe if use_int8 else pipe\n", - " transcription = m_pipe(inputs, generate_kwargs={\"task\": task.lower()}, return_timestamps=True)[\"chunks\"]\n", + " m_pipe = quantized_ov_pipe if use_int8 else ov_pipe\n", + " transcription = m_pipe.generate(inputs[\"raw\"], task=task.lower(), return_timestamps=True).chunks\n", " srt_lines = prepare_srt(transcription, duration)\n", " with output_file.with_suffix(\".srt\").open(\"w\") as f:\n", " f.writelines(srt_lines)\n", @@ -1447,7 +1086,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1461,7 +1100,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.12" }, "openvino_notebooks": { "imageUrl": "https://user-images.githubusercontent.com/29454499/204548693-1304ef33-c790-490d-8a8b-d5766acb6254.png",