diff --git a/.ci/ignore_treon_docker.txt b/.ci/ignore_treon_docker.txt
index 98fd6e98eaf..17b16897191 100644
--- a/.ci/ignore_treon_docker.txt
+++ b/.ci/ignore_treon_docker.txt
@@ -67,3 +67,4 @@ notebooks/stable-cascade-image-generation/stable-cascade-image-generation.ipynb
 notebooks/dynamicrafter-animating-images/dynamicrafter-animating-images.ipynb
 notebooks/yolov10-optimization/yolov10-optimization.ipynb
 notebooks/whisper-subtitles-generation/whisper-subtitles-generation.ipynb
+notebooks/speechbrain-emotion-recognition/speechbrain-emotion-recognition.ipynb
diff --git a/.ci/ignore_treon_py38.txt b/.ci/ignore_treon_py38.txt
index 1c240f4268e..6749cc516c3 100644
--- a/.ci/ignore_treon_py38.txt
+++ b/.ci/ignore_treon_py38.txt
@@ -1,2 +1,3 @@
 notebooks/surya-line-level-text-detection/surya-line-level-text-detection.ipynb
-notebooks/stable-diffusion-keras-cv/stable-diffusion-keras-cv.ipynb
\ No newline at end of file
+notebooks/stable-diffusion-keras-cv/stable-diffusion-keras-cv.ipynb
+notebooks/speechbrain-emotion-recognition/speechbrain-emotion-recognition.ipynb
\ No newline at end of file
diff --git a/.ci/spellcheck/.pyspelling.wordlist.txt b/.ci/spellcheck/.pyspelling.wordlist.txt
index 9121c6978c3..9975e984567 100644
--- a/.ci/spellcheck/.pyspelling.wordlist.txt
+++ b/.ci/spellcheck/.pyspelling.wordlist.txt
@@ -86,9 +86,11 @@ CHW
 Cifar
 cityscape
 Cityscapes
+classname
 CLI
 cli
 CLIP's
+codebase
 codebook
 codebooks
 codec
@@ -127,6 +129,7 @@ CRNN
 CSV
 CTC
 CUDA
+CustomEncoderWav
 CVF
 CVPR
 Databricks
@@ -294,6 +297,7 @@ HWC
 hyperparameters
 ICIP
 ICPR
+IEMOCAP
 iGPU
 IdentityNet
 iGPUs
@@ -592,6 +596,7 @@ PTQ
 px
 py
 pyannote
+pymodule
 PyPI
 Pythia
 pytorch
@@ -710,6 +715,9 @@ sparsified
 sparsify
 spectrogram
 spectrograms
+SpeechBrain
+SpeechBrain's
+speechbrain
 splitters
 SPS
 SQA
@@ -848,6 +856,7 @@ VQVAE
 waveform
 waveforms
 Wav
+wav
 WavLM
 WebGL
 WebUI
diff --git a/notebooks/speechbrain-emotion-recognition/README.md b/notebooks/speechbrain-emotion-recognition/README.md
new file mode 100644
index 00000000000..44f78c0df0a
--- /dev/null
+++ b/notebooks/speechbrain-emotion-recognition/README.md
@@ -0,0 +1,22 @@
+# SpeechBrain Emotion Recognition with OpenVINO
+
+[SpeechBrain](https://github.com/speechbrain/speechbrain) is an open-source PyTorch toolkit that accelerates Conversational AI development, i.e., the technology behind speech assistants, chatbots, and large language models. It is crafted for fast and easy creation of advanced technologies for Speech and Text Processing.
+
+Learn more in the [GitHub repo](https://github.com/speechbrain/speechbrain) and the [paper](https://arxiv.org/pdf/2106.04624).
+
+## Notebook contents
+The tutorial consists of the following steps:
+- Installations
+- Imports
+- Prepare base model
+- Initialize model
+- PyTorch inference
+- SpeechBrain model optimization with Intel OpenVINO
+  - Step 1: Prepare input tensor
+  - Step 2: Convert model to OpenVINO IR
+  - Step 3: OpenVINO model inference
+
+## Installation instructions
+This is a self-contained example that relies solely on its own code.
+We recommend running the notebook in a virtual environment. You only need a Jupyter server to start.
+For details, please refer to the [Installation Guide](../../README.md).
\ No newline at end of file
diff --git a/notebooks/speechbrain-emotion-recognition/speechbrain-emotion-recognition.ipynb b/notebooks/speechbrain-emotion-recognition/speechbrain-emotion-recognition.ipynb
new file mode 100644
index 00000000000..9ab3b184b7c
--- /dev/null
+++ b/notebooks/speechbrain-emotion-recognition/speechbrain-emotion-recognition.ipynb
@@ -0,0 +1,466 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# SpeechBrain Emotion Recognition with OpenVINO\n",
+    "\n",
+    "<div class=\"alert alert-block alert-danger\"> 
<b>Important note: This notebook requires Python >= 3.9. Please make sure that your environment fulfills this requirement before running it.</b></div>
\n", + "\n", + "[SpeechBrain](https://github.com/speechbrain/speechbrain) is an open-source PyTorch toolkit that accelerates Conversational AI development, i.e., the technology behind speech assistants, chatbots, and large language models. \n", + "\n", + "Lear more in [GitHub repo](https://github.com/speechbrain/speechbrain) and [paper](https://arxiv.org/pdf/2106.04624)\n", + "\n", + "This notebook tutorial demonstrates optimization and inference of speechbrain emotion recognition model with OpenVINO.\n", + "\n", + "#### Table of contents:\n", + "\n", + "- [Installations](#Installations)\n", + "- [Imports](#Imports)\n", + "- [Prepare base model](#Prepare-base-model)\n", + "- [Initialize model](#Initialize-model)\n", + "- [PyTorch inference](#PyTorch-inference)\n", + "- [SpeechBrain model optimization with Intel OpenVINO](#SpeechBrain-model-optimization-with-Intel-OpenVINO)\n", + " - [Step 1: Prepare input tensor](#Step-1:-Prepare-input-tensor)\n", + " - [Step 2: Convert model to OpenVINO IR](#Step-2:-Convert-model-to-OpenVINO-IR)\n", + " - [Step 3: OpenVINO model inference](#Step-3:-OpenVINO-model-inference)\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Installations\n", + "[back to top ⬆️](#Table-of-contents:)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cpu\n", + "Requirement already satisfied: speechbrain in /home/psakhamo/.venv/lib/python3.10/site-packages (1.0.0)\n", + "Requirement already satisfied: hyperpyyaml in /home/psakhamo/.venv/lib/python3.10/site-packages (from speechbrain) (1.2.2)\n", + "Requirement already satisfied: joblib in /home/psakhamo/.venv/lib/python3.10/site-packages (from speechbrain) (1.4.2)\n", + "Requirement already satisfied: numpy in /home/psakhamo/.venv/lib/python3.10/site-packages (from speechbrain) (1.26.4)\n", + "Requirement already satisfied: packaging in /home/psakhamo/.venv/lib/python3.10/site-packages (from speechbrain) (24.0)\n", + "Requirement already satisfied: scipy in /home/psakhamo/.venv/lib/python3.10/site-packages (from speechbrain) (1.13.1)\n", + "Requirement already satisfied: sentencepiece in /home/psakhamo/.venv/lib/python3.10/site-packages (from speechbrain) (0.2.0)\n", + "Requirement already satisfied: torch>=1.9 in /home/psakhamo/.venv/lib/python3.10/site-packages (from speechbrain) (2.3.1+cpu)\n", + "Requirement already satisfied: torchaudio in /home/psakhamo/.venv/lib/python3.10/site-packages (from speechbrain) (2.3.1+cpu)\n", + "Requirement already satisfied: tqdm in /home/psakhamo/.venv/lib/python3.10/site-packages (from speechbrain) (4.66.4)\n", + "Requirement already satisfied: huggingface-hub in /home/psakhamo/.venv/lib/python3.10/site-packages (from speechbrain) (0.23.3)\n", + "Requirement already satisfied: filelock in /home/psakhamo/.venv/lib/python3.10/site-packages (from torch>=1.9->speechbrain) (3.13.1)\n", + "Requirement already satisfied: typing-extensions>=4.8.0 in /home/psakhamo/.venv/lib/python3.10/site-packages (from torch>=1.9->speechbrain) (4.9.0)\n", + "Requirement already satisfied: sympy in /home/psakhamo/.venv/lib/python3.10/site-packages (from torch>=1.9->speechbrain) (1.12)\n", + "Requirement already satisfied: networkx in /home/psakhamo/.venv/lib/python3.10/site-packages (from torch>=1.9->speechbrain) (3.2.1)\n", + "Requirement already satisfied: jinja2 in 
/home/psakhamo/.venv/lib/python3.10/site-packages (from torch>=1.9->speechbrain) (3.1.3)\n", + "Requirement already satisfied: fsspec in /home/psakhamo/.venv/lib/python3.10/site-packages (from torch>=1.9->speechbrain) (2024.2.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /home/psakhamo/.venv/lib/python3.10/site-packages (from huggingface-hub->speechbrain) (6.0.1)\n", + "Requirement already satisfied: requests in /home/psakhamo/.venv/lib/python3.10/site-packages (from huggingface-hub->speechbrain) (2.32.3)\n", + "Requirement already satisfied: ruamel.yaml>=0.17.28 in /home/psakhamo/.venv/lib/python3.10/site-packages (from hyperpyyaml->speechbrain) (0.18.6)\n", + "Requirement already satisfied: ruamel.yaml.clib>=0.2.7 in /home/psakhamo/.venv/lib/python3.10/site-packages (from ruamel.yaml>=0.17.28->hyperpyyaml->speechbrain) (0.2.8)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /home/psakhamo/.venv/lib/python3.10/site-packages (from jinja2->torch>=1.9->speechbrain) (2.1.5)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /home/psakhamo/.venv/lib/python3.10/site-packages (from requests->huggingface-hub->speechbrain) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /home/psakhamo/.venv/lib/python3.10/site-packages (from requests->huggingface-hub->speechbrain) (3.7)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/psakhamo/.venv/lib/python3.10/site-packages (from requests->huggingface-hub->speechbrain) (2.2.1)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /home/psakhamo/.venv/lib/python3.10/site-packages (from requests->huggingface-hub->speechbrain) (2024.6.2)\n", + "Requirement already satisfied: mpmath>=0.19 in /home/psakhamo/.venv/lib/python3.10/site-packages (from sympy->torch>=1.9->speechbrain) (1.3.0)\n", + "Note: you may need to restart the kernel to use updated packages.\n", + "Looking in indexes: https://download.pytorch.org/whl/cpu\n", + "Collecting torch\n", + " Using cached https://download.pytorch.org/whl/cpu/torch-2.3.1%2Bcpu-cp310-cp310-linux_x86_64.whl (190.4 MB)\n", + "Collecting torchaudio\n", + " Using cached https://download.pytorch.org/whl/cpu/torchaudio-2.3.1%2Bcpu-cp310-cp310-linux_x86_64.whl (1.7 MB)\n", + "Collecting filelock (from torch)\n", + " Using cached https://download.pytorch.org/whl/filelock-3.13.1-py3-none-any.whl (11 kB)\n", + "Collecting typing-extensions>=4.8.0 (from torch)\n", + " Using cached https://download.pytorch.org/whl/typing_extensions-4.9.0-py3-none-any.whl (32 kB)\n", + "Collecting sympy (from torch)\n", + " Using cached https://download.pytorch.org/whl/sympy-1.12-py3-none-any.whl (5.7 MB)\n", + "Collecting networkx (from torch)\n", + " Using cached https://download.pytorch.org/whl/networkx-3.2.1-py3-none-any.whl (1.6 MB)\n", + "Collecting jinja2 (from torch)\n", + " Using cached https://download.pytorch.org/whl/Jinja2-3.1.3-py3-none-any.whl (133 kB)\n", + "Collecting fsspec (from torch)\n", + " Using cached https://download.pytorch.org/whl/fsspec-2024.2.0-py3-none-any.whl (170 kB)\n", + "Collecting MarkupSafe>=2.0 (from jinja2->torch)\n", + " Using cached https://download.pytorch.org/whl/MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (25 kB)\n", + "Collecting mpmath>=0.19 (from sympy->torch)\n", + " Using cached https://download.pytorch.org/whl/mpmath-1.3.0-py3-none-any.whl (536 kB)\n", + "Installing collected packages: mpmath, typing-extensions, sympy, networkx, MarkupSafe, fsspec, filelock, jinja2, torch, torchaudio\n", + " Attempting 
uninstall: mpmath\n", + " Found existing installation: mpmath 1.3.0\n", + " Uninstalling mpmath-1.3.0:\n", + " Successfully uninstalled mpmath-1.3.0\n", + " Attempting uninstall: typing-extensions\n", + " Found existing installation: typing_extensions 4.9.0\n", + " Uninstalling typing_extensions-4.9.0:\n", + " Successfully uninstalled typing_extensions-4.9.0\n", + " Attempting uninstall: sympy\n", + " Found existing installation: sympy 1.12\n", + " Uninstalling sympy-1.12:\n", + " Successfully uninstalled sympy-1.12\n", + " Attempting uninstall: networkx\n", + " Found existing installation: networkx 3.2.1\n", + " Uninstalling networkx-3.2.1:\n", + " Successfully uninstalled networkx-3.2.1\n", + " Attempting uninstall: MarkupSafe\n", + " Found existing installation: MarkupSafe 2.1.5\n", + " Uninstalling MarkupSafe-2.1.5:\n", + " Successfully uninstalled MarkupSafe-2.1.5\n", + " Attempting uninstall: fsspec\n", + " Found existing installation: fsspec 2024.2.0\n", + " Uninstalling fsspec-2024.2.0:\n", + " Successfully uninstalled fsspec-2024.2.0\n", + " Attempting uninstall: filelock\n", + " Found existing installation: filelock 3.13.1\n", + " Uninstalling filelock-3.13.1:\n", + " Successfully uninstalled filelock-3.13.1\n", + " Attempting uninstall: jinja2\n", + " Found existing installation: Jinja2 3.1.3\n", + " Uninstalling Jinja2-3.1.3:\n", + " Successfully uninstalled Jinja2-3.1.3\n", + " Attempting uninstall: torch\n", + " Found existing installation: torch 2.3.1+cpu\n", + " Uninstalling torch-2.3.1+cpu:\n", + " Successfully uninstalled torch-2.3.1+cpu\n", + " Attempting uninstall: torchaudio\n", + " Found existing installation: torchaudio 2.3.1+cpu\n", + " Uninstalling torchaudio-2.3.1+cpu:\n", + " Successfully uninstalled torchaudio-2.3.1+cpu\n", + "Successfully installed MarkupSafe-2.1.5 filelock-3.13.1 fsspec-2024.2.0 jinja2-3.1.3 mpmath-1.3.0 networkx-3.2.1 sympy-1.12 torch-2.3.1+cpu torchaudio-2.3.1+cpu typing-extensions-4.9.0\n", + "Note: you may need to restart the kernel to use updated packages.\n", + "Requirement already satisfied: transformers>=4.30.0 in /home/psakhamo/.venv/lib/python3.10/site-packages (4.41.2)\n", + "Requirement already satisfied: huggingface_hub>=0.8.0 in /home/psakhamo/.venv/lib/python3.10/site-packages (0.23.3)\n", + "Requirement already satisfied: SoundFile in /home/psakhamo/.venv/lib/python3.10/site-packages (0.12.1)\n", + "Requirement already satisfied: filelock in /home/psakhamo/.venv/lib/python3.10/site-packages (from transformers>=4.30.0) (3.13.1)\n", + "Requirement already satisfied: numpy>=1.17 in /home/psakhamo/.venv/lib/python3.10/site-packages (from transformers>=4.30.0) (1.26.4)\n", + "Requirement already satisfied: packaging>=20.0 in /home/psakhamo/.venv/lib/python3.10/site-packages (from transformers>=4.30.0) (24.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /home/psakhamo/.venv/lib/python3.10/site-packages (from transformers>=4.30.0) (6.0.1)\n", + "Requirement already satisfied: regex!=2019.12.17 in /home/psakhamo/.venv/lib/python3.10/site-packages (from transformers>=4.30.0) (2024.5.15)\n", + "Requirement already satisfied: requests in /home/psakhamo/.venv/lib/python3.10/site-packages (from transformers>=4.30.0) (2.32.3)\n", + "Requirement already satisfied: tokenizers<0.20,>=0.19 in /home/psakhamo/.venv/lib/python3.10/site-packages (from transformers>=4.30.0) (0.19.1)\n", + "Requirement already satisfied: safetensors>=0.4.1 in /home/psakhamo/.venv/lib/python3.10/site-packages (from transformers>=4.30.0) (0.4.3)\n", + 
"Requirement already satisfied: tqdm>=4.27 in /home/psakhamo/.venv/lib/python3.10/site-packages (from transformers>=4.30.0) (4.66.4)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in /home/psakhamo/.venv/lib/python3.10/site-packages (from huggingface_hub>=0.8.0) (2024.2.0)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /home/psakhamo/.venv/lib/python3.10/site-packages (from huggingface_hub>=0.8.0) (4.9.0)\n", + "Requirement already satisfied: cffi>=1.0 in /home/psakhamo/.venv/lib/python3.10/site-packages (from SoundFile) (1.16.0)\n", + "Requirement already satisfied: pycparser in /home/psakhamo/.venv/lib/python3.10/site-packages (from cffi>=1.0->SoundFile) (2.22)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /home/psakhamo/.venv/lib/python3.10/site-packages (from requests->transformers>=4.30.0) (3.3.2)\n", + "Requirement already satisfied: idna<4,>=2.5 in /home/psakhamo/.venv/lib/python3.10/site-packages (from requests->transformers>=4.30.0) (3.7)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in /home/psakhamo/.venv/lib/python3.10/site-packages (from requests->transformers>=4.30.0) (2.2.1)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /home/psakhamo/.venv/lib/python3.10/site-packages (from requests->transformers>=4.30.0) (2024.6.2)\n", + "Note: you may need to restart the kernel to use updated packages.\n", + "Note: you may need to restart the kernel to use updated packages.\n" + ] + } + ], + "source": [ + "%pip install speechbrain --extra-index-url https://download.pytorch.org/whl/cpu\n", + "%pip install --upgrade --force-reinstall torch torchaudio --index-url https://download.pytorch.org/whl/cpu\n", + "%pip install \"transformers>=4.30.0\" \"huggingface_hub>=0.8.0\" \"SoundFile\"\n", + "%pip install -q \"openvino>=2024.1.0\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Imports\n", + "[back to top ⬆️](#Table-of-contents:)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "torchvision is not available - cannot save figures\n" + ] + } + ], + "source": [ + "import torch\n", + "import torchaudio\n", + "from speechbrain.inference.interfaces import foreign_class\n", + "\n", + "import openvino as ov" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare base model\n", + "[back to top ⬆️](#Table-of-contents:)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The foreign_class function in SpeechBrain is a utility that allows you to load and use custom PyTorch models within the SpeechBrain ecosystem. It provides a convenient way to integrate external or custom-built models into SpeechBrain's inference pipeline without modifying the core SpeechBrain codebase.\n", + "\n", + "1. source: This argument specifies the source or location of the pre-trained model checkpoint. In this case, \"speechbrain/emotion-recognition-wav2vec2-IEMOCAP\" refers to a pre-trained model checkpoint available on the Hugging Face Hub.\n", + "2. pymodule_file: This argument is the path to a Python file containing the definition of your custom PyTorch model class. In this example, \"custom_interface.py\" is the name of the Python file that defines the CustomEncoderWav2vec2Classifier class.\n", + "3. classname: This argument specifies the name of the custom PyTorch model class defined in the pymodule_file. 
In this case, \"CustomEncoderWav2vec2Classifier\" is the name of the class that extends SpeechBrain's Pretrained class and implements the necessary methods for inference." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/psakhamo/.venv/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n", + "/home/psakhamo/.venv/lib/python3.10/site-packages/transformers/configuration_utils.py:364: UserWarning: Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 Transformers. Using `model.gradient_checkpointing_enable()` instead, or if you are using the `Trainer` API, pass `gradient_checkpointing=True` in your `TrainingArguments`.\n", + " warnings.warn(\n", + "Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-base and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1']\n", + "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n", + "speechbrain.lobes.models.huggingface_transformers.huggingface - Wav2Vec2Model is frozen.\n" + ] + } + ], + "source": [ + "classifier = foreign_class(\n", + " source=\"speechbrain/emotion-recognition-wav2vec2-IEMOCAP\", pymodule_file=\"custom_interface.py\", classname=\"CustomEncoderWav2vec2Classifier\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Initialize model\n", + "[back to top ⬆️](#Table-of-contents:)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "# wav2vec2 torch model\n", + "torch_model = classifier.mods[\"wav2vec2\"].model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### PyTorch inference \n", + "[back to top ⬆️](#Table-of-contents:)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Perform emotion recognition on the sample audio file.\n", + "\n", + "1. out_prob: Tensor or list containing the predicted probabilities or log probabilities for each emotion class.\n", + "2. score: Scalar value representing the predicted probability or log probability of the most likely emotion class.\n", + "3. index: Integer value representing the index of the most likely emotion class in the out_prob tensor or list.\n", + "4. text_lab: String or list of strings containing the textual labels corresponding to the predicted emotion classes ([\"anger\", \"happiness\", \"sadness\", \"neutrality\"]). 
" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Emotion Recognition with SpeechBrain PyTorch model: ['ang']\n" + ] + } + ], + "source": [ + "out_prob, score, index, text_lab = classifier.classify_file(\"speechbrain/emotion-recognition-wav2vec2-IEMOCAP/anger.wav\")\n", + "print(f\"Emotion Recognition with SpeechBrain PyTorch model: {text_lab}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## SpeechBrain model optimization with Intel OpenVINO\n", + "[back to top ⬆️](#Table-of-contents:)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 1: Prepare input tensor\n", + "[back to top ⬆️](#Table-of-contents:)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Using sample audio file\n", + "signals = []\n", + "batch_size = 1\n", + "signal, sr = torchaudio.load(str(\"./anger.wav\"), channels_first=False)\n", + "norm_audio = classifier.audio_normalizer(signal, sr)\n", + "signals.append(norm_audio)\n", + "\n", + "sequence_length = norm_audio.shape[-1]\n", + "\n", + "wavs = torch.stack(signals, dim=0)\n", + "wav_len = torch.tensor([sequence_length] * batch_size).unsqueeze(0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Step 2: Convert model to OpenVINO IR\n", + "[back to top ⬆️](#Table-of-contents:)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/psakhamo/.venv/lib/python3.10/site-packages/transformers/modeling_utils.py:4481: FutureWarning: `_is_quantized_training_enabled` is going to be deprecated in transformers 4.39.0. Please use `model.hf_quantizer.is_trainable` instead\n", + " warnings.warn(\n", + "/home/psakhamo/.venv/lib/python3.10/site-packages/transformers/models/wav2vec2/modeling_wav2vec2.py:968: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. 
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Step 3: OpenVINO model inference\n",
+    "[back to top ⬆️](#Table-of-contents:)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "4e5c6191b9e24e17a44eece6a41cccdc",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO')"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "import ipywidgets as widgets\n",
+    "\n",
+    "core = ov.Core()\n",
+    "\n",
+    "# Device selection\n",
+    "device = widgets.Dropdown(\n",
+    "    options=core.available_devices + [\"AUTO\"],\n",
+    "    value=\"AUTO\",\n",
+    "    description=\"Device:\",\n",
+    "    disabled=False,\n",
+    ")\n",
+    "\n",
+    "device"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Emotion Recognition with OpenVINO Model: ['ang']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Compile the OpenVINO model for the selected device\n",
+    "compiled_model = core.compile_model(ov_model, device.value)\n",
+    "\n",
+    "# Run inference and wrap the raw output back into a torch tensor\n",
+    "output_tensor = compiled_model(wavs)[0]\n",
+    "output_tensor = torch.from_numpy(output_tensor)\n",
+    "\n",
+    "# Post-processing: average pooling, MLP head, softmax, and label decoding,\n",
+    "# mirroring the original SpeechBrain pipeline\n",
+    "outputs = classifier.mods.avg_pool(output_tensor, wav_len)\n",
+    "outputs = outputs.view(outputs.shape[0], -1)\n",
+    "outputs = classifier.mods.output_mlp(outputs).squeeze(1)\n",
+    "ov_out_prob = classifier.hparams.softmax(outputs)\n",
+    "score, index = torch.max(ov_out_prob, dim=-1)\n",
+    "text_lab = classifier.hparams.label_encoder.decode_torch(index)\n",
+    "\n",
+    "print(f\"Emotion Recognition with OpenVINO Model: {text_lab}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}