From e68d6570f1fb5e82713c138e0c20ae5f1c96861d Mon Sep 17 00:00:00 2001 From: Liubov Talamanova Date: Fri, 14 Jul 2023 14:19:14 +0300 Subject: [PATCH] Add NNCF quantization for CLIP notebook (#1173) * Add NNCF quantization for CLIP notebook * Refactoring * Remove image from notebook * Add reduceSum to ignored scope * Move visualize_result to visualize.py * Fix comments * Fix message * Replace dataset * Add custom nncf version * Renamed notebooks * Fix names * Update notebooks/228-clip-zero-shot-image-classification/README.md Co-authored-by: Ekaterina Aidova * Update notebooks/228-clip-zero-shot-image-classification/README.md Co-authored-by: Ekaterina Aidova * make spell checker happy * Apply suggestions from code review --------- Co-authored-by: Ekaterina Aidova --- ...ipynb => 228-clip-zero-shot-convert.ipynb} | 54 +-- .../228-clip-zero-shot-quantize.ipynb | 423 ++++++++++++++++++ .../README.md | 19 +- .../visualize.py | 32 ++ 4 files changed, 484 insertions(+), 44 deletions(-) rename notebooks/228-clip-zero-shot-image-classification/{228-clip-zero-shot-image-classification.ipynb => 228-clip-zero-shot-convert.ipynb} (90%) create mode 100644 notebooks/228-clip-zero-shot-image-classification/228-clip-zero-shot-quantize.ipynb create mode 100644 notebooks/228-clip-zero-shot-image-classification/visualize.py diff --git a/notebooks/228-clip-zero-shot-image-classification/228-clip-zero-shot-image-classification.ipynb b/notebooks/228-clip-zero-shot-image-classification/228-clip-zero-shot-convert.ipynb similarity index 90% rename from notebooks/228-clip-zero-shot-image-classification/228-clip-zero-shot-image-classification.ipynb rename to notebooks/228-clip-zero-shot-image-classification/228-clip-zero-shot-convert.ipynb index fb8dd577322..e904f994516 100644 --- a/notebooks/228-clip-zero-shot-image-classification/228-clip-zero-shot-image-classification.ipynb +++ b/notebooks/228-clip-zero-shot-image-classification/228-clip-zero-shot-convert.ipynb @@ -58,46 +58,6 @@ "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch16\")" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from typing import List\n", - "import matplotlib.pyplot as plt\n", - "import numpy as np\n", - "from PIL import Image\n", - "\n", - "\n", - "def visualize_result(image:Image, labels:List[str], probs:np.ndarray, top:int = 5):\n", - " \"\"\" \n", - " Utility function for visualization classification results\n", - " params:\n", - " image: input image\n", - " labels: list of classification labels\n", - " probs: model predicted softmaxed probabilities for each label\n", - " top: number of the highest probability results for visualization\n", - " returns:\n", - " None\n", - " \"\"\"\n", - " plt.figure(figsize=(64, 64))\n", - " top_labels = np.argsort(-probs)[:min(top, probs.shape[0])]\n", - " top_probs = probs[top_labels]\n", - " plt.subplot(8, 8, 1)\n", - " plt.imshow(image)\n", - " plt.axis(\"off\")\n", - "\n", - " plt.subplot(8, 8, 2)\n", - " y = np.arange(top_probs.shape[-1])\n", - " plt.grid()\n", - " plt.barh(y, top_probs)\n", - " plt.gca().invert_yaxis()\n", - " plt.gca().set_axisbelow(True)\n", - " plt.yticks(y, [labels[index] for index in top_labels])\n", - " plt.xlabel(\"probability\") " - ] - }, { "attachments": {}, "cell_type": "markdown", @@ -115,6 +75,9 @@ "metadata": {}, "outputs": [], "source": [ + "from PIL import Image\n", + "from visualize import visualize_result\n", + "\n", "image = Image.open('../data/image/coco.jpg')\n", "input_labels = 
['cat', 'dog', 'wolf', 'tiger', 'man', 'horse', 'frog', 'tree', 'house', 'computer']\n", "text_descriptions = [f\"This is a photo of a {label}\" for label in input_labels]\n", @@ -197,7 +160,6 @@ "metadata": {}, "outputs": [], "source": [ - "import numpy as np\n", "from scipy.special import softmax\n", "from openvino.runtime import Core\n", "\n", @@ -290,6 +252,16 @@ "# visualize prediction\n", "visualize_result(image, labels, probs[0])" ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Next Steps\n", + "\n", + "Open the [228-clip-zero-shot-quantize](228-clip-zero-shot-quantize.ipynb) notebook to quantize the IR model with the Post-training Quantization API of NNCF and compare `FP16` and `INT8` models." + ] } ], "metadata": { diff --git a/notebooks/228-clip-zero-shot-image-classification/228-clip-zero-shot-quantize.ipynb b/notebooks/228-clip-zero-shot-image-classification/228-clip-zero-shot-quantize.ipynb new file mode 100644 index 00000000000..9ab8ad2ea30 --- /dev/null +++ b/notebooks/228-clip-zero-shot-image-classification/228-clip-zero-shot-quantize.ipynb @@ -0,0 +1,423 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "4ad832c5-71db-456f-9702-fb8f3b489e46", + "metadata": {}, + "source": [ + "# Post-Training Quantization of OpenAI CLIP model with NNCF\n", + "\n", + "The goal of this tutorial is to demonstrate how to speed up the model by applying 8-bit post-training quantization from [NNCF](https://github.com/openvinotoolkit/nncf/) (Neural Network Compression Framework) and infer quantized model via OpenVINO™ Toolkit. The optimization process contains the following steps:\n", + "\n", + "1. Quantize the converted OpenVINO model from [notebook](228-clip-zero-shot-convert.ipynb) with NNCF.\n", + "2. Check the model result using the same input data from the [notebook](228-clip-zero-shot-convert.ipynb).\n", + "3. Compare model size of converted and quantized models.\n", + "4. Compare performance of converted and quantized models.\n", + "\n", + "> **NOTE**: you should run [228-clip-zero-shot-convert](228-clip-zero-shot-convert.ipynb) notebook first to generate OpenVINO IR model that is used for quantization." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "527c4c05", + "metadata": {}, + "source": [ + "## Prerequisites" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e20c0fa1", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install -q datasets\n", + "!pip install -q git+https://github.com/openvinotoolkit/nncf.git@6c0aebadd2fcdbe1481a11b40b8cd9f66b3b6fab" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "488397fd", + "metadata": {}, + "source": [ + "## Create and initialize quantization\n", + "\n", + "[NNCF](https://github.com/openvinotoolkit/nncf/) enables post-training quantization by adding the quantization layers into the model graph and then using a subset of the training dataset to initialize the parameters of these additional quantization layers. The framework is designed so that modifications to your original training code are minor. Quantization is the simplest scenario and requires a few modifications.\n", + "\n", + "The optimization process contains the following steps:\n", + "1. Create a Dataset for quantization.\n", + "2. Run `nncf.quantize` for getting a quantized model.\n", + "3. Serialize the `INT8` model using `openvino.runtime.serialize` function." 
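The remaining cells of this notebook implement these three steps in full. As a compact preview of how the pieces fit together (a minimal sketch only — `calibration_samples` is a placeholder for the calibration data prepared in the next section, and the file names match the ones used later in this notebook):

```python
import nncf
from openvino.runtime import Core, serialize

core = Core()
# FP16 IR produced by the convert notebook
ov_model = core.read_model("clip-vit-base-patch16.xml")

# Step 1: wrap the preprocessed calibration samples for NNCF
calibration_dataset = nncf.Dataset(calibration_samples)

# Step 2: run 8-bit post-training quantization
quantized_model = nncf.quantize(
    model=ov_model,
    calibration_dataset=calibration_dataset,
    model_type=nncf.ModelType.TRANSFORMER,
)

# Step 3: serialize the INT8 model
serialize(quantized_model, "clip-vit-base-patch16_int8.xml")
```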
+ ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "f261e6a4", + "metadata": {}, + "source": [ + "### Prepare datasets" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "65d8d559", + "metadata": {}, + "source": [ + "The [Conceptual Captions](https://ai.google.com/research/ConceptualCaptions/) dataset consisting of ~3.3M images annotated with captions is used to quantize model." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "924025b4", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "fp16_model_path = 'clip-vit-base-patch16.xml'\n", + "if not os.path.exists(fp16_model_path):\n", + " raise RuntimeError('This notebook should be run after 228-clip-zero-shot-convert.ipynb.')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6865fdb6", + "metadata": {}, + "outputs": [], + "source": [ + "from transformers import CLIPProcessor, CLIPModel\n", + "\n", + "model = CLIPModel.from_pretrained(\"openai/clip-vit-base-patch16\")\n", + "max_length = model.config.text_config.max_position_embeddings\n", + "processor = CLIPProcessor.from_pretrained(\"openai/clip-vit-base-patch16\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "40d71747", + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from io import BytesIO\n", + "from PIL import Image\n", + "from requests.packages.urllib3.exceptions import InsecureRequestWarning\n", + "requests.packages.urllib3.disable_warnings(InsecureRequestWarning)\n", + "\n", + "def check_text_data(data):\n", + " \"\"\"\n", + " Check if the given data is text-based.\n", + " \"\"\"\n", + " if isinstance(data, str):\n", + " return True\n", + " if isinstance(data, list):\n", + " return all(isinstance(x, str) for x in data)\n", + " return False\n", + "\n", + "def get_pil_from_url(url):\n", + " \"\"\"\n", + " Downloads and converts an image from a URL to a PIL Image object.\n", + " \"\"\"\n", + " response = requests.get(url, verify=False, timeout=20)\n", + " image = Image.open(BytesIO(response.content))\n", + " return image.convert(\"RGB\")\n", + "\n", + "def collate_fn(example, image_column=\"image_url\", text_column=\"caption\"):\n", + " \"\"\"\n", + " Preprocesses an example by loading and transforming image and text data.\n", + " Checks if the text data in the example is valid by calling the `check_text_data` function.\n", + " Downloads the image specified by the URL in the image_column by calling the `get_pil_from_url` function.\n", + " If there is any error during the download process, returns None.\n", + " Returns the preprocessed inputs with transformed image and text data.\n", + " \"\"\"\n", + " assert len(example) == 1\n", + " example = example[0]\n", + "\n", + " if not check_text_data(example[text_column]):\n", + " raise ValueError(\"Text data is not valid\")\n", + "\n", + " url = example[image_column]\n", + " try:\n", + " image = get_pil_from_url(url)\n", + " except Exception:\n", + " return None\n", + "\n", + " inputs = processor(text=example[text_column], images=[image], return_tensors=\"pt\", padding=True)\n", + " if inputs['input_ids'].shape[1] > max_length:\n", + " return None\n", + " return inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "befc2204", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from datasets import load_dataset\n", + "\n", + "def prepare_calibration_data(dataloader, init_steps):\n", + " \"\"\"\n", + " This function prepares calibration data from a 
dataloader for a specified number of initialization steps.\n",
+    "    It iterates over the dataloader, fetching batches and storing the relevant data.\n",
+    "    \"\"\"\n",
+    "    data = []\n",
+    "    print(f\"Fetching {init_steps} samples for the initialization...\")\n",
+    "    counter = 0\n",
+    "    for batch in dataloader:\n",
+    "        if counter == init_steps:\n",
+    "            break\n",
+    "        if batch:\n",
+    "            counter += 1\n",
+    "            with torch.no_grad():\n",
+    "                data.append(\n",
+    "                    {\n",
+    "                        \"pixel_values\": batch[\"pixel_values\"].to(\"cpu\"),\n",
+    "                        \"input_ids\": batch[\"input_ids\"].to(\"cpu\"),\n",
+    "                        \"attention_mask\": batch[\"attention_mask\"].to(\"cpu\")\n",
+    "                    }\n",
+    "                )\n",
+    "    return data\n",
+    "\n",
+    "\n",
+    "def prepare_dataset(opt_init_steps=300, max_train_samples=1000):\n",
+    "    \"\"\"\n",
+    "    Prepares a vision-text dataset for quantization.\n",
+    "    \"\"\"\n",
+    "    dataset = load_dataset(\"conceptual_captions\", streaming=True)\n",
+    "    train_dataset = dataset[\"train\"].shuffle(seed=42, buffer_size=max_train_samples)\n",
+    "    dataloader = torch.utils.data.DataLoader(train_dataset, collate_fn=collate_fn, batch_size=1)\n",
+    "    calibration_data = prepare_calibration_data(dataloader, opt_init_steps)\n",
+    "    return calibration_data"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "6e56bc39",
+   "metadata": {},
+   "source": [
+    "Create a quantized model from the pre-trained `FP16` model.\n",
+    "\n",
+    "> **NOTE**: Quantization is a time- and memory-consuming operation. Running the quantization code below may take a long time."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7b53d9b5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import logging\n",
+    "import nncf\n",
+    "from openvino.runtime import Core, serialize\n",
+    "\n",
+    "nncf.set_log_level(logging.ERROR)\n",
+    "\n",
+    "int8_model_path = 'clip-vit-base-patch16_int8.xml'\n",
+    "calibration_data = prepare_dataset()\n",
+    "ov_model = Core().read_model(fp16_model_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "4137cd4b",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Statistics collection: 100%|██████████| 300/300 [00:12<00:00, 24.04it/s]\n",
+      "Statistics collection: 100%|██████████| 300/300 [00:49<00:00, 6.05it/s]\n",
+      "Applying Fast Bias correction: 100%|██████████| 144/144 [00:42<00:00, 3.35it/s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "if len(calibration_data) == 0:\n",
+    "    raise RuntimeError(\n",
+    "        'Calibration dataset is empty. Please check your internet connection and try to download images manually.'\n",
+    "    )\n",
+    "\n",
+    "calibration_dataset = nncf.Dataset(calibration_data)\n",
+    "quantized_model = nncf.quantize(\n",
+    "    model=ov_model,\n",
+    "    calibration_dataset=calibration_dataset,\n",
+    "    model_type=nncf.ModelType.TRANSFORMER,\n",
+    ")\n",
+    "serialize(quantized_model, int8_model_path)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "8af6d642",
+   "metadata": {},
+   "source": [
+    "NNCF also supports quantization-aware training and algorithms other than quantization.\n",
+    "See the [NNCF documentation](https://github.com/openvinotoolkit/nncf/#documentation) in the NNCF repository for more information."
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "b0508c15",
+   "metadata": {},
+   "source": [
+    "## Run quantized OpenVINO model\n",
+    "\n",
+    "The steps for making predictions with the quantized OpenVINO CLIP model are similar to those for the PyTorch model.
Let us check the model result using the same input data from the [first notebook](228-clip-zero-shot-convert.ipynb)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2e029ccd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "from scipy.special import softmax\n",
+    "from openvino.runtime import compile_model\n",
+    "from visualize import visualize_result\n",
+    "\n",
+    "image = Image.open('../data/image/coco.jpg')\n",
+    "input_labels = ['cat', 'dog', 'wolf', 'tiger', 'man', 'horse', 'frog', 'tree', 'house', 'computer']\n",
+    "text_descriptions = [f\"This is a photo of a {label}\" for label in input_labels]\n",
+    "\n",
+    "inputs = processor(text=text_descriptions, images=[image], return_tensors=\"pt\", padding=True)\n",
+    "compiled_model = compile_model(int8_model_path)\n",
+    "logits_per_image_out = compiled_model.output(0)\n",
+    "ov_logits_per_image = compiled_model(dict(inputs))[logits_per_image_out]\n",
+    "probs = softmax(ov_logits_per_image, axis=1)\n",
+    "visualize_result(image, input_labels, probs[0])"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "c5173300",
+   "metadata": {},
+   "source": [
+    "#### Compare File Size"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "d87de6af",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "FP16 IR model size: 292228.96 KB\n",
+      "INT8 model size: 172178.13 KB\n",
+      "Model compression rate: 1.697\n"
+     ]
+    }
+   ],
+   "source": [
+    "from pathlib import Path\n",
+    "\n",
+    "fp16_ir_model_size = Path(fp16_model_path).with_suffix(\".bin\").stat().st_size / 1024\n",
+    "quantized_model_size = Path(int8_model_path).with_suffix(\".bin\").stat().st_size / 1024\n",
+    "print(f\"FP16 IR model size: {fp16_ir_model_size:.2f} KB\")\n",
+    "print(f\"INT8 model size: {quantized_model_size:.2f} KB\")\n",
+    "print(f\"Model compression rate: {fp16_ir_model_size / quantized_model_size:.3f}\")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "5c5bd231",
+   "metadata": {},
+   "source": [
+    "#### Compare inference time of the FP16 IR and quantized models\n",
+    "To measure the inference performance of the `FP16` and `INT8` models, we use the median inference time over the calibration dataset.\n",
+    "In this way, we can approximately estimate the speed up for models with dynamic input shapes.\n",
+    "\n",
+    "\n",
+    "> **NOTE**: For the most accurate performance estimation, it is recommended to run `benchmark_app` with static shapes in a terminal/command prompt after closing other applications."
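For reference, a `benchmark_app` run along the lines suggested in the note might look like the sketch below. The input names and static shapes are assumptions for illustration only — check the actual names and shapes of the IR inputs (for example, via `Core().read_model(int8_model_path).inputs`) before running it.

```sh
# Illustrative only: the input names and shapes must match the actual CLIP IR inputs.
benchmark_app -m clip-vit-base-patch16_int8.xml -d CPU -api sync -t 15 \
    -shape "input_ids[10,7],attention_mask[10,7],pixel_values[1,3,224,224]"
```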
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "92ce5218", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "from openvino.runtime import compile_model\n", + "\n", + "def calculate_inference_time(model_path, calibration_data):\n", + " model = compile_model(model_path)\n", + " output_layer = model.output(0)\n", + " inference_time = []\n", + " for batch in calibration_data:\n", + " start = time.perf_counter()\n", + " _ = model(batch)[output_layer]\n", + " end = time.perf_counter()\n", + " delta = end - start\n", + " inference_time.append(delta)\n", + " return np.median(inference_time)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Performance speed up: 1.431\n" + ] + } + ], + "source": [ + "fp16_latency = calculate_inference_time(fp16_model_path, calibration_data)\n", + "int8_latency = calculate_inference_time(int8_model_path, calibration_data)\n", + "print(f\"Performance speed up: {fp16_latency / int8_latency:.3f}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/228-clip-zero-shot-image-classification/README.md b/notebooks/228-clip-zero-shot-image-classification/README.md index 126af5978ee..15f3ba4c9a1 100644 --- a/notebooks/228-clip-zero-shot-image-classification/README.md +++ b/notebooks/228-clip-zero-shot-image-classification/README.md @@ -2,21 +2,34 @@ Zero-shot image classification is a computer vision task to classify images into one of several classes, without any prior training or knowledge of the classes. ![zero-shot-pipeline](https://user-images.githubusercontent.com/29454499/207773481-d77cacf8-6cdc-4765-a31b-a1669476d620.png) + In this tutorial, you will use [OpenAI CLIP](https://github.com/openai/CLIP) model to perform zero-shot image classification. ## Notebook Contents -This notebook demonstrates how to perform zero-shot image classification using the open-source CLIP model. CLIP is a multi-modal vision and language model. It can be instructed in natural language to predict the most relevant text snippet, given an image, without directly optimizing for the task. According to the [paper](https://arxiv.org/abs/2103.00020), CLIP matches the performance of the original ResNet50 on ImageNet “zero-shot” without using any of the original 1.28M labeled examples, overcoming several major challenges in computer vision. +This tutorial demonstrates how to perform zero-shot image classification using the open-source CLIP model. CLIP is a multi-modal vision and language model. It can be instructed in natural language to predict the most relevant text snippet, given an image, without directly optimizing for the task. According to the [paper](https://arxiv.org/abs/2103.00020), CLIP matches the performance of the original ResNet50 on ImageNet “zero-shot” without using any of the original 1.28M labeled examples, overcoming several major challenges in computer vision. 
You can find more information about this model in the [research paper](https://arxiv.org/abs/2103.00020), [OpenAI blog](https://openai.com/blog/clip/), [model card](https://github.com/openai/CLIP/blob/main/model-card.md) and GitHub [repository](https://github.com/openai/CLIP).
 
-Notebook contains the following steps:
+This folder contains two notebooks that show how to convert and quantize the model with OpenVINO:
+
+1. [Convert the CLIP model using OpenVINO](228-clip-zero-shot-convert.ipynb)
+2. [Quantize the OpenVINO CLIP model using NNCF](228-clip-zero-shot-quantize.ipynb)
+
+The first notebook contains the following steps:
 1. Download the model.
 2. Instantiate the PyTorch model.
 3. Export the ONNX model and convert it to OpenVINO IR, using the Model Optimizer tool.
 4. Run CLIP with OpenVINO.
-The image below shows an example of the notebook work.
+The second notebook contains the following steps:
+1. Quantize the converted OpenVINO model from the [first notebook](228-clip-zero-shot-convert.ipynb) with NNCF.
+2. Check the model result using the same input data from the [first notebook](228-clip-zero-shot-convert.ipynb).
+3. Compare the model size of the converted and quantized models.
+4. Compare the performance of the converted and quantized models.
+
+NNCF performs quantization within the OpenVINO IR, so you need to run the first notebook before running the second one.
+We use the CLIP model for zero-shot image classification. The image below shows an example of the classification results.
 ![image](https://user-images.githubusercontent.com/29454499/207795060-437b42f9-e801-4332-a91f-cc26471e5ba2.png)
 
 ## Installation Instructions
diff --git a/notebooks/228-clip-zero-shot-image-classification/visualize.py b/notebooks/228-clip-zero-shot-image-classification/visualize.py
new file mode 100644
index 00000000000..94ba9fcc9a1
--- /dev/null
+++ b/notebooks/228-clip-zero-shot-image-classification/visualize.py
@@ -0,0 +1,32 @@
+from typing import List
+import matplotlib.pyplot as plt
+import numpy as np
+from PIL import Image
+
+
+def visualize_result(image: Image.Image, labels: List[str], probs: np.ndarray, top: int = 5):
+    """
+    Utility function for visualizing classification results
+    params:
+      image: input image
+      labels: list of classification labels
+      probs: model predicted softmaxed probabilities for each label
+      top: number of the highest probability results for visualization
+    returns:
+      None
+    """
+    plt.figure(figsize=(64, 64))
+    top_labels = np.argsort(-probs)[:min(top, probs.shape[0])]
+    top_probs = probs[top_labels]
+    plt.subplot(8, 8, 1)
+    plt.imshow(image)
+    plt.axis("off")
+
+    plt.subplot(8, 8, 2)
+    y = np.arange(top_probs.shape[-1])
+    plt.grid()
+    plt.barh(y, top_probs)
+    plt.gca().invert_yaxis()
+    plt.gca().set_axisbelow(True)
+    plt.yticks(y, [labels[index] for index in top_labels])
+    plt.xlabel("probability")
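For reference, a minimal usage sketch of the `visualize_result` helper added above. The probability values below are made-up placeholders; in the notebooks, `probs` comes from the softmaxed logits produced by the CLIP model.

```python
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image

from visualize import visualize_result

image = Image.open("../data/image/coco.jpg")      # any RGB image works here
labels = ["cat", "dog", "wolf", "tiger", "man"]
probs = np.array([0.85, 0.10, 0.03, 0.01, 0.01])  # placeholder softmaxed scores
visualize_result(image, labels, probs, top=5)
plt.show()                                        # needed when running as a plain script
```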