Commit 45f3b9a
Co-authored-by: Andrei Anufriev <andrey.anufriev@intel.com>
1 parent: 4896dbb
Showing 11 changed files with 2,229 additions and 3 deletions.
```
@@ -436,6 +436,7 @@ llava
llm
LLM
LLMs
LM
LMS
LLMPipeline
logits
```
New file: `notebooks/mllama-3.2/README.md` (@@ -0,0 +1,27 @@)
# Visual-language assistant with Llama-3.2-11B-Vision and OpenVINO

Llama-3.2-11B-Vision is the latest model in the Llama 3 family, with capabilities extended to understanding image content.
More details about the model can be found in the [model card](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/MODEL_CARD_VISION.md) and the original [repo](https://github.com/meta-llama/llama-models).

In this tutorial, we consider how to convert and optimize the Llama-Vision model for creating a multimodal chatbot. Additionally, we demonstrate how to apply a stateful transformation to the LLM part and model optimization techniques such as weight compression and quantization using [NNCF](https://github.com/openvinotoolkit/nncf); a weight-compression sketch follows below.
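
As a minimal illustration, NNCF weight compression of the converted language model might look like the sketch below. The IR path and the `group_size`/`ratio` values are assumptions for illustration, not the notebook's exact settings.

```python
import nncf
import openvino as ov

core = ov.Core()
# Hypothetical path to the converted language-model IR.
lm_model = core.read_model("Llama-3.2-11B-Vision-Instruct/OV/llm.xml")

# Compress most linear-layer weights to INT4; group_size and ratio are illustrative.
compressed_lm = nncf.compress_weights(
    lm_model,
    mode=nncf.CompressWeightsMode.INT4_ASYM,
    group_size=64,
    ratio=0.8,
)
ov.save_model(compressed_lm, "Llama-3.2-11B-Vision-Instruct/OV/llm_int4.xml")
```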

## Notebook contents
The tutorial consists of the following steps:

- Install requirements
- Convert the model
- Optimize the language model using weight compression
- Optimize the image encoder using post-training quantization (see the sketch after this list)
- Run OpenVINO model inference
- Launch the interactive demo
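
For the quantization step, a hedged sketch of post-training quantization of the image encoder with NNCF is shown below. The module name `data_preprocessing` and the encoder IR path are assumptions; `prepare_dataset_vision` comes from the helper script added in this commit.

```python
import nncf
import openvino as ov
from transformers import AutoProcessor

# prepare_dataset_vision is defined in the helper script added in this commit;
# the module name "data_preprocessing" is assumed here.
from data_preprocessing import prepare_dataset_vision

model_dir = "Llama-3.2-11B-Vision-Instruct/OV"  # directory the helper script also references
processor = AutoProcessor.from_pretrained(model_dir)
calibration_data = prepare_dataset_vision(processor, opt_init_steps=50)

core = ov.Core()
vision_encoder = core.read_model("Llama-3.2-11B-Vision-Instruct/OV/image_encoder.xml")  # assumed IR path

quantized_encoder = nncf.quantize(
    vision_encoder,
    nncf.Dataset(calibration_data),
    model_type=nncf.ModelType.TRANSFORMER,
    subset_size=len(calibration_data),
)
ov.save_model(quantized_encoder, "Llama-3.2-11B-Vision-Instruct/OV/image_encoder_int8.xml")
```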

In this demonstration, you'll create an interactive chatbot that can answer questions about the content of a provided image.

The image below illustrates an example of an input prompt and the model's answer.
![example.png](https://github.com/user-attachments/assets/1e3fde78-bae5-4b9a-8ef3-ea1291b288cf)

## Installation instructions
This is a self-contained example that relies solely on its own code.<br/>
We recommend running the notebook in a virtual environment. You only need a Jupyter server to start.
For details, please refer to the [Installation Guide](../../README.md).
<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=5b5a4db0-7875-4bfb-bdbd-01698b5b1a77&file=notebooks/mllama-3.2/README.md" />
New Python helper file (@@ -0,0 +1,254 @@):

```python
import torch
from datasets import load_dataset
from transformers import AutoProcessor
from tqdm.autonotebook import tqdm
from pathlib import Path
import pickle
import gc

import requests
from io import BytesIO
import numpy as np
from PIL import Image
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from ov_mllama_helper import OVMLlamaForConditionalGeneration


# Maximum tokenized prompt length; longer calibration samples are skipped.
max_length = 4048


def check_text_data(data):
    """
    Check if the given data is text-based.
    """
    if isinstance(data, str):
        return True
    if isinstance(data, list):
        return all(isinstance(x, str) for x in data)
    return False


def get_pil_from_url(url):
    """
    Downloads and converts an image from a URL to a PIL Image object.
    """
    response = requests.get(url, verify=False, timeout=20)
    image = Image.open(BytesIO(response.content))
    return image.convert("RGB")


def prepare_calibration_data_vision(dataloader, init_steps):
    """
    Prepares calibration data for the vision encoder from a dataloader,
    collecting up to `init_steps` samples.
    """
    prompt = "<|image|><|begin_of_text|>If I had to write a haiku for this one"
    url = "https://www.ilankelman.org/stopsigns/australia.jpg"
    image = Image.open(requests.get(url, stream=True).raw)
    model_id = "Llama-3.2-11B-Vision-Instruct/OV"
    processor = AutoProcessor.from_pretrained(model_id)
    # Aspect-ratio metadata is taken from this fixed reference sample,
    # while pixel values come from each dataloader batch.
    inputs = processor(text=prompt, images=image, return_tensors="pt")
    data = []
    print(f"Fetching {init_steps} samples for the initialization...")
    with tqdm(total=init_steps) as pbar:
        for batch in dataloader:
            if len(data) == init_steps:
                break
            if batch:
                pbar.update(1)
                with torch.no_grad():
                    data.append(
                        {
                            "pixel_values": batch["pixel_values"].to("cpu"),
                            "aspect_ratio_ids": inputs.data["aspect_ratio_ids"].to("cpu"),
                            "aspect_ratio_mask": inputs.data["aspect_ratio_mask"],
                        }
                    )
    return data


def prepare_dataset_vision(processor, opt_init_steps=50, max_train_samples=1000, file_path="vision_dataset.pickle", save_dataset=True):
    """
    Prepares a vision-text calibration dataset for quantization of the image encoder.
    """

    def collate_fn(example, image_column="image_url", text_column="caption"):
        """
        Preprocesses an example by loading and transforming image and text data.
        Checks that the text data in the example is valid by calling the `check_text_data` function.
        Downloads the image specified by the URL in `image_column` by calling the `get_pil_from_url` function.
        If there is any error during the download process, returns None.
        Returns the preprocessed inputs with transformed image and text data.
        """
        assert len(example) == 1
        example = example[0]

        if not check_text_data(example[text_column]):
            raise ValueError("Text data is not valid")

        url = example[image_column]
        try:
            image = get_pil_from_url(url)
            w, h = image.size  # PIL reports (width, height)
            if w == 1 or h == 1:
                return None
        except Exception:
            return None
        inputs = processor(
            text="<|image|><|begin_of_text|> Please describe image content based on information: " + example[text_column],
            images=image,
            return_tensors="pt",
            padding=True,
        )
        if inputs["input_ids"].shape[1] > max_length:
            return None
        return inputs

    if not Path(file_path).exists():
        dataset = load_dataset("google-research-datasets/conceptual_captions", trust_remote_code=True)
        train_dataset = dataset["train"].shuffle(seed=42)
        dataloader = torch.utils.data.DataLoader(train_dataset, collate_fn=collate_fn, batch_size=1)
        calibration_data = prepare_calibration_data_vision(dataloader, opt_init_steps)
        if save_dataset:
            print(f"Calibration dataset will be saved in {file_path}")
            with open(file_path, "wb") as f:
                pickle.dump(calibration_data, f)
    else:
        with open(file_path, "rb") as f:
            calibration_data = pickle.load(f)

    return calibration_data


def prepare_calibration_data_llm(dataloader, init_steps, mllm, processor):
    """
    Prepares calibration data for the language model from a dataloader,
    collecting up to `init_steps` samples.
    """
    data = []

    prompt = "<|image|><|begin_of_text|>If I had to write a haiku for this one"
    url = "https://www.ilankelman.org/stopsigns/australia.jpg"
    image = Image.open(requests.get(url, stream=True).raw)
    inputs = processor(text=prompt, images=image, return_tensors="pt")

    print(f"Fetching {init_steps} samples for the initialization...")
    with tqdm(total=init_steps) as pbar:
        for batch in dataloader:
            if len(data) == init_steps:
                break
            if batch:
                pbar.update(1)
                with torch.no_grad():
                    # Position of each token in the KV cache; padding tokens are mapped to 1.
                    cache_position = np.cumsum(batch.data["attention_mask"].to("cpu"), axis=1) - 1
                    cache_position[batch.data["attention_mask"] == 0] = 1

                    vision_input = {
                        "pixel_values": batch["pixel_values"].to("cpu"),
                        "aspect_ratio_ids": batch.data["aspect_ratio_ids"].to("cpu"),
                        "aspect_ratio_mask": batch.data["aspect_ratio_mask"].to("cpu"),
                        "cross_attention_mask": batch.data["cross_attention_mask"].to("cpu"),
                        "cache_position": cache_position[0, :],
                    }

                    # Run the vision path once so calibration samples carry real cross-attention states.
                    cross_attention_states = mllm.prepare_vision_outputs(**vision_input)
                    res = {"input_ids": batch.data["input_ids"].to("cpu"), "attention_mask": batch.data["attention_mask"].to("cpu"), **cross_attention_states}
                    position_ids = np.cumsum(res["attention_mask"], axis=1) - 1
                    position_ids[res["attention_mask"] == 0] = 1
                    res["position_ids"] = position_ids

                    res = mllm.prepare_llm_inputs(**res)
                    data.append(res)
    return data


def prepare_dataset_llm(mllm_id, opt_init_steps=50, max_train_samples=1000, file_path="llm_dataset.pickle", save_dataset=False):
    """
    Prepares a vision-text calibration dataset for quantization of the language model.
    """

    if Path(file_path).exists():
        print(f"Calibration dataset will be loaded from {file_path}")
        with open(file_path, "rb") as f:
            calibration_data = pickle.load(f)
        return calibration_data

    mllm = OVMLlamaForConditionalGeneration(mllm_id, slice_lm_head=False)
    processor = AutoProcessor.from_pretrained(mllm_id)

    def collate_fn(example, image_column="image_url", text_column="caption"):
        """
        Preprocesses an example by loading and transforming image and text data.
        Checks that the text data in the example is valid by calling the `check_text_data` function.
        Downloads the image specified by the URL in `image_column` by calling the `get_pil_from_url` function.
        If there is any error during the download process, returns None.
        Returns the preprocessed inputs with transformed image and text data.
        """
        assert len(example) == 1
        example = example[0]

        if not check_text_data(example[text_column]):
            raise ValueError("Text data is not valid")

        url = example[image_column]
        try:
            image = get_pil_from_url(url)
            w, h = image.size  # PIL reports (width, height)
            if w == 1 or h == 1:
                return None
        except Exception:
            return None
        inputs = processor(
            text="<|image|><|begin_of_text|> Please describe image content based on information: " + example[text_column],
            images=image,
            return_tensors="pt",
            padding=True,
        )
        if inputs["input_ids"].shape[1] > max_length:
            return None
        return inputs

    dataset = load_dataset("google-research-datasets/conceptual_captions", trust_remote_code=True)
    train_dataset = dataset["train"].shuffle(seed=42)
    dataloader = torch.utils.data.DataLoader(train_dataset, collate_fn=collate_fn, batch_size=1)
    calibration_data = prepare_calibration_data_llm(dataloader, opt_init_steps, mllm, processor)

    if save_dataset:
        print(f"Calibration data will be saved into {file_path}")
        with open(file_path, "wb") as f:
            pickle.dump(calibration_data, f)

    del mllm
    gc.collect()

    return calibration_data
```
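
A hedged usage sketch for these helpers, assuming the module is saved as `data_preprocessing.py` and the converted model and processor live in `Llama-3.2-11B-Vision-Instruct/OV` (the directory the code above references):

```python
# Usage sketch; the module name "data_preprocessing" is an assumption.
from data_preprocessing import prepare_dataset_vision, prepare_dataset_llm
from transformers import AutoProcessor

model_dir = "Llama-3.2-11B-Vision-Instruct/OV"
processor = AutoProcessor.from_pretrained(model_dir)

# Vision-encoder calibration samples (cached in vision_dataset.pickle after the first run).
vision_calibration = prepare_dataset_vision(processor, opt_init_steps=50)

# Language-model calibration samples, traced through the OpenVINO vision path.
llm_calibration = prepare_dataset_llm(model_dir, opt_init_steps=50)
```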