
add notebook (#2408)
Co-authored-by: Andrei Anufriev <andrey.anufriev@intel.com>
eaidova and andreyanufr authored Sep 24, 2024
1 parent 4896dbb commit 45f3b9a
Showing 11 changed files with 2,229 additions and 3 deletions.
3 changes: 2 additions & 1 deletion .ci/ignore_convert_execution.txt
@@ -61,4 +61,5 @@ notebooks/hunyuan-dit-image-generation/hunyuan-dit-image-generation.ipynb
notebooks/stable-diffusion-v3/stable-diffusion-v3.ipynb
notebooks/llm-rag-llamaindex/llm-rag-llamaindex.ipynb
notebooks/llm-agent-functioncall/llm-agent-functioncall-qwen.ipynb
notebooks/llm-agent-react/llm-agent-rag-llamaindex.ipynb
notebooks/llm-agent-react/llm-agent-rag-llamaindex.ipynb
notebooks/mllama-3.2/mllama-3.2.ipynb
1 change: 1 addition & 0 deletions .ci/ignore_treon_docker.txt
@@ -81,4 +81,5 @@ notebooks/internvl2/internvl2.ipynb
notebooks/qwen2-vl/qwen2-vl.ipynb
notebooks/qwen2-audio/qwen2-audio.ipynb
notebooks/stable-fast-3d/stable-fast-3d.ipynb
notebooks/mllama-3.2/mllama-3.2.ipynb
notebooks/segment-anything/segment-anything-2-image.ipynb
7 changes: 7 additions & 0 deletions .ci/skipped_notebooks.yml
@@ -587,6 +587,13 @@
        - '3.8'
    - os:
        - macos-12
- notebook: notebooks/mllama-3.2/mllama-3.2.ipynb
  skips:
    - os:
        - macos-12
        - ubuntu-20.04
        - ubuntu-22.04
        - windows-2019
- notebook: notebooks/llm-agent-react/llm-agent-react-langchain.ipynb
  skips:
    - python:
1 change: 1 addition & 0 deletions .ci/spellcheck/.pyspelling.wordlist.txt
@@ -436,6 +436,7 @@ llava
llm
LLM
LLMs
LM
LMS
LLMPipeline
logits
27 changes: 27 additions & 0 deletions notebooks/mllama-3.2/README.md
@@ -0,0 +1,27 @@
# Visual-language assistant with Llama-3.2-11B-Vision and OpenVINO

Llama-3.2-11B-Vision is the latest model in the Llama 3 model family, with capabilities extended to understanding image content.
More details about the model can be found in the [model card](https://github.com/meta-llama/llama-models/blob/main/models/llama3_2/MODEL_CARD_VISION.md) and the original [repo](https://github.com/meta-llama/llama-models).

In this tutorial we consider how to convert and optimize the Llama-Vision model for creating a multimodal chatbot. Additionally, we demonstrate how to apply a stateful transformation to the LLM part and model optimization techniques such as weight compression and quantization using [NNCF](https://github.com/openvinotoolkit/nncf).
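
As a rough illustration, weight compression of the converted language model with NNCF might look like the sketch below (the model path, file names, and compression parameters are assumptions for illustration; the notebook chooses its own settings):

```python
import nncf
import openvino as ov

core = ov.Core()
# Hypothetical path to the converted LLM part of Llama-3.2-11B-Vision
lm_model = core.read_model("Llama-3.2-11B-Vision-Instruct/OV/llm.xml")

# Compress weights to 4-bit; mode, group_size, and ratio are illustrative values
compressed_lm = nncf.compress_weights(
    lm_model,
    mode=nncf.CompressWeightsMode.INT4_ASYM,
    group_size=64,
    ratio=0.8,
)
ov.save_model(compressed_lm, "Llama-3.2-11B-Vision-Instruct/OV/llm_int4.xml")
```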

## Notebook contents
The tutorial consists of the following steps:

- Install requirements
- Convert the model
- Optimize the language model using weight compression
- Optimize the image encoder using post-training quantization (a minimal sketch of this step follows this list)
- Run OpenVINO model inference
- Launch an interactive demo
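
For the image encoder, post-training quantization with NNCF could follow the sketch below (the converted model directory, the encoder file name, and the calibration-set size are assumptions; the notebook builds its calibration data from the Conceptual Captions dataset using `data_preprocessing.py`):

```python
import nncf
import openvino as ov
from transformers import AutoProcessor

from data_preprocessing import prepare_dataset_vision

model_dir = "Llama-3.2-11B-Vision-Instruct/OV"  # assumed location of the converted model

core = ov.Core()
vision_encoder = core.read_model(f"{model_dir}/vision_encoder.xml")  # hypothetical file name

# Collect a small calibration set of preprocessed image inputs
processor = AutoProcessor.from_pretrained(model_dir)
calibration_data = prepare_dataset_vision(processor, opt_init_steps=32)

quantized_encoder = nncf.quantize(
    vision_encoder,
    nncf.Dataset(calibration_data),
    model_type=nncf.ModelType.TRANSFORMER,
    subset_size=len(calibration_data),
)
ov.save_model(quantized_encoder, f"{model_dir}/vision_encoder_int8.xml")
```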

In this demonstration, you'll create an interactive chatbot that can answer questions about the content of a provided image.

The image below illustrates an example of an input prompt and the model's answer.
![example.png](https://github.com/user-attachments/assets/1e3fde78-bae5-4b9a-8ef3-ea1291b288cf)

## Installation instructions
This is a self-contained example that relies solely on its own code.<br/>
We recommend running the notebook in a virtual environment. You only need a Jupyter server to start.
For details, please refer to [Installation Guide](../../README.md).
<img referrerpolicy="no-referrer-when-downgrade" src="https://static.scarf.sh/a.png?x-pxid=5b5a4db0-7875-4bfb-bdbd-01698b5b1a77&file=notebooks/mllama-3.2/README.md" />
254 changes: 254 additions & 0 deletions notebooks/mllama-3.2/data_preprocessing.py
@@ -0,0 +1,254 @@
import torch
from datasets import load_dataset
from transformers import AutoProcessor
from tqdm.autonotebook import tqdm
from pathlib import Path
import pickle
import gc

import requests
from io import BytesIO
import numpy as np
from PIL import Image
from requests.packages.urllib3.exceptions import InsecureRequestWarning

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
from ov_mllama_helper import OVMLlamaForConditionalGeneration


max_length = 4048


def check_text_data(data):
    """
    Check if the given data is text-based.
    """
    if isinstance(data, str):
        return True
    if isinstance(data, list):
        return all(isinstance(x, str) for x in data)
    return False


def get_pil_from_url(url):
    """
    Downloads and converts an image from a URL to a PIL Image object.
    """
    response = requests.get(url, verify=False, timeout=20)
    image = Image.open(BytesIO(response.content))
    return image.convert("RGB")


# def collate_fn_llm(example, image_column="image_url", text_column="caption"):
#     """
#     Preprocesses an example by loading and transforming image and text data.
#     Checks if the text data in the example is valid by calling the `check_text_data` function.
#     Downloads the image specified by the URL in the image_column by calling the `get_pil_from_url` function.
#     If there is any error during the download process, returns None.
#     Returns the preprocessed inputs with transformed image and text data.
#     """
#     assert len(example) == 1
#     example = example[0]

#     if not check_text_data(example[text_column]):
#         raise ValueError("Text data is not valid")

#     url = example[image_column]
#     try:
#         image = get_pil_from_url(url)
#         h, w = image.size
#         if h == 1 or w == 1:
#             return None
#     except Exception:
#         return None

#     inputs = processor(text="<|image|><|begin_of_text|>"+example[text_column], images=image, return_tensors="pt", padding=True)
#     if inputs['input_ids'].shape[1] > max_length:
#         return None
#     return inputs


def prepare_calibration_data_vision(dataloader, init_steps):
    """
    This function prepares calibration data from a dataloader for a specified number of initialization steps.
    It iterates over the dataloader, fetching batches and storing the relevant data.
    """
    prompt = "<|image|><|begin_of_text|>If I had to write a haiku for this one"
    url = "https://www.ilankelman.org/stopsigns/australia.jpg"
    image = Image.open(requests.get(url, stream=True).raw)
    model_id = "Llama-3.2-11B-Vision-Instruct/OV"
    processor = AutoProcessor.from_pretrained(model_id)
    inputs = processor(text=prompt, images=image, return_tensors="pt")
    data = []
    print(f"Fetching {init_steps} samples for the initialization...")
    with tqdm(total=init_steps) as pbar:
        for batch in dataloader:
            if len(data) == init_steps:
                break
            if batch:
                pbar.update(1)
                with torch.no_grad():
                    data.append(
                        {
                            "pixel_values": batch["pixel_values"].to("cpu"),
                            "aspect_ratio_ids": inputs.data["aspect_ratio_ids"].to("cpu"),
                            "aspect_ratio_mask": inputs.data["aspect_ratio_mask"],
                        }
                    )
    return data


def prepare_dataset_vision(processor, opt_init_steps=50, max_train_samples=1000, file_path="vision_dataset.pickle", save_dataset=True):
    """
    Prepares a vision-text dataset for quantization.
    """

    def collate_fn(example, image_column="image_url", text_column="caption"):
        """
        Preprocesses an example by loading and transforming image and text data.
        Checks if the text data in the example is valid by calling the `check_text_data` function.
        Downloads the image specified by the URL in the image_column by calling the `get_pil_from_url` function.
        If there is any error during the download process, returns None.
        Returns the preprocessed inputs with transformed image and text data.
        """
        assert len(example) == 1
        example = example[0]

        if not check_text_data(example[text_column]):
            raise ValueError("Text data is not valid")

        url = example[image_column]
        try:
            image = get_pil_from_url(url)
            h, w = image.size
            if h == 1 or w == 1:
                return None
        except Exception:
            return None
        inputs = processor(
            text="<|image|><|begin_of_text|> Please describe image content based on information: " + example[text_column],
            images=image,
            return_tensors="pt",
            padding=True,
        )
        if inputs["input_ids"].shape[1] > max_length:
            return None
        return inputs

    if not Path(file_path).exists():
        dataset = load_dataset("google-research-datasets/conceptual_captions", trust_remote_code=True)
        train_dataset = dataset["train"].shuffle(seed=42)
        dataloader = torch.utils.data.DataLoader(train_dataset, collate_fn=collate_fn, batch_size=1)
        calibration_data = prepare_calibration_data_vision(dataloader, opt_init_steps)
        print(f"calibration dataset will be saved in {file_path}")
        with open(file_path, "wb") as f:
            pickle.dump(calibration_data, f)
    else:
        with open(file_path, "rb") as f:
            calibration_data = pickle.load(f)

    return calibration_data


def prepare_calibration_data_llm(dataloader, init_steps, mllm, processor):
    """
    This function prepares calibration data from a dataloader for a specified number of initialization steps.
    It iterates over the dataloader, fetching batches and storing the relevant data.
    """
    data = []

    prompt = "<|image|><|begin_of_text|>If I had to write a haiku for this one"
    url = "https://www.ilankelman.org/stopsigns/australia.jpg"
    image = Image.open(requests.get(url, stream=True).raw)
    inputs = processor(text=prompt, images=image, return_tensors="pt")

    print(f"Fetching {init_steps} samples for the initialization...")
    with tqdm(total=init_steps) as pbar:
        for batch in dataloader:
            if len(data) == init_steps:
                break
            if batch:
                pbar.update(1)
                with torch.no_grad():
                    cache_position = np.cumsum(batch.data["attention_mask"].to("cpu"), axis=1) - 1
                    cache_position[batch.data["attention_mask"] == 0] = 1

                    vision_input = {
                        "pixel_values": batch["pixel_values"].to("cpu"),
                        "aspect_ratio_ids": batch.data["aspect_ratio_ids"].to("cpu"),
                        "aspect_ratio_mask": batch.data["aspect_ratio_mask"].to("cpu"),
                        "cross_attention_mask": batch.data["cross_attention_mask"].to("cpu"),
                        "cache_position": cache_position[0, :],
                    }

                    cross_attention_states = mllm.prepare_vision_outputs(**vision_input)
                    res = {"input_ids": batch.data["input_ids"].to("cpu"), "attention_mask": batch.data["attention_mask"].to("cpu"), **cross_attention_states}
                    position_ids = np.cumsum(res["attention_mask"], axis=1) - 1
                    position_ids[res["attention_mask"] == 0] = 1
                    res["position_ids"] = position_ids

                    res = mllm.prepare_llm_inputs(**res)
                    data.append(res)
    return data


def prepare_dataset_llm(mllm_id, opt_init_steps=50, max_train_samples=1000, file_path="llm_dataset.pickle", save_dataset=False):
    """
    Prepares a vision-text dataset for quantization.
    """

    if Path(file_path).exists():
        print(f"calibration dataset will be loaded from {file_path}")
        with open(file_path, "rb") as f:
            calibration_data = pickle.load(f)
        return calibration_data

    mllm = OVMLlamaForConditionalGeneration(mllm_id, slice_lm_head=False)
    processor = AutoProcessor.from_pretrained(mllm_id)

    def collate_fn(example, image_column="image_url", text_column="caption"):
        """
        Preprocesses an example by loading and transforming image and text data.
        Checks if the text data in the example is valid by calling the `check_text_data` function.
        Downloads the image specified by the URL in the image_column by calling the `get_pil_from_url` function.
        If there is any error during the download process, returns None.
        Returns the preprocessed inputs with transformed image and text data.
        """
        assert len(example) == 1
        example = example[0]

        if not check_text_data(example[text_column]):
            raise ValueError("Text data is not valid")

        url = example[image_column]
        try:
            image = get_pil_from_url(url)
            h, w = image.size
            if h == 1 or w == 1:
                return None
        except Exception:
            return None
        inputs = processor(
            text="<|image|><|begin_of_text|> Please describe image content based on information: " + example[text_column],
            images=image,
            return_tensors="pt",
            padding=True,
        )
        if inputs["input_ids"].shape[1] > max_length:
            return None
        return inputs

    dataset = load_dataset("google-research-datasets/conceptual_captions", trust_remote_code=True)
    train_dataset = dataset["train"].shuffle(seed=42)
    dataloader = torch.utils.data.DataLoader(train_dataset, collate_fn=collate_fn, batch_size=1)
    calibration_data = prepare_calibration_data_llm(dataloader, opt_init_steps, mllm, processor)

    if save_dataset:
        with open(file_path, "wb") as f:
            print(f"calibration data will be saved into {file_path}")
            pickle.dump(calibration_data, f)

    del mllm
    gc.collect()

    return calibration_data
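
A minimal sketch of how these helpers might be invoked from the notebook (the converted model directory and step counts are assumptions):

```python
from transformers import AutoProcessor

from data_preprocessing import prepare_dataset_llm, prepare_dataset_vision

model_dir = "Llama-3.2-11B-Vision-Instruct/OV"  # assumed location of the converted model

# Calibration samples for quantizing the image encoder
processor = AutoProcessor.from_pretrained(model_dir)
vision_calibration = prepare_dataset_vision(processor, opt_init_steps=32)

# Calibration samples for the language-model part (loads the OpenVINO model internally)
llm_calibration = prepare_dataset_llm(model_dir, opt_init_steps=32)
```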