From 0382dbec2b603c25731d207e045afcd73089ea75 Mon Sep 17 00:00:00 2001 From: Yaroslav Tarkan Date: Fri, 19 Apr 2024 13:17:21 +0300 Subject: [PATCH] Finalize the standalone approach for the notebooks (#1942) Ticket: CVS-138789 Changed notebooks (added fetching of separate files inside the notebook if they don't exist): - [stable-diffusion-keras-cv](https://github.com/openvinotoolkit/openvino_notebooks/blob/latest/notebooks/stable-diffusion-keras-cv) - [ct-segmentation-quantize](https://github.com/openvinotoolkit/openvino_notebooks/blob/latest/notebooks/ct-segmentation-quantize) - [vision-background-removal](https://github.com/openvinotoolkit/openvino_notebooks/blob/latest/notebooks/vision-background-removal) - [blip-visual-language-processing](https://github.com/openvinotoolkit/openvino_notebooks/blob/latest/notebooks/blip-visual-language-processing) - [llm-question-answering](https://github.com/openvinotoolkit/openvino_notebooks/blob/latest/notebooks/llm-question-answering) - [whisper-subtitles-generation](https://github.com/openvinotoolkit/openvino_notebooks/blob/latest/notebooks/whisper-subtitles-generation) - [deepfloyd-if](https://github.com/openvinotoolkit/openvino_notebooks/blob/latest/notebooks/deepfloyd-if) --------- Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .../blip-visual-language-processing.ipynb | 6 + .../ct-scan-live-inference.ipynb | 10 +- .../ct-segmentation-quantize-nncf.ipynb | 18 +- .../deepfloyd-if/deep-floyd-if-convert.ipynb | 14 + .../deepfloyd-if/deep-floyd-if-optimize.ipynb | 13 + notebooks/deepfloyd-if/utils.py | 13 +- .../llm-question-answering.ipynb | 12 + .../stable-diffusion-keras-cv.ipynb | 13 +- ...ov_stable_diffusion_inpainting_pipeline.py | 610 ------------------ .../vision-background-removal.ipynb | 9 +- .../whisper-convert.ipynb | 12 + .../whisper-nncf-quantize.ipynb | 18 +- 12 files changed, 117 insertions(+), 631 deletions(-) delete mode 100644 notebooks/stable-diffusion-v2/implementation/ov_stable_diffusion_inpainting_pipeline.py diff --git a/notebooks/blip-visual-language-processing/blip-visual-language-processing.ipynb b/notebooks/blip-visual-language-processing/blip-visual-language-processing.ipynb index 071ae3e38bc..70d69f6c591 100644 --- a/notebooks/blip-visual-language-processing/blip-visual-language-processing.ipynb +++ b/notebooks/blip-visual-language-processing/blip-visual-language-processing.ipynb @@ -255,6 +255,10 @@ } ], "source": [ + "from pathlib import Path\n", + "\n", + "if not Path(\"./utils.py\").exists():\n", + " download_file(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/blip-visual-language-processing/utils.py\")\n", "from utils import visualize_results\n", "\n", "fig = visualize_results(raw_image, answer, question)" @@ -616,6 +620,8 @@ }, "outputs": [], "source": [ + "if not Path(\"./blip_model.py\").exists():\n", + " download_file(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/blip-visual-language-processing/blip_model.py\")\n", "from blip_model import OVBlipModel\n", "\n", "ov_model = OVBlipModel(model.config, model.decoder_start_token_id, ov_vision_model, ov_text_encoder, text_decoder)\n", diff --git a/notebooks/ct-segmentation-quantize/ct-scan-live-inference.ipynb b/notebooks/ct-segmentation-quantize/ct-scan-live-inference.ipynb index c03206da68d..c4ea3081038 100644 --- a/notebooks/ct-segmentation-quantize/ct-scan-live-inference.ipynb +++ 
b/notebooks/ct-segmentation-quantize/ct-scan-live-inference.ipynb @@ -87,14 +87,16 @@ "from monai.transforms import LoadImage\n", "import openvino as ov\n", "\n", - "from custom_segmentation import SegmentationModel\n", - "\n", - "# Fetch `notebook_utils` module\n", "import requests\n", "\n", + "# Fetch `notebook_utils` module\n", "r = requests.get(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py\")\n", "open(\"notebook_utils.py\", \"w\").write(r.text)\n", - "from notebook_utils import download_file" + "from notebook_utils import download_file\n", + "\n", + "if not Path(\"./custom_segmentation.py\").exists():\n", + " download_file(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/ct-segmentation-quantize/custom_segmentation.py\")\n", + "from custom_segmentation import SegmentationModel" ] }, { diff --git a/notebooks/ct-segmentation-quantize/ct-segmentation-quantize-nncf.ipynb b/notebooks/ct-segmentation-quantize/ct-segmentation-quantize-nncf.ipynb index 1ec49f8b044..efb1548deb8 100644 --- a/notebooks/ct-segmentation-quantize/ct-segmentation-quantize-nncf.ipynb +++ b/notebooks/ct-segmentation-quantize/ct-segmentation-quantize-nncf.ipynb @@ -142,17 +142,23 @@ "from monai.transforms import LoadImage\n", "from nncf.common.logging.logger import set_log_level\n", "from torchmetrics import F1Score as F1\n", + "import requests\n", "\n", - "set_log_level(logging.ERROR) # Disables all NNCF info and warning messages\n", "\n", - "from custom_segmentation import SegmentationModel\n", - "from async_pipeline import show_live_inference\n", + "set_log_level(logging.ERROR) # Disables all NNCF info and warning messages\n", "\n", "# Fetch `notebook_utils` module\n", - "import requests\n", - "\n", "r = requests.get(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py\")\n", - "from notebook_utils import download_file" + "open(\"notebook_utils.py\", \"w\").write(r.text)\n", + "from notebook_utils import download_file\n", + "\n", + "if not Path(\"./custom_segmentation.py\").exists():\n", + " download_file(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/ct-segmentation-quantize/custom_segmentation.py\")\n", + "from custom_segmentation import SegmentationModel\n", + "\n", + "if not Path(\"./async_pipeline.py\").exists():\n", + " download_file(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/ct-segmentation-quantize/async_pipeline.py\")\n", + "from async_pipeline import show_live_inference" ] }, { diff --git a/notebooks/deepfloyd-if/deep-floyd-if-convert.ipynb b/notebooks/deepfloyd-if/deep-floyd-if-convert.ipynb index 4da08c574bd..a6a2430e906 100644 --- a/notebooks/deepfloyd-if/deep-floyd-if-convert.ipynb +++ b/notebooks/deepfloyd-if/deep-floyd-if-convert.ipynb @@ -144,6 +144,20 @@ "from diffusers import DiffusionPipeline\n", "import openvino as ov\n", "import torch\n", + "\n", + "# Fetch `notebook_utils` module\n", + "import requests\n", + "\n", + "r = requests.get(\n", + " url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py\",\n", + ")\n", + "\n", + "open(\"notebook_utils.py\", \"w\").write(r.text)\n", + "from notebook_utils import download_file\n", + "\n", + "if not Path(\"./utils.py\").exists():\n", + " 
download_file(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/deepfloyd-if/utils.py\")\n", + "\n", "from utils import (\n", " TextEncoder,\n", " UnetFirstStage,\n", diff --git a/notebooks/deepfloyd-if/deep-floyd-if-optimize.ipynb b/notebooks/deepfloyd-if/deep-floyd-if-optimize.ipynb index d136f8f788a..90eebc6f3d9 100644 --- a/notebooks/deepfloyd-if/deep-floyd-if-optimize.ipynb +++ b/notebooks/deepfloyd-if/deep-floyd-if-optimize.ipynb @@ -75,6 +75,19 @@ "from pathlib import Path\n", "from typing import Any, List\n", "\n", + "# Fetch `notebook_utils` module\n", + "import requests\n", + "\n", + "r = requests.get(\n", + " url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py\",\n", + ")\n", + "\n", + "open(\"notebook_utils.py\", \"w\").write(r.text)\n", + "from notebook_utils import download_file\n", + "\n", + "if not Path(\"./utils.py\").exists():\n", + " download_file(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/deepfloyd-if/utils.py\")\n", + "\n", "from utils import TextEncoder, UnetFirstStage, UnetSecondStage\n", "\n", "checkpoint_variant = \"fp16\"\n", diff --git a/notebooks/deepfloyd-if/utils.py b/notebooks/deepfloyd-if/utils.py index 710e7a6e00a..c4703292e31 100644 --- a/notebooks/deepfloyd-if/utils.py +++ b/notebooks/deepfloyd-if/utils.py @@ -8,14 +8,15 @@ from pathlib import Path from PIL import Image -# Fetch `notebook_utils` module -import requests +if not Path("./notebook_utils.py").exists(): + # Fetch `notebook_utils` module + import requests -r = requests.get( - url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", -) + r = requests.get( + url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py", + ) -open("notebook_utils.py", "w").write(r.text) + open("notebook_utils.py", "w").write(r.text) from notebook_utils import download_file diff --git a/notebooks/llm-question-answering/llm-question-answering.ipynb b/notebooks/llm-question-answering/llm-question-answering.ipynb index 0f6bb524d98..f5690e182d9 100644 --- a/notebooks/llm-question-answering/llm-question-answering.ipynb +++ b/notebooks/llm-question-answering/llm-question-answering.ipynb @@ -101,6 +101,18 @@ "metadata": {}, "outputs": [], "source": [ + "from pathlib import Path\n", + "import requests\n", + "\n", + "# Fetch `notebook_utils` module\n", + "r = requests.get(\n", + " url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py\",\n", + ")\n", + "open(\"notebook_utils.py\", \"w\").write(r.text)\n", + "from notebook_utils import download_file\n", + "\n", + "if not Path(\"./config.py\").exists():\n", + " download_file(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/llm-question-answering/config.py\")\n", "from config import SUPPORTED_LLM_MODELS\n", "import ipywidgets as widgets" ] diff --git a/notebooks/stable-diffusion-keras-cv/stable-diffusion-keras-cv.ipynb b/notebooks/stable-diffusion-keras-cv/stable-diffusion-keras-cv.ipynb index fc8957834af..c37362327b4 100644 --- a/notebooks/stable-diffusion-keras-cv/stable-diffusion-keras-cv.ipynb +++ b/notebooks/stable-diffusion-keras-cv/stable-diffusion-keras-cv.ipynb @@ -100,6 +100,12 @@ "import openvino as ov\n", "import numpy as np\n", "from pathlib import Path\n", + "import requests\n", + "\n", + "# Fetch `notebook_utils` module\n", + "r = 
requests.get(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py\")\n", + "open(\"notebook_utils.py\", \"w\").write(r.text)\n", + "from notebook_utils import download_file\n", "\n", "IMAGE_WIDTH = 512\n", "IMAGE_HEIGHT = 512\n", @@ -296,11 +302,16 @@ "import tf_keras as keras\n", "import numpy as np\n", "import tensorflow as tf\n", + "from pathlib import Path\n", "\n", - "from constants import UNCONDITIONAL_TOKENS, ALPHAS_CUMPROD\n", "from keras_cv.models.stable_diffusion import SimpleTokenizer\n", "\n", "\n", + "if not Path(\"./constants.py\").exists():\n", + " download_file(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/stable-diffusion-keras-cv/constants.py\")\n", + "from constants import UNCONDITIONAL_TOKENS, ALPHAS_CUMPROD\n", + "\n", + "\n", "class StableDiffusion:\n", " def __init__(self, text_encoder, diffusion_model, decoder):\n", " # UNet requires multiples of 2**7 = 128\n", diff --git a/notebooks/stable-diffusion-v2/implementation/ov_stable_diffusion_inpainting_pipeline.py b/notebooks/stable-diffusion-v2/implementation/ov_stable_diffusion_inpainting_pipeline.py deleted file mode 100644 index 6f23f8dca7d..00000000000 --- a/notebooks/stable-diffusion-v2/implementation/ov_stable_diffusion_inpainting_pipeline.py +++ /dev/null @@ -1,610 +0,0 @@ -import inspect -from typing import List, Optional, Union, Dict - -import PIL -import cv2 - -import numpy as np -import torch - -from transformers import CLIPTokenizer -from diffusers import DiffusionPipeline -from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler -import openvino as ov - - -def prepare_mask_and_masked_image(image: PIL.Image.Image, mask: PIL.Image.Image): - """ - Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be - converted to ``np.array`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the - ``image`` and ``1`` for the ``mask``. - - The ``image`` will be converted to ``np.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be - binarized (``mask > 0.5``) and cast to ``np.float32`` too. - - Args: - image (Union[np.array, PIL.Image]): The image to inpaint. - It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` - mask (_type_): The mask to apply to the image, i.e. regions to inpaint. - It can be a ``PIL.Image``, or a ``height x width`` ``np.array``. - - Returns: - tuple[np.array]: The pair (mask, masked_image) as ``torch.Tensor`` with 4 - dimensions: ``batch x channels x height x width``. 
- """ - if isinstance(image, (PIL.Image.Image, np.ndarray)): - image = [image] - - if isinstance(image, list) and isinstance(image[0], PIL.Image.Image): - image = [np.array(i.convert("RGB"))[None, :] for i in image] - image = np.concatenate(image, axis=0) - elif isinstance(image, list) and isinstance(image[0], np.ndarray): - image = np.concatenate([i[None, :] for i in image], axis=0) - - image = image.transpose(0, 3, 1, 2) - image = image.astype(np.float32) / 127.5 - 1.0 - - # preprocess mask - if isinstance(mask, (PIL.Image.Image, np.ndarray)): - mask = [mask] - - if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image): - mask = np.concatenate([np.array(m.convert("L"))[None, None, :] for m in mask], axis=0) - mask = mask.astype(np.float32) / 255.0 - elif isinstance(mask, list) and isinstance(mask[0], np.ndarray): - mask = np.concatenate([m[None, None, :] for m in mask], axis=0) - - mask[mask < 0.5] = 0 - mask[mask >= 0.5] = 1 - - masked_image = image * (mask < 0.5) - - return mask, masked_image - - -class OVStableDiffusionInpaintingPipeline(DiffusionPipeline): - def __init__( - self, - vae_decoder: ov.Model, - text_encoder: ov.Model, - tokenizer: CLIPTokenizer, - unet: ov.Model, - scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler], - vae_encoder: ov.Model = None, - ): - """ - Pipeline for text-to-image generation using Stable Diffusion. - Parameters: - vae_decoder (Model): - Variational Auto-Encoder (VAE) Model to decode images to and from latent representations. - text_encoder (Model): - Frozen text-encoder. Stable Diffusion uses the text portion of - [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically - the clip-vit-large-patch14(https://huggingface.co/openai/clip-vit-large-patch14) variant. - tokenizer (CLIPTokenizer): - Tokenizer of class CLIPTokenizer(https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer). - unet (Model): Conditional U-Net architecture to denoise the encoded image latents. - vae_encoder (Model): - Variational Auto-Encoder (VAE) Model to encode images to latent representation. - scheduler (SchedulerMixin): - A scheduler to be used in combination with unet to denoise the encoded image latents. Can be one of - DDIMScheduler, LMSDiscreteScheduler, or PNDMScheduler. 
- """ - super().__init__() - self.scheduler = scheduler - self.vae_decoder = vae_decoder - self.vae_encoder = vae_encoder - self.text_encoder = text_encoder - self.unet = unet - self._text_encoder_output = text_encoder.output(0) - self._unet_output = unet.output(0) - self._vae_d_output = vae_decoder.output(0) - self._vae_e_output = vae_encoder.output(0) if vae_encoder is not None else None - self.height = self.unet.input(0).shape[2] * 8 - self.width = self.unet.input(0).shape[3] * 8 - self.tokenizer = tokenizer - - def prepare_mask_latents( - self, - mask, - masked_image, - height=512, - width=512, - do_classifier_free_guidance=True, - ): - """ - Prepare mask as Unet nput and encode input masked image to latent space using vae encoder - - Parameters: - mask (np.array): input mask array - masked_image (np.array): masked input image tensor - heigh (int, *optional*, 512): generated image height - width (int, *optional*, 512): generated image width - do_classifier_free_guidance (bool, *optional*, True): whether to use classifier free guidance or not - Returns: - mask (np.array): resized mask tensor - masked_image_latents (np.array): masked image encoded into latent space using VAE - """ - mask = torch.nn.functional.interpolate(torch.from_numpy(mask), size=(height // 8, width // 8)) - mask = mask.numpy() - - # encode the mask image into latents space so we can concatenate it to the latents - logits = self.vae_encoder(masked_image)[self._vae_e_output] - masked_image_latents = logits * 0.18215 - - mask = np.concatenate([mask] * 2) if do_classifier_free_guidance else mask - masked_image_latents = np.concatenate([masked_image_latents] * 2) if do_classifier_free_guidance else masked_image_latents - return mask, masked_image_latents - - def __call__( - self, - prompt: Union[str, List[str]], - image: PIL.Image.Image, - mask_image: PIL.Image.Image, - negative_prompt: Union[str, List[str]] = None, - num_inference_steps: Optional[int] = 50, - guidance_scale: Optional[float] = 7.5, - eta: Optional[float] = 0, - output_type: Optional[str] = "pil", - seed: Optional[int] = None, - ): - """ - Function invoked when calling the pipeline for generation. - Parameters: - prompt (str or List[str]): - The prompt or prompts to guide the image generation. - image (PIL.Image.Image): - Source image for inpainting. - mask_image (PIL.Image.Image): - Mask area for inpainting - negative_prompt (str or List[str]): - The negative prompt or prompts to guide the image generation. - num_inference_steps (int, *optional*, defaults to 50): - The number of denoising steps. More denoising steps usually lead to a higher quality image at the - expense of slower inference. - guidance_scale (float, *optional*, defaults to 7.5): - Guidance scale as defined in Classifier-Free Diffusion Guidance(https://arxiv.org/abs/2207.12598). - guidance_scale is defined as `w` of equation 2. - Higher guidance scale encourages to generate images that are closely linked to the text prompt, - usually at the expense of lower image quality. - eta (float, *optional*, defaults to 0.0): - Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to - [DDIMScheduler], will be ignored for others. - output_type (`str`, *optional*, defaults to "pil"): - The output format of the generate image. Choose between - [PIL](https://pillow.readthedocs.io/en/stable/): PIL.Image.Image or np.array. - seed (int, *optional*, None): - Seed for random generator state initialization. 
- Returns: - Dictionary with keys: - sample - the last generated image PIL.Image.Image or np.array - """ - if seed is not None: - np.random.seed(seed) - # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) - # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` - # corresponds to doing no classifier free guidance. - do_classifier_free_guidance = guidance_scale > 1.0 - # get prompt text embeddings - text_embeddings = self._encode_prompt( - prompt, - do_classifier_free_guidance=do_classifier_free_guidance, - negative_prompt=negative_prompt, - ) - # prepare mask - mask, masked_image = prepare_mask_and_masked_image(image, mask_image) - # set timesteps - accepts_offset = "offset" in set(inspect.signature(self.scheduler.set_timesteps).parameters.keys()) - extra_set_kwargs = {} - if accepts_offset: - extra_set_kwargs["offset"] = 1 - - self.scheduler.set_timesteps(num_inference_steps, **extra_set_kwargs) - timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, 1) - latent_timestep = timesteps[:1] - - # get the initial random noise unless the user supplied it - latents, meta = self.prepare_latents(None, latent_timestep) - mask, masked_image_latents = self.prepare_mask_latents( - mask, - masked_image, - do_classifier_free_guidance=do_classifier_free_guidance, - ) - - # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature - # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. - # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 - # and should be between [0, 1] - accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) - extra_step_kwargs = {} - if accepts_eta: - extra_step_kwargs["eta"] = eta - - for t in self.progress_bar(timesteps): - # expand the latents if we are doing classifier free guidance - latent_model_input = np.concatenate([latents] * 2) if do_classifier_free_guidance else latents - latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) - latent_model_input = np.concatenate([latent_model_input, mask, masked_image_latents], axis=1) - # predict the noise residual - noise_pred = self.unet([latent_model_input, np.array(t, dtype=np.float32), text_embeddings])[self._unet_output] - # perform guidance - if do_classifier_free_guidance: - noise_pred_uncond, noise_pred_text = noise_pred[0], noise_pred[1] - noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) - - # compute the previous noisy sample x_t -> x_t-1 - latents = self.scheduler.step( - torch.from_numpy(noise_pred), - t, - torch.from_numpy(latents), - **extra_step_kwargs, - )["prev_sample"].numpy() - # scale and decode the image latents with vae - image = self.vae_decoder(latents * (1 / 0.18215))[self._vae_d_output] - - image = self.postprocess_image(image, meta, output_type) - return {"sample": image} - - def _encode_prompt( - self, - prompt: Union[str, List[str]], - num_images_per_prompt: int = 1, - do_classifier_free_guidance: bool = True, - negative_prompt: Union[str, List[str]] = None, - ): - """ - Encodes the prompt into text encoder hidden states. 
- - Parameters: - prompt (str or list(str)): prompt to be encoded - num_images_per_prompt (int): number of images that should be generated per prompt - do_classifier_free_guidance (bool): whether to use classifier free guidance or not - negative_prompt (str or list(str)): negative prompt to be encoded - Returns: - text_embeddings (np.ndarray): text encoder hidden states - """ - batch_size = len(prompt) if isinstance(prompt, list) else 1 - - # tokenize input prompts - text_inputs = self.tokenizer( - prompt, - padding="max_length", - max_length=self.tokenizer.model_max_length, - truncation=True, - return_tensors="np", - ) - text_input_ids = text_inputs.input_ids - - text_embeddings = self.text_encoder(text_input_ids)[self._text_encoder_output] - - # duplicate text embeddings for each generation per prompt - if num_images_per_prompt != 1: - bs_embed, seq_len, _ = text_embeddings.shape - text_embeddings = np.tile(text_embeddings, (1, num_images_per_prompt, 1)) - text_embeddings = np.reshape(text_embeddings, (bs_embed * num_images_per_prompt, seq_len, -1)) - - # get unconditional embeddings for classifier free guidance - if do_classifier_free_guidance: - uncond_tokens: List[str] - max_length = text_input_ids.shape[-1] - if negative_prompt is None: - uncond_tokens = [""] * batch_size - elif isinstance(negative_prompt, str): - uncond_tokens = [negative_prompt] - else: - uncond_tokens = negative_prompt - uncond_input = self.tokenizer( - uncond_tokens, - padding="max_length", - max_length=max_length, - truncation=True, - return_tensors="np", - ) - - uncond_embeddings = self.text_encoder(uncond_input.input_ids)[self._text_encoder_output] - - # duplicate unconditional embeddings for each generation per prompt, using mps friendly method - seq_len = uncond_embeddings.shape[1] - uncond_embeddings = np.tile(uncond_embeddings, (1, num_images_per_prompt, 1)) - uncond_embeddings = np.reshape(uncond_embeddings, (batch_size * num_images_per_prompt, seq_len, -1)) - - # For classifier free guidance, we need to do two forward passes. 
- # Here we concatenate the unconditional and text embeddings into a single batch - # to avoid doing two forward passes - text_embeddings = np.concatenate([uncond_embeddings, text_embeddings]) - - return text_embeddings - - def prepare_latents(self, image: PIL.Image.Image = None, latent_timestep: torch.Tensor = None): - """ - Function for getting initial latents for starting generation - - Parameters: - image (PIL.Image.Image, *optional*, None): - Input image for generation, if not provided randon noise will be used as starting point - latent_timestep (torch.Tensor, *optional*, None): - Predicted by scheduler initial step for image generation, required for latent image mixing with nosie - Returns: - latents (np.ndarray): - Image encoded in latent space - """ - latents_shape = (1, 4, self.height // 8, self.width // 8) - noise = np.random.randn(*latents_shape).astype(np.float32) - if image is None: - # if we use LMSDiscreteScheduler, let's make sure latents are mulitplied by sigmas - if isinstance(self.scheduler, LMSDiscreteScheduler): - noise = noise * self.scheduler.sigmas[0].numpy() - return noise, {} - input_image, meta = preprocess(image) - latents = self.vae_encoder(input_image)[self._vae_e_output] - latents = latents * 0.18215 - latents = self.scheduler.add_noise(torch.from_numpy(latents), torch.from_numpy(noise), latent_timestep).numpy() - return latents, meta - - def postprocess_image(self, image: np.ndarray, meta: Dict, output_type: str = "pil"): - """ - Postprocessing for decoded image. Takes generated image decoded by VAE decoder, unpad it to initila image size (if required), - normalize and convert to [0, 255] pixels range. Optionally, convertes it from np.ndarray to PIL.Image format - - Parameters: - image (np.ndarray): - Generated image - meta (Dict): - Metadata obtained on latents preparing step, can be empty - output_type (str, *optional*, pil): - Output format for result, can be pil or numpy - Returns: - image (List of np.ndarray or PIL.Image.Image): - Postprocessed images - """ - if "padding" in meta: - pad = meta["padding"] - (_, end_h), (_, end_w) = pad[1:3] - h, w = image.shape[2:] - unpad_h = h - end_h - unpad_w = w - end_w - image = image[:, :, :unpad_h, :unpad_w] - image = np.clip(image / 2 + 0.5, 0, 1) - image = np.transpose(image, (0, 2, 3, 1)) - # 9. Convert to PIL - if output_type == "pil": - image = self.numpy_to_pil(image) - if "src_height" in meta: - orig_height, orig_width = meta["src_height"], meta["src_width"] - image = [img.resize((orig_width, orig_height), PIL.Image.Resampling.LANCZOS) for img in image] - else: - if "src_height" in meta: - orig_height, orig_width = meta["src_height"], meta["src_width"] - image = [cv2.resize(img, (orig_width, orig_width)) for img in image] - return image - - def get_timesteps(self, num_inference_steps: int, strength: float): - """ - Helper function for getting scheduler timesteps for generation - In case of image-to-image generation, it updates number of steps according to strength - - Parameters: - num_inference_steps (int): - number of inference steps for generation - strength (float): - value between 0.0 and 1.0, that controls the amount of noise that is added to the input image. - Values that approach 1.0 allow for lots of variations but will also produce images that are not semantically consistent with the input. 
- """ - # get the original timestep using init_timestep - init_timestep = min(int(num_inference_steps * strength), num_inference_steps) - - t_start = max(num_inference_steps - init_timestep, 0) - timesteps = self.scheduler.timesteps[t_start:] - - return timesteps, num_inference_steps - t_start - - -def generate_video( - pipe: OVStableDiffusionInpaintingPipeline, - prompt: Union[str, List[str]], - negative_prompt: Union[str, List[str]], - guidance_scale: float = 7.5, - num_inference_steps: int = 20, - num_frames: int = 20, - mask_width: int = 128, - seed: int = 9999, - zoom_in: bool = False, -): - """ - Zoom video generation function - - Parameters: - pipe (OVStableDiffusionInpaintingPipeline): inpainting pipeline. - prompt (str or List[str]): The prompt or prompts to guide the image generation. - negative_prompt (str or List[str]): The negative prompt or prompts to guide the image generation. - guidance_scale (float, *optional*, defaults to 7.5): - Guidance scale as defined in Classifier-Free Diffusion Guidance(https://arxiv.org/abs/2207.12598). - guidance_scale is defined as `w` of equation 2. - Higher guidance scale encourages to generate images that are closely linked to the text prompt, - usually at the expense of lower image quality. - num_inference_steps (int, *optional*, defaults to 50): The number of denoising steps for each frame. More denoising steps usually lead to a higher quality image at the expense of slower inference. - num_frames (int, *optional*, 20): number frames for video. - mask_width (int, *optional*, 128): size of border mask for inpainting on each step. - seed (int, *optional*, None): Seed for random generator state initialization. - zoom_in (bool, *optional*, False): zoom mode Zoom In or Zoom Out. - Returns: - output_path (str): Path where generated video loacated. 
- """ - - height = 512 - width = height - - current_image = PIL.Image.new(mode="RGBA", size=(height, width)) - mask_image = np.array(current_image)[:, :, 3] - mask_image = PIL.Image.fromarray(255 - mask_image).convert("RGB") - current_image = current_image.convert("RGB") - - init_images = pipe( - prompt=prompt, - negative_prompt=negative_prompt, - image=current_image, - guidance_scale=guidance_scale, - mask_image=mask_image, - seed=seed, - num_inference_steps=num_inference_steps, - )["sample"] - - image_grid(init_images, rows=1, cols=1) - - num_outpainting_steps = num_frames - num_interpol_frames = 30 - - current_image = init_images[0] - all_frames = [] - all_frames.append(current_image) - - for i in range(num_outpainting_steps): - print(f"Generating image: {i + 1} / {num_outpainting_steps}") - - prev_image_fix = current_image - - prev_image = shrink_and_paste_on_blank(current_image, mask_width) - - current_image = prev_image - - # create mask (black image with white mask_width width edges) - mask_image = np.array(current_image)[:, :, 3] - mask_image = PIL.Image.fromarray(255 - mask_image).convert("RGB") - - # inpainting step - current_image = current_image.convert("RGB") - images = pipe( - prompt=prompt, - negative_prompt=negative_prompt, - image=current_image, - guidance_scale=guidance_scale, - mask_image=mask_image, - seed=seed, - num_inference_steps=num_inference_steps, - )["sample"] - current_image = images[0] - current_image.paste(prev_image, mask=prev_image) - - # interpolation steps bewteen 2 inpainted images (=sequential zoom and crop) - for j in range(num_interpol_frames - 1): - interpol_image = current_image - interpol_width = round((1 - (1 - 2 * mask_width / height) ** (1 - (j + 1) / num_interpol_frames)) * height / 2) - interpol_image = interpol_image.crop( - ( - interpol_width, - interpol_width, - width - interpol_width, - height - interpol_width, - ) - ) - - interpol_image = interpol_image.resize((height, width)) - - # paste the higher resolution previous image in the middle to avoid drop in quality caused by zooming - interpol_width2 = round((1 - (height - 2 * mask_width) / (height - 2 * interpol_width)) / 2 * height) - prev_image_fix_crop = shrink_and_paste_on_blank(prev_image_fix, interpol_width2) - interpol_image.paste(prev_image_fix_crop, mask=prev_image_fix_crop) - all_frames.append(interpol_image) - all_frames.append(current_image) - - video_file_name = f"infinite_zoom_{'in' if zoom_in else 'out'}" - fps = 30 - save_path = video_file_name + ".mp4" - write_video(save_path, all_frames, fps, reversed_order=zoom_in) - return save_path - - -def shrink_and_paste_on_blank(current_image: PIL.Image.Image, mask_width: int): - """ - Decreases size of current_image by mask_width pixels from each side, - then adds a mask_width width transparent frame, - so that the image the function returns is the same size as the input. 
- - Parameters: - current_image (PIL.Image): input image to transform - mask_width (int): width in pixels to shrink from each side - Returns: - prev_image (PIL.Image): resized image with extended borders - """ - - height = current_image.height - width = current_image.width - - # shrink down by mask_width - prev_image = current_image.resize((height - 2 * mask_width, width - 2 * mask_width)) - prev_image = prev_image.convert("RGBA") - prev_image = np.array(prev_image) - - # create blank non-transparent image - blank_image = np.array(current_image.convert("RGBA")) * 0 - blank_image[:, :, 3] = 1 - - # paste shrinked onto blank - blank_image[mask_width : height - mask_width, mask_width : width - mask_width, :] = prev_image - prev_image = PIL.Image.fromarray(blank_image) - - return prev_image - - -def image_grid(imgs: List[PIL.Image.Image], rows: int, cols: int): - """ - Insert images to grid - - Parameters: - imgs (List[PIL.Image.Image]): list of images for making grid - rows (int): number of rows in grid - cols (int): number of columns in grid - Returns: - grid (PIL.Image): image with input images collage - """ - assert len(imgs) == rows * cols - - w, h = imgs[0].size - grid = PIL.Image.new("RGB", size=(cols * w, rows * h)) - - for i, img in enumerate(imgs): - grid.paste(img, box=(i % cols * w, i // cols * h)) - return grid - - -def write_video( - file_path: str, - frames: List[PIL.Image.Image], - fps: float, - reversed_order: bool = True, - gif: bool = True, -): - """ - Writes frames to an mp4 video file and optionaly to gif - - Parameters: - file_path (str): Path to output video, must end with .mp4 - frames (List of PIL.Image): list of frames - fps (float): Desired frame rate - reversed_order (bool): if order of images to be reversed (default = True) - gif (bool): save frames to gif format (default = True) - Returns: - None - """ - if reversed_order: - frames.reverse() - - w, h = frames[0].size - fourcc = cv2.VideoWriter_fourcc("m", "p", "4", "v") - # fourcc = cv2.VideoWriter_fourcc(*'avc1') - writer = cv2.VideoWriter(file_path, fourcc, fps, (w, h)) - - for frame in frames: - np_frame = np.array(frame.convert("RGB")) - cv_frame = cv2.cvtColor(np_frame, cv2.COLOR_RGB2BGR) - writer.write(cv_frame) - - writer.release() - if gif: - frames[0].save( - file_path.replace(".mp4", ".gif"), - save_all=True, - append_images=frames[1:], - duratiobn=len(frames) / fps, - loop=0, - ) diff --git a/notebooks/vision-background-removal/vision-background-removal.ipynb b/notebooks/vision-background-removal/vision-background-removal.ipynb index 7e60c6be859..e7bde2a70b4 100644 --- a/notebooks/vision-background-removal/vision-background-removal.ipynb +++ b/notebooks/vision-background-removal/vision-background-removal.ipynb @@ -118,10 +118,10 @@ "outputs": [], "source": [ "# Import local modules\n", + "import requests\n", "\n", "if not Path(\"./notebook_utils.py\").exists():\n", " # Fetch `notebook_utils` module\n", - " import requests\n", "\n", " r = requests.get(\n", " url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py\",\n", @@ -129,7 +129,12 @@ "\n", " open(\"notebook_utils.py\", \"w\").write(r.text)\n", "\n", - "from notebook_utils import load_image\n", + "from notebook_utils import load_image, download_file\n", + "\n", + "if not Path(\"./model/u2net.py\").exists():\n", + " download_file(\n", + " url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/vision-background-removal/model/u2net.py\", directory=\"model\"\n", + " 
)\n", "from model.u2net import U2NET, U2NETP" ] }, diff --git a/notebooks/whisper-subtitles-generation/whisper-convert.ipynb b/notebooks/whisper-subtitles-generation/whisper-convert.ipynb index 70faef58a36..6e84ac236a3 100644 --- a/notebooks/whisper-subtitles-generation/whisper-convert.ipynb +++ b/notebooks/whisper-subtitles-generation/whisper-convert.ipynb @@ -439,6 +439,18 @@ "metadata": {}, "outputs": [], "source": [ + "# Fetch `notebook_utils` module\n", + "import requests\n", + "\n", + "r = requests.get(\n", + " url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py\",\n", + ")\n", + "open(\"notebook_utils.py\", \"w\").write(r.text)\n", + "from notebook_utils import download_file\n", + "\n", + "if not Path(\"./utils.py\").exists():\n", + " download_file(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/whisper-subtitles-generation/utils.py\")\n", + "\n", "from utils import (\n", " patch_whisper_for_ov_inference,\n", " OpenVINOAudioEncoder,\n", diff --git a/notebooks/whisper-subtitles-generation/whisper-nncf-quantize.ipynb b/notebooks/whisper-subtitles-generation/whisper-nncf-quantize.ipynb index 2701cd08bf6..36b83c3aafa 100644 --- a/notebooks/whisper-subtitles-generation/whisper-nncf-quantize.ipynb +++ b/notebooks/whisper-subtitles-generation/whisper-nncf-quantize.ipynb @@ -15,7 +15,8 @@ "> **NOTE**: you should run [whisper-convert](whisper-convert.ipynb) notebook first to generate OpenVINO IR model that is used for quantization.\n", "\n", "\n", - "#### Table of contents:\n\n", + "#### Table of contents:\n", + "\n", "- [Prerequisites](#Prerequisites)\n", "- [Create and initialize quantization ⇑(#0)](#Create-and-initialize-quantization-⇑(#0))\n", " - [Prepare calibration datasets](#Prepare-calibration-datasets)\n", @@ -308,6 +309,19 @@ "outputs": [], "source": [ "import whisper\n", + "\n", + "# Fetch `notebook_utils` module\n", + "import requests\n", + "\n", + "r = requests.get(\n", + " url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py\",\n", + ")\n", + "open(\"notebook_utils.py\", \"w\").write(r.text)\n", + "from notebook_utils import download_file\n", + "\n", + "if not Path(\"./utils.py\").exists():\n", + " download_file(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/whisper-subtitles-generation/utils.py\")\n", + "\n", "from utils import (\n", " patch_whisper_for_ov_inference,\n", " OpenVINOAudioEncoder,\n", @@ -1302,4 +1316,4 @@ }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +}
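
For reference, the standalone-fetch pattern applied across the notebooks above looks roughly like the following. This is a minimal sketch assembled from the hunks in this patch (using the deepfloyd-if `utils.py` as the example helper module), not an additional file introduced by the change:

```python
from pathlib import Path

import requests

# Fetch the shared `notebook_utils` module when it is not already present locally
if not Path("notebook_utils.py").exists():
    r = requests.get(
        url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py",
    )
    open("notebook_utils.py", "w").write(r.text)

from notebook_utils import download_file

# Fetch a notebook-local helper file only if it is missing,
# so the notebook also runs outside a full repository checkout
if not Path("./utils.py").exists():
    download_file(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/deepfloyd-if/utils.py")

from utils import TextEncoder, UnetFirstStage, UnetSecondStage
```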