From 935c2f802e374699e41c020ea61e862c610056ff Mon Sep 17 00:00:00 2001 From: Aleksandr Mokrov Date: Tue, 14 May 2024 13:48:47 +0200 Subject: [PATCH] DynamiCrafter animating images (#2003) CVS-139504 --- .ci/ignore_treon_docker.txt | 1 + .ci/ignore_treon_linux.txt | 1 + .ci/ignore_treon_mac.txt | 3 +- .ci/ignore_treon_win.txt | 1 + .ci/spellcheck/.pyspelling.wordlist.txt | 3 + .../dynamicrafter-animating-images/README.md | 38 + .../dynamicrafter-animating-images.ipynb | 736 ++++++++++++++++++ 7 files changed, 782 insertions(+), 1 deletion(-) create mode 100644 notebooks/dynamicrafter-animating-images/README.md create mode 100644 notebooks/dynamicrafter-animating-images/dynamicrafter-animating-images.ipynb diff --git a/.ci/ignore_treon_docker.txt b/.ci/ignore_treon_docker.txt index 87191fcc6a1..39d595667b3 100644 --- a/.ci/ignore_treon_docker.txt +++ b/.ci/ignore_treon_docker.txt @@ -59,3 +59,4 @@ stable-video-diffusion llm-agent-langchain hello-npu stable-cascade-image-generation +dynamicrafter-animating-images \ No newline at end of file diff --git a/.ci/ignore_treon_linux.txt b/.ci/ignore_treon_linux.txt index c361dd88faf..ad6328261b9 100644 --- a/.ci/ignore_treon_linux.txt +++ b/.ci/ignore_treon_linux.txt @@ -59,3 +59,4 @@ stable-video-diffusion llm-agent-langchain hello-npu stable-cascade-image-generation +dynamicrafter-animating-images \ No newline at end of file diff --git a/.ci/ignore_treon_mac.txt b/.ci/ignore_treon_mac.txt index 5b4a074e65b..7a5022fbbc9 100644 --- a/.ci/ignore_treon_mac.txt +++ b/.ci/ignore_treon_mac.txt @@ -59,4 +59,5 @@ llm-rag-langchain stable-video-diffusion llm-agent-langchain hello-npu -stable-cascade-image-generation \ No newline at end of file +stable-cascade-image-generation +dynamicrafter-animating-images \ No newline at end of file diff --git a/.ci/ignore_treon_win.txt b/.ci/ignore_treon_win.txt index ccb5352923a..8be33c4c732 100644 --- a/.ci/ignore_treon_win.txt +++ b/.ci/ignore_treon_win.txt @@ -56,3 +56,4 @@ stable-video-diffusion llm-agent-langchain hello-npu stable-cascade-image-generation +dynamicrafter-animating-images \ No newline at end of file diff --git a/.ci/spellcheck/.pyspelling.wordlist.txt b/.ci/spellcheck/.pyspelling.wordlist.txt index 8c1ba1b2893..c720d52fb7b 100644 --- a/.ci/spellcheck/.pyspelling.wordlist.txt +++ b/.ci/spellcheck/.pyspelling.wordlist.txt @@ -8,6 +8,7 @@ adaptively adas ADE adversarially +AE aeroplane affective ai @@ -56,6 +57,7 @@ bert BERT's BetterTransformer Bewley +bfloat BGE bge BGR @@ -202,6 +204,7 @@ dpredictor DreamBooth Dreamshaper dropdown +DynamiCrafter ECCV editability EfficientNet diff --git a/notebooks/dynamicrafter-animating-images/README.md b/notebooks/dynamicrafter-animating-images/README.md new file mode 100644 index 00000000000..f3f31c9d62c --- /dev/null +++ b/notebooks/dynamicrafter-animating-images/README.md @@ -0,0 +1,38 @@ +# Animating Open-domain Images with DynamiCrafter and OpenVINO + +Animating a still image offers an engaging visual experience. Traditional image animation techniques mainly focus on animating natural scenes with stochastic dynamics (e.g. clouds and fluid) or domain-specific motions (e.g. human hair or body motions), and thus limits their applicability to more general visual content. To overcome this limitation, [DynamiCrafter team](https://doubiiu.github.io/projects/DynamiCrafter/) explores the synthesis of dynamic content for open-domain images, converting them into animated videos. 
The key idea is to utilize the motion prior of text-to-video diffusion models by incorporating the image into the generative process as guidance. Given an image, the DynamiCrafter team first projects it into a text-aligned rich context representation space using a query transformer, which helps the video model digest the image content in a compatible way. However, some visual details are still hard to preserve in the resulting videos. To supply more precise image information, the team additionally feeds the full image to the diffusion model by concatenating it with the initial noise. Experimental results show that the proposed method produces visually convincing and more logical, natural motions, as well as higher conformity to the input image.
+
+*(Demo table with animated examples captioned "bear playing guitar happily, snowing" and "boy walking on the street".)*
+ +## Notebook contents +This tutorial consists of the following steps: +- Prerequisites +- Load the original model +- Convert the model to OpenVINO IR +- Compiling models +- Building the pipeline +- Interactive inference + +## Installation instructions +This is a self-contained example that relies solely on its own code.
+We recommend running the notebook in a virtual environment. You only need a Jupyter server to start. +For details, please refer to [Installation Guide](../../README.md). diff --git a/notebooks/dynamicrafter-animating-images/dynamicrafter-animating-images.ipynb b/notebooks/dynamicrafter-animating-images/dynamicrafter-animating-images.ipynb new file mode 100644 index 00000000000..ef788ca2b7f --- /dev/null +++ b/notebooks/dynamicrafter-animating-images/dynamicrafter-animating-images.ipynb @@ -0,0 +1,736 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "a30812de-c46e-44a3-8194-b7f6f0fd4707", + "metadata": {}, + "source": [ + "# Animating Open-domain Images with DynamiCrafter and OpenVINO\n", + "\n", + "Animating a still image offers an engaging visual experience. Traditional image animation techniques mainly focus on animating natural scenes with stochastic dynamics (e.g. clouds and fluid) or domain-specific motions (e.g. human hair or body motions), and thus limits their applicability to more general visual content. To overcome this limitation, [DynamiCrafter team](https://doubiiu.github.io/projects/DynamiCrafter/) explores the synthesis of dynamic content for open-domain images, converting them into animated videos. The key idea is to utilize the motion prior of text-to-video diffusion models by incorporating the image into the generative process as guidance. Given an image, DynamiCrafter team first projects it into a text-aligned rich context representation space using a query transformer, which facilitates the video model to digest the image content in a compatible fashion. However, some visual details still struggle to be preserved in the resultant videos. To supplement with more precise image information, DynamiCrafter team further feeds the full image to the diffusion model by concatenating it with the initial noises. Experimental results show that the proposed method can produce visually convincing and more logical & natural motions, as well as higher conformity to the input image.\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\"bear playing guitar happily, snowing\"\"boy walking on the street\"
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "\n", + "\n", + "#### Table of contents:\n", + "- [Prerequisites](#Prerequisites)\n", + "- [Load the original model](#Load-the-original-model)\n", + "- [Convert the model to OpenVINO IR](#Convert-the-model-to-OpenVINO-IR)\n", + " - [Convert CLIP text encoder](#Convert-CLIP-text-encoder)\n", + " - [Convert CLIP image encoder](#Convert-CLIP-image-encoder)\n", + " - [Convert AE encoder](#Convert-AE-encoder)\n", + " - [Convert Diffusion U-Net model](#Convert-Diffusion-U-Net-model)\n", + " - [Convert AE decoder](#Convert-AE-decoder)\n", + "- [Compiling models](#Compiling-models)\n", + "- [Building the pipeline](#Building-the-pipeline)\n", + "- [Interactive inference](#Interactive-inference)" + ] + }, + { + "cell_type": "markdown", + "id": "f9dc9580-da81-47dd-b5d3-3cafa8f5a4b5", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "[back to top ⬆️](#Table-of-contents:)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "eac97b7e-2db7-41b3-8dc4-488c5b5cd275", + "metadata": {}, + "outputs": [], + "source": [ + "%pip uninstall -q -y openvino-dev openvino openvino-nightly\n", + "%pip install -q --upgrade openvino-nightly\n", + "%pip install -q \"gradio>=4.19\" omegaconf decord einops pytorch_lightning kornia open_clip_torch transformers av opencv-python torch --extra-index-url https://download.pytorch.org/whl/cpu" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4c8de050-19e7-42a2-bf5a-98ca5eef050e", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from pathlib import Path\n", + "\n", + "\n", + "dynamicrafter_path = Path(\"dynamicrafter\")\n", + "\n", + "if not dynamicrafter_path.exists():\n", + " dynamicrafter_path.mkdir(parents=True, exist_ok=True)\n", + " !git clone https://github.com/Doubiiu/DynamiCrafter.git dynamicrafter\n", + "\n", + "sys.path.append(str(dynamicrafter_path))" + ] + }, + { + "cell_type": "markdown", + "id": "a3c0c659-aad3-4962-8db7-7b123379f01a", + "metadata": {}, + "source": [ + "## Load and run the original pipeline\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "We will use model for 256x256 resolution as example. Also, models for 320x512 and 576x1024 are [available](https://github.com/Doubiiu/DynamiCrafter?tab=readme-ov-file#-models)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b3ce0481-d7de-4d37-9414-c72dc6488f8e", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "from huggingface_hub import hf_hub_download\n", + "from omegaconf import OmegaConf\n", + "\n", + "from dynamicrafter.scripts.evaluation.funcs import load_model_checkpoint\n", + "from dynamicrafter.utils.utils import instantiate_from_config\n", + "\n", + "\n", + "def download_model():\n", + " REPO_ID = \"Doubiiu/DynamiCrafter\"\n", + " if not os.path.exists(\"./checkpoints/dynamicrafter_256_v1/\"):\n", + " os.makedirs(\"./checkpoints/dynamicrafter_256_v1/\")\n", + " local_file = os.path.join(\"./checkpoints/dynamicrafter_256_v1/model.ckpt\")\n", + " if not os.path.exists(local_file):\n", + " hf_hub_download(repo_id=REPO_ID, filename=\"model.ckpt\", local_dir=\"./checkpoints/dynamicrafter_256_v1/\", local_dir_use_symlinks=False)\n", + "\n", + " ckpt_path = \"checkpoints/dynamicrafter_256_v1/model.ckpt\"\n", + " config_file = \"dynamicrafter/configs/inference_256_v1.0.yaml\"\n", + " config = OmegaConf.load(config_file)\n", + " model_config = config.pop(\"model\", OmegaConf.create())\n", + " model_config[\"params\"][\"unet_config\"][\"params\"][\"use_checkpoint\"] = False\n", + " model = instantiate_from_config(model_config)\n", + " model = load_model_checkpoint(model, ckpt_path)\n", + " model.eval()\n", + "\n", + " return model\n", + "\n", + "\n", + "model = download_model()" + ] + }, + { + "cell_type": "markdown", + "id": "be9643c8-a70c-4dba-8259-d4467ae82949", + "metadata": {}, + "source": [ + "## Convert the model to OpenVINO IR\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "Let's define the conversion function for PyTorch modules. We use `ov.convert_model` function to obtain OpenVINO Intermediate Representation object and `ov.save_model` function to save it as XML file." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1bf5ffa9-c7d5-4485-915c-48ed18f657dc", + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "\n", + "import openvino as ov\n", + "\n", + "\n", + "def convert(model: torch.nn.Module, xml_path: str, example_input, input_shape=None):\n", + " xml_path = Path(xml_path)\n", + " if not xml_path.exists():\n", + " xml_path.parent.mkdir(parents=True, exist_ok=True)\n", + " with torch.no_grad():\n", + " if not input_shape:\n", + " converted_model = ov.convert_model(model, example_input=example_input)\n", + " else:\n", + " converted_model = ov.convert_model(model, example_input=example_input, input=input_shape)\n", + " ov.save_model(converted_model, xml_path, compress_to_fp16=False)\n", + "\n", + " # cleanup memory\n", + " torch._C._jit_clear_class_registry()\n", + " torch.jit._recursive.concrete_type_store = torch.jit._recursive.ConcreteTypeStore()\n", + " torch.jit._state._clear_class_state()" + ] + }, + { + "cell_type": "markdown", + "id": "3c63518d-957d-4358-8711-cf6fb935d8be", + "metadata": {}, + "source": [ + "Flowchart of DynamiCrafter proposed in [the paper](https://arxiv.org/abs/2310.12190):\n", + "\n", + "![schema](https://github.com/openvinotoolkit/openvino_notebooks/assets/76171391/d1033876-c664-4345-a254-0649edbf1906)\n", + "Description:\n", + "> During inference, our model can generate animation clips from noise conditioned on the input still image.\n", + "\n", + "Let's convert models from the pipeline one by one." 
+ ] + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "### Convert CLIP text encoder\n", + "[back to top ⬆️](#Table-of-contents:)" + ], + "id": "d9824415cd5b0ffd" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "from dynamicrafter.lvdm.modules.encoders.condition import FrozenOpenCLIPEmbedder\n", + "\n", + "\n", + "COND_STAGE_MODEL_OV_PATH = Path(\"models/cond_stage_model.xml\")\n", + "\n", + "\n", + "class FrozenOpenCLIPEmbedderWrapper(FrozenOpenCLIPEmbedder):\n", + "\n", + " def forward(self, tokens):\n", + " z = self.encode_with_transformer(tokens.to(self.device))\n", + " return z\n", + "\n", + "\n", + "cond_stage_model = FrozenOpenCLIPEmbedderWrapper(device=\"cpu\")\n", + "\n", + "\n", + "convert(\n", + " cond_stage_model,\n", + " COND_STAGE_MODEL_OV_PATH,\n", + " example_input=torch.ones([1, 77], dtype=torch.long),\n", + ")" + ], + "id": "2e0b1637-7757-49bb-b60f-175f7d77b470" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "### Convert CLIP image encoder\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "`FrozenOpenCLIPImageEmbedderV2` model accepts images of various resolutions." + ], + "id": "63b361937c948711" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "EMBEDDER_OV_PATH = Path(\"models/embedder_ir.xml\")\n", + "\n", + "\n", + "dummy_input = torch.rand([1, 3, 767, 767], dtype=torch.float32)\n", + "\n", + "model.embedder.model.visual.input_patchnorm = None # fix error: visual model has not attribute 'input_patchnorm'\n", + "convert(model.embedder, EMBEDDER_OV_PATH, example_input=dummy_input, input_shape=[1, 3, -1, -1])" + ], + "id": "c6aa0a3f-4093-44c7-b7b0-ca830abc4826" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "### Convert AE encoder\n", + "[back to top ⬆️](#Table-of-contents:)" + ], + "id": "eef65d17fec62fa" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "ENCODER_FIRST_STAGE_OV_PATH = Path(\"models/encode_first_stage_ir.xml\")\n", + "\n", + "\n", + "dummy_input = torch.rand([1, 3, 256, 256], dtype=torch.float32)\n", + "\n", + "convert(\n", + " model.first_stage_model.encoder,\n", + " ENCODER_FIRST_STAGE_OV_PATH,\n", + " example_input=dummy_input,\n", + ")" + ], + "id": "41434d51-07da-4688-b0af-be6271fef54f" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "### Convert Diffusion U-Net model\n", + "[back to top ⬆️](#Table-of-contents:)" + ], + "id": "7ec5ee02317d8e77" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "MODEL_OV_PATH = Path(\"models/model_ir.xml\")\n", + "\n", + "\n", + "class ModelWrapper(torch.nn.Module):\n", + " def __init__(self, diffusion_model):\n", + " super().__init__()\n", + " self.diffusion_model = diffusion_model\n", + "\n", + " def forward(self, xc, t, context=None, fs=None, temporal_length=None):\n", + " outputs = self.diffusion_model(xc, t, context=context, fs=fs, temporal_length=temporal_length)\n", + " return outputs\n", + "\n", + "\n", + "convert(\n", + " ModelWrapper(model.model.diffusion_model),\n", + " MODEL_OV_PATH,\n", + " example_input={\n", + " \"xc\": torch.rand([1, 8, 16, 32, 32], dtype=torch.float32),\n", + " \"t\": torch.tensor([1]),\n", + " \"context\": torch.rand([1, 333, 1024], dtype=torch.float32),\n", + " \"fs\": torch.tensor([3]),\n", + " \"temporal_length\": torch.tensor([16]),\n", + " },\n", + ")" 
+ ], + "id": "ca501d47-3f65-4d58-804d-d8ed20a761af" + }, + { + "cell_type": "markdown", + "id": "8d5430af-12b4-4a15-bb7c-c9300f824431", + "metadata": {}, + "source": [ + "### Convert AE decoder\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "`Decoder` receives a `bfloat16` tensor. numpy doesn't support this type. To avoid problems with the conversion lets replace `decode` method to convert bfloat16 to float32." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "db8394e7-d60b-4dbd-a94a-65e4f21a959b", + "metadata": {}, + "outputs": [], + "source": [ + "import types\n", + "\n", + "\n", + "def decode(self, z, **kwargs):\n", + " z = self.post_quant_conv(z)\n", + " z = z.float()\n", + " dec = self.decoder(z)\n", + " return dec\n", + "\n", + "\n", + "model.first_stage_model.decode = types.MethodType(decode, model.first_stage_model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7c6879b-8b51-4d76-81e1-669378c7c4e6", + "metadata": {}, + "outputs": [], + "source": [ + "DECODER_FIRST_STAGE_OV_PATH = Path(\"models/decoder_first_stage_ir.xml\")\n", + "\n", + "\n", + "dummy_input = torch.rand([16, 4, 32, 32], dtype=torch.float32)\n", + "\n", + "convert(\n", + " model.first_stage_model.decoder,\n", + " DECODER_FIRST_STAGE_OV_PATH,\n", + " example_input=dummy_input,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "51ff6eb8-dd58-4820-aae3-85c0b4e487a8", + "metadata": {}, + "source": [ + "## Compiling models\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "Select device from dropdown list for running inference using OpenVINO." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f052ebf-dabe-4161-bee7-4a9d55b9b69a", + "metadata": {}, + "outputs": [], + "source": [ + "import ipywidgets as widgets\n", + "\n", + "core = ov.Core()\n", + "device = widgets.Dropdown(\n", + " options=core.available_devices + [\"AUTO\"],\n", + " value=\"AUTO\",\n", + " description=\"Device:\",\n", + " disabled=False,\n", + ")\n", + "\n", + "device" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "410e05b4-11ed-4d85-ab8d-5d1557c4c36e", + "metadata": {}, + "outputs": [], + "source": [ + "compiled_cond_stage_model = core.compile_model(COND_STAGE_MODEL_OV_PATH, device.value)\n", + "compiled_encode_first_stage = core.compile_model(ENCODER_FIRST_STAGE_OV_PATH, device.value)\n", + "compiled_embedder = core.compile_model(EMBEDDER_OV_PATH, device.value)\n", + "compiled_model = core.compile_model(MODEL_OV_PATH, device.value)\n", + "compiled_decoder_first_stage = core.compile_model(DECODER_FIRST_STAGE_OV_PATH, device.value)" + ] + }, + { + "cell_type": "markdown", + "id": "11f2c95b-e872-458b-a6f8-448f8124ffe6", + "metadata": {}, + "source": [ + "## Building the pipeline\n", + "[back to top ⬆️](#Table-of-contents:)\n", + "\n", + "Let's create callable wrapper classes for compiled models to allow interaction with original pipelines. Note that all of wrapper classes return `torch.Tensor`s instead of `np.array`s." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d50a153-f71d-4364-9114-14bc25e239d7", + "metadata": {}, + "outputs": [], + "source": [ + "import open_clip\n", + "\n", + "\n", + "class CondStageModelWrapper(torch.nn.Module):\n", + " def __init__(self, cond_stage_model):\n", + " super().__init__()\n", + " self.cond_stage_model = cond_stage_model\n", + "\n", + " def encode(self, tokens):\n", + " if isinstance(tokens, list):\n", + " tokens = open_clip.tokenize(tokens[0])\n", + " outs = self.cond_stage_model(tokens)[0]\n", + "\n", + " return torch.from_numpy(outs)\n", + "\n", + "\n", + "class EncoderFirstStageModelWrapper(torch.nn.Module):\n", + " def __init__(self, encode_first_stage):\n", + " super().__init__()\n", + " self.encode_first_stage = encode_first_stage\n", + "\n", + " def forward(self, x):\n", + " outs = self.encode_first_stage(x)[0]\n", + "\n", + " return torch.from_numpy(outs)\n", + "\n", + "\n", + "class EmbedderWrapper(torch.nn.Module):\n", + " def __init__(self, embedder):\n", + " super().__init__()\n", + " self.embedder = embedder\n", + "\n", + " def forward(self, x):\n", + " outs = self.embedder(x)[0]\n", + "\n", + " return torch.from_numpy(outs)\n", + "\n", + "\n", + "class CModelWrapper(torch.nn.Module):\n", + " def __init__(self, diffusion_model, out_channels):\n", + " super().__init__()\n", + " self.diffusion_model = diffusion_model\n", + " self.out_channels = out_channels\n", + "\n", + " def forward(self, xc, t, context, fs, temporal_length):\n", + " inputs = {\n", + " \"xc\": xc,\n", + " \"t\": t,\n", + " \"context\": context,\n", + " \"fs\": fs,\n", + " }\n", + " outs = self.diffusion_model(inputs)[0]\n", + "\n", + " return torch.from_numpy(outs)\n", + "\n", + "\n", + "class DecoderFirstStageModelWrapper(torch.nn.Module):\n", + " def __init__(self, decoder_first_stage):\n", + " super().__init__()\n", + " self.decoder_first_stage = decoder_first_stage\n", + "\n", + " def forward(self, x):\n", + " x.float()\n", + " outs = self.decoder_first_stage(x)[0]\n", + "\n", + " return torch.from_numpy(outs)" + ] + }, + { + "cell_type": "markdown", + "id": "1178a847-eb14-419b-815e-c47628aa6868", + "metadata": {}, + "source": [ + "And insert wrappers instances in the pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f45d5dc3-6d17-408e-826d-b8525f461e44", + "metadata": {}, + "outputs": [], + "source": [ + "model.cond_stage_model = CondStageModelWrapper(compiled_cond_stage_model)\n", + "model.first_stage_model.encoder = EncoderFirstStageModelWrapper(compiled_encode_first_stage)\n", + "model.embedder = EmbedderWrapper(compiled_embedder)\n", + "model.model.diffusion_model = CModelWrapper(compiled_model, model.model.diffusion_model.out_channels)\n", + "model.first_stage_model.decoder = DecoderFirstStageModelWrapper(compiled_decoder_first_stage)" + ] + }, + { + "cell_type": "markdown", + "id": "4417db2b-2f65-407c-a384-ec466f18bca0", + "metadata": {}, + "source": [ + "## Interactive inference\n", + "[back to top ⬆️](#Table-of-contents:)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1494420-f616-4136-958b-1ee3abd20cb2", + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "\n", + "from einops import repeat\n", + "from pytorch_lightning import seed_everything\n", + "import torchvision.transforms as transforms\n", + "\n", + "from dynamicrafter.scripts.evaluation.funcs import save_videos, batch_ddim_sampling, get_latent_z\n", + "from dynamicrafter.lvdm.models.samplers.ddim import 
DDIMSampler\n", + "\n", + "\n", + "def register_buffer(self, name, attr):\n", + " if isinstance(attr, torch.Tensor):\n", + " if attr.device != torch.device(\"cpu\"):\n", + " attr = attr.to(torch.device(\"cpu\"))\n", + " setattr(self, name, attr)\n", + "\n", + "\n", + "# monkey patching to replace the original method 'register_buffer' that uses CUDA\n", + "DDIMSampler.register_buffer = types.MethodType(register_buffer, DDIMSampler)\n", + "\n", + "\n", + "def get_image(image, prompt, steps=5, cfg_scale=7.5, eta=1.0, fs=3, seed=123, model=model):\n", + " result_dir = \"results\"\n", + " if not os.path.exists(result_dir):\n", + " os.mkdir(result_dir)\n", + "\n", + " seed_everything(seed)\n", + " transform = transforms.Compose(\n", + " [\n", + " transforms.Resize(min((256, 256))),\n", + " transforms.CenterCrop((256, 256)),\n", + " ]\n", + " )\n", + " # torch.cuda.empty_cache()\n", + " print(\"start:\", prompt, time.strftime(\"%Y-%m-%d %H:%M:%S\", time.localtime(time.time())))\n", + " start = time.time()\n", + " if steps > 60:\n", + " steps = 60\n", + " model = model.cpu()\n", + " batch_size = 1\n", + " channels = model.model.diffusion_model.out_channels\n", + " frames = model.temporal_length\n", + " h, w = 256 // 8, 256 // 8\n", + " noise_shape = [batch_size, channels, frames, h, w]\n", + "\n", + " # text cond\n", + " with torch.no_grad(), torch.cpu.amp.autocast():\n", + " text_emb = model.get_learned_conditioning([prompt])\n", + "\n", + " # img cond\n", + " img_tensor = torch.from_numpy(image).permute(2, 0, 1).float().to(model.device)\n", + " img_tensor = (img_tensor / 255.0 - 0.5) * 2\n", + "\n", + " image_tensor_resized = transform(img_tensor) # 3,h,w\n", + " videos = image_tensor_resized.unsqueeze(0) # bchw\n", + "\n", + " z = get_latent_z(model, videos.unsqueeze(2)) # bc,1,hw\n", + "\n", + " img_tensor_repeat = repeat(z, \"b c t h w -> b c (repeat t) h w\", repeat=frames)\n", + "\n", + " cond_images = model.embedder(img_tensor.unsqueeze(0)) # blc\n", + "\n", + " img_emb = model.image_proj_model(cond_images)\n", + "\n", + " imtext_cond = torch.cat([text_emb, img_emb], dim=1)\n", + "\n", + " fs = torch.tensor([fs], dtype=torch.long, device=model.device)\n", + " cond = {\"c_crossattn\": [imtext_cond], \"fs\": fs, \"c_concat\": [img_tensor_repeat]}\n", + "\n", + " ## inference\n", + " batch_samples = batch_ddim_sampling(model, cond, noise_shape, n_samples=1, ddim_steps=steps, ddim_eta=eta, cfg_scale=cfg_scale)\n", + " ## b,samples,c,t,h,w\n", + " prompt_str = prompt.replace(\"/\", \"_slash_\") if \"/\" in prompt else prompt\n", + " prompt_str = prompt_str.replace(\" \", \"_\") if \" \" in prompt else prompt_str\n", + " prompt_str = prompt_str[:40]\n", + " if len(prompt_str) == 0:\n", + " prompt_str = \"empty_prompt\"\n", + "\n", + " save_videos(batch_samples, result_dir, filenames=[prompt_str], fps=8)\n", + " print(f\"Saved in {prompt_str}. 
Time used: {(time.time() - start):.2f} seconds\")\n", + "\n", + " return os.path.join(result_dir, f\"{prompt_str}.mp4\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bf57f8a8-8cf6-45c5-ae78-02a3ed04fcc8", + "metadata": {}, + "outputs": [], + "source": [ + "import gradio as gr\n", + "\n", + "\n", + "i2v_examples_256 = [\n", + " [\"dynamicrafter/prompts/256/art.png\", \"man fishing in a boat at sunset\", 50, 7.5, 1.0, 3, 234],\n", + " [\"dynamicrafter/prompts/256/boy.png\", \"boy walking on the street\", 50, 7.5, 1.0, 3, 125],\n", + " [\"dynamicrafter/prompts/256/dance1.jpeg\", \"two people dancing\", 50, 7.5, 1.0, 3, 116],\n", + " [\"dynamicrafter/prompts/256/fire_and_beach.jpg\", \"a campfire on the beach and the ocean waves in the background\", 50, 7.5, 1.0, 3, 111],\n", + " [\"dynamicrafter/prompts/256/guitar0.jpeg\", \"bear playing guitar happily, snowing\", 50, 7.5, 1.0, 3, 122],\n", + "]\n", + "\n", + "\n", + "def dynamicrafter_demo():\n", + " css = \"\"\"#input_img {max-width: 256px !important} #output_vid {max-width: 256px; max-height: 256px}\"\"\"\n", + "\n", + " with gr.Blocks(analytics_enabled=False, css=css) as dynamicrafter_iface:\n", + " with gr.Tab(label=\"Image2Video_256x256\"):\n", + " with gr.Column():\n", + " with gr.Row():\n", + " with gr.Column():\n", + " with gr.Row():\n", + " i2v_input_image = gr.Image(label=\"Input Image\", elem_id=\"input_img\")\n", + " with gr.Row():\n", + " i2v_input_text = gr.Text(label=\"Prompts\")\n", + " with gr.Row():\n", + " i2v_seed = gr.Slider(label=\"Random Seed\", minimum=0, maximum=10000, step=1, value=123)\n", + " i2v_eta = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, label=\"ETA\", value=1.0, elem_id=\"i2v_eta\")\n", + " i2v_cfg_scale = gr.Slider(minimum=1.0, maximum=15.0, step=0.5, label=\"CFG Scale\", value=7.5, elem_id=\"i2v_cfg_scale\")\n", + " with gr.Row():\n", + " i2v_steps = gr.Slider(minimum=1, maximum=60, step=1, elem_id=\"i2v_steps\", label=\"Sampling steps\", value=50)\n", + " i2v_motion = gr.Slider(minimum=1, maximum=4, step=1, elem_id=\"i2v_motion\", label=\"Motion magnitude\", value=3)\n", + " i2v_end_btn = gr.Button(\"Generate\")\n", + " with gr.Row():\n", + " i2v_output_video = gr.Video(label=\"Generated Video\", elem_id=\"output_vid\", autoplay=True, show_share_button=True)\n", + "\n", + " gr.Examples(\n", + " examples=i2v_examples_256,\n", + " inputs=[i2v_input_image, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_motion, i2v_seed],\n", + " outputs=[i2v_output_video],\n", + " fn=get_image,\n", + " cache_examples=False,\n", + " )\n", + " i2v_end_btn.click(\n", + " inputs=[i2v_input_image, i2v_input_text, i2v_steps, i2v_cfg_scale, i2v_eta, i2v_motion, i2v_seed],\n", + " outputs=[i2v_output_video],\n", + " fn=get_image,\n", + " )\n", + "\n", + " return dynamicrafter_iface\n", + "\n", + "\n", + "demo = dynamicrafter_demo()\n", + "\n", + "\n", + "try:\n", + " demo.queue().launch(debug=True)\n", + "except Exception:\n", + " demo.queue().launch(debug=True, share=True)\n", + "# if you are launching remotely, specify server_name and server_port\n", + "# demo.launch(server_name='your server name', server_port='server port in int')\n", + "# Read more in the docs: https://gradio.app/docs/" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + 
"nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "openvino_notebooks": { + "imageUrl": "https://github.com/Doubiiu/DynamiCrafter/blob/main/assets/showcase/guitar0.gif?raw", + "tags": { + "categories": [ + "Model Demos", + "AI Trends" + ], + "libraries": [], + "other": [], + "tasks": [ + "Image-to-Video" + ] + } + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}