From f0944512d5985ae59000999dcbc3407a050c6ed1 Mon Sep 17 00:00:00 2001 From: Ekaterina Aidova Date: Thu, 25 Jul 2024 13:30:01 +0400 Subject: [PATCH] Dynamic Crafter remove dependency on decord (#2229) --- .../dynamicrafter-animating-images.ipynb | 1063 +++++++++++++++-- 1 file changed, 988 insertions(+), 75 deletions(-) diff --git a/notebooks/dynamicrafter-animating-images/dynamicrafter-animating-images.ipynb b/notebooks/dynamicrafter-animating-images/dynamicrafter-animating-images.ipynb index 0dcec35d9df..beb1eb7fd81 100644 --- a/notebooks/dynamicrafter-animating-images/dynamicrafter-animating-images.ipynb +++ b/notebooks/dynamicrafter-animating-images/dynamicrafter-animating-images.ipynb @@ -1,7 +1,6 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "id": "a30812de-c46e-44a3-8194-b7f6f0fd4707", "metadata": {}, @@ -64,7 +63,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "f9dc9580-da81-47dd-b5d3-3cafa8f5a4b5", "metadata": {}, @@ -75,10 +73,25 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "eac97b7e-2db7-41b3-8dc4-488c5b5cd275", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mWARNING: Error parsing dependencies of torchsde: .* suffix can only be used with `==` or `!=` operators\n", + " numpy (>=1.19.*) ; python_version >= \"3.7\"\n", + " ~~~~~~~^\u001b[0m\u001b[33m\n", + "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n", + "\u001b[33mWARNING: Error parsing dependencies of torchsde: .* suffix can only be used with `==` or `!=` operators\n", + " numpy (>=1.19.*) ; python_version >= \"3.7\"\n", + " ~~~~~~~^\u001b[0m\u001b[33m\n", + "\u001b[0mNote: you may need to restart the kernel to use updated packages.\n" + ] + } + ], "source": [ "%pip install -q \"openvino>=2024.2.0\" \"nncf>=2.11.0\" \"datasets>=2.20.0\"\n", "%pip install -q \"gradio>=4.19\" omegaconf einops pytorch_lightning kornia \"open_clip_torch==2.22.0\" transformers av opencv-python \"torch==2.2.2\" --extra-index-url https://download.pytorch.org/whl/cpu" @@ -108,7 +121,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "a3c0c659-aad3-4962-8db7-7b123379f01a", "metadata": {}, @@ -121,10 +133,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "b3ce0481-d7de-4d37-9414-c72dc6488f8e", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AE working on z of shape (1, 4, 32, 32) = 4096 dimensions.\n", + ">>> model checkpoint loaded.\n" + ] + } + ], "source": [ "import os\n", "from collections import OrderedDict\n", @@ -193,7 +214,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "be9643c8-a70c-4dba-8259-d4467ae82949", "metadata": {}, @@ -232,7 +252,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "3c63518d-957d-4358-8711-cf6fb935d8be", "metadata": {}, @@ -247,7 +266,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "d9824415cd5b0ffd", "metadata": {}, @@ -270,7 +288,6 @@ "\n", "\n", "class FrozenOpenCLIPEmbedderWrapper(FrozenOpenCLIPEmbedder):\n", - "\n", " def forward(self, tokens):\n", " z = self.encode_with_transformer(tokens.to(self.device))\n", " return z\n", @@ -287,7 +304,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "63b361937c948711", "metadata": {}, @@ -315,7 +331,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "eef65d17fec62fa", "metadata": {}, @@ -345,7 +360,6 @@ ] }, { - 
"attachments": {}, "cell_type": "markdown", "id": "7ec5ee02317d8e77", "metadata": {}, @@ -389,7 +403,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "8d5430af-12b4-4a15-bb7c-c9300f824431", "metadata": {}, @@ -424,7 +437,16 @@ "execution_count": 10, "id": "d7c6879b-8b51-4d76-81e1-669378c7c4e6", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ea/work/openvino_notebooks_new_clone/openvino_notebooks/notebooks/dynamicrafter-animating-images/dynamicrafter/lvdm/modules/networks/ae_modules.py:67: TracerWarning: Converting a tensor to a Python integer might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n", + " w_ = w_ * (int(c)**(-0.5))\n" + ] + } + ], "source": [ "DECODER_FIRST_STAGE_OV_PATH = Path(\"models/decoder_first_stage_ir.xml\")\n", "\n", @@ -440,7 +462,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "51ff6eb8-dd58-4820-aae3-85c0b4e487a8", "metadata": {}, @@ -453,10 +474,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "8f052ebf-dabe-4161-bee7-4a9d55b9b69a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "35243ae51b58460e900598c06ae92547", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Dropdown(description='Device:', index=1, options=('CPU', 'AUTO'), value='AUTO')" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "import ipywidgets as widgets\n", "\n", @@ -486,7 +523,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "11f2c95b-e872-458b-a6f8-448f8124ffe6", "metadata": {}, @@ -573,7 +609,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "1178a847-eb14-419b-815e-c47628aa6868", "metadata": {}, @@ -606,13 +641,12 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 16, "id": "9e7729a4", "metadata": {}, "outputs": [], "source": [ - "from einops import repeat\n", - "from dynamicrafter.scripts.evaluation.funcs import get_latent_z\n", + "from einops import repeat, rearrange\n", "import torchvision.transforms as transforms\n", "\n", "\n", @@ -624,6 +658,14 @@ ")\n", "\n", "\n", + "def get_latent_z(model, videos):\n", + " b, c, t, h, w = videos.shape\n", + " x = rearrange(videos, \"b c t h w -> (b t) c h w\")\n", + " z = model.encode_first_stage(x)\n", + " z = rearrange(z, \"(b t) c h w -> b c t h w\", b=b, t=t)\n", + " return z\n", + "\n", + "\n", "def process_input(model, prompt, image, transform=transform, fs=3):\n", " text_emb = model.get_learned_conditioning([prompt])\n", "\n", @@ -649,7 +691,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 19, "id": "52b6ff5b", "metadata": {}, "outputs": [], @@ -657,9 +699,9 @@ "import time\n", "from PIL import Image\n", "import numpy as np\n", - "from dynamicrafter.scripts.evaluation.funcs import save_videos, batch_ddim_sampling\n", "from lvdm.models.samplers.ddim import DDIMSampler\n", "from pytorch_lightning import seed_everything\n", + "import torchvision\n", "\n", "\n", "def register_buffer(self, name, attr):\n", @@ -669,10 +711,94 @@ " setattr(self, name, attr)\n", "\n", "\n", + "def batch_ddim_sampling(model, cond, noise_shape, n_samples=1, ddim_steps=50, ddim_eta=1.0, cfg_scale=1.0, temporal_cfg_scale=None, **kwargs):\n", + 
" ddim_sampler = DDIMSampler(model)\n", + " uncond_type = model.uncond_type\n", + " batch_size = noise_shape[0]\n", + " fs = cond[\"fs\"]\n", + " del cond[\"fs\"]\n", + " if noise_shape[-1] == 32:\n", + " timestep_spacing = \"uniform\"\n", + " guidance_rescale = 0.0\n", + " else:\n", + " timestep_spacing = \"uniform_trailing\"\n", + " guidance_rescale = 0.7\n", + " # construct unconditional guidance\n", + " if cfg_scale != 1.0:\n", + " if uncond_type == \"empty_seq\":\n", + " prompts = batch_size * [\"\"]\n", + " # prompts = N * T * [\"\"] ## if is_imgbatch=True\n", + " uc_emb = model.get_learned_conditioning(prompts)\n", + " elif uncond_type == \"zero_embed\":\n", + " c_emb = cond[\"c_crossattn\"][0] if isinstance(cond, dict) else cond\n", + " uc_emb = torch.zeros_like(c_emb)\n", + "\n", + " # process image embedding token\n", + " if hasattr(model, \"embedder\"):\n", + " uc_img = torch.zeros(noise_shape[0], 3, 224, 224).to(model.device)\n", + " ## img: b c h w >> b l c\n", + " uc_img = model.embedder(uc_img)\n", + " uc_img = model.image_proj_model(uc_img)\n", + " uc_emb = torch.cat([uc_emb, uc_img], dim=1)\n", + "\n", + " if isinstance(cond, dict):\n", + " uc = {key: cond[key] for key in cond.keys()}\n", + " uc.update({\"c_crossattn\": [uc_emb]})\n", + " else:\n", + " uc = uc_emb\n", + " else:\n", + " uc = None\n", + "\n", + " x_T = None\n", + " batch_variants = []\n", + "\n", + " for _ in range(n_samples):\n", + " if ddim_sampler is not None:\n", + " kwargs.update({\"clean_cond\": True})\n", + " samples, _ = ddim_sampler.sample(\n", + " S=ddim_steps,\n", + " conditioning=cond,\n", + " batch_size=noise_shape[0],\n", + " shape=noise_shape[1:],\n", + " verbose=False,\n", + " unconditional_guidance_scale=cfg_scale,\n", + " unconditional_conditioning=uc,\n", + " eta=ddim_eta,\n", + " temporal_length=noise_shape[2],\n", + " conditional_guidance_scale_temporal=temporal_cfg_scale,\n", + " x_T=x_T,\n", + " fs=fs,\n", + " timestep_spacing=timestep_spacing,\n", + " guidance_rescale=guidance_rescale,\n", + " **kwargs,\n", + " )\n", + " # reconstruct from latent to pixel space\n", + " batch_images = model.decode_first_stage(samples)\n", + " batch_variants.append(batch_images)\n", + " # batch, , c, t, h, w\n", + " batch_variants = torch.stack(batch_variants, dim=1)\n", + " return batch_variants\n", + "\n", + "\n", "# monkey patching to replace the original method 'register_buffer' that uses CUDA\n", "DDIMSampler.register_buffer = types.MethodType(register_buffer, DDIMSampler)\n", "\n", "\n", + "def save_videos(batch_tensors, savedir, filenames, fps=10):\n", + " # b,samples,c,t,h,w\n", + " n_samples = batch_tensors.shape[1]\n", + " for idx, vid_tensor in enumerate(batch_tensors):\n", + " video = vid_tensor.detach().cpu()\n", + " video = torch.clamp(video.float(), -1.0, 1.0)\n", + " video = video.permute(2, 0, 1, 3, 4) # t,n,c,h,w\n", + " frame_grids = [torchvision.utils.make_grid(framesheet, nrow=int(n_samples)) for framesheet in video] # [3, 1*h, n*w]\n", + " grid = torch.stack(frame_grids, dim=0) # stack in temporal dim [t, 3, n*h, w]\n", + " grid = (grid + 1.0) / 2.0\n", + " grid = (grid * 255).to(torch.uint8).permute(0, 2, 3, 1)\n", + " savepath = os.path.join(savedir, f\"{filenames[idx]}.mp4\")\n", + " torchvision.io.write_video(savepath, grid, fps=fps, video_codec=\"h264\", options={\"crf\": \"10\"})\n", + "\n", + "\n", "def get_image(image, prompt, steps=5, cfg_scale=7.5, eta=1.0, fs=3, seed=123, model=model, result_dir=\"results\"):\n", " if not os.path.exists(result_dir):\n", " 
os.mkdir(result_dir)\n", @@ -712,7 +838,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 20, "id": "e9a0137a", "metadata": {}, "outputs": [ @@ -723,20 +849,12 @@ "Seed set to 234\n" ] }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_1530080/3626034461.py:17: UserWarning: The given NumPy array is not writable, and PyTorch does not support non-writable tensors. This means writing to this tensor will result in undefined behavior. You may want to copy the array to protect its data or make it writable before converting it to a tensor. This type of warning will be suppressed for the rest of this program. (Triggered internally at ../torch/csrc/utils/tensor_numpy.cpp:206.)\n", - " img_tensor = torch.from_numpy(image).permute(2, 0, 1).float().to(model.device)\n" - ] - }, { "name": "stdout", "output_type": "stream", "text": [ - "start: man fishing in a boat at sunset 2024-07-04 11:31:43\n", - "Saved in man_fishing_in_a_boat_at_sunset.mp4. Time used: 276.20 seconds\n" + "start: man fishing in a boat at sunset 2024-07-25 11:30:59\n", + "Saved in man_fishing_in_a_boat_at_sunset.mp4. Time used: 183.19 seconds\n" ] } ], @@ -752,7 +870,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "id": "3734113f", "metadata": {}, "outputs": [ @@ -768,7 +886,7 @@ "" ] }, - "execution_count": 18, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -804,14 +922,14 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 22, "id": "f991e3c1", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "bb75bcb1a1d346cbb3b6a6d5aba1b9f5", + "model_id": "3ac4286d32934cbb861c354857d48c0d", "version_major": 2, "version_minor": 0 }, @@ -819,7 +937,7 @@ "Checkbox(value=True, description='Quantization')" ] }, - "execution_count": 19, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -844,7 +962,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 23, "id": "61d199d1", "metadata": {}, "outputs": [], @@ -875,7 +993,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 24, "id": "c1b7bb27", "metadata": {}, "outputs": [], @@ -906,7 +1024,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 25, "id": "7c141d3b", "metadata": {}, "outputs": [], @@ -962,10 +1080,39 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 26, "id": "1213b8cb", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "0354e90b50bd41e4b08e3af5740c7e95", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading readme: 0%| | 0.00/376 [00:00\n" + ], + "text/plain": [] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "102df3b931314387a921bf6440a7419a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
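The Output widgets captured above are rich progress bars emitted while NNCF quantizes the denoising U-Net. A minimal sketch of the kind of call that produces them, assuming calibration_data is the list of model inputs gathered earlier and MODEL_OV_PATH is a placeholder for the U-Net IR path:

import nncf
import openvino as ov

core = ov.Core()

# model_type=TRANSFORMER selects transformer-friendly defaults in NNCF,
# including the Smooth Quant pass reported by the progress bar above.
int8_unet = nncf.quantize(
    model=core.read_model(MODEL_OV_PATH),                # placeholder IR path
    calibration_dataset=nncf.Dataset(calibration_data),  # inputs recorded during pipeline runs
    subset_size=len(calibration_data),
    model_type=nncf.ModelType.TRANSFORMER,
)
ov.save_model(int8_unet, "models/model_ir_int8.xml")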
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:nncf:2 ignored nodes were found by names in the NNCFGraph\n", + "INFO:nncf:Not adding activation input quantizer for operation: 65 __module.diffusion_model.input_blocks.0.0/aten::_convolution/Convolution\n", + "158 __module.diffusion_model.input_blocks.0.0/aten::_convolution/Add\n", + "\n", + "INFO:nncf:Not adding activation input quantizer for operation: 3789 __module.diffusion_model.out.2/aten::_convolution/Convolution\n", + "4086 __module.diffusion_model.out.2/aten::_convolution/Add\n", + "\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8140b8ffd17f4877bedf566c48c09e53", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
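The "Statistics collection 300/300" bars come from NNCF feeding the calibration samples through the network. A common way these notebooks gather such samples is to wrap the compiled model so that every inference call records its inputs; a sketch of that pattern (not necessarily the exact class used in this notebook):

import openvino as ov

class CompiledModelDecorator(ov.CompiledModel):
    # Caches every input the wrapped compiled model is called with, so that
    # ordinary pipeline runs double as calibration-data collection.
    def __init__(self, compiled_model, data_cache=None):
        super().__init__(compiled_model)
        self.data_cache = data_cache if data_cache is not None else []

    def __call__(self, *args, **kwargs):
        self.data_cache.append(*args)
        return super().__call__(*args, **kwargs)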
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "%%skip not $to_quantize.value\n", "\n", @@ -1030,10 +1321,207 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 28, "id": "fe98145f", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:nncf:Statistics of the bitwidth distribution:\n", + "┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑\n", + "│ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │\n", + "┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥\n", + "│ 8 │ 100% (97 / 97) │ 100% (97 / 97) │\n", + "┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8843fe7ba47d40d09fa86aadd366727a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
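The bitwidth tables and "Applying Weight Compression" bars correspond to 8-bit weight-only compression of the remaining submodels (text encoder, first-stage encoder/decoder, image embedder), which need no calibration data. A sketch, with the *_OV_PATH names standing in for the Path objects of the IRs exported earlier:

import nncf
import openvino as ov

core = ov.Core()

# Weight-only compression quantizes constants to INT8 without collecting
# activation statistics, matching the "100% (N / N)" 8-bit rows printed above.
for ov_path in [COND_STAGE_MODEL_OV_PATH, ENCODER_FIRST_STAGE_OV_PATH,
                DECODER_FIRST_STAGE_OV_PATH, EMBEDDER_OV_PATH]:  # placeholder names
    compressed = nncf.compress_weights(core.read_model(ov_path))
    ov.save_model(compressed, ov_path.with_name(ov_path.stem + "_int8.xml"))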
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:nncf:Statistics of the bitwidth distribution:\n", + "┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑\n", + "│ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │\n", + "┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥\n", + "│ 8 │ 100% (39 / 39) │ 100% (39 / 39) │\n", + "┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "adca15bca575477bb181fc920e14aac3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:nncf:Statistics of the bitwidth distribution:\n", + "┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑\n", + "│ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │\n", + "┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥\n", + "│ 8 │ 100% (31 / 31) │ 100% (31 / 31) │\n", + "┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c1b747e9b37e4a9a83581154d70a57cf", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO:nncf:Statistics of the bitwidth distribution:\n", + "┍━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┯━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┑\n", + "│ Num bits (N) │ % all parameters (layers) │ % ratio-defining parameters (layers) │\n", + "┝━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┿━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┥\n", + "│ 8 │ 100% (129 / 129) │ 100% (129 / 129) │\n", + "┕━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┷━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┙\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "bce5b2de47134bd682eae49cbbe262da", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Output()" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n"
+      ],
+      "text/plain": []
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
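After compression the notebook prints per-model compression rates of roughly 3.98x, which is simply the on-disk size ratio of the original and INT8 IRs. A small helper that reproduces such a figure, assuming the usual .xml + .bin IR layout:

from pathlib import Path

def compression_rate(fp_xml: Path, int8_xml: Path) -> float:
    # An OpenVINO IR is an .xml topology plus a .bin weights file; the .bin
    # dominates, so this ratio tracks the weight compression itself.
    fp_size = fp_xml.stat().st_size + fp_xml.with_suffix(".bin").stat().st_size
    int8_size = int8_xml.stat().st_size + int8_xml.with_suffix(".bin").stat().st_size
    return fp_size / int8_size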
+    {
+     "data": {
+      "text/html": [
+       "
\n",
+       "
\n" + ], + "text/plain": [ + "\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "%%skip not $to_quantize.value\n", "\n", @@ -1065,7 +1553,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 29, "id": "d311de3a", "metadata": {}, "outputs": [], @@ -1081,10 +1569,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, "id": "a0ce59ba", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "AE working on z of shape (1, 4, 32, 32) = 4096 dimensions.\n", + ">>> model checkpoint loaded.\n" + ] + } + ], "source": [ "%%skip not $to_quantize.value\n", "\n", @@ -1101,7 +1598,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 31, "id": "1e77da42", "metadata": {}, "outputs": [ @@ -1116,8 +1613,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "start: man fishing in a boat at sunset 2024-07-04 11:37:09\n", - "Saved in man_fishing_in_a_boat_at_sunset.mp4. Time used: 129.95 seconds\n" + "start: man fishing in a boat at sunset 2024-07-25 12:56:18\n", + "Saved in man_fishing_in_a_boat_at_sunset.mp4. Time used: 84.56 seconds\n" ] } ], @@ -1136,7 +1633,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 32, "id": "d8a817f6", "metadata": {}, "outputs": [ @@ -1179,7 +1676,7 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 33, "id": "f2134675", "metadata": {}, "outputs": [ @@ -1189,7 +1686,7 @@ "text": [ "cond_stage_model compression rate: 3.977\n", "decoder_first_stage_ir compression rate: 3.987\n", - "encode_first_stage_ir compression rate: 3.986\n", + "encoder_first_stage_ir compression rate: 3.986\n", "embedder_ir compression rate: 3.977\n", "model_ir compression rate: 3.981\n" ] @@ -1222,7 +1719,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 34, "id": "7755a0cf", "metadata": {}, "outputs": [], @@ -1255,20 +1752,10 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, "id": "e61b1152", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "FP32 latency: 397.222\n", - "INT8 latency: 211.801\n", - "Performance speed up: 1.875\n" - ] - } - ], + "outputs": [], "source": [ "%%skip not $to_quantize.value\n", "\n", @@ -1280,7 +1767,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "4417db2b-2f65-407c-a384-ec466f18bca0", "metadata": {}, @@ -1313,7 +1799,9 @@ "cell_type": "code", "execution_count": null, "id": "bf57f8a8-8cf6-45c5-ae78-02a3ed04fcc8", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "import gradio as gr\n", @@ -1415,6 +1903,431 @@ "Image-to-Video" ] } + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "state": { + "0354e90b50bd41e4b08e3af5740c7e95": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_3ce2399bcf714c3681f07328c58c850b", + "IPY_MODEL_cc0fbe6edb7d4488983633030d6572c8", + "IPY_MODEL_c4067386014b4331abdc4e5860dc6002" + ], + "layout": "IPY_MODEL_8a7f01541bf544cdbad6a9654dd03341" + } + }, + "050ba2deb97c44fea4932980babcf422": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLStyleModel", + "state": { + "description_width": "", + "font_size": null, + "text_color": null + } + }, + "09ae14d72d854422943609e79dde62d5": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLStyleModel", + "state": { + "description_width": "", + "font_size": null, + "text_color": null + } + }, + "09e2c5cca48145249e558d5b2a8d9f07": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "102df3b931314387a921bf6440a7419a": { + "model_module": "@jupyter-widgets/output", + "model_module_version": "1.0.0", + "model_name": "OutputModel", + "state": { + "layout": "IPY_MODEL_bf8ce7c7b5f54a739927c480ea73f0ae", + "outputs": [ + { + "data": { + "text/html": "
<pre>Applying Smooth Quant ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 269/269 • 0:00:33 • 0:00:00\n</pre>
\n", + "text/plain": "Applying Smooth Quant \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[38;2;0;104;181m269/269\u001b[0m • \u001b[38;2;0;104;181m0:00:33\u001b[0m • \u001b[38;2;0;104;181m0:00:00\u001b[0m\n" + }, + "metadata": {}, + "output_type": "display_data" + } + ] + } + }, + "10bc1275f679401b83cf92d460409c8d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "15711837c7084083bc248db140a32da7": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLStyleModel", + "state": { + "description_width": "", + "font_size": null, + "text_color": null + } + }, + "1f8bda015fad48f18ab8bef827fdb4f6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "306b95f37bee4e1192f1e6b28979259d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "3417657a28ab4649bed9f6d10c7f214a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HBoxModel", + "state": { + "children": [ + "IPY_MODEL_a9805e5163d9440092570f5c5c122d30", + "IPY_MODEL_7d47c3b2963c44a4b05866b64fe62838", + "IPY_MODEL_883be8e255984be5b767531b3238c1f5" + ], + "layout": "IPY_MODEL_b0800f0e5b7345a0bd0482c984412b2a" + } + }, + "35243ae51b58460e900598c06ae92547": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "DropdownModel", + "state": { + "_options_labels": [ + "CPU", + "AUTO" + ], + "description": "Device:", + "index": 1, + "layout": "IPY_MODEL_9de71f74c3224737a74e3343afefdd44", + "style": "IPY_MODEL_c83b8b4248bb4a799ddb9a145794bd99" + } + }, + "39c4a84a10e94541b7aa5280e5f95481": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "3ac4286d32934cbb861c354857d48c0d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "CheckboxModel", + "state": { + "description": "Quantization", + "disabled": false, + "layout": "IPY_MODEL_e5d12e2ab1d74ac9af1f368db0ac7cdc", + "style": "IPY_MODEL_c37a185add934dfb97c07dae77458c36", + "value": true + } + }, + "3ce2399bcf714c3681f07328c58c850b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_10bc1275f679401b83cf92d460409c8d", + "style": "IPY_MODEL_8e8e73da453645f486f482724d65d970", + "value": "Downloading readme: 100%" + } + }, + "459ddb60988b4bc2b8503d16fa8190bb": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "49fefe46d45d4569a5c878b4d80789ac": { + "model_module": "@jupyter-widgets/output", + "model_module_version": "1.0.0", + "model_name": "OutputModel", + "state": { + "layout": "IPY_MODEL_84b83c54834e41c583def37c2f8b5da8", + "outputs": [ + { + "data": { + "text/html": "
<pre>Statistics collection ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 300/300 • 0:23:53 • 0:00:00\n</pre>
\n", + "text/plain": "Statistics collection \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[38;2;0;104;181m300/300\u001b[0m • \u001b[38;2;0;104;181m0:23:53\u001b[0m • \u001b[38;2;0;104;181m0:00:00\u001b[0m\n" + }, + "metadata": {}, + "output_type": "display_data" + } + ] + } + }, + "4b374a11575045d492b37ccb3d4069dc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "61df30552c794fa8b6144efc104b3d35": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "description_width": "" + } + }, + "7d47c3b2963c44a4b05866b64fe62838": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "bar_style": "success", + "layout": "IPY_MODEL_86188feeaf82478696c31c68779e29f3", + "max": 300, + "style": "IPY_MODEL_eff5c1ea328a4c31b54b03b018088f97", + "value": 300 + } + }, + "8140b8ffd17f4877bedf566c48c09e53": { + "model_module": "@jupyter-widgets/output", + "model_module_version": "1.0.0", + "model_name": "OutputModel", + "state": { + "layout": "IPY_MODEL_306b95f37bee4e1192f1e6b28979259d", + "outputs": [ + { + "data": { + "text/html": "
<pre>Statistics collection ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 300/300 • 0:28:57 • 0:00:00\n</pre>
\n", + "text/plain": "Statistics collection \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[38;2;0;104;181m300/300\u001b[0m • \u001b[38;2;0;104;181m0:28:57\u001b[0m • \u001b[38;2;0;104;181m0:00:00\u001b[0m\n" + }, + "metadata": {}, + "output_type": "display_data" + } + ] + } + }, + "833bf18bbcda47118cbf7782d281d753": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "84b83c54834e41c583def37c2f8b5da8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "84f1e7e84d484043901f8b8ad0a043a7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "86188feeaf82478696c31c68779e29f3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "883be8e255984be5b767531b3238c1f5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_459ddb60988b4bc2b8503d16fa8190bb", + "style": "IPY_MODEL_050ba2deb97c44fea4932980babcf422", + "value": " 300/300 [24:31<00:00,  5.44s/it]" + } + }, + "8843fe7ba47d40d09fa86aadd366727a": { + "model_module": "@jupyter-widgets/output", + "model_module_version": "1.0.0", + "model_name": "OutputModel", + "state": { + "layout": "IPY_MODEL_1f8bda015fad48f18ab8bef827fdb4f6", + "outputs": [ + { + "data": { + "text/html": "
<pre>Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 97/97 • 0:00:01 • 0:00:00\n</pre>
\n", + "text/plain": "Applying Weight Compression \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[38;2;0;104;181m97/97\u001b[0m • \u001b[38;2;0;104;181m0:00:01\u001b[0m • \u001b[38;2;0;104;181m0:00:00\u001b[0m\n" + }, + "metadata": {}, + "output_type": "display_data" + } + ] + } + }, + "8a7f01541bf544cdbad6a9654dd03341": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "8e8e73da453645f486f482724d65d970": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLStyleModel", + "state": { + "description_width": "", + "font_size": null, + "text_color": null + } + }, + "9bb461a1a3124cf6abed1bb725adee32": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "9de71f74c3224737a74e3343afefdd44": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "a9805e5163d9440092570f5c5c122d30": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_9bb461a1a3124cf6abed1bb725adee32", + "style": "IPY_MODEL_09ae14d72d854422943609e79dde62d5", + "value": "100%" + } + }, + "adca15bca575477bb181fc920e14aac3": { + "model_module": "@jupyter-widgets/output", + "model_module_version": "1.0.0", + "model_name": "OutputModel", + "state": { + "layout": "IPY_MODEL_84f1e7e84d484043901f8b8ad0a043a7", + "outputs": [ + { + "data": { + "text/html": "
<pre>Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 39/39 • 0:00:00 • 0:00:00\n</pre>
\n", + "text/plain": "Applying Weight Compression \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[38;2;0;104;181m39/39\u001b[0m • \u001b[38;2;0;104;181m0:00:00\u001b[0m • \u001b[38;2;0;104;181m0:00:00\u001b[0m\n" + }, + "metadata": {}, + "output_type": "display_data" + } + ] + } + }, + "b0800f0e5b7345a0bd0482c984412b2a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "bce5b2de47134bd682eae49cbbe262da": { + "model_module": "@jupyter-widgets/output", + "model_module_version": "1.0.0", + "model_name": "OutputModel", + "state": { + "layout": "IPY_MODEL_833bf18bbcda47118cbf7782d281d753", + "outputs": [ + { + "data": { + "text/html": "
<pre>Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 129/129 • 0:00:03 • 0:00:00\n</pre>
\n", + "text/plain": "Applying Weight Compression \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[38;2;0;104;181m129/129\u001b[0m • \u001b[38;2;0;104;181m0:00:03\u001b[0m • \u001b[38;2;0;104;181m0:00:00\u001b[0m\n" + }, + "metadata": {}, + "output_type": "display_data" + } + ] + } + }, + "bf8ce7c7b5f54a739927c480ea73f0ae": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "c1b747e9b37e4a9a83581154d70a57cf": { + "model_module": "@jupyter-widgets/output", + "model_module_version": "1.0.0", + "model_name": "OutputModel", + "state": { + "layout": "IPY_MODEL_09e2c5cca48145249e558d5b2a8d9f07", + "outputs": [ + { + "data": { + "text/html": "
<pre>Applying Weight Compression ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 100% 31/31 • 0:00:00 • 0:00:00\n</pre>
\n", + "text/plain": "Applying Weight Compression \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[35m100%\u001b[0m \u001b[38;2;0;104;181m31/31\u001b[0m • \u001b[38;2;0;104;181m0:00:00\u001b[0m • \u001b[38;2;0;104;181m0:00:00\u001b[0m\n" + }, + "metadata": {}, + "output_type": "display_data" + } + ] + } + }, + "c37a185add934dfb97c07dae77458c36": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "CheckboxStyleModel", + "state": { + "description_width": "" + } + }, + "c4067386014b4331abdc4e5860dc6002": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "HTMLModel", + "state": { + "layout": "IPY_MODEL_4b374a11575045d492b37ccb3d4069dc", + "style": "IPY_MODEL_15711837c7084083bc248db140a32da7", + "value": " 376/376 [00:00<00:00, 31.0kB/s]" + } + }, + "c83b8b4248bb4a799ddb9a145794bd99": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "DescriptionStyleModel", + "state": { + "description_width": "" + } + }, + "cc0fbe6edb7d4488983633030d6572c8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "FloatProgressModel", + "state": { + "bar_style": "success", + "layout": "IPY_MODEL_39c4a84a10e94541b7aa5280e5f95481", + "max": 376, + "style": "IPY_MODEL_61df30552c794fa8b6144efc104b3d35", + "value": 376 + } + }, + "e5d12e2ab1d74ac9af1f368db0ac7cdc": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "2.0.0", + "model_name": "LayoutModel", + "state": {} + }, + "eff5c1ea328a4c31b54b03b018088f97": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "2.0.0", + "model_name": "ProgressStyleModel", + "state": { + "description_width": "" + } + } + }, + "version_major": 2, + "version_minor": 0 + } } }, "nbformat": 4,