Skip to content

Commit

Permalink
Add Gradio helpers - part 4 (#2315)
Browse files Browse the repository at this point in the history
Ticket: CVS-147626

Notebooks:
1. animate-anyone/animate-anyone.ipynb
2. bark-text-to-audio/bark-text-to-audio.ipynb
3. blip-visual-language-processing/blip-visual-language-processing.ipynb
4. controlnet-stable-diffusion/controlnet-stable-diffusion.ipynb
5. ddcolor-image-colorization/ddcolor-image-colorization.ipynb
6. depth-anything/depth-anything-v2.ipynb
7. depth-anything/depth-anything.ipynb
8. dolly-2-instruction-following/dolly-2-instruction-following.ipynb
9. dynamicrafter-animating-images/dynamicrafter-animating-images.ipynb
  • Loading branch information
yatarkan authored Aug 23, 2024
1 parent 6f0fd17 commit 993665b
Show file tree
Hide file tree
Showing 17 changed files with 658 additions and 553 deletions.
69 changes: 23 additions & 46 deletions notebooks/animate-anyone/animate-anyone.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,7 @@
"from diffusers.image_processor import VaeImageProcessor\n",
"from transformers import CLIPImageProcessor\n",
"import torch\n",
"import gradio as gr\n",
"import ipywidgets as widgets\n",
"import numpy as np\n",
"\n",
"from src.pipelines.pipeline_pose2vid_long import Pose2VideoPipeline\n",
"from src.utils.util import get_fps, read_frames\n",
Expand Down Expand Up @@ -1523,11 +1521,14 @@
},
{
"cell_type": "code",
"execution_count": 27,
"id": "2832a501-a8cb-4a44-a249-846f8524e3d6",
"execution_count": null,
"id": "6a361d62",
"metadata": {},
"outputs": [],
"source": [
"import gradio as gr\n",
"\n",
"\n",
"def generate(\n",
" img,\n",
" pose_vid,\n",
Expand Down Expand Up @@ -1573,48 +1574,24 @@
" n_rows=3,\n",
" fps=12,\n",
" )\n",
" return out_path\n",
"\n",
"\n",
"demo = gr.Interface(\n",
" generate,\n",
" [\n",
" gr.Image(label=\"Reference Image\", type=\"pil\"),\n",
" gr.Video(label=\"Pose video\"),\n",
" gr.Slider(\n",
" label=\"Seed\",\n",
" value=42,\n",
" minimum=np.iinfo(np.int32).min,\n",
" maximum=np.iinfo(np.int32).max,\n",
" ),\n",
" gr.Slider(label=\"Guidance scale\", value=3.5, minimum=1.1, maximum=10),\n",
" gr.Slider(label=\"Number of inference steps\", value=30, minimum=15, maximum=100),\n",
" ],\n",
" \"video\",\n",
" examples=[\n",
" [\n",
" \"Moore-AnimateAnyone/configs/inference/ref_images/anyone-2.png\",\n",
" \"Moore-AnimateAnyone/configs/inference/pose_videos/anyone-video-2_kps.mp4\",\n",
" ],\n",
" [\n",
" \"Moore-AnimateAnyone/configs/inference/ref_images/anyone-10.png\",\n",
" \"Moore-AnimateAnyone/configs/inference/pose_videos/anyone-video-1_kps.mp4\",\n",
" ],\n",
" [\n",
" \"Moore-AnimateAnyone/configs/inference/ref_images/anyone-11.png\",\n",
" \"Moore-AnimateAnyone/configs/inference/pose_videos/anyone-video-1_kps.mp4\",\n",
" ],\n",
" [\n",
" \"Moore-AnimateAnyone/configs/inference/ref_images/anyone-3.png\",\n",
" \"Moore-AnimateAnyone/configs/inference/pose_videos/anyone-video-2_kps.mp4\",\n",
" ],\n",
" [\n",
" \"Moore-AnimateAnyone/configs/inference/ref_images/anyone-5.png\",\n",
" \"Moore-AnimateAnyone/configs/inference/pose_videos/anyone-video-2_kps.mp4\",\n",
" ],\n",
" ],\n",
" allow_flagging=\"never\",\n",
")\n",
" return out_path"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "2832a501-a8cb-4a44-a249-846f8524e3d6",
"metadata": {},
"outputs": [],
"source": [
"if not Path(\"gradio_helper.py\").exists():\n",
" r = requests.get(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/animate-anyone/gradio_helper.py\")\n",
" open(\"gradio_helper.py\", \"w\").write(r.text)\n",
"\n",
"from gradio_helper import make_demo\n",
"\n",
"demo = make_demo(fn=generate)\n",
"\n",
"try:\n",
" demo.queue().launch(debug=True)\n",
"except Exception:\n",
Expand Down
48 changes: 48 additions & 0 deletions notebooks/animate-anyone/gradio_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from typing import Callable
import gradio as gr
import numpy as np

# Directories (relative to the notebook) holding the bundled demo assets.
_REF_IMAGES_DIR = "Moore-AnimateAnyone/configs/inference/ref_images"
_POSE_VIDEOS_DIR = "Moore-AnimateAnyone/configs/inference/pose_videos"

# (reference image id, pose video id) pairs shown as clickable examples.
_EXAMPLE_IDS = [(2, 2), (10, 1), (11, 1), (3, 2), (5, 2)]

# Each example is [reference image path, pose video path], matching the
# first two inputs of the Gradio interface.
examples = [
    [
        f"{_REF_IMAGES_DIR}/anyone-{image_id}.png",
        f"{_POSE_VIDEOS_DIR}/anyone-video-{video_id}_kps.mp4",
    ]
    for image_id, video_id in _EXAMPLE_IDS
]


def make_demo(fn: Callable):
    """Build the Gradio demo for the Animate Anyone pipeline.

    Parameters
    ----------
    fn : Callable
        Generation function taking (reference image, pose video, seed,
        guidance scale, number of inference steps) and returning a path
        to the produced video.

    Returns
    -------
    The constructed ``gr.Interface`` (caller is responsible for launching it).
    """
    # Seed may be any 32-bit signed integer.
    int32_info = np.iinfo(np.int32)
    seed_slider = gr.Slider(
        label="Seed",
        value=42,
        minimum=int32_info.min,
        maximum=int32_info.max,
    )
    guidance_slider = gr.Slider(label="Guidance scale", value=3.5, minimum=1.1, maximum=10)
    steps_slider = gr.Slider(label="Number of inference steps", value=30, minimum=15, maximum=100)

    interface = gr.Interface(
        fn=fn,
        inputs=[
            gr.Image(label="Reference Image", type="pil"),
            gr.Video(label="Pose video"),
            seed_slider,
            guidance_slider,
            steps_slider,
        ],
        outputs="video",
        examples=examples,
        allow_flagging="never",
    )
    return interface
137 changes: 24 additions & 113 deletions notebooks/bark-text-to-audio/bark-text-to-audio.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1132,12 +1132,12 @@
{
"cell_type": "code",
"execution_count": null,
"id": "590b9db5",
"id": "3637baad",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import gradio as gr\n",
"\n",
"from bark import SAMPLE_RATE\n",
"from bark.generation import SUPPORTED_LANGS\n",
"\n",
Expand All @@ -1151,124 +1151,35 @@
"PROMPT_LOOKUP[\"Unconditional\"] = None\n",
"PROMPT_LOOKUP[\"Announcer\"] = \"announcer\"\n",
"\n",
"default_text = \"Hello, my name is Suno. And, uh — and I like pizza. [laughs]\\nBut I also have other interests such as playing tic tac toe.\"\n",
"\n",
"title = \"# 🐶 Bark: Text-to-Speech using OpenVINO</div>\"\n",
"\n",
"description = \"\"\"\n",
"Bark is a universal text-to-audio model created by [Suno](http://suno.ai). \\\n",
"Bark can generate highly realistic, multilingual speech as well as other audio - including music, background noise and simple sound effects. \\\n",
"The model output is not censored and the authors do not endorse the opinions in the generated content. \\\n",
"Use at your own risk.\n",
"\"\"\"\n",
"\n",
"article = \"\"\"\n",
"\n",
"## 🌎 Foreign Language\n",
"\n",
"Bark supports various languages out-of-the-box and automatically determines language from input text. \\\n",
"When prompted with code-switched text, Bark will even attempt to employ the native accent for the respective languages in the same voice.\n",
"\n",
"Try the prompt:\n",
"\n",
"```\n",
"Buenos días Miguel. Tu colega piensa que tu alemán es extremadamente malo. But I suppose your english isn't terrible.\n",
"```\n",
"\n",
"## 🤭 Non-Speech Sounds\n",
"\n",
"Below is a list of some known non-speech sounds, but we are finding more every day. \\\n",
"Please let us know if you find patterns that work particularly well on Discord!\n",
"\n",
"* [laughter]\n",
"* [laughs]\n",
"* [sighs]\n",
"* [music]\n",
"* [gasps]\n",
"* [clears throat]\n",
"* — or ... for hesitations\n",
"* ♪ for song lyrics\n",
"* capitalization for emphasis of a word\n",
"* MAN/WOMAN: for bias towards speaker\n",
"\n",
"Try the prompt:\n",
"\n",
"```\n",
"\" [clears throat] Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as... ♪ singing ♪.\"\n",
"```\n",
"\n",
"## 🎶 Music\n",
"Bark can generate all types of audio, and, in principle, doesn't see a difference between speech and music. \\\n",
"Sometimes Bark chooses to generate text as music, but you can help it out by adding music notes around your lyrics.\n",
"\n",
"Try the prompt:\n",
"\n",
"```\n",
"♪ In the jungle, the mighty jungle, the lion barks tonight ♪\n",
"```\n",
"\n",
"## 🧬 Voice Cloning\n",
"\n",
"Bark has the capability to fully clone voices - including tone, pitch, emotion and prosody. \\\n",
"The model also attempts to preserve music, ambient noise, etc. from input audio. \\\n",
"However, to mitigate misuse of this technology, we limit the audio history prompts to a limited set of Suno-provided, fully synthetic options to choose from.\n",
"\n",
"## 👥 Speaker Prompts\n",
"\n",
"You can provide certain speaker prompts such as NARRATOR, MAN, WOMAN, etc. \\\n",
"Please note that these are not always respected, especially if a conflicting audio history prompt is given.\n",
"\n",
"Try the prompt:\n",
"\n",
"```\n",
"WOMAN: I would like an oatmilk latte please.\n",
"MAN: Wow, that's expensive!\n",
"```\n",
"\n",
"\"\"\"\n",
"\n",
"examples = [\n",
" [\n",
" \"Please surprise me and speak in whatever voice you enjoy. Vielen Dank und Gesundheit!\",\n",
" \"Unconditional\",\n",
" ],\n",
" [\n",
" \"Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as playing tic tac toe.\",\n",
" \"Speaker 1 (en)\",\n",
" ],\n",
" [\n",
" \"Buenos días Miguel. Tu colega piensa que tu alemán es extremadamente malo. But I suppose your english isn't terrible.\",\n",
" \"Speaker 0 (es)\",\n",
" ],\n",
"]\n",
"\n",
"\n",
"def gen_tts(text, history_prompt):\n",
" history_prompt = PROMPT_LOOKUP[history_prompt]\n",
" audio_arr = generate_audio(text, history_prompt=history_prompt)\n",
" audio_arr = (audio_arr * 32767).astype(np.int16)\n",
" return (SAMPLE_RATE, audio_arr)\n",
"\n",
"\n",
"with gr.Blocks() as block:\n",
" gr.Markdown(title)\n",
" gr.Markdown(description)\n",
" with gr.Row():\n",
" with gr.Column():\n",
" input_text = gr.Textbox(label=\"Input Text\", lines=2, value=default_text)\n",
" options = gr.Dropdown(AVAILABLE_PROMPTS, value=\"Speaker 1 (en)\", label=\"Acoustic Prompt\")\n",
" run_button = gr.Button()\n",
" with gr.Column():\n",
" audio_out = gr.Audio(label=\"Generated Audio\", type=\"numpy\")\n",
" inputs = [input_text, options]\n",
" outputs = [audio_out]\n",
" gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs)\n",
" gr.Markdown(article)\n",
" run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)\n",
" return (SAMPLE_RATE, audio_arr)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "590b9db5",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"\n",
"if not Path(\"gradio_helper.py\").exists():\n",
" r = requests.get(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/bark-text-to-audio/gradio_helper.py\")\n",
" open(\"gradio_helper.py\", \"w\").write(r.text)\n",
"\n",
"from gradio_helper import make_demo\n",
"\n",
"demo = make_demo(fn=gen_tts, available_prompts=AVAILABLE_PROMPTS)\n",
"\n",
"try:\n",
" block.launch(debug=True)\n",
" demo.launch(debug=True)\n",
"except Exception:\n",
" block.launch(share=True, debug=True)\n",
" demo.launch(share=True, debug=True)\n",
"# if you are launching remotely, specify server_name and server_port\n",
"# demo.launch(server_name='your server name', server_port='server port in int')\n",
"# Read more in the docs: https://gradio.app/docs/"
Expand Down
Loading

0 comments on commit 993665b

Please sign in to comment.