Skip to content

Commit

Permalink
Add Gradio helpers - part 4 (#2315)
Browse files Browse the repository at this point in the history
Ticket: CVS-147626

Notebooks:
1. animate-anyone/animate-anyone.ipynb
2. bark-text-to-audio/bark-text-to-audio.ipynb
3. blip-visual-language-processing/blip-visual-language-processing.ipynb
4. controlnet-stable-diffusion/controlnet-stable-diffusion.ipynb
5. ddcolor-image-colorization/ddcolor-image-colorization.ipynb
6. depth-anything/depth-anything-v2.ipynb
7. depth-anything/depth-anything.ipynb
8. dolly-2-instruction-following/dolly-2-instruction-following.ipynb
9. dynamicrafter-animating-images/dynamicrafter-animating-images.ipynb
  • Loading branch information
yatarkan authored Aug 23, 2024
1 parent 6f0fd17 commit 993665b
Show file tree
Hide file tree
Showing 17 changed files with 658 additions and 553 deletions.
69 changes: 23 additions & 46 deletions notebooks/animate-anyone/animate-anyone.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -165,9 +165,7 @@
"from diffusers.image_processor import VaeImageProcessor\n",
"from transformers import CLIPImageProcessor\n",
"import torch\n",
"import gradio as gr\n",
"import ipywidgets as widgets\n",
"import numpy as np\n",
"\n",
"from src.pipelines.pipeline_pose2vid_long import Pose2VideoPipeline\n",
"from src.utils.util import get_fps, read_frames\n",
Expand Down Expand Up @@ -1523,11 +1521,14 @@
},
{
"cell_type": "code",
"execution_count": 27,
"id": "2832a501-a8cb-4a44-a249-846f8524e3d6",
"execution_count": null,
"id": "6a361d62",
"metadata": {},
"outputs": [],
"source": [
"import gradio as gr\n",
"\n",
"\n",
"def generate(\n",
" img,\n",
" pose_vid,\n",
Expand Down Expand Up @@ -1573,48 +1574,24 @@
" n_rows=3,\n",
" fps=12,\n",
" )\n",
" return out_path\n",
"\n",
"\n",
"demo = gr.Interface(\n",
" generate,\n",
" [\n",
" gr.Image(label=\"Reference Image\", type=\"pil\"),\n",
" gr.Video(label=\"Pose video\"),\n",
" gr.Slider(\n",
" label=\"Seed\",\n",
" value=42,\n",
" minimum=np.iinfo(np.int32).min,\n",
" maximum=np.iinfo(np.int32).max,\n",
" ),\n",
" gr.Slider(label=\"Guidance scale\", value=3.5, minimum=1.1, maximum=10),\n",
" gr.Slider(label=\"Number of inference steps\", value=30, minimum=15, maximum=100),\n",
" ],\n",
" \"video\",\n",
" examples=[\n",
" [\n",
" \"Moore-AnimateAnyone/configs/inference/ref_images/anyone-2.png\",\n",
" \"Moore-AnimateAnyone/configs/inference/pose_videos/anyone-video-2_kps.mp4\",\n",
" ],\n",
" [\n",
" \"Moore-AnimateAnyone/configs/inference/ref_images/anyone-10.png\",\n",
" \"Moore-AnimateAnyone/configs/inference/pose_videos/anyone-video-1_kps.mp4\",\n",
" ],\n",
" [\n",
" \"Moore-AnimateAnyone/configs/inference/ref_images/anyone-11.png\",\n",
" \"Moore-AnimateAnyone/configs/inference/pose_videos/anyone-video-1_kps.mp4\",\n",
" ],\n",
" [\n",
" \"Moore-AnimateAnyone/configs/inference/ref_images/anyone-3.png\",\n",
" \"Moore-AnimateAnyone/configs/inference/pose_videos/anyone-video-2_kps.mp4\",\n",
" ],\n",
" [\n",
" \"Moore-AnimateAnyone/configs/inference/ref_images/anyone-5.png\",\n",
" \"Moore-AnimateAnyone/configs/inference/pose_videos/anyone-video-2_kps.mp4\",\n",
" ],\n",
" ],\n",
" allow_flagging=\"never\",\n",
")\n",
" return out_path"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "2832a501-a8cb-4a44-a249-846f8524e3d6",
"metadata": {},
"outputs": [],
"source": [
"if not Path(\"gradio_helper.py\").exists():\n",
" r = requests.get(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/animate-anyone/gradio_helper.py\")\n",
" open(\"gradio_helper.py\", \"w\").write(r.text)\n",
"\n",
"from gradio_helper import make_demo\n",
"\n",
"demo = make_demo(fn=generate)\n",
"\n",
"try:\n",
" demo.queue().launch(debug=True)\n",
"except Exception:\n",
Expand Down
48 changes: 48 additions & 0 deletions notebooks/animate-anyone/gradio_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from typing import Callable
import gradio as gr
import numpy as np

# Directories (relative to the notebook) holding the bundled demo assets.
_REF_IMAGES_DIR = "Moore-AnimateAnyone/configs/inference/ref_images"
_POSE_VIDEOS_DIR = "Moore-AnimateAnyone/configs/inference/pose_videos"

# (reference image id, pose video id) pairs shown as clickable examples.
_EXAMPLE_IDS = [(2, 2), (10, 1), (11, 1), (3, 2), (5, 2)]

# Each example is [reference image path, pose video path], matching the
# first two inputs of the Gradio interface.
examples = [
    [
        f"{_REF_IMAGES_DIR}/anyone-{image_id}.png",
        f"{_POSE_VIDEOS_DIR}/anyone-video-{video_id}_kps.mp4",
    ]
    for image_id, video_id in _EXAMPLE_IDS
]


def make_demo(fn: Callable):
    """Build the Gradio demo for the Animate Anyone pipeline.

    Parameters
    ----------
    fn : Callable
        Generation function taking (reference image, pose video, seed,
        guidance scale, number of inference steps) and returning a path
        to the produced video.

    Returns
    -------
    The constructed ``gr.Interface`` (caller is responsible for launching it).
    """
    # Seed may be any 32-bit signed integer.
    int32_info = np.iinfo(np.int32)
    seed_slider = gr.Slider(
        label="Seed",
        value=42,
        minimum=int32_info.min,
        maximum=int32_info.max,
    )
    guidance_slider = gr.Slider(label="Guidance scale", value=3.5, minimum=1.1, maximum=10)
    steps_slider = gr.Slider(label="Number of inference steps", value=30, minimum=15, maximum=100)

    interface = gr.Interface(
        fn=fn,
        inputs=[
            gr.Image(label="Reference Image", type="pil"),
            gr.Video(label="Pose video"),
            seed_slider,
            guidance_slider,
            steps_slider,
        ],
        outputs="video",
        examples=examples,
        allow_flagging="never",
    )
    return interface
137 changes: 24 additions & 113 deletions notebooks/bark-text-to-audio/bark-text-to-audio.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1132,12 +1132,12 @@
{
"cell_type": "code",
"execution_count": null,
"id": "590b9db5",
"id": "3637baad",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import gradio as gr\n",
"\n",
"from bark import SAMPLE_RATE\n",
"from bark.generation import SUPPORTED_LANGS\n",
"\n",
Expand All @@ -1151,124 +1151,35 @@
"PROMPT_LOOKUP[\"Unconditional\"] = None\n",
"PROMPT_LOOKUP[\"Announcer\"] = \"announcer\"\n",
"\n",
"default_text = \"Hello, my name is Suno. And, uh — and I like pizza. [laughs]\\nBut I also have other interests such as playing tic tac toe.\"\n",
"\n",
"title = \"# 🐶 Bark: Text-to-Speech using OpenVINO</div>\"\n",
"\n",
"description = \"\"\"\n",
"Bark is a universal text-to-audio model created by [Suno](http://suno.ai). \\\n",
"Bark can generate highly realistic, multilingual speech as well as other audio - including music, background noise and simple sound effects. \\\n",
"The model output is not censored and the authors do not endorse the opinions in the generated content. \\\n",
"Use at your own risk.\n",
"\"\"\"\n",
"\n",
"article = \"\"\"\n",
"\n",
"## 🌎 Foreign Language\n",
"\n",
"Bark supports various languages out-of-the-box and automatically determines language from input text. \\\n",
"When prompted with code-switched text, Bark will even attempt to employ the native accent for the respective languages in the same voice.\n",
"\n",
"Try the prompt:\n",
"\n",
"```\n",
"Buenos días Miguel. Tu colega piensa que tu alemán es extremadamente malo. But I suppose your english isn't terrible.\n",
"```\n",
"\n",
"## 🤭 Non-Speech Sounds\n",
"\n",
"Below is a list of some known non-speech sounds, but we are finding more every day. \\\n",
"Please let us know if you find patterns that work particularly well on Discord!\n",
"\n",
"* [laughter]\n",
"* [laughs]\n",
"* [sighs]\n",
"* [music]\n",
"* [gasps]\n",
"* [clears throat]\n",
"* — or ... for hesitations\n",
"* ♪ for song lyrics\n",
"* capitalization for emphasis of a word\n",
"* MAN/WOMAN: for bias towards speaker\n",
"\n",
"Try the prompt:\n",
"\n",
"```\n",
"\" [clears throat] Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as... ♪ singing ♪.\"\n",
"```\n",
"\n",
"## 🎶 Music\n",
"Bark can generate all types of audio, and, in principle, doesn't see a difference between speech and music. \\\n",
"Sometimes Bark chooses to generate text as music, but you can help it out by adding music notes around your lyrics.\n",
"\n",
"Try the prompt:\n",
"\n",
"```\n",
"♪ In the jungle, the mighty jungle, the lion barks tonight ♪\n",
"```\n",
"\n",
"## 🧬 Voice Cloning\n",
"\n",
"Bark has the capability to fully clone voices - including tone, pitch, emotion and prosody. \\\n",
"The model also attempts to preserve music, ambient noise, etc. from input audio. \\\n",
"However, to mitigate misuse of this technology, we limit the audio history prompts to a limited set of Suno-provided, fully synthetic options to choose from.\n",
"\n",
"## 👥 Speaker Prompts\n",
"\n",
"You can provide certain speaker prompts such as NARRATOR, MAN, WOMAN, etc. \\\n",
"Please note that these are not always respected, especially if a conflicting audio history prompt is given.\n",
"\n",
"Try the prompt:\n",
"\n",
"```\n",
"WOMAN: I would like an oatmilk latte please.\n",
"MAN: Wow, that's expensive!\n",
"```\n",
"\n",
"\"\"\"\n",
"\n",
"examples = [\n",
" [\n",
" \"Please surprise me and speak in whatever voice you enjoy. Vielen Dank und Gesundheit!\",\n",
" \"Unconditional\",\n",
" ],\n",
" [\n",
" \"Hello, my name is Suno. And, uh — and I like pizza. [laughs] But I also have other interests such as playing tic tac toe.\",\n",
" \"Speaker 1 (en)\",\n",
" ],\n",
" [\n",
" \"Buenos días Miguel. Tu colega piensa que tu alemán es extremadamente malo. But I suppose your english isn't terrible.\",\n",
" \"Speaker 0 (es)\",\n",
" ],\n",
"]\n",
"\n",
"\n",
"def gen_tts(text, history_prompt):\n",
" history_prompt = PROMPT_LOOKUP[history_prompt]\n",
" audio_arr = generate_audio(text, history_prompt=history_prompt)\n",
" audio_arr = (audio_arr * 32767).astype(np.int16)\n",
" return (SAMPLE_RATE, audio_arr)\n",
"\n",
"\n",
"with gr.Blocks() as block:\n",
" gr.Markdown(title)\n",
" gr.Markdown(description)\n",
" with gr.Row():\n",
" with gr.Column():\n",
" input_text = gr.Textbox(label=\"Input Text\", lines=2, value=default_text)\n",
" options = gr.Dropdown(AVAILABLE_PROMPTS, value=\"Speaker 1 (en)\", label=\"Acoustic Prompt\")\n",
" run_button = gr.Button()\n",
" with gr.Column():\n",
" audio_out = gr.Audio(label=\"Generated Audio\", type=\"numpy\")\n",
" inputs = [input_text, options]\n",
" outputs = [audio_out]\n",
" gr.Examples(examples=examples, fn=gen_tts, inputs=inputs, outputs=outputs)\n",
" gr.Markdown(article)\n",
" run_button.click(fn=gen_tts, inputs=inputs, outputs=outputs, queue=True)\n",
" return (SAMPLE_RATE, audio_arr)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "590b9db5",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"\n",
"if not Path(\"gradio_helper.py\").exists():\n",
" r = requests.get(url=\"https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/bark-text-to-audio/gradio_helper.py\")\n",
" open(\"gradio_helper.py\", \"w\").write(r.text)\n",
"\n",
"from gradio_helper import make_demo\n",
"\n",
"demo = make_demo(fn=gen_tts, available_prompts=AVAILABLE_PROMPTS)\n",
"\n",
"try:\n",
" block.launch(debug=True)\n",
" demo.launch(debug=True)\n",
"except Exception:\n",
" block.launch(share=True, debug=True)\n",
" demo.launch(share=True, debug=True)\n",
"# if you are launching remotely, specify server_name and server_port\n",
"# demo.launch(server_name='your server name', server_port='server port in int')\n",
"# Read more in the docs: https://gradio.app/docs/"
Expand Down
Loading

0 comments on commit 993665b

Please sign in to comment.