Error when using Nvidia GPU. #50

Open
Toolfolks opened this issue Jul 21, 2024 · 2 comments

Toolfolks commented Jul 21, 2024

I have the following command working okay:

python inference_for_demo_video.py --wav_path data/audio/acknowledgement_english.m4a --style_clip_path data/style_clip/3DMM/M030_front_neutral_level1_001.mat --pose_path data/pose/RichardShelby_front_neutral_level1_001.mat --image_path data/src_img/uncropped/male_face.png --cfg_scale 1.0 --max_gen_len 30 --output_name acknowledgement_english@M030_front_neutral_level1_001@male_face --device cuda

(new_dreamtalk) D:\techy\talkingHeads\dreamtalk>python testGpu.py
WAV Path: D:\techy\talkingHeads\dreamtalk\data\audio\acknowledgement_english.m4a
Output Path: D:\techy\talkingHeads\dreamtalk\tmp\acknowledgement_english@M030_front_neutral_level1_001@male_face\acknowledgement_english@M030_front_neutral_level1_001@male_face_16K.wav
PyTorch Version: 2.3.1+cpu
CUDA Available: False
CUDA Version: None
No CUDA device found.
NumPy Version: 1.22.4
SciPy Version: 1.13.1
Torchaudio Version: 2.3.1+cpu
OpenCV Version: 4.4.0
Available backends after updating PATH: ['soundfile']
However, when I switch to an environment with a GPU:

(dreamtalk) D:\techy\talkingHeads\dreamtalk>python testGpu.py
WAV Path: D:\techy\talkingHeads\dreamtalk\data\audio\acknowledgement_english.m4a
Output Path: D:\techy\talkingHeads\dreamtalk\tmp\acknowledgement_english@M030_front_neutral_level1_001@male_face\acknowledgement_english@M030_front_neutral_level1_001@male_face_16K.wav
PyTorch Version: 2.3.1+cu121
CUDA Available: True
CUDA Version: 12.1
Device Name: NVIDIA GeForce RTX 3060
NumPy Version: 1.22.4
SciPy Version: 1.10.0
Torchaudio Version: 2.3.1+cu121
OpenCV Version: 4.10.0
Available backends after updating PATH: ['soundfile']

I get this error:
Traceback (most recent call last):
File "inference_for_demo_video.py", line 187, in
inference_one_video(
File "C:\Users\User.conda\envs\dreamtalk\lib\site-packages\torch\utils_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "inference_for_demo_video.py", line 88, in inference_one_video
gen_exp_stack = diff_net.sample(
File "D:\techy\talkingHeads\dreamtalk\core\networks\diffusion_net.py", line 216, in sample
return self.ddim_sample(
File "D:\techy\talkingHeads\dreamtalk\core\networks\diffusion_net.py", line 144, in ddim_sample
"style_clip": torch.cat([style_clip, uncond_style_clip], dim=0),
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument tensors in method wrapper_CUDA_cat)

I'm new to programming, and I have spent hours going round in circles with ChatGPT.

Anyone have a solution?

Toolfolks (Author) commented:

I finally got it going after 2-3 days. It was a pain getting all the installs done.
The key fix was adding .to(device) calls so the model and every input tensor end up on the same device.
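
Concretely, these are the relevant .to(device) lines (same variable names as the full script further down); the diffusion model and all of its inputs have to be on the same device before diff_net.sample is called:

# device comes from torch.device(args.device); diff_net, audio_win, style_clip_raw
# and style_pad_mask_raw are defined in the full script below.
diff_net.to(device).eval()                      # move the diffusion model onto the device
audio = audio_win.unsqueeze(0).to(device)       # audio features
style_clip = style_clip_raw.unsqueeze(0).to(device)
style_pad_mask = (
    style_pad_mask_raw.unsqueeze(0).to(device)  # optional padding mask
    if style_pad_mask_raw is not None
    else None
)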

(dreamtalk38) D:\techy\talkingHeads\dreamtalk>python testGpu.py
10.0.0
WAV Path: D:\techy\talkingHeads\dreamtalk\data\audio\acknowledgement_english.m4a
Output Path: D:\techy\talkingHeads\dreamtalk\tmp\acknowledgement_english@M030_front_neutral_level1_001@male_face\acknowledgement_english@M030_front_neutral_level1_001@male_face_16K.wav
PyTorch Version: 2.3.1
CUDA Available: True
CUDA Version: 11.8
Device Name: NVIDIA GeForce RTX 3060
number of devices 1
NumPy Version: 1.22.4
SciPy Version: 1.7.3
Torchaudio Version: 2.3.1
OpenCV Version: 4.4.0
Available backends after updating PATH: ['soundfile']
Python 3.8.19

Hope this helps someone.

import argparse
import os
import shutil
import subprocess
import numpy as np
import torch
import librosa
from scipy.io import loadmat
from transformers import Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2Model

from configs.default import get_cfg_defaults
from core.networks.diffusion_net import DiffusionNet
from core.networks.diffusion_util import NoisePredictor, VarianceSchedule
from core.utils import (
    crop_src_image,
    get_pose_params,
    get_video_style_clip,
    get_wav2vec_audio_window,
)
from generators.utils import get_netG, render_video

def print_tensor_device(tensor, tensor_name):
    if isinstance(tensor, torch.Tensor):
        print(f"{tensor_name} is on device: {tensor.device}")
    elif isinstance(tensor, (list, tuple)):
        for i, t in enumerate(tensor):
            if isinstance(t, torch.Tensor):
                print(f"{tensor_name}[{i}] is on device: {t.device}")
            else:
                print(f"{tensor_name}[{i}] is not a tensor")
    else:
        print(f"{tensor_name} is not a tensor")

@torch.no_grad()
def get_diff_net(cfg, device):
    diff_net = DiffusionNet(
        cfg=cfg,
        net=NoisePredictor(cfg),
        var_sched=VarianceSchedule(
            num_steps=cfg.DIFFUSION.SCHEDULE.NUM_STEPS,
            beta_1=cfg.DIFFUSION.SCHEDULE.BETA_1,
            beta_T=cfg.DIFFUSION.SCHEDULE.BETA_T,
            mode=cfg.DIFFUSION.SCHEDULE.MODE,
        ),
    )
    checkpoint = torch.load(cfg.INFERENCE.CHECKPOINT, map_location=device)
    model_state_dict = checkpoint["model_state_dict"]
    diff_net_dict = {
        k[9:]: v for k, v in model_state_dict.items() if k.startswith("diff_net.")
    }
    diff_net.load_state_dict(diff_net_dict, strict=True)
    diff_net.to(device).eval()  # Ensure model is on the correct device
    return diff_net

@torch.no_grad()
def get_audio_feat(wav_path, output_name, wav2vec_model):
    # Placeholder function
    pass

@torch.no_grad()
def inference_one_video(
    cfg,
    audio_path,
    style_clip_path,
    pose_path,
    output_path,
    diff_net,
    device,
    max_audio_len=None,
    sample_method="ddim",
    ddim_num_step=10,
):
    audio_raw = np.load(audio_path)
    if max_audio_len is not None:
        audio_raw = audio_raw[:max_audio_len * 50]
    gen_num_frames = len(audio_raw) // 2

    audio_win_array = get_wav2vec_audio_window(
        audio_raw,
        start_idx=0,
        num_frames=gen_num_frames,
        win_size=cfg.WIN_SIZE,
    )

    audio_win = torch.tensor(audio_win_array).to(device)
    audio = audio_win.unsqueeze(0).to(device)
    print_tensor_device(audio, "audio")

    style_clip_raw, style_pad_mask_raw = get_video_style_clip(
        style_clip_path, "", style_max_len=256, start_idx=0
    )

    style_clip = style_clip_raw.unsqueeze(0).to(device)
    print_tensor_device(style_clip, "style_clip")
    style_pad_mask = (
        style_pad_mask_raw.unsqueeze(0).to(device)
        if style_pad_mask_raw is not None
        else None
    )
    print_tensor_device(style_pad_mask, "style_pad_mask")

    # Ensure all inputs are on the same device
    gen_exp_stack = diff_net.sample(
        audio,
        style_clip,
        style_pad_mask,
        output_dim=cfg.DATASET.FACE3D_DIM,
        use_cf_guidance=cfg.CF_GUIDANCE.INFERENCE,
        cfg_scale=cfg.CF_GUIDANCE.SCALE,
        sample_method=sample_method,
        ddim_num_step=ddim_num_step,
    )
    gen_exp = gen_exp_stack[0].cpu().numpy()

    pose = get_pose_params(pose_path)
    selected_pose = pose[: len(gen_exp)] if len(pose) >= len(gen_exp) else pose[-1].unsqueeze(0).repeat(len(gen_exp), 1)
    gen_exp_pose = np.concatenate((gen_exp, selected_pose), axis=1)
    np.save(output_path, gen_exp_pose)
    return output_path

if name == "main":
parser = argparse.ArgumentParser(description="Inference for demo")
parser.add_argument("--wav_path", type=str, required=True, help="Path to WAV file")
parser.add_argument("--image_path", type=str, required=True, help="Path to image file")
parser.add_argument("--style_clip_path", type=str, required=True, help="Path to style clip MAT file")
parser.add_argument("--pose_path", type=str, required=True, help="Path to pose file")
parser.add_argument("--max_gen_len", type=int, default=1000, help="Maximum length (in seconds) for generating videos")
parser.add_argument("--cfg_scale", type=float, default=1.0, help="Scale of classifier-free guidance")
parser.add_argument("--output_name", type=str, default="test", help="Name for the output")
parser.add_argument("--device", type=str, choices=['cpu', 'cuda'], default="cpu", help="Device to use for computation")
parser.add_argument("--disable_img_crop", dest="img_crop", action="store_false", help="Disable image cropping")
parser.set_defaults(img_crop=True)

args = parser.parse_args()

if args.device == "cuda" and not torch.cuda.is_available():
    print("CUDA is not available. Switching to CPU.")
    args.device = "cpu"

device = torch.device(args.device)
print(f"Device = {device}")
cfg = get_cfg_defaults()
cfg.CF_GUIDANCE.SCALE = args.cfg_scale
cfg.freeze()

tmp_dir = os.path.join("tmp", args.output_name)
os.makedirs(tmp_dir, exist_ok=True)

# Preprocess the audio file to WAV format with 16kHz sample rate
wav_16k_path = os.path.join(tmp_dir, f"{args.output_name}_16K.wav")
command = [
    "D:\\techy\\talkingHeads\\dreamtalk\\ffmpeg\\bin\\ffmpeg.exe",
    "-y",
    "-i", args.wav_path,
    "-async", "1",
    "-ac", "1",
    "-vn",
    "-acodec", "pcm_s16le",
    "-ar", "16000",
    wav_16k_path
]
print(f"Executing command: {' '.join(command)}")
try:
    subprocess.run(command, check=True)
except subprocess.CalledProcessError as e:
    print(f"Error executing command: {e}")
    exit(1)

# Load and process audio using librosa
wav2vec_processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
wav2vec_model = (
    Wav2Vec2Model.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
    .eval()
    .to(device)
)

try:
    audio_data, _ = librosa.load(wav_16k_path, sr=16000)
except Exception as e:
    print(f"Error loading audio file: {e}")
    exit(1)

inputs = wav2vec_processor(audio_data, sampling_rate=16_000, return_tensors="pt", padding=True)

with torch.no_grad():
    audio_embedding = wav2vec_model(inputs.input_values.to(device), return_dict=False)[0]

audio_feat_path = os.path.join(tmp_dir, f"{args.output_name}_wav2vec.npy")
np.save(audio_feat_path, audio_embedding[0].cpu().numpy())

# Get source image
src_img_path = os.path.join(tmp_dir, "src_img.png")
if args.img_crop:
    crop_src_image(args.image_path, src_img_path, 0.4)
else:
    shutil.copy(args.image_path, src_img_path)

with torch.no_grad():
    # Get diffusion model and load checkpoint
    diff_net = get_diff_net(cfg, device)
    # Generate face motion
    face_motion_path = os.path.join(tmp_dir, f"{args.output_name}_facemotion.npy")
    inference_one_video(
        cfg,
        audio_feat_path,
        args.style_clip_path,
        args.pose_path,
        face_motion_path,
        diff_net,
        device,
        max_audio_len=args.max_gen_len,
    )
    # Get renderer
    renderer = get_netG("checkpoints/renderer.pt", device)
    # Render video
    output_video_path = os.path.join("output_video", f"{args.output_name}.mp4")
    render_video(
        renderer,
        src_img_path,
        face_motion_path,
        wav_16k_path,
        output_video_path,
        device,
        fps=25,
        no_move=False,
    )

    # Add watermark
    watermark = os.path.join(tmp_dir, f"{args.output_name}_watermarked.mp4")
    watermark_command = [
        "ffmpeg",
        "-i", output_video_path,
        "-vf", "drawtext=text='Your Watermark Text':x=10:y=10:fontsize=24:fontcolor=white",
        "-codec:a", "copy",
        watermark
    ]
    try:
        subprocess.run(watermark_command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error adding watermark: {e}")
    
print(f"Processing complete. Output video saved to {watermark}")

nitinmukesh commented:

@Toolfolks

I am struggling to make it work on Windows. Could you please explain a little? I tried comparing the code you posted with the one in the repository, but they are quite different.
