dependencies = ['torch', 'torchaudio', 'numpy', 'omegaconf']
import torch
from torch import Tensor
import torch.nn as nn
import torch.nn.functional as F
import logging
import os
import sys
from pathlib import Path
from urllib.request import urlopen, Request
from urllib.parse import urlparse
from omegaconf import OmegaConf
from density.models import RP_W


class Conv2SC09Wrapper(nn.Module):

    def __init__(self, g: nn.Module, hifigan: nn.Module, cfg, seq_len=16000) -> None:
        """ Inference wrapper around generator `g` and `hifigan` vocoder for the
        conv2 series of models, which predict a sequence of HuBERT vectors that is
        then vocoded to a waveform by a hifigan trained on HuBERT vectors.
        """
        super().__init__()
        self.g = g
        self.z_dim = self.g.z_dim
        self.w_dim = self.g.w_dim
        self.hifigan = hifigan
        self.cfg = cfg
        self.seq_len = seq_len

    def _vocode(self, c: Tensor) -> Tensor:
        """ Vocode predicted HuBERT vectors `c` and zero-pad the waveform to `seq_len` samples. """
        audio = self.hifigan(c).squeeze(1)
        n_pad = self.seq_len - audio.shape[-1]
        if n_pad > 0:
            audio = F.pad(audio, (n_pad // 2, n_pad - n_pad // 2), value=0)
        return audio

    def unconditional_generate(self, N: int) -> Tensor:
        """ Generate `N` audio samples, returning a tensor of shape (N, 16000) """
        z = torch.randn(N, self.g.z_dim).to(self.g.preconv.bias.device)
        c = self.g(z, self.cfg.rp_w_cfg.seq_len)
        return self._vocode(c)

    def generate_from_latent(self, z: Tensor) -> Tensor:
        """ Generate waveforms (N, 16000) from standard normal latents `z` (N, z_dim) """
        z = z.to(self.g.preconv.bias.device)
        c = self.g(z, self.cfg.rp_w_cfg.seq_len)
        return self._vocode(c)

    def z2w(self, z: Tensor) -> Tensor:
        """ Map standard normal latents `z` (N, z_dim) to W-space vectors (N, w_dim) """
        z = z.to(self.g.preconv.bias.device)
        return self.g.W(z)

    def generate_from_w(self, w: Tensor) -> Tensor:
        """ Generate waveforms (N, 16000) from W-space latents `w` (N, w_dim) """
        w = w.to(self.g.preconv.bias.device)
        c = self.g.forward_w(w, self.cfg.rp_w_cfg.seq_len)
        return self._vocode(c)
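

# Example (a sketch): given a wrapper `model` from asgan_hubert_sc09_6 below, the
# W-space methods compose as
#   z = torch.randn(8, model.z_dim)   # standard normal latents
#   w = model.z2w(z)                  # map z to W space
#   audio = model.generate_from_w(w)  # (8, 16000) waveforms
# which follows the same generator-then-vocoder path as model.generate_from_latent(z).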


def asgan_hubert_sc09_6(pretrained=True, progress=True, device='cuda'):
    """ ASGAN SC09 model which generates a sequence of HuBERT features from standard
    normal vectors and then uses hifigan to vocode them back to the time domain.
    """
    if not torch.cuda.is_available() and str(device) != 'cpu':
        logging.warning(f"Overriding device {device} to cpu since no GPU is available.")
        device = 'cpu'
    ckpt = torch.hub.load_state_dict_from_url(
        "https://github.com/RF5/simple-asgan/releases/download/v0/asgan_hubert_ckpt_00520000_slim.pt",
        map_location=device,
        progress=progress
    )
    cfg = OmegaConf.create(ckpt['cfg_yaml'])
    cfg.device = 'cpu'
    if cfg.data_type == 'melspec':
        hifigan = torch.hub.load("bshall/hifigan:main", "hifigan", map_location='cpu').cpu().eval()
    else:
        hifigan = hubert_hifigan(pretrained=pretrained, progress=progress, device='cpu')
    print(f"HiFiGAN loaded with {sum(p.numel() for p in hifigan.parameters()):,d} parameters.")
    g = RP_W(cfg.rp_w_cfg).cpu().eval()
    if pretrained:
        g.load_state_dict(ckpt['g_ema_state_dict'])
    model = Conv2SC09Wrapper(g, hifigan, cfg)
    model = model.to(device).eval()
    return model
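

# Example usage via torch.hub (assumes this file is the hubconf.py of the
# RF5/simple-asgan repo referenced by the release URLs above):
#   model = torch.hub.load('RF5/simple-asgan', 'asgan_hubert_sc09_6', device='cpu')
#   audio = model.unconditional_generate(4)  # (4, 16000) waveforms at 16 kHz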


def hubert_hifigan(pretrained=True, progress=True, device='cuda', model_dir=None):
    """ HiFiGAN which operates on HuBERT embeddings. Optionally specify `model_dir`
    as the directory in which the HiFiGAN checkpoint is cached. """
    if not pretrained:
        raise NotImplementedError("Only the pretrained model is supported.")
    if model_dir is None:
        hub_dir = torch.hub.get_dir()
        model_dir = os.path.join(hub_dir, 'checkpoints')
    url = "https://github.com/RF5/simple-asgan/releases/download/v0/g_02365000_package.pth"
    parts = urlparse(url)
    filename = os.path.basename(parts.path)
    cached_file = os.path.join(model_dir, filename)
    if not os.path.exists(cached_file):
        sys.stderr.write(f'Downloading: "{url}" to {cached_file}\n')
        torch.hub.download_url_to_file(url, cached_file, hash_prefix=None, progress=progress)
    pi = torch.package.package_importer.PackageImporter(cached_file)
    hifigan = pi.load_pickle('.', 'hifigan_pkl.pt', map_location=device)
    hifigan.eval()
    logging.info(f"[MODEL] HubertHiFiGAN loaded with {sum(p.numel() for p in hifigan.parameters()):,d} parameters")
    del pi
    return hifigan
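

# Example (a sketch; `c` is assumed to be a batch of HuBERT feature sequences in
# the layout produced by the ASGAN generator above):
#   hifigan = hubert_hifigan(device='cpu')
#   audio = hifigan(c).squeeze(1)  # one waveform per row of `c`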


def hubert_base(pretrained=True, progress=True, device='cuda'):
    """ Facebook's HuBERT BASE model """
    if not pretrained:
        raise NotImplementedError("Only the pretrained model is supported.")
    # load_state_dict_from_url is used purely to download and cache the checkpoint;
    # HubertFeatureReader loads it again from the cached path below.
    _ = torch.hub.load_state_dict_from_url(
        "https://github.com/RF5/simple-asgan/releases/download/v0/hubert_base_ls960_nofinetune.pt",
        map_location='cpu', progress=progress
    )
    base_path = str(Path(torch.hub.get_dir()) / 'checkpoints' / 'hubert_base_ls960_nofinetune.pt')
    from hubert_feature_reader import HubertFeatureReader
    hubert = HubertFeatureReader(base_path, 6, device=device)  # features from layer 6
    logging.info(f"[MODEL] HuBERT loaded with {sum(p.numel() for p in hubert.model.parameters()):,d} parameters")
    return hubert
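

# Example (a sketch; a get_feats(path) accessor is assumed here for illustration --
# the exact method name depends on the local hubert_feature_reader module):
#   hubert = hubert_base(device='cpu')
#   feats = hubert.get_feats('utterance.wav')  # layer-6 features for a 16 kHz wav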