Skip to content

Commit

Permalink
Fixes to discrete-symbol-based vocoder (#394)
Browse files Browse the repository at this point in the history
Co-authored-by: Tomoki Hayashi <hayashi.tomoki@g.sp.m.is.nagoya-u.ac.jp>
  • Loading branch information
ftshijt and kan-bayashi authored Feb 20, 2023
1 parent ee7e4ec commit ffaa99f
Show file tree
Hide file tree
Showing 6 changed files with 26 additions and 26 deletions.
6 changes: 3 additions & 3 deletions egs/cvss_c/hubert_voc1/conf/hifigan_hubert_duration.v1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ generator_params:
num_embs: 500
kernel_size: 7 # Kernel size of initial and final conv layers.
upsample_scales: [10, 8, 2, 2] # Upsampling scales.
upsample_kernal_sizes: [20, 16, 4, 4] # Kernel size for upsampling layers.
upsample_kernel_sizes: [20, 16, 4, 4] # Kernel size for upsampling layers.
resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks.
resblock_dilations: # Dilations for residual blocks.
- [1, 3, 5]
Expand Down Expand Up @@ -64,7 +64,7 @@ discriminator_params:
scale_discriminator_params:
in_channels: 1 # Number of input channels.
out_channels: 1 # Number of output channels.
kernel_sizes: [15, 41, 5, 3] # List of kernal sizes.
kernel_sizes: [15, 41, 5, 3] # List of kernel sizes.
channels: 128 # Initial number of channels.
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
max_groups: 16 # Maximum number of groups in downsampling conv layers.
Expand All @@ -78,7 +78,7 @@ discriminator_params:
period_discriminator_params:
in_channels: 1 # Number of input channels.
out_channels: 1 # Number of output channels.
kernel_sizes: [5, 3] # List of kernal sizes.
kernel_sizes: [5, 3] # List of kernel sizes.
channels: 32 # Initial number of channels.
downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ generator_params:
num_spk_embs: 0 # Do not consider speaker embedding for single spk
kernel_size: 7 # Kernel size of initial and final conv layers.
upsample_scales: [12, 10, 2, 2] # Upsampling scales.
upsample_kernal_sizes: [24, 20, 4, 4] # Kernel size for upsampling layers.
upsample_kernel_sizes: [24, 20, 4, 4] # Kernel size for upsampling layers.
resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks.
resblock_dilations: # Dilations for residual blocks.
- [1, 3, 5]
Expand Down Expand Up @@ -69,7 +69,7 @@ discriminator_params:
scale_discriminator_params:
in_channels: 1 # Number of input channels.
out_channels: 1 # Number of output channels.
kernel_sizes: [15, 41, 5, 3] # List of kernal sizes.
kernel_sizes: [15, 41, 5, 3] # List of kernel sizes.
channels: 128 # Initial number of channels.
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
max_groups: 16 # Maximum number of groups in downsampling conv layers.
Expand All @@ -83,7 +83,7 @@ discriminator_params:
period_discriminator_params:
in_channels: 1 # Number of input channels.
out_channels: 1 # Number of output channels.
kernel_sizes: [5, 3] # List of kernal sizes.
kernel_sizes: [5, 3] # List of kernel sizes.
channels: 32 # Initial number of channels.
downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
Expand Down
2 changes: 1 addition & 1 deletion egs/cvss_c/hubert_voc1/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ n_gpus=1 # number of gpus in training
n_jobs=16 # number of parallel jobs in feature extraction

# NOTE(kan-bayashi): renamed to conf to avoid conflict in parse_options.sh
conf=conf/hifigan_hubert.v1.yaml
conf=conf/hifigan_hubert_duration.v1.yaml

# directory path setting
db_root=/usr0/home/jiatongs/data/cvss/es_en-c # directory including wavfiles (MODIFY BY YOURSELF)
Expand Down
6 changes: 3 additions & 3 deletions egs/vctk/hubert_voc1/conf/hifigan_hubert.v1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ generator_params:
concat_spk_emb: false
kernel_size: 7 # Kernel size of initial and final conv layers.
upsample_scales: [10, 8, 2, 2] # Upsampling scales.
upsample_kernal_sizes: [20, 16, 4, 4] # Kernel size for upsampling layers.
upsample_kernel_sizes: [20, 16, 4, 4] # Kernel size for upsampling layers.
resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks.
resblock_dilations: # Dilations for residual blocks.
- [1, 3, 5]
Expand All @@ -61,7 +61,7 @@ discriminator_params:
scale_discriminator_params:
in_channels: 1 # Number of input channels.
out_channels: 1 # Number of output channels.
kernel_sizes: [15, 41, 5, 3] # List of kernal sizes.
kernel_sizes: [15, 41, 5, 3] # List of kernel sizes.
channels: 128 # Initial number of channels.
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
max_groups: 16 # Maximum number of groups in downsampling conv layers.
Expand All @@ -75,7 +75,7 @@ discriminator_params:
period_discriminator_params:
in_channels: 1 # Number of input channels.
out_channels: 1 # Number of output channels.
kernel_sizes: [5, 3] # List of kernal sizes.
kernel_sizes: [5, 3] # List of kernel sizes.
channels: 32 # Initial number of channels.
downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
Expand Down
6 changes: 3 additions & 3 deletions egs/vctk/hubert_voc1/conf/hifigan_hubert_24k.v1.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ generator_params:
concat_spk_emb: false
kernel_size: 7 # Kernel size of initial and final conv layers.
upsample_scales: [12, 10, 2, 2] # Upsampling scales.
upsample_kernal_sizes: [24, 20, 4, 4] # Kernel size for upsampling layers.
upsample_kernel_sizes: [24, 20, 4, 4] # Kernel size for upsampling layers.
resblock_kernel_sizes: [3, 7, 11] # Kernel size for residual blocks.
resblock_dilations: # Dilations for residual blocks.
- [1, 3, 5]
Expand All @@ -61,7 +61,7 @@ discriminator_params:
scale_discriminator_params:
in_channels: 1 # Number of input channels.
out_channels: 1 # Number of output channels.
kernel_sizes: [15, 41, 5, 3] # List of kernal sizes.
kernel_sizes: [15, 41, 5, 3] # List of kernel sizes.
channels: 128 # Initial number of channels.
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
max_groups: 16 # Maximum number of groups in downsampling conv layers.
Expand All @@ -75,7 +75,7 @@ discriminator_params:
period_discriminator_params:
in_channels: 1 # Number of input channels.
out_channels: 1 # Number of output channels.
kernel_sizes: [5, 3] # List of kernal sizes.
kernel_sizes: [5, 3] # List of kernel sizes.
channels: 32 # Initial number of channels.
downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
max_downsample_channels: 1024 # Maximum number of channels in downsampling conv layers.
Expand Down
26 changes: 13 additions & 13 deletions parallel_wavegan/models/hifigan.py
Original file line number Diff line number Diff line change
Expand Up @@ -792,7 +792,7 @@ def __init__(
concat_spk_emb=False,
kernel_size=7,
upsample_scales=(8, 8, 2, 2),
upsample_kernal_sizes=(16, 16, 4, 4),
upsample_kernel_sizes=(16, 16, 4, 4),
resblock_kernel_sizes=(3, 7, 11),
resblock_dilations=[(1, 3, 5), (1, 3, 5), (1, 3, 5)],
use_additional_convs=True,
Expand All @@ -813,8 +813,8 @@ def __init__(
concat_spk_emb (bool): whether to concat speaker embedding to the input
kernel_size (int): Kernel size of initial and final conv layer.
upsample_scales (list): List of upsampling scales.
upsample_kernal_sizes (list): List of kernal sizes for upsampling layers.
resblock_kernal_sizes (list): List of kernal sizes for residual blocks.
upsample_kernel_sizes (list): List of kernel sizes for upsampling layers.
resblock_kernel_sizes (list): List of kernel sizes for residual blocks.
resblock_dilations (list): List of dilation list for residual blocks.
use_additional_convs (bool): Whether to use additional conv layers in residual blocks.
bias (bool): Whether to add bias parameter in convolution layers.
Expand Down Expand Up @@ -843,11 +843,11 @@ def __init__(

# check hyperparameters are valid
assert kernel_size % 2 == 1, "Kernal size must be odd number."
assert len(upsample_scales) == len(upsample_kernal_sizes)
assert len(upsample_scales) == len(upsample_kernel_sizes)
assert len(resblock_dilations) == len(resblock_kernel_sizes)

# define modules
self.num_upsamples = len(upsample_kernal_sizes)
self.num_upsamples = len(upsample_kernel_sizes)
self.num_blocks = len(resblock_kernel_sizes)
self.input_conv = torch.nn.Conv1d(
in_channels,
Expand All @@ -858,7 +858,7 @@ def __init__(
)
self.upsamples = torch.nn.ModuleList()
self.blocks = torch.nn.ModuleList()
for i in range(len(upsample_kernal_sizes)):
for i in range(len(upsample_kernel_sizes)):
self.upsamples += [
torch.nn.Sequential(
getattr(torch.nn, nonlinear_activation)(
Expand All @@ -867,9 +867,9 @@ def __init__(
torch.nn.ConvTranspose1d(
channels // (2**i),
channels // (2 ** (i + 1)),
upsample_kernal_sizes[i],
upsample_kernel_sizes[i],
upsample_scales[i],
padding=(upsample_kernal_sizes[i] - upsample_scales[i]) // 2,
padding=(upsample_kernel_sizes[i] - upsample_scales[i]) // 2,
),
)
]
Expand Down Expand Up @@ -1024,7 +1024,7 @@ def __init__(
duration_dropout_rate=0.5,
kernel_size=7,
upsample_scales=(8, 8, 2, 2),
upsample_kernal_sizes=(16, 16, 4, 4),
upsample_kernel_sizes=(16, 16, 4, 4),
resblock_kernel_sizes=(3, 7, 11),
resblock_dilations=[(1, 3, 5), (1, 3, 5), (1, 3, 5)],
use_additional_convs=True,
Expand All @@ -1050,8 +1050,8 @@ def __init__(
duration_dropout_rate (float): duration predictor dropout rate
kernel_size (int): Kernel size of initial and final conv layer.
upsample_scales (list): List of upsampling scales.
upsample_kernal_sizes (list): List of kernal sizes for upsampling layers.
resblock_kernal_sizes (list): List of kernal sizes for residual blocks.
upsample_kernel_sizes (list): List of kernel sizes for upsampling layers.
resblock_kernel_sizes (list): List of kernel sizes for residual blocks.
resblock_dilations (list): List of dilation list for residual blocks.
use_additional_convs (bool): Whether to use additional conv layers in residual blocks.
bias (bool): Whether to add bias parameter in convolution layers.
Expand All @@ -1071,7 +1071,7 @@ def __init__(
concat_spk_emb=concat_spk_emb,
kernel_size=kernel_size,
upsample_scales=upsample_scales,
upsample_kernal_sizes=upsample_kernal_sizes,
upsample_kernel_sizes=upsample_kernel_sizes,
resblock_kernel_sizes=resblock_kernel_sizes,
resblock_dilations=resblock_dilations,
use_additional_convs=use_additional_convs,
Expand Down Expand Up @@ -1156,7 +1156,7 @@ def inference(self, c, g=None, ds=None, normalize_before=False):
c = c[:, 0:1]

if ds is None:
c = self.synthesis(c.transpose(1, 0).unsqueeze(0))
c, _ = self.synthesis(c.transpose(1, 0).unsqueeze(0))
else:
c, _ = self.forward(c.transpose(1, 0).unsqueeze(0), ds.unsqueeze(0))
return c.squeeze(0).transpose(1, 0)
Expand Down

0 comments on commit ffaa99f

Please sign in to comment.