Fixes to discrete-symbol-based vocoder (#394)

Co-authored-by: Tomoki Hayashi <hayashi.tomoki@g.sp.m.is.nagoya-u.ac.jp>
kan-bayashi · Feb 20, 2023 · ffaa99f
1 parent ee7e4ec
commit ffaa99f
Show file tree

Hide file tree

Showing 6 changed files with 26 additions and 26 deletions.
diff --git a/egs/cvss_c/hubert_voc1/conf/hifigan_hubert_duration.v1.yaml b/egs/cvss_c/hubert_voc1/conf/hifigan_hubert_duration.v1.yaml
@@ -31,7 +31,7 @@ generator_params:
     num_embs: 500
     kernel_size: 7                        # Kernel size of initial and final conv layers.
     upsample_scales: [10, 8, 2, 2]        # Upsampling scales.
-    upsample_kernal_sizes: [20, 16, 4, 4] # Kernel size for upsampling layers.
+    upsample_kernel_sizes: [20, 16, 4, 4] # Kernel size for upsampling layers.
     resblock_kernel_sizes: [3, 7, 11]     # Kernel size for residual blocks.
     resblock_dilations:                   # Dilations for residual blocks.
         - [1, 3, 5]
@@ -64,7 +64,7 @@ discriminator_params:
     scale_discriminator_params:
         in_channels: 1                     # Number of input channels.
         out_channels: 1                    # Number of output channels.
-        kernel_sizes: [15, 41, 5, 3]       # List of kernal sizes.
+        kernel_sizes: [15, 41, 5, 3]       # List of kernel sizes.
         channels: 128                      # Initial number of channels.
         max_downsample_channels: 1024      # Maximum number of channels in downsampling conv layers.
         max_groups: 16                     # Maximum number of groups in downsampling conv layers.
@@ -78,7 +78,7 @@ discriminator_params:
     period_discriminator_params:
         in_channels: 1                     # Number of input channels.
         out_channels: 1                    # Number of output channels.
-        kernel_sizes: [5, 3]               # List of kernal sizes.
+        kernel_sizes: [5, 3]               # List of kernel sizes.
         channels: 32                       # Initial number of channels.
         downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
         max_downsample_channels: 1024      # Maximum number of channels in downsampling conv layers.

diff --git a/egs/cvss_c/hubert_voc1/conf/hifigan_hubert_duration_24k.v1.yaml b/egs/cvss_c/hubert_voc1/conf/hifigan_hubert_duration_24k.v1.yaml
@@ -32,7 +32,7 @@ generator_params:
     num_spk_embs: 0                       # Do not consider speaker embedding for single spk
     kernel_size: 7                        # Kernel size of initial and final conv layers.
     upsample_scales: [12, 10, 2, 2]        # Upsampling scales.
-    upsample_kernal_sizes: [24, 20, 4, 4] # Kernel size for upsampling layers.
+    upsample_kernel_sizes: [24, 20, 4, 4] # Kernel size for upsampling layers.
     resblock_kernel_sizes: [3, 7, 11]     # Kernel size for residual blocks.
     resblock_dilations:                   # Dilations for residual blocks.
         - [1, 3, 5]
@@ -69,7 +69,7 @@ discriminator_params:
     scale_discriminator_params:
         in_channels: 1                     # Number of input channels.
         out_channels: 1                    # Number of output channels.
-        kernel_sizes: [15, 41, 5, 3]       # List of kernal sizes.
+        kernel_sizes: [15, 41, 5, 3]       # List of kernel sizes.
         channels: 128                      # Initial number of channels.
         max_downsample_channels: 1024      # Maximum number of channels in downsampling conv layers.
         max_groups: 16                     # Maximum number of groups in downsampling conv layers.
@@ -83,7 +83,7 @@ discriminator_params:
     period_discriminator_params:
         in_channels: 1                     # Number of input channels.
         out_channels: 1                    # Number of output channels.
-        kernel_sizes: [5, 3]               # List of kernal sizes.
+        kernel_sizes: [5, 3]               # List of kernel sizes.
         channels: 32                       # Initial number of channels.
         downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
         max_downsample_channels: 1024      # Maximum number of channels in downsampling conv layers.

diff --git a/egs/cvss_c/hubert_voc1/run.sh b/egs/cvss_c/hubert_voc1/run.sh
@@ -14,7 +14,7 @@ n_gpus=1       # number of gpus in training
 n_jobs=16      # number of parallel jobs in feature extraction
 
 # NOTE(kan-bayashi): renamed to conf to avoid conflict in parse_options.sh
-conf=conf/hifigan_hubert.v1.yaml
+conf=conf/hifigan_hubert_duration.v1.yaml
 
 # directory path setting
 db_root=/usr0/home/jiatongs/data/cvss/es_en-c # directory including wavfiles (MODIFY BY YOURSELF)

diff --git a/egs/vctk/hubert_voc1/conf/hifigan_hubert.v1.yaml b/egs/vctk/hubert_voc1/conf/hifigan_hubert.v1.yaml
@@ -34,7 +34,7 @@ generator_params:
     concat_spk_emb: false
     kernel_size: 7                        # Kernel size of initial and final conv layers.
     upsample_scales: [10, 8, 2, 2]        # Upsampling scales.
-    upsample_kernal_sizes: [20, 16, 4, 4] # Kernel size for upsampling layers.
+    upsample_kernel_sizes: [20, 16, 4, 4] # Kernel size for upsampling layers.
     resblock_kernel_sizes: [3, 7, 11]     # Kernel size for residual blocks.
     resblock_dilations:                   # Dilations for residual blocks.
         - [1, 3, 5]
@@ -61,7 +61,7 @@ discriminator_params:
     scale_discriminator_params:
         in_channels: 1                     # Number of input channels.
         out_channels: 1                    # Number of output channels.
-        kernel_sizes: [15, 41, 5, 3]       # List of kernal sizes.
+        kernel_sizes: [15, 41, 5, 3]       # List of kernel sizes.
         channels: 128                      # Initial number of channels.
         max_downsample_channels: 1024      # Maximum number of channels in downsampling conv layers.
         max_groups: 16                     # Maximum number of groups in downsampling conv layers.
@@ -75,7 +75,7 @@ discriminator_params:
     period_discriminator_params:
         in_channels: 1                     # Number of input channels.
         out_channels: 1                    # Number of output channels.
-        kernel_sizes: [5, 3]               # List of kernal sizes.
+        kernel_sizes: [5, 3]               # List of kernel sizes.
         channels: 32                       # Initial number of channels.
         downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
         max_downsample_channels: 1024      # Maximum number of channels in downsampling conv layers.

diff --git a/egs/vctk/hubert_voc1/conf/hifigan_hubert_24k.v1.yaml b/egs/vctk/hubert_voc1/conf/hifigan_hubert_24k.v1.yaml
@@ -34,7 +34,7 @@ generator_params:
     concat_spk_emb: false
     kernel_size: 7                        # Kernel size of initial and final conv layers.
     upsample_scales: [12, 10, 2, 2]        # Upsampling scales.
-    upsample_kernal_sizes: [24, 20, 4, 4] # Kernel size for upsampling layers.
+    upsample_kernel_sizes: [24, 20, 4, 4] # Kernel size for upsampling layers.
     resblock_kernel_sizes: [3, 7, 11]     # Kernel size for residual blocks.
     resblock_dilations:                   # Dilations for residual blocks.
         - [1, 3, 5]
@@ -61,7 +61,7 @@ discriminator_params:
     scale_discriminator_params:
         in_channels: 1                     # Number of input channels.
         out_channels: 1                    # Number of output channels.
-        kernel_sizes: [15, 41, 5, 3]       # List of kernal sizes.
+        kernel_sizes: [15, 41, 5, 3]       # List of kernel sizes.
         channels: 128                      # Initial number of channels.
         max_downsample_channels: 1024      # Maximum number of channels in downsampling conv layers.
         max_groups: 16                     # Maximum number of groups in downsampling conv layers.
@@ -75,7 +75,7 @@ discriminator_params:
     period_discriminator_params:
         in_channels: 1                     # Number of input channels.
         out_channels: 1                    # Number of output channels.
-        kernel_sizes: [5, 3]               # List of kernal sizes.
+        kernel_sizes: [5, 3]               # List of kernel sizes.
         channels: 32                       # Initial number of channels.
         downsample_scales: [3, 3, 3, 3, 1] # Downsampling scales.
         max_downsample_channels: 1024      # Maximum number of channels in downsampling conv layers.

diff --git a/parallel_wavegan/models/hifigan.py b/parallel_wavegan/models/hifigan.py
@@ -792,7 +792,7 @@ def __init__(
         concat_spk_emb=False,
         kernel_size=7,
         upsample_scales=(8, 8, 2, 2),
-        upsample_kernal_sizes=(16, 16, 4, 4),
+        upsample_kernel_sizes=(16, 16, 4, 4),
         resblock_kernel_sizes=(3, 7, 11),
         resblock_dilations=[(1, 3, 5), (1, 3, 5), (1, 3, 5)],
         use_additional_convs=True,
@@ -813,8 +813,8 @@ def __init__(
             concat_spk_emb (bool): whether to concat speaker embedding to the input
             kernel_size (int): Kernel size of initial and final conv layer.
             upsample_scales (list): List of upsampling scales.
-            upsample_kernal_sizes (list): List of kernal sizes for upsampling layers.
-            resblock_kernal_sizes (list): List of kernal sizes for residual blocks.
+            upsample_kernel_sizes (list): List of kernel sizes for upsampling layers.
+            resblock_kernel_sizes (list): List of kernel sizes for residual blocks.
             resblock_dilations (list): List of dilation list for residual blocks.
             use_additional_convs (bool): Whether to use additional conv layers in residual blocks.
             bias (bool): Whether to add bias parameter in convolution layers.
@@ -843,11 +843,11 @@ def __init__(
 
         # check hyperparameters are valid
         assert kernel_size % 2 == 1, "Kernal size must be odd number."
-        assert len(upsample_scales) == len(upsample_kernal_sizes)
+        assert len(upsample_scales) == len(upsample_kernel_sizes)
         assert len(resblock_dilations) == len(resblock_kernel_sizes)
 
         # define modules
-        self.num_upsamples = len(upsample_kernal_sizes)
+        self.num_upsamples = len(upsample_kernel_sizes)
         self.num_blocks = len(resblock_kernel_sizes)
         self.input_conv = torch.nn.Conv1d(
             in_channels,
@@ -858,7 +858,7 @@ def __init__(
         )
         self.upsamples = torch.nn.ModuleList()
         self.blocks = torch.nn.ModuleList()
-        for i in range(len(upsample_kernal_sizes)):
+        for i in range(len(upsample_kernel_sizes)):
             self.upsamples += [
                 torch.nn.Sequential(
                     getattr(torch.nn, nonlinear_activation)(
@@ -867,9 +867,9 @@ def __init__(
                     torch.nn.ConvTranspose1d(
                         channels // (2**i),
                         channels // (2 ** (i + 1)),
-                        upsample_kernal_sizes[i],
+                        upsample_kernel_sizes[i],
                         upsample_scales[i],
-                        padding=(upsample_kernal_sizes[i] - upsample_scales[i]) // 2,
+                        padding=(upsample_kernel_sizes[i] - upsample_scales[i]) // 2,
                     ),
                 )
             ]
@@ -1024,7 +1024,7 @@ def __init__(
         duration_dropout_rate=0.5,
         kernel_size=7,
         upsample_scales=(8, 8, 2, 2),
-        upsample_kernal_sizes=(16, 16, 4, 4),
+        upsample_kernel_sizes=(16, 16, 4, 4),
         resblock_kernel_sizes=(3, 7, 11),
         resblock_dilations=[(1, 3, 5), (1, 3, 5), (1, 3, 5)],
         use_additional_convs=True,
@@ -1050,8 +1050,8 @@ def __init__(
             duration_dropout_rate (float): duration predictor dropout rate
             kernel_size (int): Kernel size of initial and final conv layer.
             upsample_scales (list): List of upsampling scales.
-            upsample_kernal_sizes (list): List of kernal sizes for upsampling layers.
-            resblock_kernal_sizes (list): List of kernal sizes for residual blocks.
+            upsample_kernel_sizes (list): List of kernel sizes for upsampling layers.
+            resblock_kernel_sizes (list): List of kernel sizes for residual blocks.
             resblock_dilations (list): List of dilation list for residual blocks.
             use_additional_convs (bool): Whether to use additional conv layers in residual blocks.
             bias (bool): Whether to add bias parameter in convolution layers.
@@ -1071,7 +1071,7 @@ def __init__(
             concat_spk_emb=concat_spk_emb,
             kernel_size=kernel_size,
             upsample_scales=upsample_scales,
-            upsample_kernal_sizes=upsample_kernal_sizes,
+            upsample_kernel_sizes=upsample_kernel_sizes,
             resblock_kernel_sizes=resblock_kernel_sizes,
             resblock_dilations=resblock_dilations,
             use_additional_convs=use_additional_convs,
@@ -1156,7 +1156,7 @@ def inference(self, c, g=None, ds=None, normalize_before=False):
             c = c[:, 0:1]
 
         if ds is None:
-            c = self.synthesis(c.transpose(1, 0).unsqueeze(0))
+            c, _ = self.synthesis(c.transpose(1, 0).unsqueeze(0))
         else:
             c, _ = self.forward(c.transpose(1, 0).unsqueeze(0), ds.unsqueeze(0))
         return c.squeeze(0).transpose(1, 0)