update format

Ji Chen · Ji Chen · commit a8528bfa206f · 2020-06-25T06:34:21.000-07:00
diff --git a/test/test_models.py b/test/test_models.py
@@ -36,6 +36,11 @@ def test_mfcc(self):
 class TestMelResNet(common_utils.TorchaudioTestCase):
 
     def test_waveform(self):
+        """
+        Create a tensor as the input of _MelResNet layer
+        and test if the output dimensions are correct.
+        """
+
         batch_size = 2
         n_time = 200
         n_freq = 100
@@ -55,6 +60,10 @@ def test_waveform(self):
 class TestUpsampleNetwork(common_utils.TorchaudioTestCase):
 
     def test_waveform(self):
+        """
+        Create a tensor as the input of _UpsampleNetwork block
+        and test if the output dimensions are correct.
+        """
 
         upsample_scales = [5, 5, 8]
         batch_size = 2
@@ -81,6 +90,10 @@ def test_waveform(self):
 class TestWaveRNN(common_utils.TorchaudioTestCase):
 
     def test_waveform(self):
+        """
+        Create a tensor as the input of _WaveRNN model
+        and test if the output dimensions are correct.
+        """
 
         upsample_scales = [5, 5, 8]
         n_rnn = 512
diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py
@@ -8,9 +8,8 @@
 
 
 class _ResBlock(nn.Module):
-    r"""This is a ResNet block layer. This layer is based on the paper "Deep Residual Learning
-    for Image Recognition". Kaiming He,  Xiangyu Zhang, Shaoqing Ren, Jian Sun. CVPR, 2016.
-    It is a block used in WaveRNN.
+    r"""ResNet block layer based on
+    `"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_
 
     Args:
         n_freq: the number of bins in a spectrogram (default=128)
@@ -47,7 +46,7 @@ def forward(self, x: Tensor) -> Tensor:
 
 
 class _MelResNet(nn.Module):
-    r"""This is a MelResNet layer based on a stack of ResBlocks. It is a block used in WaveRNN.
+    r"""MelResNet layer based on a stack of ResBlocks.
 
     Args:
         n_res_block: the number of ResBlock in stack (default=10)
@@ -71,10 +70,7 @@ def __init__(self,
                  kernel_size: int = 5) -> None:
         super().__init__()
 
-        ResBlocks = []
-
-        for i in range(n_res_block):
-            ResBlocks.append(_ResBlock(n_hidden))
+        ResBlocks = [_ResBlock(n_hidden) for _ in range(n_res_block)]
 
         self.melresnet_model = nn.Sequential(
             nn.Conv1d(in_channels=n_freq, out_channels=n_hidden, kernel_size=kernel_size, bias=False),
@@ -98,7 +94,7 @@ def forward(self, x: Tensor) -> Tensor:
 
 
 class _Stretch2d(nn.Module):
-    r"""This is a two-dimensional stretch layer. It is a block used in WaveRNN.
+    r"""Two-dimensional stretch layer.
 
     Args:
         x_scale: the scale factor in x axis
@@ -133,8 +129,7 @@ def forward(self, x: Tensor) -> Tensor:
 
 
 class _UpsampleNetwork(nn.Module):
-    r"""This is an upsample block based on a stack of Conv2d and Strech2d layers.
-    It is a block used in WaveRNN.
+    r"""Upsample block based on a stack of Conv2d and Stretch2d layers.
 
     Args:
         upsample_scales: the list of upsample scales
@@ -174,11 +169,9 @@ def __init__(self,
 
         up_layers = []
         for scale in upsample_scales:
-            k_size = (1, scale * 2 + 1)
-            padding = (0, scale)
             stretch = _Stretch2d(scale, 1)
-            conv = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=k_size, padding=padding, bias=False)
-            conv.weight.data.fill_(1. / k_size[1])
+            conv = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=(1, scale * 2 + 1), padding=(0, scale), bias=False)
+            conv.weight.data.fill_(1. / (scale * 2 + 1))
             up_layers.append(stretch)
             up_layers.append(conv)
         self.upsample_layers = nn.Sequential(*up_layers)
@@ -207,7 +200,9 @@ def forward(self, x: Tensor) -> Tensor:
 
 
 class _WaveRNN(nn.Module):
-    r"""
+    r"""WaveRNN model based on
+    `"Efficient Neural Audio Synthesis" <https://arxiv.org/pdf/1802.08435.pdf>`_
+
     Args:
         upsample_scales: the list of upsample scales
         n_bits: the bits of output waveform
@@ -220,7 +215,7 @@ class _WaveRNN(nn.Module):
         n_freq: the number of bins in a spectrogram (default=128)
         n_hidden: the number of hidden dimensions (default=128)
         n_output: the number of output dimensions (default=128)
-        mode: the type of input waveform (default='RAW')
+        mode: the type of input waveform, one of 'RAW' or 'MOL' (default='RAW')
 
     Examples::
         >>> upsamplenetwork = _waveRNN(upsample_scales=[5,5,8],
@@ -262,6 +257,8 @@ def __init__(self,
             self.n_classes = 2 ** n_bits
         elif self.mode == 'MOL':
             self.n_classes = 30
+        else:
+            raise ValueError("Unknown input mode - {}".format(self.mode))
 
         self.n_rnn = n_rnn
         self.n_aux = n_output // 4
@@ -294,8 +291,8 @@ def forward(self, x: Tensor, mels: Tensor) -> Tensor:
         """
 
         batch_size = x.size(0)
-        h1 = torch.zeros(1, batch_size, self.n_rnn, device=x.device)
-        h2 = torch.zeros(1, batch_size, self.n_rnn, device=x.device)
+        h1 = torch.zeros(1, batch_size, self.n_rnn, dtype=x.dtype, device=x.device)
+        h2 = torch.zeros(1, batch_size, self.n_rnn, dtype=x.dtype, device=x.device)
         mels, aux = self.upsample(mels)
 
         aux_idx = [self.n_aux * i for i in range(5)]