From 9aacb78631414e6e73895439da8b715d9f0cee35 Mon Sep 17 00:00:00 2001 From: Ji Chen Date: Wed, 17 Jun 2020 06:37:54 -0700 Subject: [PATCH 01/17] upsamplenetwork --- torchaudio/models/_wavernn.py | 95 ++++++++++++++++++++++++++++++++--- 1 file changed, 89 insertions(+), 6 deletions(-) diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py index 1df9eb0637..b81c304b10 100644 --- a/torchaudio/models/_wavernn.py +++ b/torchaudio/models/_wavernn.py @@ -139,15 +139,74 @@ class _UpsampleNetwork(nn.Module): >>> upsamplenetwork = _UpsampleNetwork(upsample_scales=[4, 4, 16]) >>> input = torch.rand(10, 128, 10) # a random spectrogram >>> output = upsamplenetwork(input) # shape: (10, 1536, 128), (10, 1536, 128) +======= + r"""This is a two-dimensional stretch layer. It is a block used in WaveRNN. WaveRNN is based on the paper + "Efficient Neural Audio Synthesis". Nal Kalchbrenner, Erich Elsen, Karen Simonyan, Seb Noury, + Norman Casagrande, Edward Lockhart, Florian Stimberg, Aaron van den Oord, Sander Dieleman, + Koray Kavukcuoglu. arXiv:1802.08435, 2018. + + Args: + x_scale: the scale factor in x axis (required). + y_scale: the scale factor in y axis (required). + + Examples:: + >>> stretch2d = _Stretch2d(x_scale=1, y_scale=1) + + >>> input = torch.rand(10, 1, 100, 512) + >>> output = stretch2d(input) """ def __init__(self, - upsample_scales: List[int], - n_res_block: int = 10, - n_freq: int = 128, - n_hidden: int = 128, - n_output: int = 128, - kernel_size: int = 5) -> None: + x_scale: int, + y_scale: int) -> None: + super().__init__() + + self.x_scale = x_scale + self.y_scale = y_scale + + def forward(self, x: Tensor) -> Tensor: + r"""Pass the input through the _Stretch2d layer. + + Args: + x: the input sequence to the _Stretch2d layer (required). + + Shape: + - x: :math:`(N, C, S, T)`. + - output: :math:`(N, C, S * y_scale, T * x_scale)`. + where N is the batch size, C is the channel size, S is the number of input sequence, + T is the length of input sequence. + """ + + n, c, s, t = x.size() + x = x.unsqueeze(-1).unsqueeze(3) + x = x.repeat(1, 1, 1, self.y_scale, 1, self.x_scale) + return x.view(n, c, s * self.y_scale, t * self.x_scale) + + +class _UpsampleNetwork(nn.Module): + r"""This is an upsample block based on a stack of Conv2d and Strech2d layers. + It is a block used in WaveRNN. WaveRNN is based on the paper "Efficient Neural Audio Synthesis". + Nal Kalchbrenner, Erich Elsen, Karen Simonyan, Seb Noury, Norman Casagrande, Edward Lockhart, + Florian Stimberg, Aaron van den Oord, Sander Dieleman, Koray Kavukcuoglu. arXiv:1802.08435, 2018. + + Args: + upsample_scales: the list of upsample scales (required). + res_blocks: the number of ResBlock in stack (default=10). + input_dims: the number of input sequence (default=100). + hidden_dims: the number of compute dimensions (default=128). + output_dims: the number of output sequence (default=128). + pad: the number of kernal size (pad * 2 + 1) in the first Conv1d layer (default=2). 
+ + Examples:: + >>> upsamplenetwork = _UpsampleNetwork(upsample_scales=[4, 4, 16], + res_blocks=10, + input_dims=100, + hidden_dims=128, + output_dims=128, + pad=2) + >>> input = torch.rand(10, 100, 512) + >>> output = upsamplenetwork(input) + super().__init__() total_scale = 1 @@ -156,6 +215,7 @@ def __init__(self, self.indent = (kernel_size - 1) // 2 * total_scale self.resnet = _MelResNet(n_res_block, n_freq, n_hidden, n_output, kernel_size) + self.resnet_stretch = _Stretch2d(total_scale, 1) up_layers = [] @@ -167,6 +227,7 @@ def __init__(self, padding=(0, scale), bias=False) conv.weight.data.fill_(1. / (scale * 2 + 1)) + up_layers.append(stretch) up_layers.append(conv) self.upsample_layers = nn.Sequential(*up_layers) @@ -192,3 +253,25 @@ def forward(self, specgram: Tensor) -> Tensor: upsampling_output = upsampling_output.squeeze(1)[:, :, self.indent:-self.indent] return upsampling_output, resnet_output + + def forward(self, x: Tensor) -> Tensor: + r"""Pass the input through the _UpsampleNetwork layer. + + Args: + x: the input sequence to the _UpsampleNetwork layer (required). + + Shape: + - x: :math:`(N, S, T)`. + - output: :math:`(N, (T - 2 * pad) * Total_Scale, S)`, `(N, (T - 2 * pad) * total_scale, P)`. + where N is the batch size, S is the number of input sequence, T is the length of input sequence. + P is the number of output sequence. Total_Scale is the product of all elements in upsample_scales. + """ + + resnet_output = self.resnet(x).unsqueeze(1) + resnet_output = self.resnet_stretch(resnet_output) + resnet_output = resnet_output.squeeze(1) + + upsampling_output = self.upsample_layers(x.unsqueeze(1)) + upsampling_output = upsampling_output.squeeze(1)[:, :, self.indent:-self.indent] + + return upsampling_output.transpose(1, 2), resnet_output.transpose(1, 2) From 2f91a9b3a5918471c2dcac0dbe5f839306f95e3f Mon Sep 17 00:00:00 2001 From: Ji Chen Date: Thu, 18 Jun 2020 12:33:46 -0700 Subject: [PATCH 02/17] update variable names --- torchaudio/models/_wavernn.py | 40 +++++++++++++++++------------------ 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py index b81c304b10..d01033a75d 100644 --- a/torchaudio/models/_wavernn.py +++ b/torchaudio/models/_wavernn.py @@ -146,8 +146,8 @@ class _UpsampleNetwork(nn.Module): Koray Kavukcuoglu. arXiv:1802.08435, 2018. Args: - x_scale: the scale factor in x axis (required). - y_scale: the scale factor in y axis (required). + x_scale: the scale factor in x axis (required) + y_scale: the scale factor in y axis (required) Examples:: >>> stretch2d = _Stretch2d(x_scale=1, y_scale=1) @@ -168,19 +168,17 @@ def forward(self, x: Tensor) -> Tensor: r"""Pass the input through the _Stretch2d layer. Args: - x: the input sequence to the _Stretch2d layer (required). + x: the input sequence to the _Stretch2d layer (required) Shape: - - x: :math:`(N, C, S, T)`. - - output: :math:`(N, C, S * y_scale, T * x_scale)`. - where N is the batch size, C is the channel size, S is the number of input sequence, - T is the length of input sequence. 
+ - x: :math:`(batch_size, channel, freq, time)` + - output: :math:`(batch_size, channel, freq * y_scale, time * x_scale)` """ - n, c, s, t = x.size() + batch_size, channel, freq, time = x.size() x = x.unsqueeze(-1).unsqueeze(3) x = x.repeat(1, 1, 1, self.y_scale, 1, self.x_scale) - return x.view(n, c, s * self.y_scale, t * self.x_scale) + return x.view(batch_size, channel, freq * self.y_scale, time * self.x_scale) class _UpsampleNetwork(nn.Module): @@ -190,12 +188,12 @@ class _UpsampleNetwork(nn.Module): Florian Stimberg, Aaron van den Oord, Sander Dieleman, Koray Kavukcuoglu. arXiv:1802.08435, 2018. Args: - upsample_scales: the list of upsample scales (required). - res_blocks: the number of ResBlock in stack (default=10). - input_dims: the number of input sequence (default=100). - hidden_dims: the number of compute dimensions (default=128). - output_dims: the number of output sequence (default=128). - pad: the number of kernal size (pad * 2 + 1) in the first Conv1d layer (default=2). + upsample_scales: the list of upsample scales (required) + res_blocks: the number of ResBlock in stack (default=10) + input_dims: the number of input sequence (default=100) + hidden_dims: the number of compute dimensions (default=128) + output_dims: the number of output sequence (default=128) + pad: the kernel size (kernel_size = pad * 2 + 1) in the first Conv1d layer (default=2) Examples:: >>> upsamplenetwork = _UpsampleNetwork(upsample_scales=[4, 4, 16], @@ -258,20 +256,20 @@ def forward(self, x: Tensor) -> Tensor: r"""Pass the input through the _UpsampleNetwork layer. Args: - x: the input sequence to the _UpsampleNetwork layer (required). + x: the input sequence to the _UpsampleNetwork layer (required) Shape: - - x: :math:`(N, S, T)`. - - output: :math:`(N, (T - 2 * pad) * Total_Scale, S)`, `(N, (T - 2 * pad) * total_scale, P)`. - where N is the batch size, S is the number of input sequence, T is the length of input sequence. - P is the number of output sequence. Total_Scale is the product of all elements in upsample_scales. + - x: :math:`(batch_size, freq, time)` + - output: :math:`(batch_size, (time - 2 * pad) * total_scale, freq)`, `(batch_size, (time - 2 * pad) * total_scale, output_dims)` + where total_scale is the product of all elements in upsample_scales. 
""" resnet_output = self.resnet(x).unsqueeze(1) resnet_output = self.resnet_stretch(resnet_output) resnet_output = resnet_output.squeeze(1) - upsampling_output = self.upsample_layers(x.unsqueeze(1)) + x = x.unsqueeze(1) + upsampling_output = self.upsample_layers(x) upsampling_output = upsampling_output.squeeze(1)[:, :, self.indent:-self.indent] return upsampling_output.transpose(1, 2), resnet_output.transpose(1, 2) From 63391d6325575bb6e4f4ebc5ef1805bfb9d69ab6 Mon Sep 17 00:00:00 2001 From: Ji Chen Date: Thu, 18 Jun 2020 12:39:51 -0700 Subject: [PATCH 03/17] update variable name --- torchaudio/models/_wavernn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py index d01033a75d..df95f42a8e 100644 --- a/torchaudio/models/_wavernn.py +++ b/torchaudio/models/_wavernn.py @@ -260,7 +260,8 @@ def forward(self, x: Tensor) -> Tensor: Shape: - x: :math:`(batch_size, freq, time)` - - output: :math:`(batch_size, (time - 2 * pad) * total_scale, freq)`, `(batch_size, (time - 2 * pad) * total_scale, output_dims)` + - output: :math:`(batch_size, (time - 2 * pad) * total_scale, freq)`, + `(batch_size, (time - 2 * pad) * total_scale, output_dims)` where total_scale is the product of all elements in upsample_scales. """ From 0346f236946b86f1f75cce40e81887902691b1e1 Mon Sep 17 00:00:00 2001 From: Ji Chen Date: Sun, 21 Jun 2020 08:57:53 -0700 Subject: [PATCH 04/17] add wavernn model --- test/test_models.py | 4 +- torchaudio/models/_wavernn.py | 197 ++++++++++++++++++++++++++++------ 2 files changed, 166 insertions(+), 35 deletions(-) diff --git a/test/test_models.py b/test/test_models.py index 519fbc7b26..dd30b6e879 100644 --- a/test/test_models.py +++ b/test/test_models.py @@ -1,5 +1,7 @@ +import unittest + import torch -from torchaudio.models import Wav2Letter, _MelResNet, _UpsampleNetwork +from torchaudio.models import Wav2Letter, _MelResNet, _UpsampleNetwork, _WaveRNN from . import common_utils diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py index df95f42a8e..e17d2f5342 100644 --- a/torchaudio/models/_wavernn.py +++ b/torchaudio/models/_wavernn.py @@ -1,9 +1,10 @@ from typing import List +import torch from torch import Tensor from torch import nn -__all__ = ["_ResBlock", "_MelResNet", "_Stretch2d", "_UpsampleNetwork"] +__all__ = ["_ResBlock", "_MelResNet", "_Stretch2d", "_UpsampleNetwork", "_WaveRNN"] class _ResBlock(nn.Module): @@ -90,6 +91,7 @@ def forward(self, specgram: Tensor) -> Tensor: class _Stretch2d(nn.Module): +<<<<<<< HEAD r"""Upscale the frequency and time dimensions of a spectrogram. Args: @@ -144,13 +146,16 @@ class _UpsampleNetwork(nn.Module): "Efficient Neural Audio Synthesis". Nal Kalchbrenner, Erich Elsen, Karen Simonyan, Seb Noury, Norman Casagrande, Edward Lockhart, Florian Stimberg, Aaron van den Oord, Sander Dieleman, Koray Kavukcuoglu. arXiv:1802.08435, 2018. +======= + r"""This is a two-dimensional stretch layer. It is a block used in WaveRNN. 
+>>>>>>> add wavernn model Args: - x_scale: the scale factor in x axis (required) - y_scale: the scale factor in y axis (required) + x_scale: the scale factor in x axis + y_scale: the scale factor in y axis Examples:: - >>> stretch2d = _Stretch2d(x_scale=1, y_scale=1) + >>> stretch2d = _Stretch2d(x_scale=10, y_scale=10) >>> input = torch.rand(10, 1, 100, 512) >>> output = stretch2d(input) @@ -165,46 +170,50 @@ def __init__(self, self.y_scale = y_scale def forward(self, x: Tensor) -> Tensor: - r"""Pass the input through the _Stretch2d layer. - + r""" Args: - x: the input sequence to the _Stretch2d layer (required) + x: the input sequence to the _Stretch2d layer Shape: - - x: :math:`(batch_size, channel, freq, time)` - - output: :math:`(batch_size, channel, freq * y_scale, time * x_scale)` + - x: :math:`(..., freq, time)` + - output: :math:`(..., freq * y_scale, time * x_scale)` """ - batch_size, channel, freq, time = x.size() - x = x.unsqueeze(-1).unsqueeze(3) - x = x.repeat(1, 1, 1, self.y_scale, 1, self.x_scale) - return x.view(batch_size, channel, freq * self.y_scale, time * self.x_scale) + return x.repeat_interleave(self.y_scale, 2).repeat_interleave(self.x_scale, 3) class _UpsampleNetwork(nn.Module): r"""This is an upsample block based on a stack of Conv2d and Strech2d layers. - It is a block used in WaveRNN. WaveRNN is based on the paper "Efficient Neural Audio Synthesis". - Nal Kalchbrenner, Erich Elsen, Karen Simonyan, Seb Noury, Norman Casagrande, Edward Lockhart, - Florian Stimberg, Aaron van den Oord, Sander Dieleman, Koray Kavukcuoglu. arXiv:1802.08435, 2018. + It is a block used in WaveRNN. Args: - upsample_scales: the list of upsample scales (required) - res_blocks: the number of ResBlock in stack (default=10) - input_dims: the number of input sequence (default=100) - hidden_dims: the number of compute dimensions (default=128) - output_dims: the number of output sequence (default=128) - pad: the kernel size (kernel_size = pad * 2 + 1) in the first Conv1d layer (default=2) + upsample_scales: the list of upsample scales + n_res_block: the number of ResBlock in stack (default=10) + n_freq: the number of bins in a spectrogram (default=128) + n_hidden: the number of hidden dimensions (default=128) + n_output: the number of output dimensions (default=128) + kernel_size: the number of kernel size in the first Conv1d layer (default=5) Examples:: >>> upsamplenetwork = _UpsampleNetwork(upsample_scales=[4, 4, 16], - res_blocks=10, - input_dims=100, - hidden_dims=128, - output_dims=128, - pad=2) - >>> input = torch.rand(10, 100, 512) + n_res_block=10, + n_freq=128, + n_hidden=128, + n_output=128, + kernel_size=5) + >>> input = torch.rand(10, 128, 512) >>> output = upsamplenetwork(input) +<<<<<<< HEAD +======= + def __init__(self, + upsample_scales: List[int], + n_res_block: int = 10, + n_freq: int = 128, + n_hidden: int = 128, + n_output: int = 128, + kernel_size: int = 5) -> None: +>>>>>>> add wavernn model super().__init__() total_scale = 1 @@ -213,7 +222,10 @@ class _UpsampleNetwork(nn.Module): self.indent = (kernel_size - 1) // 2 * total_scale self.resnet = _MelResNet(n_res_block, n_freq, n_hidden, n_output, kernel_size) +<<<<<<< HEAD +======= +>>>>>>> add wavernn model self.resnet_stretch = _Stretch2d(total_scale, 1) up_layers = [] @@ -253,15 +265,14 @@ def forward(self, specgram: Tensor) -> Tensor: return upsampling_output, resnet_output def forward(self, x: Tensor) -> Tensor: - r"""Pass the input through the _UpsampleNetwork layer. 
- + r""" Args: - x: the input sequence to the _UpsampleNetwork layer (required) + x: the input sequence to the _UpsampleNetwork layer Shape: - - x: :math:`(batch_size, freq, time)` - - output: :math:`(batch_size, (time - 2 * pad) * total_scale, freq)`, - `(batch_size, (time - 2 * pad) * total_scale, output_dims)` + - x: :math:`(batch, freq, time)`. + - output: :math:`(batch, (time - kernel_size + 1) * total_scale, freq)`, + `(batch, (time - kernel_size + 1) * total_scale, n_output)` where total_scale is the product of all elements in upsample_scales. """ @@ -274,3 +285,121 @@ def forward(self, x: Tensor) -> Tensor: upsampling_output = upsampling_output.squeeze(1)[:, :, self.indent:-self.indent] return upsampling_output.transpose(1, 2), resnet_output.transpose(1, 2) + + +class _WaveRNN(nn.Module): + r""" + Args: + upsample_scales: the list of upsample scales + n_bits: the bits of output waveform + sample_rate: the rate of audio dimensions (samples per second) + hop_length: the number of samples between the starts of consecutive frames + n_res_block: the number of ResBlock in stack (default=10) + n_rnn: the dimension of RNN layer (default=512) + n_fc: the dimension of fully connected layer (default=512) + kernel_size: the number of kernel size in the first Conv1d layer (default=5) + n_freq: the number of bins in a spectrogram (default=128) + n_hidden: the number of hidden dimensions (default=128) + n_output: the number of output dimensions (default=128) + mode: the type of input waveform (default='RAW') + + Examples:: + >>> upsamplenetwork = _waveRNN(upsample_scales=[5,5,8], + n_bits=9, + sample_rate=24000, + hop_length=200, + n_res_block=10, + n_rnn=512, + n_fc=512, + kernel_size=5, + n_freq=128, + n_hidden=128, + n_output=128, + mode='RAW') + >>> x = torch.rand(10, 24800, 512) + >>> mels = torch.rand(10, 128, 512) + >>> output = upsamplenetwork(x, mels) + """ + + def __init__(self, + upsample_scales: List[int], + n_bits: int, + sample_rate: int, + hop_length: int, + n_res_block: int = 10, + n_rnn: int = 512, + n_fc: int = 512, + kernel_size: int = 5, + n_freq: int = 128, + n_hidden: int = 128, + n_output: int = 128, + mode: str = 'RAW') -> None: + super().__init__() + + self.mode = mode + self.kernel_size = kernel_size + + if self.mode == 'RAW': + self.n_classes = 2 ** n_bits + elif self.mode == 'MOL': + self.n_classes = 30 + + self.n_rnn = n_rnn + self.n_aux = n_output // 4 + self.hop_length = hop_length + self.sample_rate = sample_rate + + self.upsample = _UpsampleNetwork(upsample_scales, n_res_block, n_freq, n_hidden, n_output, kernel_size) + self.fc = nn.Linear(n_freq + self.n_aux + 1, n_rnn) + + self.rnn1 = nn.GRU(n_rnn, n_rnn, batch_first=True) + self.rnn2 = nn.GRU(n_rnn + self.n_aux, n_rnn, batch_first=True) + + self.relu1 = nn.ReLU(inplace=True) + self.relu2 = nn.ReLU(inplace=True) + + self.fc1 = nn.Linear(n_rnn + self.n_aux, n_fc) + self.fc2 = nn.Linear(n_fc + self.n_aux, n_fc) + self.fc3 = nn.Linear(n_fc, self.n_classes) + + def forward(self, x: Tensor, mels: Tensor) -> Tensor: + r""" + Args: + x: the input waveform to the _WaveRNN layer + mels: the input mel-spectrogram to the _WaveRNN layer + + Shape: + - x: :math:`(batch, time)` + - mels: :math:`(batch, freq, time_mels)` + - output: :math:`(batch, time, 2 ** n_bits)` + """ + + batch_size = x.size(0) + h1 = torch.zeros(1, batch_size, self.n_rnn, device=x.device) + h2 = torch.zeros(1, batch_size, self.n_rnn, device=x.device) + mels, aux = self.upsample(mels) + + aux_idx = [self.n_aux * i for i in range(5)] + a1 = aux[:, :, 
aux_idx[0]:aux_idx[1]] + a2 = aux[:, :, aux_idx[1]:aux_idx[2]] + a3 = aux[:, :, aux_idx[2]:aux_idx[3]] + a4 = aux[:, :, aux_idx[3]:aux_idx[4]] + + x = torch.cat([x.unsqueeze(-1), mels, a1], dim=2) + x = self.fc(x) + res = x + x, _ = self.rnn1(x, h1) + + x = x + res + res = x + x = torch.cat([x, a2], dim=2) + x, _ = self.rnn2(x, h2) + + x = x + res + x = torch.cat([x, a3], dim=2) + x = self.relu1(self.fc1(x)) + + x = torch.cat([x, a4], dim=2) + x = self.relu2(self.fc2(x)) + + return self.fc3(x) From 27e26aabc1a65ad43981ad8e0f8b9c626b903df5 Mon Sep 17 00:00:00 2001 From: Ji Chen Date: Tue, 23 Jun 2020 09:41:15 -0700 Subject: [PATCH 05/17] update test --- test/test_models.py | 2 - torchaudio/models/_wavernn.py | 213 +--------------------------------- 2 files changed, 1 insertion(+), 214 deletions(-) diff --git a/test/test_models.py b/test/test_models.py index dd30b6e879..2530b64951 100644 --- a/test/test_models.py +++ b/test/test_models.py @@ -1,5 +1,3 @@ -import unittest - import torch from torchaudio.models import Wav2Letter, _MelResNet, _UpsampleNetwork, _WaveRNN diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py index e17d2f5342..1df9eb0637 100644 --- a/torchaudio/models/_wavernn.py +++ b/torchaudio/models/_wavernn.py @@ -1,10 +1,9 @@ from typing import List -import torch from torch import Tensor from torch import nn -__all__ = ["_ResBlock", "_MelResNet", "_Stretch2d", "_UpsampleNetwork", "_WaveRNN"] +__all__ = ["_ResBlock", "_MelResNet", "_Stretch2d", "_UpsampleNetwork"] class _ResBlock(nn.Module): @@ -91,7 +90,6 @@ def forward(self, specgram: Tensor) -> Tensor: class _Stretch2d(nn.Module): -<<<<<<< HEAD r"""Upscale the frequency and time dimensions of a spectrogram. Args: @@ -141,71 +139,8 @@ class _UpsampleNetwork(nn.Module): >>> upsamplenetwork = _UpsampleNetwork(upsample_scales=[4, 4, 16]) >>> input = torch.rand(10, 128, 10) # a random spectrogram >>> output = upsamplenetwork(input) # shape: (10, 1536, 128), (10, 1536, 128) -======= - r"""This is a two-dimensional stretch layer. It is a block used in WaveRNN. WaveRNN is based on the paper - "Efficient Neural Audio Synthesis". Nal Kalchbrenner, Erich Elsen, Karen Simonyan, Seb Noury, - Norman Casagrande, Edward Lockhart, Florian Stimberg, Aaron van den Oord, Sander Dieleman, - Koray Kavukcuoglu. arXiv:1802.08435, 2018. -======= - r"""This is a two-dimensional stretch layer. It is a block used in WaveRNN. ->>>>>>> add wavernn model - - Args: - x_scale: the scale factor in x axis - y_scale: the scale factor in y axis - - Examples:: - >>> stretch2d = _Stretch2d(x_scale=10, y_scale=10) - - >>> input = torch.rand(10, 1, 100, 512) - >>> output = stretch2d(input) """ - def __init__(self, - x_scale: int, - y_scale: int) -> None: - super().__init__() - - self.x_scale = x_scale - self.y_scale = y_scale - - def forward(self, x: Tensor) -> Tensor: - r""" - Args: - x: the input sequence to the _Stretch2d layer - - Shape: - - x: :math:`(..., freq, time)` - - output: :math:`(..., freq * y_scale, time * x_scale)` - """ - - return x.repeat_interleave(self.y_scale, 2).repeat_interleave(self.x_scale, 3) - - -class _UpsampleNetwork(nn.Module): - r"""This is an upsample block based on a stack of Conv2d and Strech2d layers. - It is a block used in WaveRNN. 
- - Args: - upsample_scales: the list of upsample scales - n_res_block: the number of ResBlock in stack (default=10) - n_freq: the number of bins in a spectrogram (default=128) - n_hidden: the number of hidden dimensions (default=128) - n_output: the number of output dimensions (default=128) - kernel_size: the number of kernel size in the first Conv1d layer (default=5) - - Examples:: - >>> upsamplenetwork = _UpsampleNetwork(upsample_scales=[4, 4, 16], - n_res_block=10, - n_freq=128, - n_hidden=128, - n_output=128, - kernel_size=5) - >>> input = torch.rand(10, 128, 512) - >>> output = upsamplenetwork(input) - -<<<<<<< HEAD -======= def __init__(self, upsample_scales: List[int], n_res_block: int = 10, @@ -213,7 +148,6 @@ def __init__(self, n_hidden: int = 128, n_output: int = 128, kernel_size: int = 5) -> None: ->>>>>>> add wavernn model super().__init__() total_scale = 1 @@ -222,10 +156,6 @@ def __init__(self, self.indent = (kernel_size - 1) // 2 * total_scale self.resnet = _MelResNet(n_res_block, n_freq, n_hidden, n_output, kernel_size) -<<<<<<< HEAD - -======= ->>>>>>> add wavernn model self.resnet_stretch = _Stretch2d(total_scale, 1) up_layers = [] @@ -237,7 +167,6 @@ def __init__(self, padding=(0, scale), bias=False) conv.weight.data.fill_(1. / (scale * 2 + 1)) - up_layers.append(stretch) up_layers.append(conv) self.upsample_layers = nn.Sequential(*up_layers) @@ -263,143 +192,3 @@ def forward(self, specgram: Tensor) -> Tensor: upsampling_output = upsampling_output.squeeze(1)[:, :, self.indent:-self.indent] return upsampling_output, resnet_output - - def forward(self, x: Tensor) -> Tensor: - r""" - Args: - x: the input sequence to the _UpsampleNetwork layer - - Shape: - - x: :math:`(batch, freq, time)`. - - output: :math:`(batch, (time - kernel_size + 1) * total_scale, freq)`, - `(batch, (time - kernel_size + 1) * total_scale, n_output)` - where total_scale is the product of all elements in upsample_scales. 
- """ - - resnet_output = self.resnet(x).unsqueeze(1) - resnet_output = self.resnet_stretch(resnet_output) - resnet_output = resnet_output.squeeze(1) - - x = x.unsqueeze(1) - upsampling_output = self.upsample_layers(x) - upsampling_output = upsampling_output.squeeze(1)[:, :, self.indent:-self.indent] - - return upsampling_output.transpose(1, 2), resnet_output.transpose(1, 2) - - -class _WaveRNN(nn.Module): - r""" - Args: - upsample_scales: the list of upsample scales - n_bits: the bits of output waveform - sample_rate: the rate of audio dimensions (samples per second) - hop_length: the number of samples between the starts of consecutive frames - n_res_block: the number of ResBlock in stack (default=10) - n_rnn: the dimension of RNN layer (default=512) - n_fc: the dimension of fully connected layer (default=512) - kernel_size: the number of kernel size in the first Conv1d layer (default=5) - n_freq: the number of bins in a spectrogram (default=128) - n_hidden: the number of hidden dimensions (default=128) - n_output: the number of output dimensions (default=128) - mode: the type of input waveform (default='RAW') - - Examples:: - >>> upsamplenetwork = _waveRNN(upsample_scales=[5,5,8], - n_bits=9, - sample_rate=24000, - hop_length=200, - n_res_block=10, - n_rnn=512, - n_fc=512, - kernel_size=5, - n_freq=128, - n_hidden=128, - n_output=128, - mode='RAW') - >>> x = torch.rand(10, 24800, 512) - >>> mels = torch.rand(10, 128, 512) - >>> output = upsamplenetwork(x, mels) - """ - - def __init__(self, - upsample_scales: List[int], - n_bits: int, - sample_rate: int, - hop_length: int, - n_res_block: int = 10, - n_rnn: int = 512, - n_fc: int = 512, - kernel_size: int = 5, - n_freq: int = 128, - n_hidden: int = 128, - n_output: int = 128, - mode: str = 'RAW') -> None: - super().__init__() - - self.mode = mode - self.kernel_size = kernel_size - - if self.mode == 'RAW': - self.n_classes = 2 ** n_bits - elif self.mode == 'MOL': - self.n_classes = 30 - - self.n_rnn = n_rnn - self.n_aux = n_output // 4 - self.hop_length = hop_length - self.sample_rate = sample_rate - - self.upsample = _UpsampleNetwork(upsample_scales, n_res_block, n_freq, n_hidden, n_output, kernel_size) - self.fc = nn.Linear(n_freq + self.n_aux + 1, n_rnn) - - self.rnn1 = nn.GRU(n_rnn, n_rnn, batch_first=True) - self.rnn2 = nn.GRU(n_rnn + self.n_aux, n_rnn, batch_first=True) - - self.relu1 = nn.ReLU(inplace=True) - self.relu2 = nn.ReLU(inplace=True) - - self.fc1 = nn.Linear(n_rnn + self.n_aux, n_fc) - self.fc2 = nn.Linear(n_fc + self.n_aux, n_fc) - self.fc3 = nn.Linear(n_fc, self.n_classes) - - def forward(self, x: Tensor, mels: Tensor) -> Tensor: - r""" - Args: - x: the input waveform to the _WaveRNN layer - mels: the input mel-spectrogram to the _WaveRNN layer - - Shape: - - x: :math:`(batch, time)` - - mels: :math:`(batch, freq, time_mels)` - - output: :math:`(batch, time, 2 ** n_bits)` - """ - - batch_size = x.size(0) - h1 = torch.zeros(1, batch_size, self.n_rnn, device=x.device) - h2 = torch.zeros(1, batch_size, self.n_rnn, device=x.device) - mels, aux = self.upsample(mels) - - aux_idx = [self.n_aux * i for i in range(5)] - a1 = aux[:, :, aux_idx[0]:aux_idx[1]] - a2 = aux[:, :, aux_idx[1]:aux_idx[2]] - a3 = aux[:, :, aux_idx[2]:aux_idx[3]] - a4 = aux[:, :, aux_idx[3]:aux_idx[4]] - - x = torch.cat([x.unsqueeze(-1), mels, a1], dim=2) - x = self.fc(x) - res = x - x, _ = self.rnn1(x, h1) - - x = x + res - res = x - x = torch.cat([x, a2], dim=2) - x, _ = self.rnn2(x, h2) - - x = x + res - x = torch.cat([x, a3], dim=2) - x = 
self.relu1(self.fc1(x))
-
-        x = torch.cat([x, a4], dim=2)
-        x = self.relu2(self.fc2(x))
-
-        return self.fc3(x)
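The `unsqueeze`/`repeat`/`view` stretch used through PATCH 01 and the `repeat_interleave` version that PATCH 04 switched to (both reverted by the patch above) compute the same result. A minimal sketch checking the equivalence on a small tensor (the sizes here are illustrative only):

    import torch

    # unsqueeze/repeat/view (PATCH 01) versus repeat_interleave (PATCH 04):
    # both repeat every row y_scale times and every column x_scale times.
    x = torch.arange(6.).view(1, 1, 2, 3)
    y_scale, x_scale = 2, 3
    n, c, s, t = x.size()
    a = x.unsqueeze(-1).unsqueeze(3)
    a = a.repeat(1, 1, 1, y_scale, 1, x_scale)
    a = a.view(n, c, s * y_scale, t * x_scale)
    b = x.repeat_interleave(y_scale, 2).repeat_interleave(x_scale, 3)
    assert torch.equal(a, b)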
From 6981d1c597afa1b38690f870abf3f0b44e86da78 Mon Sep 17 00:00:00 2001
From: Ji Chen
Date: Wed, 24 Jun 2020 20:11:52 -0700
Subject: [PATCH 06/17] update format

---
 test/test_models.py           |  33 +++++++++
 torchaudio/models/_wavernn.py | 122 ++++++++++++++++++++++++++++++++++
 2 files changed, 155 insertions(+)

diff --git a/test/test_models.py b/test/test_models.py
index 2530b64951..15b9390906 100644
--- a/test/test_models.py
+++ b/test/test_models.py
@@ -81,3 +81,36 @@ def test_waveform(self):

         assert out1.size() == (n_batch, n_freq, total_scale * (n_time - kernel_size + 1))
         assert out2.size() == (n_batch, n_output, total_scale * (n_time - kernel_size + 1))
+
+
+class TestWaveRNN(common_utils.TorchaudioTestCase):
+
+    def test_waveform(self):
+        """
+        Create a tensor as the input of _WaveRNN model
+        and test if the output dimensions are correct.
+        """
+
+        upsample_scales = [5, 5, 8]
+        n_rnn = 512
+        n_fc = 512
+        n_bits = 9
+        sample_rate = 24000
+        hop_length = 200
+        batch_size = 2
+        n_time = 200
+        n_freq = 100
+        n_output = 256
+        n_res_block = 10
+        n_hidden = 128
+        kernel_size = 5
+        mode = 'RAW'
+
+        model = _WaveRNN(upsample_scales, n_bits, sample_rate, hop_length, n_res_block,
+                         n_rnn, n_fc, kernel_size, n_freq, n_hidden, n_output, mode)
+
+        x = torch.rand(batch_size, hop_length * (n_time - kernel_size + 1))
+        mels = torch.rand(batch_size, n_freq, n_time)
+        out = model(x, mels)
+
+        assert out.size() == (batch_size, hop_length * (n_time - kernel_size + 1), 2 ** n_bits)
diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py
index 1df9eb0637..5ba641e8ab 100644
--- a/torchaudio/models/_wavernn.py
+++ b/torchaudio/models/_wavernn.py
@@ -192,3 +192,125 @@ def forward(self, specgram: Tensor) -> Tensor:
         upsampling_output = upsampling_output.squeeze(1)[:, :, self.indent:-self.indent]

         return upsampling_output, resnet_output
+
+
+class _WaveRNN(nn.Module):
+    r"""WaveRNN model based on
+    `"Efficient Neural Audio Synthesis" <https://arxiv.org/pdf/1802.08435.pdf>`_
+
+    Args:
+        upsample_scales: the list of upsample scales
+        n_bits: the bits of output waveform
+        sample_rate: the rate of audio dimensions (samples per second)
+        hop_length: the number of samples between the starts of consecutive frames
+        n_res_block: the number of ResBlock in stack (default=10)
+        n_rnn: the dimension of RNN layer (default=512)
+        n_fc: the dimension of fully connected layer (default=512)
+        kernel_size: the number of kernel size in the first Conv1d layer (default=5)
+        n_freq: the number of bins in a spectrogram (default=128)
+        n_hidden: the number of hidden dimensions (default=128)
+        n_output: the number of output dimensions (default=128)
+        mode: the type of input waveform in ['RAW', 'MOL'] (default='RAW')
+
+    Examples::
+        >>> upsamplenetwork = _waveRNN(upsample_scales=[5,5,8],
                                        n_bits=9,
                                        sample_rate=24000,
                                        hop_length=200,
                                        n_res_block=10,
                                        n_rnn=512,
                                        n_fc=512,
                                        kernel_size=5,
                                        n_freq=128,
                                        n_hidden=128,
                                        n_output=128,
                                        mode='RAW')
+        >>> x = torch.rand(10, 24800, 512)
+        >>> mels = torch.rand(10, 128, 512)
+        >>> output = upsamplenetwork(x, mels)
+    """
+
+    def __init__(self,
+                 upsample_scales: List[int],
+                 n_bits: int,
+                 sample_rate: int,
+                 hop_length: int,
+                 n_res_block: int = 10,
+                 n_rnn: int = 512,
+                 n_fc: int = 512,
+                 kernel_size: int = 5,
+                 n_freq: int = 128,
+                 n_hidden: int = 128,
+                 n_output: int = 128,
+                 mode: str = 'RAW') -> None:
+        super().__init__()
+
+        self.mode = mode
+        self.kernel_size = kernel_size
+
+        if self.mode == 'RAW':
+            self.n_classes = 2 ** n_bits
+        elif self.mode == 'MOL':
+            self.n_classes = 30
+        else:
+            raise ValueError("Unknown input mode - {}".format(self.mode))
+
+        self.n_rnn = n_rnn
+        self.n_aux = n_output // 4
+        self.hop_length = hop_length
+        self.sample_rate = sample_rate
+
+        self.upsample = _UpsampleNetwork(upsample_scales, n_res_block, n_freq, n_hidden, n_output, kernel_size)
+        self.fc = nn.Linear(n_freq + self.n_aux + 1, n_rnn)
+
+        self.rnn1 = nn.GRU(n_rnn, n_rnn, batch_first=True)
+        self.rnn2 = nn.GRU(n_rnn + self.n_aux, n_rnn, batch_first=True)
+
+        self.relu1 = nn.ReLU(inplace=True)
+        self.relu2 = nn.ReLU(inplace=True)
+
+        self.fc1 = nn.Linear(n_rnn + self.n_aux, n_fc)
+        self.fc2 = nn.Linear(n_fc + self.n_aux, n_fc)
+        self.fc3 = nn.Linear(n_fc, self.n_classes)
+
+    def forward(self, x: Tensor, mels: Tensor) -> Tensor:
+        r"""
+        Args:
+            x: the input waveform to the _WaveRNN layer
+            mels: the input mel-spectrogram to the _WaveRNN layer
+
+        Shape:
+            - x: :math:`(batch, time)`
+            - mels: :math:`(batch, freq, time_mels)`
+            - output: :math:`(batch, time, 2 ** n_bits)`
+        """
+
+        batch_size = x.size(0)
+        h1 = torch.zeros(1, batch_size, self.n_rnn, dtype=x.dtype, device=x.device)
+        h2 = torch.zeros(1, batch_size, self.n_rnn, dtype=x.dtype, device=x.device)
+        mels, aux = self.upsample(mels)
+
+        aux_idx = [self.n_aux * i for i in range(5)]
+        a1 = aux[:, :, aux_idx[0]:aux_idx[1]]
+        a2 = aux[:, :, aux_idx[1]:aux_idx[2]]
+        a3 = aux[:, :, aux_idx[2]:aux_idx[3]]
+        a4 = aux[:, :, aux_idx[3]:aux_idx[4]]
+
+        x = torch.cat([x.unsqueeze(-1), mels, a1], dim=2)
+        x = self.fc(x)
+        res = x
+        x, _ = self.rnn1(x, h1)
+
+        x = x + res
+        res = x
+        x = torch.cat([x, a2], dim=2)
+        x, _ = self.rnn2(x, h2)
+
+        x = x + res
+        x = torch.cat([x, a3], dim=2)
+        x = self.relu1(self.fc1(x))
+
+        x = torch.cat([x, a4], dim=2)
+        x = self.relu2(self.fc2(x))
+
+        return self.fc3(x)
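A condensed version of the `TestWaveRNN.test_waveform` case added above, relying on defaults for the remaining constructor arguments (the keyword usage and the final assert mirror the test; the sizes are the test's, not requirements, except that the waveform length must equal `(n_time - kernel_size + 1)` times the product of `upsample_scales`):

    import torch
    from torchaudio.models import _WaveRNN

    # hop_length equals the product of upsample_scales: 5 * 5 * 8 = 200.
    model = _WaveRNN(upsample_scales=[5, 5, 8], n_bits=9, sample_rate=24000,
                     hop_length=200, n_freq=100, n_output=256)
    n_batch, n_freq, n_time, kernel_size, hop_length = 2, 100, 200, 5, 200
    x = torch.rand(n_batch, hop_length * (n_time - kernel_size + 1))
    mels = torch.rand(n_batch, n_freq, n_time)
    out = model(x, mels)
    assert out.size() == (n_batch, hop_length * (n_time - kernel_size + 1), 2 ** 9)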
""" upsample_scales = [5, 5, 8] @@ -97,20 +123,20 @@ def test_waveform(self): n_bits = 9 sample_rate = 24000 hop_length = 200 - batch_size = 2 + n_batch = 2 n_time = 200 n_freq = 100 n_output = 256 n_res_block = 10 n_hidden = 128 kernel_size = 5 - mode = 'RAW' + mode = 'mol' model = _WaveRNN(upsample_scales, n_bits, sample_rate, hop_length, n_res_block, n_rnn, n_fc, kernel_size, n_freq, n_hidden, n_output, mode) - x = torch.rand(batch_size, hop_length * (n_time - kernel_size + 1)) - mels = torch.rand(batch_size, n_freq, n_time) + x = torch.rand(n_batch, hop_length * (n_time - kernel_size + 1)) + mels = torch.rand(n_batch, n_freq, n_time) out = model(x, mels) - assert out.size() == (batch_size, hop_length * (n_time - kernel_size + 1), 2 ** n_bits) + assert out.size() == (n_batch, hop_length * (n_time - kernel_size + 1), 30) diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py index 5ba641e8ab..1b8a90bfaa 100644 --- a/torchaudio/models/_wavernn.py +++ b/torchaudio/models/_wavernn.py @@ -195,8 +195,10 @@ def forward(self, specgram: Tensor) -> Tensor: class _WaveRNN(nn.Module): - r"""WaveRNN model based on - `"Efficient Neural Audio Synthesis" `_ + r"""WaveRNN model based on "Efficient Neural Audio Synthesis". + + The paper link is https://arxiv.org/pdf/1802.08435.pdf. The input signals are waveform + and spectrogram. The input channels of waveform and spectrogram have to be 1. Args: upsample_scales: the list of upsample scales @@ -210,24 +212,15 @@ class _WaveRNN(nn.Module): n_freq: the number of bins in a spectrogram (default=128) n_hidden: the number of hidden dimensions (default=128) n_output: the number of output dimensions (default=128) - mode: the type of input waveform in ['RAW', 'MOL'] (default='RAW') - - Examples:: - >>> upsamplenetwork = _waveRNN(upsample_scales=[5,5,8], - n_bits=9, - sample_rate=24000, - hop_length=200, - n_res_block=10, - n_rnn=512, - n_fc=512, - kernel_size=5, - n_freq=128, - n_hidden=128, - n_output=128, - mode='RAW') - >>> x = torch.rand(10, 24800, 512) - >>> mels = torch.rand(10, 128, 512) - >>> output = upsamplenetwork(x, mels) + mode: the type of input waveform in ['waveform', 'mol'] (default='waveform') + + Examples + >>> wavernn = _waveRNN(upsample_scales=[5,5,8], n_bits=9, sample_rate=24000, hop_length=200) + >>> waveform, sample_rate = torchaudio.load(file) # waveform shape: + >>> (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length) + >>> specgram = MelSpectrogram(sample_rate)(waveform) # shape: (n_batch, n_channel, n_freq, n_time) + >>> output = wavernn(waveform.squeeze(1), specgram.squeeze(1)) # shape: + >>> (n_batch, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits) """ def __init__(self, @@ -242,24 +235,30 @@ def __init__(self, n_freq: int = 128, n_hidden: int = 128, n_output: int = 128, - mode: str = 'RAW') -> None: + mode: str = 'waveform') -> None: super().__init__() self.mode = mode self.kernel_size = kernel_size - if self.mode == 'RAW': + if self.mode == 'waveform': self.n_classes = 2 ** n_bits - elif self.mode == 'MOL': + elif self.mode == 'mol': self.n_classes = 30 else: - raise ValueError("Unknown input mode - {}".format(self.mode)) + raise ValueError(f"Expected mode: `waveform` or `mol`, but found {self.mode}") self.n_rnn = n_rnn self.n_aux = n_output // 4 self.hop_length = hop_length self.sample_rate = sample_rate + total_scale = 1 + for upsample_scale in upsample_scales: + total_scale *= upsample_scale + if total_scale != self.hop_length: + raise ValueError(f"Expected: total_scale == hop_length, but found 
{total_scale} != {hop_length}") + self.upsample = _UpsampleNetwork(upsample_scales, n_res_block, n_freq, n_hidden, n_output, kernel_size) self.fc = nn.Linear(n_freq + self.n_aux + 1, n_rnn) @@ -273,22 +272,20 @@ def __init__(self, self.fc2 = nn.Linear(n_fc + self.n_aux, n_fc) self.fc3 = nn.Linear(n_fc, self.n_classes) - def forward(self, x: Tensor, mels: Tensor) -> Tensor: - r""" + def forward(self, waveform: Tensor, specgram: Tensor) -> Tensor: + r"""Pass the input through the _WaveRNN model. Args: - x: the input waveform to the _WaveRNN layer - mels: the input mel-spectrogram to the _WaveRNN layer + waveform: the input waveform to the _WaveRNN layer (n_batch, (n_time - kernel_size + 1) * hop_length) + specgram: the input spectrogram to the _WaveRNN layer (n_batch, n_freq, n_time) - Shape: - - x: :math:`(batch, time)` - - mels: :math:`(batch, freq, time_mels)` - - output: :math:`(batch, time, 2 ** n_bits)` + Return: + Tensor shape: (n_batch, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits) """ - batch_size = x.size(0) - h1 = torch.zeros(1, batch_size, self.n_rnn, dtype=x.dtype, device=x.device) - h2 = torch.zeros(1, batch_size, self.n_rnn, dtype=x.dtype, device=x.device) - mels, aux = self.upsample(mels) + batch_size = waveform.size(0) + h1 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device) + h2 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device) + mels, aux = self.upsample(specgram) aux_idx = [self.n_aux * i for i in range(5)] a1 = aux[:, :, aux_idx[0]:aux_idx[1]] @@ -296,7 +293,7 @@ def forward(self, x: Tensor, mels: Tensor) -> Tensor: a3 = aux[:, :, aux_idx[2]:aux_idx[3]] a4 = aux[:, :, aux_idx[3]:aux_idx[4]] - x = torch.cat([x.unsqueeze(-1), mels, a1], dim=2) + x = torch.cat([waveform.unsqueeze(-1), mels, a1], dim=2) x = self.fc(x) res = x x, _ = self.rnn1(x, h1) From c34316601276212a0fa79739ef24683ef6bfe492 Mon Sep 17 00:00:00 2001 From: Ji Chen Date: Fri, 26 Jun 2020 09:43:37 -0700 Subject: [PATCH 08/17] update format --- test/test_models.py | 4 ++-- torchaudio/models/_wavernn.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/test/test_models.py b/test/test_models.py index 0f4015374b..d4873dadc2 100644 --- a/test/test_models.py +++ b/test/test_models.py @@ -86,7 +86,7 @@ def test_waveform(self): class TestWaveRNN(common_utils.TorchaudioTestCase): def test_waveform(self): - """test the output dimensions of waveform input after _WaveRNN model. + """Validate the output dimensions of a _WaveRNN model with a waveform input. """ upsample_scales = [5, 5, 8] @@ -114,7 +114,7 @@ def test_waveform(self): assert out.size() == (n_batch, hop_length * (n_time - kernel_size + 1), 2 ** n_bits) def test_mol(self): - """test the output dimensions of mol input after _WaveRNN model. + """Validate the output dimensions of a _WaveRNN model with a mol input. """ upsample_scales = [5, 5, 8] diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py index 1b8a90bfaa..1720cd7429 100644 --- a/torchaudio/models/_wavernn.py +++ b/torchaudio/models/_wavernn.py @@ -197,8 +197,8 @@ def forward(self, specgram: Tensor) -> Tensor: class _WaveRNN(nn.Module): r"""WaveRNN model based on "Efficient Neural Audio Synthesis". - The paper link is https://arxiv.org/pdf/1802.08435.pdf. The input signals are waveform - and spectrogram. The input channels of waveform and spectrogram have to be 1. + The paper link is https://arxiv.org/pdf/1802.08435.pdf. The input signals are a waveform + and a spectrogram. 
The input channels of waveform and spectrogram have to be 1. Args: upsample_scales: the list of upsample scales @@ -216,10 +216,10 @@ class _WaveRNN(nn.Module): Examples >>> wavernn = _waveRNN(upsample_scales=[5,5,8], n_bits=9, sample_rate=24000, hop_length=200) - >>> waveform, sample_rate = torchaudio.load(file) # waveform shape: + >>> waveform, sample_rate = torchaudio.load(file) # waveform shape: >>> (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length) - >>> specgram = MelSpectrogram(sample_rate)(waveform) # shape: (n_batch, n_channel, n_freq, n_time) - >>> output = wavernn(waveform.squeeze(1), specgram.squeeze(1)) # shape: + >>> specgram = MelSpectrogram(sample_rate)(waveform) # shape: (n_batch, n_channel, n_freq, n_time) + >>> output = wavernn(waveform.squeeze(1), specgram.squeeze(1)) # shape: >>> (n_batch, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits) """ From 17455a3c74529b0d904d177aa610fe3ab8d255b4 Mon Sep 17 00:00:00 2001 From: Ji Chen Date: Wed, 1 Jul 2020 13:03:14 -0700 Subject: [PATCH 09/17] fix conflicts and add transpose --- test/test_models.py | 4 ++-- torchaudio/models/_wavernn.py | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/test/test_models.py b/test/test_models.py index d4873dadc2..86901eb0ae 100644 --- a/test/test_models.py +++ b/test/test_models.py @@ -86,7 +86,7 @@ def test_waveform(self): class TestWaveRNN(common_utils.TorchaudioTestCase): def test_waveform(self): - """Validate the output dimensions of a _WaveRNN model with a waveform input. + """Validate the output dimensions of a _WaveRNN model in waveform mode. """ upsample_scales = [5, 5, 8] @@ -114,7 +114,7 @@ def test_waveform(self): assert out.size() == (n_batch, hop_length * (n_time - kernel_size + 1), 2 ** n_bits) def test_mol(self): - """Validate the output dimensions of a _WaveRNN model with a mol input. + """Validate the output dimensions of a _WaveRNN model in mol mode. """ upsample_scales = [5, 5, 8] diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py index 1720cd7429..1eeeb391d9 100644 --- a/torchaudio/models/_wavernn.py +++ b/torchaudio/models/_wavernn.py @@ -197,8 +197,8 @@ def forward(self, specgram: Tensor) -> Tensor: class _WaveRNN(nn.Module): r"""WaveRNN model based on "Efficient Neural Audio Synthesis". - The paper link is https://arxiv.org/pdf/1802.08435.pdf. The input signals are a waveform - and a spectrogram. The input channels of waveform and spectrogram have to be 1. + The paper link is https://arxiv.org/pdf/1802.08435.pdf. The input channels of waveform + and spectrogram have to be 1. The product of upsample_scales must equal hop_length. Args: upsample_scales: the list of upsample scales @@ -212,7 +212,7 @@ class _WaveRNN(nn.Module): n_freq: the number of bins in a spectrogram (default=128) n_hidden: the number of hidden dimensions (default=128) n_output: the number of output dimensions (default=128) - mode: the type of input waveform in ['waveform', 'mol'] (default='waveform') + mode: the mode of waveform in ['waveform', 'mol'] (default='waveform') Examples >>> wavernn = _waveRNN(upsample_scales=[5,5,8], n_bits=9, sample_rate=24000, hop_length=200) @@ -274,6 +274,7 @@ def __init__(self, def forward(self, waveform: Tensor, specgram: Tensor) -> Tensor: r"""Pass the input through the _WaveRNN model. 
+
         Args:
             waveform: the input waveform to the _WaveRNN layer (n_batch, (n_time - kernel_size + 1) * hop_length)
             specgram: the input spectrogram to the _WaveRNN layer (n_batch, n_freq, n_time)

         Return:
             Tensor shape: (n_batch, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits)
         """

         batch_size = waveform.size(0)
         h1 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device)
         h2 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device)
         mels, aux = self.upsample(specgram)
+        mels = mels.transpose()
+        aux = aux.transpose()

         aux_idx = [self.n_aux * i for i in range(5)]
         a1 = aux[:, :, aux_idx[0]:aux_idx[1]]

From 2c44bc776657e5ae4d5a3a2341de7b928f5b8940 Mon Sep 17 00:00:00 2001
From: Ji Chen
Date: Wed, 1 Jul 2020 13:25:34 -0700
Subject: [PATCH 10/17] import update

---
 torchaudio/models/_wavernn.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py
index 1eeeb391d9..b4713f7934 100644
--- a/torchaudio/models/_wavernn.py
+++ b/torchaudio/models/_wavernn.py
@@ -1,9 +1,10 @@
 from typing import List

+import torch
 from torch import Tensor
 from torch import nn

-__all__ = ["_ResBlock", "_MelResNet", "_Stretch2d", "_UpsampleNetwork"]
+__all__ = ["_ResBlock", "_MelResNet", "_Stretch2d", "_UpsampleNetwork", "_WaveRNN"]

 class _ResBlock(nn.Module):

From 634bc7fa821aab97fb4b9d03592161856a490bc9 Mon Sep 17 00:00:00 2001
From: Ji Chen
Date: Wed, 1 Jul 2020 13:39:00 -0700
Subject: [PATCH 11/17] update transpose

---
 torchaudio/models/_wavernn.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py
index b4713f7934..a7739d3bdf 100644
--- a/torchaudio/models/_wavernn.py
+++ b/torchaudio/models/_wavernn.py
@@ -288,8 +288,8 @@ def forward(self, waveform: Tensor, specgram: Tensor) -> Tensor:
         h1 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device)
         h2 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device)
         mels, aux = self.upsample(specgram)
-        mels = mels.transpose()
-        aux = aux.transpose()
+        mels = mels.transpose(1, 2)
+        aux = aux.transpose(1, 2)

         aux_idx = [self.n_aux * i for i in range(5)]
         a1 = aux[:, :, aux_idx[0]:aux_idx[1]]
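PATCH 11 above supplies the dimension arguments that PATCH 09's bare `transpose()` calls were missing. A small sketch of the orientation contract being enforced (sizes are hypothetical): `_UpsampleNetwork` hands back `(n_batch, n_freq, n_time')` tensors, while the batch-first GRU stack in `_WaveRNN.forward` expects features in the last dimension.

    import torch
    from torch import nn

    n_batch, n_freq, n_time = 2, 128, 1536
    mels = torch.rand(n_batch, n_freq, n_time)   # (batch, freq, time) from the upsampler
    x = mels.transpose(1, 2)                     # (batch, time, freq) for the RNN
    rnn = nn.GRU(n_freq, 512, batch_first=True)
    out, _ = rnn(x)                              # (batch, time, 512)
    assert out.shape == (n_batch, n_time, 512)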
From 6a2b8a7267438c26c611f0df8326bfe7ab494a37 Mon Sep 17 00:00:00 2001
From: Ji Chen
Date: Sun, 5 Jul 2020 12:14:38 -0700
Subject: [PATCH 12/17] update format

---
 torchaudio/models/_wavernn.py | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py
index a7739d3bdf..bce6f25601 100644
--- a/torchaudio/models/_wavernn.py
+++ b/torchaudio/models/_wavernn.py
@@ -198,8 +198,8 @@ class _WaveRNN(nn.Module):
     r"""WaveRNN model based on "Efficient Neural Audio Synthesis".

-    The paper link is https://arxiv.org/pdf/1802.08435.pdf. The input channels of waveform
-    and spectrogram have to be 1. The product of upsample_scales must equal hop_length.
+    The paper link is `<https://arxiv.org/pdf/1802.08435.pdf>`_. The input channels of waveform
+    and spectrogram have to be 1. The product of `upsample_scales` must equal `hop_length`.

     Args:
         upsample_scales: the list of upsample scales
@@ -217,11 +217,12 @@ class _WaveRNN(nn.Module):

     Examples
         >>> wavernn = _waveRNN(upsample_scales=[5,5,8], n_bits=9, sample_rate=24000, hop_length=200)
-        >>> waveform, sample_rate = torchaudio.load(file) # waveform shape:
-        >>> (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length)
+        >>> waveform, sample_rate = torchaudio.load(file)
+        >>> # waveform shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length)
         >>> specgram = MelSpectrogram(sample_rate)(waveform) # shape: (n_batch, n_channel, n_freq, n_time)
-        >>> output = wavernn(waveform.squeeze(1), specgram.squeeze(1)) # shape:
-        >>> (n_batch, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits)
+        >>> output = wavernn(waveform.squeeze(1), specgram.squeeze(1))
+        >>> # output shape in 'waveform' mode: (n_batch, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits)
+        >>> # output shape in 'mol' mode: (n_batch, (n_time - kernel_size + 1) * hop_length, 30)
     """
@@ -288,8 +288,11 @@ def forward(self, waveform: Tensor, specgram: Tensor) -> Tensor:
         batch_size = waveform.size(0)
         h1 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device)
         h2 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device)
-        mels, aux = self.upsample(specgram)
-        mels = mels.transpose(1, 2)
-        aux = aux.transpose(1, 2)
+        # output of upsample:
+        # specgram: (n_batch, n_freq, (n_time - kernel_size + 1) * total_scale)
+        # aux: (n_batch, n_output, (n_time - kernel_size + 1) * total_scale)
+        specgram, aux = self.upsample(specgram)
+        specgram = specgram.transpose(1, 2)
+        aux = aux.transpose(1, 2)

         aux_idx = [self.n_aux * i for i in range(5)]
         a1 = aux[:, :, aux_idx[0]:aux_idx[1]]
         a2 = aux[:, :, aux_idx[1]:aux_idx[2]]
         a3 = aux[:, :, aux_idx[2]:aux_idx[3]]
         a4 = aux[:, :, aux_idx[3]:aux_idx[4]]

-        x = torch.cat([waveform.unsqueeze(-1), mels, a1], dim=2)
+        x = torch.cat([waveform.unsqueeze(-1), specgram, a1], dim=-1)
         x = self.fc(x)
         res = x
         x, _ = self.rnn1(x, h1)

         x = x + res
         res = x
-        x = torch.cat([x, a2], dim=2)
+        x = torch.cat([x, a2], dim=-1)
         x, _ = self.rnn2(x, h2)

         x = x + res
-        x = torch.cat([x, a3], dim=2)
-        x = self.relu1(self.fc1(x))
+        x = torch.cat([x, a3], dim=-1)
+        x = self.fc1(x)
+        x = self.relu1(x)

-        x = torch.cat([x, a4], dim=2)
-        x = self.relu2(self.fc2(x))
+        x = torch.cat([x, a4], dim=-1)
+        x = self.fc2(x)
+        x = self.relu2(x)

         return self.fc3(x)
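The residual network's output that `forward` slices above is split into four equal auxiliary bands, one per conditioning stage (`a1` joins the input projection, `a2` feeds the second GRU, `a3` and `a4` feed the two fully connected layers). A standalone sketch of the slicing, using this file's default sizes:

    import torch

    # n_output = 128 gives n_aux = 128 // 4 = 32 features per band.
    n_batch, n_time, n_output = 2, 1536, 128
    n_aux = n_output // 4
    aux = torch.rand(n_batch, n_time, n_output)
    aux_idx = [n_aux * i for i in range(5)]
    a1, a2, a3, a4 = (aux[:, :, aux_idx[i]:aux_idx[i + 1]] for i in range(4))
    assert a1.shape == (n_batch, n_time, n_aux)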
Args: upsample_scales: the list of upsample scales @@ -215,14 +217,13 @@ class _WaveRNN(nn.Module): n_output: the number of output dimensions (default=128) mode: the mode of waveform in ['waveform', 'mol'] (default='waveform') - Examples + Example >>> wavernn = _waveRNN(upsample_scales=[5,5,8], n_bits=9, sample_rate=24000, hop_length=200) >>> waveform, sample_rate = torchaudio.load(file) >>> # waveform shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length) >>> specgram = MelSpectrogram(sample_rate)(waveform) # shape: (n_batch, n_channel, n_freq, n_time) >>> output = wavernn(waveform.squeeze(1), specgram.squeeze(1)) >>> # output shape in 'waveform' mode: (n_batch, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits) - >>> # output shape in 'mol' mode: (n_batch, (n_time - kernel_size + 1) * hop_length, 30) """ def __init__(self, From 978e10178b09ef6778d47ac465e6a918495fc561 Mon Sep 17 00:00:00 2001 From: Ji Chen Date: Tue, 7 Jul 2020 09:53:34 -0700 Subject: [PATCH 14/17] add n_channel in input --- test/test_models.py | 12 ++++++------ torchaudio/models/_wavernn.py | 15 ++++++++++----- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/test/test_models.py b/test/test_models.py index 86901eb0ae..c54a57cebd 100644 --- a/test/test_models.py +++ b/test/test_models.py @@ -107,11 +107,11 @@ def test_waveform(self): model = _WaveRNN(upsample_scales, n_bits, sample_rate, hop_length, n_res_block, n_rnn, n_fc, kernel_size, n_freq, n_hidden, n_output, mode) - x = torch.rand(n_batch, hop_length * (n_time - kernel_size + 1)) - mels = torch.rand(n_batch, n_freq, n_time) + x = torch.rand(n_batch, 1, hop_length * (n_time - kernel_size + 1)) + mels = torch.rand(n_batch, 1, n_freq, n_time) out = model(x, mels) - assert out.size() == (n_batch, hop_length * (n_time - kernel_size + 1), 2 ** n_bits) + assert out.size() == (n_batch, 1, hop_length * (n_time - kernel_size + 1), 2 ** n_bits) def test_mol(self): """Validate the output dimensions of a _WaveRNN model in mol mode. @@ -135,8 +135,8 @@ def test_mol(self): model = _WaveRNN(upsample_scales, n_bits, sample_rate, hop_length, n_res_block, n_rnn, n_fc, kernel_size, n_freq, n_hidden, n_output, mode) - x = torch.rand(n_batch, hop_length * (n_time - kernel_size + 1)) - mels = torch.rand(n_batch, n_freq, n_time) + x = torch.rand(n_batch, 1, hop_length * (n_time - kernel_size + 1)) + mels = torch.rand(n_batch, 1, n_freq, n_time) out = model(x, mels) - assert out.size() == (n_batch, hop_length * (n_time - kernel_size + 1), 30) + assert out.size() == (n_batch, 1, hop_length * (n_time - kernel_size + 1), 30) diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py index c21888dcbc..7b6b7b0395 100644 --- a/torchaudio/models/_wavernn.py +++ b/torchaudio/models/_wavernn.py @@ -222,7 +222,7 @@ class _WaveRNN(nn.Module): >>> waveform, sample_rate = torchaudio.load(file) >>> # waveform shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length) >>> specgram = MelSpectrogram(sample_rate)(waveform) # shape: (n_batch, n_channel, n_freq, n_time) - >>> output = wavernn(waveform.squeeze(1), specgram.squeeze(1)) + >>> output = wavernn(waveform, specgram) >>> # output shape in 'waveform' mode: (n_batch, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits) """ @@ -279,13 +279,17 @@ def forward(self, waveform: Tensor, specgram: Tensor) -> Tensor: r"""Pass the input through the _WaveRNN model. 
From 978e10178b09ef6778d47ac465e6a918495fc561 Mon Sep 17 00:00:00 2001
From: Ji Chen
Date: Tue, 7 Jul 2020 09:53:34 -0700
Subject: [PATCH 14/17] add n_channel in input

---
 test/test_models.py           | 12 ++++++------
 torchaudio/models/_wavernn.py | 15 ++++++++++-----
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/test/test_models.py b/test/test_models.py
index 86901eb0ae..c54a57cebd 100644
--- a/test/test_models.py
+++ b/test/test_models.py
@@ -107,11 +107,11 @@ def test_waveform(self):
         model = _WaveRNN(upsample_scales, n_bits, sample_rate, hop_length, n_res_block,
                          n_rnn, n_fc, kernel_size, n_freq, n_hidden, n_output, mode)

-        x = torch.rand(n_batch, hop_length * (n_time - kernel_size + 1))
-        mels = torch.rand(n_batch, n_freq, n_time)
+        x = torch.rand(n_batch, 1, hop_length * (n_time - kernel_size + 1))
+        mels = torch.rand(n_batch, 1, n_freq, n_time)
         out = model(x, mels)

-        assert out.size() == (n_batch, hop_length * (n_time - kernel_size + 1), 2 ** n_bits)
+        assert out.size() == (n_batch, 1, hop_length * (n_time - kernel_size + 1), 2 ** n_bits)

     def test_mol(self):
@@ -135,8 +135,8 @@ def test_mol(self):
-        x = torch.rand(n_batch, hop_length * (n_time - kernel_size + 1))
-        mels = torch.rand(n_batch, n_freq, n_time)
+        x = torch.rand(n_batch, 1, hop_length * (n_time - kernel_size + 1))
+        mels = torch.rand(n_batch, 1, n_freq, n_time)
         out = model(x, mels)

-        assert out.size() == (n_batch, hop_length * (n_time - kernel_size + 1), 30)
+        assert out.size() == (n_batch, 1, hop_length * (n_time - kernel_size + 1), 30)
diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py
index c21888dcbc..7b6b7b0395 100644
--- a/torchaudio/models/_wavernn.py
+++ b/torchaudio/models/_wavernn.py
@@ -222,7 +222,7 @@ class _WaveRNN(nn.Module):
         >>> waveform, sample_rate = torchaudio.load(file)
         >>> # waveform shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length)
         >>> specgram = MelSpectrogram(sample_rate)(waveform) # shape: (n_batch, n_channel, n_freq, n_time)
-        >>> output = wavernn(waveform.squeeze(1), specgram.squeeze(1))
+        >>> output = wavernn(waveform, specgram)
         >>> # output shape in 'waveform' mode: (n_batch, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits)
@@ -279,13 +279,17 @@ def forward(self, waveform: Tensor, specgram: Tensor) -> Tensor:
         r"""Pass the input through the _WaveRNN model.

         Args:
-            waveform: the input waveform to the _WaveRNN layer (n_batch, (n_time - kernel_size + 1) * hop_length)
-            specgram: the input spectrogram to the _WaveRNN layer (n_batch, n_freq, n_time)
+            waveform: the input waveform to the _WaveRNN layer (n_batch, 1, (n_time - kernel_size + 1) * hop_length)
+            specgram: the input spectrogram to the _WaveRNN layer (n_batch, 1, n_freq, n_time)

         Return:
-            Tensor shape: (n_batch, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits)
+            Tensor shape: (n_batch, 1, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits)
         """

+        assert waveform.size(1) == 1, 'Require the input channel of waveform to be 1'
+        assert specgram.size(1) == 1, 'Require the input channel of specgram to be 1'
+        waveform, specgram = waveform.squeeze(1), specgram.squeeze(1)
+
         batch_size = waveform.size(0)
         h1 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device)
         h2 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device)
@@ -324,5 +328,6 @@ def forward(self, waveform: Tensor, specgram: Tensor) -> Tensor:
         x = torch.cat([x, a4], dim=-1)
         x = self.fc2(x)
         x = self.relu2(x)
+        x = self.fc3(x).unsqueeze(1)

-        return self.fc3(x)
+        return x

From 01fbbdaa20e6dc0fb0f9d962396f99ed83e40b56 Mon Sep 17 00:00:00 2001
From: Ji Chen
Date: Tue, 7 Jul 2020 14:32:19 -0700
Subject: [PATCH 15/17] add comment

---
 torchaudio/models/_wavernn.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py
index 7b6b7b0395..de39f1a83f 100644
--- a/torchaudio/models/_wavernn.py
+++ b/torchaudio/models/_wavernn.py
@@ -288,6 +288,7 @@ def forward(self, waveform: Tensor, specgram: Tensor) -> Tensor:

         assert waveform.size(1) == 1, 'Require the input channel of waveform to be 1'
         assert specgram.size(1) == 1, 'Require the input channel of specgram to be 1'
+        # remove channel dimension until the end
         waveform, specgram = waveform.squeeze(1), specgram.squeeze(1)

         batch_size = waveform.size(0)
@@ -324,6 +325,7 @@ def forward(self, waveform: Tensor, specgram: Tensor) -> Tensor:
         x = torch.cat([x, a4], dim=-1)
         x = self.fc2(x)
         x = self.relu2(x)
-        x = self.fc3(x).unsqueeze(1)
+        x = self.fc3(x)

-        return x
+        # bring back channel dimension
+        return x.unsqueeze(1)

From 0ed6da8990ad472e91ebc73a5d56da0863b71415 Mon Sep 17 00:00:00 2001
From: Ji Chen
Date: Tue, 7 Jul 2020 15:55:26 -0700
Subject: [PATCH 16/17] update docstring

---
 torchaudio/models/_wavernn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py
index de39f1a83f..7ddff8376a 100644
--- a/torchaudio/models/_wavernn.py
+++ b/torchaudio/models/_wavernn.py
@@ -223,7 +223,7 @@ class _WaveRNN(nn.Module):
         >>> # waveform shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length)
         >>> specgram = MelSpectrogram(sample_rate)(waveform) # shape: (n_batch, n_channel, n_freq, n_time)
         >>> output = wavernn(waveform, specgram)
-        >>> # output shape in 'waveform' mode: (n_batch, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits)
+        >>> # output shape in 'waveform' mode: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits)

From 1cb02fd6716f2de9d6ca7420453dce8a422fe5b0 Mon Sep 17 00:00:00 2001
From: Ji Chen
Date: Tue, 7 Jul 2020 16:01:53 -0700
Subject: [PATCH 17/17] update docstring

---
 torchaudio/models/_wavernn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py
index 
7ddff8376a..cd2e89a10c 100644 --- a/torchaudio/models/_wavernn.py +++ b/torchaudio/models/_wavernn.py @@ -223,7 +223,7 @@ class _WaveRNN(nn.Module): >>> # waveform shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length) >>> specgram = MelSpectrogram(sample_rate)(waveform) # shape: (n_batch, n_channel, n_freq, n_time) >>> output = wavernn(waveform, specgram) - >>> # output shape in 'waveform' mode: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits) + >>> # output shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits) """ def __init__(self,
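The end state of the series in one self-contained smoke test, mirroring the updated `test_waveform` case from PATCH 14 (shapes follow the final docstring; `mode='waveform'` is the default):

    import torch
    from torchaudio.models import _WaveRNN

    model = _WaveRNN(upsample_scales=[5, 5, 8], n_bits=9, sample_rate=24000,
                     hop_length=200, n_freq=100, n_output=256)
    n_batch, n_freq, n_time, kernel_size, hop_length = 2, 100, 200, 5, 200
    # Both inputs carry a channel dimension of 1, enforced by the asserts added
    # in PATCH 14; the output gets the channel dimension back at the end.
    waveform = torch.rand(n_batch, 1, hop_length * (n_time - kernel_size + 1))
    specgram = torch.rand(n_batch, 1, n_freq, n_time)
    out = model(waveform, specgram)
    assert out.size() == (n_batch, 1, hop_length * (n_time - kernel_size + 1), 2 ** 9)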