From 9aacb78631414e6e73895439da8b715d9f0cee35 Mon Sep 17 00:00:00 2001 From: Ji Chen Date: Wed, 17 Jun 2020 06:37:54 -0700 Subject: [PATCH 01/17] upsamplenetwork --- torchaudio/models/_wavernn.py | 95 ++++++++++++++++++++++++++++++++--- 1 file changed, 89 insertions(+), 6 deletions(-) diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py index 1df9eb0637..b81c304b10 100644 --- a/torchaudio/models/_wavernn.py +++ b/torchaudio/models/_wavernn.py @@ -139,15 +139,74 @@ class _UpsampleNetwork(nn.Module): >>> upsamplenetwork = _UpsampleNetwork(upsample_scales=[4, 4, 16]) >>> input = torch.rand(10, 128, 10) # a random spectrogram >>> output = upsamplenetwork(input) # shape: (10, 1536, 128), (10, 1536, 128) +======= + r"""This is a two-dimensional stretch layer. It is a block used in WaveRNN. WaveRNN is based on the paper + "Efficient Neural Audio Synthesis". Nal Kalchbrenner, Erich Elsen, Karen Simonyan, Seb Noury, + Norman Casagrande, Edward Lockhart, Florian Stimberg, Aaron van den Oord, Sander Dieleman, + Koray Kavukcuoglu. arXiv:1802.08435, 2018. + + Args: + x_scale: the scale factor in x axis (required). + y_scale: the scale factor in y axis (required). + + Examples:: + >>> stretch2d = _Stretch2d(x_scale=1, y_scale=1) + + >>> input = torch.rand(10, 1, 100, 512) + >>> output = stretch2d(input) """ def __init__(self, - upsample_scales: List[int], - n_res_block: int = 10, - n_freq: int = 128, - n_hidden: int = 128, - n_output: int = 128, - kernel_size: int = 5) -> None: + x_scale: int, + y_scale: int) -> None: + super().__init__() + + self.x_scale = x_scale + self.y_scale = y_scale + + def forward(self, x: Tensor) -> Tensor: + r"""Pass the input through the _Stretch2d layer. + + Args: + x: the input sequence to the _Stretch2d layer (required). + + Shape: + - x: :math:`(N, C, S, T)`. + - output: :math:`(N, C, S * y_scale, T * x_scale)`. + where N is the batch size, C is the channel size, S is the number of input sequence, + T is the length of input sequence. + """ + + n, c, s, t = x.size() + x = x.unsqueeze(-1).unsqueeze(3) + x = x.repeat(1, 1, 1, self.y_scale, 1, self.x_scale) + return x.view(n, c, s * self.y_scale, t * self.x_scale) + + +class _UpsampleNetwork(nn.Module): + r"""This is an upsample block based on a stack of Conv2d and Strech2d layers. + It is a block used in WaveRNN. WaveRNN is based on the paper "Efficient Neural Audio Synthesis". + Nal Kalchbrenner, Erich Elsen, Karen Simonyan, Seb Noury, Norman Casagrande, Edward Lockhart, + Florian Stimberg, Aaron van den Oord, Sander Dieleman, Koray Kavukcuoglu. arXiv:1802.08435, 2018. + + Args: + upsample_scales: the list of upsample scales (required). + res_blocks: the number of ResBlock in stack (default=10). + input_dims: the number of input sequence (default=100). + hidden_dims: the number of compute dimensions (default=128). + output_dims: the number of output sequence (default=128). + pad: the number of kernal size (pad * 2 + 1) in the first Conv1d layer (default=2). 
+ + Examples:: + >>> upsamplenetwork = _UpsampleNetwork(upsample_scales=[4, 4, 16], + res_blocks=10, + input_dims=100, + hidden_dims=128, + output_dims=128, + pad=2) + >>> input = torch.rand(10, 100, 512) + >>> output = upsamplenetwork(input) + super().__init__() total_scale = 1 @@ -156,6 +215,7 @@ def __init__(self, self.indent = (kernel_size - 1) // 2 * total_scale self.resnet = _MelResNet(n_res_block, n_freq, n_hidden, n_output, kernel_size) + self.resnet_stretch = _Stretch2d(total_scale, 1) up_layers = [] @@ -167,6 +227,7 @@ def __init__(self, padding=(0, scale), bias=False) conv.weight.data.fill_(1. / (scale * 2 + 1)) + up_layers.append(stretch) up_layers.append(conv) self.upsample_layers = nn.Sequential(*up_layers) @@ -192,3 +253,25 @@ def forward(self, specgram: Tensor) -> Tensor: upsampling_output = upsampling_output.squeeze(1)[:, :, self.indent:-self.indent] return upsampling_output, resnet_output + + def forward(self, x: Tensor) -> Tensor: + r"""Pass the input through the _UpsampleNetwork layer. + + Args: + x: the input sequence to the _UpsampleNetwork layer (required). + + Shape: + - x: :math:`(N, S, T)`. + - output: :math:`(N, (T - 2 * pad) * Total_Scale, S)`, `(N, (T - 2 * pad) * total_scale, P)`. + where N is the batch size, S is the number of input sequence, T is the length of input sequence. + P is the number of output sequence. Total_Scale is the product of all elements in upsample_scales. + """ + + resnet_output = self.resnet(x).unsqueeze(1) + resnet_output = self.resnet_stretch(resnet_output) + resnet_output = resnet_output.squeeze(1) + + upsampling_output = self.upsample_layers(x.unsqueeze(1)) + upsampling_output = upsampling_output.squeeze(1)[:, :, self.indent:-self.indent] + + return upsampling_output.transpose(1, 2), resnet_output.transpose(1, 2) From 2f91a9b3a5918471c2dcac0dbe5f839306f95e3f Mon Sep 17 00:00:00 2001 From: Ji Chen Date: Thu, 18 Jun 2020 12:33:46 -0700 Subject: [PATCH 02/17] update variable names --- torchaudio/models/_wavernn.py | 40 +++++++++++++++++------------------ 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py index b81c304b10..d01033a75d 100644 --- a/torchaudio/models/_wavernn.py +++ b/torchaudio/models/_wavernn.py @@ -146,8 +146,8 @@ class _UpsampleNetwork(nn.Module): Koray Kavukcuoglu. arXiv:1802.08435, 2018. Args: - x_scale: the scale factor in x axis (required). - y_scale: the scale factor in y axis (required). + x_scale: the scale factor in x axis (required) + y_scale: the scale factor in y axis (required) Examples:: >>> stretch2d = _Stretch2d(x_scale=1, y_scale=1) @@ -168,19 +168,17 @@ def forward(self, x: Tensor) -> Tensor: r"""Pass the input through the _Stretch2d layer. Args: - x: the input sequence to the _Stretch2d layer (required). + x: the input sequence to the _Stretch2d layer (required) Shape: - - x: :math:`(N, C, S, T)`. - - output: :math:`(N, C, S * y_scale, T * x_scale)`. - where N is the batch size, C is the channel size, S is the number of input sequence, - T is the length of input sequence. 
+ - x: :math:`(batch_size, channel, freq, time)` + - output: :math:`(batch_size, channel, freq * y_scale, time * x_scale)` """ - n, c, s, t = x.size() + batch_size, channel, freq, time = x.size() x = x.unsqueeze(-1).unsqueeze(3) x = x.repeat(1, 1, 1, self.y_scale, 1, self.x_scale) - return x.view(n, c, s * self.y_scale, t * self.x_scale) + return x.view(batch_size, channel, freq * self.y_scale, time * self.x_scale) class _UpsampleNetwork(nn.Module): @@ -190,12 +188,12 @@ class _UpsampleNetwork(nn.Module): Florian Stimberg, Aaron van den Oord, Sander Dieleman, Koray Kavukcuoglu. arXiv:1802.08435, 2018. Args: - upsample_scales: the list of upsample scales (required). - res_blocks: the number of ResBlock in stack (default=10). - input_dims: the number of input sequence (default=100). - hidden_dims: the number of compute dimensions (default=128). - output_dims: the number of output sequence (default=128). - pad: the number of kernal size (pad * 2 + 1) in the first Conv1d layer (default=2). + upsample_scales: the list of upsample scales (required) + res_blocks: the number of ResBlock in stack (default=10) + input_dims: the number of input sequence (default=100) + hidden_dims: the number of compute dimensions (default=128) + output_dims: the number of output sequence (default=128) + pad: the kernel size (kernel_size = pad * 2 + 1) in the first Conv1d layer (default=2) Examples:: >>> upsamplenetwork = _UpsampleNetwork(upsample_scales=[4, 4, 16], @@ -258,20 +256,20 @@ def forward(self, x: Tensor) -> Tensor: r"""Pass the input through the _UpsampleNetwork layer. Args: - x: the input sequence to the _UpsampleNetwork layer (required). + x: the input sequence to the _UpsampleNetwork layer (required) Shape: - - x: :math:`(N, S, T)`. - - output: :math:`(N, (T - 2 * pad) * Total_Scale, S)`, `(N, (T - 2 * pad) * total_scale, P)`. - where N is the batch size, S is the number of input sequence, T is the length of input sequence. - P is the number of output sequence. Total_Scale is the product of all elements in upsample_scales. + - x: :math:`(batch_size, freq, time)` + - output: :math:`(batch_size, (time - 2 * pad) * total_scale, freq)`, `(batch_size, (time - 2 * pad) * total_scale, output_dims)` + where total_scale is the product of all elements in upsample_scales. 
""" resnet_output = self.resnet(x).unsqueeze(1) resnet_output = self.resnet_stretch(resnet_output) resnet_output = resnet_output.squeeze(1) - upsampling_output = self.upsample_layers(x.unsqueeze(1)) + x = x.unsqueeze(1) + upsampling_output = self.upsample_layers(x) upsampling_output = upsampling_output.squeeze(1)[:, :, self.indent:-self.indent] return upsampling_output.transpose(1, 2), resnet_output.transpose(1, 2) From 63391d6325575bb6e4f4ebc5ef1805bfb9d69ab6 Mon Sep 17 00:00:00 2001 From: Ji Chen Date: Thu, 18 Jun 2020 12:39:51 -0700 Subject: [PATCH 03/17] update variable name --- torchaudio/models/_wavernn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py index d01033a75d..df95f42a8e 100644 --- a/torchaudio/models/_wavernn.py +++ b/torchaudio/models/_wavernn.py @@ -260,7 +260,8 @@ def forward(self, x: Tensor) -> Tensor: Shape: - x: :math:`(batch_size, freq, time)` - - output: :math:`(batch_size, (time - 2 * pad) * total_scale, freq)`, `(batch_size, (time - 2 * pad) * total_scale, output_dims)` + - output: :math:`(batch_size, (time - 2 * pad) * total_scale, freq)`, + `(batch_size, (time - 2 * pad) * total_scale, output_dims)` where total_scale is the product of all elements in upsample_scales. """ From 0346f236946b86f1f75cce40e81887902691b1e1 Mon Sep 17 00:00:00 2001 From: Ji Chen Date: Sun, 21 Jun 2020 08:57:53 -0700 Subject: [PATCH 04/17] add wavernn model --- test/test_models.py | 4 +- torchaudio/models/_wavernn.py | 197 ++++++++++++++++++++++++++++------ 2 files changed, 166 insertions(+), 35 deletions(-) diff --git a/test/test_models.py b/test/test_models.py index 519fbc7b26..dd30b6e879 100644 --- a/test/test_models.py +++ b/test/test_models.py @@ -1,5 +1,7 @@ +import unittest + import torch -from torchaudio.models import Wav2Letter, _MelResNet, _UpsampleNetwork +from torchaudio.models import Wav2Letter, _MelResNet, _UpsampleNetwork, _WaveRNN from . import common_utils diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py index df95f42a8e..e17d2f5342 100644 --- a/torchaudio/models/_wavernn.py +++ b/torchaudio/models/_wavernn.py @@ -1,9 +1,10 @@ from typing import List +import torch from torch import Tensor from torch import nn -__all__ = ["_ResBlock", "_MelResNet", "_Stretch2d", "_UpsampleNetwork"] +__all__ = ["_ResBlock", "_MelResNet", "_Stretch2d", "_UpsampleNetwork", "_WaveRNN"] class _ResBlock(nn.Module): @@ -90,6 +91,7 @@ def forward(self, specgram: Tensor) -> Tensor: class _Stretch2d(nn.Module): +<<<<<<< HEAD r"""Upscale the frequency and time dimensions of a spectrogram. Args: @@ -144,13 +146,16 @@ class _UpsampleNetwork(nn.Module): "Efficient Neural Audio Synthesis". Nal Kalchbrenner, Erich Elsen, Karen Simonyan, Seb Noury, Norman Casagrande, Edward Lockhart, Florian Stimberg, Aaron van den Oord, Sander Dieleman, Koray Kavukcuoglu. arXiv:1802.08435, 2018. +======= + r"""This is a two-dimensional stretch layer. It is a block used in WaveRNN. 
+>>>>>>> add wavernn model Args: - x_scale: the scale factor in x axis (required) - y_scale: the scale factor in y axis (required) + x_scale: the scale factor in x axis + y_scale: the scale factor in y axis Examples:: - >>> stretch2d = _Stretch2d(x_scale=1, y_scale=1) + >>> stretch2d = _Stretch2d(x_scale=10, y_scale=10) >>> input = torch.rand(10, 1, 100, 512) >>> output = stretch2d(input) @@ -165,46 +170,50 @@ def __init__(self, self.y_scale = y_scale def forward(self, x: Tensor) -> Tensor: - r"""Pass the input through the _Stretch2d layer. - + r""" Args: - x: the input sequence to the _Stretch2d layer (required) + x: the input sequence to the _Stretch2d layer Shape: - - x: :math:`(batch_size, channel, freq, time)` - - output: :math:`(batch_size, channel, freq * y_scale, time * x_scale)` + - x: :math:`(..., freq, time)` + - output: :math:`(..., freq * y_scale, time * x_scale)` """ - batch_size, channel, freq, time = x.size() - x = x.unsqueeze(-1).unsqueeze(3) - x = x.repeat(1, 1, 1, self.y_scale, 1, self.x_scale) - return x.view(batch_size, channel, freq * self.y_scale, time * self.x_scale) + return x.repeat_interleave(self.y_scale, 2).repeat_interleave(self.x_scale, 3) class _UpsampleNetwork(nn.Module): r"""This is an upsample block based on a stack of Conv2d and Strech2d layers. - It is a block used in WaveRNN. WaveRNN is based on the paper "Efficient Neural Audio Synthesis". - Nal Kalchbrenner, Erich Elsen, Karen Simonyan, Seb Noury, Norman Casagrande, Edward Lockhart, - Florian Stimberg, Aaron van den Oord, Sander Dieleman, Koray Kavukcuoglu. arXiv:1802.08435, 2018. + It is a block used in WaveRNN. Args: - upsample_scales: the list of upsample scales (required) - res_blocks: the number of ResBlock in stack (default=10) - input_dims: the number of input sequence (default=100) - hidden_dims: the number of compute dimensions (default=128) - output_dims: the number of output sequence (default=128) - pad: the kernel size (kernel_size = pad * 2 + 1) in the first Conv1d layer (default=2) + upsample_scales: the list of upsample scales + n_res_block: the number of ResBlock in stack (default=10) + n_freq: the number of bins in a spectrogram (default=128) + n_hidden: the number of hidden dimensions (default=128) + n_output: the number of output dimensions (default=128) + kernel_size: the number of kernel size in the first Conv1d layer (default=5) Examples:: >>> upsamplenetwork = _UpsampleNetwork(upsample_scales=[4, 4, 16], - res_blocks=10, - input_dims=100, - hidden_dims=128, - output_dims=128, - pad=2) - >>> input = torch.rand(10, 100, 512) + n_res_block=10, + n_freq=128, + n_hidden=128, + n_output=128, + kernel_size=5) + >>> input = torch.rand(10, 128, 512) >>> output = upsamplenetwork(input) +<<<<<<< HEAD +======= + def __init__(self, + upsample_scales: List[int], + n_res_block: int = 10, + n_freq: int = 128, + n_hidden: int = 128, + n_output: int = 128, + kernel_size: int = 5) -> None: +>>>>>>> add wavernn model super().__init__() total_scale = 1 @@ -213,7 +222,10 @@ class _UpsampleNetwork(nn.Module): self.indent = (kernel_size - 1) // 2 * total_scale self.resnet = _MelResNet(n_res_block, n_freq, n_hidden, n_output, kernel_size) +<<<<<<< HEAD +======= +>>>>>>> add wavernn model self.resnet_stretch = _Stretch2d(total_scale, 1) up_layers = [] @@ -253,15 +265,14 @@ def forward(self, specgram: Tensor) -> Tensor: return upsampling_output, resnet_output def forward(self, x: Tensor) -> Tensor: - r"""Pass the input through the _UpsampleNetwork layer. 
- + r""" Args: - x: the input sequence to the _UpsampleNetwork layer (required) + x: the input sequence to the _UpsampleNetwork layer Shape: - - x: :math:`(batch_size, freq, time)` - - output: :math:`(batch_size, (time - 2 * pad) * total_scale, freq)`, - `(batch_size, (time - 2 * pad) * total_scale, output_dims)` + - x: :math:`(batch, freq, time)`. + - output: :math:`(batch, (time - kernel_size + 1) * total_scale, freq)`, + `(batch, (time - kernel_size + 1) * total_scale, n_output)` where total_scale is the product of all elements in upsample_scales. """ @@ -274,3 +285,121 @@ def forward(self, x: Tensor) -> Tensor: upsampling_output = upsampling_output.squeeze(1)[:, :, self.indent:-self.indent] return upsampling_output.transpose(1, 2), resnet_output.transpose(1, 2) + + +class _WaveRNN(nn.Module): + r""" + Args: + upsample_scales: the list of upsample scales + n_bits: the bits of output waveform + sample_rate: the rate of audio dimensions (samples per second) + hop_length: the number of samples between the starts of consecutive frames + n_res_block: the number of ResBlock in stack (default=10) + n_rnn: the dimension of RNN layer (default=512) + n_fc: the dimension of fully connected layer (default=512) + kernel_size: the number of kernel size in the first Conv1d layer (default=5) + n_freq: the number of bins in a spectrogram (default=128) + n_hidden: the number of hidden dimensions (default=128) + n_output: the number of output dimensions (default=128) + mode: the type of input waveform (default='RAW') + + Examples:: + >>> upsamplenetwork = _waveRNN(upsample_scales=[5,5,8], + n_bits=9, + sample_rate=24000, + hop_length=200, + n_res_block=10, + n_rnn=512, + n_fc=512, + kernel_size=5, + n_freq=128, + n_hidden=128, + n_output=128, + mode='RAW') + >>> x = torch.rand(10, 24800, 512) + >>> mels = torch.rand(10, 128, 512) + >>> output = upsamplenetwork(x, mels) + """ + + def __init__(self, + upsample_scales: List[int], + n_bits: int, + sample_rate: int, + hop_length: int, + n_res_block: int = 10, + n_rnn: int = 512, + n_fc: int = 512, + kernel_size: int = 5, + n_freq: int = 128, + n_hidden: int = 128, + n_output: int = 128, + mode: str = 'RAW') -> None: + super().__init__() + + self.mode = mode + self.kernel_size = kernel_size + + if self.mode == 'RAW': + self.n_classes = 2 ** n_bits + elif self.mode == 'MOL': + self.n_classes = 30 + + self.n_rnn = n_rnn + self.n_aux = n_output // 4 + self.hop_length = hop_length + self.sample_rate = sample_rate + + self.upsample = _UpsampleNetwork(upsample_scales, n_res_block, n_freq, n_hidden, n_output, kernel_size) + self.fc = nn.Linear(n_freq + self.n_aux + 1, n_rnn) + + self.rnn1 = nn.GRU(n_rnn, n_rnn, batch_first=True) + self.rnn2 = nn.GRU(n_rnn + self.n_aux, n_rnn, batch_first=True) + + self.relu1 = nn.ReLU(inplace=True) + self.relu2 = nn.ReLU(inplace=True) + + self.fc1 = nn.Linear(n_rnn + self.n_aux, n_fc) + self.fc2 = nn.Linear(n_fc + self.n_aux, n_fc) + self.fc3 = nn.Linear(n_fc, self.n_classes) + + def forward(self, x: Tensor, mels: Tensor) -> Tensor: + r""" + Args: + x: the input waveform to the _WaveRNN layer + mels: the input mel-spectrogram to the _WaveRNN layer + + Shape: + - x: :math:`(batch, time)` + - mels: :math:`(batch, freq, time_mels)` + - output: :math:`(batch, time, 2 ** n_bits)` + """ + + batch_size = x.size(0) + h1 = torch.zeros(1, batch_size, self.n_rnn, device=x.device) + h2 = torch.zeros(1, batch_size, self.n_rnn, device=x.device) + mels, aux = self.upsample(mels) + + aux_idx = [self.n_aux * i for i in range(5)] + a1 = aux[:, :, 
aux_idx[0]:aux_idx[1]] + a2 = aux[:, :, aux_idx[1]:aux_idx[2]] + a3 = aux[:, :, aux_idx[2]:aux_idx[3]] + a4 = aux[:, :, aux_idx[3]:aux_idx[4]] + + x = torch.cat([x.unsqueeze(-1), mels, a1], dim=2) + x = self.fc(x) + res = x + x, _ = self.rnn1(x, h1) + + x = x + res + res = x + x = torch.cat([x, a2], dim=2) + x, _ = self.rnn2(x, h2) + + x = x + res + x = torch.cat([x, a3], dim=2) + x = self.relu1(self.fc1(x)) + + x = torch.cat([x, a4], dim=2) + x = self.relu2(self.fc2(x)) + + return self.fc3(x) From 27e26aabc1a65ad43981ad8e0f8b9c626b903df5 Mon Sep 17 00:00:00 2001 From: Ji Chen Date: Tue, 23 Jun 2020 09:41:15 -0700 Subject: [PATCH 05/17] update test --- test/test_models.py | 2 - torchaudio/models/_wavernn.py | 213 +--------------------------------- 2 files changed, 1 insertion(+), 214 deletions(-) diff --git a/test/test_models.py b/test/test_models.py index dd30b6e879..2530b64951 100644 --- a/test/test_models.py +++ b/test/test_models.py @@ -1,5 +1,3 @@ -import unittest - import torch from torchaudio.models import Wav2Letter, _MelResNet, _UpsampleNetwork, _WaveRNN diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py index e17d2f5342..1df9eb0637 100644 --- a/torchaudio/models/_wavernn.py +++ b/torchaudio/models/_wavernn.py @@ -1,10 +1,9 @@ from typing import List -import torch from torch import Tensor from torch import nn -__all__ = ["_ResBlock", "_MelResNet", "_Stretch2d", "_UpsampleNetwork", "_WaveRNN"] +__all__ = ["_ResBlock", "_MelResNet", "_Stretch2d", "_UpsampleNetwork"] class _ResBlock(nn.Module): @@ -91,7 +90,6 @@ def forward(self, specgram: Tensor) -> Tensor: class _Stretch2d(nn.Module): -<<<<<<< HEAD r"""Upscale the frequency and time dimensions of a spectrogram. Args: @@ -141,71 +139,8 @@ class _UpsampleNetwork(nn.Module): >>> upsamplenetwork = _UpsampleNetwork(upsample_scales=[4, 4, 16]) >>> input = torch.rand(10, 128, 10) # a random spectrogram >>> output = upsamplenetwork(input) # shape: (10, 1536, 128), (10, 1536, 128) -======= - r"""This is a two-dimensional stretch layer. It is a block used in WaveRNN. WaveRNN is based on the paper - "Efficient Neural Audio Synthesis". Nal Kalchbrenner, Erich Elsen, Karen Simonyan, Seb Noury, - Norman Casagrande, Edward Lockhart, Florian Stimberg, Aaron van den Oord, Sander Dieleman, - Koray Kavukcuoglu. arXiv:1802.08435, 2018. -======= - r"""This is a two-dimensional stretch layer. It is a block used in WaveRNN. ->>>>>>> add wavernn model - - Args: - x_scale: the scale factor in x axis - y_scale: the scale factor in y axis - - Examples:: - >>> stretch2d = _Stretch2d(x_scale=10, y_scale=10) - - >>> input = torch.rand(10, 1, 100, 512) - >>> output = stretch2d(input) """ - def __init__(self, - x_scale: int, - y_scale: int) -> None: - super().__init__() - - self.x_scale = x_scale - self.y_scale = y_scale - - def forward(self, x: Tensor) -> Tensor: - r""" - Args: - x: the input sequence to the _Stretch2d layer - - Shape: - - x: :math:`(..., freq, time)` - - output: :math:`(..., freq * y_scale, time * x_scale)` - """ - - return x.repeat_interleave(self.y_scale, 2).repeat_interleave(self.x_scale, 3) - - -class _UpsampleNetwork(nn.Module): - r"""This is an upsample block based on a stack of Conv2d and Strech2d layers. - It is a block used in WaveRNN. 
- - Args: - upsample_scales: the list of upsample scales - n_res_block: the number of ResBlock in stack (default=10) - n_freq: the number of bins in a spectrogram (default=128) - n_hidden: the number of hidden dimensions (default=128) - n_output: the number of output dimensions (default=128) - kernel_size: the number of kernel size in the first Conv1d layer (default=5) - - Examples:: - >>> upsamplenetwork = _UpsampleNetwork(upsample_scales=[4, 4, 16], - n_res_block=10, - n_freq=128, - n_hidden=128, - n_output=128, - kernel_size=5) - >>> input = torch.rand(10, 128, 512) - >>> output = upsamplenetwork(input) - -<<<<<<< HEAD -======= def __init__(self, upsample_scales: List[int], n_res_block: int = 10, @@ -213,7 +148,6 @@ def __init__(self, n_hidden: int = 128, n_output: int = 128, kernel_size: int = 5) -> None: ->>>>>>> add wavernn model super().__init__() total_scale = 1 @@ -222,10 +156,6 @@ def __init__(self, self.indent = (kernel_size - 1) // 2 * total_scale self.resnet = _MelResNet(n_res_block, n_freq, n_hidden, n_output, kernel_size) -<<<<<<< HEAD - -======= ->>>>>>> add wavernn model self.resnet_stretch = _Stretch2d(total_scale, 1) up_layers = [] @@ -237,7 +167,6 @@ def __init__(self, padding=(0, scale), bias=False) conv.weight.data.fill_(1. / (scale * 2 + 1)) - up_layers.append(stretch) up_layers.append(conv) self.upsample_layers = nn.Sequential(*up_layers) @@ -263,143 +192,3 @@ def forward(self, specgram: Tensor) -> Tensor: upsampling_output = upsampling_output.squeeze(1)[:, :, self.indent:-self.indent] return upsampling_output, resnet_output - - def forward(self, x: Tensor) -> Tensor: - r""" - Args: - x: the input sequence to the _UpsampleNetwork layer - - Shape: - - x: :math:`(batch, freq, time)`. - - output: :math:`(batch, (time - kernel_size + 1) * total_scale, freq)`, - `(batch, (time - kernel_size + 1) * total_scale, n_output)` - where total_scale is the product of all elements in upsample_scales. 
- """ - - resnet_output = self.resnet(x).unsqueeze(1) - resnet_output = self.resnet_stretch(resnet_output) - resnet_output = resnet_output.squeeze(1) - - x = x.unsqueeze(1) - upsampling_output = self.upsample_layers(x) - upsampling_output = upsampling_output.squeeze(1)[:, :, self.indent:-self.indent] - - return upsampling_output.transpose(1, 2), resnet_output.transpose(1, 2) - - -class _WaveRNN(nn.Module): - r""" - Args: - upsample_scales: the list of upsample scales - n_bits: the bits of output waveform - sample_rate: the rate of audio dimensions (samples per second) - hop_length: the number of samples between the starts of consecutive frames - n_res_block: the number of ResBlock in stack (default=10) - n_rnn: the dimension of RNN layer (default=512) - n_fc: the dimension of fully connected layer (default=512) - kernel_size: the number of kernel size in the first Conv1d layer (default=5) - n_freq: the number of bins in a spectrogram (default=128) - n_hidden: the number of hidden dimensions (default=128) - n_output: the number of output dimensions (default=128) - mode: the type of input waveform (default='RAW') - - Examples:: - >>> upsamplenetwork = _waveRNN(upsample_scales=[5,5,8], - n_bits=9, - sample_rate=24000, - hop_length=200, - n_res_block=10, - n_rnn=512, - n_fc=512, - kernel_size=5, - n_freq=128, - n_hidden=128, - n_output=128, - mode='RAW') - >>> x = torch.rand(10, 24800, 512) - >>> mels = torch.rand(10, 128, 512) - >>> output = upsamplenetwork(x, mels) - """ - - def __init__(self, - upsample_scales: List[int], - n_bits: int, - sample_rate: int, - hop_length: int, - n_res_block: int = 10, - n_rnn: int = 512, - n_fc: int = 512, - kernel_size: int = 5, - n_freq: int = 128, - n_hidden: int = 128, - n_output: int = 128, - mode: str = 'RAW') -> None: - super().__init__() - - self.mode = mode - self.kernel_size = kernel_size - - if self.mode == 'RAW': - self.n_classes = 2 ** n_bits - elif self.mode == 'MOL': - self.n_classes = 30 - - self.n_rnn = n_rnn - self.n_aux = n_output // 4 - self.hop_length = hop_length - self.sample_rate = sample_rate - - self.upsample = _UpsampleNetwork(upsample_scales, n_res_block, n_freq, n_hidden, n_output, kernel_size) - self.fc = nn.Linear(n_freq + self.n_aux + 1, n_rnn) - - self.rnn1 = nn.GRU(n_rnn, n_rnn, batch_first=True) - self.rnn2 = nn.GRU(n_rnn + self.n_aux, n_rnn, batch_first=True) - - self.relu1 = nn.ReLU(inplace=True) - self.relu2 = nn.ReLU(inplace=True) - - self.fc1 = nn.Linear(n_rnn + self.n_aux, n_fc) - self.fc2 = nn.Linear(n_fc + self.n_aux, n_fc) - self.fc3 = nn.Linear(n_fc, self.n_classes) - - def forward(self, x: Tensor, mels: Tensor) -> Tensor: - r""" - Args: - x: the input waveform to the _WaveRNN layer - mels: the input mel-spectrogram to the _WaveRNN layer - - Shape: - - x: :math:`(batch, time)` - - mels: :math:`(batch, freq, time_mels)` - - output: :math:`(batch, time, 2 ** n_bits)` - """ - - batch_size = x.size(0) - h1 = torch.zeros(1, batch_size, self.n_rnn, device=x.device) - h2 = torch.zeros(1, batch_size, self.n_rnn, device=x.device) - mels, aux = self.upsample(mels) - - aux_idx = [self.n_aux * i for i in range(5)] - a1 = aux[:, :, aux_idx[0]:aux_idx[1]] - a2 = aux[:, :, aux_idx[1]:aux_idx[2]] - a3 = aux[:, :, aux_idx[2]:aux_idx[3]] - a4 = aux[:, :, aux_idx[3]:aux_idx[4]] - - x = torch.cat([x.unsqueeze(-1), mels, a1], dim=2) - x = self.fc(x) - res = x - x, _ = self.rnn1(x, h1) - - x = x + res - res = x - x = torch.cat([x, a2], dim=2) - x, _ = self.rnn2(x, h2) - - x = x + res - x = torch.cat([x, a3], dim=2) - x = 
self.relu1(self.fc1(x))
-
-        x = torch.cat([x, a4], dim=2)
-        x = self.relu2(self.fc2(x))
-
-        return self.fc3(x)
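The `unsqueeze`/`repeat`/`view` stretch used through PATCH 01 and the `repeat_interleave` version that PATCH 04 switched to (both reverted by the patch above) compute the same result. A minimal sketch checking the equivalence on a small tensor (the sizes here are illustrative only):

    import torch

    # unsqueeze/repeat/view (PATCH 01) versus repeat_interleave (PATCH 04):
    # both repeat every row y_scale times and every column x_scale times.
    x = torch.arange(6.).view(1, 1, 2, 3)
    y_scale, x_scale = 2, 3
    n, c, s, t = x.size()
    a = x.unsqueeze(-1).unsqueeze(3)
    a = a.repeat(1, 1, 1, y_scale, 1, x_scale)
    a = a.view(n, c, s * y_scale, t * x_scale)
    b = x.repeat_interleave(y_scale, 2).repeat_interleave(x_scale, 3)
    assert torch.equal(a, b)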
From 6981d1c597afa1b38690f870abf3f0b44e86da78 Mon Sep 17 00:00:00 2001
From: Ji Chen
Date: Wed, 24 Jun 2020 20:11:52 -0700
Subject: [PATCH 06/17] update format

---
 test/test_models.py           |  33 +++++++++
 torchaudio/models/_wavernn.py | 122 ++++++++++++++++++++++++++++++++++
 2 files changed, 155 insertions(+)

diff --git a/test/test_models.py b/test/test_models.py
index 2530b64951..15b9390906 100644
--- a/test/test_models.py
+++ b/test/test_models.py
@@ -81,3 +81,36 @@ def test_waveform(self):

         assert out1.size() == (n_batch, n_freq, total_scale * (n_time - kernel_size + 1))
         assert out2.size() == (n_batch, n_output, total_scale * (n_time - kernel_size + 1))
+
+
+class TestWaveRNN(common_utils.TorchaudioTestCase):
+
+    def test_waveform(self):
+        """
+        Create a tensor as the input of _WaveRNN model
+        and test if the output dimensions are correct.
+        """
+
+        upsample_scales = [5, 5, 8]
+        n_rnn = 512
+        n_fc = 512
+        n_bits = 9
+        sample_rate = 24000
+        hop_length = 200
+        batch_size = 2
+        n_time = 200
+        n_freq = 100
+        n_output = 256
+        n_res_block = 10
+        n_hidden = 128
+        kernel_size = 5
+        mode = 'RAW'
+
+        model = _WaveRNN(upsample_scales, n_bits, sample_rate, hop_length, n_res_block,
+                         n_rnn, n_fc, kernel_size, n_freq, n_hidden, n_output, mode)
+
+        x = torch.rand(batch_size, hop_length * (n_time - kernel_size + 1))
+        mels = torch.rand(batch_size, n_freq, n_time)
+        out = model(x, mels)
+
+        assert out.size() == (batch_size, hop_length * (n_time - kernel_size + 1), 2 ** n_bits)
diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py
index 1df9eb0637..5ba641e8ab 100644
--- a/torchaudio/models/_wavernn.py
+++ b/torchaudio/models/_wavernn.py
@@ -192,3 +192,125 @@ def forward(self, specgram: Tensor) -> Tensor:
         upsampling_output = upsampling_output.squeeze(1)[:, :, self.indent:-self.indent]

         return upsampling_output, resnet_output
+
+
+class _WaveRNN(nn.Module):
+    r"""WaveRNN model based on
+    `"Efficient Neural Audio Synthesis" <https://arxiv.org/pdf/1802.08435.pdf>`_
+
+    Args:
+        upsample_scales: the list of upsample scales
+        n_bits: the bits of output waveform
+        sample_rate: the rate of audio dimensions (samples per second)
+        hop_length: the number of samples between the starts of consecutive frames
+        n_res_block: the number of ResBlock in stack (default=10)
+        n_rnn: the dimension of RNN layer (default=512)
+        n_fc: the dimension of fully connected layer (default=512)
+        kernel_size: the number of kernel size in the first Conv1d layer (default=5)
+        n_freq: the number of bins in a spectrogram (default=128)
+        n_hidden: the number of hidden dimensions (default=128)
+        n_output: the number of output dimensions (default=128)
+        mode: the type of input waveform in ['RAW', 'MOL'] (default='RAW')
+
+    Examples::
+        >>> upsamplenetwork = _waveRNN(upsample_scales=[5,5,8],
                                        n_bits=9,
                                        sample_rate=24000,
                                        hop_length=200,
                                        n_res_block=10,
                                        n_rnn=512,
                                        n_fc=512,
                                        kernel_size=5,
                                        n_freq=128,
                                        n_hidden=128,
                                        n_output=128,
                                        mode='RAW')
+        >>> x = torch.rand(10, 24800, 512)
+        >>> mels = torch.rand(10, 128, 512)
+        >>> output = upsamplenetwork(x, mels)
+    """
+
+    def __init__(self,
+                 upsample_scales: List[int],
+                 n_bits: int,
+                 sample_rate: int,
+                 hop_length: int,
+                 n_res_block: int = 10,
+                 n_rnn: int = 512,
+                 n_fc: int = 512,
+                 kernel_size: int = 5,
+                 n_freq: int = 128,
+                 n_hidden: int = 128,
+                 n_output: int = 128,
+                 mode: str = 'RAW') -> None:
+        super().__init__()
+
+        self.mode = mode
+        self.kernel_size = kernel_size
+
+        if self.mode == 'RAW':
+            self.n_classes = 2 ** n_bits
+        elif self.mode == 'MOL':
+            self.n_classes = 30
+        else:
+            raise ValueError("Unknown input mode - {}".format(self.mode))
+
+        self.n_rnn = n_rnn
+        self.n_aux = n_output // 4
+        self.hop_length = hop_length
+        self.sample_rate = sample_rate
+
+        self.upsample = _UpsampleNetwork(upsample_scales, n_res_block, n_freq, n_hidden, n_output, kernel_size)
+        self.fc = nn.Linear(n_freq + self.n_aux + 1, n_rnn)
+
+        self.rnn1 = nn.GRU(n_rnn, n_rnn, batch_first=True)
+        self.rnn2 = nn.GRU(n_rnn + self.n_aux, n_rnn, batch_first=True)
+
+        self.relu1 = nn.ReLU(inplace=True)
+        self.relu2 = nn.ReLU(inplace=True)
+
+        self.fc1 = nn.Linear(n_rnn + self.n_aux, n_fc)
+        self.fc2 = nn.Linear(n_fc + self.n_aux, n_fc)
+        self.fc3 = nn.Linear(n_fc, self.n_classes)
+
+    def forward(self, x: Tensor, mels: Tensor) -> Tensor:
+        r"""
+        Args:
+            x: the input waveform to the _WaveRNN layer
+            mels: the input mel-spectrogram to the _WaveRNN layer
+
+        Shape:
+            - x: :math:`(batch, time)`
+            - mels: :math:`(batch, freq, time_mels)`
+            - output: :math:`(batch, time, 2 ** n_bits)`
+        """
+
+        batch_size = x.size(0)
+        h1 = torch.zeros(1, batch_size, self.n_rnn, dtype=x.dtype, device=x.device)
+        h2 = torch.zeros(1, batch_size, self.n_rnn, dtype=x.dtype, device=x.device)
+        mels, aux = self.upsample(mels)
+
+        aux_idx = [self.n_aux * i for i in range(5)]
+        a1 = aux[:, :, aux_idx[0]:aux_idx[1]]
+        a2 = aux[:, :, aux_idx[1]:aux_idx[2]]
+        a3 = aux[:, :, aux_idx[2]:aux_idx[3]]
+        a4 = aux[:, :, aux_idx[3]:aux_idx[4]]
+
+        x = torch.cat([x.unsqueeze(-1), mels, a1], dim=2)
+        x = self.fc(x)
+        res = x
+        x, _ = self.rnn1(x, h1)
+
+        x = x + res
+        res = x
+        x = torch.cat([x, a2], dim=2)
+        x, _ = self.rnn2(x, h2)
+
+        x = x + res
+        x = torch.cat([x, a3], dim=2)
+        x = self.relu1(self.fc1(x))
+
+        x = torch.cat([x, a4], dim=2)
+        x = self.relu2(self.fc2(x))
+
+        return self.fc3(x)
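A condensed version of the `TestWaveRNN.test_waveform` case added above, relying on defaults for the remaining constructor arguments (the keyword usage and the final assert mirror the test; the sizes are the test's, not requirements, except that the waveform length must equal `(n_time - kernel_size + 1)` times the product of `upsample_scales`):

    import torch
    from torchaudio.models import _WaveRNN

    # hop_length equals the product of upsample_scales: 5 * 5 * 8 = 200.
    model = _WaveRNN(upsample_scales=[5, 5, 8], n_bits=9, sample_rate=24000,
                     hop_length=200, n_freq=100, n_output=256)
    n_batch, n_freq, n_time, kernel_size, hop_length = 2, 100, 200, 5, 200
    x = torch.rand(n_batch, hop_length * (n_time - kernel_size + 1))
    mels = torch.rand(n_batch, n_freq, n_time)
    out = model(x, mels)
    assert out.size() == (n_batch, hop_length * (n_time - kernel_size + 1), 2 ** 9)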
""" upsample_scales = [5, 5, 8] @@ -97,20 +123,20 @@ def test_waveform(self): n_bits = 9 sample_rate = 24000 hop_length = 200 - batch_size = 2 + n_batch = 2 n_time = 200 n_freq = 100 n_output = 256 n_res_block = 10 n_hidden = 128 kernel_size = 5 - mode = 'RAW' + mode = 'mol' model = _WaveRNN(upsample_scales, n_bits, sample_rate, hop_length, n_res_block, n_rnn, n_fc, kernel_size, n_freq, n_hidden, n_output, mode) - x = torch.rand(batch_size, hop_length * (n_time - kernel_size + 1)) - mels = torch.rand(batch_size, n_freq, n_time) + x = torch.rand(n_batch, hop_length * (n_time - kernel_size + 1)) + mels = torch.rand(n_batch, n_freq, n_time) out = model(x, mels) - assert out.size() == (batch_size, hop_length * (n_time - kernel_size + 1), 2 ** n_bits) + assert out.size() == (n_batch, hop_length * (n_time - kernel_size + 1), 30) diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py index 5ba641e8ab..1b8a90bfaa 100644 --- a/torchaudio/models/_wavernn.py +++ b/torchaudio/models/_wavernn.py @@ -195,8 +195,10 @@ def forward(self, specgram: Tensor) -> Tensor: class _WaveRNN(nn.Module): - r"""WaveRNN model based on - `"Efficient Neural Audio Synthesis" `_ + r"""WaveRNN model based on "Efficient Neural Audio Synthesis". + + The paper link is https://arxiv.org/pdf/1802.08435.pdf. The input signals are waveform + and spectrogram. The input channels of waveform and spectrogram have to be 1. Args: upsample_scales: the list of upsample scales @@ -210,24 +212,15 @@ class _WaveRNN(nn.Module): n_freq: the number of bins in a spectrogram (default=128) n_hidden: the number of hidden dimensions (default=128) n_output: the number of output dimensions (default=128) - mode: the type of input waveform in ['RAW', 'MOL'] (default='RAW') - - Examples:: - >>> upsamplenetwork = _waveRNN(upsample_scales=[5,5,8], - n_bits=9, - sample_rate=24000, - hop_length=200, - n_res_block=10, - n_rnn=512, - n_fc=512, - kernel_size=5, - n_freq=128, - n_hidden=128, - n_output=128, - mode='RAW') - >>> x = torch.rand(10, 24800, 512) - >>> mels = torch.rand(10, 128, 512) - >>> output = upsamplenetwork(x, mels) + mode: the type of input waveform in ['waveform', 'mol'] (default='waveform') + + Examples + >>> wavernn = _waveRNN(upsample_scales=[5,5,8], n_bits=9, sample_rate=24000, hop_length=200) + >>> waveform, sample_rate = torchaudio.load(file) # waveform shape: + >>> (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length) + >>> specgram = MelSpectrogram(sample_rate)(waveform) # shape: (n_batch, n_channel, n_freq, n_time) + >>> output = wavernn(waveform.squeeze(1), specgram.squeeze(1)) # shape: + >>> (n_batch, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits) """ def __init__(self, @@ -242,24 +235,30 @@ def __init__(self, n_freq: int = 128, n_hidden: int = 128, n_output: int = 128, - mode: str = 'RAW') -> None: + mode: str = 'waveform') -> None: super().__init__() self.mode = mode self.kernel_size = kernel_size - if self.mode == 'RAW': + if self.mode == 'waveform': self.n_classes = 2 ** n_bits - elif self.mode == 'MOL': + elif self.mode == 'mol': self.n_classes = 30 else: - raise ValueError("Unknown input mode - {}".format(self.mode)) + raise ValueError(f"Expected mode: `waveform` or `mol`, but found {self.mode}") self.n_rnn = n_rnn self.n_aux = n_output // 4 self.hop_length = hop_length self.sample_rate = sample_rate + total_scale = 1 + for upsample_scale in upsample_scales: + total_scale *= upsample_scale + if total_scale != self.hop_length: + raise ValueError(f"Expected: total_scale == hop_length, but found 
{total_scale} != {hop_length}") + self.upsample = _UpsampleNetwork(upsample_scales, n_res_block, n_freq, n_hidden, n_output, kernel_size) self.fc = nn.Linear(n_freq + self.n_aux + 1, n_rnn) @@ -273,22 +272,20 @@ def __init__(self, self.fc2 = nn.Linear(n_fc + self.n_aux, n_fc) self.fc3 = nn.Linear(n_fc, self.n_classes) - def forward(self, x: Tensor, mels: Tensor) -> Tensor: - r""" + def forward(self, waveform: Tensor, specgram: Tensor) -> Tensor: + r"""Pass the input through the _WaveRNN model. Args: - x: the input waveform to the _WaveRNN layer - mels: the input mel-spectrogram to the _WaveRNN layer + waveform: the input waveform to the _WaveRNN layer (n_batch, (n_time - kernel_size + 1) * hop_length) + specgram: the input spectrogram to the _WaveRNN layer (n_batch, n_freq, n_time) - Shape: - - x: :math:`(batch, time)` - - mels: :math:`(batch, freq, time_mels)` - - output: :math:`(batch, time, 2 ** n_bits)` + Return: + Tensor shape: (n_batch, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits) """ - batch_size = x.size(0) - h1 = torch.zeros(1, batch_size, self.n_rnn, dtype=x.dtype, device=x.device) - h2 = torch.zeros(1, batch_size, self.n_rnn, dtype=x.dtype, device=x.device) - mels, aux = self.upsample(mels) + batch_size = waveform.size(0) + h1 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device) + h2 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device) + mels, aux = self.upsample(specgram) aux_idx = [self.n_aux * i for i in range(5)] a1 = aux[:, :, aux_idx[0]:aux_idx[1]] @@ -296,7 +293,7 @@ def forward(self, x: Tensor, mels: Tensor) -> Tensor: a3 = aux[:, :, aux_idx[2]:aux_idx[3]] a4 = aux[:, :, aux_idx[3]:aux_idx[4]] - x = torch.cat([x.unsqueeze(-1), mels, a1], dim=2) + x = torch.cat([waveform.unsqueeze(-1), mels, a1], dim=2) x = self.fc(x) res = x x, _ = self.rnn1(x, h1) From c34316601276212a0fa79739ef24683ef6bfe492 Mon Sep 17 00:00:00 2001 From: Ji Chen Date: Fri, 26 Jun 2020 09:43:37 -0700 Subject: [PATCH 08/17] update format --- test/test_models.py | 4 ++-- torchaudio/models/_wavernn.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/test/test_models.py b/test/test_models.py index 0f4015374b..d4873dadc2 100644 --- a/test/test_models.py +++ b/test/test_models.py @@ -86,7 +86,7 @@ def test_waveform(self): class TestWaveRNN(common_utils.TorchaudioTestCase): def test_waveform(self): - """test the output dimensions of waveform input after _WaveRNN model. + """Validate the output dimensions of a _WaveRNN model with a waveform input. """ upsample_scales = [5, 5, 8] @@ -114,7 +114,7 @@ def test_waveform(self): assert out.size() == (n_batch, hop_length * (n_time - kernel_size + 1), 2 ** n_bits) def test_mol(self): - """test the output dimensions of mol input after _WaveRNN model. + """Validate the output dimensions of a _WaveRNN model with a mol input. """ upsample_scales = [5, 5, 8] diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py index 1b8a90bfaa..1720cd7429 100644 --- a/torchaudio/models/_wavernn.py +++ b/torchaudio/models/_wavernn.py @@ -197,8 +197,8 @@ def forward(self, specgram: Tensor) -> Tensor: class _WaveRNN(nn.Module): r"""WaveRNN model based on "Efficient Neural Audio Synthesis". - The paper link is https://arxiv.org/pdf/1802.08435.pdf. The input signals are waveform - and spectrogram. The input channels of waveform and spectrogram have to be 1. + The paper link is https://arxiv.org/pdf/1802.08435.pdf. The input signals are a waveform + and a spectrogram. 
The input channels of waveform and spectrogram have to be 1. Args: upsample_scales: the list of upsample scales @@ -216,10 +216,10 @@ class _WaveRNN(nn.Module): Examples >>> wavernn = _waveRNN(upsample_scales=[5,5,8], n_bits=9, sample_rate=24000, hop_length=200) - >>> waveform, sample_rate = torchaudio.load(file) # waveform shape: + >>> waveform, sample_rate = torchaudio.load(file) # waveform shape: >>> (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length) - >>> specgram = MelSpectrogram(sample_rate)(waveform) # shape: (n_batch, n_channel, n_freq, n_time) - >>> output = wavernn(waveform.squeeze(1), specgram.squeeze(1)) # shape: + >>> specgram = MelSpectrogram(sample_rate)(waveform) # shape: (n_batch, n_channel, n_freq, n_time) + >>> output = wavernn(waveform.squeeze(1), specgram.squeeze(1)) # shape: >>> (n_batch, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits) """ From 17455a3c74529b0d904d177aa610fe3ab8d255b4 Mon Sep 17 00:00:00 2001 From: Ji Chen Date: Wed, 1 Jul 2020 13:03:14 -0700 Subject: [PATCH 09/17] fix conflicts and add transpose --- test/test_models.py | 4 ++-- torchaudio/models/_wavernn.py | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/test/test_models.py b/test/test_models.py index d4873dadc2..86901eb0ae 100644 --- a/test/test_models.py +++ b/test/test_models.py @@ -86,7 +86,7 @@ def test_waveform(self): class TestWaveRNN(common_utils.TorchaudioTestCase): def test_waveform(self): - """Validate the output dimensions of a _WaveRNN model with a waveform input. + """Validate the output dimensions of a _WaveRNN model in waveform mode. """ upsample_scales = [5, 5, 8] @@ -114,7 +114,7 @@ def test_waveform(self): assert out.size() == (n_batch, hop_length * (n_time - kernel_size + 1), 2 ** n_bits) def test_mol(self): - """Validate the output dimensions of a _WaveRNN model with a mol input. + """Validate the output dimensions of a _WaveRNN model in mol mode. """ upsample_scales = [5, 5, 8] diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py index 1720cd7429..1eeeb391d9 100644 --- a/torchaudio/models/_wavernn.py +++ b/torchaudio/models/_wavernn.py @@ -197,8 +197,8 @@ def forward(self, specgram: Tensor) -> Tensor: class _WaveRNN(nn.Module): r"""WaveRNN model based on "Efficient Neural Audio Synthesis". - The paper link is https://arxiv.org/pdf/1802.08435.pdf. The input signals are a waveform - and a spectrogram. The input channels of waveform and spectrogram have to be 1. + The paper link is https://arxiv.org/pdf/1802.08435.pdf. The input channels of waveform + and spectrogram have to be 1. The product of upsample_scales must equal hop_length. Args: upsample_scales: the list of upsample scales @@ -212,7 +212,7 @@ class _WaveRNN(nn.Module): n_freq: the number of bins in a spectrogram (default=128) n_hidden: the number of hidden dimensions (default=128) n_output: the number of output dimensions (default=128) - mode: the type of input waveform in ['waveform', 'mol'] (default='waveform') + mode: the mode of waveform in ['waveform', 'mol'] (default='waveform') Examples >>> wavernn = _waveRNN(upsample_scales=[5,5,8], n_bits=9, sample_rate=24000, hop_length=200) @@ -274,6 +274,7 @@ def __init__(self, def forward(self, waveform: Tensor, specgram: Tensor) -> Tensor: r"""Pass the input through the _WaveRNN model. 
+
         Args:
             waveform: the input waveform to the _WaveRNN layer (n_batch, (n_time - kernel_size + 1) * hop_length)
             specgram: the input spectrogram to the _WaveRNN layer (n_batch, n_freq, n_time)

         Return:
             Tensor shape: (n_batch, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits)
         """

         batch_size = waveform.size(0)
         h1 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device)
         h2 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device)
         mels, aux = self.upsample(specgram)
+        mels = mels.transpose()
+        aux = aux.transpose()

         aux_idx = [self.n_aux * i for i in range(5)]
         a1 = aux[:, :, aux_idx[0]:aux_idx[1]]

From 2c44bc776657e5ae4d5a3a2341de7b928f5b8940 Mon Sep 17 00:00:00 2001
From: Ji Chen
Date: Wed, 1 Jul 2020 13:25:34 -0700
Subject: [PATCH 10/17] import update

---
 torchaudio/models/_wavernn.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py
index 1eeeb391d9..b4713f7934 100644
--- a/torchaudio/models/_wavernn.py
+++ b/torchaudio/models/_wavernn.py
@@ -1,9 +1,10 @@
 from typing import List

+import torch
 from torch import Tensor
 from torch import nn

-__all__ = ["_ResBlock", "_MelResNet", "_Stretch2d", "_UpsampleNetwork"]
+__all__ = ["_ResBlock", "_MelResNet", "_Stretch2d", "_UpsampleNetwork", "_WaveRNN"]

 class _ResBlock(nn.Module):

From 634bc7fa821aab97fb4b9d03592161856a490bc9 Mon Sep 17 00:00:00 2001
From: Ji Chen
Date: Wed, 1 Jul 2020 13:39:00 -0700
Subject: [PATCH 11/17] update transpose

---
 torchaudio/models/_wavernn.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py
index b4713f7934..a7739d3bdf 100644
--- a/torchaudio/models/_wavernn.py
+++ b/torchaudio/models/_wavernn.py
@@ -288,8 +288,8 @@ def forward(self, waveform: Tensor, specgram: Tensor) -> Tensor:
         h1 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device)
         h2 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device)
         mels, aux = self.upsample(specgram)
-        mels = mels.transpose()
-        aux = aux.transpose()
+        mels = mels.transpose(1, 2)
+        aux = aux.transpose(1, 2)

         aux_idx = [self.n_aux * i for i in range(5)]
         a1 = aux[:, :, aux_idx[0]:aux_idx[1]]
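PATCH 11 above supplies the dimension arguments that PATCH 09's bare `transpose()` calls were missing. A small sketch of the orientation contract being enforced (sizes are hypothetical): `_UpsampleNetwork` hands back `(n_batch, n_freq, n_time')` tensors, while the batch-first GRU stack in `_WaveRNN.forward` expects features in the last dimension.

    import torch
    from torch import nn

    n_batch, n_freq, n_time = 2, 128, 1536
    mels = torch.rand(n_batch, n_freq, n_time)   # (batch, freq, time) from the upsampler
    x = mels.transpose(1, 2)                     # (batch, time, freq) for the RNN
    rnn = nn.GRU(n_freq, 512, batch_first=True)
    out, _ = rnn(x)                              # (batch, time, 512)
    assert out.shape == (n_batch, n_time, 512)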
From 6a2b8a7267438c26c611f0df8326bfe7ab494a37 Mon Sep 17 00:00:00 2001
From: Ji Chen
Date: Sun, 5 Jul 2020 12:14:38 -0700
Subject: [PATCH 12/17] update format

---
 torchaudio/models/_wavernn.py | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py
index a7739d3bdf..bce6f25601 100644
--- a/torchaudio/models/_wavernn.py
+++ b/torchaudio/models/_wavernn.py
@@ -198,8 +198,8 @@ class _WaveRNN(nn.Module):
     r"""WaveRNN model based on "Efficient Neural Audio Synthesis".

-    The paper link is https://arxiv.org/pdf/1802.08435.pdf. The input channels of waveform
-    and spectrogram have to be 1. The product of upsample_scales must equal hop_length.
+    The paper link is `<https://arxiv.org/pdf/1802.08435.pdf>`_. The input channels of waveform
+    and spectrogram have to be 1. The product of `upsample_scales` must equal `hop_length`.

     Args:
         upsample_scales: the list of upsample scales
@@ -217,11 +217,12 @@ class _WaveRNN(nn.Module):

     Examples
         >>> wavernn = _waveRNN(upsample_scales=[5,5,8], n_bits=9, sample_rate=24000, hop_length=200)
-        >>> waveform, sample_rate = torchaudio.load(file) # waveform shape:
-        >>> (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length)
+        >>> waveform, sample_rate = torchaudio.load(file)
+        >>> # waveform shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length)
         >>> specgram = MelSpectrogram(sample_rate)(waveform) # shape: (n_batch, n_channel, n_freq, n_time)
-        >>> output = wavernn(waveform.squeeze(1), specgram.squeeze(1)) # shape:
-        >>> (n_batch, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits)
+        >>> output = wavernn(waveform.squeeze(1), specgram.squeeze(1))
+        >>> # output shape in 'waveform' mode: (n_batch, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits)
+        >>> # output shape in 'mol' mode: (n_batch, (n_time - kernel_size + 1) * hop_length, 30)
     """
@@ -288,8 +288,11 @@ def forward(self, waveform: Tensor, specgram: Tensor) -> Tensor:
         batch_size = waveform.size(0)
         h1 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device)
         h2 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device)
-        mels, aux = self.upsample(specgram)
-        mels = mels.transpose(1, 2)
-        aux = aux.transpose(1, 2)
+        # output of upsample:
+        # specgram: (n_batch, n_freq, (n_time - kernel_size + 1) * total_scale)
+        # aux: (n_batch, n_output, (n_time - kernel_size + 1) * total_scale)
+        specgram, aux = self.upsample(specgram)
+        specgram = specgram.transpose(1, 2)
+        aux = aux.transpose(1, 2)

         aux_idx = [self.n_aux * i for i in range(5)]
         a1 = aux[:, :, aux_idx[0]:aux_idx[1]]
         a2 = aux[:, :, aux_idx[1]:aux_idx[2]]
         a3 = aux[:, :, aux_idx[2]:aux_idx[3]]
         a4 = aux[:, :, aux_idx[3]:aux_idx[4]]

-        x = torch.cat([waveform.unsqueeze(-1), mels, a1], dim=2)
+        x = torch.cat([waveform.unsqueeze(-1), specgram, a1], dim=-1)
         x = self.fc(x)
         res = x
         x, _ = self.rnn1(x, h1)

         x = x + res
         res = x
-        x = torch.cat([x, a2], dim=2)
+        x = torch.cat([x, a2], dim=-1)
         x, _ = self.rnn2(x, h2)

         x = x + res
-        x = torch.cat([x, a3], dim=2)
-        x = self.relu1(self.fc1(x))
+        x = torch.cat([x, a3], dim=-1)
+        x = self.fc1(x)
+        x = self.relu1(x)

-        x = torch.cat([x, a4], dim=2)
-        x = self.relu2(self.fc2(x))
+        x = torch.cat([x, a4], dim=-1)
+        x = self.fc2(x)
+        x = self.relu2(x)

         return self.fc3(x)
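The residual network's output that `forward` slices above is split into four equal auxiliary bands, one per conditioning stage (`a1` joins the input projection, `a2` feeds the second GRU, `a3` and `a4` feed the two fully connected layers). A standalone sketch of the slicing, using this file's default sizes:

    import torch

    # n_output = 128 gives n_aux = 128 // 4 = 32 features per band.
    n_batch, n_time, n_output = 2, 1536, 128
    n_aux = n_output // 4
    aux = torch.rand(n_batch, n_time, n_output)
    aux_idx = [n_aux * i for i in range(5)]
    a1, a2, a3, a4 = (aux[:, :, aux_idx[i]:aux_idx[i + 1]] for i in range(4))
    assert a1.shape == (n_batch, n_time, n_aux)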
Args: upsample_scales: the list of upsample scales @@ -215,14 +217,13 @@ class _WaveRNN(nn.Module): n_output: the number of output dimensions (default=128) mode: the mode of waveform in ['waveform', 'mol'] (default='waveform') - Examples + Example >>> wavernn = _waveRNN(upsample_scales=[5,5,8], n_bits=9, sample_rate=24000, hop_length=200) >>> waveform, sample_rate = torchaudio.load(file) >>> # waveform shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length) >>> specgram = MelSpectrogram(sample_rate)(waveform) # shape: (n_batch, n_channel, n_freq, n_time) >>> output = wavernn(waveform.squeeze(1), specgram.squeeze(1)) >>> # output shape in 'waveform' mode: (n_batch, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits) - >>> # output shape in 'mol' mode: (n_batch, (n_time - kernel_size + 1) * hop_length, 30) """ def __init__(self, From 978e10178b09ef6778d47ac465e6a918495fc561 Mon Sep 17 00:00:00 2001 From: Ji Chen Date: Tue, 7 Jul 2020 09:53:34 -0700 Subject: [PATCH 14/17] add n_channel in input --- test/test_models.py | 12 ++++++------ torchaudio/models/_wavernn.py | 15 ++++++++++----- 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/test/test_models.py b/test/test_models.py index 86901eb0ae..c54a57cebd 100644 --- a/test/test_models.py +++ b/test/test_models.py @@ -107,11 +107,11 @@ def test_waveform(self): model = _WaveRNN(upsample_scales, n_bits, sample_rate, hop_length, n_res_block, n_rnn, n_fc, kernel_size, n_freq, n_hidden, n_output, mode) - x = torch.rand(n_batch, hop_length * (n_time - kernel_size + 1)) - mels = torch.rand(n_batch, n_freq, n_time) + x = torch.rand(n_batch, 1, hop_length * (n_time - kernel_size + 1)) + mels = torch.rand(n_batch, 1, n_freq, n_time) out = model(x, mels) - assert out.size() == (n_batch, hop_length * (n_time - kernel_size + 1), 2 ** n_bits) + assert out.size() == (n_batch, 1, hop_length * (n_time - kernel_size + 1), 2 ** n_bits) def test_mol(self): """Validate the output dimensions of a _WaveRNN model in mol mode. @@ -135,8 +135,8 @@ def test_mol(self): model = _WaveRNN(upsample_scales, n_bits, sample_rate, hop_length, n_res_block, n_rnn, n_fc, kernel_size, n_freq, n_hidden, n_output, mode) - x = torch.rand(n_batch, hop_length * (n_time - kernel_size + 1)) - mels = torch.rand(n_batch, n_freq, n_time) + x = torch.rand(n_batch, 1, hop_length * (n_time - kernel_size + 1)) + mels = torch.rand(n_batch, 1, n_freq, n_time) out = model(x, mels) - assert out.size() == (n_batch, hop_length * (n_time - kernel_size + 1), 30) + assert out.size() == (n_batch, 1, hop_length * (n_time - kernel_size + 1), 30) diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py index c21888dcbc..7b6b7b0395 100644 --- a/torchaudio/models/_wavernn.py +++ b/torchaudio/models/_wavernn.py @@ -222,7 +222,7 @@ class _WaveRNN(nn.Module): >>> waveform, sample_rate = torchaudio.load(file) >>> # waveform shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length) >>> specgram = MelSpectrogram(sample_rate)(waveform) # shape: (n_batch, n_channel, n_freq, n_time) - >>> output = wavernn(waveform.squeeze(1), specgram.squeeze(1)) + >>> output = wavernn(waveform, specgram) >>> # output shape in 'waveform' mode: (n_batch, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits) """ @@ -279,13 +279,17 @@ def forward(self, waveform: Tensor, specgram: Tensor) -> Tensor: r"""Pass the input through the _WaveRNN model. 
From 978e10178b09ef6778d47ac465e6a918495fc561 Mon Sep 17 00:00:00 2001
From: Ji Chen
Date: Tue, 7 Jul 2020 09:53:34 -0700
Subject: [PATCH 14/17] add n_channel in input

---
 test/test_models.py           | 12 ++++++------
 torchaudio/models/_wavernn.py | 15 ++++++++++-----
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/test/test_models.py b/test/test_models.py
index 86901eb0ae..c54a57cebd 100644
--- a/test/test_models.py
+++ b/test/test_models.py
@@ -107,11 +107,11 @@ def test_waveform(self):
         model = _WaveRNN(upsample_scales, n_bits, sample_rate, hop_length, n_res_block,
                          n_rnn, n_fc, kernel_size, n_freq, n_hidden, n_output, mode)

-        x = torch.rand(n_batch, hop_length * (n_time - kernel_size + 1))
-        mels = torch.rand(n_batch, n_freq, n_time)
+        x = torch.rand(n_batch, 1, hop_length * (n_time - kernel_size + 1))
+        mels = torch.rand(n_batch, 1, n_freq, n_time)
         out = model(x, mels)

-        assert out.size() == (n_batch, hop_length * (n_time - kernel_size + 1), 2 ** n_bits)
+        assert out.size() == (n_batch, 1, hop_length * (n_time - kernel_size + 1), 2 ** n_bits)

     def test_mol(self):
@@ -135,8 +135,8 @@ def test_mol(self):
-        x = torch.rand(n_batch, hop_length * (n_time - kernel_size + 1))
-        mels = torch.rand(n_batch, n_freq, n_time)
+        x = torch.rand(n_batch, 1, hop_length * (n_time - kernel_size + 1))
+        mels = torch.rand(n_batch, 1, n_freq, n_time)
         out = model(x, mels)

-        assert out.size() == (n_batch, hop_length * (n_time - kernel_size + 1), 30)
+        assert out.size() == (n_batch, 1, hop_length * (n_time - kernel_size + 1), 30)
diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py
index c21888dcbc..7b6b7b0395 100644
--- a/torchaudio/models/_wavernn.py
+++ b/torchaudio/models/_wavernn.py
@@ -222,7 +222,7 @@ class _WaveRNN(nn.Module):
         >>> waveform, sample_rate = torchaudio.load(file)
         >>> # waveform shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length)
         >>> specgram = MelSpectrogram(sample_rate)(waveform) # shape: (n_batch, n_channel, n_freq, n_time)
-        >>> output = wavernn(waveform.squeeze(1), specgram.squeeze(1))
+        >>> output = wavernn(waveform, specgram)
         >>> # output shape in 'waveform' mode: (n_batch, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits)
@@ -279,13 +279,17 @@ def forward(self, waveform: Tensor, specgram: Tensor) -> Tensor:
         r"""Pass the input through the _WaveRNN model.

         Args:
-            waveform: the input waveform to the _WaveRNN layer (n_batch, (n_time - kernel_size + 1) * hop_length)
-            specgram: the input spectrogram to the _WaveRNN layer (n_batch, n_freq, n_time)
+            waveform: the input waveform to the _WaveRNN layer (n_batch, 1, (n_time - kernel_size + 1) * hop_length)
+            specgram: the input spectrogram to the _WaveRNN layer (n_batch, 1, n_freq, n_time)

         Return:
-            Tensor shape: (n_batch, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits)
+            Tensor shape: (n_batch, 1, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits)
         """

+        assert waveform.size(1) == 1, 'Require the input channel of waveform to be 1'
+        assert specgram.size(1) == 1, 'Require the input channel of specgram to be 1'
+        waveform, specgram = waveform.squeeze(1), specgram.squeeze(1)
+
         batch_size = waveform.size(0)
         h1 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device)
         h2 = torch.zeros(1, batch_size, self.n_rnn, dtype=waveform.dtype, device=waveform.device)
@@ -324,5 +328,6 @@ def forward(self, waveform: Tensor, specgram: Tensor) -> Tensor:
         x = torch.cat([x, a4], dim=-1)
         x = self.fc2(x)
         x = self.relu2(x)
+        x = self.fc3(x).unsqueeze(1)

-        return self.fc3(x)
+        return x

From 01fbbdaa20e6dc0fb0f9d962396f99ed83e40b56 Mon Sep 17 00:00:00 2001
From: Ji Chen
Date: Tue, 7 Jul 2020 14:32:19 -0700
Subject: [PATCH 15/17] add comment

---
 torchaudio/models/_wavernn.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py
index 7b6b7b0395..de39f1a83f 100644
--- a/torchaudio/models/_wavernn.py
+++ b/torchaudio/models/_wavernn.py
@@ -288,6 +288,7 @@ def forward(self, waveform: Tensor, specgram: Tensor) -> Tensor:

         assert waveform.size(1) == 1, 'Require the input channel of waveform to be 1'
         assert specgram.size(1) == 1, 'Require the input channel of specgram to be 1'
+        # remove channel dimension until the end
         waveform, specgram = waveform.squeeze(1), specgram.squeeze(1)

         batch_size = waveform.size(0)
@@ -324,6 +325,7 @@ def forward(self, waveform: Tensor, specgram: Tensor) -> Tensor:
         x = torch.cat([x, a4], dim=-1)
         x = self.fc2(x)
         x = self.relu2(x)
-        x = self.fc3(x).unsqueeze(1)
+        x = self.fc3(x)

-        return x
+        # bring back channel dimension
+        return x.unsqueeze(1)

From 0ed6da8990ad472e91ebc73a5d56da0863b71415 Mon Sep 17 00:00:00 2001
From: Ji Chen
Date: Tue, 7 Jul 2020 15:55:26 -0700
Subject: [PATCH 16/17] update docstring

---
 torchaudio/models/_wavernn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py
index de39f1a83f..7ddff8376a 100644
--- a/torchaudio/models/_wavernn.py
+++ b/torchaudio/models/_wavernn.py
@@ -223,7 +223,7 @@ class _WaveRNN(nn.Module):
         >>> # waveform shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length)
         >>> specgram = MelSpectrogram(sample_rate)(waveform) # shape: (n_batch, n_channel, n_freq, n_time)
         >>> output = wavernn(waveform, specgram)
-        >>> # output shape in 'waveform' mode: (n_batch, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits)
+        >>> # output shape in 'waveform' mode: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits)

From 1cb02fd6716f2de9d6ca7420453dce8a422fe5b0 Mon Sep 17 00:00:00 2001
From: Ji Chen
Date: Tue, 7 Jul 2020 16:01:53 -0700
Subject: [PATCH 17/17] update docstring

---
 torchaudio/models/_wavernn.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchaudio/models/_wavernn.py b/torchaudio/models/_wavernn.py
index 
7ddff8376a..cd2e89a10c 100644 --- a/torchaudio/models/_wavernn.py +++ b/torchaudio/models/_wavernn.py @@ -223,7 +223,7 @@ class _WaveRNN(nn.Module): >>> # waveform shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length) >>> specgram = MelSpectrogram(sample_rate)(waveform) # shape: (n_batch, n_channel, n_freq, n_time) >>> output = wavernn(waveform, specgram) - >>> # output shape in 'waveform' mode: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits) + >>> # output shape: (n_batch, n_channel, (n_time - kernel_size + 1) * hop_length, 2 ** n_bits) """ def __init__(self,
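The end state of the series in one self-contained smoke test, mirroring the updated `test_waveform` case from PATCH 14 (shapes follow the final docstring; `mode='waveform'` is the default):

    import torch
    from torchaudio.models import _WaveRNN

    model = _WaveRNN(upsample_scales=[5, 5, 8], n_bits=9, sample_rate=24000,
                     hop_length=200, n_freq=100, n_output=256)
    n_batch, n_freq, n_time, kernel_size, hop_length = 2, 100, 200, 5, 200
    # Both inputs carry a channel dimension of 1, enforced by the asserts added
    # in PATCH 14; the output gets the channel dimension back at the end.
    waveform = torch.rand(n_batch, 1, hop_length * (n_time - kernel_size + 1))
    specgram = torch.rand(n_batch, 1, n_freq, n_time)
    out = model(waveform, specgram)
    assert out.size() == (n_batch, 1, hop_length * (n_time - kernel_size + 1), 2 ** 9)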