|
5 | 5 |
|
6 | 6 |
|
class _ResBlock(nn.Module):
    r"""ResNet block based on "Deep Residual Learning for Image Recognition".

    The paper link is https://arxiv.org/pdf/1512.03385.pdf.

    The block runs ``Conv1d -> BatchNorm1d -> ReLU -> Conv1d -> BatchNorm1d``
    and adds the block input back onto the result (skip connection). Both
    convolutions use ``kernel_size=1`` and keep the channel count, so the
    output has the same shape as the input.

    Args:
        n_freq: the number of bins in a spectrogram; used as the channel
            count of every layer in the block (default=128)

    Examples::
        >>> resblock = _ResBlock()
        >>> input = torch.rand(10, 128, 512)  # a random spectrogram
        >>> output = resblock(input)  # shape: (10, 128, 512)
    """

    def __init__(self, n_freq: int = 128) -> None:
        super().__init__()

        self.resblock_model = nn.Sequential(
            nn.Conv1d(in_channels=n_freq, out_channels=n_freq, kernel_size=1, bias=False),
            nn.BatchNorm1d(n_freq),
            # In-place ReLU overwrites the BatchNorm output, not the block input,
            # so ``specgram`` is still intact for the skip connection below.
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=n_freq, out_channels=n_freq, kernel_size=1, bias=False),
            nn.BatchNorm1d(n_freq),
        )

    def forward(self, specgram: Tensor) -> Tensor:
        r"""Pass the input through the _ResBlock layer.

        Args:
            specgram (Tensor): the input sequence to the _ResBlock layer (n_batch, n_freq, n_time).

        Returns:
            Tensor: the transformed input plus the input itself, shape (n_batch, n_freq, n_time).
        """

        return self.resblock_model(specgram) + specgram
49 | 42 |
|
50 | 43 |
|
class _MelResNet(nn.Module):
    r"""MelResNet layer uses a stack of ResBlocks on spectrogram.

    The layer runs ``Conv1d -> BatchNorm1d -> ReLU``, then ``n_res_block``
    residual blocks, then a final ``Conv1d`` with ``kernel_size=1``. Only the
    first convolution uses ``kernel_size`` and it is built without padding
    arguments, so the time axis shrinks by ``kernel_size - 1``.

    Args:
        n_res_block: the number of ResBlock in stack (default=10)
        n_freq: the number of bins in a spectrogram (default=128)
        n_hidden: the number of hidden dimensions (default=128)
        n_output: the number of output dimensions (default=128)
        kernel_size: the kernel size of the first Conv1d layer (default=5)

    Examples::
        >>> melresnet = _MelResNet()
        >>> input = torch.rand(10, 128, 512)  # a random spectrogram
        >>> output = melresnet(input)  # shape: (10, 128, 508)
    """

    def __init__(self,
                 n_res_block: int = 10,
                 n_freq: int = 128,
                 n_hidden: int = 128,
                 n_output: int = 128,
                 kernel_size: int = 5) -> None:
        super().__init__()

        # Every residual block works on the hidden width, not on n_freq.
        res_blocks = [_ResBlock(n_hidden) for _ in range(n_res_block)]

        self.melresnet_model = nn.Sequential(
            nn.Conv1d(in_channels=n_freq, out_channels=n_hidden, kernel_size=kernel_size, bias=False),
            nn.BatchNorm1d(n_hidden),
            nn.ReLU(inplace=True),
            *res_blocks,
            nn.Conv1d(in_channels=n_hidden, out_channels=n_output, kernel_size=1),
        )

    def forward(self, specgram: Tensor) -> Tensor:
        r"""Pass the input through the _MelResNet layer.

        Args:
            specgram (Tensor): the input sequence to the _MelResNet layer (n_batch, n_freq, n_time).

        Returns:
            Tensor: shape (n_batch, n_output, n_time - kernel_size + 1)
        """

        return self.melresnet_model(specgram)
0 commit comments