From 95a810f6547af8fd9143c48bfd7e7ef98d381fa9 Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Wed, 3 Jul 2019 09:35:18 -0700 Subject: [PATCH 01/27] first --- torchaudio/functional.py | 99 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 99 insertions(+) diff --git a/torchaudio/functional.py b/torchaudio/functional.py index 95f583b4a8..3539dcd271 100644 --- a/torchaudio/functional.py +++ b/torchaudio/functional.py @@ -7,6 +7,7 @@ 'pad_trim', 'downmix_mono', 'LC2CL', + 'istft', 'spectrogram', 'create_fb_matrix', 'spectrogram_to_DB', @@ -105,6 +106,104 @@ def _stft(input, n_fft, hop_length, win_length, window, center, pad_mode, normal return torch.stft(input, n_fft, hop_length, win_length, window, center, pad_mode, normalized, onesided) +def istft(stft_matrix, n_fft, hop_length, win_length, window, center, pad_mode, normalized, onesided): + # type: (Tensor, int, Optional[int], Optional[int], Optional[Tensor], bool, str, bool, bool) -> Tensor + r""" Inverse short time Fourier Transform. This is expected to be the inverse of torch.stft. + It has the same parameters and it should return the least squares estimation of the original signal. + + [1] D. W. Griffin and J. S. Lim, “Signal estimation from modified short-time Fourier transform,” + IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984. + + Inputs: + stft_matrix (Tensor): output of stft where each row of a batch is a frequency and each column is + a window. it has a shape of (batch, fft_size, n_frames, 2) + n_fft (int): size of Fourier transform + hop_length (Optional[int]): the distance between neighboring sliding window frames + win_length (Optional[int]): the size of window frame and STFT filter + window (Optional[Tensor]): the optional window function + center (bool): whether :attr:`input` was padded on both sides so + that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}` + pad_mode (str): controls the padding method used when :attr:`center` is ``True`` + normalized (bool): whether the STFT was normalized + onesided (bool): whether the STFT is onesided + + Outputs: + Tensor: least squares estimation of the original signal of size (batch, signal_length) + """ + device = stft_matrix.device + fft_size = stft_matrix.size(1) + assert (onesided and n_fft // 2 + 1 == fft_size) or (not onesided and n_fft == fft_size) + + # use stft defaults for Optionals + if win_length is None: + win_length = n_fft + + if hop_length is None: + hop_length = int(win_length // 4) + + # There must be overlap + assert 0 < hop_length <= win_length + assert 0 < win_length <= n_fft + + if window is None: + window = torch.ones(win_length) + + assert window.dim() == 1 and window.size(0) == win_length + + if win_length != n_fft: + # center window with pad left and right zeros + left = (n_fft - win_length) // 2 + window = torch.nn.pad(window, (left, n_fft - window_length - left)) + assert window.size(0) == n_fft + # win_length and n_fft are synonymous from here on + + # size (batch, n_frames, fft_size, 2) + stft_matrix = stft_matrix.transpose(1, 2) + # size (batch, n_frames, n_fft) + stft_matrix = torch.irfft(stft_matrix, 1, normalized, onesided, signal_sizes=(n_fft,)) + + assert stft_matrix.size(2) == n_fft + n_frames = stft_matrix.size(1) + + # size (batch, n_frames, n_fft) + ytmp = stft_matrix * window.view(1, 1, n_fft) + # each column of a batch is a frame which needs to be overlap added at the right place + ytmp = ytmp.transpose(1, 2) # size (batch, n_fft, n_frames) + + # size (n_fft, 1, n_fft) + eye = torch.eye(n_fft, requires_grad=False, device=device).unsqueeze(1) + + # this does overlap add where the frames of ytmp are added such that the i'th frame of + # ytmp is added starting at i*hop_length in the output + # size (batch, 1, expected_signal_len) + y = torch.nn.functional.conv_transpose1d(ytmp, eye, stride=hop_length, padding=0) + + # do the same for the window function + # size (1, n_fft, n_frames) + window_sq = window.pow(2).view(n_fft, 1).repeat((1, n_frames)).unsqueeze(0) + # size (1, 1, expected_signal_len) + window_envelop = torch.nn.functional.conv_transpose1d(window_sq, eye, stride=hop_length, padding=0) + + expected_signal_len = n_fft + hop_length * (n_frames - 1) + assert y.size(2) == expected_signal_len + assert window_envelop.size(2) == expected_signal_len + + if center: + # we need to trim the padding away + # since n_frames = 1 + (len + n_fft// 2 + n_fft//2 - n_fft) / hop_length + # we get expected_signal_len -= 2 * (n_fft // 2) + # and since the signal starts at (n_fft // 2) then the end must be -(n_fft // 2) + half_n_fft = n_fft // 2 + y = y[half_n_fft:-half_n_fft] + window_envelop = window_envelop[:, :, half_n_fft:-half_n_fft] + + # check NOLA non-zero overlap condition + assert window_envelop.min() > 1e-11, ('window overlap add min: %f' % (window_envelop.min())) + + # size (batch, expected_signal_len) + return (y / window_envelop).unsqueeze(1) + + @torch.jit.script def spectrogram(sig, pad, window, n_fft, hop, ws, power, normalize): # type: (Tensor, int, Tensor, int, int, int, int, bool) -> Tensor From 0a8386c958750c7734f0badb2d2a0b2e23d43813 Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Wed, 3 Jul 2019 11:01:22 -0700 Subject: [PATCH 02/27] add tests --- torchaudio/functional.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/torchaudio/functional.py b/torchaudio/functional.py index 3539dcd271..e97c9c1ee7 100644 --- a/torchaudio/functional.py +++ b/torchaudio/functional.py @@ -110,6 +110,7 @@ def istft(stft_matrix, n_fft, hop_length, win_length, window, center, pad_mode, # type: (Tensor, int, Optional[int], Optional[int], Optional[Tensor], bool, str, bool, bool) -> Tensor r""" Inverse short time Fourier Transform. This is expected to be the inverse of torch.stft. It has the same parameters and it should return the least squares estimation of the original signal. + The algorithm will check using the NOLA condition. [1] D. W. Griffin and J. S. Lim, “Signal estimation from modified short-time Fourier transform,” IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984. @@ -194,14 +195,14 @@ def istft(stft_matrix, n_fft, hop_length, win_length, window, center, pad_mode, # we get expected_signal_len -= 2 * (n_fft // 2) # and since the signal starts at (n_fft // 2) then the end must be -(n_fft // 2) half_n_fft = n_fft // 2 - y = y[half_n_fft:-half_n_fft] + y = y[:, :, half_n_fft:-half_n_fft] window_envelop = window_envelop[:, :, half_n_fft:-half_n_fft] # check NOLA non-zero overlap condition assert window_envelop.min() > 1e-11, ('window overlap add min: %f' % (window_envelop.min())) # size (batch, expected_signal_len) - return (y / window_envelop).unsqueeze(1) + return (y / window_envelop).squeeze(1) @torch.jit.script From 5f48c98dadf24bd0f6d669f3febf91a1098fdd71 Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Wed, 3 Jul 2019 14:12:27 -0700 Subject: [PATCH 03/27] more test --- test/test_functional.py | 99 ++++++++++++++++++++++++++++++++++++++++ torchaudio/functional.py | 56 +++++++++++++++++------ 2 files changed, 142 insertions(+), 13 deletions(-) create mode 100644 test/test_functional.py diff --git a/test/test_functional.py b/test/test_functional.py new file mode 100644 index 0000000000..eaf268d7b7 --- /dev/null +++ b/test/test_functional.py @@ -0,0 +1,99 @@ +import os + +import torch +import torchaudio +import unittest +import test.common_utils + + +class TestFunctional(unittest.TestCase): + # size (2,20) + test_data = torch.tensor([ + [45.4243, 81.9316, 19.1100, 32.4998, 45.3313, 68.8204, 42.0782, 19.7222, + 76.8721, 69.9104, 27.7188, 86.3579, 30.3251, 92.0308, 70.0568, 74.8940, + 94.3127, 82.9875, 88.8303, 96.3460], + [59.4262, 91.0040, 74.7672, 79.8533, 46.7943, 13.6757, 85.5145, 33.0060, + 88.5102, 25.6912, 57.9501, 33.3326, 71.5654, 90.0321, 81.8218, 91.6907, + 87.9834, 16.4177, 62.4474, 0.2146] + ]).float() + + def _test_istft_helper(self, sound, kwargs): + stft = torch.stft(sound, **kwargs) + estimate = torchaudio.functional.istft(stft, length=sound.size(1), **kwargs) + + # trim sound for case when constructed signal is shorter than original + # print(sound) + # print(estimate) + sound = sound[:, :estimate.size(1)] + + self.assertTrue(sound.shape == estimate.shape, (sound.shape, estimate.shape)) + + # print((sound-estimate)) + # print((sound-estimate).abs().max()) + self.assertTrue(torch.allclose(sound, estimate, atol=1e-4)) + + def test_istft(self): + kwargs1 = { + 'n_fft': 12, + 'hop_length': 4, + 'win_length': 12, + 'window': torch.hann_window(12), + 'center': True, + 'pad_mode': 'reflect', + 'normalized': True, + 'onesided': True, + } + + kwargs2 = { + 'n_fft': 12, + 'hop_length': 2, + 'win_length': 8, + 'window': torch.hann_window(8), + 'center': True, + 'pad_mode': 'reflect', + 'normalized': False, + 'onesided': False, + } + + kwargs3 = { + 'n_fft': 15, + 'hop_length': 3, + 'win_length': 11, + 'window': torch.hamming_window(11), + 'center': True, + 'pad_mode': 'constant', + 'normalized': True, + 'onesided': False, + } + + kwargs4 = { + 'n_fft': 5, + 'hop_length': 2, + 'win_length': 5, + 'window': torch.hamming_window(5), + 'center': False, + 'pad_mode': 'constant', + 'normalized': False, + 'onesided': True, + } + + kwargs5 = { + 'n_fft': 3, + 'hop_length': 2, + 'win_length': 3, + 'window': torch.hamming_window(3), + 'center': False, + 'pad_mode': 'reflect', + 'normalized': False, + 'onesided': False, + } + + self._test_istft_helper(self.test_data, kwargs1) + self._test_istft_helper(self.test_data, kwargs2) + self._test_istft_helper(self.test_data, kwargs3) + self._test_istft_helper(self.test_data, kwargs4) + self._test_istft_helper(self.test_data, kwargs5) + + +if __name__ == '__main__': + unittest.main() diff --git a/torchaudio/functional.py b/torchaudio/functional.py index e97c9c1ee7..ed1941b743 100644 --- a/torchaudio/functional.py +++ b/torchaudio/functional.py @@ -106,11 +106,41 @@ def _stft(input, n_fft, hop_length, win_length, window, center, pad_mode, normal return torch.stft(input, n_fft, hop_length, win_length, window, center, pad_mode, normalized, onesided) -def istft(stft_matrix, n_fft, hop_length, win_length, window, center, pad_mode, normalized, onesided): - # type: (Tensor, int, Optional[int], Optional[int], Optional[Tensor], bool, str, bool, bool) -> Tensor +def istft(stft_matrix, # type: Tensor + n_fft, # type: int + hop_length, # type: Optional[int] + win_length, # type: Optional[int] + window, # type: Optional[Tensor] + center, # type: bool + pad_mode, # type: str + normalized, # type: bool + onesided, # type: bool + length # type: Optional[int] + ): + # type: (...) -> Tensor r""" Inverse short time Fourier Transform. This is expected to be the inverse of torch.stft. - It has the same parameters and it should return the least squares estimation of the original signal. - The algorithm will check using the NOLA condition. + It has the same parameters (+ additional optional parameter of :attr:`length`) and it should return the + least squares estimation of the original signal. The algorithm will check using the NOLA condition ( + nonzero overlap). + + Important consideration in the parameters :attr:`window` and :attr:`center` so that the envelop + created by the summation of all the windows is never zero at certain point in time. Specifically, + :math:`\sum_{t=-\ infty}^{\ infty} w^2[n-t\times hop\_length] \neq 0`. + + Since stft discards elements at the end of the signal if they do not fit in a frame, the + istft may return a shorter signal than the original signal (can occur if :attr:`center` is False + since the signal isn't padded). + + If :attr:`center` is True, then there will be padding e.g. 'constant', 'reflect', etc. Left padding + can be trimmed off exactly because they can be calculated but right padding cannot be calculated + without additional information. + + Example: Suppose the last window is: + [17, 18, 0, 0, 0] vs [18, 0, 0, 0, 0] + The n_frames, hop_length, win_length are all the same which prevents the calculation of right padding. + + These additional values could be zeros or a reflection of the signal so providing :attr:`length` + could be useful. If :attr:`length` is None then padding will be aggressively removed (some loss of signal). [1] D. W. Griffin and J. S. Lim, “Signal estimation from modified short-time Fourier transform,” IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984. @@ -127,6 +157,7 @@ def istft(stft_matrix, n_fft, hop_length, win_length, window, center, pad_mode, pad_mode (str): controls the padding method used when :attr:`center` is ``True`` normalized (bool): whether the STFT was normalized onesided (bool): whether the STFT is onesided + length (Optional[int]): the amount to trim the signal by (i.e. the original signal length) Outputs: Tensor: least squares estimation of the original signal of size (batch, signal_length) @@ -154,7 +185,7 @@ def istft(stft_matrix, n_fft, hop_length, win_length, window, center, pad_mode, if win_length != n_fft: # center window with pad left and right zeros left = (n_fft - win_length) // 2 - window = torch.nn.pad(window, (left, n_fft - window_length - left)) + window = torch.nn.functional.pad(window, (left, n_fft - win_length - left)) assert window.size(0) == n_fft # win_length and n_fft are synonymous from here on @@ -189,14 +220,13 @@ def istft(stft_matrix, n_fft, hop_length, win_length, window, center, pad_mode, assert y.size(2) == expected_signal_len assert window_envelop.size(2) == expected_signal_len - if center: - # we need to trim the padding away - # since n_frames = 1 + (len + n_fft// 2 + n_fft//2 - n_fft) / hop_length - # we get expected_signal_len -= 2 * (n_fft // 2) - # and since the signal starts at (n_fft // 2) then the end must be -(n_fft // 2) - half_n_fft = n_fft // 2 - y = y[:, :, half_n_fft:-half_n_fft] - window_envelop = window_envelop[:, :, half_n_fft:-half_n_fft] + half_n_fft = n_fft // 2 + # we need to trim the front padding away if center + start = half_n_fft if center else 0 + end = -half_n_fft if length is None else start + length + + y = y[:, :, start:end] + window_envelop = window_envelop[:, :, start:end] # check NOLA non-zero overlap condition assert window_envelop.min() > 1e-11, ('window overlap add min: %f' % (window_envelop.min())) From 1d2cbed0295b746aee188510a2d096cb8bdd647d Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Wed, 3 Jul 2019 14:14:11 -0700 Subject: [PATCH 04/27] remove print --- test/test_functional.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/test_functional.py b/test/test_functional.py index eaf268d7b7..7e59371dba 100644 --- a/test/test_functional.py +++ b/test/test_functional.py @@ -22,14 +22,9 @@ def _test_istft_helper(self, sound, kwargs): estimate = torchaudio.functional.istft(stft, length=sound.size(1), **kwargs) # trim sound for case when constructed signal is shorter than original - # print(sound) - # print(estimate) sound = sound[:, :estimate.size(1)] self.assertTrue(sound.shape == estimate.shape, (sound.shape, estimate.shape)) - - # print((sound-estimate)) - # print((sound-estimate).abs().max()) self.assertTrue(torch.allclose(sound, estimate, atol=1e-4)) def test_istft(self): From 6107f65ea6c72384ef5ca667f470ca7573b55ff6 Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Wed, 3 Jul 2019 14:24:36 -0700 Subject: [PATCH 05/27] abs min instead of min --- torchaudio/functional.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/torchaudio/functional.py b/torchaudio/functional.py index ed1941b743..7269abc8e4 100644 --- a/torchaudio/functional.py +++ b/torchaudio/functional.py @@ -229,7 +229,8 @@ def istft(stft_matrix, # type: Tensor window_envelop = window_envelop[:, :, start:end] # check NOLA non-zero overlap condition - assert window_envelop.min() > 1e-11, ('window overlap add min: %f' % (window_envelop.min())) + window_envelop_lowest = window_envelop.abs().min() + assert window_envelop_lowest > 1e-11, ('window overlap add min: %f' % (window_envelop_lowest)) # size (batch, expected_signal_len) return (y / window_envelop).squeeze(1) From 050ae2329002e5fcaa4b24fb705bc6524f36de97 Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Fri, 5 Jul 2019 05:57:55 -0700 Subject: [PATCH 06/27] apply feedback --- torchaudio/functional.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/torchaudio/functional.py b/torchaudio/functional.py index 7269abc8e4..1dcda3b71f 100644 --- a/torchaudio/functional.py +++ b/torchaudio/functional.py @@ -149,15 +149,16 @@ def istft(stft_matrix, # type: Tensor stft_matrix (Tensor): output of stft where each row of a batch is a frequency and each column is a window. it has a shape of (batch, fft_size, n_frames, 2) n_fft (int): size of Fourier transform - hop_length (Optional[int]): the distance between neighboring sliding window frames - win_length (Optional[int]): the size of window frame and STFT filter - window (Optional[Tensor]): the optional window function + hop_length (Optional[int]): the distance between neighboring sliding window frames. (Default: win_length // 4) + win_length (Optional[int]): the size of window frame and STFT filter. (Default: n_fft) + window (Optional[Tensor]): the optional window function. (Default: torch.ones(win_length)) center (bool): whether :attr:`input` was padded on both sides so that the :math:`t`-th frame is centered at time :math:`t \times \text{hop\_length}` pad_mode (str): controls the padding method used when :attr:`center` is ``True`` normalized (bool): whether the STFT was normalized onesided (bool): whether the STFT is onesided - length (Optional[int]): the amount to trim the signal by (i.e. the original signal length) + length (Optional[int]): the amount to trim the signal by (i.e. the + original signal length). (Default: whole signal) Outputs: Tensor: least squares estimation of the original signal of size (batch, signal_length) From 690fe928f48fcbbad22eb6451cb5a658713929a0 Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Fri, 5 Jul 2019 11:07:11 -0700 Subject: [PATCH 07/27] apply feedback --- test/test_functional.py | 48 ++++++++++++++++++++++++++++------------- 1 file changed, 33 insertions(+), 15 deletions(-) diff --git a/test/test_functional.py b/test/test_functional.py index 7e59371dba..46533d4c9d 100644 --- a/test/test_functional.py +++ b/test/test_functional.py @@ -7,15 +7,8 @@ class TestFunctional(unittest.TestCase): - # size (2,20) - test_data = torch.tensor([ - [45.4243, 81.9316, 19.1100, 32.4998, 45.3313, 68.8204, 42.0782, 19.7222, - 76.8721, 69.9104, 27.7188, 86.3579, 30.3251, 92.0308, 70.0568, 74.8940, - 94.3127, 82.9875, 88.8303, 96.3460], - [59.4262, 91.0040, 74.7672, 79.8533, 46.7943, 13.6757, 85.5145, 33.0060, - 88.5102, 25.6912, 57.9501, 33.3326, 71.5654, 90.0321, 81.8218, 91.6907, - 87.9834, 16.4177, 62.4474, 0.2146] - ]).float() + data_sizes = (2,20) + number_of_trials = 10 def _test_istft_helper(self, sound, kwargs): stft = torch.stft(sound, **kwargs) @@ -27,7 +20,8 @@ def _test_istft_helper(self, sound, kwargs): self.assertTrue(sound.shape == estimate.shape, (sound.shape, estimate.shape)) self.assertTrue(torch.allclose(sound, estimate, atol=1e-4)) - def test_istft(self): + def test_istft1(self): + # hann_window, centered, normalized, onesided kwargs1 = { 'n_fft': 12, 'hop_length': 4, @@ -39,6 +33,12 @@ def test_istft(self): 'onesided': True, } + for i in range(self.number_of_trials): + test_data = torch.rand(self.data_sizes) + self._test_istft_helper(test_data, kwargs1) + + def test_istft2(self): + # hann_window, centered, not normalized, not onesided kwargs2 = { 'n_fft': 12, 'hop_length': 2, @@ -50,6 +50,12 @@ def test_istft(self): 'onesided': False, } + for i in range(self.number_of_trials): + test_data = torch.rand(self.data_sizes) + self._test_istft_helper(test_data, kwargs2) + + def test_istft3(self): + # hamming_window, centered, normalized, not onesided kwargs3 = { 'n_fft': 15, 'hop_length': 3, @@ -61,6 +67,13 @@ def test_istft(self): 'onesided': False, } + for i in range(self.number_of_trials): + test_data = torch.rand(self.data_sizes) + self._test_istft_helper(test_data, kwargs3) + + def test_istft4(self): + # hamming_window, not centered, not normalized, onesided + # window same size as n_fft kwargs4 = { 'n_fft': 5, 'hop_length': 2, @@ -72,6 +85,13 @@ def test_istft(self): 'onesided': True, } + for i in range(self.number_of_trials): + test_data = torch.rand(self.data_sizes) + self._test_istft_helper(test_data, kwargs4) + + def test_istft5(self): + # hamming_window, not centered, not normalized, not onesided + # window same size as n_fft kwargs5 = { 'n_fft': 3, 'hop_length': 2, @@ -83,11 +103,9 @@ def test_istft(self): 'onesided': False, } - self._test_istft_helper(self.test_data, kwargs1) - self._test_istft_helper(self.test_data, kwargs2) - self._test_istft_helper(self.test_data, kwargs3) - self._test_istft_helper(self.test_data, kwargs4) - self._test_istft_helper(self.test_data, kwargs5) + for i in range(self.number_of_trials): + test_data = torch.rand(self.data_sizes) + self._test_istft_helper(test_data, kwargs5) if __name__ == '__main__': From 1e2f94978b1d9a0617712bca2b100dc3c48b06a7 Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Fri, 5 Jul 2019 11:07:46 -0700 Subject: [PATCH 08/27] flake8 --- test/test_functional.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_functional.py b/test/test_functional.py index 46533d4c9d..0452894633 100644 --- a/test/test_functional.py +++ b/test/test_functional.py @@ -7,7 +7,7 @@ class TestFunctional(unittest.TestCase): - data_sizes = (2,20) + data_sizes = (2, 20) number_of_trials = 10 def _test_istft_helper(self, sound, kwargs): From ee20335ebc48742d209a0795bba330c92b6476fc Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Fri, 5 Jul 2019 11:13:49 -0700 Subject: [PATCH 09/27] apply feedback --- torchaudio/functional.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/torchaudio/functional.py b/torchaudio/functional.py index 1dcda3b71f..f601a96a05 100644 --- a/torchaudio/functional.py +++ b/torchaudio/functional.py @@ -165,7 +165,9 @@ def istft(stft_matrix, # type: Tensor """ device = stft_matrix.device fft_size = stft_matrix.size(1) - assert (onesided and n_fft // 2 + 1 == fft_size) or (not onesided and n_fft == fft_size) + assert (onesided and n_fft // 2 + 1 == fft_size) or (not onesided and n_fft == fft_size), ( + 'one_sided implies that n_fft // 2 + 1 == fft_size and not one_sided implies n_fft == fft_size. ' + + 'Given values were onesided: %s, n_fft: %d, fft_size: %d' % ('True' if onesided else False, n_fft, fft_size)) # use stft defaults for Optionals if win_length is None: From 7242e7b4ad981a0338318afc1753db5414b515bc Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Fri, 5 Jul 2019 11:45:18 -0700 Subject: [PATCH 10/27] apply feedback --- test/test_functional.py | 31 ++++++++++++++++++++++++++----- torchaudio/functional.py | 37 ++++++++++++++++++++++++------------- 2 files changed, 50 insertions(+), 18 deletions(-) diff --git a/test/test_functional.py b/test/test_functional.py index 0452894633..6b899b97c8 100644 --- a/test/test_functional.py +++ b/test/test_functional.py @@ -10,16 +10,19 @@ class TestFunctional(unittest.TestCase): data_sizes = (2, 20) number_of_trials = 10 - def _test_istft_helper(self, sound, kwargs): - stft = torch.stft(sound, **kwargs) - estimate = torchaudio.functional.istft(stft, length=sound.size(1), **kwargs) - + def _compare_estimate(self, sound, estimate): # trim sound for case when constructed signal is shorter than original - sound = sound[:, :estimate.size(1)] + sound = sound[..., :estimate.size(-1)] self.assertTrue(sound.shape == estimate.shape, (sound.shape, estimate.shape)) self.assertTrue(torch.allclose(sound, estimate, atol=1e-4)) + def _test_istft_helper(self, sound, kwargs): + stft = torch.stft(sound, **kwargs) + estimate = torchaudio.functional.istft(stft, length=sound.size(1), **kwargs) + + self._compare_estimate(sound, estimate) + def test_istft1(self): # hann_window, centered, normalized, onesided kwargs1 = { @@ -107,6 +110,24 @@ def test_istft5(self): test_data = torch.rand(self.data_sizes) self._test_istft_helper(test_data, kwargs5) + def test_istft6(self): + # stft = torch.stft(torch.ones(4), 4) + stft = torch.tensor([ + [[4., 0.], [4., 0.], [4., 0.], [4., 0.], [4., 0.]], + [[0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.]], + [[0., 0.], [0., 0.], [0., 0.], [0., 0.], [0., 0.]] + ]) + + estimate = torchaudio.functional.istft(stft, n_fft=4, length=4) + self._compare_estimate(torch.ones(4), estimate) + + def test_istft7(self): + # stft = torch.stft(torch.zeros(4), 4) + stft = torch.zeros((3, 5, 2)) + + estimate = torchaudio.functional.istft(stft, n_fft=4, length=4) + self._compare_estimate(torch.zeros(4), estimate) + if __name__ == '__main__': unittest.main() diff --git a/torchaudio/functional.py b/torchaudio/functional.py index f601a96a05..278f2940c9 100644 --- a/torchaudio/functional.py +++ b/torchaudio/functional.py @@ -106,16 +106,16 @@ def _stft(input, n_fft, hop_length, win_length, window, center, pad_mode, normal return torch.stft(input, n_fft, hop_length, win_length, window, center, pad_mode, normalized, onesided) -def istft(stft_matrix, # type: Tensor - n_fft, # type: int - hop_length, # type: Optional[int] - win_length, # type: Optional[int] - window, # type: Optional[Tensor] - center, # type: bool - pad_mode, # type: str - normalized, # type: bool - onesided, # type: bool - length # type: Optional[int] +def istft(stft_matrix, # type: Tensor + n_fft, # type: int + hop_length=None, # type: Optional[int] + win_length=None, # type: Optional[int] + window=None, # type: Optional[Tensor] + center=True, # type: bool + pad_mode='reflect', # type: str + normalized=False, # type: bool + onesided=True, # type: bool + length=None # type: Optional[int] ): # type: (...) -> Tensor r""" Inverse short time Fourier Transform. This is expected to be the inverse of torch.stft. @@ -147,7 +147,7 @@ def istft(stft_matrix, # type: Tensor Inputs: stft_matrix (Tensor): output of stft where each row of a batch is a frequency and each column is - a window. it has a shape of (batch, fft_size, n_frames, 2) + a window. it has a shape of either (batch, fft_size, n_frames, 2) or (fft_size, n_frames, 2) n_fft (int): size of Fourier transform hop_length (Optional[int]): the distance between neighboring sliding window frames. (Default: win_length // 4) win_length (Optional[int]): the size of window frame and STFT filter. (Default: n_fft) @@ -161,8 +161,15 @@ def istft(stft_matrix, # type: Tensor original signal length). (Default: whole signal) Outputs: - Tensor: least squares estimation of the original signal of size (batch, signal_length) + Tensor: least squares estimation of the original signal of size (batch, signal_length) or (signal_length) """ + stft_matrix_dim = stft_matrix.dim() + assert 3 <= stft_matrix_dim <= 4, ('Incorrect stft dimension: %d' % (stft_matrix_dim)) + + if stft_matrix_dim == 3: + # add a batch dimension + stft_matrix = stft_matrix.unsqueeze(0) + device = stft_matrix.device fft_size = stft_matrix.size(1) assert (onesided and n_fft // 2 + 1 == fft_size) or (not onesided and n_fft == fft_size), ( @@ -236,7 +243,11 @@ def istft(stft_matrix, # type: Tensor assert window_envelop_lowest > 1e-11, ('window overlap add min: %f' % (window_envelop_lowest)) # size (batch, expected_signal_len) - return (y / window_envelop).squeeze(1) + y = (y / window_envelop).squeeze(1) + + if stft_matrix_dim == 3: # remove the batch dimension + y = y.squeeze(0) + return y @torch.jit.script From 50a6f3e95f6024f1d17ce5176611f38f3db30666 Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Mon, 8 Jul 2019 12:41:49 -0700 Subject: [PATCH 11/27] apply feedback --- test/test_functional.py | 76 ++++++++++++++++++++++++++--------------- 1 file changed, 48 insertions(+), 28 deletions(-) diff --git a/test/test_functional.py b/test/test_functional.py index 6b899b97c8..ba0729e439 100644 --- a/test/test_functional.py +++ b/test/test_functional.py @@ -8,22 +8,27 @@ class TestFunctional(unittest.TestCase): data_sizes = (2, 20) - number_of_trials = 10 + number_of_trials = 100 def _compare_estimate(self, sound, estimate): # trim sound for case when constructed signal is shorter than original sound = sound[..., :estimate.size(-1)] self.assertTrue(sound.shape == estimate.shape, (sound.shape, estimate.shape)) - self.assertTrue(torch.allclose(sound, estimate, atol=1e-4)) + self.assertTrue(torch.allclose(sound, estimate, atol=1e-6)) - def _test_istft_helper(self, sound, kwargs): - stft = torch.stft(sound, **kwargs) - estimate = torchaudio.functional.istft(stft, length=sound.size(1), **kwargs) + def _test_istft_is_inverse_of_stft(self, kwargs): + # generates a random sound signal for each tril and then does the stft/istft + # operation to check whether we can reconstruct signal + for i in range(self.number_of_trials): + sound = torch.rand(self.data_sizes) + + stft = torch.stft(sound, **kwargs) + estimate = torchaudio.functional.istft(stft, length=sound.size(1), **kwargs) - self._compare_estimate(sound, estimate) + self._compare_estimate(sound, estimate) - def test_istft1(self): + def test_istft_is_inverse_of_stft1(self): # hann_window, centered, normalized, onesided kwargs1 = { 'n_fft': 12, @@ -36,11 +41,9 @@ def test_istft1(self): 'onesided': True, } - for i in range(self.number_of_trials): - test_data = torch.rand(self.data_sizes) - self._test_istft_helper(test_data, kwargs1) + self._test_istft_is_inverse_of_stft(kwargs1) - def test_istft2(self): + def test_istft_is_inverse_of_stft2(self): # hann_window, centered, not normalized, not onesided kwargs2 = { 'n_fft': 12, @@ -53,11 +56,9 @@ def test_istft2(self): 'onesided': False, } - for i in range(self.number_of_trials): - test_data = torch.rand(self.data_sizes) - self._test_istft_helper(test_data, kwargs2) + self._test_istft_is_inverse_of_stft(kwargs2) - def test_istft3(self): + def test_istft_is_inverse_of_stft3(self): # hamming_window, centered, normalized, not onesided kwargs3 = { 'n_fft': 15, @@ -70,11 +71,9 @@ def test_istft3(self): 'onesided': False, } - for i in range(self.number_of_trials): - test_data = torch.rand(self.data_sizes) - self._test_istft_helper(test_data, kwargs3) + self._test_istft_is_inverse_of_stft(kwargs3) - def test_istft4(self): + def test_istft_is_inverse_of_stft4(self): # hamming_window, not centered, not normalized, onesided # window same size as n_fft kwargs4 = { @@ -88,11 +87,9 @@ def test_istft4(self): 'onesided': True, } - for i in range(self.number_of_trials): - test_data = torch.rand(self.data_sizes) - self._test_istft_helper(test_data, kwargs4) + self._test_istft_is_inverse_of_stft(kwargs4) - def test_istft5(self): + def test_istft_is_inverse_of_stft5(self): # hamming_window, not centered, not normalized, not onesided # window same size as n_fft kwargs5 = { @@ -106,11 +103,9 @@ def test_istft5(self): 'onesided': False, } - for i in range(self.number_of_trials): - test_data = torch.rand(self.data_sizes) - self._test_istft_helper(test_data, kwargs5) + self._test_istft_is_inverse_of_stft(kwargs5) - def test_istft6(self): + def test_istft_of_ones(self): # stft = torch.stft(torch.ones(4), 4) stft = torch.tensor([ [[4., 0.], [4., 0.], [4., 0.], [4., 0.], [4., 0.]], @@ -121,13 +116,38 @@ def test_istft6(self): estimate = torchaudio.functional.istft(stft, n_fft=4, length=4) self._compare_estimate(torch.ones(4), estimate) - def test_istft7(self): + def test_istft_of_zeros(self): # stft = torch.stft(torch.zeros(4), 4) stft = torch.zeros((3, 5, 2)) estimate = torchaudio.functional.istft(stft, n_fft=4, length=4) self._compare_estimate(torch.zeros(4), estimate) + def test_istft_requires_overlap_windows(self): + # the window is size 1 but it hops 20 so there is a gap which throw an error + stft = torch.rand((3, 5, 2)) + self.assertRaises(AssertionError, torchaudio.functional.istft, stft, n_fft=4, + hop_length=20, win_length=1, window=torch.ones(1)) + + def test_istft_requires_nola(self): + stft = torch.zeros((3, 5, 2)) + kwargs_ok = { + 'n_fft': 4, + 'win_length': 4, + 'window': torch.ones(4), + } + + kwargs_not_ok = { + 'n_fft': 4, + 'win_length': 4, + 'window': torch.zeros(4), + } + + # A window of ones meets NOLA but a window of zeros does not. This should + # throw an error. + torchaudio.functional.istft(stft, **kwargs_ok) + self.assertRaises(AssertionError, torchaudio.functional.istft, stft, **kwargs_not_ok) + if __name__ == '__main__': unittest.main() From 38c94b73d6d09edce8a09a3e9ffbddfed492ac9d Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Mon, 8 Jul 2019 14:42:55 -0700 Subject: [PATCH 12/27] fix test_transforms.py. pytorch nightly must have changed from_numpy or something --- test/test_transforms.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/test/test_transforms.py b/test/test_transforms.py index 8ffe9fbafe..7b355470bb 100644 --- a/test/test_transforms.py +++ b/test/test_transforms.py @@ -230,10 +230,10 @@ def _test_librosa_consistency_helper(n_fft, hop_length, power, n_mels, n_mfcc, s librosa_mel = librosa.feature.melspectrogram(y=sound_librosa, sr=sample_rate, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels, htk=True, norm=None) + librosa_mel_tensor = torch.from_numpy(librosa_mel) torch_mel = melspect_transform(sound).squeeze().cpu().t() - # lower tolerance, think it's double vs. float - self.assertTrue(torch.allclose(torch_mel.type(torch.double), torch.from_numpy(librosa_mel), atol=5e-3)) + self.assertTrue(torch.allclose(torch_mel.type(librosa_mel_tensor.dtype), librosa_mel_tensor, atol=5e-3)) # test s2db @@ -244,8 +244,9 @@ def _test_librosa_consistency_helper(n_fft, hop_length, power, n_mels, n_mfcc, s db_torch = db_transform(melspect_transform(sound)).squeeze().cpu().t() db_librosa = librosa.core.spectrum.power_to_db(librosa_mel) + db_librosa_tensor = torch.from_numpy(db_librosa) - self.assertTrue(torch.allclose(db_torch.type(torch.double), torch.from_numpy(db_librosa), atol=5e-3)) + self.assertTrue(torch.allclose(db_torch.type(db_librosa_tensor.dtype), db_librosa_tensor, atol=5e-3)) # test MFCC melkwargs = {'hop': hop_length, 'n_fft': n_fft} @@ -269,9 +270,10 @@ def _test_librosa_consistency_helper(n_fft, hop_length, power, n_mels, n_mfcc, s # n_mels=n_mels) librosa_mfcc = scipy.fftpack.dct(db_librosa, axis=0, type=2, norm='ortho')[:n_mfcc] + librosa_mfcc_tensor = torch.from_numpy(librosa_mfcc) torch_mfcc = mfcc_transform(sound).squeeze().cpu().t() - self.assertTrue(torch.allclose(torch_mfcc.type(torch.double), torch.from_numpy(librosa_mfcc), atol=5e-3)) + self.assertTrue(torch.allclose(torch_mfcc.type(librosa_mfcc_tensor.dtype), librosa_mfcc_tensor, atol=5e-3)) kwargs1 = { 'n_fft': 400, From 60ae1bbc5fa5ea1bad032a368b3904d9b5207c59 Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Mon, 8 Jul 2019 16:08:51 -0700 Subject: [PATCH 13/27] apply feedback --- test/test_functional.py | 20 ++++++++++++++++++-- torchaudio/functional.py | 15 +++------------ 2 files changed, 21 insertions(+), 14 deletions(-) diff --git a/test/test_functional.py b/test/test_functional.py index ba0729e439..e41f8c795d 100644 --- a/test/test_functional.py +++ b/test/test_functional.py @@ -1,3 +1,4 @@ +import math import os import torch @@ -10,12 +11,12 @@ class TestFunctional(unittest.TestCase): data_sizes = (2, 20) number_of_trials = 100 - def _compare_estimate(self, sound, estimate): + def _compare_estimate(self, sound, estimate, atol=1e-6): # trim sound for case when constructed signal is shorter than original sound = sound[..., :estimate.size(-1)] self.assertTrue(sound.shape == estimate.shape, (sound.shape, estimate.shape)) - self.assertTrue(torch.allclose(sound, estimate, atol=1e-6)) + self.assertTrue(torch.allclose(sound, estimate, atol=atol)) def _test_istft_is_inverse_of_stft(self, kwargs): # generates a random sound signal for each tril and then does the stft/istft @@ -148,6 +149,21 @@ def test_istft_requires_nola(self): torchaudio.functional.istft(stft, **kwargs_ok) self.assertRaises(AssertionError, torchaudio.functional.istft, stft, **kwargs_not_ok) + def test_istft_of_sine(self): + # stft of 123*sin(2*pi/5*x) with the hop length and window size equaling the period of L = 5 + x = torch.arange(10, dtype=torch.get_default_dtype()) + L = 5 + amplitude = 123 + sound = amplitude * torch.sin(2 * math.pi / L * x) + # stft = torch.stft(sound, L, hop_length=L, win_length=L, + # window=torch.ones(L), center=False, normalized=False) + stft = torch.zeros((3, 2, 2)) + stft[1, :, 1] = -(amplitude * L) / 2.0 + + estimate = torchaudio.functional.istft(stft, L, hop_length=L, win_length=L, + window=torch.ones(L), center=False, normalized=False) + self._compare_estimate(sound, estimate, atol=1e-4) + if __name__ == '__main__': unittest.main() diff --git a/torchaudio/functional.py b/torchaudio/functional.py index 278f2940c9..da4b4ff6a2 100644 --- a/torchaudio/functional.py +++ b/torchaudio/functional.py @@ -122,29 +122,22 @@ def istft(stft_matrix, # type: Tensor It has the same parameters (+ additional optional parameter of :attr:`length`) and it should return the least squares estimation of the original signal. The algorithm will check using the NOLA condition ( nonzero overlap). - Important consideration in the parameters :attr:`window` and :attr:`center` so that the envelop created by the summation of all the windows is never zero at certain point in time. Specifically, :math:`\sum_{t=-\ infty}^{\ infty} w^2[n-t\times hop\_length] \neq 0`. - Since stft discards elements at the end of the signal if they do not fit in a frame, the istft may return a shorter signal than the original signal (can occur if :attr:`center` is False since the signal isn't padded). - If :attr:`center` is True, then there will be padding e.g. 'constant', 'reflect', etc. Left padding can be trimmed off exactly because they can be calculated but right padding cannot be calculated without additional information. - Example: Suppose the last window is: [17, 18, 0, 0, 0] vs [18, 0, 0, 0, 0] The n_frames, hop_length, win_length are all the same which prevents the calculation of right padding. - These additional values could be zeros or a reflection of the signal so providing :attr:`length` could be useful. If :attr:`length` is None then padding will be aggressively removed (some loss of signal). - [1] D. W. Griffin and J. S. Lim, “Signal estimation from modified short-time Fourier transform,” IEEE Trans. ASSP, vol.32, no.2, pp.236–243, Apr. 1984. - Inputs: stft_matrix (Tensor): output of stft where each row of a batch is a frequency and each column is a window. it has a shape of either (batch, fft_size, n_frames, 2) or (fft_size, n_frames, 2) @@ -159,7 +152,6 @@ def istft(stft_matrix, # type: Tensor onesided (bool): whether the STFT is onesided length (Optional[int]): the amount to trim the signal by (i.e. the original signal length). (Default: whole signal) - Outputs: Tensor: least squares estimation of the original signal of size (batch, signal_length) or (signal_length) """ @@ -199,10 +191,9 @@ def istft(stft_matrix, # type: Tensor assert window.size(0) == n_fft # win_length and n_fft are synonymous from here on - # size (batch, n_frames, fft_size, 2) - stft_matrix = stft_matrix.transpose(1, 2) - # size (batch, n_frames, n_fft) - stft_matrix = torch.irfft(stft_matrix, 1, normalized, onesided, signal_sizes=(n_fft,)) + stft_matrix = stft_matrix.transpose(1, 2) # size (batch, n_frames, fft_size, 2) + stft_matrix = torch.irfft(stft_matrix, 1, normalized, + onesided, signal_sizes=(n_fft,)) # size (batch, n_frames, n_fft) assert stft_matrix.size(2) == n_fft n_frames = stft_matrix.size(1) From 1c56a0619c88bc476287a8b183030521ec37394f Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Mon, 8 Jul 2019 16:09:19 -0700 Subject: [PATCH 14/27] apply feedback --- torchaudio/functional.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/torchaudio/functional.py b/torchaudio/functional.py index da4b4ff6a2..0b295fe144 100644 --- a/torchaudio/functional.py +++ b/torchaudio/functional.py @@ -198,13 +198,12 @@ def istft(stft_matrix, # type: Tensor assert stft_matrix.size(2) == n_fft n_frames = stft_matrix.size(1) - # size (batch, n_frames, n_fft) - ytmp = stft_matrix * window.view(1, 1, n_fft) + ytmp = stft_matrix * window.view(1, 1, n_fft) # size (batch, n_frames, n_fft) # each column of a batch is a frame which needs to be overlap added at the right place ytmp = ytmp.transpose(1, 2) # size (batch, n_fft, n_frames) - # size (n_fft, 1, n_fft) - eye = torch.eye(n_fft, requires_grad=False, device=device).unsqueeze(1) + eye = torch.eye(n_fft, requires_grad=False, + device=device).unsqueeze(1) # size (n_fft, 1, n_fft) # this does overlap add where the frames of ytmp are added such that the i'th frame of # ytmp is added starting at i*hop_length in the output From fe001e172165f7030e1ebd46bb84e01300ffd689 Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Mon, 8 Jul 2019 16:10:53 -0700 Subject: [PATCH 15/27] apply feedback --- torchaudio/functional.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/torchaudio/functional.py b/torchaudio/functional.py index 0b295fe144..a7fb5298c8 100644 --- a/torchaudio/functional.py +++ b/torchaudio/functional.py @@ -207,14 +207,13 @@ def istft(stft_matrix, # type: Tensor # this does overlap add where the frames of ytmp are added such that the i'th frame of # ytmp is added starting at i*hop_length in the output - # size (batch, 1, expected_signal_len) - y = torch.nn.functional.conv_transpose1d(ytmp, eye, stride=hop_length, padding=0) + y = torch.nn.functional.conv_transpose1d( + ytmp, eye, stride=hop_length, padding=0) # size (batch, 1, expected_signal_len) # do the same for the window function - # size (1, n_fft, n_frames) - window_sq = window.pow(2).view(n_fft, 1).repeat((1, n_frames)).unsqueeze(0) - # size (1, 1, expected_signal_len) - window_envelop = torch.nn.functional.conv_transpose1d(window_sq, eye, stride=hop_length, padding=0) + window_sq = window.pow(2).view(n_fft, 1).repeat((1, n_frames)).unsqueeze(0) # size (1, n_fft, n_frames) + window_envelop = torch.nn.functional.conv_transpose1d( + window_sq, eye, stride=hop_length, padding=0) # size (1, 1, expected_signal_len) expected_signal_len = n_fft + hop_length * (n_frames - 1) assert y.size(2) == expected_signal_len @@ -232,8 +231,7 @@ def istft(stft_matrix, # type: Tensor window_envelop_lowest = window_envelop.abs().min() assert window_envelop_lowest > 1e-11, ('window overlap add min: %f' % (window_envelop_lowest)) - # size (batch, expected_signal_len) - y = (y / window_envelop).squeeze(1) + y = (y / window_envelop).squeeze(1) # size (batch, expected_signal_len) if stft_matrix_dim == 3: # remove the batch dimension y = y.squeeze(0) From 8427a89cea12a3482add0678d3a56079c2c3d8e5 Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Tue, 9 Jul 2019 05:57:46 -0700 Subject: [PATCH 16/27] apply feedback --- test/test_functional.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/test/test_functional.py b/test/test_functional.py index e41f8c795d..3d07fb8c90 100644 --- a/test/test_functional.py +++ b/test/test_functional.py @@ -149,21 +149,29 @@ def test_istft_requires_nola(self): torchaudio.functional.istft(stft, **kwargs_ok) self.assertRaises(AssertionError, torchaudio.functional.istft, stft, **kwargs_not_ok) - def test_istft_of_sine(self): - # stft of 123*sin(2*pi/5*x) with the hop length and window size equaling the period of L = 5 - x = torch.arange(10, dtype=torch.get_default_dtype()) - L = 5 - amplitude = 123 + def _test_istft_of_sine(self, amplitude, L, n): + # stft of amplitude*sin(2*pi/L*n*x) with the hop length and window size equaling L + x = torch.arange(2 * L, dtype=torch.get_default_dtype()) sound = amplitude * torch.sin(2 * math.pi / L * x) # stft = torch.stft(sound, L, hop_length=L, win_length=L, - # window=torch.ones(L), center=False, normalized=False) - stft = torch.zeros((3, 2, 2)) + # window=torch.ones(L), center=False, normalized=False) + stft = torch.zeros((L // 2 + 1, 2, 2)) stft[1, :, 1] = -(amplitude * L) / 2.0 estimate = torchaudio.functional.istft(stft, L, hop_length=L, win_length=L, window=torch.ones(L), center=False, normalized=False) self._compare_estimate(sound, estimate, atol=1e-4) + def test_istft_of_sine(self): + self._test_istft_of_sine(amplitude=123, L=5, n=1) + self._test_istft_of_sine(amplitude=234, L=5, n=2) + self._test_istft_of_sine(amplitude=345, L=5, n=3) + self._test_istft_of_sine(amplitude=111, L=6, n=3) + self._test_istft_of_sine(amplitude=222, L=7, n=4) + self._test_istft_of_sine(amplitude=100, L=8, n=5) + self._test_istft_of_sine(amplitude=315, L=9, n=6) + self._test_istft_of_sine(amplitude=410, L=10, n=7) + if __name__ == '__main__': unittest.main() From 80255afe1d4d69efe3a6abf7bcdb936760bf3316 Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Tue, 9 Jul 2019 06:02:37 -0700 Subject: [PATCH 17/27] flake8 --- test/test_functional.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_functional.py b/test/test_functional.py index 3d07fb8c90..1ab7a61ee4 100644 --- a/test/test_functional.py +++ b/test/test_functional.py @@ -154,7 +154,7 @@ def _test_istft_of_sine(self, amplitude, L, n): x = torch.arange(2 * L, dtype=torch.get_default_dtype()) sound = amplitude * torch.sin(2 * math.pi / L * x) # stft = torch.stft(sound, L, hop_length=L, win_length=L, - # window=torch.ones(L), center=False, normalized=False) + # window=torch.ones(L), center=False, normalized=False) stft = torch.zeros((L // 2 + 1, 2, 2)) stft[1, :, 1] = -(amplitude * L) / 2.0 From 1d79b54e1e0bde8bdb49399815180c9740ef86ee Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Tue, 9 Jul 2019 06:44:07 -0700 Subject: [PATCH 18/27] apply feedback --- test/test_functional.py | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/test/test_functional.py b/test/test_functional.py index 1ab7a61ee4..13ffcd91f7 100644 --- a/test/test_functional.py +++ b/test/test_functional.py @@ -11,12 +11,12 @@ class TestFunctional(unittest.TestCase): data_sizes = (2, 20) number_of_trials = 100 - def _compare_estimate(self, sound, estimate, atol=1e-6): + def _compare_estimate(self, sound, estimate, atol=1e-6, rtol=1e-8): # trim sound for case when constructed signal is shorter than original sound = sound[..., :estimate.size(-1)] self.assertTrue(sound.shape == estimate.shape, (sound.shape, estimate.shape)) - self.assertTrue(torch.allclose(sound, estimate, atol=atol)) + self.assertTrue(torch.allclose(sound, estimate, atol=atol, rtol=rtol)) def _test_istft_is_inverse_of_stft(self, kwargs): # generates a random sound signal for each tril and then does the stft/istft @@ -152,25 +152,30 @@ def test_istft_requires_nola(self): def _test_istft_of_sine(self, amplitude, L, n): # stft of amplitude*sin(2*pi/L*n*x) with the hop length and window size equaling L x = torch.arange(2 * L, dtype=torch.get_default_dtype()) - sound = amplitude * torch.sin(2 * math.pi / L * x) + sound = amplitude * torch.sin(2 * math.pi / L * x * n) # stft = torch.stft(sound, L, hop_length=L, win_length=L, # window=torch.ones(L), center=False, normalized=False) stft = torch.zeros((L // 2 + 1, 2, 2)) - stft[1, :, 1] = -(amplitude * L) / 2.0 + stft_largest_val = (amplitude * L) / 2.0 + if n < stft.size(0): + stft[n, :, 1] = -stft_largest_val + + if 0 <= L - n < stft.size(0): + # symmetric about L // 2 + stft[L - n, :, 1] = stft_largest_val estimate = torchaudio.functional.istft(stft, L, hop_length=L, win_length=L, window=torch.ones(L), center=False, normalized=False) - self._compare_estimate(sound, estimate, atol=1e-4) + self._compare_estimate(sound, estimate, atol=1e-3) def test_istft_of_sine(self): self._test_istft_of_sine(amplitude=123, L=5, n=1) - self._test_istft_of_sine(amplitude=234, L=5, n=2) - self._test_istft_of_sine(amplitude=345, L=5, n=3) - self._test_istft_of_sine(amplitude=111, L=6, n=3) - self._test_istft_of_sine(amplitude=222, L=7, n=4) - self._test_istft_of_sine(amplitude=100, L=8, n=5) - self._test_istft_of_sine(amplitude=315, L=9, n=6) - self._test_istft_of_sine(amplitude=410, L=10, n=7) + self._test_istft_of_sine(amplitude=150, L=5, n=2) + self._test_istft_of_sine(amplitude=111, L=5, n=3) + self._test_istft_of_sine(amplitude=160, L=7, n=4) + self._test_istft_of_sine(amplitude=145, L=8, n=5) + self._test_istft_of_sine(amplitude=80, L=9, n=6) + self._test_istft_of_sine(amplitude=99, L=10, n=7) if __name__ == '__main__': From 2f36eb7bc40d76e4f98f76e563134ef6d9230ba6 Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Tue, 9 Jul 2019 09:21:48 -0700 Subject: [PATCH 19/27] apply feedback --- test/test_functional.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_functional.py b/test/test_functional.py index 13ffcd91f7..fa1a2093f3 100644 --- a/test/test_functional.py +++ b/test/test_functional.py @@ -151,7 +151,7 @@ def test_istft_requires_nola(self): def _test_istft_of_sine(self, amplitude, L, n): # stft of amplitude*sin(2*pi/L*n*x) with the hop length and window size equaling L - x = torch.arange(2 * L, dtype=torch.get_default_dtype()) + x = torch.arange(2 * L + 1, dtype=torch.get_default_dtype()) sound = amplitude * torch.sin(2 * math.pi / L * x * n) # stft = torch.stft(sound, L, hop_length=L, win_length=L, # window=torch.ones(L), center=False, normalized=False) From 21c95a259029f1ca2f13996ce9bdfd7aae1fd570 Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Tue, 9 Jul 2019 09:36:36 -0700 Subject: [PATCH 20/27] test --- build_tools/travis/install.sh | 3 +++ 1 file changed, 3 insertions(+) diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 9c20394b83..29840f0191 100644 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -37,9 +37,11 @@ then -O miniconda.sh fi chmod +x miniconda.sh && ./miniconda.sh -b -f + echo "jasonb" conda update --yes conda echo "Creating environment to run tests in." conda create -n testenv --yes python="$PYTHON_VERSION" + echo "jasonc" fi cd .. popd @@ -47,6 +49,7 @@ popd # Activate the python environment we created. source activate testenv +echo "jasona" # Install requirements via pip in our conda environment pip install -r requirements.txt From dd2b838de3bcc99932c85ecc81c6f7d1a04265bc Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Tue, 9 Jul 2019 09:54:50 -0700 Subject: [PATCH 21/27] test --- build_tools/travis/install.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 29840f0191..9ae4e43e63 100644 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -38,7 +38,7 @@ then fi chmod +x miniconda.sh && ./miniconda.sh -b -f echo "jasonb" - conda update --yes conda + # conda update --yes conda echo "Creating environment to run tests in." conda create -n testenv --yes python="$PYTHON_VERSION" echo "jasonc" From a0de40d8bff8b6d8746f68a5ea178be53631afee Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Tue, 9 Jul 2019 10:01:43 -0700 Subject: [PATCH 22/27] done --- build_tools/travis/install.sh | 4 ---- 1 file changed, 4 deletions(-) diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index 9ae4e43e63..c63f205524 100644 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -37,11 +37,8 @@ then -O miniconda.sh fi chmod +x miniconda.sh && ./miniconda.sh -b -f - echo "jasonb" - # conda update --yes conda echo "Creating environment to run tests in." conda create -n testenv --yes python="$PYTHON_VERSION" - echo "jasonc" fi cd .. popd @@ -49,7 +46,6 @@ popd # Activate the python environment we created. source activate testenv -echo "jasona" # Install requirements via pip in our conda environment pip install -r requirements.txt From d2a72e666ec01057791b9f541802b3544091a388 Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Tue, 9 Jul 2019 12:02:53 -0700 Subject: [PATCH 23/27] apply feedback --- test/test_functional.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/test_functional.py b/test/test_functional.py index fa1a2093f3..2d0a58426d 100644 --- a/test/test_functional.py +++ b/test/test_functional.py @@ -11,6 +11,10 @@ class TestFunctional(unittest.TestCase): data_sizes = (2, 20) number_of_trials = 100 + def setUp(self): + # we want to make sure that the random values are reproducible + torch.manual_seed(0) + def _compare_estimate(self, sound, estimate, atol=1e-6, rtol=1e-8): # trim sound for case when constructed signal is shorter than original sound = sound[..., :estimate.size(-1)] From 6a9ef42e02c8c353aa923d46a3745167b01378fa Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Tue, 9 Jul 2019 12:06:07 -0700 Subject: [PATCH 24/27] apply feedback --- test/test_functional.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/test/test_functional.py b/test/test_functional.py index 2d0a58426d..9de26e01d3 100644 --- a/test/test_functional.py +++ b/test/test_functional.py @@ -8,7 +8,7 @@ class TestFunctional(unittest.TestCase): - data_sizes = (2, 20) + data_sizes = [(2, 20), (3, 15)] number_of_trials = 100 def setUp(self): @@ -25,13 +25,14 @@ def _compare_estimate(self, sound, estimate, atol=1e-6, rtol=1e-8): def _test_istft_is_inverse_of_stft(self, kwargs): # generates a random sound signal for each tril and then does the stft/istft # operation to check whether we can reconstruct signal - for i in range(self.number_of_trials): - sound = torch.rand(self.data_sizes) + for data_size in self.data_sizes: + for i in range(self.number_of_trials): + sound = torch.rand(data_size) - stft = torch.stft(sound, **kwargs) - estimate = torchaudio.functional.istft(stft, length=sound.size(1), **kwargs) + stft = torch.stft(sound, **kwargs) + estimate = torchaudio.functional.istft(stft, length=sound.size(1), **kwargs) - self._compare_estimate(sound, estimate) + self._compare_estimate(sound, estimate) def test_istft_is_inverse_of_stft1(self): # hann_window, centered, normalized, onesided From fc57968868d14a6a9ae1f2003ed4c7bd203fff0e Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Tue, 9 Jul 2019 12:13:18 -0700 Subject: [PATCH 25/27] apply feedback --- test/test_functional.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/test_functional.py b/test/test_functional.py index 9de26e01d3..b648b45c3e 100644 --- a/test/test_functional.py +++ b/test/test_functional.py @@ -1,5 +1,4 @@ import math -import os import torch import torchaudio @@ -26,7 +25,7 @@ def _test_istft_is_inverse_of_stft(self, kwargs): # generates a random sound signal for each tril and then does the stft/istft # operation to check whether we can reconstruct signal for data_size in self.data_sizes: - for i in range(self.number_of_trials): + for _ in range(self.number_of_trials): sound = torch.rand(data_size) stft = torch.stft(sound, **kwargs) From d8bbb8dc3c842dcf6048f6df0002c98ffa7de10f Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Tue, 9 Jul 2019 13:12:19 -0700 Subject: [PATCH 26/27] revert files --- build_tools/travis/install.sh | 1 + test/test_transforms.py | 10 ++++------ 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/build_tools/travis/install.sh b/build_tools/travis/install.sh index c63f205524..9c20394b83 100644 --- a/build_tools/travis/install.sh +++ b/build_tools/travis/install.sh @@ -37,6 +37,7 @@ then -O miniconda.sh fi chmod +x miniconda.sh && ./miniconda.sh -b -f + conda update --yes conda echo "Creating environment to run tests in." conda create -n testenv --yes python="$PYTHON_VERSION" fi diff --git a/test/test_transforms.py b/test/test_transforms.py index 7b355470bb..8ffe9fbafe 100644 --- a/test/test_transforms.py +++ b/test/test_transforms.py @@ -230,10 +230,10 @@ def _test_librosa_consistency_helper(n_fft, hop_length, power, n_mels, n_mfcc, s librosa_mel = librosa.feature.melspectrogram(y=sound_librosa, sr=sample_rate, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels, htk=True, norm=None) - librosa_mel_tensor = torch.from_numpy(librosa_mel) torch_mel = melspect_transform(sound).squeeze().cpu().t() - self.assertTrue(torch.allclose(torch_mel.type(librosa_mel_tensor.dtype), librosa_mel_tensor, atol=5e-3)) + # lower tolerance, think it's double vs. float + self.assertTrue(torch.allclose(torch_mel.type(torch.double), torch.from_numpy(librosa_mel), atol=5e-3)) # test s2db @@ -244,9 +244,8 @@ def _test_librosa_consistency_helper(n_fft, hop_length, power, n_mels, n_mfcc, s db_torch = db_transform(melspect_transform(sound)).squeeze().cpu().t() db_librosa = librosa.core.spectrum.power_to_db(librosa_mel) - db_librosa_tensor = torch.from_numpy(db_librosa) - self.assertTrue(torch.allclose(db_torch.type(db_librosa_tensor.dtype), db_librosa_tensor, atol=5e-3)) + self.assertTrue(torch.allclose(db_torch.type(torch.double), torch.from_numpy(db_librosa), atol=5e-3)) # test MFCC melkwargs = {'hop': hop_length, 'n_fft': n_fft} @@ -270,10 +269,9 @@ def _test_librosa_consistency_helper(n_fft, hop_length, power, n_mels, n_mfcc, s # n_mels=n_mels) librosa_mfcc = scipy.fftpack.dct(db_librosa, axis=0, type=2, norm='ortho')[:n_mfcc] - librosa_mfcc_tensor = torch.from_numpy(librosa_mfcc) torch_mfcc = mfcc_transform(sound).squeeze().cpu().t() - self.assertTrue(torch.allclose(torch_mfcc.type(librosa_mfcc_tensor.dtype), librosa_mfcc_tensor, atol=5e-3)) + self.assertTrue(torch.allclose(torch_mfcc.type(torch.double), torch.from_numpy(librosa_mfcc), atol=5e-3)) kwargs1 = { 'n_fft': 400, From 92801d588f4da2a2c219ab534f33a10090f6fc6d Mon Sep 17 00:00:00 2001 From: Jason Lian Date: Tue, 9 Jul 2019 13:36:06 -0700 Subject: [PATCH 27/27] apply feedback --- test/test_functional.py | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/test/test_functional.py b/test/test_functional.py index b648b45c3e..674baf0b21 100644 --- a/test/test_functional.py +++ b/test/test_functional.py @@ -9,10 +9,25 @@ class TestFunctional(unittest.TestCase): data_sizes = [(2, 20), (3, 15)] number_of_trials = 100 + stored_rand_data = [] + fixed_precision = int(1e10) def setUp(self): # we want to make sure that the random values are reproducible + self.stored_rand_data.clear() torch.manual_seed(0) + for data_size in self.data_sizes: + rand_data1 = torch.randint(low=-self.fixed_precision, high=self.fixed_precision, size=data_size) + rand_data2 = torch.randint(low=-self.fixed_precision, high=self.fixed_precision, size=data_size) + self.stored_rand_data.append([rand_data1, rand_data2]) + + def _get_random_tensor(self, i): + # gets a random tensor of size data_sizes[i]. adds to previous tensors and then mods it. + rand_data = self.stored_rand_data[i] + rand_data3 = (rand_data[0] + rand_data[1]) % self.fixed_precision + rand_data.pop(0) + rand_data.append(rand_data3) + return rand_data3.float() / self.fixed_precision def _compare_estimate(self, sound, estimate, atol=1e-6, rtol=1e-8): # trim sound for case when constructed signal is shorter than original @@ -24,9 +39,9 @@ def _compare_estimate(self, sound, estimate, atol=1e-6, rtol=1e-8): def _test_istft_is_inverse_of_stft(self, kwargs): # generates a random sound signal for each tril and then does the stft/istft # operation to check whether we can reconstruct signal - for data_size in self.data_sizes: + for i in range(len(self.data_sizes)): for _ in range(self.number_of_trials): - sound = torch.rand(data_size) + sound = self._get_random_tensor(i) stft = torch.stft(sound, **kwargs) estimate = torchaudio.functional.istft(stft, length=sound.size(1), **kwargs) @@ -170,6 +185,7 @@ def _test_istft_of_sine(self, amplitude, L, n): estimate = torchaudio.functional.istft(stft, L, hop_length=L, win_length=L, window=torch.ones(L), center=False, normalized=False) + # There is a larger error due to the scaling of amplitude self._compare_estimate(sound, estimate, atol=1e-3) def test_istft_of_sine(self):