From 83d6a4671c0dc05de6b8aa37456856929e3c1748 Mon Sep 17 00:00:00 2001 From: Marc Date: Sat, 8 Jun 2019 02:04:30 +0200 Subject: [PATCH 1/5] Add new transforms (pad, randOpposite, randCrop, randStrech) --- test/test_transforms.py | 31 ++++++++++ torchaudio/functional.py | 125 ++++++++++++++++++++++++++++++++++++++- torchaudio/transforms.py | 99 +++++++++++++++++++++++++++++++ 3 files changed, 254 insertions(+), 1 deletion(-) diff --git a/test/test_transforms.py b/test/test_transforms.py index 31b5f994be..e16a426dd2 100644 --- a/test/test_transforms.py +++ b/test/test_transforms.py @@ -303,6 +303,37 @@ def _test_librosa_consistency_helper(n_fft, hop_length, power, n_mels, n_mfcc, s _test_librosa_consistency_helper(**kwargs2) _test_librosa_consistency_helper(**kwargs3) + def test_random_opposite(self): + audio_orig = self.sig.clone() + + audio_flipped = transforms.RandomOpposite(probability=0)(audio_orig) + self.assertTrue(torch.allclose(audio_flipped, -audio_orig, atol=5e-3)) + + audio_flipped = transforms.RandomOpposite(probability=1)(audio_orig) + self.assertTrue(torch.allclose(audio_flipped, audio_orig, atol=5e-3)) + + def test_random_strech(self): + audio_orig = self.sig.clone().transpose(0, 1) + + audio_streched = transforms.RandomStrech(max_factor=1)(audio_orig) + self.assertTrue(torch.allclose(audio_streched, audio_orig, atol=5e-3)) + + audio_streched = transforms.RandomStrech(max_factor=2)(audio_orig) + self.assertNotEqual(audio_streched.size(1), audio_orig.size(1)) + # False if random resturns one... Unlikely + + def test_random_crop(self): + audio_orig = self.sig.clone().transpose(0, 1) + + croped_audio = transforms.RandomCrop(200)(audio_orig) + self.assertEqual(croped_audio.size(1), 200) + + def test_pad(self): + audio_orig = self.sig.clone().transpose(0, 1) + + padded_audio = transforms.Pad(200, 0)(audio_orig) + self.assertEqual(padded_audio.size(1), audio_orig.size(1) + 200 * 2) + if __name__ == '__main__': unittest.main() diff --git a/torchaudio/functional.py b/torchaudio/functional.py index 346ffbf9a9..48e7a355cb 100644 --- a/torchaudio/functional.py +++ b/torchaudio/functional.py @@ -15,7 +15,13 @@ 'MFCC', 'BLC2CBL', 'mu_law_encoding', - 'mu_law_expanding' + 'mu_law_expanding', + 'crop_in_between', + 'random_crop', + 'strech', + 'random_strech', + 'opposite', + 'random_opposite' ] @@ -350,3 +356,120 @@ def mu_law_expanding(x_mu, qc): x = ((x_mu) / mu) * 2 - 1. x = torch.sign(x) * (torch.exp(torch.abs(x) * torch.log1p(mu)) - 1.) / mu return x + + +def crop_in_between(tensor, start, end, ch_dim): + """Crops a piece of tensor + + Args: + tensor (Tensor): Tensor of audio of size (NxC) or (CxN) + start (int): Starting point of crop + end (int): Ending point of crop + ch_dim (int): Dimension of channel (not size) + + Returns: + Tensor: a piece of the tensor + """ + if ch_dim == 1: + tensor = tensor.transpose(0, 1) + + tensor = tensor[:, start: end] + + if ch_dim == 1: + tensor = tensor.transpose(0, 1) + + return tensor + + +def random_crop(tensor, size, ch_dim): + """Randomly crops a piece of tensor + + Args: + tensor (Tensor): Tensor of audio of size (NxC) or (CxN) + start (int): Starting point of crop + end (int): Ending point of crop + ch_dim (int): Dimension of channel (not size) + + Returns: + Tensor: a piece of the tensor + """ + orig_size = tensor.size(1 - ch_dim) + start = torch.randint(0, orig_size - size, (1,)) + end = start + size + return crop_in_between(tensor, start.item(), end.item(), ch_dim) + + +def strech(tensor, factor, interpolate, ch_dim): + """Strech a tensor on the time dimention (not the channel one) with + the given factor. + + Args: + tensor (Tensor): Tensor of audio of size (n x c) or (c x n) + factor (Tensor, float): Streching factor of the tensor + interpolate (str): mode of interpolation for the generated audio + points (linear or nearest) + ch_dim (int): Dimension of channel (not size) + + Returns: + Tensor : the streched tensor + """ + type_orig = tensor.type() + if ch_dim == 1: + tensor = tensor.transpose(0, 1) + + # Generate list of factor indexes + output_size = (tensor.size(1) * factor).float() + ref = torch.arange(output_size.item()) / factor + + # Select interpolation type + if interpolate.lower() == 'linear': + ref1 = ref.int().float() + ref2 = torch.clamp_max(ref1 + 1, tensor.size(1) - 1) + r = (ref - ref1).type(type_orig) # Ratio of sound[ref] to use + streched_sound = (tensor[:, ref1.long()] * (1 - r) + + tensor[:, ref2.long()] * r) + elif interpolate.lower() == 'nearest': + ref = ref.int() # Nearest index + streched_sound = tensor[ref.long()] + else: + raise Exception('Invalid interpolation mode {}'.format( + interpolate)) + + if ch_dim == 1: + streched_sound = streched_sound.transpose(0, 1) + + return streched_sound + + +def random_strech(tensor, max_factor, interpolate, ch_dim): + """Strech a tensor on the time dimention (not the channel one) with + a random factor. + + Args: + tensor (Tensor): Tensor of audio of size (n x c) or (c x n) + max_factor (float): Max streching factor of the tensor + interpolate (str): Mode of interpolation for the generated audio + points (linear or nearest) + ch_dim (int): Dimension of channel (not size) + + Returns: + Tensor : the streched tensor + """ + factor = max_factor ** (torch.rand(1) * 2 - 1) + return strech(tensor, factor, interpolate, ch_dim) + + +def opposite(tensor): + """Returns the opposite value of the tensor + """ + return -tensor + + +def random_opposite(tensor, probability): + """Ramdomly return the opposite values of the tensor + """ + do_it = (torch.rand(1) >= probability) + if do_it: + tensor = opposite(tensor) + + return tensor diff --git a/torchaudio/transforms.py b/torchaudio/transforms.py index 861b2712f7..3d9a20dfd2 100644 --- a/torchaudio/transforms.py +++ b/torchaudio/transforms.py @@ -444,3 +444,102 @@ def __call__(self, x_mu): def __repr__(self): return self.__class__.__name__ + '()' + + +class Pad(object): + """Pad the given tensor on all sides with specified padding fill value + + Args: + padding (int or tuple): Padding on each border. If a single int is + provided this is used to pad all borders. If tuple of length 2 + is provided this is the padding on left/right. + fill: fill value. + channels_first (bool): Channel is first and time second. Default: `True` + """ + def __init__(self, padding, fill=0, channel_first=True): + self.padding = padding + self.fill = fill + self.ch_dim = int(not channel_first) + + def __call__(self, tensor): + """ + Args: + tensor (Tensor): Audio of size (Samples x Channels) or (C x S) + + Returns: + tensor (Tensor): A tensor padded right and/or left with fill value + """ + if self.ch_dim == 1: + tensor = tensor.transpose(0, 1) + + tensor = torch.nn.ConstantPad1d(self.padding, self.fill)(tensor) + + if self.ch_dim == 1: + tensor = tensor.transpose(0, 1) + + return tensor + + +class RandomCrop(object): + """Randomly crops a piece of tensor + + Args: + size (int): size of the crop to retrieve + channels_first (bool): Channel is first and time second. Default: `True` + """ + def __init__(self, size, channel_first=True): + self.size = size + self.ch_dim = int(not channel_first) + + def __call__(self, tensor): + """ + Args: + tensor (Tensor): Audio of size (SxC) or (CxS) + + Returns: + Tensor: A tensor randomly steched by a factor on the sample axis. + """ + return F.random_crop(tensor, self.size, self.ch_dim) + + +class RandomStrech(object): + """Randomly strech or shrink audio + + Args: + max_factor (float): Streching factor of the audio + interpolate (str): mode of interpolation for the generated audio + points (linear or nearest) + channels_first (bool): Channel is first and time second. Default: `True` + """ + def __init__(self, max_factor=1.3, interpolate='Linear', channel_first=True): + self.max_factor = max_factor + self.interpolate = interpolate + self.ch_dim = int(not channel_first) + + def __call__(self, tensor): + """ + Args: + tensor (Tensor): Audio of size (Samples x Channels) or (C x S) + + Returns: + Tensor: A tensor randomly steched by a factor on the sample axis. + """ + return F.random_strech(tensor, + self.max_factor, + self.interpolate, + self.ch_dim) + + +class RandomOpposite(object): + """Randomly retrive the opposite values of $tensor$ + + Args: + tensor (Tensor): signal tensor with shape (size, channels) + probability (float): Probability for a flip to happen. + + """ + def __init__(self, probability=0.5): + self.probability = probability + + def __call__(self, tensor): + return F.random_opposite(tensor, self.probability) From af2695f65f6f897afe3635f9e5e217565844518d Mon Sep 17 00:00:00 2001 From: Marc Date: Sun, 9 Jun 2019 12:47:54 +0200 Subject: [PATCH 2/5] add dimension with transforms (calls unsqueeze) --- torchaudio/transforms.py | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/torchaudio/transforms.py b/torchaudio/transforms.py index 3d9a20dfd2..f1fc7fed9d 100644 --- a/torchaudio/transforms.py +++ b/torchaudio/transforms.py @@ -543,3 +543,37 @@ def __init__(self, probability=0.5): def __call__(self, tensor): return F.random_opposite(tensor, self.probability) + + +class AddChannelDimension(object): + """Add a channel dimension if missing. This result in a two dimensional + Tensor + + Args: + tensor (Tensor): signal tensor with shape (size, channels) + channels_first (bool): Channel is first and time second. Default: `True` + """ + def __init__(self, channel_first): + self.ch_dim = int(not channel_first) + + def __call__(self, tensor): + if len(tensor.shape) == 1: + tensor = tensor.unsqueeze(self.ch_dim) + return tensor + + +class AddDimension(object): + """Add a dimension to a Tensor to fit desired model. + eg: add dimension to fit 2D and 3D convolutions. + Tensor + + Args: + tensor (Tensor): signal tensor with shape (size, channels) + dimension (int): The dimesion to create + """ + def __init__(self, dimension): + self.dim = int(dimension) + + def __call__(self, tensor): + tensor = tensor.unsqueeze(self.dim) + return tensor From bb37eca55d0d58c5ab14f31f13bc2af1a8f83f32 Mon Sep 17 00:00:00 2001 From: Marc Date: Sun, 9 Jun 2019 16:42:29 +0200 Subject: [PATCH 3/5] misspelled stretch --- test/test_transforms.py | 10 +++++----- torchaudio/functional.py | 26 +++++++++++++------------- torchaudio/transforms.py | 24 ++++++++++++++++-------- 3 files changed, 34 insertions(+), 26 deletions(-) diff --git a/test/test_transforms.py b/test/test_transforms.py index e16a426dd2..f145076b38 100644 --- a/test/test_transforms.py +++ b/test/test_transforms.py @@ -312,14 +312,14 @@ def test_random_opposite(self): audio_flipped = transforms.RandomOpposite(probability=1)(audio_orig) self.assertTrue(torch.allclose(audio_flipped, audio_orig, atol=5e-3)) - def test_random_strech(self): + def test_random_stretch(self): audio_orig = self.sig.clone().transpose(0, 1) - audio_streched = transforms.RandomStrech(max_factor=1)(audio_orig) - self.assertTrue(torch.allclose(audio_streched, audio_orig, atol=5e-3)) + audio_stretched = transforms.RandomStrech(max_factor=1)(audio_orig) + self.assertTrue(torch.allclose(audio_stretched, audio_orig, atol=5e-3)) - audio_streched = transforms.RandomStrech(max_factor=2)(audio_orig) - self.assertNotEqual(audio_streched.size(1), audio_orig.size(1)) + audio_stretched = transforms.RandomStrech(max_factor=2)(audio_orig) + self.assertNotEqual(audio_stretched.size(1), audio_orig.size(1)) # False if random resturns one... Unlikely def test_random_crop(self): diff --git a/torchaudio/functional.py b/torchaudio/functional.py index 48e7a355cb..c77a3820de 100644 --- a/torchaudio/functional.py +++ b/torchaudio/functional.py @@ -18,8 +18,8 @@ 'mu_law_expanding', 'crop_in_between', 'random_crop', - 'strech', - 'random_strech', + 'stretch', + 'random_stretch', 'opposite', 'random_opposite' ] @@ -399,7 +399,7 @@ def random_crop(tensor, size, ch_dim): return crop_in_between(tensor, start.item(), end.item(), ch_dim) -def strech(tensor, factor, interpolate, ch_dim): +def stretch(tensor, factor, interpolate, ch_dim): """Strech a tensor on the time dimention (not the channel one) with the given factor. @@ -411,7 +411,7 @@ def strech(tensor, factor, interpolate, ch_dim): ch_dim (int): Dimension of channel (not size) Returns: - Tensor : the streched tensor + Tensor : the stretched tensor """ type_orig = tensor.type() if ch_dim == 1: @@ -426,37 +426,37 @@ def strech(tensor, factor, interpolate, ch_dim): ref1 = ref.int().float() ref2 = torch.clamp_max(ref1 + 1, tensor.size(1) - 1) r = (ref - ref1).type(type_orig) # Ratio of sound[ref] to use - streched_sound = (tensor[:, ref1.long()] * (1 - r) + - tensor[:, ref2.long()] * r) + stretched_sound = (tensor[:, ref1.long()] * (1 - r) + + tensor[:, ref2.long()] * r) elif interpolate.lower() == 'nearest': ref = ref.int() # Nearest index - streched_sound = tensor[ref.long()] + stretched_sound = tensor[ref.long()] else: raise Exception('Invalid interpolation mode {}'.format( interpolate)) if ch_dim == 1: - streched_sound = streched_sound.transpose(0, 1) + stretched_sound = stretched_sound.transpose(0, 1) - return streched_sound + return stretched_sound -def random_strech(tensor, max_factor, interpolate, ch_dim): +def random_stretch(tensor, max_factor, interpolate, ch_dim): """Strech a tensor on the time dimention (not the channel one) with a random factor. Args: tensor (Tensor): Tensor of audio of size (n x c) or (c x n) - max_factor (float): Max streching factor of the tensor + max_factor (float): Max stretching factor of the tensor interpolate (str): Mode of interpolation for the generated audio points (linear or nearest) ch_dim (int): Dimension of channel (not size) Returns: - Tensor : the streched tensor + Tensor : the stretched tensor """ factor = max_factor ** (torch.rand(1) * 2 - 1) - return strech(tensor, factor, interpolate, ch_dim) + return stretch(tensor, factor, interpolate, ch_dim) def opposite(tensor): diff --git a/torchaudio/transforms.py b/torchaudio/transforms.py index f1fc7fed9d..36fd6129b9 100644 --- a/torchaudio/transforms.py +++ b/torchaudio/transforms.py @@ -5,6 +5,14 @@ from . import functional as F +def _check_audio(tensor): + if not isinstance(tensor, nn.Tensor): + raise TypeError('tensor should be a torch tensor') + if len(tensor.size()) > 2: + raise TypeError(('tensor representing audio should be at most ', + '2Dimentional')) + + class Compose(object): """Composes several transforms together. @@ -502,11 +510,11 @@ def __call__(self, tensor): return F.random_crop(tensor, self.size, self.ch_dim) -class RandomStrech(object): - """Randomly strech or shrink audio +class RandomStretch(object): + """Randomly stretch or shrink audio Args: - max_factor (float): Streching factor of the audio + max_factor (float): Stretching factor of the audio interpolate (str): mode of interpolation for the generated audio points (linear or nearest) channels_first (bool): Channel is first and time second. Default: `True` @@ -522,12 +530,12 @@ def __call__(self, tensor): tensor (Tensor): Audio of size (Samples x Channels) or (C x S) Returns: - Tensor: A tensor randomly steched by a factor on the sample axis. + Tensor: A tensor randomly stetched by a factor on the sample axis. """ - return F.random_strech(tensor, - self.max_factor, - self.interpolate, - self.ch_dim) + return F.random_stretch(tensor, + self.max_factor, + self.interpolate, + self.ch_dim) class RandomOpposite(object): From 7ee1d389bb07a77e3fcbc4afcfa59fbb96306d04 Mon Sep 17 00:00:00 2001 From: Marc Date: Tue, 11 Jun 2019 22:29:07 +0200 Subject: [PATCH 4/5] try a fix on stretch fail --- torchaudio/functional.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchaudio/functional.py b/torchaudio/functional.py index c77a3820de..222942babf 100644 --- a/torchaudio/functional.py +++ b/torchaudio/functional.py @@ -423,7 +423,7 @@ def stretch(tensor, factor, interpolate, ch_dim): # Select interpolation type if interpolate.lower() == 'linear': - ref1 = ref.int().float() + ref1 = ref.floor().float() ref2 = torch.clamp_max(ref1 + 1, tensor.size(1) - 1) r = (ref - ref1).type(type_orig) # Ratio of sound[ref] to use stretched_sound = (tensor[:, ref1.long()] * (1 - r) + From 5d5289a114966d4c41d122dbd995bd6349d270d4 Mon Sep 17 00:00:00 2001 From: Marc Date: Thu, 28 Nov 2019 03:12:25 +0100 Subject: [PATCH 5/5] int to float --- torchaudio/transforms.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/torchaudio/transforms.py b/torchaudio/transforms.py index 36fd6129b9..b21571f68c 100644 --- a/torchaudio/transforms.py +++ b/torchaudio/transforms.py @@ -585,3 +585,21 @@ def __init__(self, dimension): def __call__(self, tensor): tensor = tensor.unsqueeze(self.dim) return tensor + + +class ToTensor(object): + """Convert a ``numpy.ndarray`` to tensor. + """ + + def __call__(self, array): + """ + Args: + array: a numpy array or array to be converted + + Returns: + Tensor: Converted sound. + """ + return torch.tensor(array) + + def __repr__(self): + return self.__class__.__name__ + '()'