pytorch · mthrok · Jul 16, 2020 · Jul 14, 2020 · Jul 14, 2020 · Jul 14, 2020
diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -13,6 +13,7 @@ The :mod:`torchaudio` package consists of I/O, popular datasets and common audio
    kaldi_io
    transforms
    functional
+   utils
 
 .. automodule:: torchaudio
    :members:
diff --git a/docs/source/sox_effects.rst b/docs/source/sox_effects.rst
@@ -4,10 +4,16 @@
 torchaudio.sox_effects
 ======================
 
-Create SoX effects chain for preprocessing audio.
-
 .. currentmodule:: torchaudio.sox_effects
 
+Apply a SoX effects chain to a torch.Tensor, or to a file and load the result as a torch.Tensor.
+
+.. autofunction:: apply_effects_tensor
+
+.. autofunction:: apply_effects_file
+
+Create SoX effects chain for preprocessing audio.
+
 :hidden:`SoxEffect`
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 

diff --git a/docs/source/utils.rst b/docs/source/utils.rst
@@ -0,0 +1,21 @@
+.. role:: hidden
+    :class: hidden-section
+
+torchaudio.utils.sox_utils
+==========================
+
+Utility module to configure libsox. This affects functionality in the ``sox_io`` backend and in ``torchaudio.sox_effects``.
+
+.. currentmodule:: torchaudio.utils.sox_utils
+
+.. autofunction:: set_seed
+
+.. autofunction:: set_verbosity
+
+.. autofunction:: set_buffer_size
+
+.. autofunction:: set_use_threads
+
+.. autofunction:: list_effects
+
+.. autofunction:: list_formats
diff --git a/test/assets/sox_effect_test_args.json b/test/assets/sox_effect_test_args.json
@@ -0,0 +1,88 @@
+{"effects": [["allpass", "300", "10"]]}
+{"effects": [["band", "300", "10"]]}
+{"effects": [["bandpass", "300", "10"]]}
+{"effects": [["bandreject", "300", "10"]]}
+{"effects": [["bass", "-10"]]}
+{"effects": [["bend", ".35,180,.25", ".15,740,.53", "0,-520,.3"]]}
+{"effects": [["biquad", "0.4", "0.2", "0.9", "0.7", "0.2", "0.6"]]}
+{"effects": [["chorus", "0.7", "0.9", "55", "0.4", "0.25", "2", "-t"]]}
+{"effects": [["chorus", "0.6", "0.9", "50", "0.4", "0.25", "2", "-t", "60", "0.32", "0.4", "1.3", "-s"]]}
+{"effects": [["chorus", "0.5", "0.9", "50", "0.4", "0.25", "2", "-t", "60", "0.32", "0.4", "2.3", "-t", "40", "0.3", "0.3", "1.3", "-s"]]}
+{"effects": [["channels", "1"]]}
+{"effects": [["channels", "2"]]}
+{"effects": [["channels", "3"]]}
+{"effects": [["compand", "0.3,1", "6:-70,-60,-20", "-5", "-90", "0.2"]]}
+{"effects": [["compand", ".1,.2", "-inf,-50.1,-inf,-50,-50", "0", "-90", ".1"]]}
+{"effects": [["compand", ".1,.1", "-45.1,-45,-inf,0,-inf", "45", "-90", ".1"]]}
+{"effects": [["contrast", "0"]]}
+{"effects": [["contrast", "25"]]}
+{"effects": [["contrast", "50"]]}
+{"effects": [["contrast", "75"]]}
+{"effects": [["contrast", "100"]]}
+{"effects": [["dcshift", "1.0"]]}
+{"effects": [["dcshift", "-1.0"]]}
+{"effects": [["deemph"]], "input_sample_rate": 44100}
+{"effects": [["delay", "1.5", "+1"]]}
+{"effects": [["dither", "-s"]]}
+{"effects": [["dither", "-S"]]}
+{"effects": [["divide"]]}
+{"effects": [["downsample", "2"]], "input_sample_rate": 8000, "output_sample_rate": 4000}
+{"effects": [["earwax"]], "input_sample_rate": 44100}
+{"effects": [["echo", "0.8", "0.88", "60", "0.4"]]}
+{"effects": [["echo", "0.8", "0.88", "6", "0.4"]]}
+{"effects": [["echo", "0.8", "0.9", "1000", "0.3"]]}
+{"effects": [["echo", "0.8", "0.9", "1000", "0.3", "1800", "0.25"]]}
+{"effects": [["echos", "0.8", "0.7", "700", "0.25", "700", "0.3"]]}
+{"effects": [["echos", "0.8", "0.7", "700", "0.25", "900", "0.3"]]}
+{"effects": [["echos", "0.8", "0.7", "40", "0.25", "63", "0.3"]]}
+{"effects": [["equalizer", "300", "10", "5"]]}
+{"effects": [["fade", "q", "3"]]}
+{"effects": [["fade", "h", "3"]]}
+{"effects": [["fade", "t", "3"]]}
+{"effects": [["fade", "l", "3"]]}
+{"effects": [["fade", "p", "3"]]}
+{"effects": [["fir", "0.0195", "-0.082", "0.234", "0.891", "-0.145", "0.043"]]}
+{"effects": [["fir", "test/assets/sox_effect_test_fir_coeffs.txt"]]}
+{"effects": [["flanger"]]}
+{"effects": [["gain", "-n"]]}
+{"effects": [["gain", "-n", "-3"]]}
+{"effects": [["gain", "-l", "-6"]]}
+{"effects": [["highpass", "-1", "300"]]}
+{"effects": [["highpass", "-2", "300"]]}
+{"effects": [["hilbert"]]}
+{"effects": [["loudness"]]}
+{"effects": [["lowpass", "-1", "300"]]}
+{"effects": [["lowpass", "-2", "300"]]}
+{"effects": [["mcompand", "0.005,0.1 -47,-40,-34,-34,-17,-33", "100", "0.003,0.05 -47,-40,-34,-34,-17,-33", "400", "0.000625,0.0125 -47,-40,-34,-34,-15,-33", "1600", "0.0001,0.025 -47,-40,-34,-34,-31,-31,-0,-30", "6400", "0,0.025 -38,-31,-28,-28,-0,-25"]], "input_sample_rate": 44100}
+{"effects": [["norm"]]}
+{"effects": [["oops"]]}
+{"effects": [["overdrive"]]}
+{"effects": [["pad"]]}
+{"effects": [["phaser"]]}
+{"effects": [["pitch", "6.48"], ["rate", "8030"]], "output_sample_rate": 8030}
+{"effects": [["pitch", "-6.50"], ["rate", "7970"]], "output_sample_rate": 7970}
+{"effects": [["rate", "4567"]], "output_sample_rate": 4567}
+{"effects": [["remix", "6", "7", "8", "0"]], "num_channels": 8}
+{"effects": [["remix", "1-3,7", "3"]], "num_channels": 8}
+{"effects": [["repeat"]]}
+{"effects": [["reverb"]]}
+{"effects": [["reverse"]]}
+{"effects": [["riaa"]], "input_sample_rate": 44100}
+{"effects": [["silence", "0"]]}
+{"effects": [["sinc", "3k"]]}
+{"effects": [["speed", "1.3"]], "input_sample_rate": 4000, "output_sample_rate": 5200}
+{"effects": [["speed", "0.7"]], "input_sample_rate": 4000, "output_sample_rate": 2800}
+{"effects": [["stat"]]}
+{"effects": [["stats"]]}
+{"effects": [["stretch"]]}
+{"effects": [["swap"]]}
+{"effects": [["synth"]]}
+{"effects": [["tempo", "0.9"]]}
+{"effects": [["tempo", "1.1"]]}
+{"effects": [["treble", "3"]]}
+{"effects": [["tremolo", "300", "40"]]}
+{"effects": [["tremolo", "300", "50"]]}
+{"effects": [["trim", "0", "0.1"]]}
+{"effects": [["upsample", "2"]], "input_sample_rate": 8000, "output_sample_rate": 16000}
+{"effects": [["vad"]]}
+{"effects": [["vol", "3"]]}
diff --git a/test/assets/sox_effect_test_fir_coeffs.txt b/test/assets/sox_effect_test_fir_coeffs.txt
@@ -0,0 +1 @@
+0.0195 -0.082 0.234 0.891 -0.145 0.043
diff --git a/test/common_utils/data_utils.py b/test/common_utils/data_utils.py
@@ -72,6 +72,7 @@ def get_sinusoid(
     n_channels: int = 1,
     dtype: Union[str, torch.dtype] = "float32",
     device: Union[str, torch.device] = "cpu",
+    channels_first: bool = True,
 ):
     """Generate pseudo audio data with sine wave.
 
@@ -91,4 +92,7 @@ def get_sinusoid(
     pie2 = 2 * 3.141592653589793
     end = pie2 * frequency * duration
     theta = torch.linspace(0, end, sample_rate * duration, dtype=dtype, device=device)
-    return torch.sin(theta, out=None).repeat([n_channels, 1])
+    sin = torch.sin(theta, out=None).repeat([n_channels, 1])
+    if not channels_first:
+        sin = sin.t()
+    return sin
diff --git a/test/common_utils/sox_utils.py b/test/common_utils/sox_utils.py
@@ -77,3 +77,24 @@ def convert_audio_file(
     command += [dst_path]
     print(' '.join(command))
     subprocess.run(command, check=True)
+
+
+def _flattern(effects):  # Flatten a list of effect lists into one flat list; the name is "flatten" misspelled
+    if not effects:  # empty or None input is returned unchanged
+        return effects
+    if isinstance(effects[0], str):  # first item is a str: input is treated as already flat
+        return effects
+    return [item for sublist in effects for item in sublist]  # concatenate the sublists in order
+
+
+def run_sox_effect(input_file, output_file, effect, *, output_sample_rate=None, output_bitdepth=None):
+    """Run the ``sox`` command on ``input_file`` with ``effect`` and write the result to ``output_file``."""
+    effect = _flattern(effect)  # accepts a flat list of str or a list of lists of str
+    command = ['sox', '-V', '--no-dither', input_file]
+    if output_bitdepth:  # '--bits' is placed before the output file name
+        command += ['--bits', str(output_bitdepth)]
+    command += [output_file] + effect
+    if output_sample_rate:  # a trailing 'rate' effect is appended after the given effects
+        command += ['rate', str(output_sample_rate)]
+    print(' '.join(command))  # echo the command line that is about to run
+    subprocess.run(command, check=True)  # check=True raises CalledProcessError on a non-zero exit
diff --git a/test/common_utils/test_case_utils.py b/test/common_utils/test_case_utils.py
@@ -14,33 +14,28 @@
 class TempDirMixin:
     """Mixin to provide easy access to temp dir"""
     temp_dir_ = None
-    base_temp_dir = None
-    temp_dir = None
 
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
+    @property
+    def base_temp_dir(self):
         # If TORCHAUDIO_TEST_TEMP_DIR is set, use it instead of temporary directory.
         # this is handy for debugging.
         key = 'TORCHAUDIO_TEST_TEMP_DIR'
         if key in os.environ:
-            cls.base_temp_dir = os.environ[key]
-        else:
-            cls.temp_dir_ = tempfile.TemporaryDirectory()
-            cls.base_temp_dir = cls.temp_dir_.name
+            return os.environ[key]
+        if self.__class__.temp_dir_ is None:
+            self.__class__.temp_dir_ = tempfile.TemporaryDirectory()
+        return self.__class__.temp_dir_.name
 
     @classmethod
     def tearDownClass(cls):
         super().tearDownClass()
-        if isinstance(cls.temp_dir_, tempfile.TemporaryDirectory):
+        if cls.temp_dir_ is not None:
             cls.temp_dir_.cleanup()
-
-    def setUp(self):
-        super().setUp()
-        self.temp_dir = os.path.join(self.base_temp_dir, self.id())
+            cls.temp_dir_ = None
 
     def get_temp_path(self, *paths):
-        path = os.path.join(self.temp_dir, *paths)
+        temp_dir = os.path.join(self.base_temp_dir, self.id())
+        path = os.path.join(temp_dir, *paths)
         os.makedirs(os.path.dirname(path), exist_ok=True)
         return path
 

diff --git a/test/sox_effect/__init__.py b/test/sox_effect/__init__.py
diff --git a/test/sox_effect/common.py b/test/sox_effect/common.py
@@ -0,0 +1,6 @@
+def name_func(func, _, params):  # Build a test name; presumably a `parameterized` name_func callback - confirm
+    if isinstance(params.args[0], str):  # first positional arg is a str: join all positional args
+        args = "_".join([str(arg) for arg in params.args])
+    else:  # otherwise join the items of the first positional arg
+        args = "_".join([str(arg) for arg in params.args[0]])
+    return f'{func.__name__}_{args}'
diff --git a/test/sox_effect/test_dataset.py b/test/sox_effect/test_dataset.py
@@ -0,0 +1,115 @@
+from typing import List, Tuple
+
+import numpy as np
+import torch
+import torchaudio
+
+from ..common_utils import (
+    TempDirMixin,
+    PytorchTestCase,
+    skipIfNoExtension,
+    get_whitenoise,
+    load_wav,
+    save_wav,
+)
+
+
+class RandomPerturbationFile(torch.utils.data.Dataset):
+    """Dataset that loads each file in ``flist`` with a random speed perturbation applied."""
+    def __init__(self, flist: List[str], sample_rate: int):
+        super().__init__()
+        self.flist = flist  # file paths handed to apply_effects_file, indexed by __getitem__
+        self.sample_rate = sample_rate  # argument of the 'rate' effect
+        self.rng = None  # must be assigned before __getitem__ is called (see init_random_seed)
+
+    def __getitem__(self, index):
+        speed = self.rng.uniform(0.5, 2.0)  # raises AttributeError while rng is still None
+        effects = [
+            ['gain', '-n', '-10'],
+            ['speed', f'{speed:.5f}'],  # duration of data is 0.5 ~ 2.0 seconds.
+            ['rate', f'{self.sample_rate}'],
+            ['pad', '0', '1.5'],  # add 1.5 seconds silence at the end
+            ['trim', '0', '2'],  # get the first 2 seconds
+        ]
+        data, _ = torchaudio.sox_effects.apply_effects_file(self.flist[index], effects)
+        return data  # the second element of the returned pair is discarded
+
+    def __len__(self):
+        return len(self.flist)
+
+
+class RandomPerturbationTensor(torch.utils.data.Dataset):
+    """Apply speed perturbation to (synthetic) Tensor data"""
+    def __init__(self, signals: List[Tuple[torch.Tensor, int]], sample_rate: int):
+        super().__init__()
+        self.signals = signals  # (tensor, sample_rate) pairs, indexed by __getitem__
+        self.sample_rate = sample_rate  # argument of the 'rate' effect
+        self.rng = None  # must be assigned before __getitem__ is called (see init_random_seed)
+
+    def __getitem__(self, index):
+        speed = self.rng.uniform(0.5, 2.0)  # raises AttributeError while rng is still None
+        effects = [
+            ['gain', '-n', '-10'],
+            ['speed', f'{speed:.5f}'],  # duration of data is 0.5 ~ 2.0 seconds.
+            ['rate', f'{self.sample_rate}'],
+            ['pad', '0', '1.5'],  # add 1.5 seconds silence at the end
+            ['trim', '0', '2'],  # get the first 2 seconds
+        ]
+        tensor, sample_rate = self.signals[index]  # per-item input rate, distinct from self.sample_rate
+        data, _ = torchaudio.sox_effects.apply_effects_tensor(tensor, sample_rate, effects)
+        return data  # the second element of the returned pair is discarded
+
+    def __len__(self):
+        return len(self.signals)
+
+
+def init_random_seed(worker_id):  # passed as DataLoader worker_init_fn; sets the dataset's `rng`
+    dataset = torch.utils.data.get_worker_info().dataset  # dataset object as seen by this worker
+    dataset.rng = np.random.RandomState(worker_id)  # seeded with the worker id
+
+
+@skipIfNoExtension
+class TestSoxEffectsDataset(TempDirMixin, PytorchTestCase):
+    """Test `apply_effects_file` and `apply_effects_tensor` in multi-process dataloader setting"""
+
+    def _generate_dataset(self, num_samples=128):  # write num_samples white-noise wav files, return their paths
+        flist = []
+        for i in range(num_samples):
+            sample_rate = np.random.choice([8000, 16000, 44100])  # NOTE(review): RNG is not seeded in this method
+            dtype = np.random.choice(['float32', 'int32', 'int16', 'uint8'])
+            data = get_whitenoise(n_channels=2, sample_rate=sample_rate, duration=1, dtype=dtype)
+            path = self.get_temp_path(f'{i:03d}_{dtype}_{sample_rate}.wav')  # index, dtype and rate go into the name
+            save_wav(path, data, sample_rate)
+            flist.append(path)
+        return flist
+
+    def test_apply_effects_file(self):  # every batch loaded from files must have the expected shape
+        sample_rate = 12000  # target rate given to the dataset's 'rate' effect
+        flist = self._generate_dataset()
+        dataset = RandomPerturbationFile(flist, sample_rate)
+        loader = torch.utils.data.DataLoader(
+            dataset, batch_size=32, num_workers=16,
+            worker_init_fn=init_random_seed,  # sets dataset.rng in each worker
+        )
+        for batch in loader:  # 128 items / 32 per batch, so every batch is full
+            assert batch.shape == (32, 2, 2 * sample_rate)  # expected: 2 channels, 2 s at sample_rate
+
+    def _generate_signals(self, num_samples=128):  # build num_samples (white-noise tensor, sample_rate) pairs
+        signals = []
+        for _ in range(num_samples):
+            sample_rate = np.random.choice([8000, 16000, 44100])  # NOTE(review): RNG is not seeded in this method
+            data = get_whitenoise(
+                n_channels=2, sample_rate=sample_rate, duration=1, dtype='float32')
+            signals.append((data, sample_rate))
+        return signals
+
+    def test_apply_effects_tensor(self):  # every batch built from in-memory tensors must have the expected shape
+        sample_rate = 12000  # target rate given to the dataset's 'rate' effect
+        signals = self._generate_signals()
+        dataset = RandomPerturbationTensor(signals, sample_rate)
+        loader = torch.utils.data.DataLoader(
+            dataset, batch_size=32, num_workers=16,
+            worker_init_fn=init_random_seed,  # sets dataset.rng in each worker
+        )
+        for batch in loader:  # 128 items / 32 per batch, so every batch is full
+            assert batch.shape == (32, 2, 2 * sample_rate)  # expected: 2 channels, 2 s at sample_rate