From a6703e091e00241a0bb2c84bc5aed010c98270a9 Mon Sep 17 00:00:00 2001 From: Chuan Zhang Date: Wed, 18 Sep 2019 12:25:06 -0700 Subject: [PATCH 01/17] adding test --- test/test_functional_lfilter_perf.py | 131 +++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 test/test_functional_lfilter_perf.py diff --git a/test/test_functional_lfilter_perf.py b/test/test_functional_lfilter_perf.py new file mode 100644 index 0000000000..0030541277 --- /dev/null +++ b/test/test_functional_lfilter_perf.py @@ -0,0 +1,131 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +import math +import os +import torch +import torchaudio +import unittest +import common_utils + +from torchaudio.functional import lfilter +from _torch_filtering import ( + _lfilter_tensor_matrix, + _lfilter_tensor, + _lfilter_element_wise, +) + + +class TestFunctionalLFilterPerformance(unittest.TestCase): + test_dirpath, test_dir = common_utils.create_temp_assets_dir() + + @staticmethod + def run_test(n_channels, n_frames, n_order_filter): + waveform = torch.rand(n_channels, n_frames, device="cpu") + b_coeffs = torch.rand(n_order_filter, dtype=torch.float32, device="cpu") + a_coeffs = torch.rand(n_order_filter, dtype=torch.float32, device="cpu") + + if n_order_filter == 8: + # Eighth Order Filter + # >>> import scipy.signal + # >>> wp = 0.3 + # >>> ws = 0.5 + # >>> gpass = 1 + # >>> gstop = 100 + # >>> b, a = scipy.signal.iirdesign(wp, ws, gpass, gstop) + b_coeffs = [ + 0.0006544487997063485, + 0.001669274889397942, + 0.003218714446315984, + 0.004222562499298002, + 0.004222562499298002, + 0.0032187144463159834, + 0.0016692748893979413, + 0.0006544487997063485, + ] + a_coeffs = [ + 1.0, + -4.67403506662255, + 10.516336803850786, + -14.399207825856776, + 12.844181702707655, + -7.43604712843608, + 2.5888616732696077, + -0.4205601576432048, + ] + elif n_order_filter == 5: + # Fifth Order Filter + # >>> import scipy.signal + # >>> wp = 0.3, ws = 0.5, gpass = 1, gstop = 40 + # >>> b, a = scipy.signal.iirdesign(wp, ws, gpass, gstop) + b_coeffs = [ + 0.0353100066384039, + 0.023370652985988206, + 0.0560524973457262, + 0.023370652985988193, + 0.03531000663840389, + ] + a_coeffs = [ + 1.0, + -2.321010052951366, + 2.677193357612127, + -1.5774235418173692, + 0.4158137396065854, + ] + + # Cast into Tensors + a_coeffs = torch.tensor(a_coeffs, device="cpu", dtype=torch.float32) + b_coeffs = torch.tensor(b_coeffs, device="cpu", dtype=torch.float32) + + def time_and_output(func): + import time + + st = time.time() + output = func() + run_time = time.time() - st + return (output, run_time) + + (output_waveform_1, run_time_1) = time_and_output( + lambda: lfilter(waveform, a_coeffs, b_coeffs) + ) + (output_waveform_2, run_time_2) = time_and_output( + lambda: _lfilter_element_wise(waveform, a_coeffs, b_coeffs) + ) + (output_waveform_3, run_time_3) = time_and_output( + lambda: _lfilter_tensor_matrix(waveform, a_coeffs, b_coeffs) + ) + + print("-" * 80) + print("Evaluating Runtime between lfilter implementations") + print("-" * 80) + print( + "Data Size: [%d x %d], Filter Order: %d" + % (waveform.size(0), waveform.size(1), a_coeffs.size(0)) + ) + print("-" * 80) + print("Python Matrix Runtime [current]: %10.6f s" % run_time_1) + print("CPP Element Wise Runtime : %10.6f s" % run_time_2) + print("CPP Matrix Runtime : %10.6f s" % run_time_3) + + # maxDeviation = torch.kthvalue(torch.abs(output_waveform_1- output_waveform_2), output_waveform_1.size(1)) + + assert torch.allclose(output_waveform_1, output_waveform_2, atol=3e-4) + assert torch.allclose(output_waveform_1, output_waveform_3, atol=3e-4) + print("-" * 80) + print("✓ - all outputs are identical") + print("-" * 80) + + def test_lfilter_cmp(self): + """ + Runs comparison on CPU + """ + + torch.random.manual_seed(423) + self.run_test(2, 8000, 5) + self.run_test(2, 80000, 5) + self.run_test(2, 800000, 5) + self.run_test(2, 8000, 8) + self.run_test(2, 80000, 8) + self.run_test(2, 800000, 8) + + +if __name__ == "__main__": + unittest.main() From 16b7cb2a0bf8591869887a71bf06116f0de83999 Mon Sep 17 00:00:00 2001 From: Chuan Zhang Date: Wed, 18 Sep 2019 13:27:10 -0700 Subject: [PATCH 02/17] added cpp lfilter implementation --- setup.py | 8 ++ test/test_functional_lfilter_perf.py | 79 ++++++++++--- torchaudio/filtering.cpp | 159 +++++++++++++++++++++++++++ torchaudio/functional.py | 4 +- 4 files changed, 232 insertions(+), 18 deletions(-) create mode 100644 torchaudio/filtering.cpp diff --git a/setup.py b/setup.py index 9ed91f2468..4822a6e9d0 100644 --- a/setup.py +++ b/setup.py @@ -113,6 +113,14 @@ def check_env_flag(name, default=''): extra_compile_args=eca, extra_objects=extra_objects, extra_link_args=ela), + CppExtension( + '_torch_filtering', + ['torchaudio/filtering.cpp'], + libraries=libraries, + include_dirs=include_dirs, + extra_compile_args=eca, + extra_objects=extra_objects, + extra_link_args=ela), ], cmdclass={'build_ext': BuildExtension}, install_requires=[pytorch_package_dep] diff --git a/test/test_functional_lfilter_perf.py b/test/test_functional_lfilter_perf.py index 0030541277..63c1fbadde 100644 --- a/test/test_functional_lfilter_perf.py +++ b/test/test_functional_lfilter_perf.py @@ -7,18 +7,14 @@ import common_utils from torchaudio.functional import lfilter -from _torch_filtering import ( - _lfilter_tensor_matrix, - _lfilter_tensor, - _lfilter_element_wise, -) +from _torch_filtering import _lfilter_tensor_matrix, _lfilter_element_wise class TestFunctionalLFilterPerformance(unittest.TestCase): test_dirpath, test_dir = common_utils.create_temp_assets_dir() @staticmethod - def run_test(n_channels, n_frames, n_order_filter): + def run_test(n_channels, n_frames, n_order_filter, assertClose=True): waveform = torch.rand(n_channels, n_frames, device="cpu") b_coeffs = torch.rand(n_order_filter, dtype=torch.float32, device="cpu") a_coeffs = torch.rand(n_order_filter, dtype=torch.float32, device="cpu") @@ -70,6 +66,50 @@ def run_test(n_channels, n_frames, n_order_filter): -1.5774235418173692, 0.4158137396065854, ] + elif n_order_filter == 18: + # >>> import scipy.signal + # >>> wp = 0.48, ws = 0.5, gpass = 0.2, gstop = 120 + # >>> b, a = scipy.signal.iirdesign(wp, ws, gpass, gstop) + b_coeffs = [ + 0.0006050813536446144, + 0.002920916369302935, + 0.010247568347759453, + 0.02591236698507957, + 0.05390501051935878, + 0.09344581172781004, + 0.13951533321139883, + 0.1808658576803922, + 0.2056643061895918, + 0.2056643061895911, + 0.1808658576803912, + 0.13951533321139847, + 0.09344581172781012, + 0.053905010519358885, + 0.02591236698507962, + 0.010247568347759466, + 0.0029209163693029367, + 0.0006050813536446148, + ] + a_coeffs = [ + 1.0, + -4.3964136877356745, + 14.650181359641305, + -34.45816395187684, + 67.18247518997862, + -108.01956225077998, + 149.4332056661277, + -178.07791467502364, + 185.28267044557634, + -168.13382659655514, + 133.22364764531704, + -91.59439958870928, + 54.15835239046956, + -27.090521914173934, + 11.163677645454127, + -3.627296054625132, + 0.8471764313073272, + -0.11712354962357388, + ] # Cast into Tensors a_coeffs = torch.tensor(a_coeffs, device="cpu", dtype=torch.float32) @@ -93,25 +133,24 @@ def time_and_output(func): lambda: _lfilter_tensor_matrix(waveform, a_coeffs, b_coeffs) ) - print("-" * 80) - print("Evaluating Runtime between lfilter implementations") print("-" * 80) print( - "Data Size: [%d x %d], Filter Order: %d" + "lfilter perf - Data Size: [%d x %d], Filter Order: %d" % (waveform.size(0), waveform.size(1), a_coeffs.size(0)) ) print("-" * 80) print("Python Matrix Runtime [current]: %10.6f s" % run_time_1) print("CPP Element Wise Runtime : %10.6f s" % run_time_2) print("CPP Matrix Runtime : %10.6f s" % run_time_3) - - # maxDeviation = torch.kthvalue(torch.abs(output_waveform_1- output_waveform_2), output_waveform_1.size(1)) - - assert torch.allclose(output_waveform_1, output_waveform_2, atol=3e-4) - assert torch.allclose(output_waveform_1, output_waveform_3, atol=3e-4) - print("-" * 80) - print("✓ - all outputs are identical") print("-" * 80) + print("Ratio Python / CPP ElementWise : %10.2f x" % (run_time_1/run_time_2)) + + if assertClose: + # maxDeviation = torch.kthvalue(torch.abs(output_waveform_3- output_waveform_2), output_waveform_1.size(1)) + assert torch.allclose(output_waveform_1, output_waveform_2, atol=3e-4) + assert torch.allclose(output_waveform_2, output_waveform_3, atol=3e-4) + print("✓ - all outputs are identical") + print("-" * 80) def test_lfilter_cmp(self): """ @@ -126,6 +165,14 @@ def test_lfilter_cmp(self): self.run_test(2, 80000, 8) self.run_test(2, 800000, 8) + # For higher order filters, due to floating point precision + # matrix method and element method can get different results depending on order of operations + # Also, for longer signals and higher filters, easier to create unstable filter + # https://dsp.stackexchange.com/questions/54386/relation-between-order-and-stability-in-iir-filter + self.run_test(2, 8000, 18, False) + self.run_test(2, 80000, 18, False) + self.run_test(2, 800000, 18, False) + if __name__ == "__main__": unittest.main() diff --git a/torchaudio/filtering.cpp b/torchaudio/filtering.cpp new file mode 100644 index 0000000000..aa60a8a818 --- /dev/null +++ b/torchaudio/filtering.cpp @@ -0,0 +1,159 @@ +#include + +// TBD - Compile on CUDA +// TBD - Expand to other data types outside float32? + +namespace torch { +namespace audio { + + +at::Tensor lfilter_tensor_matrix( + at::Tensor const & waveform, + at::Tensor const & a_coeffs, + at::Tensor const & b_coeffs + ) { + + // Assumptions - float32, waveform between -1 and 1 + assert(waveform.dtype() == torch::kFloat32); + + // CPU only for now - Implement for GPU in future + assert(waveform.device().type() == torch::kCPU); + + // Numerator and denominator coefficients shoudlb e same size + assert(a_coeffs.size(0) == b_coeffs.size(0)); + int n_order = a_coeffs.size(0); // n'th order - 1 filter + assert(n_order > 0); + + int64_t n_channels = waveform.size(0); + int64_t n_frames = waveform.size(1); + + // Device options should mirror input waveform + auto options = torch::TensorOptions().dtype(waveform.dtype()).device(waveform.device().type()); + + // Allocate padded input and output waveform, copy in the input waveform + at::Tensor padded_waveform = torch::zeros({n_channels, n_frames + n_order - 1}, options); + at::Tensor padded_output_waveform = torch::zeros({n_channels, n_frames + n_order - 1}, options); + padded_waveform.slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1) = waveform; + + // Create [n_order, n_channels] structure for a and b coefficients + at::Tensor a_coeffs_filled = at::transpose(at::flip(a_coeffs, {0,}).repeat({n_channels, 1}), 0, 1); + at::Tensor b_coeffs_filled = at::transpose(at::flip(b_coeffs, {0,}).repeat({n_channels, 1}), 0, 1); + + // few more temporary data structure + at::Tensor o0 = torch::zeros({n_channels, 1}, options); + at::Tensor ones = torch::ones({n_channels, n_frames + n_order - 1}, options); + at::Tensor negones = torch::ones({n_channels, n_frames + n_order -1 }, options) * -1; + at::Tensor normalization_a0 = torch::unsqueeze(a_coeffs[0], {0,}).repeat({n_channels, 1}); + + + for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) { + + // calculate the output at time i_frame for all channels + o0 = torch::zeros({n_channels, 1}, torch::TensorOptions().dtype(torch::kFloat32)); + + // time window of input and output, size [n_channels, n_order] + at::Tensor const & input_window = padded_waveform.slice(0, 0, n_channels).slice(1, i_frame, i_frame + n_order); + at::Tensor const & output_window = padded_output_waveform.slice(0, 0, n_channels).slice(1, i_frame, i_frame+ n_order); + + // matrix multiply to get [n_channels x n_channels], extract diagonal and unsqueeze to get [n_channels, 1] result for a frame + at::Tensor inp_result = torch::unsqueeze(torch::diag(torch::mm(input_window, b_coeffs_filled)), 1); + at::Tensor out_result = torch::unsqueeze(torch::diag(torch::mm(output_window, a_coeffs_filled)), 1); + + o0.add_(inp_result); + o0.sub_(out_result); + + // normalize by a0 + o0.div_(normalization_a0); + + // Set the output + padded_output_waveform.slice(0, 0, n_channels).slice(1, i_frame + n_order - 1, i_frame + n_order - 1 + 1) = o0; + + } + // return clipped, and without initial conditions + return torch::min(ones, torch::max(negones, padded_output_waveform)).slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1); + } + + at::Tensor lfilter_element_wise( + at::Tensor const & waveform, + at::Tensor const & a_coeffs, + at::Tensor const & b_coeffs + ) { + + + // Assumptions - float32, waveform between -1 and 1 + assert(waveform.dtype() == torch::kFloat32); + + // CPU only for now - Implement for GPU in future + assert(waveform.device().type() == torch::kCPU); + + // Numerator and denominator coefficients shoudlb e same size + assert(a_coeffs.size(0) == b_coeffs.size(0)); + int n_order = a_coeffs.size(0); // n'th order - 1 filter + assert(n_order > 0); + + int64_t n_channels = waveform.size(0); + int64_t n_frames = waveform.size(1); + + // Device options should mirror input waveform + auto options = torch::TensorOptions().dtype(waveform.dtype()).device(waveform.device().type()); + + // Allocate padded input and output waveform, copy in the input waveform + at::Tensor padded_waveform = torch::zeros({n_channels, n_frames + n_order - 1}, options); + at::Tensor padded_output_waveform = torch::zeros({n_channels, n_frames + n_order - 1}, options); + padded_waveform.slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1) = waveform; + + // few more temporary data structure + at::Tensor o0 = torch::zeros({n_channels, 1}, options); + at::Tensor ones = torch::ones({n_channels, n_frames + n_order - 1}, options); + at::Tensor negones = torch::ones({n_channels, n_frames + n_order -1 }, options) * -1; + at::Tensor normalization_a0 = torch::unsqueeze(a_coeffs[0], {0,}).repeat({n_channels, 1}); // Device options should mirror input waveform + + // TBD - Implement for CUDA + //auto input_accessor = waveform.packed_accessor64(); + //auto output_accessor = output_waveform.packed_accessor64(); + //auto a_coeffs_accessor = a_coeffs.packed_accessor64(); + //auto b_coeffs_accessor = b_coeffs.packed_accessor64(); + + // initialize the output tensor + + auto input_accessor = padded_waveform.accessor(); + auto output_accessor = padded_output_waveform.accessor(); + auto a_coeffs_accessor = a_coeffs.accessor(); + auto b_coeffs_accessor = b_coeffs.accessor(); + + for (int64_t i_channel = 0; i_channel < n_channels; ++i_channel) { + + for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) { + + // calculate the output at time i_frame by iterating through + // inputs / outputs at previous time steps and multiplying by coeffs + float o0 = 0; + for (int i_offset = 0; i_offset < n_order; ++i_offset) { + o0 += input_accessor[i_channel][i_frame + i_offset] * b_coeffs_accessor[n_order - i_offset - 1]; + o0 -= output_accessor[i_channel][i_frame + i_offset] * a_coeffs_accessor[n_order - i_offset - 1]; + } + o0 = o0 / a_coeffs_accessor[0]; + + output_accessor[i_channel][i_frame + n_order - 1] = o0; + + } + } + // return clipped, and without initial conditions + return torch::min(ones, torch::max(negones, padded_output_waveform)).slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1); + } + +}} + + +PYBIND11_MODULE(_torch_filtering, m) { + py::options options; + options.disable_function_signatures(); + m.def( + "_lfilter_tensor_matrix", + &torch::audio::lfilter_tensor_matrix, + "Executes difference equation with tensor"); + m.def( + "_lfilter_element_wise", + &torch::audio::lfilter_element_wise, + "Executes difference equation with tensor"); +} diff --git a/torchaudio/functional.py b/torchaudio/functional.py index 842d8495bf..1577c016e9 100644 --- a/torchaudio/functional.py +++ b/torchaudio/functional.py @@ -511,7 +511,7 @@ def phase_vocoder(complex_specgrams, rate, phase_advance): def lfilter(waveform, a_coeffs, b_coeffs): # type: (Tensor, Tensor, Tensor) -> Tensor r""" - Performs an IIR filter by evaluating difference equation. + Performs an IIR filter by evaluating difference equation. Assumes difference equation is stable. Args: waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)`. Must be normalized to -1 to 1. @@ -572,7 +572,7 @@ def biquad(waveform, b0, b1, b2, a0, a1, a2): https://en.wikipedia.org/wiki/Digital_biquad_filter Args: - waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)` + waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)`. Currently only supports float32. b0 (float): numerator coefficient of current input, x[n] b1 (float): numerator coefficient of input one time step ago x[n-1] b2 (float): numerator coefficient of input two time steps ago x[n-2] From 0115ec24ceea0ae1dc8a63f900d3fb1ce129cb29 Mon Sep 17 00:00:00 2001 From: Chuan Zhang Date: Wed, 18 Sep 2019 13:34:04 -0700 Subject: [PATCH 03/17] slight refactoring of cpp lfilter implementations --- torchaudio/filtering.cpp | 23 +++++++++++------------ torchaudio/functional.py | 2 +- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/torchaudio/filtering.cpp b/torchaudio/filtering.cpp index aa60a8a818..275db9ceaa 100644 --- a/torchaudio/filtering.cpp +++ b/torchaudio/filtering.cpp @@ -45,7 +45,6 @@ at::Tensor lfilter_tensor_matrix( at::Tensor negones = torch::ones({n_channels, n_frames + n_order -1 }, options) * -1; at::Tensor normalization_a0 = torch::unsqueeze(a_coeffs[0], {0,}).repeat({n_channels, 1}); - for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) { // calculate the output at time i_frame for all channels @@ -79,7 +78,6 @@ at::Tensor lfilter_tensor_matrix( at::Tensor const & b_coeffs ) { - // Assumptions - float32, waveform between -1 and 1 assert(waveform.dtype() == torch::kFloat32); @@ -108,25 +106,24 @@ at::Tensor lfilter_tensor_matrix( at::Tensor negones = torch::ones({n_channels, n_frames + n_order -1 }, options) * -1; at::Tensor normalization_a0 = torch::unsqueeze(a_coeffs[0], {0,}).repeat({n_channels, 1}); // Device options should mirror input waveform - // TBD - Implement for CUDA - //auto input_accessor = waveform.packed_accessor64(); - //auto output_accessor = output_waveform.packed_accessor64(); - //auto a_coeffs_accessor = a_coeffs.packed_accessor64(); - //auto b_coeffs_accessor = b_coeffs.packed_accessor64(); - - // initialize the output tensor - + // Create accessors for fast access - https://pytorch.org/cppdocs/notes/tensor_basics.html#efficient-access-to-tensor-elements + // CPU auto input_accessor = padded_waveform.accessor(); auto output_accessor = padded_output_waveform.accessor(); auto a_coeffs_accessor = a_coeffs.accessor(); auto b_coeffs_accessor = b_coeffs.accessor(); + // CUDA - TBD + //auto input_accessor = waveform.packed_accessor64(); + //auto output_accessor = output_waveform.packed_accessor64(); + //auto a_coeffs_accessor = a_coeffs.packed_accessor64(); + //auto b_coeffs_accessor = b_coeffs.packed_accessor64(); + for (int64_t i_channel = 0; i_channel < n_channels; ++i_channel) { for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) { - // calculate the output at time i_frame by iterating through - // inputs / outputs at previous time steps and multiplying by coeffs + // execute the difference equation float o0 = 0; for (int i_offset = 0; i_offset < n_order; ++i_offset) { o0 += input_accessor[i_channel][i_frame + i_offset] * b_coeffs_accessor[n_order - i_offset - 1]; @@ -134,10 +131,12 @@ at::Tensor lfilter_tensor_matrix( } o0 = o0 / a_coeffs_accessor[0]; + // put back into the main data structure output_accessor[i_channel][i_frame + n_order - 1] = o0; } } + // return clipped, and without initial conditions return torch::min(ones, torch::max(negones, padded_output_waveform)).slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1); } diff --git a/torchaudio/functional.py b/torchaudio/functional.py index 1577c016e9..319b5b3904 100644 --- a/torchaudio/functional.py +++ b/torchaudio/functional.py @@ -511,7 +511,7 @@ def phase_vocoder(complex_specgrams, rate, phase_advance): def lfilter(waveform, a_coeffs, b_coeffs): # type: (Tensor, Tensor, Tensor) -> Tensor r""" - Performs an IIR filter by evaluating difference equation. Assumes difference equation is stable. + Performs an IIR filter by evaluating difference equation. Coefficients should be designed to be stable. Args: waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)`. Must be normalized to -1 to 1. From 6b872e96a5aaf2ce787611e39ffee4948ada7d56 Mon Sep 17 00:00:00 2001 From: Chuan Zhang Date: Wed, 18 Sep 2019 13:38:58 -0700 Subject: [PATCH 04/17] Tweaked docs for lowpass_biquad, highpass_biquad, biquad --- test/test_functional_lfilter_perf.py | 2 +- torchaudio/functional.py | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/test/test_functional_lfilter_perf.py b/test/test_functional_lfilter_perf.py index 63c1fbadde..8784d118b3 100644 --- a/test/test_functional_lfilter_perf.py +++ b/test/test_functional_lfilter_perf.py @@ -143,7 +143,7 @@ def time_and_output(func): print("CPP Element Wise Runtime : %10.6f s" % run_time_2) print("CPP Matrix Runtime : %10.6f s" % run_time_3) print("-" * 80) - print("Ratio Python / CPP ElementWise : %10.2f x" % (run_time_1/run_time_2)) + print("Ratio Python / CPP ElementWise : %10.2f x" % (run_time_1 / run_time_2)) if assertClose: # maxDeviation = torch.kthvalue(torch.abs(output_waveform_3- output_waveform_2), output_waveform_1.size(1)) diff --git a/torchaudio/functional.py b/torchaudio/functional.py index 319b5b3904..54311e68d3 100644 --- a/torchaudio/functional.py +++ b/torchaudio/functional.py @@ -572,7 +572,8 @@ def biquad(waveform, b0, b1, b2, a0, a1, a2): https://en.wikipedia.org/wiki/Digital_biquad_filter Args: - waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)`. Currently only supports float32. + waveform (torch.Tensor): Audio waveform of dimension of `(n_channel, n_frames)`. + Currently only supports float32. Normalized [-1, 1] b0 (float): numerator coefficient of current input, x[n] b1 (float): numerator coefficient of input one time step ago x[n-1] b2 (float): numerator coefficient of input two time steps ago x[n-2] @@ -581,7 +582,7 @@ def biquad(waveform, b0, b1, b2, a0, a1, a2): a2 (float): denominator coefficient of current output y[n-2] Returns: - output_waveform (torch.Tensor): Dimension of `(n_channel, n_frames)` + output_waveform (torch.Tensor): Dimension of `(n_channel, n_frames)` in range [-1, 1] """ assert(waveform.dtype == torch.float32) @@ -601,13 +602,13 @@ def highpass_biquad(waveform, sample_rate, cutoff_freq, Q=0.707): r"""Designs biquad highpass filter and performs filtering. Similar to SoX implementation. Args: - waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)` + waveform (torch.Tensor): Reference `biquad` documentation. sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) cutoff_freq (float): filter cutoff frequency Q (float): https://en.wikipedia.org/wiki/Q_factor Returns: - output_waveform (torch.Tensor): Dimension of `(n_channel, n_frames)` + output_waveform (torch.Tensor): Reference `biquad` documentation. """ GAIN = 1 # TBD - add as a parameter @@ -628,15 +629,15 @@ def highpass_biquad(waveform, sample_rate, cutoff_freq, Q=0.707): def lowpass_biquad(waveform, sample_rate, cutoff_freq, Q=0.707): # type: (Tensor, int, float, Optional[float]) -> Tensor r"""Designs biquad lowpass filter and performs filtering. Similar to SoX implementation. - + Args: - waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)` + waveform (torch.Tensor): Reference `biquad` documentation sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) cutoff_freq (float): filter cutoff frequency Q (float): https://en.wikipedia.org/wiki/Q_factor Returns: - output_waveform (torch.Tensor): Dimension of `(n_channel, n_frames)` + output_waveform (torch.Tensor): Reference `biquad` documentation. """ GAIN = 1 From f8efc98bb778e3da2202fb275bf6bfe538420158 Mon Sep 17 00:00:00 2001 From: Chuan Zhang Date: Wed, 18 Sep 2019 14:43:52 -0700 Subject: [PATCH 05/17] Adding 40th order filter --- torchaudio/functional.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchaudio/functional.py b/torchaudio/functional.py index 54311e68d3..99766025fc 100644 --- a/torchaudio/functional.py +++ b/torchaudio/functional.py @@ -629,7 +629,7 @@ def highpass_biquad(waveform, sample_rate, cutoff_freq, Q=0.707): def lowpass_biquad(waveform, sample_rate, cutoff_freq, Q=0.707): # type: (Tensor, int, float, Optional[float]) -> Tensor r"""Designs biquad lowpass filter and performs filtering. Similar to SoX implementation. - + Args: waveform (torch.Tensor): Reference `biquad` documentation sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) From 35fcf4a8e3d75a553a61a80c8d427a45becc6c14 Mon Sep 17 00:00:00 2001 From: Chuan Zhang Date: Wed, 18 Sep 2019 15:03:40 -0700 Subject: [PATCH 06/17] adding 40th order filter --- test/test_functional_lfilter_perf.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/test/test_functional_lfilter_perf.py b/test/test_functional_lfilter_perf.py index 8784d118b3..7968dd5e89 100644 --- a/test/test_functional_lfilter_perf.py +++ b/test/test_functional_lfilter_perf.py @@ -110,6 +110,12 @@ def run_test(n_channels, n_frames, n_order_filter, assertClose=True): 0.8471764313073272, -0.11712354962357388, ] + elif n_order_filter == 40: + # Create random set of 40 coefficients, will not be stable, test runtime rather than + # correctness + a_coeffs = torch.rand(40).numpy() + b_coeffs = torch.rand(40).numpy() + a_coeffs[0] = 1 # Set a0 to 1.0 # Cast into Tensors a_coeffs = torch.tensor(a_coeffs, device="cpu", dtype=torch.float32) @@ -149,7 +155,7 @@ def time_and_output(func): # maxDeviation = torch.kthvalue(torch.abs(output_waveform_3- output_waveform_2), output_waveform_1.size(1)) assert torch.allclose(output_waveform_1, output_waveform_2, atol=3e-4) assert torch.allclose(output_waveform_2, output_waveform_3, atol=3e-4) - print("✓ - all outputs are identical") + print("PASS - all outputs are identical") print("-" * 80) def test_lfilter_cmp(self): @@ -173,6 +179,10 @@ def test_lfilter_cmp(self): self.run_test(2, 80000, 18, False) self.run_test(2, 800000, 18, False) + self.run_test(2, 8000, 40, False) + self.run_test(2, 80000, 40, False) + self.run_test(2, 800000, 40, False) + if __name__ == "__main__": unittest.main() From e5fb427660f1d3872e953abef6749f5408564fc2 Mon Sep 17 00:00:00 2001 From: Chuan Zhang Date: Wed, 18 Sep 2019 16:06:56 -0700 Subject: [PATCH 07/17] Applied cpplint to torchaudio/filtering.cpp --- torchaudio/filtering.cpp | 134 +++++++++++++++++++++++++-------------- 1 file changed, 85 insertions(+), 49 deletions(-) diff --git a/torchaudio/filtering.cpp b/torchaudio/filtering.cpp index 275db9ceaa..0f50189f81 100644 --- a/torchaudio/filtering.cpp +++ b/torchaudio/filtering.cpp @@ -12,7 +12,6 @@ at::Tensor lfilter_tensor_matrix( at::Tensor const & a_coeffs, at::Tensor const & b_coeffs ) { - // Assumptions - float32, waveform between -1 and 1 assert(waveform.dtype() == torch::kFloat32); @@ -21,42 +20,61 @@ at::Tensor lfilter_tensor_matrix( // Numerator and denominator coefficients shoudlb e same size assert(a_coeffs.size(0) == b_coeffs.size(0)); - int n_order = a_coeffs.size(0); // n'th order - 1 filter + int n_order = a_coeffs.size(0); // n'th order - 1 filter assert(n_order > 0); int64_t n_channels = waveform.size(0); int64_t n_frames = waveform.size(1); // Device options should mirror input waveform - auto options = torch::TensorOptions().dtype(waveform.dtype()).device(waveform.device().type()); + auto options = torch::TensorOptions() + .dtype(waveform.dtype()) + .device(waveform.device().type()); // Allocate padded input and output waveform, copy in the input waveform - at::Tensor padded_waveform = torch::zeros({n_channels, n_frames + n_order - 1}, options); - at::Tensor padded_output_waveform = torch::zeros({n_channels, n_frames + n_order - 1}, options); - padded_waveform.slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1) = waveform; + at::Tensor padded_waveform = + torch::zeros({n_channels, n_frames + n_order - 1}, options); + at::Tensor padded_output_waveform = + torch::zeros({n_channels, n_frames + n_order - 1}, options); + padded_waveform.slice(0, 0, n_channels) + .slice(1, n_order - 1, n_order + n_frames - 1) = waveform; // Create [n_order, n_channels] structure for a and b coefficients - at::Tensor a_coeffs_filled = at::transpose(at::flip(a_coeffs, {0,}).repeat({n_channels, 1}), 0, 1); - at::Tensor b_coeffs_filled = at::transpose(at::flip(b_coeffs, {0,}).repeat({n_channels, 1}), 0, 1); + at::Tensor a_coeffs_filled = + at::transpose(at::flip(a_coeffs, {0, }).repeat({n_channels, 1}), 0, 1); + at::Tensor b_coeffs_filled = + at::transpose(at::flip(b_coeffs, {0, }).repeat({n_channels, 1}), 0, 1); // few more temporary data structure at::Tensor o0 = torch::zeros({n_channels, 1}, options); - at::Tensor ones = torch::ones({n_channels, n_frames + n_order - 1}, options); - at::Tensor negones = torch::ones({n_channels, n_frames + n_order -1 }, options) * -1; - at::Tensor normalization_a0 = torch::unsqueeze(a_coeffs[0], {0,}).repeat({n_channels, 1}); + at::Tensor ones = + torch::ones({n_channels, n_frames + n_order - 1}, options); + at::Tensor negones = + torch::ones({n_channels, n_frames + n_order -1 }, options) * -1; + at::Tensor normalization_a0 = + torch::unsqueeze(a_coeffs[0], {0, }).repeat({n_channels, 1}); for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) { - // calculate the output at time i_frame for all channels - o0 = torch::zeros({n_channels, 1}, torch::TensorOptions().dtype(torch::kFloat32)); + o0 = torch::zeros({n_channels, 1}, + torch::TensorOptions().dtype(torch::kFloat32)); // time window of input and output, size [n_channels, n_order] - at::Tensor const & input_window = padded_waveform.slice(0, 0, n_channels).slice(1, i_frame, i_frame + n_order); - at::Tensor const & output_window = padded_output_waveform.slice(0, 0, n_channels).slice(1, i_frame, i_frame+ n_order); - - // matrix multiply to get [n_channels x n_channels], extract diagonal and unsqueeze to get [n_channels, 1] result for a frame - at::Tensor inp_result = torch::unsqueeze(torch::diag(torch::mm(input_window, b_coeffs_filled)), 1); - at::Tensor out_result = torch::unsqueeze(torch::diag(torch::mm(output_window, a_coeffs_filled)), 1); + at::Tensor const & input_window = + padded_waveform.slice(0, 0, n_channels) + .slice(1, i_frame, i_frame + n_order); + at::Tensor const & output_window = + padded_output_waveform.slice(0, 0, n_channels) + .slice(1, i_frame, i_frame+ n_order); + + // matrix multiply to get [n_channels x n_channels], + // extract diagonal and unsqueeze to get [n_channels, 1] result + at::Tensor inp_result = + torch::unsqueeze(torch::diag(torch::mm(input_window, + b_coeffs_filled)), 1); + at::Tensor out_result = + torch::unsqueeze(torch::diag(torch::mm(output_window, + a_coeffs_filled)), 1); o0.add_(inp_result); o0.sub_(out_result); @@ -65,11 +83,15 @@ at::Tensor lfilter_tensor_matrix( o0.div_(normalization_a0); // Set the output - padded_output_waveform.slice(0, 0, n_channels).slice(1, i_frame + n_order - 1, i_frame + n_order - 1 + 1) = o0; - + padded_output_waveform.slice(0, 0, n_channels) + .slice(1, + i_frame + n_order - 1, + i_frame + n_order - 1 + 1) = o0; } // return clipped, and without initial conditions - return torch::min(ones, torch::max(negones, padded_output_waveform)).slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1); + return torch::min(ones, torch::max(negones, padded_output_waveform)) + .slice(0, 0, n_channels) + .slice(1, n_order - 1, n_order + n_frames - 1); } at::Tensor lfilter_element_wise( @@ -77,7 +99,6 @@ at::Tensor lfilter_tensor_matrix( at::Tensor const & a_coeffs, at::Tensor const & b_coeffs ) { - // Assumptions - float32, waveform between -1 and 1 assert(waveform.dtype() == torch::kFloat32); @@ -86,62 +107,77 @@ at::Tensor lfilter_tensor_matrix( // Numerator and denominator coefficients shoudlb e same size assert(a_coeffs.size(0) == b_coeffs.size(0)); - int n_order = a_coeffs.size(0); // n'th order - 1 filter + int n_order = a_coeffs.size(0); // n'th order - 1 filter assert(n_order > 0); int64_t n_channels = waveform.size(0); int64_t n_frames = waveform.size(1); // Device options should mirror input waveform - auto options = torch::TensorOptions().dtype(waveform.dtype()).device(waveform.device().type()); + auto options = torch::TensorOptions() + .dtype(waveform.dtype()) + .device(waveform.device().type()); // Allocate padded input and output waveform, copy in the input waveform - at::Tensor padded_waveform = torch::zeros({n_channels, n_frames + n_order - 1}, options); - at::Tensor padded_output_waveform = torch::zeros({n_channels, n_frames + n_order - 1}, options); - padded_waveform.slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1) = waveform; + at::Tensor padded_waveform = + torch::zeros({n_channels, n_frames + n_order - 1}, options); + at::Tensor padded_output_waveform = + torch::zeros({n_channels, n_frames + n_order - 1}, options); + padded_waveform.slice(0, 0, n_channels) + .slice(1, n_order - 1, n_order + n_frames - 1) = waveform; // few more temporary data structure - at::Tensor o0 = torch::zeros({n_channels, 1}, options); - at::Tensor ones = torch::ones({n_channels, n_frames + n_order - 1}, options); - at::Tensor negones = torch::ones({n_channels, n_frames + n_order -1 }, options) * -1; - at::Tensor normalization_a0 = torch::unsqueeze(a_coeffs[0], {0,}).repeat({n_channels, 1}); // Device options should mirror input waveform + at::Tensor o0 = + torch::zeros({n_channels, 1}, options); + at::Tensor ones = + torch::ones({n_channels, n_frames + n_order - 1}, options); + at::Tensor negones = + torch::ones({n_channels, n_frames + n_order -1 }, options) * -1; + at::Tensor normalization_a0 = + torch::unsqueeze(a_coeffs[0], {0, }).repeat({n_channels, 1}); + + // Create accessors for fast access + // https://pytorch.org/cppdocs/notes/tensor_basics.html - // Create accessors for fast access - https://pytorch.org/cppdocs/notes/tensor_basics.html#efficient-access-to-tensor-elements // CPU - auto input_accessor = padded_waveform.accessor(); - auto output_accessor = padded_output_waveform.accessor(); - auto a_coeffs_accessor = a_coeffs.accessor(); - auto b_coeffs_accessor = b_coeffs.accessor(); + auto input_accessor = padded_waveform.accessor(); + auto output_accessor = padded_output_waveform.accessor(); + auto a_coeffs_accessor = a_coeffs.accessor(); + auto b_coeffs_accessor = b_coeffs.accessor(); // CUDA - TBD - //auto input_accessor = waveform.packed_accessor64(); - //auto output_accessor = output_waveform.packed_accessor64(); - //auto a_coeffs_accessor = a_coeffs.packed_accessor64(); - //auto b_coeffs_accessor = b_coeffs.packed_accessor64(); - - for (int64_t i_channel = 0; i_channel < n_channels; ++i_channel) { + // auto input_accessor = waveform.packed_accessor64(); + // auto output_accessor = output_waveform.packed_accessor64(); + // auto a_coeffs_accessor = a_coeffs.packed_accessor64(); + // auto b_coeffs_accessor = b_coeffs.packed_accessor64(); + for (int64_t i_channel = 0; i_channel < n_channels; ++i_channel) { for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) { - // execute the difference equation float o0 = 0; for (int i_offset = 0; i_offset < n_order; ++i_offset) { - o0 += input_accessor[i_channel][i_frame + i_offset] * b_coeffs_accessor[n_order - i_offset - 1]; - o0 -= output_accessor[i_channel][i_frame + i_offset] * a_coeffs_accessor[n_order - i_offset - 1]; + o0 += input_accessor[i_channel][i_frame + i_offset] * + b_coeffs_accessor[n_order - i_offset - 1]; + o0 -= output_accessor[i_channel][i_frame + i_offset] * + a_coeffs_accessor[n_order - i_offset - 1]; } o0 = o0 / a_coeffs_accessor[0]; // put back into the main data structure output_accessor[i_channel][i_frame + n_order - 1] = o0; - } } // return clipped, and without initial conditions - return torch::min(ones, torch::max(negones, padded_output_waveform)).slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1); + return torch::min(ones, + torch::max(negones, + padded_output_waveform)) + .slice(0, 0, n_channels) + .slice(1, n_order - 1, n_order + n_frames - 1); } -}} +} // namespace audio +} // namespace torch PYBIND11_MODULE(_torch_filtering, m) { From 01a52ee4e4ac04184e6c6559c49fc889fb0fbedc Mon Sep 17 00:00:00 2001 From: Chuan Zhang Date: Thu, 19 Sep 2019 12:01:07 -0700 Subject: [PATCH 08/17] refactor to use python wrapper --- test/test_functional_lfilter_perf.py | 21 +++-- torchaudio/filtering.cpp | 133 +++++---------------------- torchaudio/functional.py | 73 +++++++++++++++ 3 files changed, 112 insertions(+), 115 deletions(-) diff --git a/test/test_functional_lfilter_perf.py b/test/test_functional_lfilter_perf.py index 7968dd5e89..4d8301a287 100644 --- a/test/test_functional_lfilter_perf.py +++ b/test/test_functional_lfilter_perf.py @@ -6,8 +6,7 @@ import unittest import common_utils -from torchaudio.functional import lfilter -from _torch_filtering import _lfilter_tensor_matrix, _lfilter_element_wise +from torchaudio.functional import lfilter, lfilter_cpp_impl class TestFunctionalLFilterPerformance(unittest.TestCase): @@ -16,8 +15,6 @@ class TestFunctionalLFilterPerformance(unittest.TestCase): @staticmethod def run_test(n_channels, n_frames, n_order_filter, assertClose=True): waveform = torch.rand(n_channels, n_frames, device="cpu") - b_coeffs = torch.rand(n_order_filter, dtype=torch.float32, device="cpu") - a_coeffs = torch.rand(n_order_filter, dtype=torch.float32, device="cpu") if n_order_filter == 8: # Eighth Order Filter @@ -133,10 +130,10 @@ def time_and_output(func): lambda: lfilter(waveform, a_coeffs, b_coeffs) ) (output_waveform_2, run_time_2) = time_and_output( - lambda: _lfilter_element_wise(waveform, a_coeffs, b_coeffs) + lambda: lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, 'element_wise') ) (output_waveform_3, run_time_3) = time_and_output( - lambda: _lfilter_tensor_matrix(waveform, a_coeffs, b_coeffs) + lambda: lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, 'matrix') ) print("-" * 80) @@ -152,12 +149,22 @@ def time_and_output(func): print("Ratio Python / CPP ElementWise : %10.2f x" % (run_time_1 / run_time_2)) if assertClose: - # maxDeviation = torch.kthvalue(torch.abs(output_waveform_3- output_waveform_2), output_waveform_1.size(1)) + # maxDeviation = torch.kthvalue(torch.abs(output_waveform_2- output_waveform_1), output_waveform_1.size(1)) + assert torch.allclose(output_waveform_1, output_waveform_2, atol=3e-4) assert torch.allclose(output_waveform_2, output_waveform_3, atol=3e-4) print("PASS - all outputs are identical") print("-" * 80) + def test_cpp_lfilter_runs(self): + waveform = torch.rand(2, 1000) + b_coeffs = torch.rand(2, dtype=torch.float32) + a_coeffs = torch.rand(2, dtype=torch.float32) + a_coeffs[0] = 1 + + output_waveform = lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, 'element_wise') + output_waveform = lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, 'matrix') + def test_lfilter_cmp(self): """ Runs comparison on CPU diff --git a/torchaudio/filtering.cpp b/torchaudio/filtering.cpp index 0f50189f81..16225d4b92 100644 --- a/torchaudio/filtering.cpp +++ b/torchaudio/filtering.cpp @@ -1,5 +1,5 @@ #include - +// TBD - for CUDA support #include // TBD - Compile on CUDA // TBD - Expand to other data types outside float32? @@ -7,52 +7,17 @@ namespace torch { namespace audio { -at::Tensor lfilter_tensor_matrix( - at::Tensor const & waveform, - at::Tensor const & a_coeffs, - at::Tensor const & b_coeffs +void _lfilter_tensor_matrix( + at::Tensor const & padded_waveform, + at::Tensor & padded_output_waveform, + at::Tensor const & a_coeffs_filled, + at::Tensor const & b_coeffs_filled, + at::Tensor & o0, + at::Tensor const & normalization_a0 ) { - // Assumptions - float32, waveform between -1 and 1 - assert(waveform.dtype() == torch::kFloat32); - - // CPU only for now - Implement for GPU in future - assert(waveform.device().type() == torch::kCPU); - - // Numerator and denominator coefficients shoudlb e same size - assert(a_coeffs.size(0) == b_coeffs.size(0)); - int n_order = a_coeffs.size(0); // n'th order - 1 filter - assert(n_order > 0); - - int64_t n_channels = waveform.size(0); - int64_t n_frames = waveform.size(1); - - // Device options should mirror input waveform - auto options = torch::TensorOptions() - .dtype(waveform.dtype()) - .device(waveform.device().type()); - - // Allocate padded input and output waveform, copy in the input waveform - at::Tensor padded_waveform = - torch::zeros({n_channels, n_frames + n_order - 1}, options); - at::Tensor padded_output_waveform = - torch::zeros({n_channels, n_frames + n_order - 1}, options); - padded_waveform.slice(0, 0, n_channels) - .slice(1, n_order - 1, n_order + n_frames - 1) = waveform; - - // Create [n_order, n_channels] structure for a and b coefficients - at::Tensor a_coeffs_filled = - at::transpose(at::flip(a_coeffs, {0, }).repeat({n_channels, 1}), 0, 1); - at::Tensor b_coeffs_filled = - at::transpose(at::flip(b_coeffs, {0, }).repeat({n_channels, 1}), 0, 1); - - // few more temporary data structure - at::Tensor o0 = torch::zeros({n_channels, 1}, options); - at::Tensor ones = - torch::ones({n_channels, n_frames + n_order - 1}, options); - at::Tensor negones = - torch::ones({n_channels, n_frames + n_order -1 }, options) * -1; - at::Tensor normalization_a0 = - torch::unsqueeze(a_coeffs[0], {0, }).repeat({n_channels, 1}); + int n_order = a_coeffs_filled.size(0); + int n_frames = padded_waveform.size(1) - n_order + 1; + int n_channels = padded_waveform.size(0); for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) { // calculate the output at time i_frame for all channels @@ -88,69 +53,28 @@ at::Tensor lfilter_tensor_matrix( i_frame + n_order - 1, i_frame + n_order - 1 + 1) = o0; } - // return clipped, and without initial conditions - return torch::min(ones, torch::max(negones, padded_output_waveform)) - .slice(0, 0, n_channels) - .slice(1, n_order - 1, n_order + n_frames - 1); } - at::Tensor lfilter_element_wise( - at::Tensor const & waveform, + + + void _lfilter_element_wise( + at::Tensor const & padded_waveform, + at::Tensor & padded_output_waveform, at::Tensor const & a_coeffs, - at::Tensor const & b_coeffs + at::Tensor const & b_coeffs, + at::Tensor & o0 ) { - // Assumptions - float32, waveform between -1 and 1 - assert(waveform.dtype() == torch::kFloat32); - - // CPU only for now - Implement for GPU in future - assert(waveform.device().type() == torch::kCPU); - - // Numerator and denominator coefficients shoudlb e same size - assert(a_coeffs.size(0) == b_coeffs.size(0)); - int n_order = a_coeffs.size(0); // n'th order - 1 filter - assert(n_order > 0); - - int64_t n_channels = waveform.size(0); - int64_t n_frames = waveform.size(1); - - // Device options should mirror input waveform - auto options = torch::TensorOptions() - .dtype(waveform.dtype()) - .device(waveform.device().type()); - - // Allocate padded input and output waveform, copy in the input waveform - at::Tensor padded_waveform = - torch::zeros({n_channels, n_frames + n_order - 1}, options); - at::Tensor padded_output_waveform = - torch::zeros({n_channels, n_frames + n_order - 1}, options); - padded_waveform.slice(0, 0, n_channels) - .slice(1, n_order - 1, n_order + n_frames - 1) = waveform; - - // few more temporary data structure - at::Tensor o0 = - torch::zeros({n_channels, 1}, options); - at::Tensor ones = - torch::ones({n_channels, n_frames + n_order - 1}, options); - at::Tensor negones = - torch::ones({n_channels, n_frames + n_order -1 }, options) * -1; - at::Tensor normalization_a0 = - torch::unsqueeze(a_coeffs[0], {0, }).repeat({n_channels, 1}); - - // Create accessors for fast access - // https://pytorch.org/cppdocs/notes/tensor_basics.html + int n_order = a_coeffs.size(0); + int n_frames = padded_waveform.size(1) - n_order + 1; + int n_channels = padded_waveform.size(0); - // CPU + // Create CPU accessors for fast access + // https://pytorch.org/cppdocs/notes/tensor_basics.html auto input_accessor = padded_waveform.accessor(); auto output_accessor = padded_output_waveform.accessor(); auto a_coeffs_accessor = a_coeffs.accessor(); auto b_coeffs_accessor = b_coeffs.accessor(); - // CUDA - TBD - // auto input_accessor = waveform.packed_accessor64(); - // auto output_accessor = output_waveform.packed_accessor64(); - // auto a_coeffs_accessor = a_coeffs.packed_accessor64(); - // auto b_coeffs_accessor = b_coeffs.packed_accessor64(); - for (int64_t i_channel = 0; i_channel < n_channels; ++i_channel) { for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) { // execute the difference equation @@ -167,13 +91,6 @@ at::Tensor lfilter_tensor_matrix( output_accessor[i_channel][i_frame + n_order - 1] = o0; } } - - // return clipped, and without initial conditions - return torch::min(ones, - torch::max(negones, - padded_output_waveform)) - .slice(0, 0, n_channels) - .slice(1, n_order - 1, n_order + n_frames - 1); } } // namespace audio @@ -185,10 +102,10 @@ PYBIND11_MODULE(_torch_filtering, m) { options.disable_function_signatures(); m.def( "_lfilter_tensor_matrix", - &torch::audio::lfilter_tensor_matrix, + &torch::audio::_lfilter_tensor_matrix, "Executes difference equation with tensor"); m.def( "_lfilter_element_wise", - &torch::audio::lfilter_element_wise, + &torch::audio::_lfilter_element_wise, "Executes difference equation with tensor"); } diff --git a/torchaudio/functional.py b/torchaudio/functional.py index 99766025fc..565440eaaf 100644 --- a/torchaudio/functional.py +++ b/torchaudio/functional.py @@ -1,6 +1,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals import math import torch +from _torch_filtering import _lfilter_tensor_matrix, _lfilter_element_wise __all__ = [ "istft", @@ -15,6 +16,7 @@ "magphase", "phase_vocoder", "lfilter", + "lfilter_cpp_impl", "lowpass_biquad", "highpass_biquad", "biquad", @@ -566,6 +568,77 @@ def lfilter(waveform, a_coeffs, b_coeffs): return torch.min(ones, torch.max(ones * -1, padded_output_waveform[:, (n_order - 1):])) +def lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, execution_method): + # type: (Tensor, Tensor, Tensor, string) -> Tensor + r""" + See lfilter documentation. Execution method can be either `element_wise` or `matrix`. + """ + + # Perform sanity checks, input check, and memory allocation in python + + # Current limitations to be removed in future + assert(waveform.dtype == torch.float32) + assert(waveform.device.type == 'cpu') + assert(waveform.device == a_coeffs.device) + assert(waveform.device == b_coeffs.device) + + # Use these parameters for any calculations + device = waveform.device + dtype = waveform.dtype + + # Check filter orders + n_order = a_coeffs.size(0) + assert(b_coeffs.size(0) == n_order) + assert(n_order > 0) + assert(a_coeffs[0] != 0) # a0 coeff for y[n] can not be 0 + + # Check waveform size + assert(len(waveform.size()) == 2) + n_channels = waveform.size(0) + n_frames = waveform.size(1) + n_padded_frames = n_frames + n_order - 1 + + # Allocate temporary data structures + # First, the input waveform should be padded by the order of the filter + # N.B. Should we look to how we can avoid copying + padded_waveform = torch.zeros(n_channels, n_padded_frames, dtype=dtype, device=device) + padded_output_waveform = torch.zeros(n_channels, n_padded_frames, dtype=dtype, device=device) + padded_waveform[:, (n_order - 1):] = waveform + + # More temporary data structures + o0 = torch.zeros(n_channels, 1, dtype=dtype, device=device) + normalization_a0 = a_coeffs[0].unsqueeze(0).repeat(n_channels, 1) + ones = torch.ones(n_channels, n_padded_frames, dtype=dtype, device=device) + + # Run through assertion size checks + assert(normalization_a0.size(0) == n_channels) + + if execution_method == 'element_wise': + _lfilter_element_wise(padded_waveform, + padded_output_waveform, + a_coeffs, + b_coeffs, + o0 + ) + elif execution_method == 'matrix': + # From [n_order] a_coeffs, create a [n_channel, n_order] tensor + # lowest order coefficients e.g. a0, b0 should be at the "bottom" + # used for matrix multiply later + a_coeffs_filled = a_coeffs.flip(0).repeat(n_channels, 1).t() + b_coeffs_filled = b_coeffs.flip(0).repeat(n_channels, 1).t() + _lfilter_tensor_matrix(padded_waveform, + padded_output_waveform, + a_coeffs_filled, + b_coeffs_filled, + o0, + normalization_a0, + ) + else: + raise Exception("invalid lfilter execution method %s" % execution_method) + + return torch.min(ones, torch.max(ones * -1, padded_output_waveform))[:, (n_order - 1):] + + def biquad(waveform, b0, b1, b2, a0, a1, a2): # type: (Tensor, float, float, float, float, float, float) -> Tensor r"""Performs a biquad filter of input tensor. Initial conditions set to 0. From 62a2dfd030a8851468b118f94782b76daf626f49 Mon Sep 17 00:00:00 2001 From: Chuan Zhang Date: Wed, 18 Sep 2019 12:25:06 -0700 Subject: [PATCH 09/17] adding test --- test/test_functional_lfilter_perf.py | 131 +++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 test/test_functional_lfilter_perf.py diff --git a/test/test_functional_lfilter_perf.py b/test/test_functional_lfilter_perf.py new file mode 100644 index 0000000000..0030541277 --- /dev/null +++ b/test/test_functional_lfilter_perf.py @@ -0,0 +1,131 @@ +from __future__ import absolute_import, division, print_function, unicode_literals +import math +import os +import torch +import torchaudio +import unittest +import common_utils + +from torchaudio.functional import lfilter +from _torch_filtering import ( + _lfilter_tensor_matrix, + _lfilter_tensor, + _lfilter_element_wise, +) + + +class TestFunctionalLFilterPerformance(unittest.TestCase): + test_dirpath, test_dir = common_utils.create_temp_assets_dir() + + @staticmethod + def run_test(n_channels, n_frames, n_order_filter): + waveform = torch.rand(n_channels, n_frames, device="cpu") + b_coeffs = torch.rand(n_order_filter, dtype=torch.float32, device="cpu") + a_coeffs = torch.rand(n_order_filter, dtype=torch.float32, device="cpu") + + if n_order_filter == 8: + # Eighth Order Filter + # >>> import scipy.signal + # >>> wp = 0.3 + # >>> ws = 0.5 + # >>> gpass = 1 + # >>> gstop = 100 + # >>> b, a = scipy.signal.iirdesign(wp, ws, gpass, gstop) + b_coeffs = [ + 0.0006544487997063485, + 0.001669274889397942, + 0.003218714446315984, + 0.004222562499298002, + 0.004222562499298002, + 0.0032187144463159834, + 0.0016692748893979413, + 0.0006544487997063485, + ] + a_coeffs = [ + 1.0, + -4.67403506662255, + 10.516336803850786, + -14.399207825856776, + 12.844181702707655, + -7.43604712843608, + 2.5888616732696077, + -0.4205601576432048, + ] + elif n_order_filter == 5: + # Fifth Order Filter + # >>> import scipy.signal + # >>> wp = 0.3, ws = 0.5, gpass = 1, gstop = 40 + # >>> b, a = scipy.signal.iirdesign(wp, ws, gpass, gstop) + b_coeffs = [ + 0.0353100066384039, + 0.023370652985988206, + 0.0560524973457262, + 0.023370652985988193, + 0.03531000663840389, + ] + a_coeffs = [ + 1.0, + -2.321010052951366, + 2.677193357612127, + -1.5774235418173692, + 0.4158137396065854, + ] + + # Cast into Tensors + a_coeffs = torch.tensor(a_coeffs, device="cpu", dtype=torch.float32) + b_coeffs = torch.tensor(b_coeffs, device="cpu", dtype=torch.float32) + + def time_and_output(func): + import time + + st = time.time() + output = func() + run_time = time.time() - st + return (output, run_time) + + (output_waveform_1, run_time_1) = time_and_output( + lambda: lfilter(waveform, a_coeffs, b_coeffs) + ) + (output_waveform_2, run_time_2) = time_and_output( + lambda: _lfilter_element_wise(waveform, a_coeffs, b_coeffs) + ) + (output_waveform_3, run_time_3) = time_and_output( + lambda: _lfilter_tensor_matrix(waveform, a_coeffs, b_coeffs) + ) + + print("-" * 80) + print("Evaluating Runtime between lfilter implementations") + print("-" * 80) + print( + "Data Size: [%d x %d], Filter Order: %d" + % (waveform.size(0), waveform.size(1), a_coeffs.size(0)) + ) + print("-" * 80) + print("Python Matrix Runtime [current]: %10.6f s" % run_time_1) + print("CPP Element Wise Runtime : %10.6f s" % run_time_2) + print("CPP Matrix Runtime : %10.6f s" % run_time_3) + + # maxDeviation = torch.kthvalue(torch.abs(output_waveform_1- output_waveform_2), output_waveform_1.size(1)) + + assert torch.allclose(output_waveform_1, output_waveform_2, atol=3e-4) + assert torch.allclose(output_waveform_1, output_waveform_3, atol=3e-4) + print("-" * 80) + print("✓ - all outputs are identical") + print("-" * 80) + + def test_lfilter_cmp(self): + """ + Runs comparison on CPU + """ + + torch.random.manual_seed(423) + self.run_test(2, 8000, 5) + self.run_test(2, 80000, 5) + self.run_test(2, 800000, 5) + self.run_test(2, 8000, 8) + self.run_test(2, 80000, 8) + self.run_test(2, 800000, 8) + + +if __name__ == "__main__": + unittest.main() From 7d08183a03b2f58a70abb5d999228ffdcdc1a029 Mon Sep 17 00:00:00 2001 From: Chuan Zhang Date: Wed, 18 Sep 2019 13:27:10 -0700 Subject: [PATCH 10/17] added cpp lfilter implementation --- setup.py | 8 ++ test/test_functional_lfilter_perf.py | 79 ++++++++++--- torchaudio/filtering.cpp | 159 +++++++++++++++++++++++++++ torchaudio/functional.py | 4 +- 4 files changed, 232 insertions(+), 18 deletions(-) create mode 100644 torchaudio/filtering.cpp diff --git a/setup.py b/setup.py index 9ed91f2468..4822a6e9d0 100644 --- a/setup.py +++ b/setup.py @@ -113,6 +113,14 @@ def check_env_flag(name, default=''): extra_compile_args=eca, extra_objects=extra_objects, extra_link_args=ela), + CppExtension( + '_torch_filtering', + ['torchaudio/filtering.cpp'], + libraries=libraries, + include_dirs=include_dirs, + extra_compile_args=eca, + extra_objects=extra_objects, + extra_link_args=ela), ], cmdclass={'build_ext': BuildExtension}, install_requires=[pytorch_package_dep] diff --git a/test/test_functional_lfilter_perf.py b/test/test_functional_lfilter_perf.py index 0030541277..63c1fbadde 100644 --- a/test/test_functional_lfilter_perf.py +++ b/test/test_functional_lfilter_perf.py @@ -7,18 +7,14 @@ import common_utils from torchaudio.functional import lfilter -from _torch_filtering import ( - _lfilter_tensor_matrix, - _lfilter_tensor, - _lfilter_element_wise, -) +from _torch_filtering import _lfilter_tensor_matrix, _lfilter_element_wise class TestFunctionalLFilterPerformance(unittest.TestCase): test_dirpath, test_dir = common_utils.create_temp_assets_dir() @staticmethod - def run_test(n_channels, n_frames, n_order_filter): + def run_test(n_channels, n_frames, n_order_filter, assertClose=True): waveform = torch.rand(n_channels, n_frames, device="cpu") b_coeffs = torch.rand(n_order_filter, dtype=torch.float32, device="cpu") a_coeffs = torch.rand(n_order_filter, dtype=torch.float32, device="cpu") @@ -70,6 +66,50 @@ def run_test(n_channels, n_frames, n_order_filter): -1.5774235418173692, 0.4158137396065854, ] + elif n_order_filter == 18: + # >>> import scipy.signal + # >>> wp = 0.48, ws = 0.5, gpass = 0.2, gstop = 120 + # >>> b, a = scipy.signal.iirdesign(wp, ws, gpass, gstop) + b_coeffs = [ + 0.0006050813536446144, + 0.002920916369302935, + 0.010247568347759453, + 0.02591236698507957, + 0.05390501051935878, + 0.09344581172781004, + 0.13951533321139883, + 0.1808658576803922, + 0.2056643061895918, + 0.2056643061895911, + 0.1808658576803912, + 0.13951533321139847, + 0.09344581172781012, + 0.053905010519358885, + 0.02591236698507962, + 0.010247568347759466, + 0.0029209163693029367, + 0.0006050813536446148, + ] + a_coeffs = [ + 1.0, + -4.3964136877356745, + 14.650181359641305, + -34.45816395187684, + 67.18247518997862, + -108.01956225077998, + 149.4332056661277, + -178.07791467502364, + 185.28267044557634, + -168.13382659655514, + 133.22364764531704, + -91.59439958870928, + 54.15835239046956, + -27.090521914173934, + 11.163677645454127, + -3.627296054625132, + 0.8471764313073272, + -0.11712354962357388, + ] # Cast into Tensors a_coeffs = torch.tensor(a_coeffs, device="cpu", dtype=torch.float32) @@ -93,25 +133,24 @@ def time_and_output(func): lambda: _lfilter_tensor_matrix(waveform, a_coeffs, b_coeffs) ) - print("-" * 80) - print("Evaluating Runtime between lfilter implementations") print("-" * 80) print( - "Data Size: [%d x %d], Filter Order: %d" + "lfilter perf - Data Size: [%d x %d], Filter Order: %d" % (waveform.size(0), waveform.size(1), a_coeffs.size(0)) ) print("-" * 80) print("Python Matrix Runtime [current]: %10.6f s" % run_time_1) print("CPP Element Wise Runtime : %10.6f s" % run_time_2) print("CPP Matrix Runtime : %10.6f s" % run_time_3) - - # maxDeviation = torch.kthvalue(torch.abs(output_waveform_1- output_waveform_2), output_waveform_1.size(1)) - - assert torch.allclose(output_waveform_1, output_waveform_2, atol=3e-4) - assert torch.allclose(output_waveform_1, output_waveform_3, atol=3e-4) - print("-" * 80) - print("✓ - all outputs are identical") print("-" * 80) + print("Ratio Python / CPP ElementWise : %10.2f x" % (run_time_1/run_time_2)) + + if assertClose: + # maxDeviation = torch.kthvalue(torch.abs(output_waveform_3- output_waveform_2), output_waveform_1.size(1)) + assert torch.allclose(output_waveform_1, output_waveform_2, atol=3e-4) + assert torch.allclose(output_waveform_2, output_waveform_3, atol=3e-4) + print("✓ - all outputs are identical") + print("-" * 80) def test_lfilter_cmp(self): """ @@ -126,6 +165,14 @@ def test_lfilter_cmp(self): self.run_test(2, 80000, 8) self.run_test(2, 800000, 8) + # For higher order filters, due to floating point precision + # matrix method and element method can get different results depending on order of operations + # Also, for longer signals and higher filters, easier to create unstable filter + # https://dsp.stackexchange.com/questions/54386/relation-between-order-and-stability-in-iir-filter + self.run_test(2, 8000, 18, False) + self.run_test(2, 80000, 18, False) + self.run_test(2, 800000, 18, False) + if __name__ == "__main__": unittest.main() diff --git a/torchaudio/filtering.cpp b/torchaudio/filtering.cpp new file mode 100644 index 0000000000..aa60a8a818 --- /dev/null +++ b/torchaudio/filtering.cpp @@ -0,0 +1,159 @@ +#include + +// TBD - Compile on CUDA +// TBD - Expand to other data types outside float32? + +namespace torch { +namespace audio { + + +at::Tensor lfilter_tensor_matrix( + at::Tensor const & waveform, + at::Tensor const & a_coeffs, + at::Tensor const & b_coeffs + ) { + + // Assumptions - float32, waveform between -1 and 1 + assert(waveform.dtype() == torch::kFloat32); + + // CPU only for now - Implement for GPU in future + assert(waveform.device().type() == torch::kCPU); + + // Numerator and denominator coefficients shoudlb e same size + assert(a_coeffs.size(0) == b_coeffs.size(0)); + int n_order = a_coeffs.size(0); // n'th order - 1 filter + assert(n_order > 0); + + int64_t n_channels = waveform.size(0); + int64_t n_frames = waveform.size(1); + + // Device options should mirror input waveform + auto options = torch::TensorOptions().dtype(waveform.dtype()).device(waveform.device().type()); + + // Allocate padded input and output waveform, copy in the input waveform + at::Tensor padded_waveform = torch::zeros({n_channels, n_frames + n_order - 1}, options); + at::Tensor padded_output_waveform = torch::zeros({n_channels, n_frames + n_order - 1}, options); + padded_waveform.slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1) = waveform; + + // Create [n_order, n_channels] structure for a and b coefficients + at::Tensor a_coeffs_filled = at::transpose(at::flip(a_coeffs, {0,}).repeat({n_channels, 1}), 0, 1); + at::Tensor b_coeffs_filled = at::transpose(at::flip(b_coeffs, {0,}).repeat({n_channels, 1}), 0, 1); + + // few more temporary data structure + at::Tensor o0 = torch::zeros({n_channels, 1}, options); + at::Tensor ones = torch::ones({n_channels, n_frames + n_order - 1}, options); + at::Tensor negones = torch::ones({n_channels, n_frames + n_order -1 }, options) * -1; + at::Tensor normalization_a0 = torch::unsqueeze(a_coeffs[0], {0,}).repeat({n_channels, 1}); + + + for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) { + + // calculate the output at time i_frame for all channels + o0 = torch::zeros({n_channels, 1}, torch::TensorOptions().dtype(torch::kFloat32)); + + // time window of input and output, size [n_channels, n_order] + at::Tensor const & input_window = padded_waveform.slice(0, 0, n_channels).slice(1, i_frame, i_frame + n_order); + at::Tensor const & output_window = padded_output_waveform.slice(0, 0, n_channels).slice(1, i_frame, i_frame+ n_order); + + // matrix multiply to get [n_channels x n_channels], extract diagonal and unsqueeze to get [n_channels, 1] result for a frame + at::Tensor inp_result = torch::unsqueeze(torch::diag(torch::mm(input_window, b_coeffs_filled)), 1); + at::Tensor out_result = torch::unsqueeze(torch::diag(torch::mm(output_window, a_coeffs_filled)), 1); + + o0.add_(inp_result); + o0.sub_(out_result); + + // normalize by a0 + o0.div_(normalization_a0); + + // Set the output + padded_output_waveform.slice(0, 0, n_channels).slice(1, i_frame + n_order - 1, i_frame + n_order - 1 + 1) = o0; + + } + // return clipped, and without initial conditions + return torch::min(ones, torch::max(negones, padded_output_waveform)).slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1); + } + + at::Tensor lfilter_element_wise( + at::Tensor const & waveform, + at::Tensor const & a_coeffs, + at::Tensor const & b_coeffs + ) { + + + // Assumptions - float32, waveform between -1 and 1 + assert(waveform.dtype() == torch::kFloat32); + + // CPU only for now - Implement for GPU in future + assert(waveform.device().type() == torch::kCPU); + + // Numerator and denominator coefficients shoudlb e same size + assert(a_coeffs.size(0) == b_coeffs.size(0)); + int n_order = a_coeffs.size(0); // n'th order - 1 filter + assert(n_order > 0); + + int64_t n_channels = waveform.size(0); + int64_t n_frames = waveform.size(1); + + // Device options should mirror input waveform + auto options = torch::TensorOptions().dtype(waveform.dtype()).device(waveform.device().type()); + + // Allocate padded input and output waveform, copy in the input waveform + at::Tensor padded_waveform = torch::zeros({n_channels, n_frames + n_order - 1}, options); + at::Tensor padded_output_waveform = torch::zeros({n_channels, n_frames + n_order - 1}, options); + padded_waveform.slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1) = waveform; + + // few more temporary data structure + at::Tensor o0 = torch::zeros({n_channels, 1}, options); + at::Tensor ones = torch::ones({n_channels, n_frames + n_order - 1}, options); + at::Tensor negones = torch::ones({n_channels, n_frames + n_order -1 }, options) * -1; + at::Tensor normalization_a0 = torch::unsqueeze(a_coeffs[0], {0,}).repeat({n_channels, 1}); // Device options should mirror input waveform + + // TBD - Implement for CUDA + //auto input_accessor = waveform.packed_accessor64(); + //auto output_accessor = output_waveform.packed_accessor64(); + //auto a_coeffs_accessor = a_coeffs.packed_accessor64(); + //auto b_coeffs_accessor = b_coeffs.packed_accessor64(); + + // initialize the output tensor + + auto input_accessor = padded_waveform.accessor(); + auto output_accessor = padded_output_waveform.accessor(); + auto a_coeffs_accessor = a_coeffs.accessor(); + auto b_coeffs_accessor = b_coeffs.accessor(); + + for (int64_t i_channel = 0; i_channel < n_channels; ++i_channel) { + + for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) { + + // calculate the output at time i_frame by iterating through + // inputs / outputs at previous time steps and multiplying by coeffs + float o0 = 0; + for (int i_offset = 0; i_offset < n_order; ++i_offset) { + o0 += input_accessor[i_channel][i_frame + i_offset] * b_coeffs_accessor[n_order - i_offset - 1]; + o0 -= output_accessor[i_channel][i_frame + i_offset] * a_coeffs_accessor[n_order - i_offset - 1]; + } + o0 = o0 / a_coeffs_accessor[0]; + + output_accessor[i_channel][i_frame + n_order - 1] = o0; + + } + } + // return clipped, and without initial conditions + return torch::min(ones, torch::max(negones, padded_output_waveform)).slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1); + } + +}} + + +PYBIND11_MODULE(_torch_filtering, m) { + py::options options; + options.disable_function_signatures(); + m.def( + "_lfilter_tensor_matrix", + &torch::audio::lfilter_tensor_matrix, + "Executes difference equation with tensor"); + m.def( + "_lfilter_element_wise", + &torch::audio::lfilter_element_wise, + "Executes difference equation with tensor"); +} diff --git a/torchaudio/functional.py b/torchaudio/functional.py index c289f61e01..e80abb4333 100644 --- a/torchaudio/functional.py +++ b/torchaudio/functional.py @@ -529,7 +529,7 @@ def phase_vocoder(complex_specgrams, rate, phase_advance): def lfilter(waveform, a_coeffs, b_coeffs): # type: (Tensor, Tensor, Tensor) -> Tensor r""" - Performs an IIR filter by evaluating difference equation. + Performs an IIR filter by evaluating difference equation. Assumes difference equation is stable. Args: waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)`. Must be normalized to -1 to 1. @@ -594,7 +594,7 @@ def biquad(waveform, b0, b1, b2, a0, a1, a2): https://en.wikipedia.org/wiki/Digital_biquad_filter Args: - waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)` + waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)`. Currently only supports float32. b0 (float): numerator coefficient of current input, x[n] b1 (float): numerator coefficient of input one time step ago x[n-1] b2 (float): numerator coefficient of input two time steps ago x[n-2] From a6fefbeb115cca1454d88acd3f330889b424e949 Mon Sep 17 00:00:00 2001 From: Chuan Zhang Date: Wed, 18 Sep 2019 13:34:04 -0700 Subject: [PATCH 11/17] slight refactoring of cpp lfilter implementations --- torchaudio/filtering.cpp | 23 +++++++++++------------ torchaudio/functional.py | 2 +- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/torchaudio/filtering.cpp b/torchaudio/filtering.cpp index aa60a8a818..275db9ceaa 100644 --- a/torchaudio/filtering.cpp +++ b/torchaudio/filtering.cpp @@ -45,7 +45,6 @@ at::Tensor lfilter_tensor_matrix( at::Tensor negones = torch::ones({n_channels, n_frames + n_order -1 }, options) * -1; at::Tensor normalization_a0 = torch::unsqueeze(a_coeffs[0], {0,}).repeat({n_channels, 1}); - for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) { // calculate the output at time i_frame for all channels @@ -79,7 +78,6 @@ at::Tensor lfilter_tensor_matrix( at::Tensor const & b_coeffs ) { - // Assumptions - float32, waveform between -1 and 1 assert(waveform.dtype() == torch::kFloat32); @@ -108,25 +106,24 @@ at::Tensor lfilter_tensor_matrix( at::Tensor negones = torch::ones({n_channels, n_frames + n_order -1 }, options) * -1; at::Tensor normalization_a0 = torch::unsqueeze(a_coeffs[0], {0,}).repeat({n_channels, 1}); // Device options should mirror input waveform - // TBD - Implement for CUDA - //auto input_accessor = waveform.packed_accessor64(); - //auto output_accessor = output_waveform.packed_accessor64(); - //auto a_coeffs_accessor = a_coeffs.packed_accessor64(); - //auto b_coeffs_accessor = b_coeffs.packed_accessor64(); - - // initialize the output tensor - + // Create accessors for fast access - https://pytorch.org/cppdocs/notes/tensor_basics.html#efficient-access-to-tensor-elements + // CPU auto input_accessor = padded_waveform.accessor(); auto output_accessor = padded_output_waveform.accessor(); auto a_coeffs_accessor = a_coeffs.accessor(); auto b_coeffs_accessor = b_coeffs.accessor(); + // CUDA - TBD + //auto input_accessor = waveform.packed_accessor64(); + //auto output_accessor = output_waveform.packed_accessor64(); + //auto a_coeffs_accessor = a_coeffs.packed_accessor64(); + //auto b_coeffs_accessor = b_coeffs.packed_accessor64(); + for (int64_t i_channel = 0; i_channel < n_channels; ++i_channel) { for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) { - // calculate the output at time i_frame by iterating through - // inputs / outputs at previous time steps and multiplying by coeffs + // execute the difference equation float o0 = 0; for (int i_offset = 0; i_offset < n_order; ++i_offset) { o0 += input_accessor[i_channel][i_frame + i_offset] * b_coeffs_accessor[n_order - i_offset - 1]; @@ -134,10 +131,12 @@ at::Tensor lfilter_tensor_matrix( } o0 = o0 / a_coeffs_accessor[0]; + // put back into the main data structure output_accessor[i_channel][i_frame + n_order - 1] = o0; } } + // return clipped, and without initial conditions return torch::min(ones, torch::max(negones, padded_output_waveform)).slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1); } diff --git a/torchaudio/functional.py b/torchaudio/functional.py index e80abb4333..413d2b4ab5 100644 --- a/torchaudio/functional.py +++ b/torchaudio/functional.py @@ -529,7 +529,7 @@ def phase_vocoder(complex_specgrams, rate, phase_advance): def lfilter(waveform, a_coeffs, b_coeffs): # type: (Tensor, Tensor, Tensor) -> Tensor r""" - Performs an IIR filter by evaluating difference equation. Assumes difference equation is stable. + Performs an IIR filter by evaluating difference equation. Coefficients should be designed to be stable. Args: waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)`. Must be normalized to -1 to 1. From d0e694a39bdd0e763a860d9f9c53722eb1ff5881 Mon Sep 17 00:00:00 2001 From: Chuan Zhang Date: Wed, 18 Sep 2019 13:38:58 -0700 Subject: [PATCH 12/17] Tweaked docs for lowpass_biquad, highpass_biquad, biquad --- test/test_functional_lfilter_perf.py | 2 +- torchaudio/functional.py | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/test/test_functional_lfilter_perf.py b/test/test_functional_lfilter_perf.py index 63c1fbadde..8784d118b3 100644 --- a/test/test_functional_lfilter_perf.py +++ b/test/test_functional_lfilter_perf.py @@ -143,7 +143,7 @@ def time_and_output(func): print("CPP Element Wise Runtime : %10.6f s" % run_time_2) print("CPP Matrix Runtime : %10.6f s" % run_time_3) print("-" * 80) - print("Ratio Python / CPP ElementWise : %10.2f x" % (run_time_1/run_time_2)) + print("Ratio Python / CPP ElementWise : %10.2f x" % (run_time_1 / run_time_2)) if assertClose: # maxDeviation = torch.kthvalue(torch.abs(output_waveform_3- output_waveform_2), output_waveform_1.size(1)) diff --git a/torchaudio/functional.py b/torchaudio/functional.py index 413d2b4ab5..ac8102ade5 100644 --- a/torchaudio/functional.py +++ b/torchaudio/functional.py @@ -594,7 +594,8 @@ def biquad(waveform, b0, b1, b2, a0, a1, a2): https://en.wikipedia.org/wiki/Digital_biquad_filter Args: - waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)`. Currently only supports float32. + waveform (torch.Tensor): Audio waveform of dimension of `(n_channel, n_frames)`. + Currently only supports float32. Normalized [-1, 1] b0 (float): numerator coefficient of current input, x[n] b1 (float): numerator coefficient of input one time step ago x[n-1] b2 (float): numerator coefficient of input two time steps ago x[n-2] @@ -603,7 +604,7 @@ def biquad(waveform, b0, b1, b2, a0, a1, a2): a2 (float): denominator coefficient of current output y[n-2] Returns: - output_waveform (torch.Tensor): Dimension of `(n_channel, n_frames)` + output_waveform (torch.Tensor): Dimension of `(n_channel, n_frames)` in range [-1, 1] """ device = waveform.device @@ -626,13 +627,13 @@ def highpass_biquad(waveform, sample_rate, cutoff_freq, Q=0.707): r"""Designs biquad highpass filter and performs filtering. Similar to SoX implementation. Args: - waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)` + waveform (torch.Tensor): Reference `biquad` documentation. sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) cutoff_freq (float): filter cutoff frequency Q (float): https://en.wikipedia.org/wiki/Q_factor Returns: - output_waveform (torch.Tensor): Dimension of `(n_channel, n_frames)` + output_waveform (torch.Tensor): Reference `biquad` documentation. """ GAIN = 1 # TBD - add as a parameter @@ -653,15 +654,15 @@ def highpass_biquad(waveform, sample_rate, cutoff_freq, Q=0.707): def lowpass_biquad(waveform, sample_rate, cutoff_freq, Q=0.707): # type: (Tensor, int, float, Optional[float]) -> Tensor r"""Designs biquad lowpass filter and performs filtering. Similar to SoX implementation. - + Args: - waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)` + waveform (torch.Tensor): Reference `biquad` documentation sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) cutoff_freq (float): filter cutoff frequency Q (float): https://en.wikipedia.org/wiki/Q_factor Returns: - output_waveform (torch.Tensor): Dimension of `(n_channel, n_frames)` + output_waveform (torch.Tensor): Reference `biquad` documentation. """ GAIN = 1 From 4127096e7758d7a439880b4022d7dee21ae378d5 Mon Sep 17 00:00:00 2001 From: Chuan Zhang Date: Wed, 18 Sep 2019 14:43:52 -0700 Subject: [PATCH 13/17] Adding 40th order filter --- torchaudio/functional.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchaudio/functional.py b/torchaudio/functional.py index ac8102ade5..3aab718f70 100644 --- a/torchaudio/functional.py +++ b/torchaudio/functional.py @@ -654,7 +654,7 @@ def highpass_biquad(waveform, sample_rate, cutoff_freq, Q=0.707): def lowpass_biquad(waveform, sample_rate, cutoff_freq, Q=0.707): # type: (Tensor, int, float, Optional[float]) -> Tensor r"""Designs biquad lowpass filter and performs filtering. Similar to SoX implementation. - + Args: waveform (torch.Tensor): Reference `biquad` documentation sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz) From 64370914d438a56758718261080368ce91303c5f Mon Sep 17 00:00:00 2001 From: Chuan Zhang Date: Wed, 18 Sep 2019 15:03:40 -0700 Subject: [PATCH 14/17] adding 40th order filter --- test/test_functional_lfilter_perf.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/test/test_functional_lfilter_perf.py b/test/test_functional_lfilter_perf.py index 8784d118b3..7968dd5e89 100644 --- a/test/test_functional_lfilter_perf.py +++ b/test/test_functional_lfilter_perf.py @@ -110,6 +110,12 @@ def run_test(n_channels, n_frames, n_order_filter, assertClose=True): 0.8471764313073272, -0.11712354962357388, ] + elif n_order_filter == 40: + # Create random set of 40 coefficients, will not be stable, test runtime rather than + # correctness + a_coeffs = torch.rand(40).numpy() + b_coeffs = torch.rand(40).numpy() + a_coeffs[0] = 1 # Set a0 to 1.0 # Cast into Tensors a_coeffs = torch.tensor(a_coeffs, device="cpu", dtype=torch.float32) @@ -149,7 +155,7 @@ def time_and_output(func): # maxDeviation = torch.kthvalue(torch.abs(output_waveform_3- output_waveform_2), output_waveform_1.size(1)) assert torch.allclose(output_waveform_1, output_waveform_2, atol=3e-4) assert torch.allclose(output_waveform_2, output_waveform_3, atol=3e-4) - print("✓ - all outputs are identical") + print("PASS - all outputs are identical") print("-" * 80) def test_lfilter_cmp(self): @@ -173,6 +179,10 @@ def test_lfilter_cmp(self): self.run_test(2, 80000, 18, False) self.run_test(2, 800000, 18, False) + self.run_test(2, 8000, 40, False) + self.run_test(2, 80000, 40, False) + self.run_test(2, 800000, 40, False) + if __name__ == "__main__": unittest.main() From a056409f40464dc7690f89460f8f6420f0483214 Mon Sep 17 00:00:00 2001 From: Chuan Zhang Date: Wed, 18 Sep 2019 16:06:56 -0700 Subject: [PATCH 15/17] Applied cpplint to torchaudio/filtering.cpp --- torchaudio/filtering.cpp | 134 +++++++++++++++++++++++++-------------- 1 file changed, 85 insertions(+), 49 deletions(-) diff --git a/torchaudio/filtering.cpp b/torchaudio/filtering.cpp index 275db9ceaa..0f50189f81 100644 --- a/torchaudio/filtering.cpp +++ b/torchaudio/filtering.cpp @@ -12,7 +12,6 @@ at::Tensor lfilter_tensor_matrix( at::Tensor const & a_coeffs, at::Tensor const & b_coeffs ) { - // Assumptions - float32, waveform between -1 and 1 assert(waveform.dtype() == torch::kFloat32); @@ -21,42 +20,61 @@ at::Tensor lfilter_tensor_matrix( // Numerator and denominator coefficients shoudlb e same size assert(a_coeffs.size(0) == b_coeffs.size(0)); - int n_order = a_coeffs.size(0); // n'th order - 1 filter + int n_order = a_coeffs.size(0); // n'th order - 1 filter assert(n_order > 0); int64_t n_channels = waveform.size(0); int64_t n_frames = waveform.size(1); // Device options should mirror input waveform - auto options = torch::TensorOptions().dtype(waveform.dtype()).device(waveform.device().type()); + auto options = torch::TensorOptions() + .dtype(waveform.dtype()) + .device(waveform.device().type()); // Allocate padded input and output waveform, copy in the input waveform - at::Tensor padded_waveform = torch::zeros({n_channels, n_frames + n_order - 1}, options); - at::Tensor padded_output_waveform = torch::zeros({n_channels, n_frames + n_order - 1}, options); - padded_waveform.slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1) = waveform; + at::Tensor padded_waveform = + torch::zeros({n_channels, n_frames + n_order - 1}, options); + at::Tensor padded_output_waveform = + torch::zeros({n_channels, n_frames + n_order - 1}, options); + padded_waveform.slice(0, 0, n_channels) + .slice(1, n_order - 1, n_order + n_frames - 1) = waveform; // Create [n_order, n_channels] structure for a and b coefficients - at::Tensor a_coeffs_filled = at::transpose(at::flip(a_coeffs, {0,}).repeat({n_channels, 1}), 0, 1); - at::Tensor b_coeffs_filled = at::transpose(at::flip(b_coeffs, {0,}).repeat({n_channels, 1}), 0, 1); + at::Tensor a_coeffs_filled = + at::transpose(at::flip(a_coeffs, {0, }).repeat({n_channels, 1}), 0, 1); + at::Tensor b_coeffs_filled = + at::transpose(at::flip(b_coeffs, {0, }).repeat({n_channels, 1}), 0, 1); // few more temporary data structure at::Tensor o0 = torch::zeros({n_channels, 1}, options); - at::Tensor ones = torch::ones({n_channels, n_frames + n_order - 1}, options); - at::Tensor negones = torch::ones({n_channels, n_frames + n_order -1 }, options) * -1; - at::Tensor normalization_a0 = torch::unsqueeze(a_coeffs[0], {0,}).repeat({n_channels, 1}); + at::Tensor ones = + torch::ones({n_channels, n_frames + n_order - 1}, options); + at::Tensor negones = + torch::ones({n_channels, n_frames + n_order -1 }, options) * -1; + at::Tensor normalization_a0 = + torch::unsqueeze(a_coeffs[0], {0, }).repeat({n_channels, 1}); for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) { - // calculate the output at time i_frame for all channels - o0 = torch::zeros({n_channels, 1}, torch::TensorOptions().dtype(torch::kFloat32)); + o0 = torch::zeros({n_channels, 1}, + torch::TensorOptions().dtype(torch::kFloat32)); // time window of input and output, size [n_channels, n_order] - at::Tensor const & input_window = padded_waveform.slice(0, 0, n_channels).slice(1, i_frame, i_frame + n_order); - at::Tensor const & output_window = padded_output_waveform.slice(0, 0, n_channels).slice(1, i_frame, i_frame+ n_order); - - // matrix multiply to get [n_channels x n_channels], extract diagonal and unsqueeze to get [n_channels, 1] result for a frame - at::Tensor inp_result = torch::unsqueeze(torch::diag(torch::mm(input_window, b_coeffs_filled)), 1); - at::Tensor out_result = torch::unsqueeze(torch::diag(torch::mm(output_window, a_coeffs_filled)), 1); + at::Tensor const & input_window = + padded_waveform.slice(0, 0, n_channels) + .slice(1, i_frame, i_frame + n_order); + at::Tensor const & output_window = + padded_output_waveform.slice(0, 0, n_channels) + .slice(1, i_frame, i_frame+ n_order); + + // matrix multiply to get [n_channels x n_channels], + // extract diagonal and unsqueeze to get [n_channels, 1] result + at::Tensor inp_result = + torch::unsqueeze(torch::diag(torch::mm(input_window, + b_coeffs_filled)), 1); + at::Tensor out_result = + torch::unsqueeze(torch::diag(torch::mm(output_window, + a_coeffs_filled)), 1); o0.add_(inp_result); o0.sub_(out_result); @@ -65,11 +83,15 @@ at::Tensor lfilter_tensor_matrix( o0.div_(normalization_a0); // Set the output - padded_output_waveform.slice(0, 0, n_channels).slice(1, i_frame + n_order - 1, i_frame + n_order - 1 + 1) = o0; - + padded_output_waveform.slice(0, 0, n_channels) + .slice(1, + i_frame + n_order - 1, + i_frame + n_order - 1 + 1) = o0; } // return clipped, and without initial conditions - return torch::min(ones, torch::max(negones, padded_output_waveform)).slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1); + return torch::min(ones, torch::max(negones, padded_output_waveform)) + .slice(0, 0, n_channels) + .slice(1, n_order - 1, n_order + n_frames - 1); } at::Tensor lfilter_element_wise( @@ -77,7 +99,6 @@ at::Tensor lfilter_tensor_matrix( at::Tensor const & a_coeffs, at::Tensor const & b_coeffs ) { - // Assumptions - float32, waveform between -1 and 1 assert(waveform.dtype() == torch::kFloat32); @@ -86,62 +107,77 @@ at::Tensor lfilter_tensor_matrix( // Numerator and denominator coefficients shoudlb e same size assert(a_coeffs.size(0) == b_coeffs.size(0)); - int n_order = a_coeffs.size(0); // n'th order - 1 filter + int n_order = a_coeffs.size(0); // n'th order - 1 filter assert(n_order > 0); int64_t n_channels = waveform.size(0); int64_t n_frames = waveform.size(1); // Device options should mirror input waveform - auto options = torch::TensorOptions().dtype(waveform.dtype()).device(waveform.device().type()); + auto options = torch::TensorOptions() + .dtype(waveform.dtype()) + .device(waveform.device().type()); // Allocate padded input and output waveform, copy in the input waveform - at::Tensor padded_waveform = torch::zeros({n_channels, n_frames + n_order - 1}, options); - at::Tensor padded_output_waveform = torch::zeros({n_channels, n_frames + n_order - 1}, options); - padded_waveform.slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1) = waveform; + at::Tensor padded_waveform = + torch::zeros({n_channels, n_frames + n_order - 1}, options); + at::Tensor padded_output_waveform = + torch::zeros({n_channels, n_frames + n_order - 1}, options); + padded_waveform.slice(0, 0, n_channels) + .slice(1, n_order - 1, n_order + n_frames - 1) = waveform; // few more temporary data structure - at::Tensor o0 = torch::zeros({n_channels, 1}, options); - at::Tensor ones = torch::ones({n_channels, n_frames + n_order - 1}, options); - at::Tensor negones = torch::ones({n_channels, n_frames + n_order -1 }, options) * -1; - at::Tensor normalization_a0 = torch::unsqueeze(a_coeffs[0], {0,}).repeat({n_channels, 1}); // Device options should mirror input waveform + at::Tensor o0 = + torch::zeros({n_channels, 1}, options); + at::Tensor ones = + torch::ones({n_channels, n_frames + n_order - 1}, options); + at::Tensor negones = + torch::ones({n_channels, n_frames + n_order -1 }, options) * -1; + at::Tensor normalization_a0 = + torch::unsqueeze(a_coeffs[0], {0, }).repeat({n_channels, 1}); + + // Create accessors for fast access + // https://pytorch.org/cppdocs/notes/tensor_basics.html - // Create accessors for fast access - https://pytorch.org/cppdocs/notes/tensor_basics.html#efficient-access-to-tensor-elements // CPU - auto input_accessor = padded_waveform.accessor(); - auto output_accessor = padded_output_waveform.accessor(); - auto a_coeffs_accessor = a_coeffs.accessor(); - auto b_coeffs_accessor = b_coeffs.accessor(); + auto input_accessor = padded_waveform.accessor(); + auto output_accessor = padded_output_waveform.accessor(); + auto a_coeffs_accessor = a_coeffs.accessor(); + auto b_coeffs_accessor = b_coeffs.accessor(); // CUDA - TBD - //auto input_accessor = waveform.packed_accessor64(); - //auto output_accessor = output_waveform.packed_accessor64(); - //auto a_coeffs_accessor = a_coeffs.packed_accessor64(); - //auto b_coeffs_accessor = b_coeffs.packed_accessor64(); - - for (int64_t i_channel = 0; i_channel < n_channels; ++i_channel) { + // auto input_accessor = waveform.packed_accessor64(); + // auto output_accessor = output_waveform.packed_accessor64(); + // auto a_coeffs_accessor = a_coeffs.packed_accessor64(); + // auto b_coeffs_accessor = b_coeffs.packed_accessor64(); + for (int64_t i_channel = 0; i_channel < n_channels; ++i_channel) { for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) { - // execute the difference equation float o0 = 0; for (int i_offset = 0; i_offset < n_order; ++i_offset) { - o0 += input_accessor[i_channel][i_frame + i_offset] * b_coeffs_accessor[n_order - i_offset - 1]; - o0 -= output_accessor[i_channel][i_frame + i_offset] * a_coeffs_accessor[n_order - i_offset - 1]; + o0 += input_accessor[i_channel][i_frame + i_offset] * + b_coeffs_accessor[n_order - i_offset - 1]; + o0 -= output_accessor[i_channel][i_frame + i_offset] * + a_coeffs_accessor[n_order - i_offset - 1]; } o0 = o0 / a_coeffs_accessor[0]; // put back into the main data structure output_accessor[i_channel][i_frame + n_order - 1] = o0; - } } // return clipped, and without initial conditions - return torch::min(ones, torch::max(negones, padded_output_waveform)).slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1); + return torch::min(ones, + torch::max(negones, + padded_output_waveform)) + .slice(0, 0, n_channels) + .slice(1, n_order - 1, n_order + n_frames - 1); } -}} +} // namespace audio +} // namespace torch PYBIND11_MODULE(_torch_filtering, m) { From bc6494013a8a91e3d8325a7e51ad2c36c6fd082b Mon Sep 17 00:00:00 2001 From: Chuan Zhang Date: Thu, 19 Sep 2019 12:01:07 -0700 Subject: [PATCH 16/17] refactor to use python wrapper --- test/test_functional_lfilter_perf.py | 21 +++-- torchaudio/filtering.cpp | 133 +++++---------------------- torchaudio/functional.py | 73 +++++++++++++++ 3 files changed, 112 insertions(+), 115 deletions(-) diff --git a/test/test_functional_lfilter_perf.py b/test/test_functional_lfilter_perf.py index 7968dd5e89..4d8301a287 100644 --- a/test/test_functional_lfilter_perf.py +++ b/test/test_functional_lfilter_perf.py @@ -6,8 +6,7 @@ import unittest import common_utils -from torchaudio.functional import lfilter -from _torch_filtering import _lfilter_tensor_matrix, _lfilter_element_wise +from torchaudio.functional import lfilter, lfilter_cpp_impl class TestFunctionalLFilterPerformance(unittest.TestCase): @@ -16,8 +15,6 @@ class TestFunctionalLFilterPerformance(unittest.TestCase): @staticmethod def run_test(n_channels, n_frames, n_order_filter, assertClose=True): waveform = torch.rand(n_channels, n_frames, device="cpu") - b_coeffs = torch.rand(n_order_filter, dtype=torch.float32, device="cpu") - a_coeffs = torch.rand(n_order_filter, dtype=torch.float32, device="cpu") if n_order_filter == 8: # Eighth Order Filter @@ -133,10 +130,10 @@ def time_and_output(func): lambda: lfilter(waveform, a_coeffs, b_coeffs) ) (output_waveform_2, run_time_2) = time_and_output( - lambda: _lfilter_element_wise(waveform, a_coeffs, b_coeffs) + lambda: lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, 'element_wise') ) (output_waveform_3, run_time_3) = time_and_output( - lambda: _lfilter_tensor_matrix(waveform, a_coeffs, b_coeffs) + lambda: lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, 'matrix') ) print("-" * 80) @@ -152,12 +149,22 @@ def time_and_output(func): print("Ratio Python / CPP ElementWise : %10.2f x" % (run_time_1 / run_time_2)) if assertClose: - # maxDeviation = torch.kthvalue(torch.abs(output_waveform_3- output_waveform_2), output_waveform_1.size(1)) + # maxDeviation = torch.kthvalue(torch.abs(output_waveform_2- output_waveform_1), output_waveform_1.size(1)) + assert torch.allclose(output_waveform_1, output_waveform_2, atol=3e-4) assert torch.allclose(output_waveform_2, output_waveform_3, atol=3e-4) print("PASS - all outputs are identical") print("-" * 80) + def test_cpp_lfilter_runs(self): + waveform = torch.rand(2, 1000) + b_coeffs = torch.rand(2, dtype=torch.float32) + a_coeffs = torch.rand(2, dtype=torch.float32) + a_coeffs[0] = 1 + + output_waveform = lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, 'element_wise') + output_waveform = lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, 'matrix') + def test_lfilter_cmp(self): """ Runs comparison on CPU diff --git a/torchaudio/filtering.cpp b/torchaudio/filtering.cpp index 0f50189f81..16225d4b92 100644 --- a/torchaudio/filtering.cpp +++ b/torchaudio/filtering.cpp @@ -1,5 +1,5 @@ #include - +// TBD - for CUDA support #include // TBD - Compile on CUDA // TBD - Expand to other data types outside float32? @@ -7,52 +7,17 @@ namespace torch { namespace audio { -at::Tensor lfilter_tensor_matrix( - at::Tensor const & waveform, - at::Tensor const & a_coeffs, - at::Tensor const & b_coeffs +void _lfilter_tensor_matrix( + at::Tensor const & padded_waveform, + at::Tensor & padded_output_waveform, + at::Tensor const & a_coeffs_filled, + at::Tensor const & b_coeffs_filled, + at::Tensor & o0, + at::Tensor const & normalization_a0 ) { - // Assumptions - float32, waveform between -1 and 1 - assert(waveform.dtype() == torch::kFloat32); - - // CPU only for now - Implement for GPU in future - assert(waveform.device().type() == torch::kCPU); - - // Numerator and denominator coefficients shoudlb e same size - assert(a_coeffs.size(0) == b_coeffs.size(0)); - int n_order = a_coeffs.size(0); // n'th order - 1 filter - assert(n_order > 0); - - int64_t n_channels = waveform.size(0); - int64_t n_frames = waveform.size(1); - - // Device options should mirror input waveform - auto options = torch::TensorOptions() - .dtype(waveform.dtype()) - .device(waveform.device().type()); - - // Allocate padded input and output waveform, copy in the input waveform - at::Tensor padded_waveform = - torch::zeros({n_channels, n_frames + n_order - 1}, options); - at::Tensor padded_output_waveform = - torch::zeros({n_channels, n_frames + n_order - 1}, options); - padded_waveform.slice(0, 0, n_channels) - .slice(1, n_order - 1, n_order + n_frames - 1) = waveform; - - // Create [n_order, n_channels] structure for a and b coefficients - at::Tensor a_coeffs_filled = - at::transpose(at::flip(a_coeffs, {0, }).repeat({n_channels, 1}), 0, 1); - at::Tensor b_coeffs_filled = - at::transpose(at::flip(b_coeffs, {0, }).repeat({n_channels, 1}), 0, 1); - - // few more temporary data structure - at::Tensor o0 = torch::zeros({n_channels, 1}, options); - at::Tensor ones = - torch::ones({n_channels, n_frames + n_order - 1}, options); - at::Tensor negones = - torch::ones({n_channels, n_frames + n_order -1 }, options) * -1; - at::Tensor normalization_a0 = - torch::unsqueeze(a_coeffs[0], {0, }).repeat({n_channels, 1}); + int n_order = a_coeffs_filled.size(0); + int n_frames = padded_waveform.size(1) - n_order + 1; + int n_channels = padded_waveform.size(0); for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) { // calculate the output at time i_frame for all channels @@ -88,69 +53,28 @@ at::Tensor lfilter_tensor_matrix( i_frame + n_order - 1, i_frame + n_order - 1 + 1) = o0; } - // return clipped, and without initial conditions - return torch::min(ones, torch::max(negones, padded_output_waveform)) - .slice(0, 0, n_channels) - .slice(1, n_order - 1, n_order + n_frames - 1); } - at::Tensor lfilter_element_wise( - at::Tensor const & waveform, + + + void _lfilter_element_wise( + at::Tensor const & padded_waveform, + at::Tensor & padded_output_waveform, at::Tensor const & a_coeffs, - at::Tensor const & b_coeffs + at::Tensor const & b_coeffs, + at::Tensor & o0 ) { - // Assumptions - float32, waveform between -1 and 1 - assert(waveform.dtype() == torch::kFloat32); - - // CPU only for now - Implement for GPU in future - assert(waveform.device().type() == torch::kCPU); - - // Numerator and denominator coefficients shoudlb e same size - assert(a_coeffs.size(0) == b_coeffs.size(0)); - int n_order = a_coeffs.size(0); // n'th order - 1 filter - assert(n_order > 0); - - int64_t n_channels = waveform.size(0); - int64_t n_frames = waveform.size(1); - - // Device options should mirror input waveform - auto options = torch::TensorOptions() - .dtype(waveform.dtype()) - .device(waveform.device().type()); - - // Allocate padded input and output waveform, copy in the input waveform - at::Tensor padded_waveform = - torch::zeros({n_channels, n_frames + n_order - 1}, options); - at::Tensor padded_output_waveform = - torch::zeros({n_channels, n_frames + n_order - 1}, options); - padded_waveform.slice(0, 0, n_channels) - .slice(1, n_order - 1, n_order + n_frames - 1) = waveform; - - // few more temporary data structure - at::Tensor o0 = - torch::zeros({n_channels, 1}, options); - at::Tensor ones = - torch::ones({n_channels, n_frames + n_order - 1}, options); - at::Tensor negones = - torch::ones({n_channels, n_frames + n_order -1 }, options) * -1; - at::Tensor normalization_a0 = - torch::unsqueeze(a_coeffs[0], {0, }).repeat({n_channels, 1}); - - // Create accessors for fast access - // https://pytorch.org/cppdocs/notes/tensor_basics.html + int n_order = a_coeffs.size(0); + int n_frames = padded_waveform.size(1) - n_order + 1; + int n_channels = padded_waveform.size(0); - // CPU + // Create CPU accessors for fast access + // https://pytorch.org/cppdocs/notes/tensor_basics.html auto input_accessor = padded_waveform.accessor(); auto output_accessor = padded_output_waveform.accessor(); auto a_coeffs_accessor = a_coeffs.accessor(); auto b_coeffs_accessor = b_coeffs.accessor(); - // CUDA - TBD - // auto input_accessor = waveform.packed_accessor64(); - // auto output_accessor = output_waveform.packed_accessor64(); - // auto a_coeffs_accessor = a_coeffs.packed_accessor64(); - // auto b_coeffs_accessor = b_coeffs.packed_accessor64(); - for (int64_t i_channel = 0; i_channel < n_channels; ++i_channel) { for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) { // execute the difference equation @@ -167,13 +91,6 @@ at::Tensor lfilter_tensor_matrix( output_accessor[i_channel][i_frame + n_order - 1] = o0; } } - - // return clipped, and without initial conditions - return torch::min(ones, - torch::max(negones, - padded_output_waveform)) - .slice(0, 0, n_channels) - .slice(1, n_order - 1, n_order + n_frames - 1); } } // namespace audio @@ -185,10 +102,10 @@ PYBIND11_MODULE(_torch_filtering, m) { options.disable_function_signatures(); m.def( "_lfilter_tensor_matrix", - &torch::audio::lfilter_tensor_matrix, + &torch::audio::_lfilter_tensor_matrix, "Executes difference equation with tensor"); m.def( "_lfilter_element_wise", - &torch::audio::lfilter_element_wise, + &torch::audio::_lfilter_element_wise, "Executes difference equation with tensor"); } diff --git a/torchaudio/functional.py b/torchaudio/functional.py index 3aab718f70..c3bf904684 100644 --- a/torchaudio/functional.py +++ b/torchaudio/functional.py @@ -1,6 +1,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals import math import torch +from _torch_filtering import _lfilter_tensor_matrix, _lfilter_element_wise __all__ = [ "istft", @@ -15,6 +16,7 @@ "magphase", "phase_vocoder", "lfilter", + "lfilter_cpp_impl", "lowpass_biquad", "highpass_biquad", "biquad", @@ -588,6 +590,77 @@ def lfilter(waveform, a_coeffs, b_coeffs): return torch.min(ones, torch.max(ones * -1, padded_output_waveform[:, (n_order - 1):])) +def lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, execution_method): + # type: (Tensor, Tensor, Tensor, string) -> Tensor + r""" + See lfilter documentation. Execution method can be either `element_wise` or `matrix`. + """ + + # Perform sanity checks, input check, and memory allocation in python + + # Current limitations to be removed in future + assert(waveform.dtype == torch.float32) + assert(waveform.device.type == 'cpu') + assert(waveform.device == a_coeffs.device) + assert(waveform.device == b_coeffs.device) + + # Use these parameters for any calculations + device = waveform.device + dtype = waveform.dtype + + # Check filter orders + n_order = a_coeffs.size(0) + assert(b_coeffs.size(0) == n_order) + assert(n_order > 0) + assert(a_coeffs[0] != 0) # a0 coeff for y[n] can not be 0 + + # Check waveform size + assert(len(waveform.size()) == 2) + n_channels = waveform.size(0) + n_frames = waveform.size(1) + n_padded_frames = n_frames + n_order - 1 + + # Allocate temporary data structures + # First, the input waveform should be padded by the order of the filter + # N.B. Should we look to how we can avoid copying + padded_waveform = torch.zeros(n_channels, n_padded_frames, dtype=dtype, device=device) + padded_output_waveform = torch.zeros(n_channels, n_padded_frames, dtype=dtype, device=device) + padded_waveform[:, (n_order - 1):] = waveform + + # More temporary data structures + o0 = torch.zeros(n_channels, 1, dtype=dtype, device=device) + normalization_a0 = a_coeffs[0].unsqueeze(0).repeat(n_channels, 1) + ones = torch.ones(n_channels, n_padded_frames, dtype=dtype, device=device) + + # Run through assertion size checks + assert(normalization_a0.size(0) == n_channels) + + if execution_method == 'element_wise': + _lfilter_element_wise(padded_waveform, + padded_output_waveform, + a_coeffs, + b_coeffs, + o0 + ) + elif execution_method == 'matrix': + # From [n_order] a_coeffs, create a [n_channel, n_order] tensor + # lowest order coefficients e.g. a0, b0 should be at the "bottom" + # used for matrix multiply later + a_coeffs_filled = a_coeffs.flip(0).repeat(n_channels, 1).t() + b_coeffs_filled = b_coeffs.flip(0).repeat(n_channels, 1).t() + _lfilter_tensor_matrix(padded_waveform, + padded_output_waveform, + a_coeffs_filled, + b_coeffs_filled, + o0, + normalization_a0, + ) + else: + raise Exception("invalid lfilter execution method %s" % execution_method) + + return torch.min(ones, torch.max(ones * -1, padded_output_waveform))[:, (n_order - 1):] + + def biquad(waveform, b0, b1, b2, a0, a1, a2): # type: (Tensor, float, float, float, float, float, float) -> Tensor r"""Performs a biquad filter of input tensor. Initial conditions set to 0. From a982c5f48494afb82c5ab71ad94ee4ef3cc2716f Mon Sep 17 00:00:00 2001 From: Chuan Zhang Date: Sat, 21 Sep 2019 17:54:29 -0400 Subject: [PATCH 17/17] rebased and added template for element wise lfilter --- test/test_functional_lfilter_perf.py | 11 +++++-- torchaudio/filtering.cpp | 43 ++++++++++++++-------------- torchaudio/functional.py | 27 +++++++++++------ 3 files changed, 49 insertions(+), 32 deletions(-) diff --git a/test/test_functional_lfilter_perf.py b/test/test_functional_lfilter_perf.py index 4d8301a287..ec9d9b2ee4 100644 --- a/test/test_functional_lfilter_perf.py +++ b/test/test_functional_lfilter_perf.py @@ -149,21 +149,26 @@ def time_and_output(func): print("Ratio Python / CPP ElementWise : %10.2f x" % (run_time_1 / run_time_2)) if assertClose: - # maxDeviation = torch.kthvalue(torch.abs(output_waveform_2- output_waveform_1), output_waveform_1.size(1)) - + # maxDeviation = torch.kthvalue(torch.abs(output_waveform_2- output_waveform_3), output_waveform_1.size(1)) assert torch.allclose(output_waveform_1, output_waveform_2, atol=3e-4) assert torch.allclose(output_waveform_2, output_waveform_3, atol=3e-4) print("PASS - all outputs are identical") print("-" * 80) def test_cpp_lfilter_runs(self): - waveform = torch.rand(2, 1000) + waveform = torch.rand(2, 1000, dtype=torch.float32) b_coeffs = torch.rand(2, dtype=torch.float32) a_coeffs = torch.rand(2, dtype=torch.float32) a_coeffs[0] = 1 + double_waveform = torch.rand(2, 1000, dtype=torch.float64) + double_b_coeffs = torch.rand(5, dtype=torch.float64) + double_a_coeffs = torch.rand(5, dtype=torch.float64) + double_a_coeffs[0] = 1 + output_waveform = lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, 'element_wise') output_waveform = lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, 'matrix') + output_waveform = lfilter_cpp_impl(double_waveform, double_a_coeffs, double_b_coeffs, 'element_wise') def test_lfilter_cmp(self): """ diff --git a/torchaudio/filtering.cpp b/torchaudio/filtering.cpp index 16225d4b92..61f1ec8ca0 100644 --- a/torchaudio/filtering.cpp +++ b/torchaudio/filtering.cpp @@ -1,7 +1,6 @@ #include // TBD - for CUDA support #include // TBD - Compile on CUDA -// TBD - Expand to other data types outside float32? namespace torch { namespace audio { @@ -15,14 +14,13 @@ void _lfilter_tensor_matrix( at::Tensor & o0, at::Tensor const & normalization_a0 ) { - int n_order = a_coeffs_filled.size(0); - int n_frames = padded_waveform.size(1) - n_order + 1; - int n_channels = padded_waveform.size(0); + int64_t n_order = a_coeffs_filled.size(0); + int64_t n_frames = padded_waveform.size(1) - n_order + 1; + int64_t n_channels = padded_waveform.size(0); for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) { - // calculate the output at time i_frame for all channels - o0 = torch::zeros({n_channels, 1}, - torch::TensorOptions().dtype(torch::kFloat32)); + // reset all o0 + o0.fill_(0.0); // time window of input and output, size [n_channels, n_order] at::Tensor const & input_window = @@ -55,30 +53,29 @@ void _lfilter_tensor_matrix( } } - - + template void _lfilter_element_wise( at::Tensor const & padded_waveform, at::Tensor & padded_output_waveform, at::Tensor const & a_coeffs, - at::Tensor const & b_coeffs, - at::Tensor & o0 + at::Tensor const & b_coeffs ) { - int n_order = a_coeffs.size(0); - int n_frames = padded_waveform.size(1) - n_order + 1; - int n_channels = padded_waveform.size(0); + int64_t n_order = a_coeffs.size(0); + int64_t n_frames = padded_waveform.size(1) - n_order + 1; + int64_t n_channels = padded_waveform.size(0); // Create CPU accessors for fast access // https://pytorch.org/cppdocs/notes/tensor_basics.html - auto input_accessor = padded_waveform.accessor(); - auto output_accessor = padded_output_waveform.accessor(); - auto a_coeffs_accessor = a_coeffs.accessor(); - auto b_coeffs_accessor = b_coeffs.accessor(); + auto input_accessor = padded_waveform.accessor(); + auto output_accessor = padded_output_waveform.accessor(); + auto a_coeffs_accessor = a_coeffs.accessor(); + auto b_coeffs_accessor = b_coeffs.accessor(); + T o0; for (int64_t i_channel = 0; i_channel < n_channels; ++i_channel) { for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) { // execute the difference equation - float o0 = 0; + o0 = 0; for (int i_offset = 0; i_offset < n_order; ++i_offset) { o0 += input_accessor[i_channel][i_frame + i_offset] * b_coeffs_accessor[n_order - i_offset - 1]; @@ -105,7 +102,11 @@ PYBIND11_MODULE(_torch_filtering, m) { &torch::audio::_lfilter_tensor_matrix, "Executes difference equation with tensor"); m.def( - "_lfilter_element_wise", - &torch::audio::_lfilter_element_wise, + "_lfilter_element_wise_float", + &torch::audio::_lfilter_element_wise, + "Executes difference equation with tensor"); + m.def( + "_lfilter_element_wise_double", + &torch::audio::_lfilter_element_wise, "Executes difference equation with tensor"); } diff --git a/torchaudio/functional.py b/torchaudio/functional.py index c3bf904684..84b2e042ee 100644 --- a/torchaudio/functional.py +++ b/torchaudio/functional.py @@ -1,7 +1,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals import math import torch -from _torch_filtering import _lfilter_tensor_matrix, _lfilter_element_wise +from _torch_filtering import _lfilter_tensor_matrix, _lfilter_element_wise_float, _lfilter_element_wise_double __all__ = [ "istft", @@ -599,10 +599,13 @@ def lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, execution_method): # Perform sanity checks, input check, and memory allocation in python # Current limitations to be removed in future - assert(waveform.dtype == torch.float32) + assert(waveform.dtype == torch.float32 or waveform.dtype == torch.float64) assert(waveform.device.type == 'cpu') + assert(waveform.device == a_coeffs.device) assert(waveform.device == b_coeffs.device) + assert(waveform.dtype == a_coeffs.dtype) + assert(waveform.dtype == b_coeffs.dtype) # Use these parameters for any calculations device = waveform.device @@ -636,12 +639,20 @@ def lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, execution_method): assert(normalization_a0.size(0) == n_channels) if execution_method == 'element_wise': - _lfilter_element_wise(padded_waveform, - padded_output_waveform, - a_coeffs, - b_coeffs, - o0 - ) + if (dtype == torch.float32): + _lfilter_element_wise_float(padded_waveform, + padded_output_waveform, + a_coeffs, + b_coeffs, + ) + elif (dtype == torch.float64): + _lfilter_element_wise_double(padded_waveform, + padded_output_waveform, + a_coeffs, + b_coeffs, + ) + else: + raise Exception("lfilter not supported for type ", dtype) elif execution_method == 'matrix': # From [n_order] a_coeffs, create a [n_channel, n_order] tensor # lowest order coefficients e.g. a0, b0 should be at the "bottom"