From a6703e091e00241a0bb2c84bc5aed010c98270a9 Mon Sep 17 00:00:00 2001
From: Chuan Zhang <engineerchuan@gmail.com>
Date: Wed, 18 Sep 2019 12:25:06 -0700
Subject: [PATCH 01/17] adding test

---
 test/test_functional_lfilter_perf.py | 131 +++++++++++++++++++++++++++
 1 file changed, 131 insertions(+)
 create mode 100644 test/test_functional_lfilter_perf.py

diff --git a/test/test_functional_lfilter_perf.py b/test/test_functional_lfilter_perf.py
new file mode 100644
index 0000000000..0030541277
--- /dev/null
+++ b/test/test_functional_lfilter_perf.py
@@ -0,0 +1,131 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+import math
+import os
+import torch
+import torchaudio
+import unittest
+import common_utils
+
+from torchaudio.functional import lfilter
+from _torch_filtering import (
+    _lfilter_tensor_matrix,
+    _lfilter_tensor,
+    _lfilter_element_wise,
+)
+
+
+class TestFunctionalLFilterPerformance(unittest.TestCase):
+    test_dirpath, test_dir = common_utils.create_temp_assets_dir()
+
+    @staticmethod
+    def run_test(n_channels, n_frames, n_order_filter):
+        waveform = torch.rand(n_channels, n_frames, device="cpu")
+        b_coeffs = torch.rand(n_order_filter, dtype=torch.float32, device="cpu")
+        a_coeffs = torch.rand(n_order_filter, dtype=torch.float32, device="cpu")
+
+        if n_order_filter == 8:
+            # Eighth Order Filter
+            #  >>> import scipy.signal
+            #  >>>  wp = 0.3
+            #  >>>  ws = 0.5
+            #  >>>  gpass = 1
+            #  >>>  gstop = 100
+            #  >>>  b, a = scipy.signal.iirdesign(wp, ws, gpass, gstop)
+            b_coeffs = [
+                0.0006544487997063485,
+                0.001669274889397942,
+                0.003218714446315984,
+                0.004222562499298002,
+                0.004222562499298002,
+                0.0032187144463159834,
+                0.0016692748893979413,
+                0.0006544487997063485,
+            ]
+            a_coeffs = [
+                1.0,
+                -4.67403506662255,
+                10.516336803850786,
+                -14.399207825856776,
+                12.844181702707655,
+                -7.43604712843608,
+                2.5888616732696077,
+                -0.4205601576432048,
+            ]
+        elif n_order_filter == 5:
+            # Fifth Order Filter
+            # >>> import scipy.signal
+            # >>> wp = 0.3, ws = 0.5, gpass = 1, gstop = 40
+            # >>> b, a = scipy.signal.iirdesign(wp, ws, gpass, gstop)
+            b_coeffs = [
+                0.0353100066384039,
+                0.023370652985988206,
+                0.0560524973457262,
+                0.023370652985988193,
+                0.03531000663840389,
+            ]
+            a_coeffs = [
+                1.0,
+                -2.321010052951366,
+                2.677193357612127,
+                -1.5774235418173692,
+                0.4158137396065854,
+            ]
+
+        # Cast into Tensors
+        a_coeffs = torch.tensor(a_coeffs, device="cpu", dtype=torch.float32)
+        b_coeffs = torch.tensor(b_coeffs, device="cpu", dtype=torch.float32)
+
+        def time_and_output(func):
+            import time
+
+            st = time.time()
+            output = func()
+            run_time = time.time() - st
+            return (output, run_time)
+
+        (output_waveform_1, run_time_1) = time_and_output(
+            lambda: lfilter(waveform, a_coeffs, b_coeffs)
+        )
+        (output_waveform_2, run_time_2) = time_and_output(
+            lambda: _lfilter_element_wise(waveform, a_coeffs, b_coeffs)
+        )
+        (output_waveform_3, run_time_3) = time_and_output(
+            lambda: _lfilter_tensor_matrix(waveform, a_coeffs, b_coeffs)
+        )
+
+        print("-" * 80)
+        print("Evaluating Runtime between lfilter implementations")
+        print("-" * 80)
+        print(
+            "Data Size: [%d x %d], Filter Order: %d"
+            % (waveform.size(0), waveform.size(1), a_coeffs.size(0))
+        )
+        print("-" * 80)
+        print("Python Matrix Runtime [current]: %10.6f s" % run_time_1)
+        print("CPP Element Wise Runtime       : %10.6f s" % run_time_2)
+        print("CPP Matrix Runtime             : %10.6f s" % run_time_3)
+
+        # maxDeviation = torch.kthvalue(torch.abs(output_waveform_1- output_waveform_2), output_waveform_1.size(1))
+
+        assert torch.allclose(output_waveform_1, output_waveform_2, atol=3e-4)
+        assert torch.allclose(output_waveform_1, output_waveform_3, atol=3e-4)
+        print("-" * 80)
+        print("✓ - all outputs are identical")
+        print("-" * 80)
+
+    def test_lfilter_cmp(self):
+        """
+        Runs comparison on CPU
+        """
+
+        torch.random.manual_seed(423)
+        self.run_test(2, 8000, 5)
+        self.run_test(2, 80000, 5)
+        self.run_test(2, 800000, 5)
+        self.run_test(2, 8000, 8)
+        self.run_test(2, 80000, 8)
+        self.run_test(2, 800000, 8)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 16b7cb2a0bf8591869887a71bf06116f0de83999 Mon Sep 17 00:00:00 2001
From: Chuan Zhang <engineerchuan@gmail.com>
Date: Wed, 18 Sep 2019 13:27:10 -0700
Subject: [PATCH 02/17] added cpp lfilter implementation

---
 setup.py                             |   8 ++
 test/test_functional_lfilter_perf.py |  79 ++++++++++---
 torchaudio/filtering.cpp             | 159 +++++++++++++++++++++++++++
 torchaudio/functional.py             |   4 +-
 4 files changed, 232 insertions(+), 18 deletions(-)
 create mode 100644 torchaudio/filtering.cpp

diff --git a/setup.py b/setup.py
index 9ed91f2468..4822a6e9d0 100644
--- a/setup.py
+++ b/setup.py
@@ -113,6 +113,14 @@ def check_env_flag(name, default=''):
             extra_compile_args=eca,
             extra_objects=extra_objects,
             extra_link_args=ela),
+        CppExtension(
+            '_torch_filtering',
+            ['torchaudio/filtering.cpp'],
+            libraries=libraries,
+            include_dirs=include_dirs,
+            extra_compile_args=eca,
+            extra_objects=extra_objects,
+            extra_link_args=ela),
     ],
     cmdclass={'build_ext': BuildExtension},
     install_requires=[pytorch_package_dep]
diff --git a/test/test_functional_lfilter_perf.py b/test/test_functional_lfilter_perf.py
index 0030541277..63c1fbadde 100644
--- a/test/test_functional_lfilter_perf.py
+++ b/test/test_functional_lfilter_perf.py
@@ -7,18 +7,14 @@
 import common_utils
 
 from torchaudio.functional import lfilter
-from _torch_filtering import (
-    _lfilter_tensor_matrix,
-    _lfilter_tensor,
-    _lfilter_element_wise,
-)
+from _torch_filtering import _lfilter_tensor_matrix, _lfilter_element_wise
 
 
 class TestFunctionalLFilterPerformance(unittest.TestCase):
     test_dirpath, test_dir = common_utils.create_temp_assets_dir()
 
     @staticmethod
-    def run_test(n_channels, n_frames, n_order_filter):
+    def run_test(n_channels, n_frames, n_order_filter, assertClose=True):
         waveform = torch.rand(n_channels, n_frames, device="cpu")
         b_coeffs = torch.rand(n_order_filter, dtype=torch.float32, device="cpu")
         a_coeffs = torch.rand(n_order_filter, dtype=torch.float32, device="cpu")
@@ -70,6 +66,50 @@ def run_test(n_channels, n_frames, n_order_filter):
                 -1.5774235418173692,
                 0.4158137396065854,
             ]
+        elif n_order_filter == 18:
+            # >>> import scipy.signal
+            # >>> wp = 0.48, ws = 0.5, gpass = 0.2, gstop = 120
+            # >>> b, a = scipy.signal.iirdesign(wp, ws, gpass, gstop)
+            b_coeffs = [
+                0.0006050813536446144,
+                0.002920916369302935,
+                0.010247568347759453,
+                0.02591236698507957,
+                0.05390501051935878,
+                0.09344581172781004,
+                0.13951533321139883,
+                0.1808658576803922,
+                0.2056643061895918,
+                0.2056643061895911,
+                0.1808658576803912,
+                0.13951533321139847,
+                0.09344581172781012,
+                0.053905010519358885,
+                0.02591236698507962,
+                0.010247568347759466,
+                0.0029209163693029367,
+                0.0006050813536446148,
+            ]
+            a_coeffs = [
+                1.0,
+                -4.3964136877356745,
+                14.650181359641305,
+                -34.45816395187684,
+                67.18247518997862,
+                -108.01956225077998,
+                149.4332056661277,
+                -178.07791467502364,
+                185.28267044557634,
+                -168.13382659655514,
+                133.22364764531704,
+                -91.59439958870928,
+                54.15835239046956,
+                -27.090521914173934,
+                11.163677645454127,
+                -3.627296054625132,
+                0.8471764313073272,
+                -0.11712354962357388,
+            ]
 
         # Cast into Tensors
         a_coeffs = torch.tensor(a_coeffs, device="cpu", dtype=torch.float32)
@@ -93,25 +133,24 @@ def time_and_output(func):
             lambda: _lfilter_tensor_matrix(waveform, a_coeffs, b_coeffs)
         )
 
-        print("-" * 80)
-        print("Evaluating Runtime between lfilter implementations")
         print("-" * 80)
         print(
-            "Data Size: [%d x %d], Filter Order: %d"
+            "lfilter perf - Data Size: [%d x %d], Filter Order: %d"
             % (waveform.size(0), waveform.size(1), a_coeffs.size(0))
         )
         print("-" * 80)
         print("Python Matrix Runtime [current]: %10.6f s" % run_time_1)
         print("CPP Element Wise Runtime       : %10.6f s" % run_time_2)
         print("CPP Matrix Runtime             : %10.6f s" % run_time_3)
-
-        # maxDeviation = torch.kthvalue(torch.abs(output_waveform_1- output_waveform_2), output_waveform_1.size(1))
-
-        assert torch.allclose(output_waveform_1, output_waveform_2, atol=3e-4)
-        assert torch.allclose(output_waveform_1, output_waveform_3, atol=3e-4)
-        print("-" * 80)
-        print("✓ - all outputs are identical")
         print("-" * 80)
+        print("Ratio Python / CPP ElementWise : %10.2f x" % (run_time_1/run_time_2))
+
+        if assertClose:
+            # maxDeviation = torch.kthvalue(torch.abs(output_waveform_3- output_waveform_2), output_waveform_1.size(1))
+            assert torch.allclose(output_waveform_1, output_waveform_2, atol=3e-4)
+            assert torch.allclose(output_waveform_2, output_waveform_3, atol=3e-4)
+            print("✓ - all outputs are identical")
+            print("-" * 80)
 
     def test_lfilter_cmp(self):
         """
@@ -126,6 +165,14 @@ def test_lfilter_cmp(self):
         self.run_test(2, 80000, 8)
         self.run_test(2, 800000, 8)
 
+        # For higher order filters, due to floating point precision
+        #  matrix method and element method can get different results depending on order of operations
+        # Also, for longer signals and higher filters, easier to create unstable filter
+        # https://dsp.stackexchange.com/questions/54386/relation-between-order-and-stability-in-iir-filter
+        self.run_test(2, 8000, 18, False)
+        self.run_test(2, 80000, 18, False)
+        self.run_test(2, 800000, 18, False)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/torchaudio/filtering.cpp b/torchaudio/filtering.cpp
new file mode 100644
index 0000000000..aa60a8a818
--- /dev/null
+++ b/torchaudio/filtering.cpp
@@ -0,0 +1,159 @@
+#include <torch/extension.h>
+
+// TBD - Compile on CUDA
+// TBD - Expand to other data types outside float32?
+
+namespace torch {
+namespace audio {
+
+
+at::Tensor lfilter_tensor_matrix(
+    at::Tensor const & waveform,
+    at::Tensor const & a_coeffs,
+    at::Tensor const & b_coeffs
+  ) {
+
+    // Assumptions - float32, waveform between -1 and 1
+    assert(waveform.dtype() == torch::kFloat32);
+
+    // CPU only for now - Implement for GPU in future
+    assert(waveform.device().type() == torch::kCPU);
+
+    // Numerator and denominator coefficients shoudlb e same size
+    assert(a_coeffs.size(0) == b_coeffs.size(0));
+    int n_order = a_coeffs.size(0); // n'th order - 1 filter
+    assert(n_order > 0);
+
+    int64_t n_channels = waveform.size(0);
+    int64_t n_frames = waveform.size(1);
+
+    // Device options should mirror input waveform
+    auto options = torch::TensorOptions().dtype(waveform.dtype()).device(waveform.device().type());
+
+    // Allocate padded input and output waveform, copy in the input waveform
+    at::Tensor padded_waveform = torch::zeros({n_channels, n_frames + n_order - 1}, options);
+    at::Tensor padded_output_waveform = torch::zeros({n_channels, n_frames + n_order - 1},  options);
+    padded_waveform.slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1) = waveform;
+
+    // Create [n_order, n_channels] structure for a and b coefficients
+    at::Tensor a_coeffs_filled = at::transpose(at::flip(a_coeffs, {0,}).repeat({n_channels, 1}), 0, 1);
+    at::Tensor b_coeffs_filled = at::transpose(at::flip(b_coeffs, {0,}).repeat({n_channels, 1}), 0, 1);
+
+    // few more temporary data structure
+    at::Tensor o0 = torch::zeros({n_channels, 1}, options);
+    at::Tensor ones = torch::ones({n_channels, n_frames + n_order - 1}, options);
+    at::Tensor negones = torch::ones({n_channels, n_frames + n_order -1 }, options) * -1;
+    at::Tensor normalization_a0 = torch::unsqueeze(a_coeffs[0], {0,}).repeat({n_channels, 1});
+
+
+    for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) {
+
+      // calculate the output at time i_frame for all channels
+      o0 = torch::zeros({n_channels, 1}, torch::TensorOptions().dtype(torch::kFloat32));
+
+      // time window of input and output, size [n_channels, n_order]
+      at::Tensor const & input_window = padded_waveform.slice(0, 0, n_channels).slice(1, i_frame, i_frame + n_order);
+      at::Tensor const & output_window = padded_output_waveform.slice(0, 0, n_channels).slice(1, i_frame, i_frame+ n_order);
+
+      // matrix multiply to get [n_channels x n_channels], extract diagonal and unsqueeze to get [n_channels, 1] result for a frame
+      at::Tensor inp_result = torch::unsqueeze(torch::diag(torch::mm(input_window, b_coeffs_filled)), 1);
+      at::Tensor out_result = torch::unsqueeze(torch::diag(torch::mm(output_window, a_coeffs_filled)), 1);
+
+      o0.add_(inp_result);
+      o0.sub_(out_result);
+
+      // normalize by a0
+      o0.div_(normalization_a0);
+
+      // Set the output
+      padded_output_waveform.slice(0, 0, n_channels).slice(1, i_frame + n_order - 1, i_frame + n_order - 1 + 1) = o0;
+
+    }
+    // return clipped, and without initial conditions
+    return torch::min(ones, torch::max(negones, padded_output_waveform)).slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1);
+  }
+
+  at::Tensor lfilter_element_wise(
+    at::Tensor const & waveform,
+    at::Tensor const & a_coeffs,
+    at::Tensor const & b_coeffs
+  ) {
+
+
+    // Assumptions - float32, waveform between -1 and 1
+    assert(waveform.dtype() == torch::kFloat32);
+
+    // CPU only for now - Implement for GPU in future
+    assert(waveform.device().type() == torch::kCPU);
+
+    // Numerator and denominator coefficients shoudlb e same size
+    assert(a_coeffs.size(0) == b_coeffs.size(0));
+    int n_order = a_coeffs.size(0); // n'th order - 1 filter
+    assert(n_order > 0);
+
+    int64_t n_channels = waveform.size(0);
+    int64_t n_frames = waveform.size(1);
+
+    // Device options should mirror input waveform
+    auto options = torch::TensorOptions().dtype(waveform.dtype()).device(waveform.device().type());
+
+    // Allocate padded input and output waveform, copy in the input waveform
+    at::Tensor padded_waveform = torch::zeros({n_channels, n_frames + n_order - 1}, options);
+    at::Tensor padded_output_waveform = torch::zeros({n_channels, n_frames + n_order - 1},  options);
+    padded_waveform.slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1) = waveform;
+
+    // few more temporary data structure
+    at::Tensor o0 = torch::zeros({n_channels, 1}, options);
+    at::Tensor ones = torch::ones({n_channels, n_frames + n_order - 1}, options);
+    at::Tensor negones = torch::ones({n_channels, n_frames + n_order -1 }, options) * -1;
+    at::Tensor normalization_a0 = torch::unsqueeze(a_coeffs[0], {0,}).repeat({n_channels, 1});    // Device options should mirror input waveform
+
+    // TBD - Implement for CUDA
+    //auto input_accessor = waveform.packed_accessor64<float,2>();
+    //auto output_accessor = output_waveform.packed_accessor64<float,2>();    
+    //auto a_coeffs_accessor = a_coeffs.packed_accessor64<float,1>();
+    //auto b_coeffs_accessor = b_coeffs.packed_accessor64<float,1>();
+    
+    // initialize the output tensor
+  
+    auto input_accessor = padded_waveform.accessor<float,2>();
+    auto output_accessor = padded_output_waveform.accessor<float,2>();    
+    auto a_coeffs_accessor = a_coeffs.accessor<float,1>();
+    auto b_coeffs_accessor = b_coeffs.accessor<float,1>();
+
+    for (int64_t i_channel = 0; i_channel < n_channels; ++i_channel) {
+
+      for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) {
+
+        // calculate the output at time i_frame by iterating through
+        // inputs / outputs at previous time steps and multiplying by coeffs
+        float o0 = 0;
+        for (int i_offset = 0; i_offset < n_order; ++i_offset) {
+          o0 += input_accessor[i_channel][i_frame + i_offset] * b_coeffs_accessor[n_order - i_offset - 1];
+          o0 -= output_accessor[i_channel][i_frame + i_offset] * a_coeffs_accessor[n_order - i_offset - 1];
+        }
+        o0 = o0 / a_coeffs_accessor[0];
+
+        output_accessor[i_channel][i_frame + n_order - 1] = o0;
+
+      }
+    }
+    // return clipped, and without initial conditions
+    return torch::min(ones, torch::max(negones, padded_output_waveform)).slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1);
+  }
+
+}}
+
+
+PYBIND11_MODULE(_torch_filtering, m) {
+  py::options options;
+  options.disable_function_signatures();
+  m.def(
+      "_lfilter_tensor_matrix",
+      &torch::audio::lfilter_tensor_matrix,
+      "Executes difference equation with tensor");
+  m.def(
+      "_lfilter_element_wise",
+      &torch::audio::lfilter_element_wise,
+      "Executes difference equation with tensor");
+}
diff --git a/torchaudio/functional.py b/torchaudio/functional.py
index 842d8495bf..1577c016e9 100644
--- a/torchaudio/functional.py
+++ b/torchaudio/functional.py
@@ -511,7 +511,7 @@ def phase_vocoder(complex_specgrams, rate, phase_advance):
 def lfilter(waveform, a_coeffs, b_coeffs):
     # type: (Tensor, Tensor, Tensor) -> Tensor
     r"""
-    Performs an IIR filter by evaluating difference equation.
+    Performs an IIR filter by evaluating difference equation.  Assumes difference equation is stable.
 
     Args:
         waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)`.  Must be normalized to -1 to 1.
@@ -572,7 +572,7 @@ def biquad(waveform, b0, b1, b2, a0, a1, a2):
     https://en.wikipedia.org/wiki/Digital_biquad_filter
 
     Args:
-        waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)`
+        waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)`.  Currently only supports float32.
         b0 (float): numerator coefficient of current input, x[n]
         b1 (float): numerator coefficient of input one time step ago x[n-1]
         b2 (float): numerator coefficient of input two time steps ago x[n-2]

From 0115ec24ceea0ae1dc8a63f900d3fb1ce129cb29 Mon Sep 17 00:00:00 2001
From: Chuan Zhang <engineerchuan@gmail.com>
Date: Wed, 18 Sep 2019 13:34:04 -0700
Subject: [PATCH 03/17] slight refactoring of cpp lfilter implementations

---
 torchaudio/filtering.cpp | 23 +++++++++++------------
 torchaudio/functional.py |  2 +-
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/torchaudio/filtering.cpp b/torchaudio/filtering.cpp
index aa60a8a818..275db9ceaa 100644
--- a/torchaudio/filtering.cpp
+++ b/torchaudio/filtering.cpp
@@ -45,7 +45,6 @@ at::Tensor lfilter_tensor_matrix(
     at::Tensor negones = torch::ones({n_channels, n_frames + n_order -1 }, options) * -1;
     at::Tensor normalization_a0 = torch::unsqueeze(a_coeffs[0], {0,}).repeat({n_channels, 1});
 
-
     for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) {
 
       // calculate the output at time i_frame for all channels
@@ -79,7 +78,6 @@ at::Tensor lfilter_tensor_matrix(
     at::Tensor const & b_coeffs
   ) {
 
-
     // Assumptions - float32, waveform between -1 and 1
     assert(waveform.dtype() == torch::kFloat32);
 
@@ -108,25 +106,24 @@ at::Tensor lfilter_tensor_matrix(
     at::Tensor negones = torch::ones({n_channels, n_frames + n_order -1 }, options) * -1;
     at::Tensor normalization_a0 = torch::unsqueeze(a_coeffs[0], {0,}).repeat({n_channels, 1});    // Device options should mirror input waveform
 
-    // TBD - Implement for CUDA
-    //auto input_accessor = waveform.packed_accessor64<float,2>();
-    //auto output_accessor = output_waveform.packed_accessor64<float,2>();    
-    //auto a_coeffs_accessor = a_coeffs.packed_accessor64<float,1>();
-    //auto b_coeffs_accessor = b_coeffs.packed_accessor64<float,1>();
-    
-    // initialize the output tensor
-  
+    // Create accessors for fast access - https://pytorch.org/cppdocs/notes/tensor_basics.html#efficient-access-to-tensor-elements
+    // CPU
     auto input_accessor = padded_waveform.accessor<float,2>();
     auto output_accessor = padded_output_waveform.accessor<float,2>();    
     auto a_coeffs_accessor = a_coeffs.accessor<float,1>();
     auto b_coeffs_accessor = b_coeffs.accessor<float,1>();
 
+    // CUDA - TBD
+    //auto input_accessor = waveform.packed_accessor64<float,2>();
+    //auto output_accessor = output_waveform.packed_accessor64<float,2>();    
+    //auto a_coeffs_accessor = a_coeffs.packed_accessor64<float,1>();
+    //auto b_coeffs_accessor = b_coeffs.packed_accessor64<float,1>();
+    
     for (int64_t i_channel = 0; i_channel < n_channels; ++i_channel) {
 
       for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) {
 
-        // calculate the output at time i_frame by iterating through
-        // inputs / outputs at previous time steps and multiplying by coeffs
+        // execute the difference equation
         float o0 = 0;
         for (int i_offset = 0; i_offset < n_order; ++i_offset) {
           o0 += input_accessor[i_channel][i_frame + i_offset] * b_coeffs_accessor[n_order - i_offset - 1];
@@ -134,10 +131,12 @@ at::Tensor lfilter_tensor_matrix(
         }
         o0 = o0 / a_coeffs_accessor[0];
 
+        // put back into the main data structure
         output_accessor[i_channel][i_frame + n_order - 1] = o0;
 
       }
     }
+
     // return clipped, and without initial conditions
     return torch::min(ones, torch::max(negones, padded_output_waveform)).slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1);
   }
diff --git a/torchaudio/functional.py b/torchaudio/functional.py
index 1577c016e9..319b5b3904 100644
--- a/torchaudio/functional.py
+++ b/torchaudio/functional.py
@@ -511,7 +511,7 @@ def phase_vocoder(complex_specgrams, rate, phase_advance):
 def lfilter(waveform, a_coeffs, b_coeffs):
     # type: (Tensor, Tensor, Tensor) -> Tensor
     r"""
-    Performs an IIR filter by evaluating difference equation.  Assumes difference equation is stable.
+    Performs an IIR filter by evaluating difference equation.  Coefficients should be designed to be stable.
 
     Args:
         waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)`.  Must be normalized to -1 to 1.

From 6b872e96a5aaf2ce787611e39ffee4948ada7d56 Mon Sep 17 00:00:00 2001
From: Chuan Zhang <engineerchuan@gmail.com>
Date: Wed, 18 Sep 2019 13:38:58 -0700
Subject: [PATCH 04/17] Tweaked docs for lowpass_biquad, highpass_biquad,
 biquad

---
 test/test_functional_lfilter_perf.py |  2 +-
 torchaudio/functional.py             | 15 ++++++++-------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/test/test_functional_lfilter_perf.py b/test/test_functional_lfilter_perf.py
index 63c1fbadde..8784d118b3 100644
--- a/test/test_functional_lfilter_perf.py
+++ b/test/test_functional_lfilter_perf.py
@@ -143,7 +143,7 @@ def time_and_output(func):
         print("CPP Element Wise Runtime       : %10.6f s" % run_time_2)
         print("CPP Matrix Runtime             : %10.6f s" % run_time_3)
         print("-" * 80)
-        print("Ratio Python / CPP ElementWise : %10.2f x" % (run_time_1/run_time_2))
+        print("Ratio Python / CPP ElementWise : %10.2f x" % (run_time_1 / run_time_2))
 
         if assertClose:
             # maxDeviation = torch.kthvalue(torch.abs(output_waveform_3- output_waveform_2), output_waveform_1.size(1))
diff --git a/torchaudio/functional.py b/torchaudio/functional.py
index 319b5b3904..54311e68d3 100644
--- a/torchaudio/functional.py
+++ b/torchaudio/functional.py
@@ -572,7 +572,8 @@ def biquad(waveform, b0, b1, b2, a0, a1, a2):
     https://en.wikipedia.org/wiki/Digital_biquad_filter
 
     Args:
-        waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)`.  Currently only supports float32.
+        waveform (torch.Tensor): Audio waveform of dimension of `(n_channel, n_frames)`.
+                                 Currently only supports float32. Normalized [-1, 1]
         b0 (float): numerator coefficient of current input, x[n]
         b1 (float): numerator coefficient of input one time step ago x[n-1]
         b2 (float): numerator coefficient of input two time steps ago x[n-2]
@@ -581,7 +582,7 @@ def biquad(waveform, b0, b1, b2, a0, a1, a2):
         a2 (float): denominator coefficient of current output y[n-2]
 
     Returns:
-        output_waveform (torch.Tensor): Dimension of `(n_channel, n_frames)`
+        output_waveform (torch.Tensor): Dimension of `(n_channel, n_frames)` in range [-1, 1]
     """
 
     assert(waveform.dtype == torch.float32)
@@ -601,13 +602,13 @@ def highpass_biquad(waveform, sample_rate, cutoff_freq, Q=0.707):
     r"""Designs biquad highpass filter and performs filtering.  Similar to SoX implementation.
 
     Args:
-        waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)`
+        waveform (torch.Tensor): Reference `biquad` documentation.
         sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
         cutoff_freq (float): filter cutoff frequency
         Q (float): https://en.wikipedia.org/wiki/Q_factor
 
     Returns:
-        output_waveform (torch.Tensor): Dimension of `(n_channel, n_frames)`
+        output_waveform (torch.Tensor): Reference `biquad` documentation.
     """
 
     GAIN = 1  # TBD - add as a parameter
@@ -628,15 +629,15 @@ def highpass_biquad(waveform, sample_rate, cutoff_freq, Q=0.707):
 def lowpass_biquad(waveform, sample_rate, cutoff_freq, Q=0.707):
     # type: (Tensor, int, float, Optional[float]) -> Tensor
     r"""Designs biquad lowpass filter and performs filtering.  Similar to SoX implementation.
-
+    
     Args:
-        waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)`
+        waveform (torch.Tensor): Reference `biquad` documentation
         sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
         cutoff_freq (float): filter cutoff frequency
         Q (float): https://en.wikipedia.org/wiki/Q_factor
 
     Returns:
-        output_waveform (torch.Tensor): Dimension of `(n_channel, n_frames)`
+        output_waveform (torch.Tensor): Reference `biquad` documentation.
     """
 
     GAIN = 1

From f8efc98bb778e3da2202fb275bf6bfe538420158 Mon Sep 17 00:00:00 2001
From: Chuan Zhang <engineerchuan@gmail.com>
Date: Wed, 18 Sep 2019 14:43:52 -0700
Subject: [PATCH 05/17] Adding 40th order filter

---
 torchaudio/functional.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchaudio/functional.py b/torchaudio/functional.py
index 54311e68d3..99766025fc 100644
--- a/torchaudio/functional.py
+++ b/torchaudio/functional.py
@@ -629,7 +629,7 @@ def highpass_biquad(waveform, sample_rate, cutoff_freq, Q=0.707):
 def lowpass_biquad(waveform, sample_rate, cutoff_freq, Q=0.707):
     # type: (Tensor, int, float, Optional[float]) -> Tensor
     r"""Designs biquad lowpass filter and performs filtering.  Similar to SoX implementation.
-    
+
     Args:
         waveform (torch.Tensor): Reference `biquad` documentation
         sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)

From 35fcf4a8e3d75a553a61a80c8d427a45becc6c14 Mon Sep 17 00:00:00 2001
From: Chuan Zhang <engineerchuan@gmail.com>
Date: Wed, 18 Sep 2019 15:03:40 -0700
Subject: [PATCH 06/17] adding 40th order filter

---
 test/test_functional_lfilter_perf.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/test/test_functional_lfilter_perf.py b/test/test_functional_lfilter_perf.py
index 8784d118b3..7968dd5e89 100644
--- a/test/test_functional_lfilter_perf.py
+++ b/test/test_functional_lfilter_perf.py
@@ -110,6 +110,12 @@ def run_test(n_channels, n_frames, n_order_filter, assertClose=True):
                 0.8471764313073272,
                 -0.11712354962357388,
             ]
+        elif n_order_filter == 40:
+            # Create random set of 40 coefficients, will not be stable, test runtime rather than
+            # correctness
+            a_coeffs = torch.rand(40).numpy()
+            b_coeffs = torch.rand(40).numpy()
+            a_coeffs[0] = 1  # Set a0 to 1.0
 
         # Cast into Tensors
         a_coeffs = torch.tensor(a_coeffs, device="cpu", dtype=torch.float32)
@@ -149,7 +155,7 @@ def time_and_output(func):
             # maxDeviation = torch.kthvalue(torch.abs(output_waveform_3- output_waveform_2), output_waveform_1.size(1))
             assert torch.allclose(output_waveform_1, output_waveform_2, atol=3e-4)
             assert torch.allclose(output_waveform_2, output_waveform_3, atol=3e-4)
-            print("✓ - all outputs are identical")
+            print("PASS - all outputs are identical")
             print("-" * 80)
 
     def test_lfilter_cmp(self):
@@ -173,6 +179,10 @@ def test_lfilter_cmp(self):
         self.run_test(2, 80000, 18, False)
         self.run_test(2, 800000, 18, False)
 
+        self.run_test(2, 8000, 40, False)
+        self.run_test(2, 80000, 40, False)
+        self.run_test(2, 800000, 40, False)
+
 
 if __name__ == "__main__":
     unittest.main()

From e5fb427660f1d3872e953abef6749f5408564fc2 Mon Sep 17 00:00:00 2001
From: Chuan Zhang <engineerchuan@gmail.com>
Date: Wed, 18 Sep 2019 16:06:56 -0700
Subject: [PATCH 07/17] Applied cpplint to torchaudio/filtering.cpp

---
 torchaudio/filtering.cpp | 134 +++++++++++++++++++++++++--------------
 1 file changed, 85 insertions(+), 49 deletions(-)

diff --git a/torchaudio/filtering.cpp b/torchaudio/filtering.cpp
index 275db9ceaa..0f50189f81 100644
--- a/torchaudio/filtering.cpp
+++ b/torchaudio/filtering.cpp
@@ -12,7 +12,6 @@ at::Tensor lfilter_tensor_matrix(
     at::Tensor const & a_coeffs,
     at::Tensor const & b_coeffs
   ) {
-
     // Assumptions - float32, waveform between -1 and 1
     assert(waveform.dtype() == torch::kFloat32);
 
@@ -21,42 +20,61 @@ at::Tensor lfilter_tensor_matrix(
 
     // Numerator and denominator coefficients shoudlb e same size
     assert(a_coeffs.size(0) == b_coeffs.size(0));
-    int n_order = a_coeffs.size(0); // n'th order - 1 filter
+    int n_order = a_coeffs.size(0);  // n'th order - 1 filter
     assert(n_order > 0);
 
     int64_t n_channels = waveform.size(0);
     int64_t n_frames = waveform.size(1);
 
     // Device options should mirror input waveform
-    auto options = torch::TensorOptions().dtype(waveform.dtype()).device(waveform.device().type());
+    auto options = torch::TensorOptions()
+                     .dtype(waveform.dtype())
+                     .device(waveform.device().type());
 
     // Allocate padded input and output waveform, copy in the input waveform
-    at::Tensor padded_waveform = torch::zeros({n_channels, n_frames + n_order - 1}, options);
-    at::Tensor padded_output_waveform = torch::zeros({n_channels, n_frames + n_order - 1},  options);
-    padded_waveform.slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1) = waveform;
+    at::Tensor padded_waveform =
+        torch::zeros({n_channels, n_frames + n_order - 1}, options);
+    at::Tensor padded_output_waveform =
+        torch::zeros({n_channels, n_frames + n_order - 1},  options);
+    padded_waveform.slice(0, 0, n_channels)
+                  .slice(1, n_order - 1, n_order + n_frames - 1) = waveform;
 
     // Create [n_order, n_channels] structure for a and b coefficients
-    at::Tensor a_coeffs_filled = at::transpose(at::flip(a_coeffs, {0,}).repeat({n_channels, 1}), 0, 1);
-    at::Tensor b_coeffs_filled = at::transpose(at::flip(b_coeffs, {0,}).repeat({n_channels, 1}), 0, 1);
+    at::Tensor a_coeffs_filled =
+        at::transpose(at::flip(a_coeffs, {0, }).repeat({n_channels, 1}), 0, 1);
+    at::Tensor b_coeffs_filled =
+        at::transpose(at::flip(b_coeffs, {0, }).repeat({n_channels, 1}), 0, 1);
 
     // few more temporary data structure
     at::Tensor o0 = torch::zeros({n_channels, 1}, options);
-    at::Tensor ones = torch::ones({n_channels, n_frames + n_order - 1}, options);
-    at::Tensor negones = torch::ones({n_channels, n_frames + n_order -1 }, options) * -1;
-    at::Tensor normalization_a0 = torch::unsqueeze(a_coeffs[0], {0,}).repeat({n_channels, 1});
+    at::Tensor ones =
+        torch::ones({n_channels, n_frames + n_order - 1}, options);
+    at::Tensor negones =
+        torch::ones({n_channels, n_frames + n_order -1 }, options) * -1;
+    at::Tensor normalization_a0 =
+        torch::unsqueeze(a_coeffs[0], {0, }).repeat({n_channels, 1});
 
     for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) {
-
       // calculate the output at time i_frame for all channels
-      o0 = torch::zeros({n_channels, 1}, torch::TensorOptions().dtype(torch::kFloat32));
+      o0 = torch::zeros({n_channels, 1},
+                        torch::TensorOptions().dtype(torch::kFloat32));
 
       // time window of input and output, size [n_channels, n_order]
-      at::Tensor const & input_window = padded_waveform.slice(0, 0, n_channels).slice(1, i_frame, i_frame + n_order);
-      at::Tensor const & output_window = padded_output_waveform.slice(0, 0, n_channels).slice(1, i_frame, i_frame+ n_order);
-
-      // matrix multiply to get [n_channels x n_channels], extract diagonal and unsqueeze to get [n_channels, 1] result for a frame
-      at::Tensor inp_result = torch::unsqueeze(torch::diag(torch::mm(input_window, b_coeffs_filled)), 1);
-      at::Tensor out_result = torch::unsqueeze(torch::diag(torch::mm(output_window, a_coeffs_filled)), 1);
+      at::Tensor const & input_window =
+          padded_waveform.slice(0, 0, n_channels)
+                         .slice(1, i_frame, i_frame + n_order);
+      at::Tensor const & output_window =
+          padded_output_waveform.slice(0, 0, n_channels)
+                                .slice(1, i_frame, i_frame+ n_order);
+
+      // matrix multiply to get [n_channels x n_channels],
+      // extract diagonal and unsqueeze to get [n_channels, 1] result
+      at::Tensor inp_result =
+          torch::unsqueeze(torch::diag(torch::mm(input_window,
+                                                 b_coeffs_filled)), 1);
+      at::Tensor out_result =
+          torch::unsqueeze(torch::diag(torch::mm(output_window,
+                                                 a_coeffs_filled)), 1);
 
       o0.add_(inp_result);
       o0.sub_(out_result);
@@ -65,11 +83,15 @@ at::Tensor lfilter_tensor_matrix(
       o0.div_(normalization_a0);
 
       // Set the output
-      padded_output_waveform.slice(0, 0, n_channels).slice(1, i_frame + n_order - 1, i_frame + n_order - 1 + 1) = o0;
-
+      padded_output_waveform.slice(0, 0, n_channels)
+                            .slice(1,
+                                   i_frame + n_order - 1,
+                                   i_frame + n_order - 1 + 1) = o0;
     }
     // return clipped, and without initial conditions
-    return torch::min(ones, torch::max(negones, padded_output_waveform)).slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1);
+    return torch::min(ones, torch::max(negones, padded_output_waveform))
+                  .slice(0, 0, n_channels)
+                  .slice(1, n_order - 1, n_order + n_frames - 1);
   }
 
   at::Tensor lfilter_element_wise(
@@ -77,7 +99,6 @@ at::Tensor lfilter_tensor_matrix(
     at::Tensor const & a_coeffs,
     at::Tensor const & b_coeffs
   ) {
-
     // Assumptions - float32, waveform between -1 and 1
     assert(waveform.dtype() == torch::kFloat32);
 
@@ -86,62 +107,77 @@ at::Tensor lfilter_tensor_matrix(
 
     // Numerator and denominator coefficients shoudlb e same size
     assert(a_coeffs.size(0) == b_coeffs.size(0));
-    int n_order = a_coeffs.size(0); // n'th order - 1 filter
+    int n_order = a_coeffs.size(0);  // n'th order - 1 filter
     assert(n_order > 0);
 
     int64_t n_channels = waveform.size(0);
     int64_t n_frames = waveform.size(1);
 
     // Device options should mirror input waveform
-    auto options = torch::TensorOptions().dtype(waveform.dtype()).device(waveform.device().type());
+    auto options = torch::TensorOptions()
+                       .dtype(waveform.dtype())
+                      .device(waveform.device().type());
 
     // Allocate padded input and output waveform, copy in the input waveform
-    at::Tensor padded_waveform = torch::zeros({n_channels, n_frames + n_order - 1}, options);
-    at::Tensor padded_output_waveform = torch::zeros({n_channels, n_frames + n_order - 1},  options);
-    padded_waveform.slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1) = waveform;
+    at::Tensor padded_waveform =
+        torch::zeros({n_channels, n_frames + n_order - 1}, options);
+    at::Tensor padded_output_waveform =
+        torch::zeros({n_channels, n_frames + n_order - 1},  options);
+    padded_waveform.slice(0, 0, n_channels)
+                   .slice(1, n_order - 1, n_order + n_frames - 1) = waveform;
 
     // few more temporary data structure
-    at::Tensor o0 = torch::zeros({n_channels, 1}, options);
-    at::Tensor ones = torch::ones({n_channels, n_frames + n_order - 1}, options);
-    at::Tensor negones = torch::ones({n_channels, n_frames + n_order -1 }, options) * -1;
-    at::Tensor normalization_a0 = torch::unsqueeze(a_coeffs[0], {0,}).repeat({n_channels, 1});    // Device options should mirror input waveform
+    at::Tensor o0 =
+        torch::zeros({n_channels, 1}, options);
+    at::Tensor ones =
+        torch::ones({n_channels, n_frames + n_order - 1}, options);
+    at::Tensor negones =
+        torch::ones({n_channels, n_frames + n_order -1 }, options) * -1;
+    at::Tensor normalization_a0 =
+        torch::unsqueeze(a_coeffs[0], {0, }).repeat({n_channels, 1});
+
+    // Create accessors for fast access
+    // https://pytorch.org/cppdocs/notes/tensor_basics.html
 
-    // Create accessors for fast access - https://pytorch.org/cppdocs/notes/tensor_basics.html#efficient-access-to-tensor-elements
     // CPU
-    auto input_accessor = padded_waveform.accessor<float,2>();
-    auto output_accessor = padded_output_waveform.accessor<float,2>();    
-    auto a_coeffs_accessor = a_coeffs.accessor<float,1>();
-    auto b_coeffs_accessor = b_coeffs.accessor<float,1>();
+    auto input_accessor = padded_waveform.accessor<float, 2>();
+    auto output_accessor = padded_output_waveform.accessor<float, 2>();
+    auto a_coeffs_accessor = a_coeffs.accessor<float, 1>();
+    auto b_coeffs_accessor = b_coeffs.accessor<float, 1>();
 
     // CUDA - TBD
-    //auto input_accessor = waveform.packed_accessor64<float,2>();
-    //auto output_accessor = output_waveform.packed_accessor64<float,2>();    
-    //auto a_coeffs_accessor = a_coeffs.packed_accessor64<float,1>();
-    //auto b_coeffs_accessor = b_coeffs.packed_accessor64<float,1>();
-    
-    for (int64_t i_channel = 0; i_channel < n_channels; ++i_channel) {
+    // auto input_accessor = waveform.packed_accessor64<float,2>();
+    // auto output_accessor = output_waveform.packed_accessor64<float,2>();
+    // auto a_coeffs_accessor = a_coeffs.packed_accessor64<float,1>();
+    // auto b_coeffs_accessor = b_coeffs.packed_accessor64<float,1>();
 
+    for (int64_t i_channel = 0; i_channel < n_channels; ++i_channel) {
       for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) {
-
         // execute the difference equation
         float o0 = 0;
         for (int i_offset = 0; i_offset < n_order; ++i_offset) {
-          o0 += input_accessor[i_channel][i_frame + i_offset] * b_coeffs_accessor[n_order - i_offset - 1];
-          o0 -= output_accessor[i_channel][i_frame + i_offset] * a_coeffs_accessor[n_order - i_offset - 1];
+          o0 += input_accessor[i_channel][i_frame + i_offset] *
+                b_coeffs_accessor[n_order - i_offset - 1];
+          o0 -= output_accessor[i_channel][i_frame + i_offset] *
+                a_coeffs_accessor[n_order - i_offset - 1];
         }
         o0 = o0 / a_coeffs_accessor[0];
 
         // put back into the main data structure
         output_accessor[i_channel][i_frame + n_order - 1] = o0;
-
       }
     }
 
     // return clipped, and without initial conditions
-    return torch::min(ones, torch::max(negones, padded_output_waveform)).slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1);
+    return torch::min(ones,
+                      torch::max(negones,
+                                 padded_output_waveform))
+           .slice(0, 0, n_channels)
+           .slice(1, n_order - 1, n_order + n_frames - 1);
   }
 
-}}
+}  // namespace audio
+}  // namespace torch
 
 
 PYBIND11_MODULE(_torch_filtering, m) {

From 01a52ee4e4ac04184e6c6559c49fc889fb0fbedc Mon Sep 17 00:00:00 2001
From: Chuan Zhang <engineerchuan@gmail.com>
Date: Thu, 19 Sep 2019 12:01:07 -0700
Subject: [PATCH 08/17] refactor to use python wrapper

---
 test/test_functional_lfilter_perf.py |  21 +++--
 torchaudio/filtering.cpp             | 133 +++++----------------------
 torchaudio/functional.py             |  73 +++++++++++++++
 3 files changed, 112 insertions(+), 115 deletions(-)

diff --git a/test/test_functional_lfilter_perf.py b/test/test_functional_lfilter_perf.py
index 7968dd5e89..4d8301a287 100644
--- a/test/test_functional_lfilter_perf.py
+++ b/test/test_functional_lfilter_perf.py
@@ -6,8 +6,7 @@
 import unittest
 import common_utils
 
-from torchaudio.functional import lfilter
-from _torch_filtering import _lfilter_tensor_matrix, _lfilter_element_wise
+from torchaudio.functional import lfilter, lfilter_cpp_impl
 
 
 class TestFunctionalLFilterPerformance(unittest.TestCase):
@@ -16,8 +15,6 @@ class TestFunctionalLFilterPerformance(unittest.TestCase):
     @staticmethod
     def run_test(n_channels, n_frames, n_order_filter, assertClose=True):
         waveform = torch.rand(n_channels, n_frames, device="cpu")
-        b_coeffs = torch.rand(n_order_filter, dtype=torch.float32, device="cpu")
-        a_coeffs = torch.rand(n_order_filter, dtype=torch.float32, device="cpu")
 
         if n_order_filter == 8:
             # Eighth Order Filter
@@ -133,10 +130,10 @@ def time_and_output(func):
             lambda: lfilter(waveform, a_coeffs, b_coeffs)
         )
         (output_waveform_2, run_time_2) = time_and_output(
-            lambda: _lfilter_element_wise(waveform, a_coeffs, b_coeffs)
+            lambda: lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, 'element_wise')
         )
         (output_waveform_3, run_time_3) = time_and_output(
-            lambda: _lfilter_tensor_matrix(waveform, a_coeffs, b_coeffs)
+            lambda: lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, 'matrix')
         )
 
         print("-" * 80)
@@ -152,12 +149,22 @@ def time_and_output(func):
         print("Ratio Python / CPP ElementWise : %10.2f x" % (run_time_1 / run_time_2))
 
         if assertClose:
-            # maxDeviation = torch.kthvalue(torch.abs(output_waveform_3- output_waveform_2), output_waveform_1.size(1))
+            # maxDeviation = torch.kthvalue(torch.abs(output_waveform_2- output_waveform_1), output_waveform_1.size(1))
+
             assert torch.allclose(output_waveform_1, output_waveform_2, atol=3e-4)
             assert torch.allclose(output_waveform_2, output_waveform_3, atol=3e-4)
             print("PASS - all outputs are identical")
             print("-" * 80)
 
+    def test_cpp_lfilter_runs(self):
+        waveform = torch.rand(2, 1000)
+        b_coeffs = torch.rand(2, dtype=torch.float32)
+        a_coeffs = torch.rand(2, dtype=torch.float32)
+        a_coeffs[0] = 1
+
+        output_waveform = lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, 'element_wise')
+        output_waveform = lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, 'matrix')
+
     def test_lfilter_cmp(self):
         """
         Runs comparison on CPU
diff --git a/torchaudio/filtering.cpp b/torchaudio/filtering.cpp
index 0f50189f81..16225d4b92 100644
--- a/torchaudio/filtering.cpp
+++ b/torchaudio/filtering.cpp
@@ -1,5 +1,5 @@
 #include <torch/extension.h>
-
+// TBD - for CUDA support #include <ATen/cuda/CUDAContext.h>
 // TBD - Compile on CUDA
 // TBD - Expand to other data types outside float32?
 
@@ -7,52 +7,17 @@ namespace torch {
 namespace audio {
 
 
-at::Tensor lfilter_tensor_matrix(
-    at::Tensor const & waveform,
-    at::Tensor const & a_coeffs,
-    at::Tensor const & b_coeffs
+void _lfilter_tensor_matrix(
+    at::Tensor const & padded_waveform,
+    at::Tensor & padded_output_waveform,
+    at::Tensor const & a_coeffs_filled,
+    at::Tensor const & b_coeffs_filled,
+    at::Tensor & o0,
+    at::Tensor const & normalization_a0
   ) {
-    // Assumptions - float32, waveform between -1 and 1
-    assert(waveform.dtype() == torch::kFloat32);
-
-    // CPU only for now - Implement for GPU in future
-    assert(waveform.device().type() == torch::kCPU);
-
-    // Numerator and denominator coefficients shoudlb e same size
-    assert(a_coeffs.size(0) == b_coeffs.size(0));
-    int n_order = a_coeffs.size(0);  // n'th order - 1 filter
-    assert(n_order > 0);
-
-    int64_t n_channels = waveform.size(0);
-    int64_t n_frames = waveform.size(1);
-
-    // Device options should mirror input waveform
-    auto options = torch::TensorOptions()
-                     .dtype(waveform.dtype())
-                     .device(waveform.device().type());
-
-    // Allocate padded input and output waveform, copy in the input waveform
-    at::Tensor padded_waveform =
-        torch::zeros({n_channels, n_frames + n_order - 1}, options);
-    at::Tensor padded_output_waveform =
-        torch::zeros({n_channels, n_frames + n_order - 1},  options);
-    padded_waveform.slice(0, 0, n_channels)
-                  .slice(1, n_order - 1, n_order + n_frames - 1) = waveform;
-
-    // Create [n_order, n_channels] structure for a and b coefficients
-    at::Tensor a_coeffs_filled =
-        at::transpose(at::flip(a_coeffs, {0, }).repeat({n_channels, 1}), 0, 1);
-    at::Tensor b_coeffs_filled =
-        at::transpose(at::flip(b_coeffs, {0, }).repeat({n_channels, 1}), 0, 1);
-
-    // few more temporary data structure
-    at::Tensor o0 = torch::zeros({n_channels, 1}, options);
-    at::Tensor ones =
-        torch::ones({n_channels, n_frames + n_order - 1}, options);
-    at::Tensor negones =
-        torch::ones({n_channels, n_frames + n_order -1 }, options) * -1;
-    at::Tensor normalization_a0 =
-        torch::unsqueeze(a_coeffs[0], {0, }).repeat({n_channels, 1});
+    int n_order = a_coeffs_filled.size(0);
+    int n_frames = padded_waveform.size(1) - n_order + 1;
+    int n_channels = padded_waveform.size(0);
 
     for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) {
       // calculate the output at time i_frame for all channels
@@ -88,69 +53,28 @@ at::Tensor lfilter_tensor_matrix(
                                    i_frame + n_order - 1,
                                    i_frame + n_order - 1 + 1) = o0;
     }
-    // return clipped, and without initial conditions
-    return torch::min(ones, torch::max(negones, padded_output_waveform))
-                  .slice(0, 0, n_channels)
-                  .slice(1, n_order - 1, n_order + n_frames - 1);
   }
 
-  at::Tensor lfilter_element_wise(
-    at::Tensor const & waveform,
+
+
+  void _lfilter_element_wise(
+    at::Tensor const & padded_waveform,
+    at::Tensor & padded_output_waveform,
     at::Tensor const & a_coeffs,
-    at::Tensor const & b_coeffs
+    at::Tensor const & b_coeffs,
+    at::Tensor & o0
   ) {
-    // Assumptions - float32, waveform between -1 and 1
-    assert(waveform.dtype() == torch::kFloat32);
-
-    // CPU only for now - Implement for GPU in future
-    assert(waveform.device().type() == torch::kCPU);
-
-    // Numerator and denominator coefficients shoudlb e same size
-    assert(a_coeffs.size(0) == b_coeffs.size(0));
-    int n_order = a_coeffs.size(0);  // n'th order - 1 filter
-    assert(n_order > 0);
-
-    int64_t n_channels = waveform.size(0);
-    int64_t n_frames = waveform.size(1);
-
-    // Device options should mirror input waveform
-    auto options = torch::TensorOptions()
-                       .dtype(waveform.dtype())
-                      .device(waveform.device().type());
-
-    // Allocate padded input and output waveform, copy in the input waveform
-    at::Tensor padded_waveform =
-        torch::zeros({n_channels, n_frames + n_order - 1}, options);
-    at::Tensor padded_output_waveform =
-        torch::zeros({n_channels, n_frames + n_order - 1},  options);
-    padded_waveform.slice(0, 0, n_channels)
-                   .slice(1, n_order - 1, n_order + n_frames - 1) = waveform;
-
-    // few more temporary data structure
-    at::Tensor o0 =
-        torch::zeros({n_channels, 1}, options);
-    at::Tensor ones =
-        torch::ones({n_channels, n_frames + n_order - 1}, options);
-    at::Tensor negones =
-        torch::ones({n_channels, n_frames + n_order -1 }, options) * -1;
-    at::Tensor normalization_a0 =
-        torch::unsqueeze(a_coeffs[0], {0, }).repeat({n_channels, 1});
-
-    // Create accessors for fast access
-    // https://pytorch.org/cppdocs/notes/tensor_basics.html
+    int n_order = a_coeffs.size(0);
+    int n_frames = padded_waveform.size(1) - n_order + 1;
+    int n_channels = padded_waveform.size(0);
 
-    // CPU
+    // Create CPU accessors for fast access
+    // https://pytorch.org/cppdocs/notes/tensor_basics.html
     auto input_accessor = padded_waveform.accessor<float, 2>();
     auto output_accessor = padded_output_waveform.accessor<float, 2>();
     auto a_coeffs_accessor = a_coeffs.accessor<float, 1>();
     auto b_coeffs_accessor = b_coeffs.accessor<float, 1>();
 
-    // CUDA - TBD
-    // auto input_accessor = waveform.packed_accessor64<float,2>();
-    // auto output_accessor = output_waveform.packed_accessor64<float,2>();
-    // auto a_coeffs_accessor = a_coeffs.packed_accessor64<float,1>();
-    // auto b_coeffs_accessor = b_coeffs.packed_accessor64<float,1>();
-
     for (int64_t i_channel = 0; i_channel < n_channels; ++i_channel) {
       for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) {
         // execute the difference equation
@@ -167,13 +91,6 @@ at::Tensor lfilter_tensor_matrix(
         output_accessor[i_channel][i_frame + n_order - 1] = o0;
       }
     }
-
-    // return clipped, and without initial conditions
-    return torch::min(ones,
-                      torch::max(negones,
-                                 padded_output_waveform))
-           .slice(0, 0, n_channels)
-           .slice(1, n_order - 1, n_order + n_frames - 1);
   }
 
 }  // namespace audio
@@ -185,10 +102,10 @@ PYBIND11_MODULE(_torch_filtering, m) {
   options.disable_function_signatures();
   m.def(
       "_lfilter_tensor_matrix",
-      &torch::audio::lfilter_tensor_matrix,
+      &torch::audio::_lfilter_tensor_matrix,
       "Executes difference equation with tensor");
   m.def(
       "_lfilter_element_wise",
-      &torch::audio::lfilter_element_wise,
+      &torch::audio::_lfilter_element_wise,
       "Executes difference equation with tensor");
 }
diff --git a/torchaudio/functional.py b/torchaudio/functional.py
index 99766025fc..565440eaaf 100644
--- a/torchaudio/functional.py
+++ b/torchaudio/functional.py
@@ -1,6 +1,7 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 import math
 import torch
+from _torch_filtering import _lfilter_tensor_matrix, _lfilter_element_wise
 
 __all__ = [
     "istft",
@@ -15,6 +16,7 @@
     "magphase",
     "phase_vocoder",
     "lfilter",
+    "lfilter_cpp_impl",
     "lowpass_biquad",
     "highpass_biquad",
     "biquad",
@@ -566,6 +568,77 @@ def lfilter(waveform, a_coeffs, b_coeffs):
     return torch.min(ones, torch.max(ones * -1, padded_output_waveform[:, (n_order - 1):]))
 
 
+def lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, execution_method):
+    # type: (Tensor, Tensor, Tensor, string) -> Tensor
+    r"""
+    See lfilter documentation. Execution method can be either `element_wise` or `matrix`.
+    """
+
+    # Perform sanity checks, input check, and memory allocation in python
+
+    # Current limitations to be removed in future
+    assert(waveform.dtype == torch.float32)
+    assert(waveform.device.type == 'cpu')
+    assert(waveform.device == a_coeffs.device)
+    assert(waveform.device == b_coeffs.device)
+
+    # Use these parameters for any calculations
+    device = waveform.device
+    dtype = waveform.dtype
+
+    # Check filter orders
+    n_order = a_coeffs.size(0)
+    assert(b_coeffs.size(0) == n_order)
+    assert(n_order > 0)
+    assert(a_coeffs[0] != 0)  # a0 coeff for y[n] can not be 0
+
+    # Check waveform size
+    assert(len(waveform.size()) == 2)
+    n_channels = waveform.size(0)
+    n_frames = waveform.size(1)
+    n_padded_frames = n_frames + n_order - 1
+
+    # Allocate temporary data structures
+    # First, the input waveform should be padded by the order of the filter
+    #   N.B. Should we look to how we can avoid copying
+    padded_waveform = torch.zeros(n_channels, n_padded_frames, dtype=dtype, device=device)
+    padded_output_waveform = torch.zeros(n_channels, n_padded_frames, dtype=dtype, device=device)
+    padded_waveform[:, (n_order - 1):] = waveform
+
+    # More temporary data structures
+    o0 = torch.zeros(n_channels, 1, dtype=dtype, device=device)
+    normalization_a0 = a_coeffs[0].unsqueeze(0).repeat(n_channels, 1)
+    ones = torch.ones(n_channels, n_padded_frames, dtype=dtype, device=device)
+
+    # Run through assertion size checks
+    assert(normalization_a0.size(0) == n_channels)
+
+    if execution_method == 'element_wise':
+        _lfilter_element_wise(padded_waveform,
+                              padded_output_waveform,
+                              a_coeffs,
+                              b_coeffs,
+                              o0
+                              )
+    elif execution_method == 'matrix':
+        # From [n_order] a_coeffs, create a [n_channel, n_order] tensor
+        # lowest order coefficients e.g. a0, b0 should be at the "bottom"
+        # used for matrix multiply later
+        a_coeffs_filled = a_coeffs.flip(0).repeat(n_channels, 1).t()
+        b_coeffs_filled = b_coeffs.flip(0).repeat(n_channels, 1).t()
+        _lfilter_tensor_matrix(padded_waveform,
+                               padded_output_waveform,
+                               a_coeffs_filled,
+                               b_coeffs_filled,
+                               o0,
+                               normalization_a0,
+                               )
+    else:
+        raise Exception("invalid lfilter execution method %s" % execution_method)
+
+    return torch.min(ones, torch.max(ones * -1, padded_output_waveform))[:, (n_order - 1):]
+
+
 def biquad(waveform, b0, b1, b2, a0, a1, a2):
     # type: (Tensor, float, float, float, float, float, float) -> Tensor
     r"""Performs a biquad filter of input tensor.  Initial conditions set to 0.

From 62a2dfd030a8851468b118f94782b76daf626f49 Mon Sep 17 00:00:00 2001
From: Chuan Zhang <engineerchuan@gmail.com>
Date: Wed, 18 Sep 2019 12:25:06 -0700
Subject: [PATCH 09/17] adding test

---
 test/test_functional_lfilter_perf.py | 131 +++++++++++++++++++++++++++
 1 file changed, 131 insertions(+)
 create mode 100644 test/test_functional_lfilter_perf.py

diff --git a/test/test_functional_lfilter_perf.py b/test/test_functional_lfilter_perf.py
new file mode 100644
index 0000000000..0030541277
--- /dev/null
+++ b/test/test_functional_lfilter_perf.py
@@ -0,0 +1,131 @@
+from __future__ import absolute_import, division, print_function, unicode_literals
+import math
+import os
+import torch
+import torchaudio
+import unittest
+import common_utils
+
+from torchaudio.functional import lfilter
+from _torch_filtering import (
+    _lfilter_tensor_matrix,
+    _lfilter_tensor,
+    _lfilter_element_wise,
+)
+
+
+class TestFunctionalLFilterPerformance(unittest.TestCase):
+    test_dirpath, test_dir = common_utils.create_temp_assets_dir()
+
+    @staticmethod
+    def run_test(n_channels, n_frames, n_order_filter):
+        waveform = torch.rand(n_channels, n_frames, device="cpu")
+        b_coeffs = torch.rand(n_order_filter, dtype=torch.float32, device="cpu")
+        a_coeffs = torch.rand(n_order_filter, dtype=torch.float32, device="cpu")
+
+        if n_order_filter == 8:
+            # Eighth Order Filter
+            #  >>> import scipy.signal
+            #  >>>  wp = 0.3
+            #  >>>  ws = 0.5
+            #  >>>  gpass = 1
+            #  >>>  gstop = 100
+            #  >>>  b, a = scipy.signal.iirdesign(wp, ws, gpass, gstop)
+            b_coeffs = [
+                0.0006544487997063485,
+                0.001669274889397942,
+                0.003218714446315984,
+                0.004222562499298002,
+                0.004222562499298002,
+                0.0032187144463159834,
+                0.0016692748893979413,
+                0.0006544487997063485,
+            ]
+            a_coeffs = [
+                1.0,
+                -4.67403506662255,
+                10.516336803850786,
+                -14.399207825856776,
+                12.844181702707655,
+                -7.43604712843608,
+                2.5888616732696077,
+                -0.4205601576432048,
+            ]
+        elif n_order_filter == 5:
+            # Fifth Order Filter
+            # >>> import scipy.signal
+            # >>> wp = 0.3, ws = 0.5, gpass = 1, gstop = 40
+            # >>> b, a = scipy.signal.iirdesign(wp, ws, gpass, gstop)
+            b_coeffs = [
+                0.0353100066384039,
+                0.023370652985988206,
+                0.0560524973457262,
+                0.023370652985988193,
+                0.03531000663840389,
+            ]
+            a_coeffs = [
+                1.0,
+                -2.321010052951366,
+                2.677193357612127,
+                -1.5774235418173692,
+                0.4158137396065854,
+            ]
+
+        # Cast into Tensors
+        a_coeffs = torch.tensor(a_coeffs, device="cpu", dtype=torch.float32)
+        b_coeffs = torch.tensor(b_coeffs, device="cpu", dtype=torch.float32)
+
+        def time_and_output(func):
+            import time
+
+            st = time.time()
+            output = func()
+            run_time = time.time() - st
+            return (output, run_time)
+
+        (output_waveform_1, run_time_1) = time_and_output(
+            lambda: lfilter(waveform, a_coeffs, b_coeffs)
+        )
+        (output_waveform_2, run_time_2) = time_and_output(
+            lambda: _lfilter_element_wise(waveform, a_coeffs, b_coeffs)
+        )
+        (output_waveform_3, run_time_3) = time_and_output(
+            lambda: _lfilter_tensor_matrix(waveform, a_coeffs, b_coeffs)
+        )
+
+        print("-" * 80)
+        print("Evaluating Runtime between lfilter implementations")
+        print("-" * 80)
+        print(
+            "Data Size: [%d x %d], Filter Order: %d"
+            % (waveform.size(0), waveform.size(1), a_coeffs.size(0))
+        )
+        print("-" * 80)
+        print("Python Matrix Runtime [current]: %10.6f s" % run_time_1)
+        print("CPP Element Wise Runtime       : %10.6f s" % run_time_2)
+        print("CPP Matrix Runtime             : %10.6f s" % run_time_3)
+
+        # maxDeviation = torch.kthvalue(torch.abs(output_waveform_1- output_waveform_2), output_waveform_1.size(1))
+
+        assert torch.allclose(output_waveform_1, output_waveform_2, atol=3e-4)
+        assert torch.allclose(output_waveform_1, output_waveform_3, atol=3e-4)
+        print("-" * 80)
+        print("✓ - all outputs are identical")
+        print("-" * 80)
+
+    def test_lfilter_cmp(self):
+        """
+        Runs comparison on CPU
+        """
+
+        torch.random.manual_seed(423)
+        self.run_test(2, 8000, 5)
+        self.run_test(2, 80000, 5)
+        self.run_test(2, 800000, 5)
+        self.run_test(2, 8000, 8)
+        self.run_test(2, 80000, 8)
+        self.run_test(2, 800000, 8)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 7d08183a03b2f58a70abb5d999228ffdcdc1a029 Mon Sep 17 00:00:00 2001
From: Chuan Zhang <engineerchuan@gmail.com>
Date: Wed, 18 Sep 2019 13:27:10 -0700
Subject: [PATCH 10/17] added cpp lfilter implementation

---
 setup.py                             |   8 ++
 test/test_functional_lfilter_perf.py |  79 ++++++++++---
 torchaudio/filtering.cpp             | 159 +++++++++++++++++++++++++++
 torchaudio/functional.py             |   4 +-
 4 files changed, 232 insertions(+), 18 deletions(-)
 create mode 100644 torchaudio/filtering.cpp

diff --git a/setup.py b/setup.py
index 9ed91f2468..4822a6e9d0 100644
--- a/setup.py
+++ b/setup.py
@@ -113,6 +113,14 @@ def check_env_flag(name, default=''):
             extra_compile_args=eca,
             extra_objects=extra_objects,
             extra_link_args=ela),
+        CppExtension(
+            '_torch_filtering',
+            ['torchaudio/filtering.cpp'],
+            libraries=libraries,
+            include_dirs=include_dirs,
+            extra_compile_args=eca,
+            extra_objects=extra_objects,
+            extra_link_args=ela),
     ],
     cmdclass={'build_ext': BuildExtension},
     install_requires=[pytorch_package_dep]
diff --git a/test/test_functional_lfilter_perf.py b/test/test_functional_lfilter_perf.py
index 0030541277..63c1fbadde 100644
--- a/test/test_functional_lfilter_perf.py
+++ b/test/test_functional_lfilter_perf.py
@@ -7,18 +7,14 @@
 import common_utils
 
 from torchaudio.functional import lfilter
-from _torch_filtering import (
-    _lfilter_tensor_matrix,
-    _lfilter_tensor,
-    _lfilter_element_wise,
-)
+from _torch_filtering import _lfilter_tensor_matrix, _lfilter_element_wise
 
 
 class TestFunctionalLFilterPerformance(unittest.TestCase):
     test_dirpath, test_dir = common_utils.create_temp_assets_dir()
 
     @staticmethod
-    def run_test(n_channels, n_frames, n_order_filter):
+    def run_test(n_channels, n_frames, n_order_filter, assertClose=True):
         waveform = torch.rand(n_channels, n_frames, device="cpu")
         b_coeffs = torch.rand(n_order_filter, dtype=torch.float32, device="cpu")
         a_coeffs = torch.rand(n_order_filter, dtype=torch.float32, device="cpu")
@@ -70,6 +66,50 @@ def run_test(n_channels, n_frames, n_order_filter):
                 -1.5774235418173692,
                 0.4158137396065854,
             ]
+        elif n_order_filter == 18:
+            # >>> import scipy.signal
+            # >>> wp = 0.48, ws = 0.5, gpass = 0.2, gstop = 120
+            # >>> b, a = scipy.signal.iirdesign(wp, ws, gpass, gstop)
+            b_coeffs = [
+                0.0006050813536446144,
+                0.002920916369302935,
+                0.010247568347759453,
+                0.02591236698507957,
+                0.05390501051935878,
+                0.09344581172781004,
+                0.13951533321139883,
+                0.1808658576803922,
+                0.2056643061895918,
+                0.2056643061895911,
+                0.1808658576803912,
+                0.13951533321139847,
+                0.09344581172781012,
+                0.053905010519358885,
+                0.02591236698507962,
+                0.010247568347759466,
+                0.0029209163693029367,
+                0.0006050813536446148,
+            ]
+            a_coeffs = [
+                1.0,
+                -4.3964136877356745,
+                14.650181359641305,
+                -34.45816395187684,
+                67.18247518997862,
+                -108.01956225077998,
+                149.4332056661277,
+                -178.07791467502364,
+                185.28267044557634,
+                -168.13382659655514,
+                133.22364764531704,
+                -91.59439958870928,
+                54.15835239046956,
+                -27.090521914173934,
+                11.163677645454127,
+                -3.627296054625132,
+                0.8471764313073272,
+                -0.11712354962357388,
+            ]
 
         # Cast into Tensors
         a_coeffs = torch.tensor(a_coeffs, device="cpu", dtype=torch.float32)
@@ -93,25 +133,24 @@ def time_and_output(func):
             lambda: _lfilter_tensor_matrix(waveform, a_coeffs, b_coeffs)
         )
 
-        print("-" * 80)
-        print("Evaluating Runtime between lfilter implementations")
         print("-" * 80)
         print(
-            "Data Size: [%d x %d], Filter Order: %d"
+            "lfilter perf - Data Size: [%d x %d], Filter Order: %d"
             % (waveform.size(0), waveform.size(1), a_coeffs.size(0))
         )
         print("-" * 80)
         print("Python Matrix Runtime [current]: %10.6f s" % run_time_1)
         print("CPP Element Wise Runtime       : %10.6f s" % run_time_2)
         print("CPP Matrix Runtime             : %10.6f s" % run_time_3)
-
-        # maxDeviation = torch.kthvalue(torch.abs(output_waveform_1- output_waveform_2), output_waveform_1.size(1))
-
-        assert torch.allclose(output_waveform_1, output_waveform_2, atol=3e-4)
-        assert torch.allclose(output_waveform_1, output_waveform_3, atol=3e-4)
-        print("-" * 80)
-        print("✓ - all outputs are identical")
         print("-" * 80)
+        print("Ratio Python / CPP ElementWise : %10.2f x" % (run_time_1/run_time_2))
+
+        if assertClose:
+            # maxDeviation = torch.kthvalue(torch.abs(output_waveform_3- output_waveform_2), output_waveform_1.size(1))
+            assert torch.allclose(output_waveform_1, output_waveform_2, atol=3e-4)
+            assert torch.allclose(output_waveform_2, output_waveform_3, atol=3e-4)
+            print("✓ - all outputs are identical")
+            print("-" * 80)
 
     def test_lfilter_cmp(self):
         """
@@ -126,6 +165,14 @@ def test_lfilter_cmp(self):
         self.run_test(2, 80000, 8)
         self.run_test(2, 800000, 8)
 
+        # For higher order filters, due to floating point precision
+        #  matrix method and element method can get different results depending on order of operations
+        # Also, for longer signals and higher filters, easier to create unstable filter
+        # https://dsp.stackexchange.com/questions/54386/relation-between-order-and-stability-in-iir-filter
+        self.run_test(2, 8000, 18, False)
+        self.run_test(2, 80000, 18, False)
+        self.run_test(2, 800000, 18, False)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/torchaudio/filtering.cpp b/torchaudio/filtering.cpp
new file mode 100644
index 0000000000..aa60a8a818
--- /dev/null
+++ b/torchaudio/filtering.cpp
@@ -0,0 +1,159 @@
+#include <torch/extension.h>
+
+// TBD - Compile on CUDA
+// TBD - Expand to other data types outside float32?
+
+namespace torch {
+namespace audio {
+
+
+at::Tensor lfilter_tensor_matrix(
+    at::Tensor const & waveform,
+    at::Tensor const & a_coeffs,
+    at::Tensor const & b_coeffs
+  ) {
+
+    // Assumptions - float32, waveform between -1 and 1
+    assert(waveform.dtype() == torch::kFloat32);
+
+    // CPU only for now - Implement for GPU in future
+    assert(waveform.device().type() == torch::kCPU);
+
+    // Numerator and denominator coefficients shoudlb e same size
+    assert(a_coeffs.size(0) == b_coeffs.size(0));
+    int n_order = a_coeffs.size(0); // n'th order - 1 filter
+    assert(n_order > 0);
+
+    int64_t n_channels = waveform.size(0);
+    int64_t n_frames = waveform.size(1);
+
+    // Device options should mirror input waveform
+    auto options = torch::TensorOptions().dtype(waveform.dtype()).device(waveform.device().type());
+
+    // Allocate padded input and output waveform, copy in the input waveform
+    at::Tensor padded_waveform = torch::zeros({n_channels, n_frames + n_order - 1}, options);
+    at::Tensor padded_output_waveform = torch::zeros({n_channels, n_frames + n_order - 1},  options);
+    padded_waveform.slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1) = waveform;
+
+    // Create [n_order, n_channels] structure for a and b coefficients
+    at::Tensor a_coeffs_filled = at::transpose(at::flip(a_coeffs, {0,}).repeat({n_channels, 1}), 0, 1);
+    at::Tensor b_coeffs_filled = at::transpose(at::flip(b_coeffs, {0,}).repeat({n_channels, 1}), 0, 1);
+
+    // few more temporary data structure
+    at::Tensor o0 = torch::zeros({n_channels, 1}, options);
+    at::Tensor ones = torch::ones({n_channels, n_frames + n_order - 1}, options);
+    at::Tensor negones = torch::ones({n_channels, n_frames + n_order -1 }, options) * -1;
+    at::Tensor normalization_a0 = torch::unsqueeze(a_coeffs[0], {0,}).repeat({n_channels, 1});
+
+
+    for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) {
+
+      // calculate the output at time i_frame for all channels
+      o0 = torch::zeros({n_channels, 1}, torch::TensorOptions().dtype(torch::kFloat32));
+
+      // time window of input and output, size [n_channels, n_order]
+      at::Tensor const & input_window = padded_waveform.slice(0, 0, n_channels).slice(1, i_frame, i_frame + n_order);
+      at::Tensor const & output_window = padded_output_waveform.slice(0, 0, n_channels).slice(1, i_frame, i_frame+ n_order);
+
+      // matrix multiply to get [n_channels x n_channels], extract diagonal and unsqueeze to get [n_channels, 1] result for a frame
+      at::Tensor inp_result = torch::unsqueeze(torch::diag(torch::mm(input_window, b_coeffs_filled)), 1);
+      at::Tensor out_result = torch::unsqueeze(torch::diag(torch::mm(output_window, a_coeffs_filled)), 1);
+
+      o0.add_(inp_result);
+      o0.sub_(out_result);
+
+      // normalize by a0
+      o0.div_(normalization_a0);
+
+      // Set the output
+      padded_output_waveform.slice(0, 0, n_channels).slice(1, i_frame + n_order - 1, i_frame + n_order - 1 + 1) = o0;
+
+    }
+    // return clipped, and without initial conditions
+    return torch::min(ones, torch::max(negones, padded_output_waveform)).slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1);
+  }
+
+  at::Tensor lfilter_element_wise(
+    at::Tensor const & waveform,
+    at::Tensor const & a_coeffs,
+    at::Tensor const & b_coeffs
+  ) {
+
+
+    // Assumptions - float32, waveform between -1 and 1
+    assert(waveform.dtype() == torch::kFloat32);
+
+    // CPU only for now - Implement for GPU in future
+    assert(waveform.device().type() == torch::kCPU);
+
+    // Numerator and denominator coefficients shoudlb e same size
+    assert(a_coeffs.size(0) == b_coeffs.size(0));
+    int n_order = a_coeffs.size(0); // n'th order - 1 filter
+    assert(n_order > 0);
+
+    int64_t n_channels = waveform.size(0);
+    int64_t n_frames = waveform.size(1);
+
+    // Device options should mirror input waveform
+    auto options = torch::TensorOptions().dtype(waveform.dtype()).device(waveform.device().type());
+
+    // Allocate padded input and output waveform, copy in the input waveform
+    at::Tensor padded_waveform = torch::zeros({n_channels, n_frames + n_order - 1}, options);
+    at::Tensor padded_output_waveform = torch::zeros({n_channels, n_frames + n_order - 1},  options);
+    padded_waveform.slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1) = waveform;
+
+    // few more temporary data structure
+    at::Tensor o0 = torch::zeros({n_channels, 1}, options);
+    at::Tensor ones = torch::ones({n_channels, n_frames + n_order - 1}, options);
+    at::Tensor negones = torch::ones({n_channels, n_frames + n_order -1 }, options) * -1;
+    at::Tensor normalization_a0 = torch::unsqueeze(a_coeffs[0], {0,}).repeat({n_channels, 1});    // Device options should mirror input waveform
+
+    // TBD - Implement for CUDA
+    //auto input_accessor = waveform.packed_accessor64<float,2>();
+    //auto output_accessor = output_waveform.packed_accessor64<float,2>();    
+    //auto a_coeffs_accessor = a_coeffs.packed_accessor64<float,1>();
+    //auto b_coeffs_accessor = b_coeffs.packed_accessor64<float,1>();
+    
+    // initialize the output tensor
+  
+    auto input_accessor = padded_waveform.accessor<float,2>();
+    auto output_accessor = padded_output_waveform.accessor<float,2>();    
+    auto a_coeffs_accessor = a_coeffs.accessor<float,1>();
+    auto b_coeffs_accessor = b_coeffs.accessor<float,1>();
+
+    for (int64_t i_channel = 0; i_channel < n_channels; ++i_channel) {
+
+      for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) {
+
+        // calculate the output at time i_frame by iterating through
+        // inputs / outputs at previous time steps and multiplying by coeffs
+        float o0 = 0;
+        for (int i_offset = 0; i_offset < n_order; ++i_offset) {
+          o0 += input_accessor[i_channel][i_frame + i_offset] * b_coeffs_accessor[n_order - i_offset - 1];
+          o0 -= output_accessor[i_channel][i_frame + i_offset] * a_coeffs_accessor[n_order - i_offset - 1];
+        }
+        o0 = o0 / a_coeffs_accessor[0];
+
+        output_accessor[i_channel][i_frame + n_order - 1] = o0;
+
+      }
+    }
+    // return clipped, and without initial conditions
+    return torch::min(ones, torch::max(negones, padded_output_waveform)).slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1);
+  }
+
+}}
+
+
+PYBIND11_MODULE(_torch_filtering, m) {
+  py::options options;
+  options.disable_function_signatures();
+  m.def(
+      "_lfilter_tensor_matrix",
+      &torch::audio::lfilter_tensor_matrix,
+      "Executes difference equation with tensor");
+  m.def(
+      "_lfilter_element_wise",
+      &torch::audio::lfilter_element_wise,
+      "Executes difference equation with tensor");
+}
diff --git a/torchaudio/functional.py b/torchaudio/functional.py
index c289f61e01..e80abb4333 100644
--- a/torchaudio/functional.py
+++ b/torchaudio/functional.py
@@ -529,7 +529,7 @@ def phase_vocoder(complex_specgrams, rate, phase_advance):
 def lfilter(waveform, a_coeffs, b_coeffs):
     # type: (Tensor, Tensor, Tensor) -> Tensor
     r"""
-    Performs an IIR filter by evaluating difference equation.
+    Performs an IIR filter by evaluating difference equation.  Assumes difference equation is stable.
 
     Args:
         waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)`.  Must be normalized to -1 to 1.
@@ -594,7 +594,7 @@ def biquad(waveform, b0, b1, b2, a0, a1, a2):
     https://en.wikipedia.org/wiki/Digital_biquad_filter
 
     Args:
-        waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)`
+        waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)`.  Currently only supports float32.
         b0 (float): numerator coefficient of current input, x[n]
         b1 (float): numerator coefficient of input one time step ago x[n-1]
         b2 (float): numerator coefficient of input two time steps ago x[n-2]

From a6fefbeb115cca1454d88acd3f330889b424e949 Mon Sep 17 00:00:00 2001
From: Chuan Zhang <engineerchuan@gmail.com>
Date: Wed, 18 Sep 2019 13:34:04 -0700
Subject: [PATCH 11/17] slight refactoring of cpp lfilter implementations

---
 torchaudio/filtering.cpp | 23 +++++++++++------------
 torchaudio/functional.py |  2 +-
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/torchaudio/filtering.cpp b/torchaudio/filtering.cpp
index aa60a8a818..275db9ceaa 100644
--- a/torchaudio/filtering.cpp
+++ b/torchaudio/filtering.cpp
@@ -45,7 +45,6 @@ at::Tensor lfilter_tensor_matrix(
     at::Tensor negones = torch::ones({n_channels, n_frames + n_order -1 }, options) * -1;
     at::Tensor normalization_a0 = torch::unsqueeze(a_coeffs[0], {0,}).repeat({n_channels, 1});
 
-
     for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) {
 
       // calculate the output at time i_frame for all channels
@@ -79,7 +78,6 @@ at::Tensor lfilter_tensor_matrix(
     at::Tensor const & b_coeffs
   ) {
 
-
     // Assumptions - float32, waveform between -1 and 1
     assert(waveform.dtype() == torch::kFloat32);
 
@@ -108,25 +106,24 @@ at::Tensor lfilter_tensor_matrix(
     at::Tensor negones = torch::ones({n_channels, n_frames + n_order -1 }, options) * -1;
     at::Tensor normalization_a0 = torch::unsqueeze(a_coeffs[0], {0,}).repeat({n_channels, 1});    // Device options should mirror input waveform
 
-    // TBD - Implement for CUDA
-    //auto input_accessor = waveform.packed_accessor64<float,2>();
-    //auto output_accessor = output_waveform.packed_accessor64<float,2>();    
-    //auto a_coeffs_accessor = a_coeffs.packed_accessor64<float,1>();
-    //auto b_coeffs_accessor = b_coeffs.packed_accessor64<float,1>();
-    
-    // initialize the output tensor
-  
+    // Create accessors for fast access - https://pytorch.org/cppdocs/notes/tensor_basics.html#efficient-access-to-tensor-elements
+    // CPU
     auto input_accessor = padded_waveform.accessor<float,2>();
     auto output_accessor = padded_output_waveform.accessor<float,2>();    
     auto a_coeffs_accessor = a_coeffs.accessor<float,1>();
     auto b_coeffs_accessor = b_coeffs.accessor<float,1>();
 
+    // CUDA - TBD
+    //auto input_accessor = waveform.packed_accessor64<float,2>();
+    //auto output_accessor = output_waveform.packed_accessor64<float,2>();    
+    //auto a_coeffs_accessor = a_coeffs.packed_accessor64<float,1>();
+    //auto b_coeffs_accessor = b_coeffs.packed_accessor64<float,1>();
+    
     for (int64_t i_channel = 0; i_channel < n_channels; ++i_channel) {
 
       for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) {
 
-        // calculate the output at time i_frame by iterating through
-        // inputs / outputs at previous time steps and multiplying by coeffs
+        // execute the difference equation
         float o0 = 0;
         for (int i_offset = 0; i_offset < n_order; ++i_offset) {
           o0 += input_accessor[i_channel][i_frame + i_offset] * b_coeffs_accessor[n_order - i_offset - 1];
@@ -134,10 +131,12 @@ at::Tensor lfilter_tensor_matrix(
         }
         o0 = o0 / a_coeffs_accessor[0];
 
+        // put back into the main data structure
         output_accessor[i_channel][i_frame + n_order - 1] = o0;
 
       }
     }
+
     // return clipped, and without initial conditions
     return torch::min(ones, torch::max(negones, padded_output_waveform)).slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1);
   }
diff --git a/torchaudio/functional.py b/torchaudio/functional.py
index e80abb4333..413d2b4ab5 100644
--- a/torchaudio/functional.py
+++ b/torchaudio/functional.py
@@ -529,7 +529,7 @@ def phase_vocoder(complex_specgrams, rate, phase_advance):
 def lfilter(waveform, a_coeffs, b_coeffs):
     # type: (Tensor, Tensor, Tensor) -> Tensor
     r"""
-    Performs an IIR filter by evaluating difference equation.  Assumes difference equation is stable.
+    Performs an IIR filter by evaluating difference equation.  Coefficients should be designed to be stable.
 
     Args:
         waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)`.  Must be normalized to -1 to 1.

From d0e694a39bdd0e763a860d9f9c53722eb1ff5881 Mon Sep 17 00:00:00 2001
From: Chuan Zhang <engineerchuan@gmail.com>
Date: Wed, 18 Sep 2019 13:38:58 -0700
Subject: [PATCH 12/17] Tweaked docs for lowpass_biquad, highpass_biquad,
 biquad

---
 test/test_functional_lfilter_perf.py |  2 +-
 torchaudio/functional.py             | 15 ++++++++-------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/test/test_functional_lfilter_perf.py b/test/test_functional_lfilter_perf.py
index 63c1fbadde..8784d118b3 100644
--- a/test/test_functional_lfilter_perf.py
+++ b/test/test_functional_lfilter_perf.py
@@ -143,7 +143,7 @@ def time_and_output(func):
         print("CPP Element Wise Runtime       : %10.6f s" % run_time_2)
         print("CPP Matrix Runtime             : %10.6f s" % run_time_3)
         print("-" * 80)
-        print("Ratio Python / CPP ElementWise : %10.2f x" % (run_time_1/run_time_2))
+        print("Ratio Python / CPP ElementWise : %10.2f x" % (run_time_1 / run_time_2))
 
         if assertClose:
             # maxDeviation = torch.kthvalue(torch.abs(output_waveform_3- output_waveform_2), output_waveform_1.size(1))
diff --git a/torchaudio/functional.py b/torchaudio/functional.py
index 413d2b4ab5..ac8102ade5 100644
--- a/torchaudio/functional.py
+++ b/torchaudio/functional.py
@@ -594,7 +594,8 @@ def biquad(waveform, b0, b1, b2, a0, a1, a2):
     https://en.wikipedia.org/wiki/Digital_biquad_filter
 
     Args:
-        waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)`.  Currently only supports float32.
+        waveform (torch.Tensor): Audio waveform of dimension of `(n_channel, n_frames)`.
+                                 Currently only supports float32. Normalized [-1, 1]
         b0 (float): numerator coefficient of current input, x[n]
         b1 (float): numerator coefficient of input one time step ago x[n-1]
         b2 (float): numerator coefficient of input two time steps ago x[n-2]
@@ -603,7 +604,7 @@ def biquad(waveform, b0, b1, b2, a0, a1, a2):
         a2 (float): denominator coefficient of current output y[n-2]
 
     Returns:
-        output_waveform (torch.Tensor): Dimension of `(n_channel, n_frames)`
+        output_waveform (torch.Tensor): Dimension of `(n_channel, n_frames)` in range [-1, 1]
     """
 
     device = waveform.device
@@ -626,13 +627,13 @@ def highpass_biquad(waveform, sample_rate, cutoff_freq, Q=0.707):
     r"""Designs biquad highpass filter and performs filtering.  Similar to SoX implementation.
 
     Args:
-        waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)`
+        waveform (torch.Tensor): Reference `biquad` documentation.
         sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
         cutoff_freq (float): filter cutoff frequency
         Q (float): https://en.wikipedia.org/wiki/Q_factor
 
     Returns:
-        output_waveform (torch.Tensor): Dimension of `(n_channel, n_frames)`
+        output_waveform (torch.Tensor): Reference `biquad` documentation.
     """
 
     GAIN = 1  # TBD - add as a parameter
@@ -653,15 +654,15 @@ def highpass_biquad(waveform, sample_rate, cutoff_freq, Q=0.707):
 def lowpass_biquad(waveform, sample_rate, cutoff_freq, Q=0.707):
     # type: (Tensor, int, float, Optional[float]) -> Tensor
     r"""Designs biquad lowpass filter and performs filtering.  Similar to SoX implementation.
-
+    
     Args:
-        waveform (torch.Tensor): audio waveform of dimension of `(n_channel, n_frames)`
+        waveform (torch.Tensor): Reference `biquad` documentation
         sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
         cutoff_freq (float): filter cutoff frequency
         Q (float): https://en.wikipedia.org/wiki/Q_factor
 
     Returns:
-        output_waveform (torch.Tensor): Dimension of `(n_channel, n_frames)`
+        output_waveform (torch.Tensor): Reference `biquad` documentation.
     """
 
     GAIN = 1

From 4127096e7758d7a439880b4022d7dee21ae378d5 Mon Sep 17 00:00:00 2001
From: Chuan Zhang <engineerchuan@gmail.com>
Date: Wed, 18 Sep 2019 14:43:52 -0700
Subject: [PATCH 13/17] Adding 40th order filter

---
 torchaudio/functional.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchaudio/functional.py b/torchaudio/functional.py
index ac8102ade5..3aab718f70 100644
--- a/torchaudio/functional.py
+++ b/torchaudio/functional.py
@@ -654,7 +654,7 @@ def highpass_biquad(waveform, sample_rate, cutoff_freq, Q=0.707):
 def lowpass_biquad(waveform, sample_rate, cutoff_freq, Q=0.707):
     # type: (Tensor, int, float, Optional[float]) -> Tensor
     r"""Designs biquad lowpass filter and performs filtering.  Similar to SoX implementation.
-    
+
     Args:
         waveform (torch.Tensor): Reference `biquad` documentation
         sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)

From 64370914d438a56758718261080368ce91303c5f Mon Sep 17 00:00:00 2001
From: Chuan Zhang <engineerchuan@gmail.com>
Date: Wed, 18 Sep 2019 15:03:40 -0700
Subject: [PATCH 14/17] adding 40th order filter

---
 test/test_functional_lfilter_perf.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/test/test_functional_lfilter_perf.py b/test/test_functional_lfilter_perf.py
index 8784d118b3..7968dd5e89 100644
--- a/test/test_functional_lfilter_perf.py
+++ b/test/test_functional_lfilter_perf.py
@@ -110,6 +110,12 @@ def run_test(n_channels, n_frames, n_order_filter, assertClose=True):
                 0.8471764313073272,
                 -0.11712354962357388,
             ]
+        elif n_order_filter == 40:
+            # Create random set of 40 coefficients, will not be stable, test runtime rather than
+            # correctness
+            a_coeffs = torch.rand(40).numpy()
+            b_coeffs = torch.rand(40).numpy()
+            a_coeffs[0] = 1  # Set a0 to 1.0
 
         # Cast into Tensors
         a_coeffs = torch.tensor(a_coeffs, device="cpu", dtype=torch.float32)
@@ -149,7 +155,7 @@ def time_and_output(func):
             # maxDeviation = torch.kthvalue(torch.abs(output_waveform_3- output_waveform_2), output_waveform_1.size(1))
             assert torch.allclose(output_waveform_1, output_waveform_2, atol=3e-4)
             assert torch.allclose(output_waveform_2, output_waveform_3, atol=3e-4)
-            print("✓ - all outputs are identical")
+            print("PASS - all outputs are identical")
             print("-" * 80)
 
     def test_lfilter_cmp(self):
@@ -173,6 +179,10 @@ def test_lfilter_cmp(self):
         self.run_test(2, 80000, 18, False)
         self.run_test(2, 800000, 18, False)
 
+        self.run_test(2, 8000, 40, False)
+        self.run_test(2, 80000, 40, False)
+        self.run_test(2, 800000, 40, False)
+
 
 if __name__ == "__main__":
     unittest.main()

From a056409f40464dc7690f89460f8f6420f0483214 Mon Sep 17 00:00:00 2001
From: Chuan Zhang <engineerchuan@gmail.com>
Date: Wed, 18 Sep 2019 16:06:56 -0700
Subject: [PATCH 15/17] Applied cpplint to torchaudio/filtering.cpp

---
 torchaudio/filtering.cpp | 134 +++++++++++++++++++++++++--------------
 1 file changed, 85 insertions(+), 49 deletions(-)

diff --git a/torchaudio/filtering.cpp b/torchaudio/filtering.cpp
index 275db9ceaa..0f50189f81 100644
--- a/torchaudio/filtering.cpp
+++ b/torchaudio/filtering.cpp
@@ -12,7 +12,6 @@ at::Tensor lfilter_tensor_matrix(
     at::Tensor const & a_coeffs,
     at::Tensor const & b_coeffs
   ) {
-
     // Assumptions - float32, waveform between -1 and 1
     assert(waveform.dtype() == torch::kFloat32);
 
@@ -21,42 +20,61 @@ at::Tensor lfilter_tensor_matrix(
 
     // Numerator and denominator coefficients shoudlb e same size
     assert(a_coeffs.size(0) == b_coeffs.size(0));
-    int n_order = a_coeffs.size(0); // n'th order - 1 filter
+    int n_order = a_coeffs.size(0);  // n'th order - 1 filter
     assert(n_order > 0);
 
     int64_t n_channels = waveform.size(0);
     int64_t n_frames = waveform.size(1);
 
     // Device options should mirror input waveform
-    auto options = torch::TensorOptions().dtype(waveform.dtype()).device(waveform.device().type());
+    auto options = torch::TensorOptions()
+                     .dtype(waveform.dtype())
+                     .device(waveform.device().type());
 
     // Allocate padded input and output waveform, copy in the input waveform
-    at::Tensor padded_waveform = torch::zeros({n_channels, n_frames + n_order - 1}, options);
-    at::Tensor padded_output_waveform = torch::zeros({n_channels, n_frames + n_order - 1},  options);
-    padded_waveform.slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1) = waveform;
+    at::Tensor padded_waveform =
+        torch::zeros({n_channels, n_frames + n_order - 1}, options);
+    at::Tensor padded_output_waveform =
+        torch::zeros({n_channels, n_frames + n_order - 1},  options);
+    padded_waveform.slice(0, 0, n_channels)
+                  .slice(1, n_order - 1, n_order + n_frames - 1) = waveform;
 
     // Create [n_order, n_channels] structure for a and b coefficients
-    at::Tensor a_coeffs_filled = at::transpose(at::flip(a_coeffs, {0,}).repeat({n_channels, 1}), 0, 1);
-    at::Tensor b_coeffs_filled = at::transpose(at::flip(b_coeffs, {0,}).repeat({n_channels, 1}), 0, 1);
+    at::Tensor a_coeffs_filled =
+        at::transpose(at::flip(a_coeffs, {0, }).repeat({n_channels, 1}), 0, 1);
+    at::Tensor b_coeffs_filled =
+        at::transpose(at::flip(b_coeffs, {0, }).repeat({n_channels, 1}), 0, 1);
 
     // few more temporary data structure
     at::Tensor o0 = torch::zeros({n_channels, 1}, options);
-    at::Tensor ones = torch::ones({n_channels, n_frames + n_order - 1}, options);
-    at::Tensor negones = torch::ones({n_channels, n_frames + n_order -1 }, options) * -1;
-    at::Tensor normalization_a0 = torch::unsqueeze(a_coeffs[0], {0,}).repeat({n_channels, 1});
+    at::Tensor ones =
+        torch::ones({n_channels, n_frames + n_order - 1}, options);
+    at::Tensor negones =
+        torch::ones({n_channels, n_frames + n_order -1 }, options) * -1;
+    at::Tensor normalization_a0 =
+        torch::unsqueeze(a_coeffs[0], {0, }).repeat({n_channels, 1});
 
     for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) {
-
       // calculate the output at time i_frame for all channels
-      o0 = torch::zeros({n_channels, 1}, torch::TensorOptions().dtype(torch::kFloat32));
+      o0 = torch::zeros({n_channels, 1},
+                        torch::TensorOptions().dtype(torch::kFloat32));
 
       // time window of input and output, size [n_channels, n_order]
-      at::Tensor const & input_window = padded_waveform.slice(0, 0, n_channels).slice(1, i_frame, i_frame + n_order);
-      at::Tensor const & output_window = padded_output_waveform.slice(0, 0, n_channels).slice(1, i_frame, i_frame+ n_order);
-
-      // matrix multiply to get [n_channels x n_channels], extract diagonal and unsqueeze to get [n_channels, 1] result for a frame
-      at::Tensor inp_result = torch::unsqueeze(torch::diag(torch::mm(input_window, b_coeffs_filled)), 1);
-      at::Tensor out_result = torch::unsqueeze(torch::diag(torch::mm(output_window, a_coeffs_filled)), 1);
+      at::Tensor const & input_window =
+          padded_waveform.slice(0, 0, n_channels)
+                         .slice(1, i_frame, i_frame + n_order);
+      at::Tensor const & output_window =
+          padded_output_waveform.slice(0, 0, n_channels)
+                                .slice(1, i_frame, i_frame+ n_order);
+
+      // matrix multiply to get [n_channels x n_channels],
+      // extract diagonal and unsqueeze to get [n_channels, 1] result
+      at::Tensor inp_result =
+          torch::unsqueeze(torch::diag(torch::mm(input_window,
+                                                 b_coeffs_filled)), 1);
+      at::Tensor out_result =
+          torch::unsqueeze(torch::diag(torch::mm(output_window,
+                                                 a_coeffs_filled)), 1);
 
       o0.add_(inp_result);
       o0.sub_(out_result);
@@ -65,11 +83,15 @@ at::Tensor lfilter_tensor_matrix(
       o0.div_(normalization_a0);
 
       // Set the output
-      padded_output_waveform.slice(0, 0, n_channels).slice(1, i_frame + n_order - 1, i_frame + n_order - 1 + 1) = o0;
-
+      padded_output_waveform.slice(0, 0, n_channels)
+                            .slice(1,
+                                   i_frame + n_order - 1,
+                                   i_frame + n_order - 1 + 1) = o0;
     }
     // return clipped, and without initial conditions
-    return torch::min(ones, torch::max(negones, padded_output_waveform)).slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1);
+    return torch::min(ones, torch::max(negones, padded_output_waveform))
+                  .slice(0, 0, n_channels)
+                  .slice(1, n_order - 1, n_order + n_frames - 1);
   }
 
   at::Tensor lfilter_element_wise(
@@ -77,7 +99,6 @@ at::Tensor lfilter_tensor_matrix(
     at::Tensor const & a_coeffs,
     at::Tensor const & b_coeffs
   ) {
-
     // Assumptions - float32, waveform between -1 and 1
     assert(waveform.dtype() == torch::kFloat32);
 
@@ -86,62 +107,77 @@ at::Tensor lfilter_tensor_matrix(
 
     // Numerator and denominator coefficients shoudlb e same size
     assert(a_coeffs.size(0) == b_coeffs.size(0));
-    int n_order = a_coeffs.size(0); // n'th order - 1 filter
+    int n_order = a_coeffs.size(0);  // n'th order - 1 filter
     assert(n_order > 0);
 
     int64_t n_channels = waveform.size(0);
     int64_t n_frames = waveform.size(1);
 
     // Device options should mirror input waveform
-    auto options = torch::TensorOptions().dtype(waveform.dtype()).device(waveform.device().type());
+    auto options = torch::TensorOptions()
+                       .dtype(waveform.dtype())
+                      .device(waveform.device().type());
 
     // Allocate padded input and output waveform, copy in the input waveform
-    at::Tensor padded_waveform = torch::zeros({n_channels, n_frames + n_order - 1}, options);
-    at::Tensor padded_output_waveform = torch::zeros({n_channels, n_frames + n_order - 1},  options);
-    padded_waveform.slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1) = waveform;
+    at::Tensor padded_waveform =
+        torch::zeros({n_channels, n_frames + n_order - 1}, options);
+    at::Tensor padded_output_waveform =
+        torch::zeros({n_channels, n_frames + n_order - 1},  options);
+    padded_waveform.slice(0, 0, n_channels)
+                   .slice(1, n_order - 1, n_order + n_frames - 1) = waveform;
 
     // few more temporary data structure
-    at::Tensor o0 = torch::zeros({n_channels, 1}, options);
-    at::Tensor ones = torch::ones({n_channels, n_frames + n_order - 1}, options);
-    at::Tensor negones = torch::ones({n_channels, n_frames + n_order -1 }, options) * -1;
-    at::Tensor normalization_a0 = torch::unsqueeze(a_coeffs[0], {0,}).repeat({n_channels, 1});    // Device options should mirror input waveform
+    at::Tensor o0 =
+        torch::zeros({n_channels, 1}, options);
+    at::Tensor ones =
+        torch::ones({n_channels, n_frames + n_order - 1}, options);
+    at::Tensor negones =
+        torch::ones({n_channels, n_frames + n_order -1 }, options) * -1;
+    at::Tensor normalization_a0 =
+        torch::unsqueeze(a_coeffs[0], {0, }).repeat({n_channels, 1});
+
+    // Create accessors for fast access
+    // https://pytorch.org/cppdocs/notes/tensor_basics.html
 
-    // Create accessors for fast access - https://pytorch.org/cppdocs/notes/tensor_basics.html#efficient-access-to-tensor-elements
     // CPU
-    auto input_accessor = padded_waveform.accessor<float,2>();
-    auto output_accessor = padded_output_waveform.accessor<float,2>();    
-    auto a_coeffs_accessor = a_coeffs.accessor<float,1>();
-    auto b_coeffs_accessor = b_coeffs.accessor<float,1>();
+    auto input_accessor = padded_waveform.accessor<float, 2>();
+    auto output_accessor = padded_output_waveform.accessor<float, 2>();
+    auto a_coeffs_accessor = a_coeffs.accessor<float, 1>();
+    auto b_coeffs_accessor = b_coeffs.accessor<float, 1>();
 
     // CUDA - TBD
-    //auto input_accessor = waveform.packed_accessor64<float,2>();
-    //auto output_accessor = output_waveform.packed_accessor64<float,2>();    
-    //auto a_coeffs_accessor = a_coeffs.packed_accessor64<float,1>();
-    //auto b_coeffs_accessor = b_coeffs.packed_accessor64<float,1>();
-    
-    for (int64_t i_channel = 0; i_channel < n_channels; ++i_channel) {
+    // auto input_accessor = waveform.packed_accessor64<float,2>();
+    // auto output_accessor = output_waveform.packed_accessor64<float,2>();
+    // auto a_coeffs_accessor = a_coeffs.packed_accessor64<float,1>();
+    // auto b_coeffs_accessor = b_coeffs.packed_accessor64<float,1>();
 
+    for (int64_t i_channel = 0; i_channel < n_channels; ++i_channel) {
       for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) {
-
         // execute the difference equation
         float o0 = 0;
         for (int i_offset = 0; i_offset < n_order; ++i_offset) {
-          o0 += input_accessor[i_channel][i_frame + i_offset] * b_coeffs_accessor[n_order - i_offset - 1];
-          o0 -= output_accessor[i_channel][i_frame + i_offset] * a_coeffs_accessor[n_order - i_offset - 1];
+          o0 += input_accessor[i_channel][i_frame + i_offset] *
+                b_coeffs_accessor[n_order - i_offset - 1];
+          o0 -= output_accessor[i_channel][i_frame + i_offset] *
+                a_coeffs_accessor[n_order - i_offset - 1];
         }
         o0 = o0 / a_coeffs_accessor[0];
 
         // put back into the main data structure
         output_accessor[i_channel][i_frame + n_order - 1] = o0;
-
       }
     }
 
     // return clipped, and without initial conditions
-    return torch::min(ones, torch::max(negones, padded_output_waveform)).slice(0, 0, n_channels).slice(1, n_order - 1, n_order + n_frames - 1);
+    return torch::min(ones,
+                      torch::max(negones,
+                                 padded_output_waveform))
+           .slice(0, 0, n_channels)
+           .slice(1, n_order - 1, n_order + n_frames - 1);
   }
 
-}}
+}  // namespace audio
+}  // namespace torch
 
 
 PYBIND11_MODULE(_torch_filtering, m) {

From bc6494013a8a91e3d8325a7e51ad2c36c6fd082b Mon Sep 17 00:00:00 2001
From: Chuan Zhang <engineerchuan@gmail.com>
Date: Thu, 19 Sep 2019 12:01:07 -0700
Subject: [PATCH 16/17] refactor to use python wrapper

---
 test/test_functional_lfilter_perf.py |  21 +++--
 torchaudio/filtering.cpp             | 133 +++++----------------------
 torchaudio/functional.py             |  73 +++++++++++++++
 3 files changed, 112 insertions(+), 115 deletions(-)

diff --git a/test/test_functional_lfilter_perf.py b/test/test_functional_lfilter_perf.py
index 7968dd5e89..4d8301a287 100644
--- a/test/test_functional_lfilter_perf.py
+++ b/test/test_functional_lfilter_perf.py
@@ -6,8 +6,7 @@
 import unittest
 import common_utils
 
-from torchaudio.functional import lfilter
-from _torch_filtering import _lfilter_tensor_matrix, _lfilter_element_wise
+from torchaudio.functional import lfilter, lfilter_cpp_impl
 
 
 class TestFunctionalLFilterPerformance(unittest.TestCase):
@@ -16,8 +15,6 @@ class TestFunctionalLFilterPerformance(unittest.TestCase):
     @staticmethod
     def run_test(n_channels, n_frames, n_order_filter, assertClose=True):
         waveform = torch.rand(n_channels, n_frames, device="cpu")
-        b_coeffs = torch.rand(n_order_filter, dtype=torch.float32, device="cpu")
-        a_coeffs = torch.rand(n_order_filter, dtype=torch.float32, device="cpu")
 
         if n_order_filter == 8:
             # Eighth Order Filter
@@ -133,10 +130,10 @@ def time_and_output(func):
             lambda: lfilter(waveform, a_coeffs, b_coeffs)
         )
         (output_waveform_2, run_time_2) = time_and_output(
-            lambda: _lfilter_element_wise(waveform, a_coeffs, b_coeffs)
+            lambda: lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, 'element_wise')
         )
         (output_waveform_3, run_time_3) = time_and_output(
-            lambda: _lfilter_tensor_matrix(waveform, a_coeffs, b_coeffs)
+            lambda: lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, 'matrix')
         )
 
         print("-" * 80)
@@ -152,12 +149,22 @@ def time_and_output(func):
         print("Ratio Python / CPP ElementWise : %10.2f x" % (run_time_1 / run_time_2))
 
         if assertClose:
-            # maxDeviation = torch.kthvalue(torch.abs(output_waveform_3- output_waveform_2), output_waveform_1.size(1))
+            # maxDeviation = torch.kthvalue(torch.abs(output_waveform_2- output_waveform_1), output_waveform_1.size(1))
+
             assert torch.allclose(output_waveform_1, output_waveform_2, atol=3e-4)
             assert torch.allclose(output_waveform_2, output_waveform_3, atol=3e-4)
             print("PASS - all outputs are identical")
             print("-" * 80)
 
+    def test_cpp_lfilter_runs(self):
+        waveform = torch.rand(2, 1000)
+        b_coeffs = torch.rand(2, dtype=torch.float32)
+        a_coeffs = torch.rand(2, dtype=torch.float32)
+        a_coeffs[0] = 1
+
+        output_waveform = lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, 'element_wise')
+        output_waveform = lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, 'matrix')
+
     def test_lfilter_cmp(self):
         """
         Runs comparison on CPU
diff --git a/torchaudio/filtering.cpp b/torchaudio/filtering.cpp
index 0f50189f81..16225d4b92 100644
--- a/torchaudio/filtering.cpp
+++ b/torchaudio/filtering.cpp
@@ -1,5 +1,5 @@
 #include <torch/extension.h>
-
+// TBD - for CUDA support #include <ATen/cuda/CUDAContext.h>
 // TBD - Compile on CUDA
 // TBD - Expand to other data types outside float32?
 
@@ -7,52 +7,17 @@ namespace torch {
 namespace audio {
 
 
-at::Tensor lfilter_tensor_matrix(
-    at::Tensor const & waveform,
-    at::Tensor const & a_coeffs,
-    at::Tensor const & b_coeffs
+void _lfilter_tensor_matrix(
+    at::Tensor const & padded_waveform,
+    at::Tensor & padded_output_waveform,
+    at::Tensor const & a_coeffs_filled,
+    at::Tensor const & b_coeffs_filled,
+    at::Tensor & o0,
+    at::Tensor const & normalization_a0
   ) {
-    // Assumptions - float32, waveform between -1 and 1
-    assert(waveform.dtype() == torch::kFloat32);
-
-    // CPU only for now - Implement for GPU in future
-    assert(waveform.device().type() == torch::kCPU);
-
-    // Numerator and denominator coefficients shoudlb e same size
-    assert(a_coeffs.size(0) == b_coeffs.size(0));
-    int n_order = a_coeffs.size(0);  // n'th order - 1 filter
-    assert(n_order > 0);
-
-    int64_t n_channels = waveform.size(0);
-    int64_t n_frames = waveform.size(1);
-
-    // Device options should mirror input waveform
-    auto options = torch::TensorOptions()
-                     .dtype(waveform.dtype())
-                     .device(waveform.device().type());
-
-    // Allocate padded input and output waveform, copy in the input waveform
-    at::Tensor padded_waveform =
-        torch::zeros({n_channels, n_frames + n_order - 1}, options);
-    at::Tensor padded_output_waveform =
-        torch::zeros({n_channels, n_frames + n_order - 1},  options);
-    padded_waveform.slice(0, 0, n_channels)
-                  .slice(1, n_order - 1, n_order + n_frames - 1) = waveform;
-
-    // Create [n_order, n_channels] structure for a and b coefficients
-    at::Tensor a_coeffs_filled =
-        at::transpose(at::flip(a_coeffs, {0, }).repeat({n_channels, 1}), 0, 1);
-    at::Tensor b_coeffs_filled =
-        at::transpose(at::flip(b_coeffs, {0, }).repeat({n_channels, 1}), 0, 1);
-
-    // few more temporary data structure
-    at::Tensor o0 = torch::zeros({n_channels, 1}, options);
-    at::Tensor ones =
-        torch::ones({n_channels, n_frames + n_order - 1}, options);
-    at::Tensor negones =
-        torch::ones({n_channels, n_frames + n_order -1 }, options) * -1;
-    at::Tensor normalization_a0 =
-        torch::unsqueeze(a_coeffs[0], {0, }).repeat({n_channels, 1});
+    int n_order = a_coeffs_filled.size(0);
+    int n_frames = padded_waveform.size(1) - n_order + 1;
+    int n_channels = padded_waveform.size(0);
 
     for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) {
       // calculate the output at time i_frame for all channels
@@ -88,69 +53,28 @@ at::Tensor lfilter_tensor_matrix(
                                    i_frame + n_order - 1,
                                    i_frame + n_order - 1 + 1) = o0;
     }
-    // return clipped, and without initial conditions
-    return torch::min(ones, torch::max(negones, padded_output_waveform))
-                  .slice(0, 0, n_channels)
-                  .slice(1, n_order - 1, n_order + n_frames - 1);
   }
 
-  at::Tensor lfilter_element_wise(
-    at::Tensor const & waveform,
+
+
+  void _lfilter_element_wise(
+    at::Tensor const & padded_waveform,
+    at::Tensor & padded_output_waveform,
     at::Tensor const & a_coeffs,
-    at::Tensor const & b_coeffs
+    at::Tensor const & b_coeffs,
+    at::Tensor & o0
   ) {
-    // Assumptions - float32, waveform between -1 and 1
-    assert(waveform.dtype() == torch::kFloat32);
-
-    // CPU only for now - Implement for GPU in future
-    assert(waveform.device().type() == torch::kCPU);
-
-    // Numerator and denominator coefficients shoudlb e same size
-    assert(a_coeffs.size(0) == b_coeffs.size(0));
-    int n_order = a_coeffs.size(0);  // n'th order - 1 filter
-    assert(n_order > 0);
-
-    int64_t n_channels = waveform.size(0);
-    int64_t n_frames = waveform.size(1);
-
-    // Device options should mirror input waveform
-    auto options = torch::TensorOptions()
-                       .dtype(waveform.dtype())
-                      .device(waveform.device().type());
-
-    // Allocate padded input and output waveform, copy in the input waveform
-    at::Tensor padded_waveform =
-        torch::zeros({n_channels, n_frames + n_order - 1}, options);
-    at::Tensor padded_output_waveform =
-        torch::zeros({n_channels, n_frames + n_order - 1},  options);
-    padded_waveform.slice(0, 0, n_channels)
-                   .slice(1, n_order - 1, n_order + n_frames - 1) = waveform;
-
-    // few more temporary data structure
-    at::Tensor o0 =
-        torch::zeros({n_channels, 1}, options);
-    at::Tensor ones =
-        torch::ones({n_channels, n_frames + n_order - 1}, options);
-    at::Tensor negones =
-        torch::ones({n_channels, n_frames + n_order -1 }, options) * -1;
-    at::Tensor normalization_a0 =
-        torch::unsqueeze(a_coeffs[0], {0, }).repeat({n_channels, 1});
-
-    // Create accessors for fast access
-    // https://pytorch.org/cppdocs/notes/tensor_basics.html
+    int n_order = a_coeffs.size(0);
+    int n_frames = padded_waveform.size(1) - n_order + 1;
+    int n_channels = padded_waveform.size(0);
 
-    // CPU
+    // Create CPU accessors for fast access
+    // https://pytorch.org/cppdocs/notes/tensor_basics.html
     auto input_accessor = padded_waveform.accessor<float, 2>();
     auto output_accessor = padded_output_waveform.accessor<float, 2>();
     auto a_coeffs_accessor = a_coeffs.accessor<float, 1>();
     auto b_coeffs_accessor = b_coeffs.accessor<float, 1>();
 
-    // CUDA - TBD
-    // auto input_accessor = waveform.packed_accessor64<float,2>();
-    // auto output_accessor = output_waveform.packed_accessor64<float,2>();
-    // auto a_coeffs_accessor = a_coeffs.packed_accessor64<float,1>();
-    // auto b_coeffs_accessor = b_coeffs.packed_accessor64<float,1>();
-
     for (int64_t i_channel = 0; i_channel < n_channels; ++i_channel) {
       for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) {
         // execute the difference equation
@@ -167,13 +91,6 @@ at::Tensor lfilter_tensor_matrix(
         output_accessor[i_channel][i_frame + n_order - 1] = o0;
       }
     }
-
-    // return clipped, and without initial conditions
-    return torch::min(ones,
-                      torch::max(negones,
-                                 padded_output_waveform))
-           .slice(0, 0, n_channels)
-           .slice(1, n_order - 1, n_order + n_frames - 1);
   }
 
 }  // namespace audio
@@ -185,10 +102,10 @@ PYBIND11_MODULE(_torch_filtering, m) {
   options.disable_function_signatures();
   m.def(
       "_lfilter_tensor_matrix",
-      &torch::audio::lfilter_tensor_matrix,
+      &torch::audio::_lfilter_tensor_matrix,
       "Executes difference equation with tensor");
   m.def(
       "_lfilter_element_wise",
-      &torch::audio::lfilter_element_wise,
+      &torch::audio::_lfilter_element_wise,
       "Executes difference equation with tensor");
 }
diff --git a/torchaudio/functional.py b/torchaudio/functional.py
index 3aab718f70..c3bf904684 100644
--- a/torchaudio/functional.py
+++ b/torchaudio/functional.py
@@ -1,6 +1,7 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 import math
 import torch
+from _torch_filtering import _lfilter_tensor_matrix, _lfilter_element_wise
 
 __all__ = [
     "istft",
@@ -15,6 +16,7 @@
     "magphase",
     "phase_vocoder",
     "lfilter",
+    "lfilter_cpp_impl",
     "lowpass_biquad",
     "highpass_biquad",
     "biquad",
@@ -588,6 +590,77 @@ def lfilter(waveform, a_coeffs, b_coeffs):
     return torch.min(ones, torch.max(ones * -1, padded_output_waveform[:, (n_order - 1):]))
 
 
+def lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, execution_method):
+    # type: (Tensor, Tensor, Tensor, string) -> Tensor
+    r"""
+    See lfilter documentation. Execution method can be either `element_wise` or `matrix`.
+    """
+
+    # Perform sanity checks, input check, and memory allocation in python
+
+    # Current limitations to be removed in future
+    assert(waveform.dtype == torch.float32)
+    assert(waveform.device.type == 'cpu')
+    assert(waveform.device == a_coeffs.device)
+    assert(waveform.device == b_coeffs.device)
+
+    # Use these parameters for any calculations
+    device = waveform.device
+    dtype = waveform.dtype
+
+    # Check filter orders
+    n_order = a_coeffs.size(0)
+    assert(b_coeffs.size(0) == n_order)
+    assert(n_order > 0)
+    assert(a_coeffs[0] != 0)  # a0 coeff for y[n] can not be 0
+
+    # Check waveform size
+    assert(len(waveform.size()) == 2)
+    n_channels = waveform.size(0)
+    n_frames = waveform.size(1)
+    n_padded_frames = n_frames + n_order - 1
+
+    # Allocate temporary data structures
+    # First, the input waveform should be padded by the order of the filter
+    #   N.B. Should we look to how we can avoid copying
+    padded_waveform = torch.zeros(n_channels, n_padded_frames, dtype=dtype, device=device)
+    padded_output_waveform = torch.zeros(n_channels, n_padded_frames, dtype=dtype, device=device)
+    padded_waveform[:, (n_order - 1):] = waveform
+
+    # More temporary data structures
+    o0 = torch.zeros(n_channels, 1, dtype=dtype, device=device)
+    normalization_a0 = a_coeffs[0].unsqueeze(0).repeat(n_channels, 1)
+    ones = torch.ones(n_channels, n_padded_frames, dtype=dtype, device=device)
+
+    # Run through assertion size checks
+    assert(normalization_a0.size(0) == n_channels)
+
+    if execution_method == 'element_wise':
+        _lfilter_element_wise(padded_waveform,
+                              padded_output_waveform,
+                              a_coeffs,
+                              b_coeffs,
+                              o0
+                              )
+    elif execution_method == 'matrix':
+        # From [n_order] a_coeffs, create a [n_channel, n_order] tensor
+        # lowest order coefficients e.g. a0, b0 should be at the "bottom"
+        # used for matrix multiply later
+        a_coeffs_filled = a_coeffs.flip(0).repeat(n_channels, 1).t()
+        b_coeffs_filled = b_coeffs.flip(0).repeat(n_channels, 1).t()
+        _lfilter_tensor_matrix(padded_waveform,
+                               padded_output_waveform,
+                               a_coeffs_filled,
+                               b_coeffs_filled,
+                               o0,
+                               normalization_a0,
+                               )
+    else:
+        raise Exception("invalid lfilter execution method %s" % execution_method)
+
+    return torch.min(ones, torch.max(ones * -1, padded_output_waveform))[:, (n_order - 1):]
+
+
 def biquad(waveform, b0, b1, b2, a0, a1, a2):
     # type: (Tensor, float, float, float, float, float, float) -> Tensor
     r"""Performs a biquad filter of input tensor.  Initial conditions set to 0.

From a982c5f48494afb82c5ab71ad94ee4ef3cc2716f Mon Sep 17 00:00:00 2001
From: Chuan Zhang <engineerchuan@gmail.com>
Date: Sat, 21 Sep 2019 17:54:29 -0400
Subject: [PATCH 17/17] rebased and added template for element wise lfilter

---
 test/test_functional_lfilter_perf.py | 11 +++++--
 torchaudio/filtering.cpp             | 43 ++++++++++++++--------------
 torchaudio/functional.py             | 27 +++++++++++------
 3 files changed, 49 insertions(+), 32 deletions(-)

diff --git a/test/test_functional_lfilter_perf.py b/test/test_functional_lfilter_perf.py
index 4d8301a287..ec9d9b2ee4 100644
--- a/test/test_functional_lfilter_perf.py
+++ b/test/test_functional_lfilter_perf.py
@@ -149,21 +149,26 @@ def time_and_output(func):
         print("Ratio Python / CPP ElementWise : %10.2f x" % (run_time_1 / run_time_2))
 
         if assertClose:
-            # maxDeviation = torch.kthvalue(torch.abs(output_waveform_2- output_waveform_1), output_waveform_1.size(1))
-
+            # maxDeviation = torch.kthvalue(torch.abs(output_waveform_2- output_waveform_3), output_waveform_1.size(1))
             assert torch.allclose(output_waveform_1, output_waveform_2, atol=3e-4)
             assert torch.allclose(output_waveform_2, output_waveform_3, atol=3e-4)
             print("PASS - all outputs are identical")
             print("-" * 80)
 
     def test_cpp_lfilter_runs(self):
-        waveform = torch.rand(2, 1000)
+        waveform = torch.rand(2, 1000, dtype=torch.float32)
         b_coeffs = torch.rand(2, dtype=torch.float32)
         a_coeffs = torch.rand(2, dtype=torch.float32)
         a_coeffs[0] = 1
 
+        double_waveform = torch.rand(2, 1000, dtype=torch.float64)
+        double_b_coeffs = torch.rand(5, dtype=torch.float64)
+        double_a_coeffs = torch.rand(5, dtype=torch.float64)
+        double_a_coeffs[0] = 1
+
         output_waveform = lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, 'element_wise')
         output_waveform = lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, 'matrix')
+        output_waveform = lfilter_cpp_impl(double_waveform, double_a_coeffs, double_b_coeffs, 'element_wise')
 
     def test_lfilter_cmp(self):
         """
diff --git a/torchaudio/filtering.cpp b/torchaudio/filtering.cpp
index 16225d4b92..61f1ec8ca0 100644
--- a/torchaudio/filtering.cpp
+++ b/torchaudio/filtering.cpp
@@ -1,7 +1,6 @@
 #include <torch/extension.h>
 // TBD - for CUDA support #include <ATen/cuda/CUDAContext.h>
 // TBD - Compile on CUDA
-// TBD - Expand to other data types outside float32?
 
 namespace torch {
 namespace audio {
@@ -15,14 +14,13 @@ void _lfilter_tensor_matrix(
     at::Tensor & o0,
     at::Tensor const & normalization_a0
   ) {
-    int n_order = a_coeffs_filled.size(0);
-    int n_frames = padded_waveform.size(1) - n_order + 1;
-    int n_channels = padded_waveform.size(0);
+    int64_t n_order = a_coeffs_filled.size(0);
+    int64_t n_frames = padded_waveform.size(1) - n_order + 1;
+    int64_t n_channels = padded_waveform.size(0);
 
     for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) {
-      // calculate the output at time i_frame for all channels
-      o0 = torch::zeros({n_channels, 1},
-                        torch::TensorOptions().dtype(torch::kFloat32));
+      // reset all o0
+      o0.fill_(0.0);
 
       // time window of input and output, size [n_channels, n_order]
       at::Tensor const & input_window =
@@ -55,30 +53,29 @@ void _lfilter_tensor_matrix(
     }
   }
 
-
-
+  template <typename T>
   void _lfilter_element_wise(
     at::Tensor const & padded_waveform,
     at::Tensor & padded_output_waveform,
     at::Tensor const & a_coeffs,
-    at::Tensor const & b_coeffs,
-    at::Tensor & o0
+    at::Tensor const & b_coeffs
   ) {
-    int n_order = a_coeffs.size(0);
-    int n_frames = padded_waveform.size(1) - n_order + 1;
-    int n_channels = padded_waveform.size(0);
+    int64_t n_order = a_coeffs.size(0);
+    int64_t n_frames = padded_waveform.size(1) - n_order + 1;
+    int64_t n_channels = padded_waveform.size(0);
 
     // Create CPU accessors for fast access
     // https://pytorch.org/cppdocs/notes/tensor_basics.html
-    auto input_accessor = padded_waveform.accessor<float, 2>();
-    auto output_accessor = padded_output_waveform.accessor<float, 2>();
-    auto a_coeffs_accessor = a_coeffs.accessor<float, 1>();
-    auto b_coeffs_accessor = b_coeffs.accessor<float, 1>();
+    auto input_accessor = padded_waveform.accessor<T, 2>();
+    auto output_accessor = padded_output_waveform.accessor<T, 2>();
+    auto a_coeffs_accessor = a_coeffs.accessor<T, 1>();
+    auto b_coeffs_accessor = b_coeffs.accessor<T, 1>();
+    T o0;
 
     for (int64_t i_channel = 0; i_channel < n_channels; ++i_channel) {
       for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) {
         // execute the difference equation
-        float o0 = 0;
+        o0 = 0;
         for (int i_offset = 0; i_offset < n_order; ++i_offset) {
           o0 += input_accessor[i_channel][i_frame + i_offset] *
                 b_coeffs_accessor[n_order - i_offset - 1];
@@ -105,7 +102,11 @@ PYBIND11_MODULE(_torch_filtering, m) {
       &torch::audio::_lfilter_tensor_matrix,
       "Executes difference equation with tensor");
   m.def(
-      "_lfilter_element_wise",
-      &torch::audio::_lfilter_element_wise,
+      "_lfilter_element_wise_float",
+      &torch::audio::_lfilter_element_wise<float>,
+      "Executes difference equation with tensor");
+  m.def(
+      "_lfilter_element_wise_double",
+      &torch::audio::_lfilter_element_wise<double>,
       "Executes difference equation with tensor");
 }
diff --git a/torchaudio/functional.py b/torchaudio/functional.py
index c3bf904684..84b2e042ee 100644
--- a/torchaudio/functional.py
+++ b/torchaudio/functional.py
@@ -1,7 +1,7 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 import math
 import torch
-from _torch_filtering import _lfilter_tensor_matrix, _lfilter_element_wise
+from _torch_filtering import _lfilter_tensor_matrix, _lfilter_element_wise_float, _lfilter_element_wise_double
 
 __all__ = [
     "istft",
@@ -599,10 +599,13 @@ def lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, execution_method):
     # Perform sanity checks, input check, and memory allocation in python
 
     # Current limitations to be removed in future
-    assert(waveform.dtype == torch.float32)
+    assert(waveform.dtype == torch.float32 or waveform.dtype == torch.float64)
     assert(waveform.device.type == 'cpu')
+
     assert(waveform.device == a_coeffs.device)
     assert(waveform.device == b_coeffs.device)
+    assert(waveform.dtype == a_coeffs.dtype)
+    assert(waveform.dtype == b_coeffs.dtype)
 
     # Use these parameters for any calculations
     device = waveform.device
@@ -636,12 +639,20 @@ def lfilter_cpp_impl(waveform, a_coeffs, b_coeffs, execution_method):
     assert(normalization_a0.size(0) == n_channels)
 
     if execution_method == 'element_wise':
-        _lfilter_element_wise(padded_waveform,
-                              padded_output_waveform,
-                              a_coeffs,
-                              b_coeffs,
-                              o0
-                              )
+        if (dtype == torch.float32):
+            _lfilter_element_wise_float(padded_waveform,
+                                        padded_output_waveform,
+                                        a_coeffs,
+                                        b_coeffs,
+                                        )
+        elif (dtype == torch.float64):
+            _lfilter_element_wise_double(padded_waveform,
+                                         padded_output_waveform,
+                                         a_coeffs,
+                                         b_coeffs,
+                                         )
+        else:
+            raise Exception("lfilter not supported for type ", dtype)
     elif execution_method == 'matrix':
         # From [n_order] a_coeffs, create a [n_channel, n_order] tensor
         # lowest order coefficients e.g. a0, b0 should be at the "bottom"