From 48016d09699697b17b651acca4272b42997c77cd Mon Sep 17 00:00:00 2001 From: Chin Yun Yu Date: Mon, 7 Jun 2021 10:42:18 +0800 Subject: [PATCH 01/11] draft --- torchaudio/csrc/lfilter.cpp | 40 ++++++++++++++++-------------- torchaudio/functional/filtering.py | 13 +++++++++- 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/torchaudio/csrc/lfilter.cpp b/torchaudio/csrc/lfilter.cpp index dcc5dbe442..3a7b21f389 100644 --- a/torchaudio/csrc/lfilter.cpp +++ b/torchaudio/csrc/lfilter.cpp @@ -8,23 +8,26 @@ void host_lfilter_core_loop( const torch::Tensor& input_signal_windows, const torch::Tensor& a_coeff_flipped, torch::Tensor& padded_output_waveform) { - int64_t n_channel = input_signal_windows.size(0); - int64_t n_samples_input = input_signal_windows.size(1); - int64_t n_samples_output = padded_output_waveform.size(1); - int64_t n_order = a_coeff_flipped.size(0); + int64_t n_batch = input_signal_windows.size(0); + int64_t n_channel = input_signal_windows.size(1); + int64_t n_samples_input = input_signal_windows.size(2); + int64_t n_samples_output = padded_output_waveform.size(2); + int64_t n_order = a_coeff_flipped.size(1); scalar_t* output_data = padded_output_waveform.data_ptr(); const scalar_t* input_data = input_signal_windows.data_ptr(); const scalar_t* a_coeff_flipped_data = a_coeff_flipped.data_ptr(); - for (int64_t i_channel = 0; i_channel < n_channel; i_channel++) { - for (int64_t i_sample = 0; i_sample < n_samples_input; i_sample++) { - int64_t offset_input = i_channel * n_samples_input; - int64_t offset_output = i_channel * n_samples_output; - scalar_t a0 = input_data[offset_input + i_sample]; - for (int64_t i_coeff = 0; i_coeff < n_order; i_coeff++) { - a0 -= output_data[offset_output + i_sample + i_coeff] * - a_coeff_flipped_data[i_coeff]; + for (int64_t i_batch = 0; i_batch < n_batch; i_batch++) { + for (int64_t i_channel = 0; i_channel < n_channel; i_channel++) { + for (int64_t i_sample = 0; i_sample < n_samples_input; i_sample++) { + int64_t offset_input = i_batch * i_channel * n_samples_input; + int64_t offset_output = i_batch * i_channel * n_samples_output; + scalar_t a0 = input_data[offset_input + i_sample]; + for (int64_t i_coeff = 0; i_coeff < n_order; i_coeff++) { + a0 -= output_data[offset_output + i_sample + i_coeff] * + a_coeff_flipped_data[i_coeff + i_channel * n_order]; + } + output_data[offset_output + i_sample + n_order - 1] = a0; } - output_data[offset_output + i_sample + n_order - 1] = a0; } } } @@ -219,19 +222,20 @@ torch::Tensor lfilter_core( const torch::Tensor& b_coeffs) { TORCH_CHECK(waveform.device() == a_coeffs.device()); TORCH_CHECK(b_coeffs.device() == a_coeffs.device()); - TORCH_CHECK(a_coeffs.size(0) == b_coeffs.size(0)); + TORCH_CHECK(a_coeffs.sizes() == b_coeffs.sizes()); - TORCH_INTERNAL_ASSERT(waveform.sizes().size() == 2); + TORCH_INTERNAL_ASSERT(waveform.sizes().size() == 3); + TORCH_INTERNAL_ASSERT(a_coeffs.sizes().size() == 2); - int64_t n_order = b_coeffs.size(0); + int64_t n_order = b_coeffs.size(1); TORCH_INTERNAL_ASSERT(n_order > 0); auto filtered_waveform = - DifferentiableFIR::apply(waveform, b_coeffs / a_coeffs[0]); + DifferentiableFIR::apply(waveform, b_coeffs / a_coeffs[:, :1]); auto output = - DifferentiableIIR::apply(filtered_waveform, a_coeffs / a_coeffs[0]); + DifferentiableIIR::apply(filtered_waveform, a_coeffs / a_coeffs[:, :1]); return output; } diff --git a/torchaudio/functional/filtering.py b/torchaudio/functional/filtering.py index 68269d9b34..0576971487 100644 --- a/torchaudio/functional/filtering.py +++ 
b/torchaudio/functional/filtering.py @@ -951,7 +951,18 @@ def lfilter( """ # pack batch shape = waveform.size() - waveform = waveform.reshape(-1, shape[-1]) + assert a_coeffs.size() == b_coeffs.size() + assert a_coeffs.ndim <= 2 + assert waveform.ndim >= a_coeffs.ndim + + if a_coeffs.ndim > 1: + shape = shape[:-2] + (a_coeffs.shape[0], shape[-1]) + waveform = torch.broadcast_to(waveform, shape) + waveform = waveform.reshape(-1, shape[-2], shape[-1]) + else: + waveform = waveform.reshape(-1, 1, shape[-1]) + a_coeffs = a_coeffs.unsqueeze(0) + b_coeffs = b_coeffs.unsqueeze(0) output = _lfilter(waveform, a_coeffs, b_coeffs) From de4e385129881b2d23077f531c24f5c356cefa98 Mon Sep 17 00:00:00 2001 From: Chin Yun Yu Date: Mon, 7 Jun 2021 18:28:56 +0800 Subject: [PATCH 02/11] draft --- torchaudio/csrc/lfilter.cpp | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/torchaudio/csrc/lfilter.cpp b/torchaudio/csrc/lfilter.cpp index 3a7b21f389..5318d28de4 100644 --- a/torchaudio/csrc/lfilter.cpp +++ b/torchaudio/csrc/lfilter.cpp @@ -54,10 +54,11 @@ void cpu_lfilter_core_loop( padded_output_waveform.dtype() == torch::kFloat64)); TORCH_CHECK(input_signal_windows.size(0) == padded_output_waveform.size(0)); + TORCH_CHECK(input_signal_windows.size(1) == padded_output_waveform.size(1)); TORCH_CHECK( - input_signal_windows.size(1) + a_coeff_flipped.size(0) - 1 == - padded_output_waveform.size(1)); + input_signal_windows.size(2) + a_coeff_flipped.size(1) - 1 == + padded_output_waveform.size(2)); AT_DISPATCH_FLOATING_TYPES( input_signal_windows.scalar_type(), "lfilter_core_loop", [&] { @@ -70,16 +71,26 @@ void lfilter_core_generic_loop( const torch::Tensor& input_signal_windows, const torch::Tensor& a_coeff_flipped, torch::Tensor& padded_output_waveform) { - int64_t n_samples_input = input_signal_windows.size(1); - int64_t n_order = a_coeff_flipped.size(0); + int64_t n_samples_input = input_signal_windows.size(2); + int64_t n_order = a_coeff_flipped.size(1); + auto coeff = a_coeff_flipped.unsqueeze(2); for (int64_t i_sample = 0; i_sample < n_samples_input; i_sample++) { - auto windowed_output_signal = padded_output_waveform.index( - {torch::indexing::Slice(), - torch::indexing::Slice(i_sample, i_sample + n_order)}); - auto o0 = input_signal_windows.index({torch::indexing::Slice(), i_sample}) - .addmv(windowed_output_signal, a_coeff_flipped, 1, -1); + auto windowed_output_signal = + padded_output_waveform + .index( + {torch::indexing::Slice(), + torch::indexing::Slice(), + torch::indexing::Slice(i_sample, i_sample + n_order)}) + .transpose(0, 1); + auto o0 = + input_signal_windows.index( + {torch::indexing::Slice(), torch::indexing::Slice(), i_sample}) - + torch.matmul(windowed_output_signal, coeff).squeeze(2).transpose(0, 1); padded_output_waveform.index_put_( - {torch::indexing::Slice(), i_sample + n_order - 1}, o0); + {torch::indexing::Slice(), + torch::indexing::Slice(), + i_sample + n_order - 1}, + o0); } } From 448483adc64f33cb70b550537e76b254151461cc Mon Sep 17 00:00:00 2001 From: Chin Yun Yu Date: Tue, 8 Jun 2021 13:40:48 +0800 Subject: [PATCH 03/11] fix iir wrong index --- torchaudio/csrc/lfilter.cpp | 93 +++++++++++++++++++++---------------- 1 file changed, 54 insertions(+), 39 deletions(-) diff --git a/torchaudio/csrc/lfilter.cpp b/torchaudio/csrc/lfilter.cpp index 5318d28de4..4aeef27453 100644 --- a/torchaudio/csrc/lfilter.cpp +++ b/torchaudio/csrc/lfilter.cpp @@ -19,8 +19,10 @@ void host_lfilter_core_loop( for (int64_t i_batch = 0; i_batch < n_batch; 
i_batch++) { for (int64_t i_channel = 0; i_channel < n_channel; i_channel++) { for (int64_t i_sample = 0; i_sample < n_samples_input; i_sample++) { - int64_t offset_input = i_batch * i_channel * n_samples_input; - int64_t offset_output = i_batch * i_channel * n_samples_output; + int64_t offset_input = + ((i_batch * n_channel) + i_channel) * n_samples_input; + int64_t offset_output = + ((i_batch * n_channel) + i_channel) * n_samples_output; scalar_t a0 = input_data[offset_input + i_sample]; for (int64_t i_coeff = 0; i_coeff < n_order; i_coeff++) { a0 -= output_data[offset_output + i_sample + i_coeff] * @@ -85,7 +87,7 @@ void lfilter_core_generic_loop( auto o0 = input_signal_windows.index( {torch::indexing::Slice(), torch::indexing::Slice(), i_sample}) - - torch.matmul(windowed_output_signal, coeff).squeeze(2).transpose(0, 1); + at::matmul(windowed_output_signal, coeff).squeeze(2).transpose(0, 1); padded_output_waveform.index_put_( {torch::indexing::Slice(), torch::indexing::Slice(), @@ -102,16 +104,17 @@ class DifferentiableIIR : public torch::autograd::Function { const torch::Tensor& a_coeffs_normalized) { auto device = waveform.device(); auto dtype = waveform.dtype(); - int64_t n_channel = waveform.size(0); - int64_t n_sample = waveform.size(1); - int64_t n_order = a_coeffs_normalized.size(0); + int64_t n_batch = waveform.size(0); + int64_t n_channel = waveform.size(1); + int64_t n_sample = waveform.size(2); + int64_t n_order = a_coeffs_normalized.size(1); int64_t n_sample_padded = n_sample + n_order - 1; - auto a_coeff_flipped = a_coeffs_normalized.flip(0).contiguous(); + auto a_coeff_flipped = a_coeffs_normalized.flip(1).contiguous(); auto options = torch::TensorOptions().dtype(dtype).device(device); auto padded_output_waveform = - torch::zeros({n_channel, n_sample_padded}, options); + torch::zeros({n_batch, n_channel, n_sample_padded}, options); if (device.is_cpu()) { cpu_lfilter_core_loop(waveform, a_coeff_flipped, padded_output_waveform); @@ -122,6 +125,7 @@ class DifferentiableIIR : public torch::autograd::Function { auto output = padded_output_waveform.index( {torch::indexing::Slice(), + torch::indexing::Slice(), torch::indexing::Slice(n_order - 1, torch::indexing::None)}); ctx->save_for_backward({waveform, a_coeffs_normalized, output}); @@ -136,8 +140,9 @@ class DifferentiableIIR : public torch::autograd::Function { auto a_coeffs_normalized = saved[1]; auto y = saved[2]; - int64_t n_channel = x.size(0); - int64_t n_order = a_coeffs_normalized.size(0); + int64_t n_batch = x.size(0); + int64_t n_channel = x.size(1); + int64_t n_order = a_coeffs_normalized.size(1); auto dx = torch::Tensor(); auto da = torch::Tensor(); @@ -151,16 +156,16 @@ class DifferentiableIIR : public torch::autograd::Function { F::PadFuncOptions({n_order - 1, 0})); da = F::conv1d( - dyda.unsqueeze(0), - dy.unsqueeze(1), - F::Conv1dFuncOptions().groups(n_channel)) - .sum(1) - .squeeze(0) - .flip(0); + dyda.view({1, n_batch * n_channel, -1}), + dy.view({n_batch * n_channel, 1, -1}), + F::Conv1dFuncOptions().groups(n_batch * n_channel)) + .view({n_batch, n_channel, -1}) + .sum(0) + .flip(1); } if (x.requires_grad()) { - dx = DifferentiableIIR::apply(dy.flip(1), a_coeffs_normalized).flip(1); + dx = DifferentiableIIR::apply(dy.flip(2), a_coeffs_normalized).flip(2); } return {dx, da}; @@ -173,17 +178,18 @@ class DifferentiableFIR : public torch::autograd::Function { torch::autograd::AutogradContext* ctx, const torch::Tensor& waveform, const torch::Tensor& b_coeffs) { - int64_t n_order = b_coeffs.size(0); + int64_t 
n_order = b_coeffs.size(1); + int64_t n_channel = b_coeffs.size(0); namespace F = torch::nn::functional; - auto b_coeff_flipped = b_coeffs.flip(0).contiguous(); + auto b_coeff_flipped = b_coeffs.flip(1).contiguous(); auto padded_waveform = F::pad(waveform, F::PadFuncOptions({n_order - 1, 0})); - auto output = - F::conv1d( - padded_waveform.unsqueeze(1), b_coeff_flipped.view({1, 1, n_order})) - .squeeze(1); + auto output = F::conv1d( + padded_waveform, + b_coeff_flipped.unsqueeze(1), + F::Conv1dFuncOptions().groups(n_channel)); ctx->save_for_backward({waveform, b_coeffs, output}); return output; @@ -197,8 +203,9 @@ class DifferentiableFIR : public torch::autograd::Function { auto b_coeffs = saved[1]; auto y = saved[2]; - int64_t n_channel = x.size(0); - int64_t n_order = b_coeffs.size(0); + int64_t n_batch = x.size(0); + int64_t n_channel = x.size(1); + int64_t n_order = b_coeffs.size(1); auto dx = torch::Tensor(); auto db = torch::Tensor(); @@ -208,19 +215,20 @@ class DifferentiableFIR : public torch::autograd::Function { if (b_coeffs.requires_grad()) { db = F::conv1d( - F::pad(x.unsqueeze(0), F::PadFuncOptions({n_order - 1, 0})), - dy.unsqueeze(1), - F::Conv1dFuncOptions().groups(n_channel)) - .sum(1) - .squeeze(0) - .flip(0); + F::pad(x, F::PadFuncOptions({n_order - 1, 0})) + .view({1, n_batch * n_channel, -1}), + dy.view({n_batch * n_channel, 1, -1}), + F::Conv1dFuncOptions().groups(n_batch * n_channel)) + .view({n_batch, n_channel, -1}) + .sum(0) + .flip(1); } if (x.requires_grad()) { dx = F::conv1d( - F::pad(dy.unsqueeze(1), F::PadFuncOptions({0, n_order - 1})), - b_coeffs.view({1, 1, n_order})) - .squeeze(1); + F::pad(dy, F::PadFuncOptions({0, n_order - 1})), + b_coeffs.unsqueeze(1), + F::Conv1dFuncOptions().groups(n_channel)); } return {dx, db}; @@ -237,16 +245,23 @@ torch::Tensor lfilter_core( TORCH_INTERNAL_ASSERT(waveform.sizes().size() == 3); TORCH_INTERNAL_ASSERT(a_coeffs.sizes().size() == 2); + TORCH_INTERNAL_ASSERT(a_coeffs.size(0) == waveform.size(1)); int64_t n_order = b_coeffs.size(1); TORCH_INTERNAL_ASSERT(n_order > 0); - auto filtered_waveform = - DifferentiableFIR::apply(waveform, b_coeffs / a_coeffs[:, :1]); - - auto output = - DifferentiableIIR::apply(filtered_waveform, a_coeffs / a_coeffs[:, :1]); + auto filtered_waveform = DifferentiableFIR::apply( + waveform, + b_coeffs / + a_coeffs.index( + {torch::indexing::Slice(), torch::indexing::Slice(0, 1)})); + + auto output = DifferentiableIIR::apply( + filtered_waveform, + a_coeffs / + a_coeffs.index( + {torch::indexing::Slice(), torch::indexing::Slice(0, 1)})); return output; } From c2844e8af8e6b59044484df623b1cbb6d8a3b490 Mon Sep 17 00:00:00 2001 From: Chin Yun Yu Date: Tue, 8 Jun 2021 14:34:22 +0800 Subject: [PATCH 04/11] add tests --- .../functional/autograd_impl.py | 18 ++++++++++++++++++ .../functional/functional_impl.py | 14 ++++++++++++++ torchaudio/functional/filtering.py | 3 ++- 3 files changed, 34 insertions(+), 1 deletion(-) diff --git a/test/torchaudio_unittest/functional/autograd_impl.py b/test/torchaudio_unittest/functional/autograd_impl.py index 049ae25c90..dc1bf9547d 100644 --- a/test/torchaudio_unittest/functional/autograd_impl.py +++ b/test/torchaudio_unittest/functional/autograd_impl.py @@ -59,6 +59,24 @@ def test_lfilter_all_inputs(self): b = torch.tensor([0.4, 0.2, 0.9]) self.assert_grad(F.lfilter, (x, a, b)) + def test_batch_lfilter(self): + torch.random.manual_seed(2434) + x = get_whitenoise(sample_rate=22050, duration=0.01, n_channels=2) + a = torch.tensor([[0.7, 0.2, 0.6], + [0.8, 0.2, 0.9]]) + b 
= torch.tensor([[0.4, 0.2, 0.9], + [0.7, 0.2, 0.6]]) + self.assert_grad(F.lfilter, (x, a, b)) + + def test_filter_banks_lfilter(self): + torch.random.manual_seed(2434) + x = get_whitenoise(sample_rate=22050, duration=0.01, n_channels=2).unsqueeze(1) + a = torch.tensor([[0.7, 0.2, 0.6], + [0.8, 0.2, 0.9]]) + b = torch.tensor([[0.4, 0.2, 0.9], + [0.7, 0.2, 0.6]]) + self.assert_grad(F.lfilter, (x, a, b)) + def test_biquad(self): torch.random.manual_seed(2434) x = get_whitenoise(sample_rate=22050, duration=0.01, n_channels=1) diff --git a/test/torchaudio_unittest/functional/functional_impl.py b/test/torchaudio_unittest/functional/functional_impl.py index 518a9fa6f6..48d83267d3 100644 --- a/test/torchaudio_unittest/functional/functional_impl.py +++ b/test/torchaudio_unittest/functional/functional_impl.py @@ -80,6 +80,20 @@ def test_lfilter_shape(self, shape): output_waveform = F.lfilter(waveform, a_coeffs, b_coeffs) assert shape == waveform.size() == output_waveform.size() + @parameterized.expand([ + ((44100,), (2, 3), (2, 44100)), + ((3, 44100), (1, 3), (3, 44100)), + ((3, 44100), (3, 3), (3, 44100)), + ((1, 2, 1, 44100), (3, 3), (1, 2, 3, 44100)) + ]) + def test_lfilter_broadcast_shape(self, input_shape, coeff_shape, target_shape): + torch.random.manual_seed(42) + waveform = torch.rand(*input_shape, dtype=self.dtype, device=self.device) + b_coeffs = torch.rand(*coeff_shape, dtype=self.dtype, device=self.device) + a_coeffs = torch.rand(*coeff_shape, dtype=self.dtype, device=self.device) + output_waveform = F.lfilter(waveform, a_coeffs, b_coeffs) + assert target_shape == output_waveform.size() + def test_lfilter_9th_order_filter_stability(self): """ Validate the precision of lfilter against reference scipy implementation when using high order filter. diff --git a/torchaudio/functional/filtering.py b/torchaudio/functional/filtering.py index 0576971487..7736d94dd4 100644 --- a/torchaudio/functional/filtering.py +++ b/torchaudio/functional/filtering.py @@ -953,7 +953,8 @@ def lfilter( shape = waveform.size() assert a_coeffs.size() == b_coeffs.size() assert a_coeffs.ndim <= 2 - assert waveform.ndim >= a_coeffs.ndim + + a_coeffs, b_coeffs = a_coeffs.squeeze(), b_coeffs.squeeze() if a_coeffs.ndim > 1: shape = shape[:-2] + (a_coeffs.shape[0], shape[-1]) From 6d814711007f9a81164aaec5e805063dc8ba168d Mon Sep 17 00:00:00 2001 From: Chin Yun Yu Date: Tue, 8 Jun 2021 18:17:22 +0800 Subject: [PATCH 05/11] modified python part --- torchaudio/functional/filtering.py | 36 ++++++++++++++++-------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/torchaudio/functional/filtering.py b/torchaudio/functional/filtering.py index 7736d94dd4..c56e36e15b 100644 --- a/torchaudio/functional/filtering.py +++ b/torchaudio/functional/filtering.py @@ -857,13 +857,14 @@ def highpass_biquad( def _lfilter_core_generic_loop(input_signal_windows: Tensor, a_coeffs_flipped: Tensor, padded_output_waveform: Tensor): - n_order = a_coeffs_flipped.size(0) - for i_sample, o0 in enumerate(input_signal_windows.t()): + n_order = a_coeffs_flipped.size(1) + a_coeffs_flipped = a_coeffs_flipped.unsqueeze(2) + for i_sample, o0 in enumerate(input_signal_windows.permute(2, 0, 1)): windowed_output_signal = padded_output_waveform[ - :, i_sample:i_sample + n_order + :, :, i_sample:i_sample + n_order ] - o0.addmv_(windowed_output_signal, a_coeffs_flipped, alpha=-1) - padded_output_waveform[:, i_sample + n_order - 1] = o0 + o0 -= (windowed_output_signal.transpose(0, 1) @ a_coeffs_flipped)[..., 0].t() + padded_output_waveform[:, :, 
i_sample + n_order - 1] = o0 try: @@ -879,13 +880,13 @@ def _lfilter_core( b_coeffs: Tensor, ) -> Tensor: - assert a_coeffs.size(0) == b_coeffs.size(0) - assert len(waveform.size()) == 2 + assert a_coeffs.size() == b_coeffs.size() + assert len(waveform.size()) == 3 assert waveform.device == a_coeffs.device assert b_coeffs.device == a_coeffs.device - n_channel, n_sample = waveform.size() - n_order = a_coeffs.size(0) + n_batch, n_channel, n_sample = waveform.size() + n_order = a_coeffs.size(1) assert n_order > 0 # Pad the input and create output @@ -895,17 +896,18 @@ def _lfilter_core( # Set up the coefficients matrix # Flip coefficients' order - a_coeffs_flipped = a_coeffs.flip(0) - b_coeffs_flipped = b_coeffs.flip(0) + a_coeffs_flipped = a_coeffs.flip(1) + b_coeffs_flipped = b_coeffs.flip(1) # calculate windowed_input_signal in parallel using convolution input_signal_windows = torch.nn.functional.conv1d( - padded_waveform.unsqueeze(1), - b_coeffs_flipped.view(1, 1, -1) - ).squeeze(1) + padded_waveform, + b_coeffs_flipped.unsqueeze(1), + groups=n_channel + ) - input_signal_windows.div_(a_coeffs[0]) - a_coeffs_flipped.div_(a_coeffs[0]) + input_signal_windows.div_(a_coeffs[:, :1]) + a_coeffs_flipped.div_(a_coeffs[:, :1]) if input_signal_windows.device == torch.device('cpu') and\ a_coeffs_flipped.device == torch.device('cpu') and\ @@ -914,7 +916,7 @@ def _lfilter_core( else: _lfilter_core_generic_loop(input_signal_windows, a_coeffs_flipped, padded_output_waveform) - output = padded_output_waveform[:, n_order - 1:] + output = padded_output_waveform[:, :, n_order - 1:] return output try: From d53c3d9d7aeaf6f02d7f49f14e844fed5679e5f5 Mon Sep 17 00:00:00 2001 From: Chin Yun Yu Date: Tue, 8 Jun 2021 18:29:23 +0800 Subject: [PATCH 06/11] update docstrings --- torchaudio/functional/filtering.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/torchaudio/functional/filtering.py b/torchaudio/functional/filtering.py index c56e36e15b..cf8cd54ea0 100644 --- a/torchaudio/functional/filtering.py +++ b/torchaudio/functional/filtering.py @@ -940,16 +940,18 @@ def lfilter( Args: waveform (Tensor): audio waveform of dimension of ``(..., time)``. Must be normalized to -1 to 1. - a_coeffs (Tensor): denominator coefficients of difference equation of dimension of ``(n_order + 1)``. + a_coeffs (Tensor): denominator coefficients of difference equation of dimension of ``(*, n_order + 1)``. + Where * is the optional number of filter banks, and must be broadcastable to ``waveform`` except time dimension. Lower delays coefficients are first, e.g. ``[a0, a1, a2, ...]``. Must be same size as b_coeffs (pad with 0's as necessary). - b_coeffs (Tensor): numerator coefficients of difference equation of dimension of ``(n_order + 1)``. + b_coeffs (Tensor): numerator coefficients of difference equation of dimension of ``(*, n_order + 1)``. + Where * is the optional number of filter banks, and must be broadcastable to ``waveform`` except time dimension. Lower delays coefficients are first, e.g. ``[b0, b1, b2, ...]``. Must be same size as a_coeffs (pad with 0's as necessary). clamp (bool, optional): If ``True``, clamp the output signal to be in the range [-1, 1] (Default: ``True``) Returns: - Tensor: Waveform with dimension of ``(..., time)``. + Tensor: Waveform with dimension of ``(..., *, time)``. 
""" # pack batch shape = waveform.size() From 93cabef0d424b046aa15d7065f82bab5452d0482 Mon Sep 17 00:00:00 2001 From: Chin Yun Yu Date: Tue, 8 Jun 2021 19:56:55 +0800 Subject: [PATCH 07/11] fix python style --- torchaudio/functional/filtering.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/torchaudio/functional/filtering.py b/torchaudio/functional/filtering.py index cf8cd54ea0..548c556c24 100644 --- a/torchaudio/functional/filtering.py +++ b/torchaudio/functional/filtering.py @@ -941,11 +941,13 @@ def lfilter( Args: waveform (Tensor): audio waveform of dimension of ``(..., time)``. Must be normalized to -1 to 1. a_coeffs (Tensor): denominator coefficients of difference equation of dimension of ``(*, n_order + 1)``. - Where * is the optional number of filter banks, and must be broadcastable to ``waveform`` except time dimension. + Where * is the optional number of filter banks, + and must be broadcastable to ``waveform`` except time dimension. Lower delays coefficients are first, e.g. ``[a0, a1, a2, ...]``. Must be same size as b_coeffs (pad with 0's as necessary). b_coeffs (Tensor): numerator coefficients of difference equation of dimension of ``(*, n_order + 1)``. - Where * is the optional number of filter banks, and must be broadcastable to ``waveform`` except time dimension. + Where * is the optional number of filter banks, + and must be broadcastable to ``waveform`` except time dimension. Lower delays coefficients are first, e.g. ``[b0, b1, b2, ...]``. Must be same size as a_coeffs (pad with 0's as necessary). clamp (bool, optional): If ``True``, clamp the output signal to be in the range [-1, 1] (Default: ``True``) From 17ada5a9bca5246a42a293b13ffdf5a1a33bf82c Mon Sep 17 00:00:00 2001 From: Chin Yun Yu Date: Tue, 8 Jun 2021 19:59:34 +0800 Subject: [PATCH 08/11] remove trailing white space --- torchaudio/functional/filtering.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchaudio/functional/filtering.py b/torchaudio/functional/filtering.py index 548c556c24..ee58ea4bc1 100644 --- a/torchaudio/functional/filtering.py +++ b/torchaudio/functional/filtering.py @@ -941,12 +941,12 @@ def lfilter( Args: waveform (Tensor): audio waveform of dimension of ``(..., time)``. Must be normalized to -1 to 1. a_coeffs (Tensor): denominator coefficients of difference equation of dimension of ``(*, n_order + 1)``. - Where * is the optional number of filter banks, + Where * is the optional number of filter banks, and must be broadcastable to ``waveform`` except time dimension. Lower delays coefficients are first, e.g. ``[a0, a1, a2, ...]``. Must be same size as b_coeffs (pad with 0's as necessary). b_coeffs (Tensor): numerator coefficients of difference equation of dimension of ``(*, n_order + 1)``. - Where * is the optional number of filter banks, + Where * is the optional number of filter banks, and must be broadcastable to ``waveform`` except time dimension. Lower delays coefficients are first, e.g. ``[b0, b1, b2, ...]``. Must be same size as a_coeffs (pad with 0's as necessary). 
From 18a7d7185c18c241427ec06fad32001df167fadd Mon Sep 17 00:00:00 2001 From: Chin Yun Yu Date: Wed, 9 Jun 2021 21:51:46 +0800 Subject: [PATCH 09/11] test: add batch consistency test --- .../functional/batch_consistency_test.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/test/torchaudio_unittest/functional/batch_consistency_test.py b/test/torchaudio_unittest/functional/batch_consistency_test.py index bd91103b15..21e8226d86 100644 --- a/test/torchaudio_unittest/functional/batch_consistency_test.py +++ b/test/torchaudio_unittest/functional/batch_consistency_test.py @@ -217,3 +217,18 @@ def test_compute_kaldi_pitch(self): batch = waveform.view(self.batch_size, n_channels, waveform.size(-1)) self.assert_batch_consistency( F.compute_kaldi_pitch, batch, sample_rate=sample_rate) + + def test_lfilter(self): + signal_length = 2048 + torch.manual_seed(2434) + x = torch.randn(self.batch_size, signal_length) + a = torch.rand(self.batch_size, 3) + b = torch.rand(self.batch_size, 3) + + batchwise_output = F.lfilter(x, a, b) + itemwise_output = torch.stack([ + F.lfilter(x[i], a[i], b[i]) + for i in range(self.batch_size) + ]) + + self.assertEqual(batchwise_output, itemwise_output) From 9641c1b23982e5d64e9cfd9133271e6775d248a4 Mon Sep 17 00:00:00 2001 From: Chin Yun Yu Date: Thu, 10 Jun 2021 22:25:32 +0800 Subject: [PATCH 10/11] test: add new batch consistency test --- .../functional/batch_consistency_test.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/test/torchaudio_unittest/functional/batch_consistency_test.py b/test/torchaudio_unittest/functional/batch_consistency_test.py index 21e8226d86..84b80f1fdc 100644 --- a/test/torchaudio_unittest/functional/batch_consistency_test.py +++ b/test/torchaudio_unittest/functional/batch_consistency_test.py @@ -218,7 +218,7 @@ def test_compute_kaldi_pitch(self): self.assert_batch_consistency( F.compute_kaldi_pitch, batch, sample_rate=sample_rate) - def test_lfilter(self): + def test_lfilter_separated_filters(self): signal_length = 2048 torch.manual_seed(2434) x = torch.randn(self.batch_size, signal_length) @@ -232,3 +232,12 @@ def test_lfilter(self): ]) self.assertEqual(batchwise_output, itemwise_output) + + def test_lfilter(self): + signal_length = 2048 + torch.manual_seed(2434) + x = torch.randn(self.batch_size, 1, signal_length) + a = torch.rand(4, 3) + b = torch.rand(4, 3) + + self.assert_batch_consistency(F.lfilter, x, a, b) From 29602744c56441bce3bdf0a1e816a392b8de9d17 Mon Sep 17 00:00:00 2001 From: Chin Yun Yu Date: Thu, 10 Jun 2021 23:03:48 +0800 Subject: [PATCH 11/11] test: batch behavior of filter coefficients --- .../functional/batch_consistency_test.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/test/torchaudio_unittest/functional/batch_consistency_test.py b/test/torchaudio_unittest/functional/batch_consistency_test.py index 84b80f1fdc..b9770b22af 100644 --- a/test/torchaudio_unittest/functional/batch_consistency_test.py +++ b/test/torchaudio_unittest/functional/batch_consistency_test.py @@ -236,8 +236,12 @@ def test_lfilter_separated_filters(self): def test_lfilter(self): signal_length = 2048 torch.manual_seed(2434) - x = torch.randn(self.batch_size, 1, signal_length) + x = torch.randn(signal_length) a = torch.rand(4, 3) b = torch.rand(4, 3) - self.assert_batch_consistency(F.lfilter, x, a, b) + def filter_wrapper(ab_coeffs, waveform): + a, b = ab_coeffs[..., 0, :], ab_coeffs[..., 1, :] + return F.lfilter(waveform, a, b) + + self.assert_batch_consistency(filter_wrapper, 
torch.stack([a, b], 1), x)
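
Usage sketch: a minimal illustration of the filter-bank lfilter interface this series introduces, where a_coeffs/b_coeffs may be 2-D tensors of shape (num_filters, n_order + 1) and the waveform is broadcast against the filter-bank dimension. The shapes mirror the first case of test_lfilter_broadcast_shape above; the coefficient values are arbitrary placeholders, not taken from the patches.

    import torch
    import torchaudio.functional as F

    waveform = torch.randn(44100)                    # (..., time), here 1-D
    a_coeffs = torch.tensor([[1.0, -0.95, 0.00],     # (num_filters, n_order + 1)
                             [1.0,  0.00, 0.25]])
    b_coeffs = torch.tensor([[0.05, 0.00, 0.00],
                             [0.25, 0.50, 0.25]])

    # With 2-D coefficients, one filter is applied per bank and the input is
    # broadcast, so a 1-D waveform filtered by 2 filters yields (2, 44100).
    out = F.lfilter(waveform, a_coeffs, b_coeffs)
    print(out.shape)  # torch.Size([2, 44100])

Both coefficient tensors must have the same shape, and gradients flow through the waveform as well as both coefficient tensors (see test_batch_lfilter and test_filter_banks_lfilter in the series).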