From 005ced94638e6b72cf65ab0bc073219d9815848d Mon Sep 17 00:00:00 2001
From: Bhargav Kathivarapu <bhargavkathivarapu31@gmail.com>
Date: Tue, 23 Feb 2021 10:29:29 +0000
Subject: [PATCH 1/6] overdrive cpp ext

---
 torchaudio/csrc/CMakeLists.txt     |  1 +
 torchaudio/csrc/overdrive.cpp      | 49 ++++++++++++++++++++++++++++++
 torchaudio/functional/filtering.py | 19 ++++++++++--
 3 files changed, 67 insertions(+), 2 deletions(-)
 create mode 100644 torchaudio/csrc/overdrive.cpp
diff --git a/torchaudio/csrc/CMakeLists.txt b/torchaudio/csrc/CMakeLists.txt
index 1bab67be5a..9d21dc175e 100644
--- a/torchaudio/csrc/CMakeLists.txt
+++ b/torchaudio/csrc/CMakeLists.txt
@@ -11,6 +11,7 @@ set(
   sox/effects_chain.cpp
   sox/types.cpp
   lfilter.cpp
+  overdrive.cpp
   )
 
 if(BUILD_TRANSDUCER)
diff --git a/torchaudio/csrc/overdrive.cpp b/torchaudio/csrc/overdrive.cpp
new file mode 100644
index 0000000000..44f20fe6d6
--- /dev/null
+++ b/torchaudio/csrc/overdrive.cpp
@@ -0,0 +1,49 @@
+#include <torch/script.h>
+
+namespace {
+
+template <typename scalar_t>
+void overdrive_cpu_kernel(
+    at::TensorAccessor<scalar_t, 2> waveform_accessor,
+    at::TensorAccessor<scalar_t, 2> temp_accessor,
+    at::TensorAccessor<scalar_t, 1> last_in_accessor,
+    at::TensorAccessor<scalar_t, 1> last_out_accessor,
+    at::TensorAccessor<scalar_t, 2> output_waveform_accessor) {
+  int64_t n_frames = waveform_accessor.size(1);
+  int64_t n_channels = waveform_accessor.size(0);
+
+  for (int64_t i_channel = 0; i_channel < n_channels; ++i_channel) {
+    for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) {
+      last_out_accessor[i_channel] = temp_accessor[i_channel][i_frame] -
+          last_in_accessor[i_channel] + 0.995 * last_out_accessor[i_channel];
+      last_in_accessor[i_channel] = temp_accessor[i_channel][i_frame];
+      output_waveform_accessor[i_channel][i_frame] =
+          waveform_accessor[i_channel][i_frame] * 0.5 +
+          last_out_accessor[i_channel] * 0.75;
+    }
+  }
+}
+
+void overdrive_core_loop_cpu(
+    at::Tensor& waveform,
+    at::Tensor& temp,
+    at::Tensor& last_in,
+    at::Tensor& last_out,
+    at::Tensor& output_waveform) {
+  AT_DISPATCH_FLOATING_TYPES(waveform.scalar_type(), "overdrive_cpu", ([&] {
+                               overdrive_cpu_kernel<scalar_t>(
+                                   waveform.accessor<scalar_t, 2>(),
+                                   temp.accessor<scalar_t, 2>(),
+                                   last_in.accessor<scalar_t, 1>(),
+                                   last_out.accessor<scalar_t, 1>(),
+                                   output_waveform.accessor<scalar_t, 2>());
+                             }));
+}
+
+} // namespace
+
+// Note: We want to avoid using a "catch-all" kernel.
+// The following registration should be replaced with a CPU-specific registration.
+TORCH_LIBRARY_FRAGMENT(torchaudio, m) {
+  m.def("torchaudio::_overdrive_core_loop", &overdrive_core_loop_cpu);
+}
diff --git a/torchaudio/functional/filtering.py b/torchaudio/functional/filtering.py
index 3c27a0d617..2ccdbe4fe2 100644
--- a/torchaudio/functional/filtering.py
+++ b/torchaudio/functional/filtering.py
@@ -981,8 +981,23 @@ def overdrive(waveform: Tensor, gain: float = 20, colour: float = 20) -> Tensor:
 
     output_waveform = torch.zeros_like(waveform, dtype=dtype, device=device)
 
-    # TODO: Implement a torch CPP extension
-    for i in range(waveform.shape[-1]):
+    # # TODO: Implement a torch CPP extension
+    # for i in range(waveform.shape[-1]):
+    #     last_out = temp[:, i] - last_in + 0.995 * last_out
+    #     last_in = temp[:, i]
+    #     output_waveform[:, i] = waveform[:, i] * 0.5 + last_out * 0.75
+
+
+    try:
+        _overdrive_core_loop_cpu = torch.ops.torchaudio._overdrive_core_loop
+    except RuntimeError as err:
+        assert str(err) == 'No such operator torchaudio::_overdrive_core_loop'
+        
+    
+    if device == torch.device('cpu'):
+        _overdrive_core_loop_cpu(waveform, temp, last_in, last_out, output_waveform)
+    else:
+        for i in range(waveform.shape[-1]):
         last_out = temp[:, i] - last_in + 0.995 * last_out
         last_in = temp[:, i]
         output_waveform[:, i] = waveform[:, i] * 0.5 + last_out * 0.75

From 22878938857a4fa6d1e7bc6600ec0cfeea5e6732 Mon Sep 17 00:00:00 2001
From: Bhargav Kathivarapu <bhargavkathivarapu31@gmail.com>
Date: Tue, 23 Feb 2021 10:44:34 +0000
Subject: [PATCH 2/6] flake8 fix

---
 torchaudio/functional/filtering.py | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/torchaudio/functional/filtering.py b/torchaudio/functional/filtering.py
index 2ccdbe4fe2..e72ba86715 100644
--- a/torchaudio/functional/filtering.py
+++ b/torchaudio/functional/filtering.py
@@ -987,20 +987,18 @@ def overdrive(waveform: Tensor, gain: float = 20, colour: float = 20) -> Tensor:
     #     last_in = temp[:, i]
     #     output_waveform[:, i] = waveform[:, i] * 0.5 + last_out * 0.75
 
-
     try:
         _overdrive_core_loop_cpu = torch.ops.torchaudio._overdrive_core_loop
     except RuntimeError as err:
         assert str(err) == 'No such operator torchaudio::_overdrive_core_loop'
-        
-    
+
     if device == torch.device('cpu'):
         _overdrive_core_loop_cpu(waveform, temp, last_in, last_out, output_waveform)
     else:
         for i in range(waveform.shape[-1]):
-        last_out = temp[:, i] - last_in + 0.995 * last_out
-        last_in = temp[:, i]
-        output_waveform[:, i] = waveform[:, i] * 0.5 + last_out * 0.75
+            last_out = temp[:, i] - last_in + 0.995 * last_out
+            last_in = temp[:, i]
+            output_waveform[:, i] = waveform[:, i] * 0.5 + last_out * 0.75
 
     return output_waveform.clamp(min=-1, max=1).view(actual_shape)
 

From fb893410fdb71a7d96327baeacaccb85217cd8bd Mon Sep 17 00:00:00 2001
From: Bhargav Kathivarapu <bhargavkathivarapu31@gmail.com>
Date: Tue, 23 Feb 2021 11:15:33 +0000
Subject: [PATCH 3/6] JIT issue fix

---
 torchaudio/functional/filtering.py | 30 +++++++++++++++++++++---------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/torchaudio/functional/filtering.py b/torchaudio/functional/filtering.py
index e72ba86715..0b35c194f0 100644
--- a/torchaudio/functional/filtering.py
+++ b/torchaudio/functional/filtering.py
@@ -939,6 +939,26 @@ def lowpass_biquad(
     return biquad(waveform, b0, b1, b2, a0, a1, a2)
 
 
+def _overdrive_core_loop_generic(
+    waveform: Tensor,
+    temp: Tensor,
+    last_in: Tensor,
+    last_out: Tensor,
+    output_waveform: Tensor
+):
+    for i in range(waveform.shape[-1]):
+        last_out = temp[:, i] - last_in + 0.995 * last_out
+        last_in = temp[:, i]
+        output_waveform[:, i] = waveform[:, i] * 0.5 + last_out * 0.75
+
+
+try:
+    _overdrive_core_loop_cpu = torch.ops.torchaudio._overdrive_core_loop
+except RuntimeError as err:
+    assert str(err) == 'No such operator torchaudio::_overdrive_core_loop'
+    _overdrive_core_loop_cpu = _overdrive_core_loop_generic
+
+
 def overdrive(waveform: Tensor, gain: float = 20, colour: float = 20) -> Tensor:
     r"""Apply a overdrive effect to the audio. Similar to SoX implementation.
     This effect applies a non linear distortion to the audio signal.
@@ -987,18 +1007,10 @@ def overdrive(waveform: Tensor, gain: float = 20, colour: float = 20) -> Tensor:
     #     last_in = temp[:, i]
     #     output_waveform[:, i] = waveform[:, i] * 0.5 + last_out * 0.75
 
-    try:
-        _overdrive_core_loop_cpu = torch.ops.torchaudio._overdrive_core_loop
-    except RuntimeError as err:
-        assert str(err) == 'No such operator torchaudio::_overdrive_core_loop'
-
     if device == torch.device('cpu'):
         _overdrive_core_loop_cpu(waveform, temp, last_in, last_out, output_waveform)
     else:
-        for i in range(waveform.shape[-1]):
-            last_out = temp[:, i] - last_in + 0.995 * last_out
-            last_in = temp[:, i]
-            output_waveform[:, i] = waveform[:, i] * 0.5 + last_out * 0.75
+        _overdrive_core_loop_generic(waveform, temp, last_in, last_out, output_waveform)
 
     return output_waveform.clamp(min=-1, max=1).view(actual_shape)
 

From 70ebbe534440e130ea005fb2743bf6d9775da12f Mon Sep 17 00:00:00 2001
From: Bhargav Kathivarapu <bhargavkathivarapu31@gmail.com>
Date: Wed, 24 Feb 2021 10:18:14 +0000
Subject: [PATCH 4/6] Use parallel_for on CPU

---
 torchaudio/csrc/overdrive.cpp      | 21 +++++++++++----------
 torchaudio/functional/filtering.py |  7 +------
 2 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/torchaudio/csrc/overdrive.cpp b/torchaudio/csrc/overdrive.cpp
index 44f20fe6d6..3e44a65977 100644
--- a/torchaudio/csrc/overdrive.cpp
+++ b/torchaudio/csrc/overdrive.cpp
@@ -12,17 +12,18 @@ void overdrive_cpu_kernel(
   int64_t n_frames = waveform_accessor.size(1);
   int64_t n_channels = waveform_accessor.size(0);
 
-  for (int64_t i_channel = 0; i_channel < n_channels; ++i_channel) {
-    for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) {
-      last_out_accessor[i_channel] = temp_accessor[i_channel][i_frame] -
-          last_in_accessor[i_channel] + 0.995 * last_out_accessor[i_channel];
-      last_in_accessor[i_channel] = temp_accessor[i_channel][i_frame];
-      output_waveform_accessor[i_channel][i_frame] =
-          waveform_accessor[i_channel][i_frame] * 0.5 +
-          last_out_accessor[i_channel] * 0.75;
+  at::parallel_for(0, n_channels, 1, [&](int64_t begin, int64_t end) {
+    for (int64_t i_channel = begin; i_channel < end; ++i_channel) {
+      for (int64_t i_frame = 0; i_frame < n_frames; ++i_frame) {
+        last_out_accessor[i_channel] = temp_accessor[i_channel][i_frame] -
+            last_in_accessor[i_channel] + 0.995 * last_out_accessor[i_channel];
+        last_in_accessor[i_channel] = temp_accessor[i_channel][i_frame];
+        output_waveform_accessor[i_channel][i_frame] =
+            waveform_accessor[i_channel][i_frame] * 0.5 +
+            last_out_accessor[i_channel] * 0.75;
+      }
     }
-  }
-}
+  });
 
 void overdrive_core_loop_cpu(
     at::Tensor& waveform,
diff --git a/torchaudio/functional/filtering.py b/torchaudio/functional/filtering.py
index 0b35c194f0..94b98ac7fc 100644
--- a/torchaudio/functional/filtering.py
+++ b/torchaudio/functional/filtering.py
@@ -1001,12 +1001,7 @@ def overdrive(waveform: Tensor, gain: float = 20, colour: float = 20) -> Tensor:
 
     output_waveform = torch.zeros_like(waveform, dtype=dtype, device=device)
 
-    # # TODO: Implement a torch CPP extension
-    # for i in range(waveform.shape[-1]):
-    #     last_out = temp[:, i] - last_in + 0.995 * last_out
-    #     last_in = temp[:, i]
-    #     output_waveform[:, i] = waveform[:, i] * 0.5 + last_out * 0.75
-
+    # Use the CPU-optimized C++ loop, if available, when running on a CPU device
     if device == torch.device('cpu'):
         _overdrive_core_loop_cpu(waveform, temp, last_in, last_out, output_waveform)
     else:

From d05514cfc50a6ea20d6f30294d097e66e0a349e3 Mon Sep 17 00:00:00 2001
From: Bhargav Kathivarapu <bhargavkathivarapu31@gmail.com>
Date: Wed, 24 Feb 2021 10:30:19 +0000
Subject: [PATCH 5/6] Minor fix

---
 torchaudio/csrc/overdrive.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torchaudio/csrc/overdrive.cpp b/torchaudio/csrc/overdrive.cpp
index 3e44a65977..27240a1a51 100644
--- a/torchaudio/csrc/overdrive.cpp
+++ b/torchaudio/csrc/overdrive.cpp
@@ -24,6 +24,7 @@ void overdrive_cpu_kernel(
       }
     }
   });
+}
 
 void overdrive_core_loop_cpu(
     at::Tensor& waveform,

From 136b2669b64849f02a760851e403bf20a5ffc4b9 Mon Sep 17 00:00:00 2001
From: Bhargav Kathivarapu <bhargavkathivarapu31@gmail.com>
Date: Wed, 24 Feb 2021 11:07:42 +0000
Subject: [PATCH 6/6] include torch.h for parallel_for

---
 torchaudio/csrc/overdrive.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torchaudio/csrc/overdrive.cpp b/torchaudio/csrc/overdrive.cpp
index 27240a1a51..4954271e41 100644
--- a/torchaudio/csrc/overdrive.cpp
+++ b/torchaudio/csrc/overdrive.cpp
@@ -1,4 +1,5 @@
 #include <torch/script.h>
+#include <torch/torch.h>
 
 namespace {