From f03cbea66ae618900eabfc498e8e237e84316f19 Mon Sep 17 00:00:00 2001
From: jcaw <toastedjcaw@gmail.com>
Date: Wed, 3 Mar 2021 22:17:04 +0000
Subject: [PATCH 1/2] Change the name of the specgram named `waveform`

`F.sliding_window_cmn` takes a spectrogram as input (of shape
`(..., freq, time)`). However, this spectrogram is named `waveform`.
This appears to be an error, so rename this (and the output tensor) to
reflect that both are spectrograms.
---
 torchaudio/functional/functional.py | 34 ++++++++++++++---------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/torchaudio/functional/functional.py b/torchaudio/functional/functional.py
index 7cab92b7f1..dde5d4eeaf 100644
--- a/torchaudio/functional/functional.py
+++ b/torchaudio/functional/functional.py
@@ -942,7 +942,7 @@ def detect_pitch_frequency(
 
 
 def sliding_window_cmn(
-    waveform: Tensor,
+    specgram: Tensor,
     cmn_window: int = 600,
     min_cmn_window: int = 100,
     center: bool = False,
@@ -952,7 +952,7 @@ def sliding_window_cmn(
     Apply sliding-window cepstral mean (and optionally variance) normalization per utterance.
 
     Args:
-        waveform (Tensor): Tensor of audio of dimension (..., freq, time)
+        specgram (Tensor): Tensor of spectrogram of dimension (..., freq, time)
         cmn_window (int, optional): Window in frames for running average CMN computation (int, default = 600)
         min_cmn_window (int, optional):  Minimum CMN window used at start of decoding (adds latency only at start).
             Only applicable if center == false, ignored if center==true (int, default = 100)
@@ -963,17 +963,17 @@ def sliding_window_cmn(
     Returns:
         Tensor: Tensor of freq of dimension (..., frame)
     """
-    input_shape = waveform.shape
+    input_shape = specgram.shape
     num_frames, num_feats = input_shape[-2:]
-    waveform = waveform.view(-1, num_frames, num_feats)
-    num_channels = waveform.shape[0]
+    specgram = specgram.view(-1, num_frames, num_feats)
+    num_channels = specgram.shape[0]
 
-    dtype = waveform.dtype
-    device = waveform.device
+    dtype = specgram.dtype
+    device = specgram.device
     last_window_start = last_window_end = -1
     cur_sum = torch.zeros(num_channels, num_feats, dtype=dtype, device=device)
     cur_sumsq = torch.zeros(num_channels, num_feats, dtype=dtype, device=device)
-    cmn_waveform = torch.zeros(
+    cmn_specgram = torch.zeros(
         num_channels, num_frames, num_feats, dtype=dtype, device=device)
     for t in range(num_frames):
         window_start = 0
@@ -996,40 +996,40 @@ def sliding_window_cmn(
             if window_start < 0:
                 window_start = 0
         if last_window_start == -1:
-            input_part = waveform[:, window_start: window_end - window_start, :]
+            input_part = specgram[:, window_start: window_end - window_start, :]
             cur_sum += torch.sum(input_part, 1)
             if norm_vars:
                 cur_sumsq += torch.cumsum(input_part ** 2, 1)[:, -1, :]
         else:
             if window_start > last_window_start:
-                frame_to_remove = waveform[:, last_window_start, :]
+                frame_to_remove = specgram[:, last_window_start, :]
                 cur_sum -= frame_to_remove
                 if norm_vars:
                     cur_sumsq -= (frame_to_remove ** 2)
             if window_end > last_window_end:
-                frame_to_add = waveform[:, last_window_end, :]
+                frame_to_add = specgram[:, last_window_end, :]
                 cur_sum += frame_to_add
                 if norm_vars:
                     cur_sumsq += (frame_to_add ** 2)
         window_frames = window_end - window_start
         last_window_start = window_start
         last_window_end = window_end
-        cmn_waveform[:, t, :] = waveform[:, t, :] - cur_sum / window_frames
+        cmn_specgram[:, t, :] = specgram[:, t, :] - cur_sum / window_frames
         if norm_vars:
             if window_frames == 1:
-                cmn_waveform[:, t, :] = torch.zeros(
+                cmn_specgram[:, t, :] = torch.zeros(
                     num_channels, num_feats, dtype=dtype, device=device)
             else:
                 variance = cur_sumsq
                 variance = variance / window_frames
                 variance -= ((cur_sum ** 2) / (window_frames ** 2))
                 variance = torch.pow(variance, -0.5)
-                cmn_waveform[:, t, :] *= variance
+                cmn_specgram[:, t, :] *= variance
 
-    cmn_waveform = cmn_waveform.view(input_shape[:-2] + (num_frames, num_feats))
+    cmn_specgram = cmn_specgram.view(input_shape[:-2] + (num_frames, num_feats))
     if len(input_shape) == 2:
-        cmn_waveform = cmn_waveform.squeeze(0)
-    return cmn_waveform
+        cmn_specgram = cmn_specgram.squeeze(0)
+    return cmn_specgram
 
 
 def spectral_centroid(

From a52e3796487d2f446e86c0a8add652750d2532bb Mon Sep 17 00:00:00 2001
From: jcaw <toastedjcaw@gmail.com>
Date: Wed, 3 Mar 2021 22:19:01 +0000
Subject: [PATCH 2/2] Correct tensor description in docstring

The output tensor of `F.sliding_window_cmn` is also a spectrogram.
Update the description to reflect this.
---
 torchaudio/functional/functional.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchaudio/functional/functional.py b/torchaudio/functional/functional.py
index dde5d4eeaf..1d537b856b 100644
--- a/torchaudio/functional/functional.py
+++ b/torchaudio/functional/functional.py
@@ -961,7 +961,7 @@ def sliding_window_cmn(
         norm_vars (bool, optional): If true, normalize variance to one. (bool, default = false)
 
     Returns:
-        Tensor: Tensor of freq of dimension (..., frame)
+        Tensor: Tensor matching input shape (..., freq, time)
     """
     input_shape = specgram.shape
     num_frames, num_feats = input_shape[-2:]