initial commit and tests

bshall · bshall · commit d131232f8580 · 2022-06-13T14:35:59.000+02:00
diff --git a/test/torchaudio_unittest/functional/functional_impl.py b/test/torchaudio_unittest/functional/functional_impl.py
@@ -10,6 +10,8 @@
 from scipy import signal
 from torchaudio_unittest.common_utils import (
     beamform_utils,
+    get_asset_path,
+    load_wav,
     get_sinusoid,
     get_whitenoise,
     nested_params,
diff --git a/test/torchaudio_unittest/functional/loudness_compliance_cpu_test.py b/test/torchaudio_unittest/functional/loudness_compliance_cpu_test.py
@@ -0,0 +1,8 @@
+import torch
+from torchaudio_unittest import common_utils
+
+from .loudness_compliance_test_impl import Loudness
+
+
+class TestLoudnessCPU(Loudness, common_utils.PytorchTestCase):  # shared Loudness tests, on CPU
+    device = torch.device("cpu")  # read by the Loudness tests as ``self.device``
diff --git a/test/torchaudio_unittest/functional/loudness_compliance_cuda_test.py b/test/torchaudio_unittest/functional/loudness_compliance_cuda_test.py
@@ -0,0 +1,9 @@
+import torch
+from torchaudio_unittest import common_utils
+
+from .loudness_compliance_test_impl import Loudness
+
+
+@common_utils.skipIfNoCuda  # presumably skips the class when CUDA is unavailable -- confirm
+class TestLoudnessCUDA(Loudness, common_utils.PytorchTestCase):  # shared Loudness tests, on GPU
+    device = torch.device("cuda")  # read by the Loudness tests as ``self.device``
diff --git a/test/torchaudio_unittest/functional/loudness_compliance_test_impl.py b/test/torchaudio_unittest/functional/loudness_compliance_test_impl.py
@@ -0,0 +1,64 @@
+"""Test suite for compliance with the ITU-R BS.1770-4 recommendation"""
+import torch
+import os.path
+import zipfile
+import torchaudio.functional as F
+from torchaudio_unittest.common_utils import (
+    load_wav,
+    TempDirMixin,
+    TestBaseMixin,
+)
+
+# Test files linked in https://www.itu.int/dms_pub/itu-r/opb/rep/R-REP-BS.2217-2-2016-PDF-E.pdf
+_COMPLIANCE_FILE_URLS = {
+    "1770-2_Comp_RelGateTest": "http://www.itu.int/dms_pub/itu-r/oth/11/02/R11020000010030ZIPM.zip",
+    "1770-2_Comp_AbsGateTest": "http://www.itu.int/dms_pub/itu-r/oth/11/02/R11020000010029ZIPM.zip",
+    "1770-2_Comp_24LKFS_500Hz_2ch": "http://www.itu.int/dms_pub/itu-r/oth/11/02/R11020000010018ZIPM.zip",
+    "1770-2 Conf Mono Voice+Music-24LKFS": "http://www.itu.int/dms_pub/itu-r/oth/11/02/R11020000010038ZIPM.zip",
+}
+
+
+class Loudness(TempDirMixin, TestBaseMixin):
+    """Check ``F.measure_loudness`` against the reference recordings listed above.
+
+    Every test downloads one archive from ``_COMPLIANCE_FILE_URLS``, so network
+    access is required. Subclasses are expected to provide ``device``.
+    """
+
+    def download_and_extract_file(self, filename):
+        """Download the archive registered under ``filename`` and unpack it.
+
+        Args:
+            filename (str): key of ``_COMPLIANCE_FILE_URLS``.
+
+        Returns:
+            str: temporary path of ``<filename>.wav``. The archive is assumed to
+            contain a member with exactly that name.
+        """
+        zippath = self.get_temp_path(filename + ".zip")
+        torch.hub.download_url_to_file(_COMPLIANCE_FILE_URLS[filename], zippath, progress=False)
+        with zipfile.ZipFile(zippath) as file:
+            file.extractall(os.path.dirname(zippath))
+        return self.get_temp_path(filename + ".wav")
+
+    def _assert_loudness(self, filename, expected_lkfs):
+        """Measure the loudness of one compliance file and compare it with ``expected_lkfs``."""
+        filepath = self.download_and_extract_file(filename)
+        waveform, sample_rate = load_wav(filepath)
+        waveform = waveform.to(self.device)
+
+        loudness = F.measure_loudness(waveform, sample_rate)
+        expected = torch.tensor(expected_lkfs, dtype=loudness.dtype, device=self.device)
+        self.assertEqual(loudness, expected, rtol=0.01, atol=0.1)
+
+    def test_measure_loudness_relative_gate(self):
+        self._assert_loudness("1770-2_Comp_RelGateTest", -10.0)
+
+    def test_measure_loudness_absolute_gate(self):
+        self._assert_loudness("1770-2_Comp_AbsGateTest", -69.5)
+
+    def test_measure_loudness_two_channels(self):
+        self._assert_loudness("1770-2_Comp_24LKFS_500Hz_2ch", -24.0)
+
+    def test_measure_loudness_mono_voice_music(self):
+        self._assert_loudness("1770-2 Conf Mono Voice+Music-24LKFS", -24.0)
diff --git a/test/torchaudio_unittest/functional/torchscript_consistency_impl.py b/test/torchaudio_unittest/functional/torchscript_consistency_impl.py
@@ -111,6 +111,14 @@ def func(tensor):
 
         self._assert_consistency(func, (waveform,))
 
+    def test_measure_loudness(self):  # checks F.measure_loudness via self._assert_consistency
+        if self.dtype == torch.float64:
+            raise unittest.SkipTest("This test is known to fail for float64")
+
+        sample_rate = 44100  # NOTE(review): self.dtype is checked above but not passed to get_sinusoid -- confirm
+        waveform = common_utils.get_sinusoid(sample_rate=sample_rate, device=self.device)
+        self._assert_consistency(F.measure_loudness, (waveform, sample_rate))
+
     def test_melscale_fbanks(self):
         if self.device != torch.device("cpu"):
             raise unittest.SkipTest("No need to perform test on device other than CPU")
diff --git a/torchaudio/functional/__init__.py b/torchaudio/functional/__init__.py
@@ -30,6 +30,7 @@
     compute_kaldi_pitch,
     create_dct,
     DB_to_amplitude,
+    measure_loudness,
     detect_pitch_frequency,
     edit_distance,
     griffinlim,
@@ -62,6 +63,7 @@
     "melscale_fbanks",
     "linear_fbanks",
     "DB_to_amplitude",
+    "measure_loudness",
     "detect_pitch_frequency",
     "griffinlim",
     "mask_along_axis",
diff --git a/torchaudio/functional/functional.py b/torchaudio/functional/functional.py
@@ -10,6 +10,7 @@
 import torchaudio
 from torch import Tensor
 from torchaudio._internal import module_utils as _mod_utils
+from .filtering import highpass_biquad, treble_biquad
 
 __all__ = [
     "spectrogram",
@@ -35,6 +36,7 @@
     "apply_codec",
     "resample",
     "edit_distance",
+    "measure_loudness",
     "pitch_shift",
     "rnnt_loss",
     "psd",
@@ -1602,6 +1604,86 @@ def edit_distance(seq1: Sequence, seq2: Sequence) -> int:
     return int(dold[-1])
 
 
+def measure_loudness(waveform: Tensor, sample_rate: int) -> Tensor:
+    r"""Measure audio loudness according to the ITU-R BS.1770-4 recommendation.
+
+    .. devices:: CPU CUDA
+
+    .. properties:: TorchScript
+
+    The waveform is K-weighted, split into 400 ms blocks overlapping by 75%, and the
+    channel-weighted block energies are averaged over the blocks that pass an absolute
+    gate (-70 LKFS) followed by a relative gate (10 LU below the absolute-gated loudness).
+
+    Args:
+        waveform (torch.Tensor): audio waveform of dimension of `(..., channels, time)`.
+            At most 5 channels are supported, and `time` must cover at least one
+            400 ms gating block.
+        sample_rate (int): sampling rate of the waveform, e.g. 44100 (Hz)
+
+    Returns:
+        Tensor: loudness estimates (LKFS) of dimension `(...)`. The estimate is NaN when
+        no block passes the gating, e.g. for digital silence.
+
+    Raises:
+        ValueError: if ``waveform`` has fewer than 2 dimensions, more than 5 channels,
+            or fewer samples than one gating block.
+
+    Reference:
+        - https://www.itu.int/rec/R-REC-BS.1770-4-201510-I/en
+    """
+
+    if waveform.dim() < 2:
+        raise ValueError("Expected a waveform of dimension `(..., channels, time)`.")
+    if waveform.size(-2) > 5:
+        raise ValueError("Only up to 5 channels are supported.")
+
+    gate_duration: float = 0.4
+    overlap: float = 0.75
+    gamma_abs: float = -70.0
+    # Additive constant shared by the three loudness computations below
+    kweight_bias: float = -0.691
+    gate_samples = int(round(gate_duration * sample_rate))
+    step = int(round(gate_samples * (1 - overlap)))
+
+    # Fail early with a clear message instead of the opaque error raised by `unfold`
+    if waveform.size(-1) < gate_samples:
+        raise ValueError("The waveform is shorter than one gating block (400 ms).")
+
+    # Apply K-weighting
+    waveform = treble_biquad(waveform, sample_rate, 4.0, 1500.0, 1 / math.sqrt(2))
+    waveform = highpass_biquad(waveform, sample_rate, 38.0, 0.5)
+
+    # Compute the energy for each block -> `(..., channels, blocks)`
+    energy = torch.square(waveform).unfold(-1, gate_samples, step)
+    energy = torch.mean(energy, dim=-1)
+
+    # Compute channel-weighted summation
+    # NOTE(review): the weights presumably assume channel order L, R, C, Ls, Rs -- confirm
+    g = torch.tensor([1.0, 1.0, 1.0, 1.41, 1.41], dtype=waveform.dtype, device=waveform.device)
+    g = g[: energy.size(-2)]
+
+    energy_weighted = torch.sum(g.unsqueeze(-1) * energy, dim=-2)
+    loudness = kweight_bias + 10 * torch.log10(energy_weighted)
+
+    # Apply absolute gating of the blocks
+    gated_blocks = loudness > gamma_abs
+    gated_blocks = gated_blocks.unsqueeze(-2)
+
+    energy_filtered = torch.sum(gated_blocks * energy, dim=-1) / torch.count_nonzero(gated_blocks, dim=-1)
+    energy_weighted = torch.sum(g * energy_filtered, dim=-1)
+    gamma_rel = kweight_bias + 10 * torch.log10(energy_weighted) - 10
+
+    # Apply relative gating of the blocks
+    gated_blocks = torch.logical_and(gated_blocks.squeeze(-2), loudness > gamma_rel.unsqueeze(-1))
+    gated_blocks = gated_blocks.unsqueeze(-2)
+
+    energy_filtered = torch.sum(gated_blocks * energy, dim=-1) / torch.count_nonzero(gated_blocks, dim=-1)
+    energy_weighted = torch.sum(g * energy_filtered, dim=-1)
+    LKFS = kweight_bias + 10 * torch.log10(energy_weighted)
+    return LKFS
+
+
 def pitch_shift(
     waveform: Tensor,
     sample_rate: int,