From 3f07949802db985e348012664fd95bc6d62f169d Mon Sep 17 00:00:00 2001 From: Sanchit Date: Tue, 23 Feb 2021 14:29:14 -0600 Subject: [PATCH 1/4] Cherrypicking 1276 & 1291 from master --- .../backend/sox_io/info_test.py | 18 +++++++++++++++++- .../backend/sox_io/save_test.py | 6 ++++++ torchaudio/backend/sox_io_backend.py | 5 ++++- torchaudio/csrc/sox/io.cpp | 10 ++++++++++ torchaudio/csrc/sox/types.cpp | 2 ++ torchaudio/csrc/sox/types.h | 1 + torchaudio/csrc/sox/utils.cpp | 13 ++++++++++++- 7 files changed, 52 insertions(+), 3 deletions(-) diff --git a/test/torchaudio_unittest/backend/sox_io/info_test.py b/test/torchaudio_unittest/backend/sox_io/info_test.py index a2a93648a1..5be66c5e77 100644 --- a/test/torchaudio_unittest/backend/sox_io/info_test.py +++ b/test/torchaudio_unittest/backend/sox_io/info_test.py @@ -205,7 +205,7 @@ def test_ulaw(self): assert info.encoding == "ULAW" def test_alaw(self): - """`sox_io_backend.info` can check ulaw file correctly""" + """`sox_io_backend.info` can check alaw file correctly""" duration = 1 num_channels = 1 sample_rate = 8000 @@ -220,6 +220,22 @@ def test_alaw(self): assert info.num_channels == num_channels assert info.bits_per_sample == 8 assert info.encoding == "ALAW" + + def test_htk(self): + """`sox_io_backend.info` can check HTK file correctly""" + duration = 1 + num_channels = 1 + sample_rate = 8000 + path = self.get_temp_path('data.htk') + sox_utils.gen_audio_file( + path, sample_rate=sample_rate, num_channels=num_channels, + bit_depth=16, duration=duration) + info = sox_io_backend.info(path) + assert info.sample_rate == sample_rate + assert info.num_frames == sample_rate * duration + assert info.num_channels == num_channels + assert info.bits_per_sample == 16 + assert info.encoding == "PCM_S" @skipIfNoExtension diff --git a/test/torchaudio_unittest/backend/sox_io/save_test.py b/test/torchaudio_unittest/backend/sox_io/save_test.py index 5d3fdb03ca..da9365a24a 100644 --- a/test/torchaudio_unittest/backend/sox_io/save_test.py +++ b/test/torchaudio_unittest/backend/sox_io/save_test.py @@ -317,6 +317,12 @@ def test_save_gsm(self, test_mode): self.assert_save_consistency( "gsm", test_mode=test_mode) + @nested_params( + ["path", "fileobj", "bytesio"], + ) + def test_save_htk(self, test_mode): + self.assert_save_consistency("htk", test_mode=test_mode, num_channels=1) + @parameterized.expand([ ("wav", "PCM_S", 16), ("mp3", ), diff --git a/torchaudio/backend/sox_io_backend.py b/torchaudio/backend/sox_io_backend.py index 91b4474a9d..e175913daa 100644 --- a/torchaudio/backend/sox_io_backend.py +++ b/torchaudio/backend/sox_io_backend.py @@ -195,7 +195,7 @@ def save( When ``filepath`` argument is file-like object, this argument is required. Valid values are ``"wav"``, ``"mp3"``, ``"ogg"``, ``"vorbis"``, ``"amr-nb"``, - ``"amb"``, ``"flac"``, ``"sph"`` and ``"gsm"``. + ``"amb"``, ``"flac"``, ``"sph"``, ``"gsm"``, and ``"htk"``. encoding (str, optional): Changes the encoding for the supported formats. This argument is effective only for supported formats, such as ``"wav"``, ``""amb"`` and ``"sph"``. Valid values are; @@ -294,6 +294,9 @@ def save( ``"gsm"`` Lossy Speech Compression, CPU intensive. + ``"htk"`` + Uses a default single-channel 16-bit PCM format. + Note: To save into formats that ``libsox`` does not handle natively, (such as ``"mp3"``, ``"flac"``, ``"ogg"`` and ``"vorbis"``), your installation of ``torchaudio`` has diff --git a/torchaudio/csrc/sox/io.cpp b/torchaudio/csrc/sox/io.cpp index 8bc520feba..4414e6ca5d 100644 --- a/torchaudio/csrc/sox/io.cpp +++ b/torchaudio/csrc/sox/io.cpp @@ -132,6 +132,10 @@ void save_audio_file( const auto num_channels = tensor.size(channels_first ? 0 : 1); TORCH_CHECK( num_channels == 1, "amr-nb format only supports single channel audio."); + } else if (filetype == "htk") { + const auto num_channels = tensor.size(channels_first ? 0 : 1); + TORCH_CHECK( + num_channels == 1, "htk format only supports single channel audio."); } const auto signal_info = get_signalinfo(&tensor, sample_rate, filetype, channels_first); @@ -268,6 +272,12 @@ void save_audio_fileobj( throw std::runtime_error( "amr-nb format only supports single channel audio."); } + } else if (filetype == "htk") { + const auto num_channels = tensor.size(channels_first ? 0 : 1); + if (num_channels != 1) { + throw std::runtime_error( + "htk format only supports single channel audio."); + } } const auto signal_info = get_signalinfo(&tensor, sample_rate, filetype, channels_first); diff --git a/torchaudio/csrc/sox/types.cpp b/torchaudio/csrc/sox/types.cpp index 59e9d320c4..ef5ca40812 100644 --- a/torchaudio/csrc/sox/types.cpp +++ b/torchaudio/csrc/sox/types.cpp @@ -20,6 +20,8 @@ Format get_format_from_string(const std::string& format) { return Format::AMB; if (format == "sph") return Format::SPHERE; + if (format == "htk") + return Format::HTK; if (format == "gsm") return Format::GSM; std::ostringstream stream; diff --git a/torchaudio/csrc/sox/types.h b/torchaudio/csrc/sox/types.h index f3a337407c..c3512f42d4 100644 --- a/torchaudio/csrc/sox/types.h +++ b/torchaudio/csrc/sox/types.h @@ -16,6 +16,7 @@ enum class Format { AMB, SPHERE, GSM, + HTK, }; Format get_format_from_string(const std::string& format); diff --git a/torchaudio/csrc/sox/utils.cpp b/torchaudio/csrc/sox/utils.cpp index 71bca54b7e..cda2b3162c 100644 --- a/torchaudio/csrc/sox/utils.cpp +++ b/torchaudio/csrc/sox/utils.cpp @@ -314,6 +314,13 @@ std::tuple get_save_encoding( throw std::runtime_error( "mp3 does not support `bits_per_sample` option."); return std::make_tuple<>(SOX_ENCODING_MP3, 16); + case Format::HTK: + if (enc != Encoding::NOT_PROVIDED) + throw std::runtime_error("htk does not support `encoding` option."); + if (bps != BitDepth::NOT_PROVIDED) + throw std::runtime_error( + "htk does not support `bits_per_sample` option."); + return std::make_tuple<>(SOX_ENCODING_SIGN2, 16); case Format::VORBIS: if (enc != Encoding::NOT_PROVIDED) throw std::runtime_error("vorbis does not support `encoding` option."); @@ -417,8 +424,12 @@ unsigned get_precision(const std::string filetype, caffe2::TypeMeta dtype) { if (filetype == "amr-nb") { return 16; } - if (filetype == "gsm") + if (filetype == "gsm") { + return 16; return 16; + } + if (filetype == "htk") { return 16; + } throw std::runtime_error("Unsupported file type: " + filetype); } From 9e2d0a882417f9a98a30083349dd309b79faf713 Mon Sep 17 00:00:00 2001 From: SJ <76181208+imaginary-person@users.noreply.github.com> Date: Tue, 23 Feb 2021 14:35:08 -0600 Subject: [PATCH 2/4] Fix typo --- torchaudio/csrc/sox/utils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchaudio/csrc/sox/utils.cpp b/torchaudio/csrc/sox/utils.cpp index cda2b3162c..83bb31bc9c 100644 --- a/torchaudio/csrc/sox/utils.cpp +++ b/torchaudio/csrc/sox/utils.cpp @@ -425,7 +425,7 @@ unsigned get_precision(const std::string filetype, caffe2::TypeMeta dtype) { return 16; } if (filetype == "gsm") { - return 16; return 16; + return 16; } if (filetype == "htk") { return 16; From 9b16f1a5255758800e759ea845d904cdc7831029 Mon Sep 17 00:00:00 2001 From: SJ <76181208+imaginary-person@users.noreply.github.com> Date: Tue, 23 Feb 2021 14:42:29 -0600 Subject: [PATCH 3/4] Remove whitespace --- test/torchaudio_unittest/backend/sox_io/info_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/torchaudio_unittest/backend/sox_io/info_test.py b/test/torchaudio_unittest/backend/sox_io/info_test.py index 5be66c5e77..8701414f6e 100644 --- a/test/torchaudio_unittest/backend/sox_io/info_test.py +++ b/test/torchaudio_unittest/backend/sox_io/info_test.py @@ -220,7 +220,7 @@ def test_alaw(self): assert info.num_channels == num_channels assert info.bits_per_sample == 8 assert info.encoding == "ALAW" - + def test_htk(self): """`sox_io_backend.info` can check HTK file correctly""" duration = 1 From 013a65723b47e03c9a8c970979a772d5cec1121f Mon Sep 17 00:00:00 2001 From: SJ <76181208+imaginary-person@users.noreply.github.com> Date: Tue, 23 Feb 2021 14:53:50 -0600 Subject: [PATCH 4/4] Edit comment to trigger CI CI failure is due to CUDA unavailability at https://app.circleci.com/pipelines/github/pytorch/audio/5168/workflows/27fce196-7df5-46cc-b41b-50a656b86aab/jobs/169438. So, editing a comment would trigger CI again. --- torchaudio/backend/sox_io_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchaudio/backend/sox_io_backend.py b/torchaudio/backend/sox_io_backend.py index e175913daa..036aa5f4ac 100644 --- a/torchaudio/backend/sox_io_backend.py +++ b/torchaudio/backend/sox_io_backend.py @@ -295,7 +295,7 @@ def save( Lossy Speech Compression, CPU intensive. ``"htk"`` - Uses a default single-channel 16-bit PCM format. + Uses its default single-channel 16-bit PCM format. Note: To save into formats that ``libsox`` does not handle natively, (such as ``"mp3"``,