From 058fbf102b6f9c905ef1b50764f1d6b381d3048e Mon Sep 17 00:00:00 2001 From: Sanchit Date: Wed, 17 Feb 2021 14:47:50 -0600 Subject: [PATCH 1/8] Added HTK format support to soxio's save function --- test/torchaudio_unittest/backend/sox_io/save_test.py | 6 ++++++ torchaudio/backend/sox_io_backend.py | 5 ++++- torchaudio/csrc/sox/types.cpp | 2 ++ torchaudio/csrc/sox/types.h | 1 + torchaudio/csrc/sox/utils.cpp | 10 ++++++++++ 5 files changed, 23 insertions(+), 1 deletion(-) diff --git a/test/torchaudio_unittest/backend/sox_io/save_test.py b/test/torchaudio_unittest/backend/sox_io/save_test.py index dca9c3ad9c..17d2130cb6 100644 --- a/test/torchaudio_unittest/backend/sox_io/save_test.py +++ b/test/torchaudio_unittest/backend/sox_io/save_test.py @@ -237,6 +237,12 @@ def test_save_flac(self, test_mode, bits_per_sample, compression_level): "flac", compression=compression_level, bits_per_sample=bits_per_sample, test_mode=test_mode) + @nested_params( + ["path", "fileobj", "bytesio"], + ) + def test_save_hkt(self, test_mode, ): + self.assert_save_consistency("hkt", test_mode=test_mode, num_channels=1) + @nested_params( ["path", "fileobj", "bytesio"], [ diff --git a/torchaudio/backend/sox_io_backend.py b/torchaudio/backend/sox_io_backend.py index 16abc70deb..d3d0bf5f3f 100644 --- a/torchaudio/backend/sox_io_backend.py +++ b/torchaudio/backend/sox_io_backend.py @@ -195,7 +195,7 @@ def save( When ``filepath`` argument is file-like object, this argument is required. Valid values are ``"wav"``, ``"mp3"``, ``"ogg"``, ``"vorbis"``, ``"amr-nb"``, - ``"amb"``, ``"flac"`` and ``"sph"``. + ``"amb"``, ``"flac"``, ``"sph"``, and ``"htk"``. encoding (str, optional): Changes the encoding for the supported formats. This argument is effective only for supported formats, cush as ``"wav"``, ``""amb"`` and ``"sph"``. Valid values are; @@ -291,6 +291,9 @@ def save( ``"amr-nb"`` Bitrate ranging from 4.75 kbit/s to 12.2 kbit/s. Default: 4.75 kbit/s + ``"htk"`` + Uses its default Single channel 16-bit PCM format. + Note: To save into formats that ``libsox`` does not handle natively, (such as ``"mp3"``, ``"flac"``, ``"ogg"`` and ``"vorbis"``), your installation of ``torchaudio`` has diff --git a/torchaudio/csrc/sox/types.cpp b/torchaudio/csrc/sox/types.cpp index 51e8e720d6..49e6762fdc 100644 --- a/torchaudio/csrc/sox/types.cpp +++ b/torchaudio/csrc/sox/types.cpp @@ -20,6 +20,8 @@ Format get_format_from_string(const std::string& format) { return Format::AMB; if (format == "sph") return Format::SPHERE; + if (format == "htk") + return Format::HTK; std::ostringstream stream; stream << "Internal Error: unexpected format value: " << format; throw std::runtime_error(stream.str()); diff --git a/torchaudio/csrc/sox/types.h b/torchaudio/csrc/sox/types.h index f3ed637478..192330cdb0 100644 --- a/torchaudio/csrc/sox/types.h +++ b/torchaudio/csrc/sox/types.h @@ -15,6 +15,7 @@ enum class Format { AMR_WB, AMB, SPHERE, + HTK, }; Format get_format_from_string(const std::string& format); diff --git a/torchaudio/csrc/sox/utils.cpp b/torchaudio/csrc/sox/utils.cpp index 99a264642f..e9cea37359 100644 --- a/torchaudio/csrc/sox/utils.cpp +++ b/torchaudio/csrc/sox/utils.cpp @@ -314,6 +314,13 @@ std::tuple get_save_encoding( throw std::runtime_error( "mp3 does not support `bits_per_sample` option."); return std::make_tuple<>(SOX_ENCODING_MP3, 16); + case Format::HTK: + if (enc != Encoding::NOT_PROVIDED) + throw std::runtime_error("htk does not support `encoding` option."); + if (bps != BitDepth::NOT_PROVIDED) + throw std::runtime_error( + "htk does not support `bits_per_sample` option."); + return std::make_tuple<>(SOX_ENCODING_SIGN2, 16); case Format::VORBIS: if (enc != Encoding::NOT_PROVIDED) throw std::runtime_error("vorbis does not support `encoding` option."); @@ -409,6 +416,9 @@ unsigned get_precision(const std::string filetype, caffe2::TypeMeta dtype) { if (filetype == "amr-nb") { return 16; } + if (filetype == "htk") { + return 16; + } throw std::runtime_error("Unsupported file type: " + filetype); } From eeb5731de6c7705580c86f953c3b34f83aa83dc3 Mon Sep 17 00:00:00 2001 From: SJ <76181208+imaginary-person@users.noreply.github.com> Date: Wed, 17 Feb 2021 14:59:48 -0600 Subject: [PATCH 2/8] Fix typo --- test/torchaudio_unittest/backend/sox_io/save_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/torchaudio_unittest/backend/sox_io/save_test.py b/test/torchaudio_unittest/backend/sox_io/save_test.py index 17d2130cb6..b1c8697f91 100644 --- a/test/torchaudio_unittest/backend/sox_io/save_test.py +++ b/test/torchaudio_unittest/backend/sox_io/save_test.py @@ -240,8 +240,8 @@ def test_save_flac(self, test_mode, bits_per_sample, compression_level): @nested_params( ["path", "fileobj", "bytesio"], ) - def test_save_hkt(self, test_mode, ): - self.assert_save_consistency("hkt", test_mode=test_mode, num_channels=1) + def test_save_htk(self, test_mode, ): + self.assert_save_consistency("htk", test_mode=test_mode, num_channels=1) @nested_params( ["path", "fileobj", "bytesio"], From fec3ca3d3645b64f8d4399628a0f1470e6c6c38b Mon Sep 17 00:00:00 2001 From: SJ <76181208+imaginary-person@users.noreply.github.com> Date: Wed, 17 Feb 2021 15:07:45 -0600 Subject: [PATCH 3/8] Fix another typo :/ --- test/torchaudio_unittest/backend/sox_io/save_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/torchaudio_unittest/backend/sox_io/save_test.py b/test/torchaudio_unittest/backend/sox_io/save_test.py index b1c8697f91..349042b1a3 100644 --- a/test/torchaudio_unittest/backend/sox_io/save_test.py +++ b/test/torchaudio_unittest/backend/sox_io/save_test.py @@ -240,7 +240,7 @@ def test_save_flac(self, test_mode, bits_per_sample, compression_level): @nested_params( ["path", "fileobj", "bytesio"], ) - def test_save_htk(self, test_mode, ): + def test_save_htk(self, test_mode): self.assert_save_consistency("htk", test_mode=test_mode, num_channels=1) @nested_params( From 282caa35ec6036d4bf06493d701b9376f2d4a3b6 Mon Sep 17 00:00:00 2001 From: SJ <76181208+imaginary-person@users.noreply.github.com> Date: Wed, 17 Feb 2021 15:32:20 -0600 Subject: [PATCH 4/8] Edit comment to trigger CI Edit comment to trigger CI --- torchaudio/backend/sox_io_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchaudio/backend/sox_io_backend.py b/torchaudio/backend/sox_io_backend.py index d3d0bf5f3f..626def6772 100644 --- a/torchaudio/backend/sox_io_backend.py +++ b/torchaudio/backend/sox_io_backend.py @@ -292,7 +292,7 @@ def save( Bitrate ranging from 4.75 kbit/s to 12.2 kbit/s. Default: 4.75 kbit/s ``"htk"`` - Uses its default Single channel 16-bit PCM format. + Uses its default single-channel 16-bit PCM format. Note: To save into formats that ``libsox`` does not handle natively, (such as ``"mp3"``, From 3db2aeab4c193ffd73e5341cedecec3799e764f2 Mon Sep 17 00:00:00 2001 From: SJ <76181208+imaginary-person@users.noreply.github.com> Date: Thu, 18 Feb 2021 11:06:22 -0600 Subject: [PATCH 5/8] Changed merge-order to simplify diff --- torchaudio/csrc/sox/utils.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchaudio/csrc/sox/utils.cpp b/torchaudio/csrc/sox/utils.cpp index b75d89275b..83bb31bc9c 100644 --- a/torchaudio/csrc/sox/utils.cpp +++ b/torchaudio/csrc/sox/utils.cpp @@ -424,10 +424,10 @@ unsigned get_precision(const std::string filetype, caffe2::TypeMeta dtype) { if (filetype == "amr-nb") { return 16; } - if (filetype == "htk") { + if (filetype == "gsm") { return 16; } - if (filetype == "gsm") { + if (filetype == "htk") { return 16; } throw std::runtime_error("Unsupported file type: " + filetype); From 4c98e5ed1c268d9e746239944d9d379b5cd4caad Mon Sep 17 00:00:00 2001 From: Sanchit Date: Thu, 18 Feb 2021 11:41:45 -0600 Subject: [PATCH 6/8] Added info test for htk --- .../backend/sox_io/info_test.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/test/torchaudio_unittest/backend/sox_io/info_test.py b/test/torchaudio_unittest/backend/sox_io/info_test.py index a2a93648a1..5cf4881099 100644 --- a/test/torchaudio_unittest/backend/sox_io/info_test.py +++ b/test/torchaudio_unittest/backend/sox_io/info_test.py @@ -205,7 +205,7 @@ def test_ulaw(self): assert info.encoding == "ULAW" def test_alaw(self): - """`sox_io_backend.info` can check ulaw file correctly""" + """`sox_io_backend.info` can check alaw file correctly""" duration = 1 num_channels = 1 sample_rate = 8000 @@ -221,6 +221,21 @@ def test_alaw(self): assert info.bits_per_sample == 8 assert info.encoding == "ALAW" + def test_htk(self): + """`sox_io_backend.info` can check HTK file correctly""" + duration = 1 + num_channels = 1 + sample_rate = 8000 + path = self.get_temp_path('data.wav') + sox_utils.gen_audio_file( + path, sample_rate=sample_rate, num_channels=num_channels, + bit_depth=16, duration=duration) + info = sox_io_backend.info(path) + assert info.sample_rate == sample_rate + assert info.num_frames == sample_rate * duration + assert info.num_channels == num_channels + assert info.bits_per_sample == 16 + assert info.encoding == "PCM_S" @skipIfNoExtension class TestInfoOpus(PytorchTestCase): From 1a96f76d5d3139d461707f9a5e42d3b93deea023 Mon Sep 17 00:00:00 2001 From: SJ <76181208+imaginary-person@users.noreply.github.com> Date: Thu, 18 Feb 2021 11:58:39 -0600 Subject: [PATCH 7/8] Fix typo & add newline --- test/torchaudio_unittest/backend/sox_io/info_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/torchaudio_unittest/backend/sox_io/info_test.py b/test/torchaudio_unittest/backend/sox_io/info_test.py index 5cf4881099..8701414f6e 100644 --- a/test/torchaudio_unittest/backend/sox_io/info_test.py +++ b/test/torchaudio_unittest/backend/sox_io/info_test.py @@ -226,7 +226,7 @@ def test_htk(self): duration = 1 num_channels = 1 sample_rate = 8000 - path = self.get_temp_path('data.wav') + path = self.get_temp_path('data.htk') sox_utils.gen_audio_file( path, sample_rate=sample_rate, num_channels=num_channels, bit_depth=16, duration=duration) @@ -237,6 +237,7 @@ def test_htk(self): assert info.bits_per_sample == 16 assert info.encoding == "PCM_S" + @skipIfNoExtension class TestInfoOpus(PytorchTestCase): @parameterized.expand(list(itertools.product( From 91e611905bc23556ffbc0027dbf0863b10b9701e Mon Sep 17 00:00:00 2001 From: SJ <76181208+imaginary-person@users.noreply.github.com> Date: Thu, 18 Feb 2021 21:46:59 -0600 Subject: [PATCH 8/8] Update comment to trigger CI Update comment to trigger CI --- torchaudio/backend/sox_io_backend.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchaudio/backend/sox_io_backend.py b/torchaudio/backend/sox_io_backend.py index 54061ce1fe..54bacd5e5f 100644 --- a/torchaudio/backend/sox_io_backend.py +++ b/torchaudio/backend/sox_io_backend.py @@ -296,7 +296,7 @@ def save( Lossy Speech Compression, CPU intensive. ``"htk"`` - Uses its default single-channel 16-bit PCM format. + Uses a default single-channel 16-bit PCM format. Note: To save into formats that ``libsox`` does not handle natively, (such as ``"mp3"``,