From 2223571044dff75e744a2fdec5d2dfc7bc384779 Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Mon, 6 Jul 2020 21:42:46 -0700 Subject: [PATCH 1/2] merge load and chain --- torchaudio/csrc/sox_io.cpp | 68 +++++++------------------------------- 1 file changed, 12 insertions(+), 56 deletions(-) diff --git a/torchaudio/csrc/sox_io.cpp b/torchaudio/csrc/sox_io.cpp index 2785c7910d..c033c19286 100644 --- a/torchaudio/csrc/sox_io.cpp +++ b/torchaudio/csrc/sox_io.cpp @@ -1,6 +1,7 @@ #include #include #include +#include using namespace torch::indexing; using namespace torchaudio::sox_utils; @@ -60,64 +61,19 @@ c10::intrusive_ptr load_audio_file( "Invalid argument: num_frames must be -1 or greater than 0."); } - SoxFormat sf(sox_open_read( - path.c_str(), - /*signal=*/nullptr, - /*encoding=*/nullptr, - /*filetype=*/nullptr)); - - validate_input_file(sf); - - const int64_t num_channels = sf->signal.channels; - const int64_t num_total_samples = sf->signal.length; - const int64_t sample_start = sf->signal.channels * frame_offset; - - if (sox_seek(sf, sample_start, 0) == SOX_EOF) { - throw std::runtime_error("Error reading audio file: offset past EOF."); + std::vector> effects; + if (num_frames != -1) { + std::ostringstream offset, frames; + offset << frame_offset << "s"; + frames << "+" << num_frames << "s"; + effects.emplace_back(std::vector{"trim", offset.str(), frames.str()}); + } else if (frame_offset != 0) { + std::ostringstream offset; + offset << frame_offset << "s"; + effects.emplace_back(std::vector{"trim", offset.str()}); } - const int64_t sample_end = [&]() { - if (num_frames == -1) - return num_total_samples; - const int64_t sample_end_ = num_channels * num_frames + sample_start; - if (num_total_samples < sample_end_) { - // For lossy encoding, it is difficult to predict exact size of buffer for - // reading the number of samples required. - // So we allocate buffer size of given `num_frames` and ask sox to read as - // much as possible. For lossless format, sox reads exact number of - // samples, but for lossy encoding, sox can end up reading less. (i.e. - // mp3) For the consistent behavior specification between lossy/lossless - // format, we allow users to provide `num_frames` value that exceeds #of - // available samples, and we adjust it here. - return num_total_samples; - } - return sample_end_; - }(); - - const int64_t max_samples = sample_end - sample_start; - - // Read samples into buffer - std::vector buffer; - buffer.reserve(max_samples); - const int64_t num_samples = sox_read(sf, buffer.data(), max_samples); - if (num_samples == 0) { - throw std::runtime_error( - "Error reading audio file: empty file or read operation failed."); - } - // NOTE: num_samples may be smaller than max_samples if the input - // format is compressed (i.e. mp3). - - // Convert to Tensor - auto tensor = convert_to_tensor( - buffer.data(), - num_samples, - num_channels, - get_dtype(sf->encoding.encoding, sf->signal.precision), - normalize, - channels_first); - - return c10::make_intrusive( - tensor, static_cast(sf->signal.rate), channels_first); + return torchaudio::sox_effects::apply_effects_file(path, effects, normalize, channels_first); } void save_audio_file( From 141b569629b1ff14735445f2e1666b440134a328 Mon Sep 17 00:00:00 2001 From: moto <855818+mthrok@users.noreply.github.com> Date: Tue, 7 Jul 2020 17:09:49 +0000 Subject: [PATCH 2/2] Replace save function with sox effects chain --- torchaudio/csrc/sox_effects_chain.cpp | 53 ++++++++++++++++++++++++++- torchaudio/csrc/sox_effects_chain.h | 2 + torchaudio/csrc/sox_io.cpp | 32 ++++++---------- 3 files changed, 66 insertions(+), 21 deletions(-) diff --git a/torchaudio/csrc/sox_effects_chain.cpp b/torchaudio/csrc/sox_effects_chain.cpp index 05d730b6e7..449e33be98 100644 --- a/torchaudio/csrc/sox_effects_chain.cpp +++ b/torchaudio/csrc/sox_effects_chain.cpp @@ -46,6 +46,9 @@ struct TensorInputPriv { struct TensorOutputPriv { std::vector* buffer; }; +struct FileOutputPriv { + sox_format_t* sf; +}; /// Callback function to feed Tensor data to SoxEffectChain. int tensor_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) { @@ -84,7 +87,7 @@ int tensor_input_drain(sox_effect_t* effp, sox_sample_t* obuf, size_t* osamp) { /// Callback function to fetch data from SoxEffectChain. int tensor_output_flow( - sox_effect_t* effp LSX_UNUSED, + sox_effect_t* effp, sox_sample_t const* ibuf, sox_sample_t* obuf LSX_UNUSED, size_t* isamp, @@ -97,6 +100,28 @@ int tensor_output_flow( return SOX_SUCCESS; } +int file_output_flow( + sox_effect_t* effp, + sox_sample_t const* ibuf, + sox_sample_t* obuf LSX_UNUSED, + size_t* isamp, + size_t* osamp) { + *osamp = 0; + if (*isamp) { + auto sf = static_cast(effp->priv)->sf; + if (sox_write(sf, ibuf, *isamp) != *isamp) { + if (sf->sox_errno) { + std::ostringstream stream; + stream << sf->sox_errstr << " " << sox_strerror(sf->sox_errno) << " " + << sf->filename; + throw std::runtime_error(stream.str()); + } + return SOX_EOF; + } + } + return SOX_SUCCESS; +} + sox_effect_handler_t* get_tensor_input_handler() { static sox_effect_handler_t handler{/*name=*/"input_tensor", /*usage=*/NULL, @@ -125,6 +150,20 @@ sox_effect_handler_t* get_tensor_output_handler() { return &handler; } +sox_effect_handler_t* get_file_output_handler() { + static sox_effect_handler_t handler{/*name=*/"output_file", + /*usage=*/NULL, + /*flags=*/SOX_EFF_MCHAN, + /*getopts=*/NULL, + /*start=*/NULL, + /*flow=*/file_output_flow, + /*drain=*/NULL, + /*stop=*/NULL, + /*kill=*/NULL, + /*priv_size=*/sizeof(FileOutputPriv)}; + return &handler; +} + } // namespace SoxEffectsChain::SoxEffectsChain( @@ -134,6 +173,7 @@ SoxEffectsChain::SoxEffectsChain( out_enc_(output_encoding), in_sig_(), interm_sig_(), + out_sig_(), sec_(sox_create_effects_chain(&in_enc_, &out_enc_)) { if (!sec_) { throw std::runtime_error("Failed to create effect chain."); @@ -184,6 +224,17 @@ void SoxEffectsChain::addInputFile(sox_format_t* sf) { } } +void SoxEffectsChain::addOutputFile(sox_format_t* sf) { + out_sig_ = sf->signal; + SoxEffect e(sox_create_effect(get_file_output_handler())); + static_cast(e->priv)->sf = sf; + if (sox_add_effect(sec_, e, &interm_sig_, &out_sig_) != SOX_SUCCESS) { + std::ostringstream stream; + stream << "Failed to add effect: output " << sf->filename; + throw std::runtime_error(stream.str()); + } +} + void SoxEffectsChain::addEffect(const std::vector effect) { const auto num_args = effect.size(); if (num_args == 0) { diff --git a/torchaudio/csrc/sox_effects_chain.h b/torchaudio/csrc/sox_effects_chain.h index 9168e94121..fa55182281 100644 --- a/torchaudio/csrc/sox_effects_chain.h +++ b/torchaudio/csrc/sox_effects_chain.h @@ -14,6 +14,7 @@ class SoxEffectsChain { const sox_encodinginfo_t out_enc_; sox_signalinfo_t in_sig_; sox_signalinfo_t interm_sig_; + sox_signalinfo_t out_sig_; sox_effects_chain_t* sec_; public: @@ -29,6 +30,7 @@ class SoxEffectsChain { void addInputTensor(torchaudio::sox_utils::TensorSignal* signal); void addInputFile(sox_format_t* sf); void addOutputBuffer(std::vector* output_buffer); + void addOutputFile(sox_format_t* sf); void addEffect(const std::vector effect); int64_t getOutputNumChannels(); int64_t getOutputSampleRate(); diff --git a/torchaudio/csrc/sox_io.cpp b/torchaudio/csrc/sox_io.cpp index c033c19286..4092d215d0 100644 --- a/torchaudio/csrc/sox_io.cpp +++ b/torchaudio/csrc/sox_io.cpp @@ -1,7 +1,8 @@ #include +#include +#include #include #include -#include using namespace torch::indexing; using namespace torchaudio::sox_utils; @@ -66,14 +67,16 @@ c10::intrusive_ptr load_audio_file( std::ostringstream offset, frames; offset << frame_offset << "s"; frames << "+" << num_frames << "s"; - effects.emplace_back(std::vector{"trim", offset.str(), frames.str()}); + effects.emplace_back( + std::vector{"trim", offset.str(), frames.str()}); } else if (frame_offset != 0) { std::ostringstream offset; offset << frame_offset << "s"; effects.emplace_back(std::vector{"trim", offset.str()}); } - return torchaudio::sox_effects::apply_effects_file(path, effects, normalize, channels_first); + return torchaudio::sox_effects::apply_effects_file( + path, effects, normalize, channels_first); } void save_audio_file( @@ -81,7 +84,6 @@ void save_audio_file( const c10::intrusive_ptr& signal, const double compression) { const auto tensor = signal->getTensor(); - const auto channels_first = signal->getChannelsFirst(); validate_input_tensor(tensor); @@ -102,22 +104,12 @@ void save_audio_file( throw std::runtime_error("Error saving audio file: failed to open file."); } - auto tensor_ = tensor; - if (channels_first) { - tensor_ = tensor_.t(); - } - - const int64_t frames_per_chunk = 65536; - for (int64_t i = 0; i < tensor_.size(0); i += frames_per_chunk) { - auto chunk = tensor_.index({Slice(i, i + frames_per_chunk), Slice()}); - chunk = unnormalize_wav(chunk).contiguous(); - - const size_t numel = chunk.numel(); - if (sox_write(sf, chunk.data_ptr(), numel) != numel) { - throw std::runtime_error( - "Error saving audio file: failed to write the entier buffer."); - } - } + torchaudio::sox_effects_chain::SoxEffectsChain chain( + /*input_encoding=*/get_encodinginfo("wav", tensor.dtype(), 0.), + /*output_encoding=*/sf->encoding); + chain.addInputTensor(signal.get()); + chain.addOutputFile(sf); + chain.run(); } } // namespace sox_io