Skip to content

Commit c7520eb

Browse files
committed
merge load and chain
1 parent 60a8e23 commit c7520eb

File tree

1 file changed

+12
-56
lines changed

1 file changed

+12
-56
lines changed

torchaudio/csrc/sox_io.cpp

Lines changed: 12 additions & 56 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
#include <sox.h>
22
#include <torchaudio/csrc/sox_io.h>
33
#include <torchaudio/csrc/sox_utils.h>
4+
#include <torchaudio/csrc/sox_effects.h>
45

56
using namespace torch::indexing;
67
using namespace torchaudio::sox_utils;
@@ -60,64 +61,19 @@ c10::intrusive_ptr<TensorSignal> load_audio_file(
6061
"Invalid argument: num_frames must be -1 or greater than 0.");
6162
}
6263

63-
SoxFormat sf(sox_open_read(
64-
path.c_str(),
65-
/*signal=*/nullptr,
66-
/*encoding=*/nullptr,
67-
/*filetype=*/nullptr));
68-
69-
validate_input_file(sf);
70-
71-
const int64_t num_channels = sf->signal.channels;
72-
const int64_t num_total_samples = sf->signal.length;
73-
const int64_t sample_start = sf->signal.channels * frame_offset;
74-
75-
if (sox_seek(sf, sample_start, 0) == SOX_EOF) {
76-
throw std::runtime_error("Error reading audio file: offset past EOF.");
64+
std::vector<std::vector<std::string>> effects;
65+
if (num_frames != -1) {
66+
std::ostringstream offset, frames;
67+
offset << frame_offset << "s";
68+
frames << "+" << num_frames << "s";
69+
effects.emplace_back(std::vector<std::string>{"trim", offset.str(), frames.str()});
70+
} else if (frame_offset != 0) {
71+
std::ostringstream offset;
72+
offset << frame_offset << "s";
73+
effects.emplace_back(std::vector<std::string>{"trim", offset.str()});
7774
}
7875

79-
const int64_t sample_end = [&]() {
80-
if (num_frames == -1)
81-
return num_total_samples;
82-
const int64_t sample_end_ = num_channels * num_frames + sample_start;
83-
if (num_total_samples < sample_end_) {
84-
// For lossy encoding, it is difficult to predict exact size of buffer for
85-
// reading the number of samples required.
86-
// So we allocate buffer size of given `num_frames` and ask sox to read as
87-
// much as possible. For lossless format, sox reads exact number of
88-
// samples, but for lossy encoding, sox can end up reading less. (i.e.
89-
// mp3) For the consistent behavior specification between lossy/lossless
90-
// format, we allow users to provide `num_frames` value that exceeds #of
91-
// available samples, and we adjust it here.
92-
return num_total_samples;
93-
}
94-
return sample_end_;
95-
}();
96-
97-
const int64_t max_samples = sample_end - sample_start;
98-
99-
// Read samples into buffer
100-
std::vector<sox_sample_t> buffer;
101-
buffer.reserve(max_samples);
102-
const int64_t num_samples = sox_read(sf, buffer.data(), max_samples);
103-
if (num_samples == 0) {
104-
throw std::runtime_error(
105-
"Error reading audio file: empty file or read operation failed.");
106-
}
107-
// NOTE: num_samples may be smaller than max_samples if the input
108-
// format is compressed (i.e. mp3).
109-
110-
// Convert to Tensor
111-
auto tensor = convert_to_tensor(
112-
buffer.data(),
113-
num_samples,
114-
num_channels,
115-
get_dtype(sf->encoding.encoding, sf->signal.precision),
116-
normalize,
117-
channels_first);
118-
119-
return c10::make_intrusive<TensorSignal>(
120-
tensor, static_cast<int64_t>(sf->signal.rate), channels_first);
76+
return torchaudio::sox_effects::apply_effects_file(path, effects, normalize, channels_first);
12177
}
12278

12379
void save_audio_file(

0 commit comments

Comments
 (0)