|
1 | 1 | #include <sox.h> |
2 | 2 | #include <torchaudio/csrc/sox_io.h> |
3 | 3 | #include <torchaudio/csrc/sox_utils.h> |
| 4 | +#include <torchaudio/csrc/sox_effects.h> |
4 | 5 |
|
5 | 6 | using namespace torch::indexing; |
6 | 7 | using namespace torchaudio::sox_utils; |
@@ -60,64 +61,19 @@ c10::intrusive_ptr<TensorSignal> load_audio_file( |
60 | 61 | "Invalid argument: num_frames must be -1 or greater than 0."); |
61 | 62 | } |
62 | 63 |
|
63 | | - SoxFormat sf(sox_open_read( |
64 | | - path.c_str(), |
65 | | - /*signal=*/nullptr, |
66 | | - /*encoding=*/nullptr, |
67 | | - /*filetype=*/nullptr)); |
68 | | - |
69 | | - validate_input_file(sf); |
70 | | - |
71 | | - const int64_t num_channels = sf->signal.channels; |
72 | | - const int64_t num_total_samples = sf->signal.length; |
73 | | - const int64_t sample_start = sf->signal.channels * frame_offset; |
74 | | - |
75 | | - if (sox_seek(sf, sample_start, 0) == SOX_EOF) { |
76 | | - throw std::runtime_error("Error reading audio file: offset past EOF."); |
| 64 | + std::vector<std::vector<std::string>> effects; |
| 65 | + if (num_frames != -1) { |
| 66 | + std::ostringstream offset, frames; |
| 67 | + offset << frame_offset << "s"; |
| 68 | + frames << "+" << num_frames << "s"; |
| 69 | + effects.emplace_back(std::vector<std::string>{"trim", offset.str(), frames.str()}); |
| 70 | + } else if (frame_offset != 0) { |
| 71 | + std::ostringstream offset; |
| 72 | + offset << frame_offset << "s"; |
| 73 | + effects.emplace_back(std::vector<std::string>{"trim", offset.str()}); |
77 | 74 | } |
78 | 75 |
|
79 | | - const int64_t sample_end = [&]() { |
80 | | - if (num_frames == -1) |
81 | | - return num_total_samples; |
82 | | - const int64_t sample_end_ = num_channels * num_frames + sample_start; |
83 | | - if (num_total_samples < sample_end_) { |
84 | | - // For lossy encodings, it is difficult to predict the exact buffer size |
85 | | - // needed to read the requested number of samples. |
86 | | - // So we allocate a buffer sized for the given `num_frames` and ask sox to read as |
87 | | - // much as possible. For lossless formats, sox reads the exact number of |
88 | | - // samples, but for lossy encodings (e.g. mp3), sox can end up reading fewer. |
89 | | - // For consistent behavior between lossy and lossless |
90 | | - // formats, we allow users to provide a `num_frames` value that exceeds the number of |
91 | | - // available samples, and we adjust it here. |
92 | | - return num_total_samples; |
93 | | - } |
94 | | - return sample_end_; |
95 | | - }(); |
96 | | - |
97 | | - const int64_t max_samples = sample_end - sample_start; |
98 | | - |
99 | | - // Read samples into buffer |
100 | | - std::vector<sox_sample_t> buffer; |
101 | | - buffer.reserve(max_samples); |
102 | | - const int64_t num_samples = sox_read(sf, buffer.data(), max_samples); |
103 | | - if (num_samples == 0) { |
104 | | - throw std::runtime_error( |
105 | | - "Error reading audio file: empty file or read operation failed."); |
106 | | - } |
107 | | - // NOTE: num_samples may be smaller than max_samples if the input |
108 | | - // format is compressed (e.g. mp3). |
109 | | - |
110 | | - // Convert to Tensor |
111 | | - auto tensor = convert_to_tensor( |
112 | | - buffer.data(), |
113 | | - num_samples, |
114 | | - num_channels, |
115 | | - get_dtype(sf->encoding.encoding, sf->signal.precision), |
116 | | - normalize, |
117 | | - channels_first); |
118 | | - |
119 | | - return c10::make_intrusive<TensorSignal>( |
120 | | - tensor, static_cast<int64_t>(sf->signal.rate), channels_first); |
| 76 | + return torchaudio::sox_effects::apply_effects_file(path, effects, normalize, channels_first); |
121 | 77 | } |
122 | 78 |
|
123 | 79 | void save_audio_file( |
|
0 commit comments