Support encoding to file-like object (#754)

NicolasHug · web-flow · commit 1ec566667ca0 · 2025-08-13T09:42:03.000+01:00
diff --git a/src/torchcodec/_core/AVIOContextHolder.cpp b/src/torchcodec/_core/AVIOContextHolder.cpp
@@ -14,6 +14,7 @@ void AVIOContextHolder::createAVIOContext(
     AVIOWriteFunction write,
     AVIOSeekFunction seek,
     void* heldData,
+    bool isForWriting,
     int bufferSize) {
   TORCH_CHECK(
       bufferSize > 0,
@@ -23,14 +24,18 @@ void AVIOContextHolder::createAVIOContext(
       buffer != nullptr,
       "Failed to allocate buffer of size " + std::to_string(bufferSize));
 
-  TORCH_CHECK(
-      (seek != nullptr) && ((write != nullptr) ^ (read != nullptr)),
-      "seek method must be defined, and either write or read must be defined. "
-      "But not both!")
+  TORCH_CHECK(seek != nullptr, "seek method must be defined");
+
+  if (isForWriting) {
+    TORCH_CHECK(write != nullptr, "write method must be defined for writing");
+  } else {
+    TORCH_CHECK(read != nullptr, "read method must be defined for reading");
+  }
+
   avioContext_.reset(avioAllocContext(
       buffer,
       bufferSize,
-      /*write_flag=*/write != nullptr,
+      /*write_flag=*/isForWriting,
       heldData,
       read,
       write,
diff --git a/src/torchcodec/_core/AVIOContextHolder.h b/src/torchcodec/_core/AVIOContextHolder.h
@@ -51,6 +51,7 @@ class AVIOContextHolder {
       AVIOWriteFunction write,
       AVIOSeekFunction seek,
       void* heldData,
+      bool isForWriting,
       int bufferSize = defaultBufferSize);
 
  private:
diff --git a/src/torchcodec/_core/AVIOFileLikeContext.cpp b/src/torchcodec/_core/AVIOFileLikeContext.cpp
@@ -9,21 +9,29 @@
 
 namespace facebook::torchcodec {
 
-AVIOFileLikeContext::AVIOFileLikeContext(py::object fileLike)
+AVIOFileLikeContext::AVIOFileLikeContext(py::object fileLike, bool isForWriting)
     : fileLike_{UniquePyObject(new py::object(fileLike))} {
   {
     // TODO: Is it necessary to acquire the GIL here? Is it maybe even
     // harmful? At the moment, this is only called from within a pybind
     // function, and pybind guarantees we have the GIL.
     py::gil_scoped_acquire gil;
-    TORCH_CHECK(
-        py::hasattr(fileLike, "read"),
-        "File like object must implement a read method.");
+
+    if (isForWriting) {
+      TORCH_CHECK(
+          py::hasattr(fileLike, "write"),
+          "File like object must implement a write method for writing.");
+    } else {
+      TORCH_CHECK(
+          py::hasattr(fileLike, "read"),
+          "File like object must implement a read method for reading.");
+    }
+
     TORCH_CHECK(
         py::hasattr(fileLike, "seek"),
         "File like object must implement a seek method.");
   }
-  createAVIOContext(&read, nullptr, &seek, &fileLike_);
+  createAVIOContext(&read, &write, &seek, &fileLike_, isForWriting);
 }
 
 int AVIOFileLikeContext::read(void* opaque, uint8_t* buf, int buf_size) {
@@ -77,4 +85,12 @@ int64_t AVIOFileLikeContext::seek(void* opaque, int64_t offset, int whence) {
   return py::cast<int64_t>((*fileLike)->attr("seek")(offset, whence));
 }
 
+int AVIOFileLikeContext::write(void* opaque, const uint8_t* buf, int buf_size) {
+  auto fileLike = static_cast<UniquePyObject*>(opaque);
+  py::gil_scoped_acquire gil;
+  py::bytes bytes_obj(reinterpret_cast<const char*>(buf), buf_size);
+
+  return py::cast<int64_t>((*fileLike)->attr("write")(bytes_obj));
+}
+
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/AVIOFileLikeContext.h b/src/torchcodec/_core/AVIOFileLikeContext.h
@@ -19,11 +19,12 @@ namespace facebook::torchcodec {
 // and seek calls back up to the methods on the Python object.
 class AVIOFileLikeContext : public AVIOContextHolder {
  public:
-  explicit AVIOFileLikeContext(py::object fileLike);
+  explicit AVIOFileLikeContext(py::object fileLike, bool isForWriting);
 
  private:
   static int read(void* opaque, uint8_t* buf, int buf_size);
   static int64_t seek(void* opaque, int64_t offset, int whence);
+  static int write(void* opaque, const uint8_t* buf, int buf_size);
 
   // Note that we dynamically allocate the Python object because we need to
   // strictly control when its destructor is called. We must hold the GIL
diff --git a/src/torchcodec/_core/AVIOTensorContext.cpp b/src/torchcodec/_core/AVIOTensorContext.cpp
@@ -105,12 +105,14 @@ AVIOFromTensorContext::AVIOFromTensorContext(torch::Tensor data)
   TORCH_CHECK(data.numel() > 0, "data must not be empty");
   TORCH_CHECK(data.is_contiguous(), "data must be contiguous");
   TORCH_CHECK(data.scalar_type() == torch::kUInt8, "data must be kUInt8");
-  createAVIOContext(&read, nullptr, &seek, &tensorContext_);
+  createAVIOContext(
+      &read, nullptr, &seek, &tensorContext_, /*isForWriting=*/false);
 }
 
 AVIOToTensorContext::AVIOToTensorContext()
     : tensorContext_{torch::empty({INITIAL_TENSOR_SIZE}, {torch::kUInt8}), 0} {
-  createAVIOContext(nullptr, &write, &seek, &tensorContext_);
+  createAVIOContext(
+      nullptr, &write, &seek, &tensorContext_, /*isForWriting=*/true);
 }
 
 torch::Tensor AVIOToTensorContext::getOutputTensor() {
diff --git a/src/torchcodec/_core/Encoder.cpp b/src/torchcodec/_core/Encoder.cpp
@@ -149,7 +149,7 @@ AudioEncoder::AudioEncoder(
     const torch::Tensor& samples,
     int sampleRate,
     std::string_view formatName,
-    std::unique_ptr<AVIOToTensorContext> avioContextHolder,
+    std::unique_ptr<AVIOContextHolder> avioContextHolder,
     const AudioStreamOptions& audioStreamOptions)
     : samples_(validateSamples(samples)),
       inSampleRate_(sampleRate),
@@ -248,9 +248,12 @@ void AudioEncoder::initializeEncoder(
 torch::Tensor AudioEncoder::encodeToTensor() {
   TORCH_CHECK(
       avioContextHolder_ != nullptr,
-      "Cannot encode to tensor, avio context doesn't exist.");
+      "Cannot encode to tensor, avio tensor context doesn't exist.");
   encode();
-  return avioContextHolder_->getOutputTensor();
+  auto avioToTensorContext =
+      dynamic_cast<AVIOToTensorContext*>(avioContextHolder_.get());
+  TORCH_CHECK(avioToTensorContext != nullptr, "Invalid AVIO context holder.");
+  return avioToTensorContext->getOutputTensor();
 }
 
 void AudioEncoder::encode() {
@@ -501,6 +504,7 @@ void AudioEncoder::maybeFlushSwrBuffers(AutoAVPacket& autoAVPacket) {
 void AudioEncoder::flushBuffers() {
   AutoAVPacket autoAVPacket;
   maybeFlushSwrBuffers(autoAVPacket);
+
   encodeFrame(autoAVPacket, UniqueAVFrame(nullptr));
 }
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/_core/Encoder.h b/src/torchcodec/_core/Encoder.h
@@ -1,6 +1,6 @@
 #pragma once
 #include <torch/types.h>
-#include "src/torchcodec/_core/AVIOTensorContext.h"
+#include "src/torchcodec/_core/AVIOContextHolder.h"
 #include "src/torchcodec/_core/FFMPEGCommon.h"
 #include "src/torchcodec/_core/StreamOptions.h"
 
@@ -14,13 +14,16 @@ class AudioEncoder {
       int sampleRate,
       std::string_view fileName,
       const AudioStreamOptions& audioStreamOptions);
+
   AudioEncoder(
       const torch::Tensor& samples,
       int sampleRate,
       std::string_view formatName,
-      std::unique_ptr<AVIOToTensorContext> avioContextHolder,
+      std::unique_ptr<AVIOContextHolder> avioContextHolder,
       const AudioStreamOptions& audioStreamOptions);
+
   void encode();
+
   torch::Tensor encodeToTensor();
 
  private:
@@ -49,8 +52,7 @@ class AudioEncoder {
 
   UniqueAVAudioFifo avAudioFifo_;
 
-  // Stores the AVIOContext for the output tensor buffer.
-  std::unique_ptr<AVIOToTensorContext> avioContextHolder_;
+  std::unique_ptr<AVIOContextHolder> avioContextHolder_;
 
   bool encodeWasCalled_ = false;
   int64_t lastEncodedAVFramePts_ = 0;
diff --git a/src/torchcodec/_core/__init__.py b/src/torchcodec/_core/__init__.py
@@ -23,6 +23,7 @@
     create_from_file_like,
     create_from_tensor,
     encode_audio_to_file,
+    encode_audio_to_file_like,
     encode_audio_to_tensor,
     get_ffmpeg_library_versions,
     get_frame_at_index,
diff --git a/src/torchcodec/_core/ops.py b/src/torchcodec/_core/ops.py
@@ -151,6 +151,62 @@ def create_from_file_like(
     return _convert_to_tensor(_pybind_ops.create_from_file_like(file_like, seek_mode))
 
 
+def encode_audio_to_file_like(
+    samples: torch.Tensor,
+    sample_rate: int,
+    format: str,
+    file_like: Union[io.RawIOBase, io.BufferedIOBase],
+    bit_rate: Optional[int] = None,
+    num_channels: Optional[int] = None,
+    desired_sample_rate: Optional[int] = None,
+) -> None:
+    """Encode audio samples to a file-like object.
+
+    Args:
+        samples: Audio samples tensor
+        sample_rate: Sample rate in Hz
+        format: Audio format (e.g., "wav", "mp3", "flac")
+        file_like: File-like object that supports write() and seek() methods
+        bit_rate: Optional bit rate for encoding
+        num_channels: Optional number of output channels
+        desired_sample_rate: Optional desired sample rate for the output.
+    """
+    assert _pybind_ops is not None
+
+    if samples.dtype != torch.float32:
+        raise ValueError(f"samples must have dtype torch.float32, got {samples.dtype}")
+
+    # We're having the same problem as with the decoder's create_from_file_like:
+    # We should be able to pass a tensor directly, but this leads to a pybind
+    # error. In order to work around this, we pass the pointer to the tensor's
+    # data, and its shape, in order to re-construct it in C++. For this to work:
+    # - the tensor must be float32
+    # - the tensor must be contiguous, which is why we call contiguous().
+    #   In theory we could avoid this restriction by also passing the strides?
+    # - IMPORTANT: the input samples tensor and its underlying data must be
+    #   alive during the call.
+    #
+    # A more elegant solution would be to cast the tensor into a py::object, but
+    # casting the py::object back to a tensor in C++ seems to lead to the same
+    # pybind error.
+
+    samples = samples.contiguous()
+    _pybind_ops.encode_audio_to_file_like(
+        samples.data_ptr(),
+        list(samples.shape),
+        sample_rate,
+        format,
+        file_like,
+        bit_rate,
+        num_channels,
+        desired_sample_rate,
+    )
+
+    # This check looks useless, but it's critical to keep it: it ensures that
+    # samples is still alive during the call to encode_audio_to_file_like.
+    assert samples.is_contiguous()
+
+
 # ==============================
 # Abstract impl for the operators. Needed by torch.compile.
 # ==============================
diff --git a/src/torchcodec/_core/pybind_ops.cpp b/src/torchcodec/_core/pybind_ops.cpp
@@ -10,7 +10,9 @@
 #include <string>
 
 #include "src/torchcodec/_core/AVIOFileLikeContext.h"
+#include "src/torchcodec/_core/Encoder.h"
 #include "src/torchcodec/_core/SingleStreamDecoder.h"
+#include "src/torchcodec/_core/StreamOptions.h"
 
 namespace py = pybind11;
 
@@ -31,19 +33,55 @@ int64_t create_from_file_like(
     realSeek = seekModeFromString(seek_mode.value());
   }
 
-  auto avioContextHolder = std::make_unique<AVIOFileLikeContext>(file_like);
+  auto avioContextHolder =
+      std::make_unique<AVIOFileLikeContext>(file_like, /*isForWriting=*/false);
 
   SingleStreamDecoder* decoder =
       new SingleStreamDecoder(std::move(avioContextHolder), realSeek);
   return reinterpret_cast<int64_t>(decoder);
 }
 
+void encode_audio_to_file_like(
+    int64_t data_ptr,
+    const std::vector<int64_t>& shape,
+    int64_t sample_rate,
+    std::string_view format,
+    py::object file_like,
+    std::optional<int64_t> bit_rate = std::nullopt,
+    std::optional<int64_t> num_channels = std::nullopt,
+    std::optional<int64_t> desired_sample_rate = std::nullopt) {
+  // We assume float32 *and* contiguity; this must be enforced by the caller.
+  auto tensor_options = torch::TensorOptions().dtype(torch::kFloat32);
+  auto samples = torch::from_blob(
+      reinterpret_cast<void*>(data_ptr), shape, tensor_options);
+
+  // TODO Fix implicit int conversion:
+  // https://github.com/pytorch/torchcodec/issues/679
+  // same for sample_rate parameter below
+  AudioStreamOptions audioStreamOptions;
+  audioStreamOptions.bitRate = bit_rate;
+  audioStreamOptions.numChannels = num_channels;
+  audioStreamOptions.sampleRate = desired_sample_rate;
+
+  auto avioContextHolder =
+      std::make_unique<AVIOFileLikeContext>(file_like, /*isForWriting=*/true);
+
+  AudioEncoder encoder(
+      samples,
+      static_cast<int>(sample_rate),
+      format,
+      std::move(avioContextHolder),
+      audioStreamOptions);
+  encoder.encode();
+}
+
 #ifndef PYBIND_OPS_MODULE_NAME
 #error PYBIND_OPS_MODULE_NAME must be defined!
 #endif
 
 PYBIND11_MODULE(PYBIND_OPS_MODULE_NAME, m) {
   m.def("create_from_file_like", &create_from_file_like);
+  m.def("encode_audio_to_file_like", &encode_audio_to_file_like);
 }
 
 } // namespace facebook::torchcodec
diff --git a/src/torchcodec/encoders/_audio_encoder.py b/src/torchcodec/encoders/_audio_encoder.py
@@ -108,3 +108,42 @@ def to_tensor(
             num_channels=num_channels,
             desired_sample_rate=sample_rate,
         )
+
+    def to_file_like(
+        self,
+        file_like,
+        format: str,
+        *,
+        bit_rate: Optional[int] = None,
+        num_channels: Optional[int] = None,
+        sample_rate: Optional[int] = None,
+    ) -> None:
+        """Encode samples into a file-like object.
+
+        Args:
+            file_like: A file-like object that supports ``write()`` and
+                ``seek()`` methods, such as io.BytesIO(), an open file in binary
+                write mode, etc. Methods must have the following signature:
+                ``write(data: bytes) -> int`` and ``seek(offset: int, whence:
+                int = 0) -> int``.
+            format (str): The format of the encoded samples, e.g. "mp3", "wav"
+                or "flac".
+            bit_rate (int, optional): The output bit rate. Encoders typically
+                support a finite set of bit rate values, so ``bit_rate`` will be
+                matched to one of those supported values. The default is chosen
+                by FFmpeg.
+            num_channels (int, optional): The number of channels of the encoded
+                output samples. By default, the number of channels of the input
+                ``samples`` is used.
+            sample_rate (int, optional): The sample rate of the encoded output.
+                By default, the sample rate of the input ``samples`` is used.
+        """
+        _core.encode_audio_to_file_like(
+            samples=self._samples,
+            sample_rate=self._sample_rate,
+            format=format,
+            file_like=file_like,
+            bit_rate=bit_rate,
+            num_channels=num_channels,
+            desired_sample_rate=sample_rate,
+        )
diff --git a/test/test_encoders.py b/test/test_encoders.py