From a765f3bd65b563a1ce03e21320a64de622c4a590 Mon Sep 17 00:00:00 2001
From: Dennis Keck <26092524+fellhorn@users.noreply.github.com>
Date: Sun, 31 Aug 2025 23:34:04 +0200
Subject: [PATCH 1/5] Address code review comments

Signed-off-by: Dennis Keck <26092524+fellhorn@users.noreply.github.com>
---
 llama-cpp-2/src/mtmd.rs | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/llama-cpp-2/src/mtmd.rs b/llama-cpp-2/src/mtmd.rs
index e1f712ad..d03dcfeb 100644
--- a/llama-cpp-2/src/mtmd.rs
+++ b/llama-cpp-2/src/mtmd.rs
@@ -43,7 +43,7 @@ impl From<llama_cpp_sys_2::mtmd_input_chunk_type> for MtmdInputChunkType {
             llama_cpp_sys_2::MTMD_INPUT_CHUNK_TYPE_TEXT => MtmdInputChunkType::Text,
             llama_cpp_sys_2::MTMD_INPUT_CHUNK_TYPE_IMAGE => MtmdInputChunkType::Image,
             llama_cpp_sys_2::MTMD_INPUT_CHUNK_TYPE_AUDIO => MtmdInputChunkType::Audio,
-            _ => panic!("Unknown MTMD input chunk type"),
+            _ => panic!("Unknown MTMD input chunk type: {}", chunk_type),
         }
     }
 }
@@ -211,10 +211,11 @@ impl MtmdContext {
     }
 
     /// Get audio bitrate in Hz (e.g., 16000 for Whisper).
-    /// Returns -1 if audio is not supported.
+    /// Returns `None` if audio is not supported.
     #[must_use]
-    pub fn get_audio_bitrate(&self) -> i32 {
-        unsafe { llama_cpp_sys_2::mtmd_get_audio_bitrate(self.context.as_ptr()) }
+    pub fn get_audio_bitrate(&self) -> Option<u32> {
+        let rate = unsafe { llama_cpp_sys_2::mtmd_get_audio_bitrate(self.context.as_ptr()) };
+        (rate > 0).then(|| rate as u32)
     }
 
     /// Tokenize input text and bitmaps into chunks.

From ae21a1cc063019f8a2cb30626f185f128dd40570 Mon Sep 17 00:00:00 2001
From: Dennis Keck <26092524+fellhorn@users.noreply.github.com>
Date: Tue, 2 Sep 2025 00:10:48 +0200
Subject: [PATCH 2/5] Fix Clippy lints & make batch size configurable

Signed-off-by: Dennis Keck <26092524+fellhorn@users.noreply.github.com>
---
 examples/mtmd/src/mtmd.rs | 12 ++++++++----
 llama-cpp-2/src/mtmd.rs   | 33 ++++++++++++++++-----------------
 2 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/examples/mtmd/src/mtmd.rs b/examples/mtmd/src/mtmd.rs
index 6a704d3c..7e0c3179 100644
--- a/examples/mtmd/src/mtmd.rs
+++ b/examples/mtmd/src/mtmd.rs
@@ -50,6 +50,9 @@ pub struct MtmdCliParams {
     /// Number of threads
     #[arg(short = 't', long = "threads", value_name = "N", default_value = "4")]
     pub n_threads: i32,
+    /// Number of tokens to process in a batch during eval chunks
+    #[arg(long = "batch-size", value_name = "b", default_value = "64")]
+    pub batch_size: i32,
     /// Maximum number of tokens in context
     #[arg(long = "n-tokens", value_name = "N", default_value = "4096")]
     pub n_tokens: NonZeroU32,
@@ -140,6 +143,7 @@ impl MtmdCliContext {
         context: &mut LlamaContext,
         msg: LlamaChatMessage,
         add_bos: bool,
+        batch_size: i32,
     ) -> Result<(), Box<dyn std::error::Error>> {
         self.chat.push(msg);
 
@@ -168,7 +172,7 @@ impl MtmdCliContext {
         // Clear bitmaps after tokenization
         self.bitmaps.clear();
 
-        self.n_past = chunks.eval_chunks(&self.mtmd_ctx, context, 0, 0, 1, true)?;
+        self.n_past = chunks.eval_chunks(&self.mtmd_ctx, context, 0, 0, batch_size, true)?;
         Ok(())
     }
 
@@ -186,7 +190,7 @@ impl MtmdCliContext {
 
         for _i in 0..max_predict {
             // Sample next token
-            let token = sampler.sample(context, 0);
+            let token = sampler.sample(context, -1);
             generated_tokens.push(token);
             sampler.accept(token);
 
@@ -244,7 +248,7 @@ fn run_single_turn(
     println!("Evaluating message: {msg:?}");
 
     // Evaluate the message (prefill)
-    ctx.eval_message(model, context, msg, true)?;
+    ctx.eval_message(model, context, msg, true, params.batch_size)?;
 
     // Generate response (decode)
     ctx.generate_response(model, context, sampler, params.n_predict)?;
@@ -286,7 +290,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
     // Create context
     let context_params = LlamaContextParams::default()
         .with_n_threads(params.n_threads)
-        .with_n_batch(1)
+        .with_n_batch(params.batch_size.try_into()?)
         .with_n_ctx(Some(params.n_tokens));
     let mut context = model.new_context(&backend, context_params)?;
 
diff --git a/llama-cpp-2/src/mtmd.rs b/llama-cpp-2/src/mtmd.rs
index d03dcfeb..226937a5 100644
--- a/llama-cpp-2/src/mtmd.rs
+++ b/llama-cpp-2/src/mtmd.rs
@@ -30,10 +30,13 @@ use crate::token::LlamaToken;
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum MtmdInputChunkType {
     /// Text input chunk
+    #[allow(clippy::cast_possible_wrap)]
     Text = llama_cpp_sys_2::MTMD_INPUT_CHUNK_TYPE_TEXT as isize,
     /// Image input chunk
+    #[allow(clippy::cast_possible_wrap)]
     Image = llama_cpp_sys_2::MTMD_INPUT_CHUNK_TYPE_IMAGE as isize,
     /// Audio input chunk
+    #[allow(clippy::cast_possible_wrap)]
     Audio = llama_cpp_sys_2::MTMD_INPUT_CHUNK_TYPE_AUDIO as isize,
 }
 
@@ -43,7 +46,7 @@ impl From<llama_cpp_sys_2::mtmd_input_chunk_type> for MtmdInputChunkType {
             llama_cpp_sys_2::MTMD_INPUT_CHUNK_TYPE_TEXT => MtmdInputChunkType::Text,
             llama_cpp_sys_2::MTMD_INPUT_CHUNK_TYPE_IMAGE => MtmdInputChunkType::Image,
             llama_cpp_sys_2::MTMD_INPUT_CHUNK_TYPE_AUDIO => MtmdInputChunkType::Audio,
-            _ => panic!("Unknown MTMD input chunk type: {}", chunk_type),
+            _ => panic!("Unknown MTMD input chunk type: {chunk_type}"),
         }
     }
 }
@@ -106,9 +109,7 @@ impl From<llama_cpp_sys_2::mtmd_context_params> for MtmdContextParams {
             use_gpu: params.use_gpu,
             print_timings: params.print_timings,
             n_threads: params.n_threads,
-            media_marker: unsafe { CStr::from_ptr(params.media_marker) }
-                .to_owned()
-                .into(),
+            media_marker: unsafe { CStr::from_ptr(params.media_marker) }.to_owned(),
         }
     }
 }
@@ -215,7 +216,7 @@ impl MtmdContext {
     #[must_use]
     pub fn get_audio_bitrate(&self) -> Option<u32> {
         let rate = unsafe { llama_cpp_sys_2::mtmd_get_audio_bitrate(self.context.as_ptr()) };
-        (rate > 0).then(|| rate as u32)
+        (rate > 0).then_some(rate.unsigned_abs())
     }
 
     /// Tokenize input text and bitmaps into chunks.
@@ -276,7 +277,7 @@ impl MtmdContext {
             llama_cpp_sys_2::mtmd_tokenize(
                 self.context.as_ptr(),
                 chunks.chunks.as_ptr(),
-                &input_text,
+                &raw const input_text,
                 bitmap_ptrs.as_ptr().cast_mut(),
                 bitmaps.len(),
             )
@@ -627,15 +628,10 @@ impl MtmdInputChunks {
         let chunk_ptr =
             unsafe { llama_cpp_sys_2::mtmd_input_chunks_get(self.chunks.as_ptr(), index) };
 
-        if chunk_ptr.is_null() {
-            None
-        } else {
-            // Note: We don't own this chunk, it's owned by the chunks collection
-            Some(MtmdInputChunk {
-                chunk: NonNull::new(chunk_ptr.cast_mut()).unwrap(),
-                owned: false,
-            })
-        }
+        NonNull::new(chunk_ptr.cast_mut()).map(|ptr| MtmdInputChunk {
+            chunk: ptr,
+            owned: false,
+        })
     }
 
     /// Get total number of tokens across all chunks.
@@ -702,7 +698,7 @@ impl MtmdInputChunks {
                 seq_id,
                 n_batch,
                 logits_last,
-                &mut new_n_past,
+                &raw mut new_n_past,
             )
         };
 
@@ -754,7 +750,10 @@ impl MtmdInputChunk {
 
         let mut n_tokens = 0usize;
         let tokens_ptr = unsafe {
-            llama_cpp_sys_2::mtmd_input_chunk_get_tokens_text(self.chunk.as_ptr(), &mut n_tokens)
+            llama_cpp_sys_2::mtmd_input_chunk_get_tokens_text(
+                self.chunk.as_ptr(),
+                &raw mut n_tokens,
+            )
         };
 
         if tokens_ptr.is_null() || n_tokens == 0 {

From 40f398e2b19e046d220ca5fdaaf0fc630ae87f51 Mon Sep 17 00:00:00 2001
From: Dennis Keck <26092524+fellhorn@users.noreply.github.com>
Date: Tue, 2 Sep 2025 00:23:12 +0200
Subject: [PATCH 3/5] MtmdInputChunkType u32 repr

Signed-off-by: Dennis Keck <26092524+fellhorn@users.noreply.github.com>
---
 llama-cpp-2/src/mtmd.rs | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/llama-cpp-2/src/mtmd.rs b/llama-cpp-2/src/mtmd.rs
index 226937a5..28055619 100644
--- a/llama-cpp-2/src/mtmd.rs
+++ b/llama-cpp-2/src/mtmd.rs
@@ -25,19 +25,18 @@ use crate::token::LlamaToken;
 /// let audio_chunk = MtmdInputChunkType::Audio;
 ///
 /// assert_eq!(text_chunk, MtmdInputChunkType::Text);
+/// assert_eq!(text_chunk as u32, llama_cpp_sys_2::MTMD_INPUT_CHUNK_TYPE_TEXT);
 /// assert_ne!(text_chunk, image_chunk);
 /// ```
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
+#[repr(u32)]
 pub enum MtmdInputChunkType {
     /// Text input chunk
-    #[allow(clippy::cast_possible_wrap)]
-    Text = llama_cpp_sys_2::MTMD_INPUT_CHUNK_TYPE_TEXT as isize,
+    Text = llama_cpp_sys_2::MTMD_INPUT_CHUNK_TYPE_TEXT,
     /// Image input chunk
-    #[allow(clippy::cast_possible_wrap)]
-    Image = llama_cpp_sys_2::MTMD_INPUT_CHUNK_TYPE_IMAGE as isize,
+    Image = llama_cpp_sys_2::MTMD_INPUT_CHUNK_TYPE_IMAGE,
     /// Audio input chunk
-    #[allow(clippy::cast_possible_wrap)]
-    Audio = llama_cpp_sys_2::MTMD_INPUT_CHUNK_TYPE_AUDIO as isize,
+    Audio = llama_cpp_sys_2::MTMD_INPUT_CHUNK_TYPE_AUDIO,
 }
 
 impl From<llama_cpp_sys_2::mtmd_input_chunk_type> for MtmdInputChunkType {

From 3ddf41abcb1697cf96a02970ce3ce43ef68c000f Mon Sep 17 00:00:00 2001
From: Dennis Keck <26092524+fellhorn@users.noreply.github.com>
Date: Tue, 2 Sep 2025 09:51:50 +0200
Subject: [PATCH 4/5] Add comment re ownership

Signed-off-by: Dennis Keck <26092524+fellhorn@users.noreply.github.com>
---
 llama-cpp-2/src/mtmd.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llama-cpp-2/src/mtmd.rs b/llama-cpp-2/src/mtmd.rs
index 28055619..6ff50054 100644
--- a/llama-cpp-2/src/mtmd.rs
+++ b/llama-cpp-2/src/mtmd.rs
@@ -627,6 +627,7 @@ impl MtmdInputChunks {
         let chunk_ptr =
             unsafe { llama_cpp_sys_2::mtmd_input_chunks_get(self.chunks.as_ptr(), index) };
 
+        // Note: We don't own this chunk, it's owned by the chunks collection
         NonNull::new(chunk_ptr.cast_mut()).map(|ptr| MtmdInputChunk {
             chunk: ptr,
             owned: false,

From 94a83e9a31ea8dc5f920aa29afe0c1d75cd2fa0c Mon Sep 17 00:00:00 2001
From: Dennis Keck <26092524+fellhorn@users.noreply.github.com>
Date: Thu, 4 Sep 2025 00:20:51 +0200
Subject: [PATCH 5/5] Fix MtmdInputChunkType Windows build & default batch size to 1

Signed-off-by: Dennis Keck <26092524+fellhorn@users.noreply.github.com>
---
 examples/mtmd/src/mtmd.rs | 2 +-
 llama-cpp-2/src/mtmd.rs   | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/mtmd/src/mtmd.rs b/examples/mtmd/src/mtmd.rs
index 7e0c3179..e0f52de9 100644
--- a/examples/mtmd/src/mtmd.rs
+++ b/examples/mtmd/src/mtmd.rs
@@ -51,7 +51,7 @@ pub struct MtmdCliParams {
     #[arg(short = 't', long = "threads", value_name = "N", default_value = "4")]
     pub n_threads: i32,
     /// Number of tokens to process in a batch during eval chunks
-    #[arg(long = "batch-size", value_name = "b", default_value = "64")]
+    #[arg(long = "batch-size", value_name = "b", default_value = "1")]
     pub batch_size: i32,
     /// Maximum number of tokens in context
     #[arg(long = "n-tokens", value_name = "N", default_value = "4096")]
diff --git a/llama-cpp-2/src/mtmd.rs b/llama-cpp-2/src/mtmd.rs
index 6ff50054..71b21ef8 100644
--- a/llama-cpp-2/src/mtmd.rs
+++ b/llama-cpp-2/src/mtmd.rs
@@ -25,18 +25,18 @@ use crate::token::LlamaToken;
 /// let audio_chunk = MtmdInputChunkType::Audio;
 ///
 /// assert_eq!(text_chunk, MtmdInputChunkType::Text);
-/// assert_eq!(text_chunk as u32, llama_cpp_sys_2::MTMD_INPUT_CHUNK_TYPE_TEXT);
+/// assert_eq!(text_chunk, llama_cpp_sys_2::MTMD_INPUT_CHUNK_TYPE_TEXT.into());
 /// assert_ne!(text_chunk, image_chunk);
 /// ```
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 #[repr(u32)]
 pub enum MtmdInputChunkType {
     /// Text input chunk
-    Text = llama_cpp_sys_2::MTMD_INPUT_CHUNK_TYPE_TEXT,
+    Text = llama_cpp_sys_2::MTMD_INPUT_CHUNK_TYPE_TEXT as _,
     /// Image input chunk
-    Image = llama_cpp_sys_2::MTMD_INPUT_CHUNK_TYPE_IMAGE,
+    Image = llama_cpp_sys_2::MTMD_INPUT_CHUNK_TYPE_IMAGE as _,
     /// Audio input chunk
-    Audio = llama_cpp_sys_2::MTMD_INPUT_CHUNK_TYPE_AUDIO,
+    Audio = llama_cpp_sys_2::MTMD_INPUT_CHUNK_TYPE_AUDIO as _,
 }
 
 impl From<llama_cpp_sys_2::mtmd_input_chunk_type> for MtmdInputChunkType {