From 3bf33aacc2681524573ebe57efe76cb3539f21c1 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 15 Oct 2025 11:38:25 +0200
Subject: [PATCH 1/4] llama-quant: add support for mmproj

---
 src/llama-arch.cpp  | 5 +++++
 src/llama-arch.h    | 1 +
 src/llama-model.cpp | 4 +++-
 src/llama-quant.cpp | 8 +++++++-
 src/llama.cpp       | 3 +++
 5 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 869e4dccf0dc9..b7e00b275b6f7 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -5,6 +5,7 @@
 #include <map>
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+    { LLM_ARCH_CLIP,            "clip"         }, // dummy, only used by llama-quantize
     { LLM_ARCH_LLAMA,           "llama"        },
     { LLM_ARCH_LLAMA4,          "llama4"       },
     { LLM_ARCH_DECI,            "deci"         },
@@ -275,6 +276,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 };
 
 static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
+    {
+        LLM_ARCH_CLIP,
+        {},
+    },
     {
         LLM_ARCH_LLAMA,
         {
diff --git a/src/llama-arch.h b/src/llama-arch.h
index c3ae71655b17b..c41de89859d5c 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -9,6 +9,7 @@
 //
 
 enum llm_arch {
+    LLM_ARCH_CLIP,
     LLM_ARCH_LLAMA,
     LLM_ARCH_LLAMA4,
     LLM_ARCH_DECI,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 0cdad9babd9b2..5002bd42ff04e 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -478,7 +478,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_GENERAL_NAME, name, false);
 
     // everything past this point is not vocab-related
-    if (hparams.vocab_only) {
+    // for CLIP models, we only need to load tensors, no hparams
+    if (hparams.vocab_only || ml.get_arch() == LLM_ARCH_CLIP) {
         return;
     }
 
@@ -20013,6 +20014,7 @@ int32_t llama_n_head(const llama_model * model) {
 llama_rope_type llama_model_rope_type(const llama_model * model) {
     switch (model->arch) {
         // these models do not use RoPE
+        case LLM_ARCH_CLIP:
         case LLM_ARCH_GPT2:
         case LLM_ARCH_GPTJ:
         case LLM_ARCH_MPT:
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 97228b2a69324..2304e9f6b6789 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -701,6 +701,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }
 
+    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;
 
@@ -714,12 +715,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
+
+        is_clip_model |= name.find("mm.") != std::string::npos;
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
     // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0)
+    if (qs.n_attention_wv != 0 && !is_clip_model)
     {
         const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
         // attention layers have a non-zero number of kv heads
@@ -881,6 +884,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
+        // do not quantize specific multimodal tensors
+        quantize &= name.find(".position_embd.") == std::string::npos;
+
         ggml_type new_type;
         void * new_data;
         size_t new_size;
diff --git a/src/llama.cpp b/src/llama.cpp
index 38700f97a0688..e0dfceed03622 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -124,6 +124,9 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
     } catch(const std::exception & e) {
         throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
     }
+    if (model.arch == LLM_ARCH_CLIP) {
+        throw std::runtime_error("mmproj cannot be used as main model, use it with --mmproj instead");
+    }
     try {
         model.load_vocab(ml);
     } catch(const std::exception & e) {

From 94e32467585fc3398a4a100a8251eb1cae3da3e4 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen
Date: Wed, 15 Oct 2025 12:19:04 +0200
Subject: [PATCH 2/4] Update src/llama.cpp

Co-authored-by: Georgi Gerganov
---
 src/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index e0dfceed03622..ab2e9868af468 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -125,7 +125,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
         throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
     }
     if (model.arch == LLM_ARCH_CLIP) {
-        throw std::runtime_error("mmproj cannot be used as main model, use it with --mmproj instead");
+        throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
     }
     try {
         model.load_vocab(ml);

From aacdf2bacf35b611df2db6226d9643b278a69412 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 15 Oct 2025 12:20:58 +0200
Subject: [PATCH 3/4] check prefix instead

---
 src/llama-quant.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 2304e9f6b6789..db6dbea608dc1 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -716,7 +716,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             qs.has_output = true;
         }
 
-        is_clip_model |= name.find("mm.") != std::string::npos;
+        is_clip_model |= name.rfind("mm.") == 0; // check the "mm." prefix
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;

From 2312fad1660c6c64a17ee4a3d556354fa9a91c4a Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 15 Oct 2025 13:05:06 +0200
Subject: [PATCH 4/4] small fix

---
 src/llama-quant.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index db6dbea608dc1..6dd40412b488e 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -716,7 +716,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             qs.has_output = true;
         }
 
-        is_clip_model |= name.rfind("mm.") == 0; // check the "mm." prefix
+        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
    }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;