From 3bf33aacc2681524573ebe57efe76cb3539f21c1 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 15 Oct 2025 11:38:25 +0200
Subject: [PATCH 1/4] llama-quant: add support for mmproj

---
 src/llama-arch.cpp  | 5 +++++
 src/llama-arch.h    | 1 +
 src/llama-model.cpp | 4 +++-
 src/llama-quant.cpp | 8 +++++++-
 src/llama.cpp       | 3 +++
 5 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 869e4dccf0dc9..b7e00b275b6f7 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -5,6 +5,7 @@
 #include <map>
 
 static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+    { LLM_ARCH_CLIP,            "clip"         }, // dummy, only used by llama-quantize
     { LLM_ARCH_LLAMA,           "llama"        },
     { LLM_ARCH_LLAMA4,          "llama4"       },
     { LLM_ARCH_DECI,            "deci"         },
@@ -275,6 +276,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
 };
 
 static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
+    {
+        LLM_ARCH_CLIP,
+        {},
+    },
     {
         LLM_ARCH_LLAMA,
         {
diff --git a/src/llama-arch.h b/src/llama-arch.h
index c3ae71655b17b..c41de89859d5c 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -9,6 +9,7 @@
 //
 
 enum llm_arch {
+    LLM_ARCH_CLIP,
     LLM_ARCH_LLAMA,
     LLM_ARCH_LLAMA4,
     LLM_ARCH_DECI,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 0cdad9babd9b2..5002bd42ff04e 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -478,7 +478,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_GENERAL_NAME, name, false);
 
     // everything past this point is not vocab-related
-    if (hparams.vocab_only) {
+    // for CLIP models, we only need to load tensors, no hparams
+    if (hparams.vocab_only || ml.get_arch() == LLM_ARCH_CLIP) {
         return;
     }
 
@@ -20013,6 +20014,7 @@ int32_t llama_n_head(const llama_model * model) {
 llama_rope_type llama_model_rope_type(const llama_model * model) {
     switch (model->arch) {
         // these models do not use RoPE
+        case LLM_ARCH_CLIP:
         case LLM_ARCH_GPT2:
         case LLM_ARCH_GPTJ:
         case LLM_ARCH_MPT:
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 97228b2a69324..2304e9f6b6789 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -701,6 +701,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         });
     }
 
+    bool is_clip_model = false;
     for (const auto * it : tensors) {
         const struct ggml_tensor * tensor = it->tensor;
 
@@ -714,12 +715,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
             qs.has_output = true;
         }
+
+        is_clip_model |= name.find("mm.") != std::string::npos;
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
 
     // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0)
+    if (qs.n_attention_wv != 0 && !is_clip_model)
     {
         const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
         // attention layers have a non-zero number of kv heads
@@ -881,6 +884,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         // do not quantize relative position bias (T5)
         quantize &= name.find("attn_rel_b.weight") == std::string::npos;
 
+        // do not quantize specific multimodal tensors
+        quantize &= name.find(".position_embd.") == std::string::npos;
+
         ggml_type new_type;
         void * new_data;
         size_t new_size;
diff --git a/src/llama.cpp b/src/llama.cpp
index 38700f97a0688..e0dfceed03622 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -124,6 +124,9 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
     } catch(const std::exception & e) {
         throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
     }
+    if (model.arch == LLM_ARCH_CLIP) {
+        throw std::runtime_error("mmproj cannot be used as main model, use it with --mmproj instead");
+    }
     try {
         model.load_vocab(ml);
     } catch(const std::exception & e) {

From 94e32467585fc3398a4a100a8251eb1cae3da3e4 Mon Sep 17 00:00:00 2001
From: Xuan-Son Nguyen
Date: Wed, 15 Oct 2025 12:19:04 +0200
Subject: [PATCH 2/4] Update src/llama.cpp

Co-authored-by: Georgi Gerganov
---
 src/llama.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama.cpp b/src/llama.cpp
index e0dfceed03622..ab2e9868af468 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -125,7 +125,7 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
         throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
     }
     if (model.arch == LLM_ARCH_CLIP) {
-        throw std::runtime_error("mmproj cannot be used as main model, use it with --mmproj instead");
+        throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
     }
     try {
         model.load_vocab(ml);

From aacdf2bacf35b611df2db6226d9643b278a69412 Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 15 Oct 2025 12:20:58 +0200
Subject: [PATCH 3/4] check prefix instead

---
 src/llama-quant.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 2304e9f6b6789..db6dbea608dc1 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -716,7 +716,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             qs.has_output = true;
         }
 
-        is_clip_model |= name.find("mm.") != std::string::npos;
+        is_clip_model |= name.rfind("mm.") == 0; // check the "mm." prefix
     }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;

From 2312fad1660c6c64a17ee4a3d556354fa9a91c4a Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 15 Oct 2025 13:05:06 +0200
Subject: [PATCH 4/4] small fix

---
 src/llama-quant.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index db6dbea608dc1..6dd40412b488e 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -716,7 +716,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
             qs.has_output = true;
         }
 
-        is_clip_model |= name.rfind("mm.") == 0; // check the "mm." prefix
+        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
    }
 
     qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;