From 8448b23afdef5e9940ef8eac32bb62702ea0defa Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 14 Oct 2025 02:13:16 -0500 Subject: [PATCH 01/23] initial commit for branch glm45v --- convert_hf_to_gguf.py | 29 +++++++++++++++++++++++++++++ src/llama-arch.h | 2 +- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 8c5132193e0e0..36278866da005 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -9219,6 +9219,35 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [] # skip other tensors +@ModelBase.register("Glm4vMoeForConditionalGeneration") +class GLM4V_MoE(MmprojModel): + # + # the HF model's type is `glm4v_moe`. internally, it consists of two models: + # - `glm4v_moe_text` + # + main text model + # + tensor names start with "model.language_model." + # + "2D-RoPE" (aKa Roformer) w/ embeddings dynamically adapted via bicubic interpolation + # - `glm4v_moe` + # + vision adapter (ViT) + # + tensor names start with "model.visual." + # + "3D-RoPE" (without the interpolation mentioned above) + # + # other notable quirks include: + # - has MTP layer (need to keep these tensors - same as GLM-4.5-Air) + # - RoPE theta value (θ): use 10k rather than 100k for GLM-4.5-Air + # - the model's vision supports video input, but this is not implemented here + # + # for more info, refer to: + # - reference impl : https://github.com/huggingface/transformers/tree/main/src/transformers/models/glm4v_moe + # - HF model card : https://huggingface.co/zai-org/GLM-4.5V + # - arXiv paper (model) : https://arxiv.org/abs/2507.01006 + # - arXiv paper (orig. ViT) : https://arxiv.org/abs/2411.14402 + # + # TODO: the model's tokenizer has video-related special tokens - deal with these (??) 
+ # + pass + + ###### CONVERSION LOGIC ###### diff --git a/src/llama-arch.h b/src/llama-arch.h index c41de89859d5c..831ec378ef332 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -70,6 +70,7 @@ enum llm_arch { LLM_ARCH_CHATGLM, LLM_ARCH_GLM4, LLM_ARCH_GLM4_MOE, + LLM_ARCH_GLM4V_MOE, LLM_ARCH_BITNET, LLM_ARCH_T5, LLM_ARCH_T5ENCODER, @@ -123,7 +124,6 @@ enum llm_kv { LLM_KV_GENERAL_LICENSE, LLM_KV_GENERAL_SOURCE_URL, LLM_KV_GENERAL_SOURCE_HF_REPO, - LLM_KV_VOCAB_SIZE, LLM_KV_CONTEXT_LENGTH, LLM_KV_EMBEDDING_LENGTH, From 70c86861a4a0a4620f5591062240df0bb802aded Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 14 Oct 2025 16:47:12 -0500 Subject: [PATCH 02/23] use F32 accumulators for GLM4V_MOE --- src/llama-graph.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index f29a1e98c9103..ffc2187a1b107 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -817,7 +817,7 @@ ggml_tensor * llm_graph_context::build_ffn( if (down) { cur = build_lora_mm(down, cur); - if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) { + if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_GLM4V_MOE) { // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators ggml_mul_mat_set_prec(cur, GGML_PREC_F32); } @@ -1583,7 +1583,7 @@ ggml_tensor * llm_graph_context::build_attn( if (wo) { cur = build_lora_mm(wo, cur); - if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) { + if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_GLM4V_MOE) { // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators ggml_mul_mat_set_prec(cur, GGML_PREC_F32); } From 631d4fa8693b7617d8c50e8824b54f9f3580ad5e Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 14 Oct 2025 16:48:21 -0500 Subject: [PATCH 03/23] add arch --- gguf-py/gguf/constants.py | 34 ++++++++++++++++++++++++++++++---- src/llama-arch.cpp | 28 ++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index f5e5fba8008bd..0afc58331b565 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -385,6 +385,7 @@ class MODEL_ARCH(IntEnum): CHATGLM = auto() GLM4 = auto() GLM4_MOE = auto() + GLM4V_MOE = auto() BITNET = auto() T5 = auto() T5ENCODER = auto() @@ -656,10 +657,10 @@ class MODEL_TENSOR(IntEnum): A_MM_NORM_PRE = auto() A_MM_NORM_MID = auto() # nextn/mtp - NEXTN_EH_PROJ = auto() - NEXTN_EMBED_TOKENS = auto() - NEXTN_ENORM = auto() - NEXTN_HNORM = auto() + NEXTN_EH_PROJ = auto() + NEXTN_EMBED_TOKENS = auto() + NEXTN_ENORM = auto() + NEXTN_HNORM = auto() NEXTN_SHARED_HEAD_HEAD = auto() NEXTN_SHARED_HEAD_NORM = auto() @@ -729,6 +730,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.CHATGLM: "chatglm", MODEL_ARCH.GLM4: "glm4", MODEL_ARCH.GLM4_MOE: "glm4moe", + MODEL_ARCH.GLM4V_MOE: "glm4v_moe", MODEL_ARCH.BITNET: "bitnet", MODEL_ARCH.T5: "t5", MODEL_ARCH.T5ENCODER: "t5encoder", @@ -2273,6 +2275,30 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD, MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM, ], + MODEL_ARCH.GLM4V_MOE: [ # same as GLM4_MOE without MTP tensors + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, 
+ MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + MODEL_TENSOR.FFN_EXP_PROBS_B, + ], MODEL_ARCH.BITNET: [ MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index b7e00b275b6f7..f2a8cbdf99a2e 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -66,6 +66,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_CHATGLM, "chatglm" }, { LLM_ARCH_GLM4, "glm4" }, { LLM_ARCH_GLM4_MOE, "glm4moe" }, + { LLM_ARCH_GLM4V_MOE, "glm4v_moe" }, { LLM_ARCH_BITNET, "bitnet" }, { LLM_ARCH_T5, "t5" }, { LLM_ARCH_T5ENCODER, "t5encoder" }, @@ -1507,6 +1508,33 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" }, }, }, + { + LLM_ARCH_GLMV4_MOE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, + { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, + { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, + { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" }, + }, + }, { LLM_ARCH_BITNET, { From 2aa698558b0bfa58f8c759b81b34868cb8947086 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 14 Oct 2025 17:10:55 -0500 Subject: [PATCH 04/23] llama-model : add placeholders --- src/llama-model.cpp | 20 ++++++++++++++++++++ src/llama-model.h | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 5002bd42ff04e..dc0ab0bf6a6f7 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1611,6 +1611,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_GLM4V_MOE: + { + // TODO + } break; case LLM_ARCH_BITNET: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -4892,6 +4896,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } break; + case LLM_ARCH_GLM4V_MOE: + { + // TODO + } + break; case LLM_ARCH_NEMOTRON: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -14683,6 +14692,12 @@ struct llm_build_glm4_moe : public llm_graph_context { } }; +struct llm_build_glm4v_moe : public llm_graph_context { + llm_build_glm4v_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + // TODO + } +}; + struct llm_build_nemotron : public llm_graph_context { llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -19750,6 +19765,10 @@ ggml_cgraph * 
llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_GLM4V_MOE: + { + llm = std::make_unique(*this, params); + } break; case LLM_ARCH_BITNET: { llm = std::make_unique(*this, params); @@ -20119,6 +20138,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { return LLAMA_ROPE_TYPE_NEOX; case LLM_ARCH_QWEN2VL: + case LLM_ARCH_GLM4V_MOE: return LLAMA_ROPE_TYPE_MROPE; // all model arches should be listed explicitly here diff --git a/src/llama-model.h b/src/llama-model.h index 7f48662f2807a..2c9b05fbc790f 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -110,7 +110,7 @@ enum llm_type { LLM_TYPE_8B_A1B, // lfm2moe LLM_TYPE_21B_A3B, // Ernie MoE small LLM_TYPE_30B_A3B, - LLM_TYPE_106B_A12B, // GLM-4.5-Air + LLM_TYPE_106B_A12B, // GLM-4.5-Air (and GLM-4.5V text model) LLM_TYPE_235B_A22B, LLM_TYPE_300B_A47B, // Ernie MoE big LLM_TYPE_355B_A32B, // GLM-4.5 From d0e9dce27d92a4be7e901ed9cec92484dd1f78a0 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 14 Oct 2025 17:13:29 -0500 Subject: [PATCH 05/23] fix arch name for tensor names --- src/llama-arch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index f2a8cbdf99a2e..6964c75ac6268 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -1509,7 +1509,7 @@ static const std::map> LLM_TENSOR_N }, }, { - LLM_ARCH_GLMV4_MOE, + LLM_ARCH_GLM4V_MOE, { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, From 01d085dd4ac152ded1db173551fb322010e4d056 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 15 Oct 2025 17:44:50 -0500 Subject: [PATCH 06/23] WIP conversion logic --- convert_hf_to_gguf.py | 68 ++++++++++++++++++++++++++------------- gguf-py/gguf/constants.py | 2 ++ 2 files changed, 47 insertions(+), 23 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 36278866da005..ee80dd9a568f5 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -9219,33 +9219,55 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [] # skip other tensors + +@ModelBase.register("Glm4vMoeForConditionalGeneration") +class GLM4V_Text_MoE(Glm4MoeModel): + """Text model from [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V) + + ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)""" + model_arch = gguf.MODEL_ARCH.GLM4_MOE + + def set_gguf_parameters(self): + # parameters specific to GLM-4.5V like rope_theta=10000 and context_length=65536 + # should be correctly picked up from the text_config by the base classes + super().set_gguf_parameters() + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + # skip vision tensors for the text model + if name.startswith("model.visual."): + return [] + + # the Glm4MoeModel class expects tensor names to start with 'model.', + # so we strip the we strip the 'language_model.' part + if name.startswith("model.language_model."): + name = name.replace("model.language_model.", "model.", 1) + + # let the parent class handle the MoE logic and tensor mapping + yield from super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("Glm4vMoeForConditionalGeneration") class GLM4V_MoE(MmprojModel): + """Multimodal projector from [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V). + + ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)""" # - # the HF model's type is `glm4v_moe`. 
internally, it consists of two models: - # - `glm4v_moe_text` - # + main text model - # + tensor names start with "model.language_model." - # + "2D-RoPE" (aKa Roformer) w/ embeddings dynamically adapted via bicubic interpolation - # - `glm4v_moe` - # + vision adapter (ViT) - # + tensor names start with "model.visual." - # + "3D-RoPE" (without the interpolation mentioned above) - # - # other notable quirks include: - # - has MTP layer (need to keep these tensors - same as GLM-4.5-Air) - # - RoPE theta value (θ): use 10k rather than 100k for GLM-4.5-Air - # - the model's vision supports video input, but this is not implemented here - # - # for more info, refer to: - # - reference impl : https://github.com/huggingface/transformers/tree/main/src/transformers/models/glm4v_moe - # - HF model card : https://huggingface.co/zai-org/GLM-4.5V - # - arXiv paper (model) : https://arxiv.org/abs/2507.01006 - # - arXiv paper (orig. ViT) : https://arxiv.org/abs/2411.14402 - # - # TODO: the model's tokenizer has video-related special tokens - deal with these (??) + # TODO: this is not complete yet! need to handle custom RoPE nonsense. # - pass + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V) + self.gguf_writer.add_vision_use_gelu(True) + if (ln_eps := self.find_vparam(["layer_norm_eps"], optional=True)) is not None: + self.gguf_writer.add_vision_attention_layernorm_eps(ln_eps) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.startswith("model.visual."): + yield self.map_tensor_name(name), data_torch + else: + return ###### CONVERSION LOGIC ###### diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 0afc58331b565..c9708253163c8 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -428,6 +428,7 @@ class VISION_PROJECTOR_TYPE(IntEnum): GLM_EDGE = auto() MERGER = auto() GEMMA3 = auto() + GLM4V = auto() class MODEL_TENSOR(IntEnum): @@ -3055,6 +3056,7 @@ class VisionProjectorType: VOXTRAL = "voxtral" LFM2 = "lfm2" KIMIVL = "kimivl" + GLM4V = "glm4v_moe" # Items here are (block size, type size) From 14cee9c9d748ae384f3ba00dc078ba66b648a649 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 15 Oct 2025 18:03:11 -0500 Subject: [PATCH 07/23] better class names --- convert_hf_to_gguf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ee80dd9a568f5..664050f60c13a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -9221,7 +9221,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @ModelBase.register("Glm4vMoeForConditionalGeneration") -class GLM4V_Text_MoE(Glm4MoeModel): +class GLM4VMoEModel(Glm4MoeModel): """Text model from [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V) ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)""" @@ -9249,7 +9249,7 @@ def modify_tensors( @ModelBase.register("Glm4vMoeForConditionalGeneration") -class GLM4V_MoE(MmprojModel): +class GLM4VMoEVisionModel(MmprojModel): """Multimodal projector from [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V). 
ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)""" From e0b6064d90a48f9a3fa9b359c310f2bc032f1cff Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 15 Oct 2025 19:32:15 -0500 Subject: [PATCH 08/23] add `clip.vision.rope.*` to GGUF constants need `clip.vision.rope.freq_base` for GLM-4.5V --- gguf-py/gguf/constants.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index c9708253163c8..935a005930fc3 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -276,6 +276,21 @@ class ClipVision: USE_SILU = "clip.use_silu" N_WA_PATTERN = "clip.vision.n_wa_pattern" # used by qwen2.5vl + class Rope: + DIMENSION_COUNT = "clip.vision.rope.dimension_count" + DIMENSION_SECTIONS = "clip.vision.rope.dimension_sections" + FREQ_BASE = "clip.vision.rope.freq_base" + SCALING_TYPE = "clip.vision.rope.scaling.type" + SCALING_FACTOR = "clip.vision.rope.scaling.factor" + SCALING_ATTN_FACTOR = "clip.vision.rope.scaling.attn_factor" + SCALING_ORIG_CTX_LEN = "clip.vision.rope.scaling.original_context_length" + SCALING_FINETUNED = "clip.vision.rope.scaling.finetuned" + SCALING_YARN_LOG_MUL = "clip.vision.rope.scaling.yarn_log_multiplier" + SCALING_YARN_EXT_FACTOR = "clip.vision.rope.scaling.yarn_ext_factor" + SCALING_YARN_ATTN_FACTOR = "clip.vision.rope.scaling.yarn_attn_factor" + SCALING_YARN_BETA_FAST = "clip.vision.rope.scaling.yarn_beta_fast" + SCALING_YARN_BETA_SLOW = "clip.vision.rope.scaling.yarn_beta_slow" + class Attention: HEAD_COUNT = "clip.vision.attention.head_count" LAYERNORM_EPS = "clip.vision.attention.layer_norm_epsilon" From 7bdc330708df45b6ec8b28d51ddb38236593155f Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 15 Oct 2025 19:32:39 -0500 Subject: [PATCH 09/23] add `add_vision_rope_freq_base` for GGUF metadata --- gguf-py/gguf/gguf_writer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 306679e21834b..5076b44866715 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1038,6 +1038,9 @@ def add_vision_head_count(self, value: int) -> None: def add_vision_attention_layernorm_eps(self, value: float) -> None: self.add_float32(Keys.ClipVision.Attention.LAYERNORM_EPS, value) + def add_vision_rope_freq_base(self, value: float) -> None: + self.add_float32(Keys.ClipVision.Rope.FREQ_BASE, value) + def add_vision_image_size(self, value: int) -> None: self.add_uint32(Keys.ClipVision.IMAGE_SIZE, value) From ed7c271047edf9b2dc98df8e747f9a35d745a688 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 15 Oct 2025 19:33:17 -0500 Subject: [PATCH 10/23] set `clip.vision.rope.freq_base` during conversion --- convert_hf_to_gguf.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 664050f60c13a..b4e309c353030 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -9254,16 +9254,27 @@ class GLM4VMoEVisionModel(MmprojModel): ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)""" # - # TODO: this is not complete yet! need to handle custom RoPE nonsense. + # TODO: this is not complete yet! 
# def set_gguf_parameters(self): super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V) + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V) self.gguf_writer.add_vision_use_gelu(True) + if (ln_eps := self.find_vparam(["layer_norm_eps"], optional=True)) is not None: self.gguf_writer.add_vision_attention_layernorm_eps(ln_eps) + # the ViT in GLM-4.5V applies its own RoPE inside its attention blocks + if (rope_theta := self.find_vparam(["rope_theta"], optional=True)) is not None: + self.gguf_writer.add_vision_rope_freq_base(rope_theta) + logger.info(f"gguf: vision rope theta = {rope_theta}") + else: + logger.warning('gguf: -------------------------------------------------------------') + logger.warning('gguf: missing vision rope theta! the conversion might be incorrect!') + logger.warning('gguf: -------------------------------------------------------------') + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused if name.startswith("model.visual."): yield self.map_tensor_name(name), data_torch else: From 99e8e6a7f699839fd45a6ca16a91dbb9b04ac631 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 4 Nov 2025 18:42:39 -0600 Subject: [PATCH 11/23] use the same ViT for GLM-4.1V and GLM-4.5V multimodal projector is identical between the models --- convert_hf_to_gguf.py | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a7e3d4b7edf58..81876f7380e2e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -9771,6 +9771,31 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [] # skip other tensors +@ModelBase.register("Glm4vForConditionalGeneration") +class GLM4VModel(Glm4Model): + """Text model from [zai-org/GLM-4.1V-9B-Thinking](https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking) + + ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)""" + model_arch = gguf.MODEL_ARCH.GLM4_MOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + # skip vision tensors for the text model + if name.startswith("model.visual."): + return [] + + # the Glm4Model class expects tensor names to start with 'model.', + # so we strip the we strip the 'language_model.' part + if name.startswith("model.language_model."): + name = name.replace("model.language_model.", "model.", 1) + + # let the Glm4Model class handle the tensor mapping + yield from super().modify_tensors(data_torch, name, bid) + @ModelBase.register("Glm4vMoeForConditionalGeneration") class GLM4VMoEModel(Glm4MoeModel): @@ -9796,13 +9821,15 @@ def modify_tensors( if name.startswith("model.language_model."): name = name.replace("model.language_model.", "model.", 1) - # let the parent class handle the MoE logic and tensor mapping + # let the Glm4MoeModel class handle the MoE logic and tensor mapping yield from super().modify_tensors(data_torch, name, bid) -@ModelBase.register("Glm4vMoeForConditionalGeneration") -class GLM4VMoEVisionModel(MmprojModel): - """Multimodal projector from [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V). 
+@ModelBase.register("Glm4vMoeForConditionalGeneration", "Glm4vForConditionalGeneration") +class GLM4VisionModel(MmprojModel): + """Multimodal projector from: + - [zai-org/GLM-4.1V-9B-Thinking](https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking) + - [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V) ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)""" # @@ -9816,7 +9843,7 @@ def set_gguf_parameters(self): if (ln_eps := self.find_vparam(["layer_norm_eps"], optional=True)) is not None: self.gguf_writer.add_vision_attention_layernorm_eps(ln_eps) - # the ViT in GLM-4.5V applies its own RoPE inside its attention blocks + # the ViT applies its own RoPE inside its attention blocks if (rope_theta := self.find_vparam(["rope_theta"], optional=True)) is not None: self.gguf_writer.add_vision_rope_freq_base(rope_theta) logger.info(f"gguf: vision rope theta = {rope_theta}") From 94e89836e2903d89cd347849ba3c91fd050b3596 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 4 Nov 2025 19:35:46 -0600 Subject: [PATCH 12/23] separate architectures for GLM4V and GLM4V_MOE --- convert_hf_to_gguf.py | 4 ++-- gguf-py/gguf/constants.py | 3 ++- src/llama-arch.cpp | 1 + src/llama-arch.h | 1 + src/llama-graph.cpp | 18 ++++++++++++++---- src/llama-model.cpp | 14 ++++++++++++++ 6 files changed, 34 insertions(+), 7 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 81876f7380e2e..ce892e3c21283 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -9776,7 +9776,7 @@ class GLM4VModel(Glm4Model): """Text model from [zai-org/GLM-4.1V-9B-Thinking](https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking) ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)""" - model_arch = gguf.MODEL_ARCH.GLM4_MOE + model_arch = gguf.MODEL_ARCH.GLM4V def set_gguf_parameters(self): super().set_gguf_parameters() @@ -9802,7 +9802,7 @@ class GLM4VMoEModel(Glm4MoeModel): """Text model from [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V) ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)""" - model_arch = gguf.MODEL_ARCH.GLM4_MOE + model_arch = gguf.MODEL_ARCH.GLM4V_MOE def set_gguf_parameters(self): # parameters specific to GLM-4.5V like rope_theta=10000 and context_length=65536 diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 48bac8632fa37..93f7484b0ec04 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -406,6 +406,7 @@ class MODEL_ARCH(IntEnum): CHATGLM = auto() GLM4 = auto() GLM4_MOE = auto() + GLM4V = auto() GLM4V_MOE = auto() BITNET = auto() T5 = auto() @@ -3226,7 +3227,7 @@ class VisionProjectorType: VOXTRAL = "voxtral" LFM2 = "lfm2" KIMIVL = "kimivl" - GLM4V = "glm4v_moe" + GLM4V = "glm4v" LIGHTONOCR = "lightonocr" COGVLM = "cogvlm" JANUS_PRO = "janus_pro" diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 6b32c53bbd303..bd91144683ab0 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -68,6 +68,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_CHATGLM, "chatglm" }, { LLM_ARCH_GLM4, "glm4" }, { LLM_ARCH_GLM4_MOE, "glm4moe" }, + { LLM_ARCH_GLM4V, "glm4v" }, { LLM_ARCH_GLM4V_MOE, "glm4v_moe" }, { LLM_ARCH_BITNET, "bitnet" }, { LLM_ARCH_T5, "t5" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index f8d87a82f9654..1e7f2bf7cefe3 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -72,6 +72,7 @@ enum llm_arch { LLM_ARCH_CHATGLM, LLM_ARCH_GLM4, LLM_ARCH_GLM4_MOE, + LLM_ARCH_GLM4V, LLM_ARCH_GLM4V_MOE, LLM_ARCH_BITNET, LLM_ARCH_T5, diff --git a/src/llama-graph.cpp 
b/src/llama-graph.cpp index 0156069547e13..baa8ea052c4c6 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -820,8 +820,13 @@ ggml_tensor * llm_graph_context::build_ffn( if (down) { cur = build_lora_mm(down, cur); - if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_GLM4V_MOE) { - // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators + if ( + arch == LLM_ARCH_GLM4 || + arch == LLM_ARCH_GLM4_MOE || + arch == LLM_ARCH_GLM4V || + arch == LLM_ARCH_GLM4V + ) { + // GLM4 models seem to have numerical issues with half-precision accumulators ggml_mul_mat_set_prec(cur, GGML_PREC_F32); } } @@ -1618,8 +1623,13 @@ ggml_tensor * llm_graph_context::build_attn( if (wo) { cur = build_lora_mm(wo, cur); - if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_GLM4V_MOE) { - // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators + if ( + arch == LLM_ARCH_GLM4 || + arch == LLM_ARCH_GLM4_MOE || + arch == LLM_ARCH_GLM4V || + arch == LLM_ARCH_GLM4V + ) { + // GLM4 models seem to have numerical issues with half-precision accumulators ggml_mul_mat_set_prec(cur, GGML_PREC_F32); } } diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 2954acb14d0f3..f0b52e5682583 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1665,6 +1665,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_GLM4V: + { + // TODO + } break; case LLM_ARCH_GLM4V_MOE: { // TODO @@ -5011,6 +5015,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } break; + case LLM_ARCH_GLM4V: + { + // TODO + } + break; case LLM_ARCH_GLM4V_MOE: { // TODO @@ -7107,6 +7116,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_GLM4V: + { + llm = std::make_unique(*this, params); + } break; case LLM_ARCH_GLM4V_MOE: { llm = std::make_unique(*this, params); @@ -7495,6 +7508,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { return LLAMA_ROPE_TYPE_NEOX; case LLM_ARCH_QWEN2VL: + case LLM_ARCH_GLM4V: case LLM_ARCH_GLM4V_MOE: return LLAMA_ROPE_TYPE_MROPE; case LLM_ARCH_QWEN3VL: From 484d18cc661c415728580b40e9298b98b64d6ee6 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 4 Nov 2025 23:21:32 -0600 Subject: [PATCH 13/23] fix typo --- src/llama-graph.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index baa8ea052c4c6..798579844c8a6 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -824,7 +824,7 @@ ggml_tensor * llm_graph_context::build_ffn( arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_GLM4V || - arch == LLM_ARCH_GLM4V + arch == LLM_ARCH_GLM4V_MOE ) { // GLM4 models seem to have numerical issues with half-precision accumulators ggml_mul_mat_set_prec(cur, GGML_PREC_F32); @@ -1627,7 +1627,7 @@ ggml_tensor * llm_graph_context::build_attn( arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_GLM4V || - arch == LLM_ARCH_GLM4V + arch == LLM_ARCH_GLM4V_MOE ) { // GLM4 models seem to have numerical issues with half-precision accumulators ggml_mul_mat_set_prec(cur, GGML_PREC_F32); From c84a4314365450fadb61e50580cea2620a5ee15d Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 4 Nov 2025 23:22:01 -0600 Subject: [PATCH 14/23] add GLM4V arch tensor map --- src/llama-arch.cpp | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 
deletion(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index bd91144683ab0..8720b7e4dd272 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -1507,7 +1507,7 @@ static const std::map> LLM_TENSOR_N LLM_ARCH_GLM4, { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, // does this really exist? { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, { LLM_TENSOR_OUTPUT, "output" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, @@ -1556,6 +1556,24 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" }, }, }, + { + LLM_ARCH_GLM4V, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, + { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, + }, + }, { LLM_ARCH_GLM4V_MOE, { From 7267e8a0984e1bb43f197b9ed823185aaefd06e9 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 4 Nov 2025 23:29:00 -0600 Subject: [PATCH 15/23] fix typo --- src/llama-model.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index f0b52e5682583..ac3a849918f3e 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -7118,11 +7118,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { } break; case LLM_ARCH_GLM4V: { - llm = std::make_unique(*this, params); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_GLM4V_MOE: { - llm = std::make_unique(*this, params); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_BITNET: { From ac54c7165269846a4ce5c36a41776569840b00f2 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 4 Nov 2025 23:32:34 -0600 Subject: [PATCH 16/23] add `glm4v` and `glm4v_moe` to src/models --- src/models/glm4v.cpp | 5 +++++ src/models/glm4v_moe.cpp | 5 +++++ src/models/models.h | 8 ++++++++ 3 files changed, 18 insertions(+) create mode 100644 src/models/glm4v.cpp create mode 100644 src/models/glm4v_moe.cpp diff --git a/src/models/glm4v.cpp b/src/models/glm4v.cpp new file mode 100644 index 0000000000000..b058f275e2599 --- /dev/null +++ b/src/models/glm4v.cpp @@ -0,0 +1,5 @@ +#include "models.h" + +llm_build_glm4v::llm_build_glm4v(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + // TODO +} diff --git a/src/models/glm4v_moe.cpp b/src/models/glm4v_moe.cpp new file mode 100644 index 0000000000000..ab63839a23aab --- /dev/null +++ b/src/models/glm4v_moe.cpp @@ -0,0 +1,5 @@ +#include "models.h" + +llm_build_glm4v_moe::llm_build_glm4v_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + // TODO +} diff --git a/src/models/models.h b/src/models/models.h index af203343a4d71..ecaa560000ef3 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -216,6 +216,14 @@ struct llm_build_glm4_moe : public llm_graph_context { llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params); }; +struct llm_build_glm4v : public llm_graph_context { + llm_build_glm4v(const llama_model & model, const 
llm_graph_params & params); +}; + +struct llm_build_glm4v_moe : public llm_graph_context { + llm_build_glm4v_moe(const llama_model & model, const llm_graph_params & params); +}; + struct llm_build_gpt2 : public llm_graph_context { llm_build_gpt2(const llama_model & model, const llm_graph_params & params); }; From c17d4b97cccd366f572aced7e801e40f3918e1f5 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 5 Nov 2025 16:02:37 -0600 Subject: [PATCH 17/23] revert old RoPE GGUF changes --- gguf-py/gguf/constants.py | 35 +++++++++++++++++++---------------- gguf-py/gguf/gguf_writer.py | 3 --- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 496cdabfb9a44..d70a74db39f5f 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -280,21 +280,6 @@ class ClipVision: N_WA_PATTERN = "clip.vision.n_wa_pattern" # used by qwen2.5vl IS_DEEPSTACK_LAYERS = "clip.vision.is_deepstack_layers" - class Rope: - DIMENSION_COUNT = "clip.vision.rope.dimension_count" - DIMENSION_SECTIONS = "clip.vision.rope.dimension_sections" - FREQ_BASE = "clip.vision.rope.freq_base" - SCALING_TYPE = "clip.vision.rope.scaling.type" - SCALING_FACTOR = "clip.vision.rope.scaling.factor" - SCALING_ATTN_FACTOR = "clip.vision.rope.scaling.attn_factor" - SCALING_ORIG_CTX_LEN = "clip.vision.rope.scaling.original_context_length" - SCALING_FINETUNED = "clip.vision.rope.scaling.finetuned" - SCALING_YARN_LOG_MUL = "clip.vision.rope.scaling.yarn_log_multiplier" - SCALING_YARN_EXT_FACTOR = "clip.vision.rope.scaling.yarn_ext_factor" - SCALING_YARN_ATTN_FACTOR = "clip.vision.rope.scaling.yarn_attn_factor" - SCALING_YARN_BETA_FAST = "clip.vision.rope.scaling.yarn_beta_fast" - SCALING_YARN_BETA_SLOW = "clip.vision.rope.scaling.yarn_beta_slow" - class Attention: HEAD_COUNT = "clip.vision.attention.head_count" LAYERNORM_EPS = "clip.vision.attention.layer_norm_epsilon" @@ -775,7 +760,8 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.DEEPSEEK2: "deepseek2", MODEL_ARCH.CHATGLM: "chatglm", MODEL_ARCH.GLM4: "glm4", - MODEL_ARCH.GLM4_MOE: "glm4moe", + MODEL_ARCH.GLM4_MOE: "glm4_moe", + MODEL_ARCH.GLM4V: "glm4v", MODEL_ARCH.GLM4V_MOE: "glm4v_moe", MODEL_ARCH.BITNET: "bitnet", MODEL_ARCH.T5: "t5", @@ -824,6 +810,7 @@ class MODEL_TENSOR(IntEnum): VISION_PROJECTOR_TYPE.GLM_EDGE: "adapter", VISION_PROJECTOR_TYPE.MERGER: "qwen2vl_merger", VISION_PROJECTOR_TYPE.GEMMA3: "gemma3", + VISION_PROJECTOR_TYPE.GLM4V: "glm4v", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -2384,6 +2371,22 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD, MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM, ], + MODEL_ARCH.GLM4V : [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.FFN_POST_NORM, + ], MODEL_ARCH.GLM4V_MOE: [ # same as GLM4_MOE without MTP tensors MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 6d0ec74560ac0..a051daeeb1341 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1047,9 +1047,6 @@ def add_vision_head_count(self, value: int) -> None: def add_vision_attention_layernorm_eps(self, value: float) -> None: self.add_float32(Keys.ClipVision.Attention.LAYERNORM_EPS, value) - def 
add_vision_rope_freq_base(self, value: float) -> None: - self.add_float32(Keys.ClipVision.Rope.FREQ_BASE, value) - def add_vision_image_size(self, value: int) -> None: self.add_uint32(Keys.ClipVision.IMAGE_SIZE, value) From 8a6ad0c55a7ae41db18c96a9b52f51c5ecc7e52c Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 5 Nov 2025 16:04:04 -0600 Subject: [PATCH 18/23] begin adding GLM4V projector --- tools/mtmd/clip-impl.h | 46 ++++++++++++++++++++++-------------------- tools/mtmd/clip.cpp | 4 ++++ 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 722b1a4948d6f..f61e1e0ee1f78 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -156,32 +156,34 @@ enum projector_type { PROJECTOR_TYPE_LIGHTONOCR, PROJECTOR_TYPE_COGVLM, PROJECTOR_TYPE_JANUS_PRO, + PROJECTOR_TYPE_GLM4V, PROJECTOR_TYPE_UNKNOWN, }; static std::map PROJECTOR_TYPE_NAMES = { - { PROJECTOR_TYPE_MLP, "mlp" }, - { PROJECTOR_TYPE_LDP, "ldp" }, - { PROJECTOR_TYPE_LDPV2, "ldpv2"}, - { PROJECTOR_TYPE_MINICPMV, "resampler"}, - { PROJECTOR_TYPE_GLM_EDGE, "adapter"}, - { PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"}, - { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"}, - { PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"}, - { PROJECTOR_TYPE_GEMMA3, "gemma3"}, - { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, - { PROJECTOR_TYPE_PIXTRAL, "pixtral"}, - { PROJECTOR_TYPE_ULTRAVOX, "ultravox"}, - { PROJECTOR_TYPE_INTERNVL, "internvl"}, - { PROJECTOR_TYPE_LLAMA4, "llama4"}, - { PROJECTOR_TYPE_QWEN2A, "qwen2a"}, - { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"}, - { PROJECTOR_TYPE_VOXTRAL, "voxtral"}, - { PROJECTOR_TYPE_LFM2, "lfm2"}, - { PROJECTOR_TYPE_KIMIVL, "kimivl"}, - { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"}, - { PROJECTOR_TYPE_COGVLM, "cogvlm"}, - { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"}, + { PROJECTOR_TYPE_MLP, "mlp" }, + { PROJECTOR_TYPE_LDP, "ldp" }, + { PROJECTOR_TYPE_LDPV2, "ldpv2" }, + { PROJECTOR_TYPE_MINICPMV, "resampler" }, + { PROJECTOR_TYPE_GLM_EDGE, "adapter" }, + { PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger" }, + { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger" }, + { PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger" }, + { PROJECTOR_TYPE_GEMMA3, "gemma3" }, + { PROJECTOR_TYPE_IDEFICS3, "idefics3" }, + { PROJECTOR_TYPE_PIXTRAL, "pixtral" }, + { PROJECTOR_TYPE_ULTRAVOX, "ultravox" }, + { PROJECTOR_TYPE_INTERNVL, "internvl" }, + { PROJECTOR_TYPE_LLAMA4, "llama4" }, + { PROJECTOR_TYPE_QWEN2A, "qwen2a" }, + { PROJECTOR_TYPE_QWEN25O, "qwen2.5o" }, + { PROJECTOR_TYPE_VOXTRAL, "voxtral" }, + { PROJECTOR_TYPE_LFM2, "lfm2" }, + { PROJECTOR_TYPE_KIMIVL, "kimivl" }, + { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr" }, + { PROJECTOR_TYPE_COGVLM, "cogvlm" }, + { PROJECTOR_TYPE_JANUS_PRO, "janus_pro" }, + { PROJECTOR_TYPE_GLM4V, "glm4v" }, }; static projector_type clip_projector_type_from_string(const std::string & str) { diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 16781fb19523f..06983fd494d52 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1082,6 +1082,10 @@ struct clip_graph { return gf; } + ggml_cgraph * build_glm4v() { + /* TODO */ + } + ggml_cgraph * build_minicpmv() { const int batch_size = 1; From f39b231f2b48c3562b1f5ce6f555752f641eda49 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 5 Nov 2025 16:09:46 -0600 Subject: [PATCH 19/23] copy LLM graph code from text models (WIP) still need to figure out what exactly needs to be changed... 
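one concrete suspect (untested, just a sketch): both new text archs are wired up as
LLAMA_ROPE_TYPE_MROPE, so the plain `ggml_rope_ext` calls copied below probably need to
become multi-section rope calls like the ones in the Qwen2-VL graph - assuming the m-RoPE
sections get loaded into `hparams.rope_sections` the same way, something along these lines:

    // sketch only, not wired up yet - assumes hparams.rope_sections is populated for GLM4V
    int sections[4];
    std::copy(std::begin(hparams.rope_sections), std::end(hparams.rope_sections), sections);

    Qcur = ggml_rope_multi(
            ctx0, Qcur, inp_pos, nullptr,
            n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
            ext_factor, attn_factor, beta_fast, beta_slow);

    Kcur = ggml_rope_multi(
            ctx0, Kcur, inp_pos, nullptr,
            n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
            ext_factor, attn_factor, beta_fast, beta_slow);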
--- src/models/glm4v.cpp | 125 +++++++++++++++++++++++++++++++- src/models/glm4v_moe.cpp | 153 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 276 insertions(+), 2 deletions(-) diff --git a/src/models/glm4v.cpp b/src/models/glm4v.cpp index b058f275e2599..0ec311f3f0739 100644 --- a/src/models/glm4v.cpp +++ b/src/models/glm4v.cpp @@ -1,5 +1,128 @@ #include "models.h" llm_build_glm4v::llm_build_glm4v(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - // TODO + // + // TODO -- currently this is just copied from `llm_build_glm4` -- still WIP + // + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_kv(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + // Pre-attention norm + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + ggml_tensor * Qcur = nullptr; + ggml_tensor * Kcur = nullptr; + ggml_tensor * Vcur = nullptr; + + if (model.layers[il].wqkv == nullptr) { + Qcur = build_lora_mm(model.layers[il].wq, cur); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + } + Kcur = build_lora_mm(model.layers[il].wk, cur); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + } + Vcur = build_lora_mm(model.layers[il].wv, cur); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + } + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + } else { + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + if (model.layers[il].bqkv) { + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + } + Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], + 0 * sizeof(float) * (n_embd)); + Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), + cur->nb[1], 1 * sizeof(float) * (n_embd)); + Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), + cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)); + } + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); + } + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + // Post-attention norm (new!) 
+ cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "post_attn_norm", il); + + // Add the input (residual connection after post-attention norm) + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + // Pre-MLP norm + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + // MLP + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, LLM_FFN_SWIGLU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + + // Post-MLP norm + cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "post_mlp_norm", il); + } + // Add residual connection after post-MLP norm + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); + } + // Final norm + cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // Output projection + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); } diff --git a/src/models/glm4v_moe.cpp b/src/models/glm4v_moe.cpp index ab63839a23aab..09bf8abbb2db9 100644 --- a/src/models/glm4v_moe.cpp +++ b/src/models/glm4v_moe.cpp @@ -1,5 +1,156 @@ #include "models.h" llm_build_glm4v_moe::llm_build_glm4v_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - // TODO + // + // TODO -- currently this is just copied from `llm_build_glm4_moe` -- still WIP + // + const int64_t n_embd_head = hparams.n_embd_head_v; + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_kv(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + // Only process up to last layer (skip final NextN layer) + // Final layer tensors are loaded but not processed in forward pass + const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; + for (int il = 0; il < n_transformer_layers; ++il) { + ggml_tensor * inpSA = inpL; + + // Pre-attention norm + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + } + cb(Qcur, "Qcur", il); + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + } + cb(Kcur, "Kcur", il); + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + } + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + // Apply Q/K norm if available (GLM-4.5 355B variant) + if (model.layers[il].attn_q_norm) { + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + cb(Qcur, "Qcur_normed", il); + } + if (model.layers[il].attn_k_norm) { + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + cb(Kcur, "Kcur_normed", il); + } + Qcur = 
ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + } + if (il == n_transformer_layers - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // Post-attention norm + cur = build_norm(ffn_inp, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "post_attn_norm", il); + + // Check if this is a dense layer (n_layer_dense_lead=1, so layer 0 is dense) + if (static_cast(il) < hparams.n_layer_dense_lead) { + // Dense FFN layer + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } else { + // Process routed experts using existing MoE infrastructure + ggml_tensor * routed_out = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + model.layers[il].ffn_exp_probs_b, + n_expert, n_expert_used, + LLM_FFN_SILU, hparams.expert_weights_norm, + true, hparams.expert_weights_scale, + (llama_expert_gating_func_type) hparams.expert_gating_func, + il); + cb(routed_out, "ffn_moe_out", il); + + // Process shared expert on original input + ggml_tensor * shared_out = build_ffn(cur, + model.layers[il].ffn_up_shexp, NULL, NULL, + model.layers[il].ffn_gate_shexp, NULL, NULL, + model.layers[il].ffn_down_shexp, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(shared_out, "ffn_shexp_out", il); + + // Final output: routed_output + shared_output + cur = ggml_add(ctx0, routed_out, shared_out); + cb(cur, "ffn_out", il); + } + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + cur = inpL; + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); } From b60c16a9a30d0b94eb6d233832d9028467f10657 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 5 Nov 2025 17:15:56 -0600 Subject: [PATCH 20/23] consistent arch naming --- src/llama-arch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index eb1cd07fb7dfc..8346af56eb569 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -67,7 +67,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_DEEPSEEK2, "deepseek2" }, { LLM_ARCH_CHATGLM, "chatglm" }, { LLM_ARCH_GLM4, "glm4" }, - { LLM_ARCH_GLM4_MOE, "glm4moe" }, + { LLM_ARCH_GLM4_MOE, "glm4_moe" }, { LLM_ARCH_GLM4V, "glm4v" }, { LLM_ARCH_GLM4V_MOE, "glm4v_moe" }, { LLM_ARCH_BITNET, "bitnet" }, From 6443ecb8848dd2e3c39313e07fc0ba51955922a5 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 6 Nov 2025 15:01:59 -0600 Subject: [PATCH 21/23] WIP conversion logic --- 
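notes: to sanity-check the conversion path, the converter is expected to be run twice on the
same HF checkpoint - once for the text model and once for the projector - roughly like this
(paths and output names are placeholders, assuming the usual convert_hf_to_gguf.py flags):

    # text model (GLM4V / GLM4V_MOE arch)
    python convert_hf_to_gguf.py /path/to/GLM-4.5V --outtype f16 --outfile GLM-4.5V-F16.gguf

    # multimodal projector
    python convert_hf_to_gguf.py /path/to/GLM-4.5V --mmproj --outfile mmproj-GLM-4.5V-F16.gguf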
convert_hf_to_gguf.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index dd35f7d93d3d6..a4971c3825cd9 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -9869,24 +9869,17 @@ class GLM4VisionModel(MmprojModel): ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)""" # - # TODO: this is not complete yet! + # TODO: conversion logic is still WIP! # def set_gguf_parameters(self): super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V) - self.gguf_writer.add_vision_use_gelu(True) - - if (ln_eps := self.find_vparam(["layer_norm_eps"], optional=True)) is not None: - self.gguf_writer.add_vision_attention_layernorm_eps(ln_eps) + assert self.hparams_vision is not None + vparams = self.hparams_vision + ln_eps = vparams.get("layer_norm_eps", 1e-5) - # the ViT applies its own RoPE inside its attention blocks - if (rope_theta := self.find_vparam(["rope_theta"], optional=True)) is not None: - self.gguf_writer.add_vision_rope_freq_base(rope_theta) - logger.info(f"gguf: vision rope theta = {rope_theta}") - else: - logger.warning('gguf: -------------------------------------------------------------') - logger.warning('gguf: missing vision rope theta! the conversion might be incorrect!') - logger.warning('gguf: -------------------------------------------------------------') + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V) + self.gguf_writer.add_vision_attention_layernorm_eps(ln_eps) + self.gguf_writer.add_vision_use_silu(True) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused From eb2c8b89b839e5919b71d59fab702533e9fd1eeb Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sat, 8 Nov 2025 15:02:43 -0600 Subject: [PATCH 22/23] include `glm4v-moe.cpp` and `glm4v.cpp` in CMake also renamed `glm4v_moe.cpp` to `glm4v-moe.cpp` to match other model files --- src/CMakeLists.txt | 2 ++ src/models/{glm4v_moe.cpp => glm4v-moe.cpp} | 0 2 files changed, 2 insertions(+) rename src/models/{glm4v_moe.cpp => glm4v-moe.cpp} (100%) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 630b2cddf67e8..0988f3114819d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -70,6 +70,8 @@ add_library(llama models/gemma3n-iswa.cpp models/glm4-moe.cpp models/glm4.cpp + models/glm4v-moe.cpp + models/glm4v.cpp models/gpt2.cpp models/gptneox.cpp models/granite-hybrid.cpp diff --git a/src/models/glm4v_moe.cpp b/src/models/glm4v-moe.cpp similarity index 100% rename from src/models/glm4v_moe.cpp rename to src/models/glm4v-moe.cpp From b37d3265a82e3f21028f5c977c928e5845c907dd Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sat, 8 Nov 2025 19:13:55 -0600 Subject: [PATCH 23/23] mtmd : WIP build_glm4v cgraph --- tools/mtmd/clip.cpp | 143 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 139 insertions(+), 4 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 86f5877ef369e..c0b7bc5f0c36c 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -281,7 +281,7 @@ struct clip_model { // embeddings ggml_tensor * class_embedding = nullptr; ggml_tensor * patch_embeddings_0 = nullptr; - ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL) + ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temporal dimension (Qwen2VL, GLM4V) ggml_tensor * 
patch_bias = nullptr; ggml_tensor * position_embeddings = nullptr; @@ -400,6 +400,22 @@ struct clip_model { ggml_tensor * mm_boi = nullptr; ggml_tensor * mm_eoi = nullptr; + // GLM4V projection + ggml_tensor * mm_post_conv_ln_w = nullptr; + ggml_tensor * mm_post_conv_ln_b = nullptr; + ggml_tensor * mm_downsample_w = nullptr; + ggml_tensor * mm_downsample_b = nullptr; + ggml_tensor * mm_merger_proj_w = nullptr; + ggml_tensor * mm_merger_proj_b = nullptr; + ggml_tensor * mm_merger_norm_w = nullptr; + ggml_tensor * mm_merger_norm_b = nullptr; + ggml_tensor * mm_merger_gate_w = nullptr; + ggml_tensor * mm_merger_gate_b = nullptr; + ggml_tensor * mm_merger_up_w = nullptr; + ggml_tensor * mm_merger_up_b = nullptr; + ggml_tensor * mm_merger_down_w = nullptr; + ggml_tensor * mm_merger_down_b = nullptr; + bool audio_has_avgpool() const { return proj_type == PROJECTOR_TYPE_QWEN2A || proj_type == PROJECTOR_TYPE_VOXTRAL; @@ -1083,7 +1099,122 @@ struct clip_graph { } ggml_cgraph * build_glm4v() { - /* TODO */ + GGML_ASSERT(model.patch_embeddings_0 != nullptr); + GGML_ASSERT(model.patch_embeddings_1 != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + GGML_ASSERT(model.class_embedding == nullptr); + + // 2D RoPE input positions + ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + + ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * inp; + + // patch embedding + // - this is similar to Qwen2VL's handling of Conv3d for video/image inputs + // - for single images, the input is duplicated along the temporal axis + // + // ref: `class Glm4vVisionPatchEmbed(Qwen2_5_VisionPatchEmbed):` + + ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + if (model.patch_embeddings_1) { + auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_add(ctx0, inp, inp_1); + } + + const int batch_size = 1; + inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] + inp = ggml_cont_4d(ctx0, inp, n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + inp = ggml_reshape_4d(ctx0, inp, n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); + inp = ggml_cont_3d(ctx0, inp, n_embd, n_patches_x * n_patches_y, batch_size); + cb(inp, "patch_embed", -1); + + // post-convolution layernorm + // + // ref: `self.post_conv_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)` + inp = build_norm(inp, model.mm_post_conv_ln_w, model.mm_post_conv_ln_b, NORM_TYPE_RMS, eps, -1); + cb(inp, "post_conv_ln", -1); + + // absolute position embeddings (interpolated) + // + // ref: self.embeddings + ggml_tensor * learned_pos_embd = resize_position_embeddings(); + inp = ggml_add(ctx0, inp, learned_pos_embd); + cb(inp, "abs_pos_embed", -1); + + // RoPE to be applied inside ViT blocks + // + // ref: self.rotary_pos_emb + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false); + }; + + // ViT blocks + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_RMS, + FFN_SILU, // hidden_act is "silu" + nullptr, // absolute embeddings already added + add_pos); + + // post-ViT layernorm + cur = build_norm(cur, model.post_ln_w, 
model.post_ln_b, NORM_TYPE_RMS, eps, -1); + cb(cur, "post_vit_ln", -1); + + // reshape and permute to prepare for conv2d + const int merge_size = model.hparams.n_merge; // WIP: is this the correct value to use? + cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y); + cur = ggml_permute(ctx0, cur, 1, 2, 0, 3); // -> [C, W, H, B] -> [W, H, C, B] for ggml + cb(cur, "pre_downsample_permute", -1); + + // downsampling conv2d + cur = ggml_conv_2d(ctx0, model.mm_downsample_w, cur, merge_size, merge_size, 0, 0, 1, 1); + cb(cur, "downsample_conv", -1); + + // reshape to [tokens, features] + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]); + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + cb(cur, "post_downsample_reshape", -1); + + // patch merger FFN + // + // ref: `class Glm4vVisionPatchMerger(nn.Module):` + { + // input projection + cur = ggml_mul_mat(ctx0, model.mm_merger_proj_w, cur); + + // apply norm + GELU + cur = build_norm(cur, model.mm_merger_norm_w, model.mm_merger_norm_b, NORM_TYPE_NORMAL, 1e-5f, -1); + cur = ggml_gelu(ctx0, cur); + ggml_tensor * ffn_input = cur; + cb(cur, "merger_ffn_inp", -1); + + // gate projection + ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_merger_gate_w, ffn_input); + cb(cur, "merger_gate", -1); + + // up projection + ggml_tensor * up = ggml_mul_mat(ctx0, model.mm_merger_up_w, ffn_input); + cb(cur, "merger_up", -1); + + // activation + down projection + cur = ggml_silu(ctx0, gate); + cur = ggml_mul(ctx0, cur, up); + cur = ggml_mul_mat(ctx0, model.mm_merger_down_w, cur); + cb(cur, "merger_ffn_out", -1); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; } ggml_cgraph * build_minicpmv() { @@ -2520,13 +2651,17 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { res = graph.build_kimivl(); } break; + case PROJECTOR_TYPE_COGVLM: + { + res = graph.build_cogvlm(); + } break; case PROJECTOR_TYPE_JANUS_PRO: { res = graph.build_siglip(); } break; - case PROJECTOR_TYPE_COGVLM: + case PROJECTOR_TYPE_GLM4V: { - res = graph.build_cogvlm(); + res = graph.build_glm4v(); } break; default: {