From 8448b23afdef5e9940ef8eac32bb62702ea0defa Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 14 Oct 2025 02:13:16 -0500 Subject: [PATCH 01/23] initial commit for branch glm45v --- convert_hf_to_gguf.py | 29 +++++++++++++++++++++++++++++ src/llama-arch.h | 2 +- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 8c5132193e0e0..36278866da005 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -9219,6 +9219,35 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [] # skip other tensors +@ModelBase.register("Glm4vMoeForConditionalGeneration") +class GLM4V_MoE(MmprojModel): + # + # the HF model's type is `glm4v_moe`. internally, it consists of two models: + # - `glm4v_moe_text` + # + main text model + # + tensor names start with "model.language_model." + # + "2D-RoPE" (aKa Roformer) w/ embeddings dynamically adapted via bicubic interpolation + # - `glm4v_moe` + # + vision adapter (ViT) + # + tensor names start with "model.visual." + # + "3D-RoPE" (without the interpolation mentioned above) + # + # other notable quirks include: + # - has MTP layer (need to keep these tensors - same as GLM-4.5-Air) + # - RoPE theta value (θ): use 10k rather than 100k for GLM-4.5-Air + # - the model's vision supports video input, but this is not implemented here + # + # for more info, refer to: + # - reference impl : https://github.com/huggingface/transformers/tree/main/src/transformers/models/glm4v_moe + # - HF model card : https://huggingface.co/zai-org/GLM-4.5V + # - arXiv paper (model) : https://arxiv.org/abs/2507.01006 + # - arXiv paper (orig. ViT) : https://arxiv.org/abs/2411.14402 + # + # TODO: the model's tokenizer has video-related special tokens - deal with these (??) 
+ # + pass + + ###### CONVERSION LOGIC ###### diff --git a/src/llama-arch.h b/src/llama-arch.h index c41de89859d5c..831ec378ef332 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -70,6 +70,7 @@ enum llm_arch { LLM_ARCH_CHATGLM, LLM_ARCH_GLM4, LLM_ARCH_GLM4_MOE, + LLM_ARCH_GLM4V_MOE, LLM_ARCH_BITNET, LLM_ARCH_T5, LLM_ARCH_T5ENCODER, @@ -123,7 +124,6 @@ enum llm_kv { LLM_KV_GENERAL_LICENSE, LLM_KV_GENERAL_SOURCE_URL, LLM_KV_GENERAL_SOURCE_HF_REPO, - LLM_KV_VOCAB_SIZE, LLM_KV_CONTEXT_LENGTH, LLM_KV_EMBEDDING_LENGTH, From 70c86861a4a0a4620f5591062240df0bb802aded Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 14 Oct 2025 16:47:12 -0500 Subject: [PATCH 02/23] use F32 accumulators for GLM4V_MOE --- src/llama-graph.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index f29a1e98c9103..ffc2187a1b107 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -817,7 +817,7 @@ ggml_tensor * llm_graph_context::build_ffn( if (down) { cur = build_lora_mm(down, cur); - if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) { + if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_GLM4V_MOE) { // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators ggml_mul_mat_set_prec(cur, GGML_PREC_F32); } @@ -1583,7 +1583,7 @@ ggml_tensor * llm_graph_context::build_attn( if (wo) { cur = build_lora_mm(wo, cur); - if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE) { + if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_GLM4V_MOE) { // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators ggml_mul_mat_set_prec(cur, GGML_PREC_F32); } From 631d4fa8693b7617d8c50e8824b54f9f3580ad5e Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 14 Oct 2025 16:48:21 -0500 Subject: [PATCH 03/23] add arch --- gguf-py/gguf/constants.py | 34 ++++++++++++++++++++++++++++++---- src/llama-arch.cpp | 28 ++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 4 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index f5e5fba8008bd..0afc58331b565 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -385,6 +385,7 @@ class MODEL_ARCH(IntEnum): CHATGLM = auto() GLM4 = auto() GLM4_MOE = auto() + GLM4V_MOE = auto() BITNET = auto() T5 = auto() T5ENCODER = auto() @@ -656,10 +657,10 @@ class MODEL_TENSOR(IntEnum): A_MM_NORM_PRE = auto() A_MM_NORM_MID = auto() # nextn/mtp - NEXTN_EH_PROJ = auto() - NEXTN_EMBED_TOKENS = auto() - NEXTN_ENORM = auto() - NEXTN_HNORM = auto() + NEXTN_EH_PROJ = auto() + NEXTN_EMBED_TOKENS = auto() + NEXTN_ENORM = auto() + NEXTN_HNORM = auto() NEXTN_SHARED_HEAD_HEAD = auto() NEXTN_SHARED_HEAD_NORM = auto() @@ -729,6 +730,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.CHATGLM: "chatglm", MODEL_ARCH.GLM4: "glm4", MODEL_ARCH.GLM4_MOE: "glm4moe", + MODEL_ARCH.GLM4V_MOE: "glm4v_moe", MODEL_ARCH.BITNET: "bitnet", MODEL_ARCH.T5: "t5", MODEL_ARCH.T5ENCODER: "t5encoder", @@ -2273,6 +2275,30 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD, MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM, ], + MODEL_ARCH.GLM4V_MOE: [ # same as GLM4_MOE without MTP tensors + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, 
+ MODEL_TENSOR.FFN_GATE_INP, + MODEL_TENSOR.FFN_GATE_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + MODEL_TENSOR.FFN_UP_EXP, + MODEL_TENSOR.FFN_GATE_SHEXP, + MODEL_TENSOR.FFN_DOWN_SHEXP, + MODEL_TENSOR.FFN_UP_SHEXP, + MODEL_TENSOR.FFN_EXP_PROBS_B, + ], MODEL_ARCH.BITNET: [ MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index b7e00b275b6f7..f2a8cbdf99a2e 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -66,6 +66,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_CHATGLM, "chatglm" }, { LLM_ARCH_GLM4, "glm4" }, { LLM_ARCH_GLM4_MOE, "glm4moe" }, + { LLM_ARCH_GLM4V_MOE, "glm4v_moe" }, { LLM_ARCH_BITNET, "bitnet" }, { LLM_ARCH_T5, "t5" }, { LLM_ARCH_T5ENCODER, "t5encoder" }, @@ -1507,6 +1508,33 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" }, }, }, + { + LLM_ARCH_GLMV4_MOE, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" }, + { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" }, + { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" }, + { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" }, + { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" }, + { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" }, + { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" }, + { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" }, + }, + }, { LLM_ARCH_BITNET, { From 2aa698558b0bfa58f8c759b81b34868cb8947086 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 14 Oct 2025 17:10:55 -0500 Subject: [PATCH 04/23] llama-model : add placeholders --- src/llama-model.cpp | 20 ++++++++++++++++++++ src/llama-model.h | 2 +- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 5002bd42ff04e..dc0ab0bf6a6f7 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1611,6 +1611,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_GLM4V_MOE: + { + // TODO + } break; case LLM_ARCH_BITNET: { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -4892,6 +4896,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } break; + case LLM_ARCH_GLM4V_MOE: + { + // TODO + } + break; case LLM_ARCH_NEMOTRON: { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); @@ -14683,6 +14692,12 @@ struct llm_build_glm4_moe : public llm_graph_context { } }; +struct llm_build_glm4v_moe : public llm_graph_context { + llm_build_glm4v_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + // TODO + } +}; + struct llm_build_nemotron : public llm_graph_context { llm_build_nemotron(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -19750,6 +19765,10 @@ ggml_cgraph * 
llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_GLM4V_MOE: + { + llm = std::make_unique(*this, params); + } break; case LLM_ARCH_BITNET: { llm = std::make_unique(*this, params); @@ -20119,6 +20138,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { return LLAMA_ROPE_TYPE_NEOX; case LLM_ARCH_QWEN2VL: + case LLM_ARCH_GLM4V_MOE: return LLAMA_ROPE_TYPE_MROPE; // all model arches should be listed explicitly here diff --git a/src/llama-model.h b/src/llama-model.h index 7f48662f2807a..2c9b05fbc790f 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -110,7 +110,7 @@ enum llm_type { LLM_TYPE_8B_A1B, // lfm2moe LLM_TYPE_21B_A3B, // Ernie MoE small LLM_TYPE_30B_A3B, - LLM_TYPE_106B_A12B, // GLM-4.5-Air + LLM_TYPE_106B_A12B, // GLM-4.5-Air (and GLM-4.5V text model) LLM_TYPE_235B_A22B, LLM_TYPE_300B_A47B, // Ernie MoE big LLM_TYPE_355B_A32B, // GLM-4.5 From d0e9dce27d92a4be7e901ed9cec92484dd1f78a0 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 14 Oct 2025 17:13:29 -0500 Subject: [PATCH 05/23] fix arch name for tensor names --- src/llama-arch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index f2a8cbdf99a2e..6964c75ac6268 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -1509,7 +1509,7 @@ static const std::map> LLM_TENSOR_N }, }, { - LLM_ARCH_GLMV4_MOE, + LLM_ARCH_GLM4V_MOE, { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, From 01d085dd4ac152ded1db173551fb322010e4d056 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 15 Oct 2025 17:44:50 -0500 Subject: [PATCH 06/23] WIP conversion logic --- convert_hf_to_gguf.py | 68 ++++++++++++++++++++++++++------------- gguf-py/gguf/constants.py | 2 ++ 2 files changed, 47 insertions(+), 23 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 36278866da005..ee80dd9a568f5 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -9219,33 +9219,55 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [] # skip other tensors + +@ModelBase.register("Glm4vMoeForConditionalGeneration") +class GLM4V_Text_MoE(Glm4MoeModel): + """Text model from [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V) + + ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)""" + model_arch = gguf.MODEL_ARCH.GLM4_MOE + + def set_gguf_parameters(self): + # parameters specific to GLM-4.5V like rope_theta=10000 and context_length=65536 + # should be correctly picked up from the text_config by the base classes + super().set_gguf_parameters() + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + # skip vision tensors for the text model + if name.startswith("model.visual."): + return [] + + # the Glm4MoeModel class expects tensor names to start with 'model.', + # so we strip the we strip the 'language_model.' part + if name.startswith("model.language_model."): + name = name.replace("model.language_model.", "model.", 1) + + # let the parent class handle the MoE logic and tensor mapping + yield from super().modify_tensors(data_torch, name, bid) + + @ModelBase.register("Glm4vMoeForConditionalGeneration") class GLM4V_MoE(MmprojModel): + """Multimodal projector from [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V). + + ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)""" # - # the HF model's type is `glm4v_moe`. 
internally, it consists of two models: - # - `glm4v_moe_text` - # + main text model - # + tensor names start with "model.language_model." - # + "2D-RoPE" (aKa Roformer) w/ embeddings dynamically adapted via bicubic interpolation - # - `glm4v_moe` - # + vision adapter (ViT) - # + tensor names start with "model.visual." - # + "3D-RoPE" (without the interpolation mentioned above) - # - # other notable quirks include: - # - has MTP layer (need to keep these tensors - same as GLM-4.5-Air) - # - RoPE theta value (θ): use 10k rather than 100k for GLM-4.5-Air - # - the model's vision supports video input, but this is not implemented here - # - # for more info, refer to: - # - reference impl : https://github.com/huggingface/transformers/tree/main/src/transformers/models/glm4v_moe - # - HF model card : https://huggingface.co/zai-org/GLM-4.5V - # - arXiv paper (model) : https://arxiv.org/abs/2507.01006 - # - arXiv paper (orig. ViT) : https://arxiv.org/abs/2411.14402 - # - # TODO: the model's tokenizer has video-related special tokens - deal with these (??) + # TODO: this is not complete yet! need to handle custom RoPE nonsense. # - pass + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V) + self.gguf_writer.add_vision_use_gelu(True) + if (ln_eps := self.find_vparam(["layer_norm_eps"], optional=True)) is not None: + self.gguf_writer.add_vision_attention_layernorm_eps(ln_eps) + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if name.startswith("model.visual."): + yield self.map_tensor_name(name), data_torch + else: + return ###### CONVERSION LOGIC ###### diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 0afc58331b565..c9708253163c8 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -428,6 +428,7 @@ class VISION_PROJECTOR_TYPE(IntEnum): GLM_EDGE = auto() MERGER = auto() GEMMA3 = auto() + GLM4V = auto() class MODEL_TENSOR(IntEnum): @@ -3055,6 +3056,7 @@ class VisionProjectorType: VOXTRAL = "voxtral" LFM2 = "lfm2" KIMIVL = "kimivl" + GLM4V = "glm4v_moe" # Items here are (block size, type size) From 14cee9c9d748ae384f3ba00dc078ba66b648a649 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 15 Oct 2025 18:03:11 -0500 Subject: [PATCH 07/23] better class names --- convert_hf_to_gguf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ee80dd9a568f5..664050f60c13a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -9221,7 +9221,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @ModelBase.register("Glm4vMoeForConditionalGeneration") -class GLM4V_Text_MoE(Glm4MoeModel): +class GLM4VMoEModel(Glm4MoeModel): """Text model from [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V) ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)""" @@ -9249,7 +9249,7 @@ def modify_tensors( @ModelBase.register("Glm4vMoeForConditionalGeneration") -class GLM4V_MoE(MmprojModel): +class GLM4VMoEVisionModel(MmprojModel): """Multimodal projector from [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V). 
ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)""" From e0b6064d90a48f9a3fa9b359c310f2bc032f1cff Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 15 Oct 2025 19:32:15 -0500 Subject: [PATCH 08/23] add `clip.vision.rope.*` to GGUF constants need `clip.vision.rope.freq_base` for GLM-4.5V --- gguf-py/gguf/constants.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index c9708253163c8..935a005930fc3 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -276,6 +276,21 @@ class ClipVision: USE_SILU = "clip.use_silu" N_WA_PATTERN = "clip.vision.n_wa_pattern" # used by qwen2.5vl + class Rope: + DIMENSION_COUNT = "clip.vision.rope.dimension_count" + DIMENSION_SECTIONS = "clip.vision.rope.dimension_sections" + FREQ_BASE = "clip.vision.rope.freq_base" + SCALING_TYPE = "clip.vision.rope.scaling.type" + SCALING_FACTOR = "clip.vision.rope.scaling.factor" + SCALING_ATTN_FACTOR = "clip.vision.rope.scaling.attn_factor" + SCALING_ORIG_CTX_LEN = "clip.vision.rope.scaling.original_context_length" + SCALING_FINETUNED = "clip.vision.rope.scaling.finetuned" + SCALING_YARN_LOG_MUL = "clip.vision.rope.scaling.yarn_log_multiplier" + SCALING_YARN_EXT_FACTOR = "clip.vision.rope.scaling.yarn_ext_factor" + SCALING_YARN_ATTN_FACTOR = "clip.vision.rope.scaling.yarn_attn_factor" + SCALING_YARN_BETA_FAST = "clip.vision.rope.scaling.yarn_beta_fast" + SCALING_YARN_BETA_SLOW = "clip.vision.rope.scaling.yarn_beta_slow" + class Attention: HEAD_COUNT = "clip.vision.attention.head_count" LAYERNORM_EPS = "clip.vision.attention.layer_norm_epsilon" From 7bdc330708df45b6ec8b28d51ddb38236593155f Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 15 Oct 2025 19:32:39 -0500 Subject: [PATCH 09/23] add `add_vision_rope_freq_base` for GGUF metadata --- gguf-py/gguf/gguf_writer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 306679e21834b..5076b44866715 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1038,6 +1038,9 @@ def add_vision_head_count(self, value: int) -> None: def add_vision_attention_layernorm_eps(self, value: float) -> None: self.add_float32(Keys.ClipVision.Attention.LAYERNORM_EPS, value) + def add_vision_rope_freq_base(self, value: float) -> None: + self.add_float32(Keys.ClipVision.Rope.FREQ_BASE, value) + def add_vision_image_size(self, value: int) -> None: self.add_uint32(Keys.ClipVision.IMAGE_SIZE, value) From ed7c271047edf9b2dc98df8e747f9a35d745a688 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 15 Oct 2025 19:33:17 -0500 Subject: [PATCH 10/23] set `clip.vision.rope.freq_base` during conversion --- convert_hf_to_gguf.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 664050f60c13a..b4e309c353030 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -9254,16 +9254,27 @@ class GLM4VMoEVisionModel(MmprojModel): ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)""" # - # TODO: this is not complete yet! need to handle custom RoPE nonsense. + # TODO: this is not complete yet! 
# def set_gguf_parameters(self): super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V) + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V) self.gguf_writer.add_vision_use_gelu(True) + if (ln_eps := self.find_vparam(["layer_norm_eps"], optional=True)) is not None: self.gguf_writer.add_vision_attention_layernorm_eps(ln_eps) + # the ViT in GLM-4.5V applies its own RoPE inside its attention blocks + if (rope_theta := self.find_vparam(["rope_theta"], optional=True)) is not None: + self.gguf_writer.add_vision_rope_freq_base(rope_theta) + logger.info(f"gguf: vision rope theta = {rope_theta}") + else: + logger.warning('gguf: -------------------------------------------------------------') + logger.warning('gguf: missing vision rope theta! the conversion might be incorrect!') + logger.warning('gguf: -------------------------------------------------------------') + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + del bid # unused if name.startswith("model.visual."): yield self.map_tensor_name(name), data_torch else: From 99e8e6a7f699839fd45a6ca16a91dbb9b04ac631 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 4 Nov 2025 18:42:39 -0600 Subject: [PATCH 11/23] use the same ViT for GLM-4.1V and GLM-4.5V multimodal projector is identical between the models --- convert_hf_to_gguf.py | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index a7e3d4b7edf58..81876f7380e2e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -9771,6 +9771,31 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter return [] # skip other tensors +@ModelBase.register("Glm4vForConditionalGeneration") +class GLM4VModel(Glm4Model): + """Text model from [zai-org/GLM-4.1V-9B-Thinking](https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking) + + ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)""" + model_arch = gguf.MODEL_ARCH.GLM4_MOE + + def set_gguf_parameters(self): + super().set_gguf_parameters() + + def modify_tensors( + self, data_torch: Tensor, name: str, bid: int | None + ) -> Iterable[tuple[str, Tensor]]: + # skip vision tensors for the text model + if name.startswith("model.visual."): + return [] + + # the Glm4Model class expects tensor names to start with 'model.', + # so we strip the we strip the 'language_model.' part + if name.startswith("model.language_model."): + name = name.replace("model.language_model.", "model.", 1) + + # let the Glm4Model class handle the tensor mapping + yield from super().modify_tensors(data_torch, name, bid) + @ModelBase.register("Glm4vMoeForConditionalGeneration") class GLM4VMoEModel(Glm4MoeModel): @@ -9796,13 +9821,15 @@ def modify_tensors( if name.startswith("model.language_model."): name = name.replace("model.language_model.", "model.", 1) - # let the parent class handle the MoE logic and tensor mapping + # let the Glm4MoeModel class handle the MoE logic and tensor mapping yield from super().modify_tensors(data_torch, name, bid) -@ModelBase.register("Glm4vMoeForConditionalGeneration") -class GLM4VMoEVisionModel(MmprojModel): - """Multimodal projector from [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V). 
+@ModelBase.register("Glm4vMoeForConditionalGeneration", "Glm4vForConditionalGeneration") +class GLM4VisionModel(MmprojModel): + """Multimodal projector from: + - [zai-org/GLM-4.1V-9B-Thinking](https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking) + - [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V) ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)""" # @@ -9816,7 +9843,7 @@ def set_gguf_parameters(self): if (ln_eps := self.find_vparam(["layer_norm_eps"], optional=True)) is not None: self.gguf_writer.add_vision_attention_layernorm_eps(ln_eps) - # the ViT in GLM-4.5V applies its own RoPE inside its attention blocks + # the ViT applies its own RoPE inside its attention blocks if (rope_theta := self.find_vparam(["rope_theta"], optional=True)) is not None: self.gguf_writer.add_vision_rope_freq_base(rope_theta) logger.info(f"gguf: vision rope theta = {rope_theta}") From 94e89836e2903d89cd347849ba3c91fd050b3596 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 4 Nov 2025 19:35:46 -0600 Subject: [PATCH 12/23] separate architectures for GLM4V and GLM4V_MOE --- convert_hf_to_gguf.py | 4 ++-- gguf-py/gguf/constants.py | 3 ++- src/llama-arch.cpp | 1 + src/llama-arch.h | 1 + src/llama-graph.cpp | 18 ++++++++++++++---- src/llama-model.cpp | 14 ++++++++++++++ 6 files changed, 34 insertions(+), 7 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 81876f7380e2e..ce892e3c21283 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -9776,7 +9776,7 @@ class GLM4VModel(Glm4Model): """Text model from [zai-org/GLM-4.1V-9B-Thinking](https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking) ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)""" - model_arch = gguf.MODEL_ARCH.GLM4_MOE + model_arch = gguf.MODEL_ARCH.GLM4V def set_gguf_parameters(self): super().set_gguf_parameters() @@ -9802,7 +9802,7 @@ class GLM4VMoEModel(Glm4MoeModel): """Text model from [zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V) ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)""" - model_arch = gguf.MODEL_ARCH.GLM4_MOE + model_arch = gguf.MODEL_ARCH.GLM4V_MOE def set_gguf_parameters(self): # parameters specific to GLM-4.5V like rope_theta=10000 and context_length=65536 diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 48bac8632fa37..93f7484b0ec04 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -406,6 +406,7 @@ class MODEL_ARCH(IntEnum): CHATGLM = auto() GLM4 = auto() GLM4_MOE = auto() + GLM4V = auto() GLM4V_MOE = auto() BITNET = auto() T5 = auto() @@ -3226,7 +3227,7 @@ class VisionProjectorType: VOXTRAL = "voxtral" LFM2 = "lfm2" KIMIVL = "kimivl" - GLM4V = "glm4v_moe" + GLM4V = "glm4v" LIGHTONOCR = "lightonocr" COGVLM = "cogvlm" JANUS_PRO = "janus_pro" diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 6b32c53bbd303..bd91144683ab0 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -68,6 +68,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_CHATGLM, "chatglm" }, { LLM_ARCH_GLM4, "glm4" }, { LLM_ARCH_GLM4_MOE, "glm4moe" }, + { LLM_ARCH_GLM4V, "glm4v" }, { LLM_ARCH_GLM4V_MOE, "glm4v_moe" }, { LLM_ARCH_BITNET, "bitnet" }, { LLM_ARCH_T5, "t5" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index f8d87a82f9654..1e7f2bf7cefe3 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -72,6 +72,7 @@ enum llm_arch { LLM_ARCH_CHATGLM, LLM_ARCH_GLM4, LLM_ARCH_GLM4_MOE, + LLM_ARCH_GLM4V, LLM_ARCH_GLM4V_MOE, LLM_ARCH_BITNET, LLM_ARCH_T5, diff --git a/src/llama-graph.cpp 
b/src/llama-graph.cpp index 0156069547e13..baa8ea052c4c6 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -820,8 +820,13 @@ ggml_tensor * llm_graph_context::build_ffn( if (down) { cur = build_lora_mm(down, cur); - if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_GLM4V_MOE) { - // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators + if ( + arch == LLM_ARCH_GLM4 || + arch == LLM_ARCH_GLM4_MOE || + arch == LLM_ARCH_GLM4V || + arch == LLM_ARCH_GLM4V + ) { + // GLM4 models seem to have numerical issues with half-precision accumulators ggml_mul_mat_set_prec(cur, GGML_PREC_F32); } } @@ -1618,8 +1623,13 @@ ggml_tensor * llm_graph_context::build_attn( if (wo) { cur = build_lora_mm(wo, cur); - if (arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_GLM4V_MOE) { - // GLM4 and GLM4_MOE seem to have numerical issues with half-precision accumulators + if ( + arch == LLM_ARCH_GLM4 || + arch == LLM_ARCH_GLM4_MOE || + arch == LLM_ARCH_GLM4V || + arch == LLM_ARCH_GLM4V + ) { + // GLM4 models seem to have numerical issues with half-precision accumulators ggml_mul_mat_set_prec(cur, GGML_PREC_F32); } } diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 2954acb14d0f3..f0b52e5682583 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1665,6 +1665,10 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_GLM4V: + { + // TODO + } break; case LLM_ARCH_GLM4V_MOE: { // TODO @@ -5011,6 +5015,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) { } } break; + case LLM_ARCH_GLM4V: + { + // TODO + } + break; case LLM_ARCH_GLM4V_MOE: { // TODO @@ -7107,6 +7116,10 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { { llm = std::make_unique(*this, params); } break; + case LLM_ARCH_GLM4V: + { + llm = std::make_unique(*this, params); + } break; case LLM_ARCH_GLM4V_MOE: { llm = std::make_unique(*this, params); @@ -7495,6 +7508,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { return LLAMA_ROPE_TYPE_NEOX; case LLM_ARCH_QWEN2VL: + case LLM_ARCH_GLM4V: case LLM_ARCH_GLM4V_MOE: return LLAMA_ROPE_TYPE_MROPE; case LLM_ARCH_QWEN3VL: From 484d18cc661c415728580b40e9298b98b64d6ee6 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 4 Nov 2025 23:21:32 -0600 Subject: [PATCH 13/23] fix typo --- src/llama-graph.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index baa8ea052c4c6..798579844c8a6 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -824,7 +824,7 @@ ggml_tensor * llm_graph_context::build_ffn( arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_GLM4V || - arch == LLM_ARCH_GLM4V + arch == LLM_ARCH_GLM4V_MOE ) { // GLM4 models seem to have numerical issues with half-precision accumulators ggml_mul_mat_set_prec(cur, GGML_PREC_F32); @@ -1627,7 +1627,7 @@ ggml_tensor * llm_graph_context::build_attn( arch == LLM_ARCH_GLM4 || arch == LLM_ARCH_GLM4_MOE || arch == LLM_ARCH_GLM4V || - arch == LLM_ARCH_GLM4V + arch == LLM_ARCH_GLM4V_MOE ) { // GLM4 models seem to have numerical issues with half-precision accumulators ggml_mul_mat_set_prec(cur, GGML_PREC_F32); From c84a4314365450fadb61e50580cea2620a5ee15d Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 4 Nov 2025 23:22:01 -0600 Subject: [PATCH 14/23] add GLM4V arch tensor map --- src/llama-arch.cpp | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 
deletion(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index bd91144683ab0..8720b7e4dd272 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -1507,7 +1507,7 @@ static const std::map> LLM_TENSOR_N LLM_ARCH_GLM4, { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, - { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, // does this really exist? { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, { LLM_TENSOR_OUTPUT, "output" }, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, @@ -1556,6 +1556,24 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_NEXTN_SHARED_HEAD_NORM, "blk.%d.nextn.shared_head_norm" }, }, }, + { + LLM_ARCH_GLM4V, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" }, + { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" }, + }, + }, { LLM_ARCH_GLM4V_MOE, { From 7267e8a0984e1bb43f197b9ed823185aaefd06e9 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 4 Nov 2025 23:29:00 -0600 Subject: [PATCH 15/23] fix typo --- src/llama-model.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index f0b52e5682583..ac3a849918f3e 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -7118,11 +7118,11 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const { } break; case LLM_ARCH_GLM4V: { - llm = std::make_unique(*this, params); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_GLM4V_MOE: { - llm = std::make_unique(*this, params); + llm = std::make_unique(*this, params); } break; case LLM_ARCH_BITNET: { From ac54c7165269846a4ce5c36a41776569840b00f2 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Tue, 4 Nov 2025 23:32:34 -0600 Subject: [PATCH 16/23] add `glm4v` and `glm4v_moe` to src/models --- src/models/glm4v.cpp | 5 +++++ src/models/glm4v_moe.cpp | 5 +++++ src/models/models.h | 8 ++++++++ 3 files changed, 18 insertions(+) create mode 100644 src/models/glm4v.cpp create mode 100644 src/models/glm4v_moe.cpp diff --git a/src/models/glm4v.cpp b/src/models/glm4v.cpp new file mode 100644 index 0000000000000..b058f275e2599 --- /dev/null +++ b/src/models/glm4v.cpp @@ -0,0 +1,5 @@ +#include "models.h" + +llm_build_glm4v::llm_build_glm4v(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + // TODO +} diff --git a/src/models/glm4v_moe.cpp b/src/models/glm4v_moe.cpp new file mode 100644 index 0000000000000..ab63839a23aab --- /dev/null +++ b/src/models/glm4v_moe.cpp @@ -0,0 +1,5 @@ +#include "models.h" + +llm_build_glm4v_moe::llm_build_glm4v_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { + // TODO +} diff --git a/src/models/models.h b/src/models/models.h index af203343a4d71..ecaa560000ef3 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -216,6 +216,14 @@ struct llm_build_glm4_moe : public llm_graph_context { llm_build_glm4_moe(const llama_model & model, const llm_graph_params & params); }; +struct llm_build_glm4v : public llm_graph_context { + llm_build_glm4v(const llama_model & model, const 
llm_graph_params & params); +}; + +struct llm_build_glm4v_moe : public llm_graph_context { + llm_build_glm4v_moe(const llama_model & model, const llm_graph_params & params); +}; + struct llm_build_gpt2 : public llm_graph_context { llm_build_gpt2(const llama_model & model, const llm_graph_params & params); }; From c17d4b97cccd366f572aced7e801e40f3918e1f5 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 5 Nov 2025 16:02:37 -0600 Subject: [PATCH 17/23] revert old RoPE GGUF changes --- gguf-py/gguf/constants.py | 35 +++++++++++++++++++---------------- gguf-py/gguf/gguf_writer.py | 3 --- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 496cdabfb9a44..d70a74db39f5f 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -280,21 +280,6 @@ class ClipVision: N_WA_PATTERN = "clip.vision.n_wa_pattern" # used by qwen2.5vl IS_DEEPSTACK_LAYERS = "clip.vision.is_deepstack_layers" - class Rope: - DIMENSION_COUNT = "clip.vision.rope.dimension_count" - DIMENSION_SECTIONS = "clip.vision.rope.dimension_sections" - FREQ_BASE = "clip.vision.rope.freq_base" - SCALING_TYPE = "clip.vision.rope.scaling.type" - SCALING_FACTOR = "clip.vision.rope.scaling.factor" - SCALING_ATTN_FACTOR = "clip.vision.rope.scaling.attn_factor" - SCALING_ORIG_CTX_LEN = "clip.vision.rope.scaling.original_context_length" - SCALING_FINETUNED = "clip.vision.rope.scaling.finetuned" - SCALING_YARN_LOG_MUL = "clip.vision.rope.scaling.yarn_log_multiplier" - SCALING_YARN_EXT_FACTOR = "clip.vision.rope.scaling.yarn_ext_factor" - SCALING_YARN_ATTN_FACTOR = "clip.vision.rope.scaling.yarn_attn_factor" - SCALING_YARN_BETA_FAST = "clip.vision.rope.scaling.yarn_beta_fast" - SCALING_YARN_BETA_SLOW = "clip.vision.rope.scaling.yarn_beta_slow" - class Attention: HEAD_COUNT = "clip.vision.attention.head_count" LAYERNORM_EPS = "clip.vision.attention.layer_norm_epsilon" @@ -775,7 +760,8 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.DEEPSEEK2: "deepseek2", MODEL_ARCH.CHATGLM: "chatglm", MODEL_ARCH.GLM4: "glm4", - MODEL_ARCH.GLM4_MOE: "glm4moe", + MODEL_ARCH.GLM4_MOE: "glm4_moe", + MODEL_ARCH.GLM4V: "glm4v", MODEL_ARCH.GLM4V_MOE: "glm4v_moe", MODEL_ARCH.BITNET: "bitnet", MODEL_ARCH.T5: "t5", @@ -824,6 +810,7 @@ class MODEL_TENSOR(IntEnum): VISION_PROJECTOR_TYPE.GLM_EDGE: "adapter", VISION_PROJECTOR_TYPE.MERGER: "qwen2vl_merger", VISION_PROJECTOR_TYPE.GEMMA3: "gemma3", + VISION_PROJECTOR_TYPE.GLM4V: "glm4v", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -2384,6 +2371,22 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.NEXTN_SHARED_HEAD_HEAD, MODEL_TENSOR.NEXTN_SHARED_HEAD_NORM, ], + MODEL_ARCH.GLM4V : [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_QKV, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.ATTN_POST_NORM, + MODEL_TENSOR.FFN_POST_NORM, + ], MODEL_ARCH.GLM4V_MOE: [ # same as GLM4_MOE without MTP tensors MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.OUTPUT_NORM, diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 6d0ec74560ac0..a051daeeb1341 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1047,9 +1047,6 @@ def add_vision_head_count(self, value: int) -> None: def add_vision_attention_layernorm_eps(self, value: float) -> None: self.add_float32(Keys.ClipVision.Attention.LAYERNORM_EPS, value) - def 
add_vision_rope_freq_base(self, value: float) -> None: - self.add_float32(Keys.ClipVision.Rope.FREQ_BASE, value) - def add_vision_image_size(self, value: int) -> None: self.add_uint32(Keys.ClipVision.IMAGE_SIZE, value) From 8a6ad0c55a7ae41db18c96a9b52f51c5ecc7e52c Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 5 Nov 2025 16:04:04 -0600 Subject: [PATCH 18/23] begin adding GLM4V projector --- tools/mtmd/clip-impl.h | 46 ++++++++++++++++++++++-------------------- tools/mtmd/clip.cpp | 4 ++++ 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 722b1a4948d6f..f61e1e0ee1f78 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -156,32 +156,34 @@ enum projector_type { PROJECTOR_TYPE_LIGHTONOCR, PROJECTOR_TYPE_COGVLM, PROJECTOR_TYPE_JANUS_PRO, + PROJECTOR_TYPE_GLM4V, PROJECTOR_TYPE_UNKNOWN, }; static std::map PROJECTOR_TYPE_NAMES = { - { PROJECTOR_TYPE_MLP, "mlp" }, - { PROJECTOR_TYPE_LDP, "ldp" }, - { PROJECTOR_TYPE_LDPV2, "ldpv2"}, - { PROJECTOR_TYPE_MINICPMV, "resampler"}, - { PROJECTOR_TYPE_GLM_EDGE, "adapter"}, - { PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger"}, - { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"}, - { PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"}, - { PROJECTOR_TYPE_GEMMA3, "gemma3"}, - { PROJECTOR_TYPE_IDEFICS3, "idefics3"}, - { PROJECTOR_TYPE_PIXTRAL, "pixtral"}, - { PROJECTOR_TYPE_ULTRAVOX, "ultravox"}, - { PROJECTOR_TYPE_INTERNVL, "internvl"}, - { PROJECTOR_TYPE_LLAMA4, "llama4"}, - { PROJECTOR_TYPE_QWEN2A, "qwen2a"}, - { PROJECTOR_TYPE_QWEN25O, "qwen2.5o"}, - { PROJECTOR_TYPE_VOXTRAL, "voxtral"}, - { PROJECTOR_TYPE_LFM2, "lfm2"}, - { PROJECTOR_TYPE_KIMIVL, "kimivl"}, - { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"}, - { PROJECTOR_TYPE_COGVLM, "cogvlm"}, - { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"}, + { PROJECTOR_TYPE_MLP, "mlp" }, + { PROJECTOR_TYPE_LDP, "ldp" }, + { PROJECTOR_TYPE_LDPV2, "ldpv2" }, + { PROJECTOR_TYPE_MINICPMV, "resampler" }, + { PROJECTOR_TYPE_GLM_EDGE, "adapter" }, + { PROJECTOR_TYPE_QWEN2VL, "qwen2vl_merger" }, + { PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger" }, + { PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger" }, + { PROJECTOR_TYPE_GEMMA3, "gemma3" }, + { PROJECTOR_TYPE_IDEFICS3, "idefics3" }, + { PROJECTOR_TYPE_PIXTRAL, "pixtral" }, + { PROJECTOR_TYPE_ULTRAVOX, "ultravox" }, + { PROJECTOR_TYPE_INTERNVL, "internvl" }, + { PROJECTOR_TYPE_LLAMA4, "llama4" }, + { PROJECTOR_TYPE_QWEN2A, "qwen2a" }, + { PROJECTOR_TYPE_QWEN25O, "qwen2.5o" }, + { PROJECTOR_TYPE_VOXTRAL, "voxtral" }, + { PROJECTOR_TYPE_LFM2, "lfm2" }, + { PROJECTOR_TYPE_KIMIVL, "kimivl" }, + { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr" }, + { PROJECTOR_TYPE_COGVLM, "cogvlm" }, + { PROJECTOR_TYPE_JANUS_PRO, "janus_pro" }, + { PROJECTOR_TYPE_GLM4V, "glm4v" }, }; static projector_type clip_projector_type_from_string(const std::string & str) { diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 16781fb19523f..06983fd494d52 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -1082,6 +1082,10 @@ struct clip_graph { return gf; } + ggml_cgraph * build_glm4v() { + /* TODO */ + } + ggml_cgraph * build_minicpmv() { const int batch_size = 1; From f39b231f2b48c3562b1f5ce6f555752f641eda49 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 5 Nov 2025 16:09:46 -0600 Subject: [PATCH 19/23] copy LLM graph code from text models (WIP) still need to figure out what exactly needs to be changed... 
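one concrete suspect (untested, just a sketch): both new text archs are wired up as
LLAMA_ROPE_TYPE_MROPE, so the plain `ggml_rope_ext` calls copied below probably need to
become multi-section rope calls like the ones in the Qwen2-VL graph - assuming the m-RoPE
sections get loaded into `hparams.rope_sections` the same way, something along these lines:

    // sketch only, not wired up yet - assumes hparams.rope_sections is populated for GLM4V
    int sections[4];
    std::copy(std::begin(hparams.rope_sections), std::end(hparams.rope_sections), sections);

    Qcur = ggml_rope_multi(
            ctx0, Qcur, inp_pos, nullptr,
            n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
            ext_factor, attn_factor, beta_fast, beta_slow);

    Kcur = ggml_rope_multi(
            ctx0, Kcur, inp_pos, nullptr,
            n_rot, sections, rope_type, n_ctx_orig, freq_base, freq_scale,
            ext_factor, attn_factor, beta_fast, beta_slow);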
--- src/models/glm4v.cpp | 125 +++++++++++++++++++++++++++++++- src/models/glm4v_moe.cpp | 153 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 276 insertions(+), 2 deletions(-) diff --git a/src/models/glm4v.cpp b/src/models/glm4v.cpp index b058f275e2599..0ec311f3f0739 100644 --- a/src/models/glm4v.cpp +++ b/src/models/glm4v.cpp @@ -1,5 +1,128 @@ #include "models.h" llm_build_glm4v::llm_build_glm4v(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - // TODO + // + // TODO -- currently this is just copied from `llm_build_glm4` -- still WIP + // + const int64_t n_embd_head = hparams.n_embd_head_v; + const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_kv(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * inpSA = inpL; + + // Pre-attention norm + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + ggml_tensor * Qcur = nullptr; + ggml_tensor * Kcur = nullptr; + ggml_tensor * Vcur = nullptr; + + if (model.layers[il].wqkv == nullptr) { + Qcur = build_lora_mm(model.layers[il].wq, cur); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + } + Kcur = build_lora_mm(model.layers[il].wk, cur); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + } + Vcur = build_lora_mm(model.layers[il].wv, cur); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + } + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + } else { + cur = build_lora_mm(model.layers[il].wqkv, cur); + cb(cur, "wqkv", il); + if (model.layers[il].bqkv) { + cur = ggml_add(ctx0, cur, model.layers[il].bqkv); + cb(cur, "bqkv", il); + } + Qcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head, n_tokens, n_embd_head * sizeof(float), cur->nb[1], + 0 * sizeof(float) * (n_embd)); + Kcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), + cur->nb[1], 1 * sizeof(float) * (n_embd)); + Vcur = ggml_view_3d(ctx0, cur, n_embd_head, n_head_kv, n_tokens, n_embd_head * sizeof(float), + cur->nb[1], 1 * sizeof(float) * (n_embd + n_embd_gqa)); + } + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, nullptr, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf(float(n_embd_head)), il); + } + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + // Post-attention norm (new!) 
+ cur = build_norm(cur, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "post_attn_norm", il); + + // Add the input (residual connection after post-attention norm) + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // FF + { + // Pre-MLP norm + cur = build_norm(ffn_inp, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + // MLP + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + NULL, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, LLM_FFN_SWIGLU, LLM_FFN_SEQ, il); + cb(cur, "ffn_out", il); + + // Post-MLP norm + cur = build_norm(cur, model.layers[il].ffn_post_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "post_mlp_norm", il); + } + // Add residual connection after post-MLP norm + inpL = ggml_add(ctx0, cur, ffn_inp); + cb(inpL, "l_out", il); + } + // Final norm + cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // Output projection + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); } diff --git a/src/models/glm4v_moe.cpp b/src/models/glm4v_moe.cpp index ab63839a23aab..09bf8abbb2db9 100644 --- a/src/models/glm4v_moe.cpp +++ b/src/models/glm4v_moe.cpp @@ -1,5 +1,156 @@ #include "models.h" llm_build_glm4v_moe::llm_build_glm4v_moe(const llama_model & model, const llm_graph_params & params) : llm_graph_context(params) { - // TODO + // + // TODO -- currently this is just copied from `llm_build_glm4_moe` -- still WIP + // + const int64_t n_embd_head = hparams.n_embd_head_v; + + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + // inp_pos - contains the positions + ggml_tensor * inp_pos = build_inp_pos(); + + auto * inp_attn = build_attn_inp_kv(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + // Only process up to last layer (skip final NextN layer) + // Final layer tensors are loaded but not processed in forward pass + const int n_transformer_layers = n_layer - hparams.nextn_predict_layers; + for (int il = 0; il < n_transformer_layers; ++il) { + ggml_tensor * inpSA = inpL; + + // Pre-attention norm + cur = build_norm(inpL, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // self-attention + { + ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + } + cb(Qcur, "Qcur", il); + + ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + } + cb(Kcur, "Kcur", il); + + ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + } + cb(Vcur, "Vcur", il); + + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + // Apply Q/K norm if available (GLM-4.5 355B variant) + if (model.layers[il].attn_q_norm) { + Qcur = build_norm(Qcur, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); + cb(Qcur, "Qcur_normed", il); + } + if (model.layers[il].attn_k_norm) { + Kcur = build_norm(Kcur, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); + cb(Kcur, "Kcur_normed", il); + } + Qcur = 
ggml_rope_ext( + ctx0, Qcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + Kcur = ggml_rope_ext( + ctx0, Kcur, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + cb(Vcur, "Vcur", il); + + cur = build_attn(inp_attn, + model.layers[il].wo, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + } + if (il == n_transformer_layers - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // Post-attention norm + cur = build_norm(ffn_inp, model.layers[il].attn_post_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "post_attn_norm", il); + + // Check if this is a dense layer (n_layer_dense_lead=1, so layer 0 is dense) + if (static_cast(il) < hparams.n_layer_dense_lead) { + // Dense FFN layer + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + } else { + // Process routed experts using existing MoE infrastructure + ggml_tensor * routed_out = build_moe_ffn(cur, + model.layers[il].ffn_gate_inp, + model.layers[il].ffn_up_exps, + model.layers[il].ffn_gate_exps, + model.layers[il].ffn_down_exps, + model.layers[il].ffn_exp_probs_b, + n_expert, n_expert_used, + LLM_FFN_SILU, hparams.expert_weights_norm, + true, hparams.expert_weights_scale, + (llama_expert_gating_func_type) hparams.expert_gating_func, + il); + cb(routed_out, "ffn_moe_out", il); + + // Process shared expert on original input + ggml_tensor * shared_out = build_ffn(cur, + model.layers[il].ffn_up_shexp, NULL, NULL, + model.layers[il].ffn_gate_shexp, NULL, NULL, + model.layers[il].ffn_down_shexp, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(shared_out, "ffn_shexp_out", il); + + // Final output: routed_output + shared_output + cur = ggml_add(ctx0, routed_out, shared_out); + cb(cur, "ffn_out", il); + } + cur = ggml_add(ctx0, cur, ffn_inp); + + cur = build_cvec(cur, il); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + cur = inpL; + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // lm_head + cur = build_lora_mm(model.output, cur); + + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); } From b60c16a9a30d0b94eb6d233832d9028467f10657 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Wed, 5 Nov 2025 17:15:56 -0600 Subject: [PATCH 20/23] consistent arch naming --- src/llama-arch.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index eb1cd07fb7dfc..8346af56eb569 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -67,7 +67,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_DEEPSEEK2, "deepseek2" }, { LLM_ARCH_CHATGLM, "chatglm" }, { LLM_ARCH_GLM4, "glm4" }, - { LLM_ARCH_GLM4_MOE, "glm4moe" }, + { LLM_ARCH_GLM4_MOE, "glm4_moe" }, { LLM_ARCH_GLM4V, "glm4v" }, { LLM_ARCH_GLM4V_MOE, "glm4v_moe" }, { LLM_ARCH_BITNET, "bitnet" }, From 6443ecb8848dd2e3c39313e07fc0ba51955922a5 Mon Sep 17 00:00:00 2001 From: ddh0 Date: Thu, 6 Nov 2025 15:01:59 -0600 Subject: [PATCH 21/23] WIP conversion logic --- 
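notes: to sanity-check the conversion path, the converter is expected to be run twice on the
same HF checkpoint - once for the text model and once for the projector - roughly like this
(paths and output names are placeholders, assuming the usual convert_hf_to_gguf.py flags):

    # text model (GLM4V / GLM4V_MOE arch)
    python convert_hf_to_gguf.py /path/to/GLM-4.5V --outtype f16 --outfile GLM-4.5V-F16.gguf

    # multimodal projector
    python convert_hf_to_gguf.py /path/to/GLM-4.5V --mmproj --outfile mmproj-GLM-4.5V-F16.gguf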
convert_hf_to_gguf.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index dd35f7d93d3d6..a4971c3825cd9 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -9869,24 +9869,17 @@ class GLM4VisionModel(MmprojModel): ref: [#16600](https://github.com/ggml-org/llama.cpp/pull/16600)""" # - # TODO: this is not complete yet! + # TODO: conversion logic is still WIP! # def set_gguf_parameters(self): super().set_gguf_parameters() - self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V) - self.gguf_writer.add_vision_use_gelu(True) - - if (ln_eps := self.find_vparam(["layer_norm_eps"], optional=True)) is not None: - self.gguf_writer.add_vision_attention_layernorm_eps(ln_eps) + assert self.hparams_vision is not None + vparams = self.hparams_vision + ln_eps = vparams.get("layer_norm_eps", 1e-5) - # the ViT applies its own RoPE inside its attention blocks - if (rope_theta := self.find_vparam(["rope_theta"], optional=True)) is not None: - self.gguf_writer.add_vision_rope_freq_base(rope_theta) - logger.info(f"gguf: vision rope theta = {rope_theta}") - else: - logger.warning('gguf: -------------------------------------------------------------') - logger.warning('gguf: missing vision rope theta! the conversion might be incorrect!') - logger.warning('gguf: -------------------------------------------------------------') + self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.GLM4V) + self.gguf_writer.add_vision_attention_layernorm_eps(ln_eps) + self.gguf_writer.add_vision_use_silu(True) def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: del bid # unused From eb2c8b89b839e5919b71d59fab702533e9fd1eeb Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sat, 8 Nov 2025 15:02:43 -0600 Subject: [PATCH 22/23] include `glm4v-moe.cpp` and `glm4v.cpp` in CMake also renamed `glm4v_moe.cpp` to `glm4v-moe.cpp` to match other model files --- src/CMakeLists.txt | 2 ++ src/models/{glm4v_moe.cpp => glm4v-moe.cpp} | 0 2 files changed, 2 insertions(+) rename src/models/{glm4v_moe.cpp => glm4v-moe.cpp} (100%) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 630b2cddf67e8..0988f3114819d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -70,6 +70,8 @@ add_library(llama models/gemma3n-iswa.cpp models/glm4-moe.cpp models/glm4.cpp + models/glm4v-moe.cpp + models/glm4v.cpp models/gpt2.cpp models/gptneox.cpp models/granite-hybrid.cpp diff --git a/src/models/glm4v_moe.cpp b/src/models/glm4v-moe.cpp similarity index 100% rename from src/models/glm4v_moe.cpp rename to src/models/glm4v-moe.cpp From b37d3265a82e3f21028f5c977c928e5845c907dd Mon Sep 17 00:00:00 2001 From: ddh0 Date: Sat, 8 Nov 2025 19:13:55 -0600 Subject: [PATCH 23/23] mtmd : WIP build_glm4v cgraph --- tools/mtmd/clip.cpp | 143 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 139 insertions(+), 4 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 86f5877ef369e..c0b7bc5f0c36c 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -281,7 +281,7 @@ struct clip_model { // embeddings ggml_tensor * class_embedding = nullptr; ggml_tensor * patch_embeddings_0 = nullptr; - ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temproal dimension (Qwen2VL) + ggml_tensor * patch_embeddings_1 = nullptr; // second Conv2D kernel when we decouple Conv3D along temporal dimension (Qwen2VL, GLM4V) ggml_tensor * 
patch_bias = nullptr; ggml_tensor * position_embeddings = nullptr; @@ -400,6 +400,22 @@ struct clip_model { ggml_tensor * mm_boi = nullptr; ggml_tensor * mm_eoi = nullptr; + // GLM4V projection + ggml_tensor * mm_post_conv_ln_w = nullptr; + ggml_tensor * mm_post_conv_ln_b = nullptr; + ggml_tensor * mm_downsample_w = nullptr; + ggml_tensor * mm_downsample_b = nullptr; + ggml_tensor * mm_merger_proj_w = nullptr; + ggml_tensor * mm_merger_proj_b = nullptr; + ggml_tensor * mm_merger_norm_w = nullptr; + ggml_tensor * mm_merger_norm_b = nullptr; + ggml_tensor * mm_merger_gate_w = nullptr; + ggml_tensor * mm_merger_gate_b = nullptr; + ggml_tensor * mm_merger_up_w = nullptr; + ggml_tensor * mm_merger_up_b = nullptr; + ggml_tensor * mm_merger_down_w = nullptr; + ggml_tensor * mm_merger_down_b = nullptr; + bool audio_has_avgpool() const { return proj_type == PROJECTOR_TYPE_QWEN2A || proj_type == PROJECTOR_TYPE_VOXTRAL; @@ -1083,7 +1099,122 @@ struct clip_graph { } ggml_cgraph * build_glm4v() { - /* TODO */ + GGML_ASSERT(model.patch_embeddings_0 != nullptr); + GGML_ASSERT(model.patch_embeddings_1 != nullptr); + GGML_ASSERT(model.position_embeddings != nullptr); + GGML_ASSERT(model.class_embedding == nullptr); + + // 2D RoPE input positions + ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_h, "pos_h"); + ggml_set_input(pos_h); + + ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_patches); + ggml_set_name(pos_w, "pos_w"); + ggml_set_input(pos_w); + + ggml_tensor * inp_raw = build_inp_raw(); + ggml_tensor * inp; + + // patch embedding + // - this is similar to Qwen2VL's handling of Conv3d for video/image inputs + // - for single images, the input is duplicated along the temporal axis + // + // ref: `class Glm4vVisionPatchEmbed(Qwen2_5_VisionPatchEmbed):` + + ggml_tensor * inp = ggml_conv_2d(ctx0, model.patch_embeddings_0, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + if (model.patch_embeddings_1) { + auto inp_1 = ggml_conv_2d(ctx0, model.patch_embeddings_1, inp_raw, patch_size, patch_size, 0, 0, 1, 1); + inp = ggml_add(ctx0, inp, inp_1); + } + + const int batch_size = 1; + inp = ggml_permute(ctx0, inp, 1, 2, 0, 3); // [w, h, c, b] -> [c, w, h, b] + inp = ggml_cont_4d(ctx0, inp, n_embd * 2, n_patches_x / 2, n_patches_y, batch_size); + inp = ggml_reshape_4d(ctx0, inp, n_embd * 2, n_patches_x / 2, 2, batch_size * (n_patches_y / 2)); + inp = ggml_permute(ctx0, inp, 0, 2, 1, 3); + inp = ggml_cont_3d(ctx0, inp, n_embd, n_patches_x * n_patches_y, batch_size); + cb(inp, "patch_embed", -1); + + // post-convolution layernorm + // + // ref: `self.post_conv_layernorm = Glm4vRMSNorm(config.hidden_size, eps=config.rms_norm_eps)` + inp = build_norm(inp, model.mm_post_conv_ln_w, model.mm_post_conv_ln_b, NORM_TYPE_RMS, eps, -1); + cb(inp, "post_conv_ln", -1); + + // absolute position embeddings (interpolated) + // + // ref: self.embeddings + ggml_tensor * learned_pos_embd = resize_position_embeddings(); + inp = ggml_add(ctx0, inp, learned_pos_embd); + cb(inp, "abs_pos_embed", -1); + + // RoPE to be applied inside ViT blocks + // + // ref: self.rotary_pos_emb + auto add_pos = [&](ggml_tensor * cur, const clip_layer &) { + return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false); + }; + + // ViT blocks + ggml_tensor * cur = build_vit( + inp, n_patches, + NORM_TYPE_RMS, + FFN_SILU, // hidden_act is "silu" + nullptr, // absolute embeddings already added + add_pos); + + // post-ViT layernorm + cur = build_norm(cur, model.post_ln_w, 
model.post_ln_b, NORM_TYPE_RMS, eps, -1); + cb(cur, "post_vit_ln", -1); + + // reshape and permute to prepare for conv2d + const int merge_size = model.hparams.n_merge; // WIP: is this the correct value to use? + cur = ggml_reshape_3d(ctx0, cur, n_embd, n_patches_x, n_patches_y); + cur = ggml_permute(ctx0, cur, 1, 2, 0, 3); // -> [C, W, H, B] -> [W, H, C, B] for ggml + cb(cur, "pre_downsample_permute", -1); + + // downsampling conv2d + cur = ggml_conv_2d(ctx0, model.mm_downsample_w, cur, merge_size, merge_size, 0, 0, 1, 1); + cb(cur, "downsample_conv", -1); + + // reshape to [tokens, features] + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]); + cur = ggml_cont(ctx0, ggml_transpose(ctx0, cur)); + cb(cur, "post_downsample_reshape", -1); + + // patch merger FFN + // + // ref: `class Glm4vVisionPatchMerger(nn.Module):` + { + // input projection + cur = ggml_mul_mat(ctx0, model.mm_merger_proj_w, cur); + + // apply norm + GELU + cur = build_norm(cur, model.mm_merger_norm_w, model.mm_merger_norm_b, NORM_TYPE_NORMAL, 1e-5f, -1); + cur = ggml_gelu(ctx0, cur); + ggml_tensor * ffn_input = cur; + cb(cur, "merger_ffn_inp", -1); + + // gate projection + ggml_tensor * gate = ggml_mul_mat(ctx0, model.mm_merger_gate_w, ffn_input); + cb(cur, "merger_gate", -1); + + // up projection + ggml_tensor * up = ggml_mul_mat(ctx0, model.mm_merger_up_w, ffn_input); + cb(cur, "merger_up", -1); + + // activation + down projection + cur = ggml_silu(ctx0, gate); + cur = ggml_mul(ctx0, cur, up); + cur = ggml_mul_mat(ctx0, model.mm_merger_down_w, cur); + cb(cur, "merger_ffn_out", -1); + } + + // build the graph + ggml_build_forward_expand(gf, cur); + + return gf; } ggml_cgraph * build_minicpmv() { @@ -2520,13 +2651,17 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { res = graph.build_kimivl(); } break; + case PROJECTOR_TYPE_COGVLM: + { + res = graph.build_cogvlm(); + } break; case PROJECTOR_TYPE_JANUS_PRO: { res = graph.build_siglip(); } break; - case PROJECTOR_TYPE_COGVLM: + case PROJECTOR_TYPE_GLM4V: { - res = graph.build_cogvlm(); + res = graph.build_glm4v(); } break; default: {