From 52b2da6e2d8c2453e2fc2b6867e52e267d5484cc Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Thu, 10 Jul 2025 15:06:11 +0200 Subject: [PATCH 1/8] model : LiquidAI lfm2 350M/700M/1.2B dense text-only --- convert_hf_to_gguf.py | 53 ++++++++ convert_hf_to_gguf_update.py | 1 + ggml/src/ggml-cuda/ssm-conv.cu | 8 ++ gguf-py/gguf/constants.py | 29 ++++ gguf-py/gguf/gguf_writer.py | 6 + gguf-py/gguf/tensor_mapping.py | 12 ++ src/llama-arch.cpp | 30 +++++ src/llama-arch.h | 8 ++ src/llama-hparams.cpp | 5 + src/llama-hparams.h | 2 + src/llama-model-loader.cpp | 7 +- src/llama-model.cpp | 235 +++++++++++++++++++++++++++++++++ src/llama-model.h | 11 ++ src/llama-quant.cpp | 1 + src/llama-vocab.cpp | 3 +- 15 files changed, 408 insertions(+), 3 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 2419126ec4ea2..372c7004cef9f 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -300,6 +300,7 @@ def prepare_tensors(self): gguf.MODEL_TENSOR.POS_EMBD, gguf.MODEL_TENSOR.TOKEN_TYPES, gguf.MODEL_TENSOR.SSM_CONV1D, + gguf.MODEL_TENSOR.SHORTCONV_CONV, gguf.MODEL_TENSOR.TIME_MIX_FIRST, gguf.MODEL_TENSOR.TIME_MIX_W1, gguf.MODEL_TENSOR.TIME_MIX_W2, @@ -833,6 +834,9 @@ def get_vocab_base_pre(self, tokenizer) -> str: if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b": # ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base res = "falcon-h1" + if chkhsh == "169bf0296a13c4d9b7672313f749eb36501d931022de052aad6e36f2bf34dd51": + # ref: https://huggingface.co/LiquidAI/LFM2-Tokenizer + res = "lfm2" if res is None: logger.warning("\n") @@ -6943,6 +6947,55 @@ def set_vocab(self): chat_template = tokenizer.chat_template.replace("[:]", "") self.gguf_writer.add_chat_template(chat_template) +@ModelBase.register("LFM2ForCausalLM") +class LFM2Model(TextModel): + model_arch = gguf.MODEL_ARCH.LFM2 + + def _add_feed_forward_length(self): + ff_dim = self.hparams["block_ff_dim"] + + auto_adjust_ff_dim = self.hparams["block_auto_adjust_ff_dim"] + ff_dim = self.hparams["block_ff_dim"] + ffn_dim_multiplier = self.hparams["block_ffn_dim_multiplier"] + multiple_of = self.hparams["block_multiple_of"] + + if auto_adjust_ff_dim: + ff_dim = int(2 * ff_dim / 3) + # custom dim factor multiplier + if ffn_dim_multiplier is not None: + ff_dim = int(ffn_dim_multiplier * ff_dim) + ff_dim = multiple_of * ((ff_dim + multiple_of - 1) // multiple_of) + + self.gguf_writer.add_feed_forward_length(ff_dim) + + + def set_gguf_parameters(self): + # set only for attention layers before calling super().set_gguf_parameters() + self.hparams["num_key_value_heads"] = [(self.hparams["num_key_value_heads"] if x in self.hparams["full_attn_idxs"] else 0) for x in range(self.block_count)] + + super().set_gguf_parameters() + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"]) + self.gguf_writer.add_is_recurrent_layer([x not in self.hparams["full_attn_idxs"] for x in range(self.block_count)]) + self.gguf_writer.add_layer_norm_rms_eps(self.hparams["norm_eps"]) + self._add_feed_forward_length() + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + if 'operator_norm' in name: + name = name.replace('operator_norm', 'norm') + elif 'attention.k_layernorm' in name or 'attention.q_layernorm' in name: + name = name.replace('attention', 'self_attn') + elif name.startswith("model.embedding_norm"): + name = name.replace("model.embedding_norm", 'word_embeddings_layernorm') + elif 'conv.conv' in name: + # conv op requires 2d tensor + data_torch = data_torch.squeeze(1) + elif 'self_attn.out_proj' in name: + name = name.replace('out_proj', 'o_proj') + + return [(self.map_tensor_name(name), data_torch)] + + ###### CONVERSION LOGIC ###### diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py index b8cb6027d6de5..3ae85a8267cad 100755 --- a/convert_hf_to_gguf_update.py +++ b/convert_hf_to_gguf_update.py @@ -129,6 +129,7 @@ class TOKENIZER_TYPE(IntEnum): {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", }, {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", }, {"name": "a.x-4.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", }, + {"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"}, ] # some models are known to be broken upstream, so we will skip them as exceptions diff --git a/ggml/src/ggml-cuda/ssm-conv.cu b/ggml/src/ggml-cuda/ssm-conv.cu index f637571963730..2d84c90782d59 100644 --- a/ggml/src/ggml-cuda/ssm-conv.cu +++ b/ggml/src/ggml-cuda/ssm-conv.cu @@ -107,6 +107,9 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int if (nc == 4) { ssm_conv_f32<<>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t); + } else if (nc == 3) { + ssm_conv_f32<<>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, + dst, dst_nb0, dst_nb1, dst_nb2, n_t); } else { GGML_ABORT("Only support kernel size = 4 now."); } @@ -116,6 +119,11 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t); ssm_conv_long_token_f32<<>>( src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t); + } else if (nc == 3) { + const int64_t split_n_t = 32; + dim3 blocks(n_s, (nr + threads - 1) / threads, (n_t + split_n_t - 1) / split_n_t); + ssm_conv_long_token_f32<<>>( + src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t); } else { GGML_ABORT("Only support kernel size = 4 right now."); } diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index fbe3f53273a35..958f3e2ee3bfc 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -122,6 +122,7 @@ class LLM: ALTUP_ACTIVE_IDX = "{arch}.altup.active_idx" ALTUP_NUM_INPUTS = "{arch}.altup.num_inputs" EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input" + IS_RECURRENT_LAYER = "{arch}.is_recurrent_layer" class Attention: HEAD_COUNT = "{arch}.attention.head_count" @@ -187,6 +188,9 @@ class ConvNext: class Classifier: OUTPUT_LABELS = "{arch}.classifier.output_labels" + class ShortConv: + L_CACHE = "{arch}.shortconv.l_cache" + class Tokenizer: MODEL = "tokenizer.ggml.model" PRE = "tokenizer.ggml.pre" @@ -361,6 +365,7 @@ class MODEL_ARCH(IntEnum): ERNIE4_5 = auto() HUNYUAN_MOE = auto() SMOLLM3 = auto() + LFM2 = auto() class VISION_PROJECTOR_TYPE(IntEnum): @@ -532,6 +537,9 @@ class MODEL_TENSOR(IntEnum): POSNET_ATTN_K = auto() POSNET_ATTN_V = auto() POSNET_ATTN_OUT = auto() + SHORTCONV_CONV = auto() + SHORTCONV_INPROJ = auto() + SHORTCONV_OUTPROJ = auto() # vision V_MMPROJ = auto() V_MMPROJ_FC = auto() @@ -671,6 +679,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.FALCON_H1: "falcon-h1", MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe", MODEL_ARCH.SMOLLM3: "smollm3", + MODEL_ARCH.LFM2: "lfm2", } VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { @@ -842,6 +851,9 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k", MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v", MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output", + MODEL_TENSOR.SHORTCONV_CONV: "blk.{bid}.shortconv.conv", + MODEL_TENSOR.SHORTCONV_INPROJ: "blk.{bid}.shortconv.in_proj", + MODEL_TENSOR.SHORTCONV_OUTPROJ: "blk.{bid}.shortconv.out_proj", # vision MODEL_TENSOR.V_MMPROJ: "mm.{bid}", MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc", @@ -2323,6 +2335,23 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, + MODEL_ARCH.LFM2: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.TOKEN_EMBD_NORM, + MODEL_TENSOR.SHORTCONV_CONV, + MODEL_TENSOR.SHORTCONV_INPROJ, + MODEL_TENSOR.SHORTCONV_OUTPROJ, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.ATTN_NORM, # operator_norm + MODEL_TENSOR.ATTN_Q_NORM, + MODEL_TENSOR.ATTN_K_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, ], # TODO } diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index a7ecf3d31209f..94ee6a32ad164 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -648,6 +648,12 @@ def add_convnext_embedding_length(self, length: int) -> None: def add_convnext_block_count(self, length: int) -> None: self.add_uint32(Keys.ConvNext.BLOCK_COUNT.format(arch=self.arch), length) + def add_shortconv_l_cache(self, length: int) -> None: + self.add_uint32(Keys.ShortConv.L_CACHE.format(arch=self.arch), length) + + def add_is_recurrent_layer(self, value: Sequence[bool]) -> None: + self.add_array(Keys.LLM.IS_RECURRENT_LAYER.format(arch=self.arch), value) + def add_block_count(self, length: int) -> None: self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 215eb297ebcc1..f610ccb79f059 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -1015,6 +1015,18 @@ class TensorNameMap: "backbone.posnet.{bid}.proj_out", # wavtokenizer ), + MODEL_TENSOR.SHORTCONV_CONV: ( + "model.layers.{bid}.conv.conv", + ), + + MODEL_TENSOR.SHORTCONV_INPROJ: ( + "model.layers.{bid}.conv.in_proj", + ), + + MODEL_TENSOR.SHORTCONV_OUTPROJ: ( + "model.layers.{bid}.conv.out_proj", + ), + ############################################################################# ## Vision encoder diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index cb2c9dba8d358..2eca0a4f63d85 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -82,6 +82,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_ERNIE4_5, "ernie4_5" }, { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" }, { LLM_ARCH_SMOLLM3, "smollm3" }, + { LLM_ARCH_LFM2, "lfm2" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -188,6 +189,10 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" }, + { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" }, + + { LLM_KV_IS_RECURRENT_LAYER, "%s.is_recurrent_layer" }, + { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, @@ -1793,6 +1798,27 @@ static const std::map> LLM_TENSOR_N { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_LFM2, + { + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" }, + { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + { LLM_TENSOR_SHORTCONV_CONV, "blk.%d.shortconv.conv" }, + { LLM_TENSOR_SHORTCONV_INPROJ, "blk.%d.shortconv.in_proj" }, + { LLM_TENSOR_SHORTCONV_OUTPROJ, "blk.%d.shortconv.out_proj" }, + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" }, + } + }, { LLM_ARCH_UNKNOWN, { @@ -1960,6 +1986,9 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, }; LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {} @@ -2031,6 +2060,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) { switch (arch) { case LLM_ARCH_JAMBA: case LLM_ARCH_FALCON_H1: + case LLM_ARCH_LFM2: return true; default: return false; diff --git a/src/llama-arch.h b/src/llama-arch.h index 3381b8dc4a4b7..73cf7e010c533 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -86,6 +86,7 @@ enum llm_arch { LLM_ARCH_ERNIE4_5, LLM_ARCH_HUNYUAN_MOE, LLM_ARCH_SMOLLM3, + LLM_ARCH_LFM2, LLM_ARCH_UNKNOWN, }; @@ -227,6 +228,10 @@ enum llm_kv { LLM_KV_CLASSIFIER_OUTPUT_LABELS, + LLM_KV_SHORTCONV_L_CACHE, + + LLM_KV_IS_RECURRENT_LAYER, + // deprecated: LLM_KV_TOKENIZER_PREFIX_ID, LLM_KV_TOKENIZER_SUFFIX_ID, @@ -396,6 +401,9 @@ enum llm_tensor { LLM_TENSOR_POS_NET_ATTN_K, LLM_TENSOR_POS_NET_ATTN_V, LLM_TENSOR_POS_NET_ATTN_OUT, + LLM_TENSOR_SHORTCONV_CONV, + LLM_TENSOR_SHORTCONV_INPROJ, + LLM_TENSOR_SHORTCONV_OUTPROJ, }; enum llm_tensor_layer { diff --git a/src/llama-hparams.cpp b/src/llama-hparams.cpp index 86c814d51b901..7aa736e2f39db 100644 --- a/src/llama-hparams.cpp +++ b/src/llama-hparams.cpp @@ -71,6 +71,11 @@ uint32_t llama_hparams::n_embd_r() const { return token_shift_count * n_embd; } + if (n_shortconv_l_cache != 0) { + // for LFM2 models + return n_embd * (n_shortconv_l_cache - 1); + } + // TODO: maybe support other convolution strides than 1 // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed // Corresponds to Mamba's conv_states size diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 476d0a5eade28..d0500e4d0fd77 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -55,6 +55,8 @@ struct llama_hparams { struct llama_hparams_posnet posnet; struct llama_hparams_convnext convnext; + uint32_t n_shortconv_l_cache = 0; + std::array n_head_arr; std::array n_head_kv_arr; std::array n_ff_arr; diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index bd9e6da8832b7..bc48027862169 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -305,10 +305,11 @@ namespace GGUFMeta { case GGUF_TYPE_UINT32: case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same::value) || (std::is_same::value)); break; + case GGUF_TYPE_BOOL: GGML_ASSERT((std::is_same::value)); break; case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same::value)); break; case GGUF_TYPE_STRING: GGML_ASSERT((std::is_same::value)); break; default: - throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str())); + throw std::runtime_error(format("%s is not a string/float32/uint32/int32/bool array", key.c_str())); } if constexpr (std::is_same::value) { @@ -346,10 +347,11 @@ namespace GGUFMeta { case GGUF_TYPE_UINT32: case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same::value) || (std::is_same::value)); break; + case GGUF_TYPE_BOOL: GGML_ASSERT((std::is_same::value)); break; case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same::value)); break; case GGUF_TYPE_STRING: GGML_ASSERT((std::is_same::value)); break; default: - throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str())); + throw std::runtime_error(format("%s is not a string/float32/uint32/int32/bool array", key.c_str())); } if (arr_info.length > N_MAX) { @@ -464,6 +466,7 @@ namespace GGUFMeta { // TODO: this is not very clever - figure out something better template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); + template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); llama_model_loader::llama_model_loader( const std::string & fname, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index ca094e47b6cb5..38d780727403a 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -42,15 +42,18 @@ const char * llm_type_name(llm_type type) { case LLM_TYPE_250M: return "250M"; case LLM_TYPE_270M: return "270M"; case LLM_TYPE_335M: return "335M"; + case LLM_TYPE_350M: return "350M"; case LLM_TYPE_410M: return "410M"; case LLM_TYPE_450M: return "450M"; case LLM_TYPE_475M: return "475M"; + case LLM_TYPE_700M: return "700M"; case LLM_TYPE_770M: return "770M"; case LLM_TYPE_780M: return "780M"; case LLM_TYPE_0_3B: return "0.3B"; case LLM_TYPE_0_5B: return "0.5B"; case LLM_TYPE_0_6B: return "0.6B"; case LLM_TYPE_1B: return "1B"; + case LLM_TYPE_1_2B: return "1.2B"; case LLM_TYPE_1_3B: return "1.3B"; case LLM_TYPE_1_4B: return "1.4B"; case LLM_TYPE_1_5B: return "1.5B"; @@ -495,6 +498,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.n_head_kv_arr = hparams.n_head_arr; ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false); + ml.get_key_or_arr(LLM_KV_IS_RECURRENT_LAYER, hparams.recurrent_layer_arr, hparams.n_layer, false); bool rope_finetuned = false; ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false); @@ -1622,6 +1626,17 @@ void llama_model::load_hparams(llama_model_loader & ml) { default: type = LLM_TYPE_UNKNOWN; } } break; + case LLM_ARCH_LFM2: + { + ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_embd) { + case 1024: type = LLM_TYPE_350M; break; + case 1536: type = LLM_TYPE_700M; break; + case 2048: type = LLM_TYPE_1_2B; break; + default: type = LLM_TYPE_UNKNOWN; + } + } break; default: throw std::runtime_error("unsupported model architecture"); } @@ -4772,6 +4787,39 @@ bool llama_model::load_tensors(llama_model_loader & ml) { layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); } } break; + case LLM_ARCH_LFM2: + { + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + tok_norm = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, 0); + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + // ffn is same for transformer and conv layers + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + + // for operator_norm + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + if (!hparams.is_recurrent(i)) { + layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd_head_k}, 0); + layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd_head_k}, 0); + GGML_ASSERT(n_embd_v_gqa == n_embd_k_gqa); + + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, hparams.n_embd_k_gqa(i)}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, hparams.n_embd_v_gqa(i)}, 0); + + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0); + } else { + layer.shortconv.conv = create_tensor(tn(LLM_TENSOR_SHORTCONV_CONV, "weight", i), {hparams.n_shortconv_l_cache, n_embd}, 0); + layer.shortconv.in_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_INPROJ, "weight", i), {n_embd, 3 * n_embd}, 0); + layer.shortconv.out_proj = create_tensor(tn(LLM_TENSOR_SHORTCONV_OUTPROJ, "weight", i), {n_embd, n_embd}, 0); + } + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -15474,6 +15522,188 @@ struct llm_build_smollm3 : public llm_graph_context { } }; +struct llm_build_lfm2 : public llm_graph_context { + const llama_model & model; + + llm_build_lfm2(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params), model(model) { + + ggml_tensor * cur = build_inp_embd(model.tok_embd); + cb(cur, "model.embed_tokens", -1); + + ggml_tensor * inp_pos = build_inp_pos(); + + auto *inp = build_inp_mem_hybrid(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + // add s_copy to graph + ggml_build_forward_expand(gf, inp->s_copy); + + for (int il = 0; il < n_layer; ++il) { + auto *prev_cur = cur; + cur = lfm2_rms_norm(cur, model.layers[il].attn_norm); + cb(cur, "model.layers.{}.operator_norm", il); + + cur = hparams.is_recurrent(il) ? + build_shortconv_block(gf, cur, il) : + build_attn_block(gf, cur, inp_pos, inp, il) ; + + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + prev_cur = ggml_get_rows(ctx0, prev_cur, inp_out_ids); + } + + cur = ggml_add(ctx0, prev_cur, cur); + cur = ggml_add(ctx0, cur, build_feed_forward(cur, il)); + } + + cur = lfm2_rms_norm(cur, model.tok_norm); + cb(cur, "model.embedding_norm", -1); + res->t_embd = cur; + + // lm_head is tied with embeddings + cur = ggml_mul_mat(ctx0, model.tok_embd, cur); + cb(cur, "lm_head", -1); + + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); + } + + ggml_tensor *build_feed_forward( + ggml_tensor * cur, + int il) const { + cur = lfm2_rms_norm(cur, model.layers[il].ffn_norm); + cb(cur, "model.layers.{}.ffn_norm", il); + + GGML_ASSERT(!model.layers[il].ffn_up_b); + GGML_ASSERT(!model.layers[il].ffn_gate_b); + GGML_ASSERT(!model.layers[il].ffn_down_b); + cur = build_ffn(cur, + model.layers[il].ffn_up, NULL, NULL, + model.layers[il].ffn_gate, NULL, NULL, + model.layers[il].ffn_down, NULL, NULL, + NULL, + LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "model.layers.{}.feed_forward.w2", il); + + return cur; + } + + ggml_tensor *build_attn_block( + ggml_cgraph *gf, + ggml_tensor *cur, + ggml_tensor *inp_pos, + llm_graph_input_mem_hybrid *inp, + int il) const { + GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il)); + auto const n_embd_head = hparams.n_embd_head_v; + auto const n_head_kv = hparams.n_head_kv(il); + + auto *q = build_lora_mm(model.layers[il].wq, cur); + cb(q, "model.layers.{}.self_attn.q_proj", il); + auto *k = build_lora_mm(model.layers[il].wk, cur); + cb(k, "model.layers.{}.self_attn.k_proj", il); + auto *v = build_lora_mm(model.layers[il].wv, cur); + cb(v, "model.layers.{}.self_attn.v_proj", il); + + q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens); + k = ggml_reshape_3d(ctx0, k, n_embd_head, n_head_kv, n_tokens); + v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens); + + // qk norm + q = lfm2_rms_norm(q, model.layers[il].attn_q_norm); + cb(q, "model.layers.{}.self_attn.q_layernorm", il); + k = lfm2_rms_norm(k, model.layers[il].attn_k_norm); + cb(k, "model.layers.{}.self_attn.k_layernorm", il); + + // RoPE + q = ggml_rope_ext( + ctx0, q, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + k = ggml_rope_ext( + ctx0, k, inp_pos, nullptr, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + + cur = build_attn(inp, gf, + model.layers[il].wo, NULL, + q, k, v, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); + + cb(cur, "model.layers.{}.self_attn.out_proj", il); + + return cur; + } + + ggml_tensor * build_shortconv_block( + ggml_cgraph * gf, + ggml_tensor * cur, + int il) { + const auto * mctx_cur = static_cast(mctx)->get_recr(); + + auto *bcx = ggml_mul_mat(ctx0, model.layers[il].shortconv.in_proj, cur); + cb(bcx, "model.layers.{}.conv.in_proj", il); + + constexpr auto n_chunks = 3; + GGML_ASSERT(bcx->ne[0] % n_chunks == 0); + auto const chunk_size = bcx->ne[0] / n_chunks; + auto *b = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 0 * chunk_size * ggml_element_size(bcx)); + auto *c = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 1 * chunk_size * ggml_element_size(bcx)); + auto *x = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 2 * chunk_size * ggml_element_size(bcx)); + + auto *bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x)); + + // read conv state directly, with build_rs generation is slower + const int64_t n_seqs = ubatch.n_seqs; + ggml_tensor * conv_state = mctx_cur->get_r_l(il); + auto *conv = ggml_reshape_3d(ctx0, conv_state, hparams.n_shortconv_l_cache - 1, hparams.n_embd, n_seqs); + + bx = ggml_concat(ctx0, conv, bx, 0); + GGML_ASSERT(bx->ne[0] > conv->ne[0]); + + auto *new_conv = ggml_view_2d(ctx0, bx, conv->ne[0], bx->ne[1], bx->nb[1], (bx->ne[0] - conv->ne[0]) * ggml_element_size(bx)); + GGML_ASSERT(ggml_are_same_shape(conv, new_conv)); + + // write conv state + ggml_build_forward_expand(gf, ggml_cpy(ctx0, new_conv, conv_state)); + + auto *conv_kernel = model.layers[il].shortconv.conv; + GGML_ASSERT(hparams.n_shortconv_l_cache > 0); + + // construct ssm_conv op + struct ggml_tensor * conv_out = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, bx->ne[1], bx->ne[0] - conv->ne[0], bx->ne[2]); + conv_out->op = GGML_OP_SSM_CONV; + conv_out->src[0] = bx; + conv_out->src[1] = conv_kernel; + + cb(conv_out, "model.layers.{}.conv.conv", il); + + auto *y = ggml_mul(ctx0, c, conv_out); + + y = build_lora_mm(model.layers[il].shortconv.out_proj, y); + cb(y, "model.layers.{}.conv.out_proj", il); + + return y; + } + + // upcast to f32 before rms norm + ggml_tensor *lfm2_rms_norm(ggml_tensor *t, ggml_tensor *w) const { + auto *t_float = t; + if (t_float->type != GGML_TYPE_F32) { + t_float = ggml_cast(ctx0, t, GGML_TYPE_F32); + } + + auto *output = ggml_rms_norm(ctx0, t_float, hparams.f_norm_rms_eps); + if (output->type != t->type) { + output = ggml_cast(ctx0, output, t->type); + } + + return ggml_mul(ctx0, output, w); + } +}; + llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const { llama_memory_i * res; @@ -15872,6 +16102,10 @@ llm_graph_result_ptr llama_model::build_graph( { llm = std::make_unique(*this, params, gf); } break; + case LLM_ARCH_LFM2: + { + llm = std::make_unique(*this, params, gf); + } break; default: GGML_ABORT("fatal error"); } @@ -16064,6 +16298,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_MINICPM3: case LLM_ARCH_DOTS1: case LLM_ARCH_HUNYUAN_MOE: + case LLM_ARCH_LFM2: return LLAMA_ROPE_TYPE_NEOX; case LLM_ARCH_QWEN2VL: diff --git a/src/llama-model.h b/src/llama-model.h index 453f5af62fbc7..78ac5ec6089d2 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -34,15 +34,18 @@ enum llm_type { LLM_TYPE_250M, LLM_TYPE_270M, LLM_TYPE_335M, + LLM_TYPE_350M, LLM_TYPE_410M, LLM_TYPE_450M, LLM_TYPE_475M, + LLM_TYPE_700M, LLM_TYPE_770M, LLM_TYPE_780M, LLM_TYPE_0_3B, LLM_TYPE_0_5B, LLM_TYPE_0_6B, LLM_TYPE_1B, + LLM_TYPE_1_2B, LLM_TYPE_1_3B, LLM_TYPE_1_4B, LLM_TYPE_1_5B, @@ -154,6 +157,12 @@ struct llama_layer_convnext { struct ggml_tensor * gamma = nullptr; }; +struct llama_layer_shortconv { + struct ggml_tensor * in_proj = nullptr; + struct ggml_tensor * conv = nullptr; + struct ggml_tensor * out_proj = nullptr; +}; + struct llama_layer { // normalization struct ggml_tensor * attn_norm = nullptr; @@ -340,6 +349,8 @@ struct llama_layer { struct llama_layer_posnet posnet; struct llama_layer_convnext convnext; + + struct llama_layer_shortconv shortconv; }; struct llama_model { diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index f4b5713d7dd9a..4dbd1e309919a 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -844,6 +844,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:: // do not quantize Mamba's small yet 2D weights // NOTE: can't use LLM_TN here because the layer number is not known quantize &= name.find("ssm_conv1d.weight") == std::string::npos; + quantize &= name.find("shortconv.conv.weight") == std::string::npos; // do not quantize RWKV's small yet 2D weights quantize &= name.find("time_mix_first.weight") == std::string::npos; diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp index 6aa1d901c5e36..eb0091b952545 100644 --- a/src/llama-vocab.cpp +++ b/src/llama-vocab.cpp @@ -1524,7 +1524,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) { tokenizer_pre == "llama-bpe"|| tokenizer_pre == "falcon3" || tokenizer_pre == "falcon-h1" || - tokenizer_pre == "pixtral") { + tokenizer_pre == "pixtral" || + tokenizer_pre == "lfm2") { pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3; ignore_merges = true; add_bos = true; From 85c79868d9f7c780c08cc141e0cdc08586c389cd Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Thu, 10 Jul 2025 16:54:13 +0200 Subject: [PATCH 2/8] Fix cache --- src/llama-model.cpp | 36 ++++++++++++++++-------------------- 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 38d780727403a..bbb6e0d8ef1c9 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -15532,20 +15532,17 @@ struct llm_build_lfm2 : public llm_graph_context { ggml_tensor * inp_pos = build_inp_pos(); - auto *inp = build_inp_mem_hybrid(); + auto *inp_hybrid = build_inp_mem_hybrid(); ggml_tensor * inp_out_ids = build_inp_out_ids(); - // add s_copy to graph - ggml_build_forward_expand(gf, inp->s_copy); - for (int il = 0; il < n_layer; ++il) { auto *prev_cur = cur; cur = lfm2_rms_norm(cur, model.layers[il].attn_norm); cb(cur, "model.layers.{}.operator_norm", il); cur = hparams.is_recurrent(il) ? - build_shortconv_block(gf, cur, il) : - build_attn_block(gf, cur, inp_pos, inp, il) ; + build_shortconv_block(gf, cur, inp_hybrid->get_recr(), il) : + build_attn_block(gf, cur, inp_pos, inp_hybrid->get_attn(), il) ; if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); @@ -15589,12 +15586,11 @@ struct llm_build_lfm2 : public llm_graph_context { return cur; } - ggml_tensor *build_attn_block( - ggml_cgraph *gf, - ggml_tensor *cur, - ggml_tensor *inp_pos, - llm_graph_input_mem_hybrid *inp, - int il) const { + ggml_tensor *build_attn_block(ggml_cgraph *gf, + ggml_tensor *cur, + ggml_tensor *inp_pos, + llm_graph_input_attn_kv_unified *inp_attn, + int il) const { GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il)); auto const n_embd_head = hparams.n_embd_head_v; auto const n_head_kv = hparams.n_head_kv(il); @@ -15628,8 +15624,7 @@ struct llm_build_lfm2 : public llm_graph_context { ext_factor, attn_factor, beta_fast, beta_slow ); - cur = build_attn(inp, gf, - model.layers[il].wo, NULL, + cur = build_attn(inp_attn, gf, model.layers[il].wo, NULL, q, k, v, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il); cb(cur, "model.layers.{}.self_attn.out_proj", il); @@ -15637,10 +15632,10 @@ struct llm_build_lfm2 : public llm_graph_context { return cur; } - ggml_tensor * build_shortconv_block( - ggml_cgraph * gf, - ggml_tensor * cur, - int il) { + ggml_tensor * build_shortconv_block(ggml_cgraph * gf, + ggml_tensor * cur, + llm_graph_input_rs *inp_recr, + int il) { const auto * mctx_cur = static_cast(mctx)->get_recr(); auto *bcx = ggml_mul_mat(ctx0, model.layers[il].shortconv.in_proj, cur); @@ -15656,9 +15651,10 @@ struct llm_build_lfm2 : public llm_graph_context { auto *bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x)); // read conv state directly, with build_rs generation is slower - const int64_t n_seqs = ubatch.n_seqs; ggml_tensor * conv_state = mctx_cur->get_r_l(il); - auto *conv = ggml_reshape_3d(ctx0, conv_state, hparams.n_shortconv_l_cache - 1, hparams.n_embd, n_seqs); + const int64_t n_seqs = ubatch.n_seqs; + ggml_tensor * conv = build_rs(inp_recr, gf, conv_state, hparams.n_embd_r(), n_seqs); + conv = ggml_reshape_3d(ctx0, conv_state, hparams.n_shortconv_l_cache - 1, hparams.n_embd, n_seqs); bx = ggml_concat(ctx0, conv, bx, 0); GGML_ASSERT(bx->ne[0] > conv->ne[0]); From 094109202572b826023706546715cd7a70258176 Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Thu, 10 Jul 2025 17:07:13 +0200 Subject: [PATCH 3/8] Set is_recurrent from h_head_kv --- convert_hf_to_gguf.py | 3 +-- gguf-py/gguf/constants.py | 2 +- gguf-py/gguf/gguf_writer.py | 3 --- src/llama-arch.cpp | 2 -- src/llama-arch.h | 2 -- src/llama-model-loader.cpp | 7 ++----- src/llama-model.cpp | 4 +++- 7 files changed, 7 insertions(+), 16 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 372c7004cef9f..7aa5dd489538b 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6970,13 +6970,12 @@ def _add_feed_forward_length(self): def set_gguf_parameters(self): - # set only for attention layers before calling super().set_gguf_parameters() + # set num_key_value_heads only for attention layers self.hparams["num_key_value_heads"] = [(self.hparams["num_key_value_heads"] if x in self.hparams["full_attn_idxs"] else 0) for x in range(self.block_count)] super().set_gguf_parameters() self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) self.gguf_writer.add_shortconv_l_cache(self.hparams["conv_L_cache"]) - self.gguf_writer.add_is_recurrent_layer([x not in self.hparams["full_attn_idxs"] for x in range(self.block_count)]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams["norm_eps"]) self._add_feed_forward_length() diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 958f3e2ee3bfc..60228ae542d38 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -122,7 +122,6 @@ class LLM: ALTUP_ACTIVE_IDX = "{arch}.altup.active_idx" ALTUP_NUM_INPUTS = "{arch}.altup.num_inputs" EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input" - IS_RECURRENT_LAYER = "{arch}.is_recurrent_layer" class Attention: HEAD_COUNT = "{arch}.attention.head_count" @@ -2335,6 +2334,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, + ], MODEL_ARCH.LFM2: [ MODEL_TENSOR.TOKEN_EMBD, MODEL_TENSOR.TOKEN_EMBD_NORM, diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 94ee6a32ad164..4f23f9b024619 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -651,9 +651,6 @@ def add_convnext_block_count(self, length: int) -> None: def add_shortconv_l_cache(self, length: int) -> None: self.add_uint32(Keys.ShortConv.L_CACHE.format(arch=self.arch), length) - def add_is_recurrent_layer(self, value: Sequence[bool]) -> None: - self.add_array(Keys.LLM.IS_RECURRENT_LAYER.format(arch=self.arch), value) - def add_block_count(self, length: int) -> None: self.add_uint32(Keys.LLM.BLOCK_COUNT.format(arch=self.arch), length) diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 2eca0a4f63d85..8591b7306244d 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -191,8 +191,6 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_SHORTCONV_L_CACHE, "%s.shortconv.l_cache" }, - { LLM_KV_IS_RECURRENT_LAYER, "%s.is_recurrent_layer" }, - { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 73cf7e010c533..c60b28fdde9ed 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -230,8 +230,6 @@ enum llm_kv { LLM_KV_SHORTCONV_L_CACHE, - LLM_KV_IS_RECURRENT_LAYER, - // deprecated: LLM_KV_TOKENIZER_PREFIX_ID, LLM_KV_TOKENIZER_SUFFIX_ID, diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index bc48027862169..bd9e6da8832b7 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -305,11 +305,10 @@ namespace GGUFMeta { case GGUF_TYPE_UINT32: case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same::value) || (std::is_same::value)); break; - case GGUF_TYPE_BOOL: GGML_ASSERT((std::is_same::value)); break; case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same::value)); break; case GGUF_TYPE_STRING: GGML_ASSERT((std::is_same::value)); break; default: - throw std::runtime_error(format("%s is not a string/float32/uint32/int32/bool array", key.c_str())); + throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str())); } if constexpr (std::is_same::value) { @@ -347,11 +346,10 @@ namespace GGUFMeta { case GGUF_TYPE_UINT32: case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same::value) || (std::is_same::value)); break; - case GGUF_TYPE_BOOL: GGML_ASSERT((std::is_same::value)); break; case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same::value)); break; case GGUF_TYPE_STRING: GGML_ASSERT((std::is_same::value)); break; default: - throw std::runtime_error(format("%s is not a string/float32/uint32/int32/bool array", key.c_str())); + throw std::runtime_error(format("%s is not a string/float32/uint32/int32 array", key.c_str())); } if (arr_info.length > N_MAX) { @@ -466,7 +464,6 @@ namespace GGUFMeta { // TODO: this is not very clever - figure out something better template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); - template bool llama_model_loader::get_key_or_arr>(enum llm_kv kid, std::array & result, uint32_t n, bool required); llama_model_loader::llama_model_loader( const std::string & fname, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index bbb6e0d8ef1c9..04574ad966a07 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -498,7 +498,6 @@ void llama_model::load_hparams(llama_model_loader & ml) { hparams.n_head_kv_arr = hparams.n_head_arr; ml.get_key_or_arr(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, hparams.n_layer, false); - ml.get_key_or_arr(LLM_KV_IS_RECURRENT_LAYER, hparams.recurrent_layer_arr, hparams.n_layer, false); bool rope_finetuned = false; ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false); @@ -1630,6 +1629,9 @@ void llama_model::load_hparams(llama_model_loader & ml) { { ml.get_key(LLM_KV_SHORTCONV_L_CACHE, hparams.n_shortconv_l_cache); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + for (uint32_t il = 0; il < hparams.n_layer; ++il) { + hparams.recurrent_layer_arr[il] = hparams.n_head_kv(il) == 0; + } switch (hparams.n_embd) { case 1024: type = LLM_TYPE_350M; break; case 1536: type = LLM_TYPE_700M; break; From 2ddfa277f8f9706642e9918e7cfeaf13e661b7ab Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Thu, 10 Jul 2025 18:27:43 +0200 Subject: [PATCH 4/8] Use layer_types instead of full_attn_idxs --- convert_hf_to_gguf.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 7aa5dd489538b..ffe9bbeb692b8 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6971,7 +6971,10 @@ def _add_feed_forward_length(self): def set_gguf_parameters(self): # set num_key_value_heads only for attention layers - self.hparams["num_key_value_heads"] = [(self.hparams["num_key_value_heads"] if x in self.hparams["full_attn_idxs"] else 0) for x in range(self.block_count)] + self.hparams["num_key_value_heads"] = [ + self.hparams["num_key_value_heads"] if layer_type == "full_attention" else 0 + for layer_type in self.hparams["layer_types"] + ] super().set_gguf_parameters() self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) From b76c0582483fa126777b7bcf35528fbab64b6714 Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Thu, 10 Jul 2025 18:46:27 +0200 Subject: [PATCH 5/8] make flake8 happy --- convert_hf_to_gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ffe9bbeb692b8..ae2a3332f1b8e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6947,6 +6947,7 @@ def set_vocab(self): chat_template = tokenizer.chat_template.replace("[:]", "") self.gguf_writer.add_chat_template(chat_template) + @ModelBase.register("LFM2ForCausalLM") class LFM2Model(TextModel): model_arch = gguf.MODEL_ARCH.LFM2 @@ -6968,7 +6969,6 @@ def _add_feed_forward_length(self): self.gguf_writer.add_feed_forward_length(ff_dim) - def set_gguf_parameters(self): # set num_key_value_heads only for attention layers self.hparams["num_key_value_heads"] = [ From a22c09ac90534de501d9fbcc93cb1bab2de831b9 Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Thu, 10 Jul 2025 21:15:03 +0200 Subject: [PATCH 6/8] Address PR feedback --- ggml/src/ggml-cuda/ssm-conv.cu | 4 ++-- src/llama-model.cpp | 6 +----- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/ggml/src/ggml-cuda/ssm-conv.cu b/ggml/src/ggml-cuda/ssm-conv.cu index 2d84c90782d59..41979733601d2 100644 --- a/ggml/src/ggml-cuda/ssm-conv.cu +++ b/ggml/src/ggml-cuda/ssm-conv.cu @@ -111,7 +111,7 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int ssm_conv_f32<<>>(src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t); } else { - GGML_ABORT("Only support kernel size = 4 now."); + GGML_ABORT("Only support kernel size = 3 or size = 4 right now."); } } else { if (nc == 4) { @@ -125,7 +125,7 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int ssm_conv_long_token_f32<<>>( src0, src1, src0_nb0, src0_nb1, src0_nb2, src1_nb1, dst, dst_nb0, dst_nb1, dst_nb2, n_t); } else { - GGML_ABORT("Only support kernel size = 4 right now."); + GGML_ABORT("Only support kernel size = 3 or size = 4 right now."); } } } diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 04574ad966a07..4253027dbf8a5 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -15671,11 +15671,7 @@ struct llm_build_lfm2 : public llm_graph_context { GGML_ASSERT(hparams.n_shortconv_l_cache > 0); // construct ssm_conv op - struct ggml_tensor * conv_out = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, bx->ne[1], bx->ne[0] - conv->ne[0], bx->ne[2]); - conv_out->op = GGML_OP_SSM_CONV; - conv_out->src[0] = bx; - conv_out->src[1] = conv_kernel; - + ggml_tensor * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel); cb(conv_out, "model.layers.{}.conv.conv", il); auto *y = ggml_mul(ctx0, c, conv_out); From 3cadfcef28e76107a089bd6bf369647e1ae4848e Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Fri, 11 Jul 2025 15:36:03 +0200 Subject: [PATCH 7/8] Address PR feedback 2 --- convert_hf_to_gguf.py | 12 +----- gguf-py/gguf/tensor_mapping.py | 3 ++ src/llama-arch.cpp | 2 +- src/llama-model.cpp | 73 +++++++++++++--------------------- 4 files changed, 34 insertions(+), 56 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 082e65132d795..ce38392b1eef6 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -7113,17 +7113,9 @@ def set_gguf_parameters(self): self._add_feed_forward_length() def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - if 'operator_norm' in name: - name = name.replace('operator_norm', 'norm') - elif 'attention.k_layernorm' in name or 'attention.q_layernorm' in name: - name = name.replace('attention', 'self_attn') - elif name.startswith("model.embedding_norm"): - name = name.replace("model.embedding_norm", 'word_embeddings_layernorm') - elif 'conv.conv' in name: - # conv op requires 2d tensor + # conv op requires 2d tensor + if 'conv.conv' in name: data_torch = data_torch.squeeze(1) - elif 'self_attn.out_proj' in name: - name = name.replace('out_proj', 'o_proj') return [(self.map_tensor_name(name), data_torch)] diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 2fbae6f461448..75855eba52c3c 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -50,6 +50,7 @@ class TensorNameMap: "model.pre_ln", # rwkv7 "model.layers.0.pre_norm", # rwkv7 "backbone.norm", # wavtokenizer + "model.embedding_norm", # lfm2 ), # Position embeddings @@ -136,6 +137,7 @@ class TensorNameMap: "model.layers.{bid}.ln1", # rwkv7 "model.layers.{bid}.input_layernorm", # llama4 "transformer_encoder.{bid}.attention_norm", # neobert + "model.layers.{bid}.operator_norm", # lfm2 ), # Attention norm 2 @@ -220,6 +222,7 @@ class TensorNameMap: "transformer.h.{bid}.self_attention.dense", # falcon "h.{bid}.self_attention.dense", # bloom "model.layers.{bid}.self_attn.o_proj", # llama-hf nemotron olmoe olmo2 phimoe + "model.layers.{bid}.self_attn.out_proj", # lfm2 "model.layers.{bid}.self_attn.linear_attn", # deci "layers.{bid}.attention.wo", # llama-pth "encoder.layer.{bid}.attention.output.dense", # bert diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 262d1739d5a90..e63ab284bc3b5 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -2021,7 +2021,7 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_CONVNEXT_PW1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CONVNEXT_PW2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CONVNEXT_GAMMA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_SHORTCONV_CONV, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}}, {LLM_TENSOR_SHORTCONV_INPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_SHORTCONV_OUTPROJ, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, }; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index b1ab81535f297..a322fc39352e7 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -15917,14 +15917,13 @@ struct llm_build_lfm2 : public llm_graph_context { ggml_tensor * cur = build_inp_embd(model.tok_embd); cb(cur, "model.embed_tokens", -1); - ggml_tensor * inp_pos = build_inp_pos(); - - auto *inp_hybrid = build_inp_mem_hybrid(); + ggml_tensor * inp_pos = build_inp_pos(); + auto * inp_hybrid = build_inp_mem_hybrid(); ggml_tensor * inp_out_ids = build_inp_out_ids(); for (int il = 0; il < n_layer; ++il) { - auto *prev_cur = cur; - cur = lfm2_rms_norm(cur, model.layers[il].attn_norm); + auto * prev_cur = cur; + cur = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "model.layers.{}.operator_norm", il); cur = hparams.is_recurrent(il) ? @@ -15940,12 +15939,12 @@ struct llm_build_lfm2 : public llm_graph_context { cur = ggml_add(ctx0, cur, build_feed_forward(cur, il)); } - cur = lfm2_rms_norm(cur, model.tok_norm); + cur = build_norm(cur, model.tok_norm, NULL, LLM_NORM_RMS, -1); cb(cur, "model.embedding_norm", -1); res->t_embd = cur; // lm_head is tied with embeddings - cur = ggml_mul_mat(ctx0, model.tok_embd, cur); + cur = build_lora_mm(model.tok_embd, cur); cb(cur, "lm_head", -1); res->t_logits = cur; @@ -15953,10 +15952,9 @@ struct llm_build_lfm2 : public llm_graph_context { ggml_build_forward_expand(gf, cur); } - ggml_tensor *build_feed_forward( - ggml_tensor * cur, - int il) const { - cur = lfm2_rms_norm(cur, model.layers[il].ffn_norm); + ggml_tensor * build_feed_forward(ggml_tensor * cur, + int il) const { + cur = build_norm(cur, model.layers[il].ffn_norm, NULL, LLM_NORM_RMS, il); cb(cur, "model.layers.{}.ffn_norm", il); GGML_ASSERT(!model.layers[il].ffn_up_b); @@ -15973,20 +15971,20 @@ struct llm_build_lfm2 : public llm_graph_context { return cur; } - ggml_tensor *build_attn_block(ggml_cgraph *gf, - ggml_tensor *cur, - ggml_tensor *inp_pos, - llm_graph_input_attn_kv_unified *inp_attn, - int il) const { + ggml_tensor * build_attn_block(ggml_cgraph * gf, + ggml_tensor * cur, + ggml_tensor * inp_pos, + llm_graph_input_attn_kv_unified * inp_attn, + int il) const { GGML_ASSERT(hparams.n_embd_v_gqa(il) == hparams.n_embd_k_gqa(il)); auto const n_embd_head = hparams.n_embd_head_v; auto const n_head_kv = hparams.n_head_kv(il); - auto *q = build_lora_mm(model.layers[il].wq, cur); + auto * q = build_lora_mm(model.layers[il].wq, cur); cb(q, "model.layers.{}.self_attn.q_proj", il); - auto *k = build_lora_mm(model.layers[il].wk, cur); + auto * k = build_lora_mm(model.layers[il].wk, cur); cb(k, "model.layers.{}.self_attn.k_proj", il); - auto *v = build_lora_mm(model.layers[il].wv, cur); + auto * v = build_lora_mm(model.layers[il].wv, cur); cb(v, "model.layers.{}.self_attn.v_proj", il); q = ggml_reshape_3d(ctx0, q, n_embd_head, n_head, n_tokens); @@ -15994,9 +15992,9 @@ struct llm_build_lfm2 : public llm_graph_context { v = ggml_reshape_3d(ctx0, v, n_embd_head, n_head_kv, n_tokens); // qk norm - q = lfm2_rms_norm(q, model.layers[il].attn_q_norm); + q = build_norm(q, model.layers[il].attn_q_norm, NULL, LLM_NORM_RMS, il); cb(q, "model.layers.{}.self_attn.q_layernorm", il); - k = lfm2_rms_norm(k, model.layers[il].attn_k_norm); + k = build_norm(k, model.layers[il].attn_k_norm, NULL, LLM_NORM_RMS, il); cb(k, "model.layers.{}.self_attn.k_layernorm", il); // RoPE @@ -16021,21 +16019,21 @@ struct llm_build_lfm2 : public llm_graph_context { ggml_tensor * build_shortconv_block(ggml_cgraph * gf, ggml_tensor * cur, - llm_graph_input_rs *inp_recr, + llm_graph_input_rs * inp_recr, int il) { const auto * mctx_cur = static_cast(mctx)->get_recr(); - auto *bcx = ggml_mul_mat(ctx0, model.layers[il].shortconv.in_proj, cur); + auto * bcx = build_lora_mm(model.layers[il].shortconv.in_proj, cur); cb(bcx, "model.layers.{}.conv.in_proj", il); constexpr auto n_chunks = 3; GGML_ASSERT(bcx->ne[0] % n_chunks == 0); auto const chunk_size = bcx->ne[0] / n_chunks; - auto *b = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 0 * chunk_size * ggml_element_size(bcx)); - auto *c = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 1 * chunk_size * ggml_element_size(bcx)); - auto *x = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 2 * chunk_size * ggml_element_size(bcx)); + auto * b = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 0 * chunk_size * ggml_element_size(bcx)); + auto * c = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 1 * chunk_size * ggml_element_size(bcx)); + auto * x = ggml_view_2d(ctx0, bcx, chunk_size, bcx->ne[1], bcx->nb[1], 2 * chunk_size * ggml_element_size(bcx)); - auto *bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x)); + auto * bx = ggml_transpose(ctx0, ggml_mul(ctx0, b, x)); // read conv state directly, with build_rs generation is slower ggml_tensor * conv_state = mctx_cur->get_r_l(il); @@ -16046,41 +16044,26 @@ struct llm_build_lfm2 : public llm_graph_context { bx = ggml_concat(ctx0, conv, bx, 0); GGML_ASSERT(bx->ne[0] > conv->ne[0]); - auto *new_conv = ggml_view_2d(ctx0, bx, conv->ne[0], bx->ne[1], bx->nb[1], (bx->ne[0] - conv->ne[0]) * ggml_element_size(bx)); + auto * new_conv = ggml_view_2d(ctx0, bx, conv->ne[0], bx->ne[1], bx->nb[1], (bx->ne[0] - conv->ne[0]) * ggml_element_size(bx)); GGML_ASSERT(ggml_are_same_shape(conv, new_conv)); // write conv state ggml_build_forward_expand(gf, ggml_cpy(ctx0, new_conv, conv_state)); - auto *conv_kernel = model.layers[il].shortconv.conv; + auto * conv_kernel = model.layers[il].shortconv.conv; GGML_ASSERT(hparams.n_shortconv_l_cache > 0); // construct ssm_conv op ggml_tensor * conv_out = ggml_ssm_conv(ctx0, bx, conv_kernel); cb(conv_out, "model.layers.{}.conv.conv", il); - auto *y = ggml_mul(ctx0, c, conv_out); + auto * y = ggml_mul(ctx0, c, conv_out); y = build_lora_mm(model.layers[il].shortconv.out_proj, y); cb(y, "model.layers.{}.conv.out_proj", il); return y; } - - // upcast to f32 before rms norm - ggml_tensor *lfm2_rms_norm(ggml_tensor *t, ggml_tensor *w) const { - auto *t_float = t; - if (t_float->type != GGML_TYPE_F32) { - t_float = ggml_cast(ctx0, t, GGML_TYPE_F32); - } - - auto *output = ggml_rms_norm(ctx0, t_float, hparams.f_norm_rms_eps); - if (output->type != t->type) { - output = ggml_cast(ctx0, output, t->type); - } - - return ggml_mul(ctx0, output, w); - } }; llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const { From 92793845cbe1a83b0fdb7084ecd9ad3c1dc1f0de Mon Sep 17 00:00:00 2001 From: Tarek Dakhran Date: Fri, 11 Jul 2025 19:20:47 +0200 Subject: [PATCH 8/8] Support Lfm2ForCausalLM architecture name as well --- convert_hf_to_gguf.py | 1 + 1 file changed, 1 insertion(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index ce38392b1eef6..8afb425b156f2 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -7078,6 +7078,7 @@ def set_vocab(self): self.gguf_writer.add_chat_template(chat_template) +@ModelBase.register("Lfm2ForCausalLM") @ModelBase.register("LFM2ForCausalLM") class LFM2Model(TextModel): model_arch = gguf.MODEL_ARCH.LFM2