 #pragma warning(disable: 4244 4267) // possible loss of data
 #endif

+// tensor names
+#define TN_TOKEN_EMBD  "token_embd.weight"
+#define TN_OUTPUT_NORM "output_norm.weight"
+#define TN_OUTPUT      "output.weight"
+#define TN_ATTN_NORM   "blk.%d.attn_norm.weight"
+#define TN_ATTN_Q      "blk.%d.attn_q.weight"
+#define TN_ATTN_K      "blk.%d.attn_k.weight"
+#define TN_ATTN_V      "blk.%d.attn_v.weight"
+#define TN_ATTN_OUTPUT "blk.%d.attn_output.weight"
+#define TN_FFN_NORM    "blk.%d.ffn_norm.weight"
+#define TN_FFN_GATE    "blk.%d.ffn_gate.weight"
+#define TN_FFN_DOWN    "blk.%d.ffn_down.weight"
+#define TN_FFN_UP      "blk.%d.ffn_up.weight"
+
 static void llama_log_internal        (llama_log_level level, const char * format, ...);
 static void llama_log_callback_default(llama_log_level level, const char * text, void * user_data);
 #define LLAMA_LOG_INFO(...)  llama_log_internal(LLAMA_LOG_LEVEL_INFO , __VA_ARGS__)
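Note: the per-layer TN_* entries above are printf-style templates, not complete tensor names; the block index is filled in at load time (the hunks below use llama.cpp's format() helper for this). A minimal self-contained sketch of that expansion, with a stand-in helper rather than the project's own implementation:

    // minimal sketch: expand a TN_* template with a block index
    // (tn() is a hypothetical stand-in for llama.cpp's format() helper)
    #include <cstdio>
    #include <string>

    static std::string tn(const char * tmpl, int block) {
        char buf[128];
        snprintf(buf, sizeof(buf), tmpl, block);
        return buf;
    }

    // tn("blk.%d.attn_q.weight", 3) -> "blk.3.attn_q.weight"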
@@ -1310,7 +1324,7 @@ static void llama_model_load_internal(

     ml->ggml_ctx = ctx;

-    model.tok_embeddings = ml->get_tensor("token_embd.weight", {n_embd, n_vocab}, GGML_BACKEND_CPU);
+    model.tok_embeddings = ml->get_tensor(TN_TOKEN_EMBD, {n_embd, n_vocab}, GGML_BACKEND_CPU);

     // "output" tensor
     {
@@ -1331,8 +1345,8 @@ static void llama_model_load_internal(
             backend_output = GGML_BACKEND_CPU;
         }

-        model.norm   = ml->get_tensor("output_norm.weight", {n_embd}, backend_norm);
-        model.output = ml->get_tensor("output.weight", {n_embd, n_vocab}, backend_output);
+        model.norm   = ml->get_tensor(TN_OUTPUT_NORM, {n_embd}, backend_norm);
+        model.output = ml->get_tensor(TN_OUTPUT, {n_embd, n_vocab}, backend_output);
         if (backend_norm == GGML_BACKEND_GPU) {
             vram_weights += ggml_nbytes(model.norm);
         }
@@ -1349,21 +1363,18 @@ static void llama_model_load_internal(
         const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT

         auto & layer = model.layers[i];
+        layer.attention_norm = ml->get_tensor(format(TN_ATTN_NORM, i), {n_embd}, backend);

-        std::string layers_i = "blk." + std::to_string(i);
-
-        layer.attention_norm = ml->get_tensor(layers_i + ".attn_norm.weight", {n_embd}, backend);
-
-        layer.wq = ml->get_tensor(layers_i + ".attn_q.weight",      {n_embd, n_embd},     backend_split);
-        layer.wk = ml->get_tensor(layers_i + ".attn_k.weight",      {n_embd, n_embd_gqa}, backend_split);
-        layer.wv = ml->get_tensor(layers_i + ".attn_v.weight",      {n_embd, n_embd_gqa}, backend_split);
-        layer.wo = ml->get_tensor(layers_i + ".attn_output.weight", {n_embd, n_embd},     backend_split);
+        layer.wq = ml->get_tensor(format(TN_ATTN_Q, i),      {n_embd, n_embd},     backend_split);
+        layer.wk = ml->get_tensor(format(TN_ATTN_K, i),      {n_embd, n_embd_gqa}, backend_split);
+        layer.wv = ml->get_tensor(format(TN_ATTN_V, i),      {n_embd, n_embd_gqa}, backend_split);
+        layer.wo = ml->get_tensor(format(TN_ATTN_OUTPUT, i), {n_embd, n_embd},     backend_split);

-        layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend);
+        layer.ffn_norm = ml->get_tensor(format(TN_FFN_NORM, i), {n_embd}, backend);

-        layer.w1 = ml->get_tensor(layers_i + ".ffn_gate.weight", {n_embd,   n_ff}, backend_split);
-        layer.w2 = ml->get_tensor(layers_i + ".ffn_down.weight", {  n_ff, n_embd}, backend_split);
-        layer.w3 = ml->get_tensor(layers_i + ".ffn_up.weight",   {n_embd,   n_ff}, backend_split);
+        layer.w1 = ml->get_tensor(format(TN_FFN_GATE, i), {n_embd,   n_ff}, backend_split);
+        layer.w2 = ml->get_tensor(format(TN_FFN_DOWN, i), {  n_ff, n_embd}, backend_split);
+        layer.w3 = ml->get_tensor(format(TN_FFN_UP, i),   {n_embd,   n_ff}, backend_split);

         if (backend == GGML_BACKEND_GPU) {
             vram_weights +=
@@ -3240,10 +3251,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     int n_attention_wv    = 0;
     int n_feed_forward_w2 = 0;
     for (auto & tensor : model_loader->tensors_map.tensors) {
-        if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+        if (tensor.name.find("attn_v.weight") != std::string::npos) {
             ++n_attention_wv;
         }
-        else if (tensor.name.find("feed_forward.w2.weight") != std::string::npos) {
+        else if (tensor.name.find("ffn_down.weight") != std::string::npos) {
             ++n_feed_forward_w2;
         }
     }
@@ -3298,13 +3309,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         } else {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
-            if (tensor.name == "output.weight") {
+            if (tensor.name == TN_OUTPUT) {
                 int nx = tensor.ne.at(0);
                 int ny = tensor.ne.at(1);
                 if (nx % QK_K == 0 && ny % QK_K == 0) {
                     new_type = GGML_TYPE_Q6_K;
                 }
-            } else if (tensor.name.find("attention.wv.weight") != std::string::npos) {
+            } else if (tensor.name.find("attn_v.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
                 else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
@@ -3319,7 +3330,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                     use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
                 // else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_feed_forward_w2 < n_feed_forward_w2/8) new_type = GGML_TYPE_Q6_K;
                 ++i_feed_forward_w2;
-            } else if (tensor.name.find("attention.wo.weight") != std::string::npos) {
+            } else if (tensor.name.find("attn_output.weight") != std::string::npos) {
                 if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K;
                 else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
             }
@@ -3334,10 +3345,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
         }
         if (convert_incompatible_tensor) {
-            if (tensor.name == "output.weight") {
+            if (tensor.name == TN_OUTPUT) {
                 new_type = GGML_TYPE_F16; // fall back to F16 instead of just failing.
                 LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
-            } else if (tensor.name == "tok_embeddings.weight") {
+            } else if (tensor.name == TN_TOKEN_EMBD) {
                 new_type = GGML_TYPE_Q4_0; // fall back to Q4_0 instead of just failing.
                 LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
             } else {
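Side note on the quantization hunks: the quantizer now keys off the GGUF-style suffixes ("attn_v.weight", "ffn_down.weight", "attn_output.weight") via substring search, so per-block names such as "blk.17.attn_v.weight" still match. A rough illustration of that check (hypothetical helper, not part of the patch):

    // rough sketch of the substring match used to classify tensors by suffix
    #include <string>

    static bool name_matches(const std::string & name, const char * suffix) {
        return name.find(suffix) != std::string::npos;
    }

    // name_matches("blk.17.attn_v.weight",   "attn_v.weight") -> true
    // name_matches("blk.17.ffn_down.weight", "attn_v.weight") -> false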