@@ -659,6 +659,7 @@ struct llama_model_loader {
659659 LLAMA_ASSERT (lt.ne .size () == 1 );
660660 tensor = ggml_new_tensor_1d (ggml_ctx, lt.type , lt.ne .at (0 ));
661661 }
662+ ggml_set_name(tensor, lt.name.c_str());
662663 LLAMA_ASSERT (lt.ggml_tensor == NULL ); // if this fails, we called get_tensor twice on the same tensor
663664 lt.ggml_tensor = tensor;
664665 num_ggml_tensors_created++;
@@ -798,6 +799,8 @@ static bool kv_cache_init(
798799
799800 cache.k = ggml_new_tensor_1d (cache.ctx , wtype, n_elements);
800801 cache.v = ggml_new_tensor_1d (cache.ctx , wtype, n_elements);
802+ ggml_set_name(cache.k, "cache_k");
803+ ggml_set_name(cache.v, "cache_v");
801804
802805 return true ;
803806}
@@ -1084,6 +1087,7 @@ static bool llama_eval_internal(
10841087 gf.n_threads = N >= 32 && ggml_cpu_has_blas () && !ggml_cpu_has_gpublas () ? 1 : n_threads;
10851088
10861089 struct ggml_tensor * embd = ggml_new_tensor_1d (ctx0, GGML_TYPE_I32, N);
1090+ ggml_set_name(embd, "embd");
10871091 memcpy (embd->data , tokens, N*ggml_element_size (embd));
10881092
10891093 struct ggml_tensor * inpL = ggml_get_rows (ctx0, model.tok_embeddings , embd);
@@ -1110,6 +1114,8 @@ static bool llama_eval_internal(
11101114 // compute Q and K and RoPE them
11111115 struct ggml_tensor * Qcur = ggml_rope (ctx0, ggml_reshape_3d (ctx0, ggml_mul_mat (ctx0, model.layers [il].wq , cur), n_embd/n_head, n_head, N), n_past, n_rot, 0 );
11121116 struct ggml_tensor * Kcur = ggml_rope (ctx0, ggml_reshape_3d (ctx0, ggml_mul_mat (ctx0, model.layers [il].wk , cur), n_embd/n_head, n_head, N), n_past, n_rot, 0 );
1117+ ggml_set_name(Qcur, "Qcur");
1118+ ggml_set_name(Kcur, "Kcur");
11131119
11141120 // store key and value to memory
11151121 {
@@ -1130,28 +1136,34 @@ static bool llama_eval_internal(
11301136 ggml_permute (ctx0,
11311137 Qcur,
11321138 0 , 2 , 1 , 3 );
1139+ ggml_set_name(Q, "Q");
11331140
11341141 struct ggml_tensor * K =
11351142 ggml_permute (ctx0,
11361143 ggml_reshape_3d (ctx0,
11371144 ggml_view_1d (ctx0, kv_self.k , (n_past + N)*n_embd, il*n_ctx*ggml_element_size (kv_self.k )*n_embd),
11381145 n_embd/n_head, n_head, n_past + N),
11391146 0 , 2 , 1 , 3 );
1147+ ggml_set_name(K, "K");
11401148
11411149 // K * Q
11421150 struct ggml_tensor * KQ = ggml_mul_mat (ctx0, K, Q);
1151+ ggml_set_name(KQ, "KQ");
11431152
11441153 // KQ_scaled = KQ / sqrt(n_embd/n_head)
1145- struct ggml_tensor * KQ_scaled =
1146-     ggml_scale(ctx0,
1147-         KQ,
1148-         ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)));
1154+ struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
1155+ ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
1156+
1157+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
1158+ ggml_set_name(KQ_scaled, "KQ_scaled");
11491159
11501160 // KQ_masked = mask_past(KQ_scaled)
11511161 struct ggml_tensor * KQ_masked = ggml_diag_mask_inf (ctx0, KQ_scaled, n_past);
1162+ ggml_set_name(KQ_masked, "KQ_masked");
11521163
11531164 // KQ = soft_max(KQ_masked)
11541165 struct ggml_tensor * KQ_soft_max = ggml_soft_max (ctx0, KQ_masked);
1166+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
11551167
11561168 // split cached V into n_head heads
11571169 struct ggml_tensor * V =
@@ -1160,9 +1172,11 @@ static bool llama_eval_internal(
11601172 n_ctx*ggml_element_size (kv_self.v ),
11611173 n_ctx*ggml_element_size (kv_self.v )*n_embd/n_head,
11621174 il*n_ctx*ggml_element_size (kv_self.v )*n_embd);
1175+ ggml_set_name(V, "V");
11631176
11641177#if 1
11651178 struct ggml_tensor * KQV = ggml_mul_mat (ctx0, V, KQ_soft_max);
1179+ ggml_set_name(KQV, "KQV");
11661180#else
11671181 // make V contiguous in memory to speed up the matmul, however we waste time on the copy
11681182 // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
@@ -1173,11 +1187,13 @@ static bool llama_eval_internal(
11731187
11741188 // KQV_merged = KQV.permute(0, 2, 1, 3)
11751189 struct ggml_tensor * KQV_merged = ggml_permute (ctx0, KQV, 0 , 2 , 1 , 3 );
1190+ ggml_set_name(KQV_merged, "KQV_merged");
11761191
11771192 // cur = KQV_merged.contiguous().view(n_embd, N)
11781193 cur = ggml_cpy (ctx0,
11791194 KQV_merged,
11801195 ggml_new_tensor_2d (ctx0, GGML_TYPE_F32, n_embd, N));
1196+ ggml_set_name(cur, "KQV_merged_contiguous");
11811197
11821198 // projection (no bias)
11831199 cur = ggml_mul_mat (ctx0,
0 commit comments