@@ -266,8 +266,13 @@ struct LoraModel : public GGMLRunner {
         float scale_value = 1.0f;
         std::string fk = lora_pre[type] + key;
         if (lora_tensors.find(fk + ".hada_w1_a") != lora_tensors.end()) {
-            // loHa mode
+            // LoHa mode
 
+            // TODO: split qkv convention for LoHas (is it ever used?)
+            if (is_qkv_split || is_qkvm_split) {
+                LOG_ERROR("Split qkv isn't supported for LoHa models.");
+                break;
+            }
             std::string alpha_name = "";
 
             ggml_tensor* hada_1_mid = NULL;  // tau for tucker decomposition
@@ -286,11 +291,6 @@ struct LoraModel : public GGMLRunner {
             std::string hada_2_down_name = "";
             std::string hada_2_up_name = "";
 
-            // TODO: split qkv convention for LoHas (is it ever used?)
-            if (is_qkv_split || is_qkvm_split) {
-                LOG_ERROR("Split qkv isn't supported for LoHa models.");
-                break;
-            }
 
             hada_1_down_name = fk + ".hada_w1_b";
             hada_1_up_name = fk + ".hada_w1_a";
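For readers unfamiliar with this branch: as I understand the LoHa convention, the adapter stores two low-rank pairs, and the merged delta weight is the element-wise (Hadamard) product of the two reconstructed matrices, scaled by alpha / rank (the optional tucker "mid" factors are omitted here). A rough standalone sketch of that arithmetic using plain vectors instead of ggml tensors; every shape and value below is illustrative, not taken from the code:

```cpp
#include <cstdio>
#include <vector>

// Naive row-major matmul: C (n x m) = A (n x r) * B (r x m).
static std::vector<float> matmul(const std::vector<float>& A, const std::vector<float>& B,
                                 int n, int r, int m) {
    std::vector<float> C(n * m, 0.0f);
    for (int i = 0; i < n; i++)
        for (int k = 0; k < r; k++)
            for (int j = 0; j < m; j++)
                C[i * m + j] += A[i * r + k] * B[k * m + j];
    return C;
}

int main() {
    // Illustrative shapes: the delta weight is 4x6, both low-rank pairs have rank 2.
    int n = 4, m = 6, rank = 2;
    float alpha = 1.0f;  // illustrative; normally read from the ".alpha" tensor

    std::vector<float> w1_a(n * rank, 0.1f), w1_b(rank * m, 0.2f);  // first pair
    std::vector<float> w2_a(n * rank, 0.3f), w2_b(rank * m, 0.4f);  // second pair

    std::vector<float> h1 = matmul(w1_a, w1_b, n, rank, m);
    std::vector<float> h2 = matmul(w2_a, w2_b, n, rank, m);

    float scale = alpha / rank;  // same alpha / rank convention as in the diff
    std::vector<float> delta(n * m);
    for (int i = 0; i < n * m; i++)
        delta[i] = h1[i] * h2[i] * scale;  // Hadamard (element-wise) product

    printf("delta[0] = %f\n", delta[0]);
    return 0;
}
```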
@@ -340,12 +340,20 @@ struct LoraModel : public GGMLRunner {
 
             // calc_scale
             // TODO: .dora_scale?
-            int64_t dim = hada_1_down->ne[ggml_n_dims(hada_1_down) - 1];
+            int64_t rank = hada_1_down->ne[ggml_n_dims(hada_1_down) - 1];
             if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
                 float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
-                scale_value = alpha / dim;
+                scale_value = alpha / rank;
             }
         } else if (lora_tensors.find(fk + ".lokr_w1") != lora_tensors.end() || lora_tensors.find(fk + ".lokr_w1_a") != lora_tensors.end()) {
+            // LoKr mode
+
+            // TODO: split qkv convention for LoKrs (is it ever used?)
+            if (is_qkv_split || is_qkvm_split) {
+                LOG_ERROR("Split qkv isn't supported for LoKr models.");
+                break;
+            }
+
             std::string alpha_name = fk + ".alpha";
 
             ggml_tensor* lokr_w1 = NULL;
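The `dim` to `rank` rename is cosmetic, but it is worth spelling out what the expression reads: for a 2D down/`_b` factor, ggml stores `ne[0]` as the innermost (input) dimension and the outermost dimension `ne[ggml_n_dims(t) - 1]` as the adapter rank, and the scale stays at 1.0f when no `.alpha` tensor is present. A small sketch of just that computation, assuming ggml is available to link against; the shapes and the alpha value are made up:

```cpp
#include <cstdio>
#include "ggml.h"

int main() {
    // Metadata-only context: no_alloc = true, so no tensor data is actually allocated.
    struct ggml_init_params params = {/*mem_size*/ 1024 * 1024, /*mem_buffer*/ NULL, /*no_alloc*/ true};
    struct ggml_context* ctx = ggml_init(params);

    // A "down" / "_b" factor of logical shape (rank x in_features): ne[0] = in_features,
    // ne[1] = rank. The 768 / 16 shapes here are illustrative only.
    struct ggml_tensor* down = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 768, 16);

    // Mirrors the expression in the diff: the outermost dimension is the adapter rank.
    int64_t rank = down->ne[ggml_n_dims(down) - 1];

    float scale_value = 1.0f;  // default when no ".alpha" tensor is stored
    float alpha = 8.0f;        // illustrative; normally read via ggml_backend_tensor_get_f32
    scale_value = alpha / rank;

    printf("rank = %lld, scale = %f\n", (long long)rank, scale_value);
    ggml_free(ctx);
    return 0;
}
```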
@@ -354,12 +362,6 @@ struct LoraModel : public GGMLRunner {
             std::string lokr_w1_name = "";
             std::string lokr_w2_name = "";
 
-            // TODO: split qkv convention for LoKrs (is it ever used?)
-            if (is_qkv_split || is_qkvm_split) {
-                LOG_ERROR("Split qkv isn't supported for LoKr models.");
-                break;
-            }
-
             lokr_w1_name = fk + ".lokr_w1";
             lokr_w2_name = fk + ".lokr_w2";
 
@@ -372,14 +374,14 @@ struct LoraModel : public GGMLRunner {
                 std::string down_name = lokr_w1_name + "_b";
                 std::string up_name = lokr_w1_name + "_a";
                 if (lora_tensors.find(down_name) != lora_tensors.end()) {
+                    // w1 should not be low rank normally, sometimes w1 and w2 are swapped
                     down = to_f32(compute_ctx, lora_tensors[down_name]);
                     applied_lora_tensors.insert(down_name);
 
-                    // scale != 1 only when using Low rank form (?)
-                    int64_t dim = down->ne[ggml_n_dims(down) - 1];
+                    int64_t rank = down->ne[ggml_n_dims(down) - 1];
                     if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
                         float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
-                        scale_value = alpha / dim;
+                        scale_value = alpha / rank;
                     }
                 }
                 if (lora_tensors.find(up_name) != lora_tensors.end()) {
@@ -399,18 +401,25 @@ struct LoraModel : public GGMLRunner {
                 if (lora_tensors.find(down_name) != lora_tensors.end()) {
                     down = to_f32(compute_ctx, lora_tensors[down_name]);
                     applied_lora_tensors.insert(down_name);
+
+                    int64_t rank = down->ne[ggml_n_dims(down) - 1];
+                    if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
+                        float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
+                        scale_value = alpha / rank;
+                    }
                 }
                 if (lora_tensors.find(up_name) != lora_tensors.end()) {
                     up = to_f32(compute_ctx, lora_tensors[up_name]);
                     applied_lora_tensors.insert(up_name);
                 }
                 lokr_w2 = ggml_merge_lora(compute_ctx, down, up);
             }
+
+            // Technically it might be unused, but I believe it's the expected behavior
+            applied_lora_tensors.insert(alpha_name);
 
             updown = ggml_kronecker(compute_ctx, lokr_w1, lokr_w2);
 
-            // TODO: double check alpha implementation, it seems strange to not use them most of the time
-            applied_lora_tensors.insert(alpha_name);
         } else {
             // LoRA mode
             ggml_tensor* lora_mid = NULL;  // tau for tucker decomposition
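For LoKr, the delta weight is the Kronecker product of the two factors; each factor may itself be stored as a low-rank `_a`/`_b` pair and is reconstructed first, which is where the alpha / rank scale added above comes in. A standalone sketch of what a Kronecker product computes, independent of `ggml_kronecker`; shapes and values are illustrative:

```cpp
#include <cstdio>
#include <vector>

// Naive Kronecker product: for A (p x q) and B (r x s), the result is (p*r) x (q*s),
// with block (i, j) equal to A[i][j] * B.
static std::vector<float> kronecker(const std::vector<float>& A, int p, int q,
                                    const std::vector<float>& B, int r, int s) {
    std::vector<float> K(p * r * q * s);
    for (int i = 0; i < p; i++)
        for (int j = 0; j < q; j++)
            for (int k = 0; k < r; k++)
                for (int l = 0; l < s; l++)
                    K[(i * r + k) * (q * s) + (j * s + l)] = A[i * q + j] * B[k * s + l];
    return K;
}

int main() {
    // Illustrative LoKr factors: a 4x6 delta weight factored as (2x2) kron (2x3).
    std::vector<float> w1 = {1, 2, 3, 4};        // 2x2
    std::vector<float> w2 = {1, 0, 1, 0, 1, 0};  // 2x3
    std::vector<float> delta = kronecker(w1, 2, 2, w2, 2, 3);
    printf("delta is 4x6, delta[0] = %f, delta[5] = %f\n", delta[0], delta[5]);
    return 0;
}
```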
@@ -770,12 +779,12 @@ struct LoraModel : public GGMLRunner {
             }
             // calc_scale
             // TODO: .dora_scale?
-            int64_t dim = lora_down->ne[ggml_n_dims(lora_down) - 1];
+            int64_t rank = lora_down->ne[ggml_n_dims(lora_down) - 1];
             if (lora_tensors.find(scale_name) != lora_tensors.end()) {
                 scale_value = ggml_backend_tensor_get_f32(lora_tensors[scale_name]);
             } else if (lora_tensors.find(alpha_name) != lora_tensors.end()) {
                 float alpha = ggml_backend_tensor_get_f32(lora_tensors[alpha_name]);
-                scale_value = alpha / dim;
+                scale_value = alpha / rank;
             }
 
             updown = ggml_merge_lora(compute_ctx, lora_down, lora_up, lora_mid);
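Same rename in the plain LoRA path. The precedence the code keeps is worth noting: a stored `.scale` tensor wins outright, otherwise the scale falls back to alpha / rank, and it stays at 1.0f if neither is present. A minimal standalone illustration of that precedence; the tensor names and values are illustrative stand-ins, not the real lora_tensors map:

```cpp
#include <cstdio>
#include <map>
#include <string>

int main() {
    // Stand-in for the lora_tensors map: which auxiliary scalars exist, and their values.
    // Only ".alpha" is present here, so the alpha / rank fallback is taken.
    std::map<std::string, float> tensors = {{"lora.key.alpha", 4.0f}};

    const int64_t rank = 2;      // last ggml dimension of lora_down
    float scale_value  = 1.0f;   // default when neither tensor is present

    if (tensors.count("lora.key.scale")) {
        scale_value = tensors["lora.key.scale"];          // explicit scale wins
    } else if (tensors.count("lora.key.alpha")) {
        scale_value = tensors["lora.key.alpha"] / rank;   // fall back to alpha / rank
    }

    printf("scale_value = %f\n", scale_value);  // 4.0 / 2 = 2.0 here
    return 0;
}
```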