@@ -2774,13 +2774,13 @@ kernel void kernel_flash_attn_ext(
     const short NW = N_SIMDWIDTH;
     const short SH = (C + Q); // shared memory per simdgroup in (half)

-    const short T  = D + 2*nsg*SH; // shared memory size per query in (half)
-    const short TF = T/2;          // shared memory size per query in (float)
+    const short T  = D + nsg*SH;   // shared memory size per query in (half)
+    const short TF = T;            // shared memory size per query in (float)
     const short T4 = T/4;          // shared memory size per query in (half4)

-    threadgroup half  * sq  = (threadgroup half  *) (shared +              0*D); // holds the query data
-    threadgroup half4 * sq4 = (threadgroup half4 *) (shared +              0*D); // same as above but in half4
-    threadgroup float * ss  = (threadgroup float *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix
+    threadgroup half  * sq  = (threadgroup half  *) (shared +            0*D); // holds the query data
+    threadgroup half4 * sq4 = (threadgroup half4 *) (shared +            0*D); // same as above but in half4
+    threadgroup half  * ss  = (threadgroup half  *) (shared + sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix

     threadgroup half    * skv  = (threadgroup half    *) (shared + sgitg*(4*16*KV) + Q*T); // scratch buffer to load K and V in shared memory
     threadgroup half4x4 * skv4 = (threadgroup half4x4 *) (shared + sgitg*(4*16*KV) + Q*T); // same as above but in half4x4
@@ -2809,7 +2809,7 @@ kernel void kernel_flash_attn_ext(
     // zero out shared memory SH
     for (short j = 0; j < Q; ++j) {
         for (short i = tiisg; i < SH; i += NW) {
-            ss[j*TF + i] = 0.0f;
+            ss[j*TF + i] = 0.0h;
         }
     }

@@ -2874,7 +2874,7 @@ kernel void kernel_flash_attn_ext(
         // Q*K^T
         {
             for (short cc = 0; cc < C/8; ++cc) {
-                simdgroup_float8x8 mqk = make_filled_simdgroup_matrix<float, 8>(0.h);
+                simdgroup_half8x8  mqk = make_filled_simdgroup_matrix<half,  8>(0.h);

                 if (is_same<block_q, half4x4>::value) {
                     // we can read directly from global memory
@@ -2944,7 +2944,7 @@ kernel void kernel_flash_attn_ext(
             const float m = M[j];

             // scale and apply the logitcap / mask
-            float s = ss[j*TF + tiisg]*scale;
+            float s = ((float)(ss[j*TF + tiisg]))*scale;

             if (logit_softcap != 0.0f) {
                 s = logit_softcap*precise::tanh(s);
@@ -2980,7 +2980,7 @@ kernel void kernel_flash_attn_ext(

             // O = diag(ms)*O
             {
-                simdgroup_float8x8 mm;
+                simdgroup_half8x8 mm;
                 simdgroup_load(mm, ss + C, TF, 0, false);

                 for (short i = 0; i < D8; ++i) {
@@ -2991,7 +2991,7 @@ kernel void kernel_flash_attn_ext(
             // O = O + (Q*K^T)*V
             {
                 for (short cc = 0; cc < C/8; ++cc) {
-                    simdgroup_float8x8 ms;
+                    simdgroup_half8x8 ms;
                     simdgroup_load(ms, ss + 8*cc, TF, 0, false);

                     if (is_same<block_q, half4x4>::value) {
@@ -3103,8 +3103,8 @@ kernel void kernel_flash_attn_ext(
         // O_0 = diag(ms0)*O_0 + diag(ms1)*O_1
         {
             simdgroup_half8x8 t;
-            simdgroup_float8x8 ms0;
-            simdgroup_float8x8 ms1;
+            simdgroup_half8x8 ms0;
+            simdgroup_half8x8 ms1;

             simdgroup_load(ms0, ss + C,         TF, 0, false);
             simdgroup_load(ms1, ss + C + sg*SH, TF, 0, false);
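
Taken together, the hunks above switch the attention scratch buffer ss from float to half, halving its footprint in threadgroup memory: the per-query row shrinks from D + 2*nsg*SH to D + nsg*SH half elements, and TF becomes a stride counted in half units (TF = T) rather than float units (TF = T/2). A minimal sketch of that sizing arithmetic follows, written as plain C++; the values of D, nsg, C, and Q and the names T_before/T_after are illustrative assumptions, not taken from this diff.

// Sketch of the shared-memory sizing before and after this change.
// D, nsg, C, Q are example values only (assumptions, not from the diff).
#include <cstdio>

int main() {
    const short D   = 128;      // head dimension (illustrative)
    const short nsg = 4;        // simdgroups per threadgroup (illustrative)
    const short C   = 32;       // cache items per simdgroup (illustrative)
    const short Q   = 8;        // queries per threadgroup (illustrative)
    const short SH  = C + Q;    // shared memory per simdgroup, in half elements

    const short T_before = D + 2*nsg*SH; // ss as float: 2 halves per element
    const short T_after  = D +   nsg*SH; // ss as half:  1 half  per element

    // each half element is 2 bytes, so the row saves 2*(T_before - T_after) bytes
    std::printf("halves per query row: %d -> %d (saves %d bytes)\n",
                T_before, T_after, 2*(T_before - T_after));
}

With these example values the row goes from 448 to 288 half elements, i.e. 320 bytes saved per query, which is where the removed factor of 2 in T and the TF = T/2 -> TF = T stride change come from.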