@@ -2848,10 +2848,17 @@ static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void
28482848 float sumf = 0.0 ;
28492849
28502850#if defined(__ARM_NEON )
2851+ const int ahead = 80 ;
28512852 float32x4_t sumv0 = vdupq_n_f32 (0.0f );
28522853 float32x4_t sumv1 = vdupq_n_f32 (0.0f );
28532854
28542855 for (int i = 0 ; i < nb /2 ; i ++ ) {
2856+ __builtin_prefetch (& xqs [i * QK4_0 + 64 * ahead ]);
2857+ __builtin_prefetch (& yqs [2 * i * QK8_0C + 64 * ahead ]);
2858+ __builtin_prefetch (& yqs [2 * i * QK8_0C + 64 * ahead + 64 ]);
2859+ __builtin_prefetch (& xds [2 * i + 64 /4 * ahead ]);
2860+ __builtin_prefetch (& yds [2 * i + 64 /4 * ahead ]);
2861+
28552862 const int dst0 = i + i /2 * 2 ; // 0, 1, 4, 5, 8, 9, ...
28562863 const int dst1 = i + i /2 * 2 + 2 ; // 2, 3, 6, 7, 10, 11 ...
28572864
@@ -2910,9 +2917,15 @@ static void ggml_vec_dot_q4_0c_q8_0c(const int n, float * restrict s, const void
29102917 sumf = vaddvq_f32 (sumv0 ) + vaddvq_f32 (sumv1 );
29112918
29122919#elif defined(__AVX512F__ )
2920+ const int ahead = 64 ;
29132921 // Initialize accumulator with zeros
29142922 __m512 acc = _mm512_setzero_ps ();
29152923 for (int i = 0 ; i < nb ; i += 4 ) {
2924+ _mm_prefetch (xqs + i * QK4_0 /2 + 64 * ahead , _MM_HINT_T0 );
2925+ _mm_prefetch (yqs + i * QK8_0 + 64 * ahead , _MM_HINT_T0 );
2926+ _mm_prefetch (yqs + i * QK8_0 + 64 * ahead + 64 , _MM_HINT_T0 );
2927+ _mm_prefetch (xds + i + 64 /4 * ahead , _MM_HINT_T0 );
2928+ _mm_prefetch (yds + i + 64 /4 * ahead , _MM_HINT_T0 );
29162929 acc = dot_q4_0c_fourblocks_avx512 (acc , xqs + i * QK4_0 /2 , xds + i , yqs + i * QK8_0 , yds + i );
29172930 }
29182931 // Horizontal sum of all lanes of the accumulator
0 commit comments