From 499e9f2f494e16ffdf377c214b2af122500f1349 Mon Sep 17 00:00:00 2001
From: Eve <139727413+netrunnereve@users.noreply.github.com>
Date: Wed, 30 Oct 2024 21:42:22 -0400
Subject: [PATCH 1/8] q6_k instruction reordering attempt

---
 ggml/src/ggml-quants.c | 101 ++++++++++++++++++++---------------------
 1 file changed, 50 insertions(+), 51 deletions(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 7aa6dce8907f5..30a2241dd1e3c 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -9107,9 +9107,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
     const __m128i m4 = _mm_set1_epi8(0xF);
     const __m128i m3 = _mm_set1_epi8(3);
     const __m128i m32s = _mm_set1_epi8(32);
-    const __m128i m2 = _mm_set1_epi8(2);
 
-    __m256 acc = _mm256_setzero_ps();
+    __m256 acc1 = _mm256_setzero_ps();
+    __m256 acc2 = _mm256_setzero_ps();
 
     for (int i = 0; i < nb; ++i) {
@@ -9123,6 +9123,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
         __m128i sumi_0 = _mm_setzero_si128();
         __m128i sumi_1 = _mm_setzero_si128();
+        __m128i sumi_2 = _mm_setzero_si128();
+        __m128i sumi_3 = _mm_setzero_si128();
 
         __m128i shuffle = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
         for (int j = 0; j < QK_K/128; ++j) {
@@ -9132,93 +9134,90 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
             const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4);
             const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4);
-            const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 2), m3), 4);
-            const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 2), m3), 4);
             const __m128i q4h_4 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 4), m3), 4);
             const __m128i q4h_5 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 4), m3), 4);
-            const __m128i q4h_6 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 6), m3), 4);
-            const __m128i q4h_7 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 6), m3), 4);
 
             const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
             const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
-            const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
-            const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
 
             const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m4), q4h_0);
             const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m4), q4h_1);
-            const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m4), q4h_2);
-            const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m4), q4h_3);
             const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m4), q4h_4);
             const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m4), q4h_5);
-            const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m4), q4h_6);
-            const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m4), q4h_7);
-
             const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 48;
             const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-
+            const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 -= 48;
 
             __m128i q8s_0 = _mm_maddubs_epi16(m32s, q8_0);
             __m128i q8s_1 = _mm_maddubs_epi16(m32s, q8_1);
-            __m128i q8s_2 = _mm_maddubs_epi16(m32s, q8_2);
-            __m128i q8s_3 = _mm_maddubs_epi16(m32s, q8_3);
             __m128i q8s_4 = _mm_maddubs_epi16(m32s, q8_4);
             __m128i q8s_5 = _mm_maddubs_epi16(m32s, q8_5);
-            __m128i q8s_6 = _mm_maddubs_epi16(m32s, q8_6);
-            __m128i q8s_7 = _mm_maddubs_epi16(m32s, q8_7);
-
             __m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0);
             __m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1);
-            __m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2);
-            __m128i p16_3 = _mm_maddubs_epi16(q4_3, q8_3);
             __m128i p16_4 = _mm_maddubs_epi16(q4_4, q8_4);
             __m128i p16_5 = _mm_maddubs_epi16(q4_5, q8_5);
-            __m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6);
-            __m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7);
-
             p16_0 = _mm_sub_epi16(p16_0, q8s_0);
             p16_1 = _mm_sub_epi16(p16_1, q8s_1);
-            p16_2 = _mm_sub_epi16(p16_2, q8s_2);
-            p16_3 = _mm_sub_epi16(p16_3, q8s_3);
             p16_4 = _mm_sub_epi16(p16_4, q8s_4);
             p16_5 = _mm_sub_epi16(p16_5, q8s_5);
-            p16_6 = _mm_sub_epi16(p16_6, q8s_6);
-            p16_7 = _mm_sub_epi16(p16_7, q8s_7);
 
             const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi8(shuffle, m2);
-            const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi8(shuffle, m2);
-            const __m128i scale_2 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi8(shuffle, m2);
-            const __m128i scale_3 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi8(shuffle, m2);
-
+            const __m128i scale_2 = _mm_shuffle_epi8(scales, _mm_add_epi8(shuffle, _mm_set1_epi8(4)));
             p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
             p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1);
-            p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
-            p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3);
             p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4);
             p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_2, scale_2)), p16_5);
+
+            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_1));
+            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_4, p16_5));
+
+            const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
+            const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
+            const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 2), m3), 4);
+            const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 2), m3), 4);
+            const __m128i q4h_6 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 6), m3), 4);
+            const __m128i q4h_7 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 6), m3), 4);
+            const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m4), q4h_2);
+            const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m4), q4h_3);
+            const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m4), q4h_6);
+            const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m4), q4h_7);
+
+            const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 48;
+            const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            __m128i q8s_2 = _mm_maddubs_epi16(m32s, q8_2);
+            __m128i q8s_3 = _mm_maddubs_epi16(m32s, q8_3);
+            __m128i q8s_6 = _mm_maddubs_epi16(m32s, q8_6);
+            __m128i q8s_7 = _mm_maddubs_epi16(m32s, q8_7);
+            __m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2);
+            __m128i p16_3 = _mm_maddubs_epi16(q4_3, q8_3);
+            __m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6);
+            __m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7);
+            p16_2 = _mm_sub_epi16(p16_2, q8s_2);
+            p16_3 = _mm_sub_epi16(p16_3, q8s_3);
+            p16_6 = _mm_sub_epi16(p16_6, q8s_6);
+            p16_7 = _mm_sub_epi16(p16_7, q8s_7);
+
+            const __m128i scale_1 = _mm_shuffle_epi8(scales, _mm_add_epi8(shuffle, _mm_set1_epi8(2)));
+            const __m128i scale_3 = _mm_shuffle_epi8(scales, _mm_add_epi8(shuffle, _mm_set1_epi8(6)));
+            p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
+            p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3);
             p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6);
             p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_3, scale_3)), p16_7);
 
-            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
-            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
-            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_4, p16_6));
-            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_5, p16_7));
+            sumi_2 = _mm_add_epi32(sumi_2, _mm_add_epi32(p16_2, p16_3));
+            sumi_3 = _mm_add_epi32(sumi_3, _mm_add_epi32(p16_6, p16_7));
 
         }
 
-        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
+        __m256i sumi1 = MM256_SET_M128I(sumi_0, sumi_1);
+        __m256i sumi2 = MM256_SET_M128I(sumi_2, sumi_3);
+        acc1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi1)), acc1);
+        acc2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi2)), acc2);
     }
 
-    *s = hsum_float_8(acc);
+    *s = hsum_float_8(_mm256_add_ps(acc1, acc2));
 
 #elif defined __riscv_v_intrinsic
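Patch 1 splits each 128-value chunk into two independent halves, each with its own q4/q8 loads, products, and sumi accumulator pair, so the second half's work can start before the first half's dependency chain retires. For orientation, here is a scalar sketch of the value every SIMD lane is computing at this stage (q6k_weight is a hypothetical helper for illustration, not code from the file):

    #include <stdint.h>

    // Rebuild one q6_K weight from its packed parts: the low 4 bits come from
    // the ql array, the high 2 bits from qh. Stored values are biased by +32,
    // so the true weight lies in [-32, 31].
    static inline int8_t q6k_weight(uint8_t ql_nibble, uint8_t qh_2bits) {
        return (int8_t)((ql_nibble & 0x0F) | ((qh_2bits & 0x03) << 4)) - 32;
    }

At this point the -32 bias is still handled inside the dot product, via _mm_maddubs_epi16(m32s, q8_x) followed by a 16-bit subtract; later patches in the series rework that.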
From e3a34321c44da0f5e0d67862b1d741b074e9b691 Mon Sep 17 00:00:00 2001
From: Eve <139727413+netrunnereve@users.noreply.github.com>
Date: Wed, 30 Oct 2024 21:43:22 -0400
Subject: [PATCH 2/8] better subtract method

---
 ggml/src/ggml-quants.c | 115 ++++++++++++++++++-----------------------
 1 file changed, 49 insertions(+), 66 deletions(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 30a2241dd1e3c..c202775b8ab42 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -9107,9 +9107,9 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
     const __m128i m4 = _mm_set1_epi8(0xF);
     const __m128i m3 = _mm_set1_epi8(3);
     const __m128i m32s = _mm_set1_epi8(32);
+    const __m128i m2 = _mm_set1_epi8(2);
 
-    __m256 acc1 = _mm256_setzero_ps();
-    __m256 acc2 = _mm256_setzero_ps();
+    __m256 acc = _mm256_setzero_ps();
 
     for (int i = 0; i < nb; ++i) {
@@ -9123,8 +9123,6 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
         __m128i sumi_0 = _mm_setzero_si128();
         __m128i sumi_1 = _mm_setzero_si128();
-        __m128i sumi_2 = _mm_setzero_si128();
-        __m128i sumi_3 = _mm_setzero_si128();
 
         __m128i shuffle = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
         for (int j = 0; j < QK_K/128; ++j) {
@@ -9134,90 +9132,75 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
             const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4);
             const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4);
+            const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 2), m3), 4);
+            const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 2), m3), 4);
             const __m128i q4h_4 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 4), m3), 4);
             const __m128i q4h_5 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 4), m3), 4);
+            const __m128i q4h_6 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 6), m3), 4);
+            const __m128i q4h_7 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 6), m3), 4);
 
             const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
             const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
+            const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
+            const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
+
+            const __m128i q4_0 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits1_0, m4), q4h_0), m32s);
+            const __m128i q4_1 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits1_1, m4), q4h_1), m32s);
+            const __m128i q4_2 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits2_0, m4), q4h_2), m32s);
+            const __m128i q4_3 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits2_1, m4), q4h_3), m32s);
+            const __m128i q4_4 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m4), q4h_4), m32s);
+            const __m128i q4_5 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m4), q4h_5), m32s);
+            const __m128i q4_6 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m4), q4h_6), m32s);
+            const __m128i q4_7 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m4), q4h_7), m32s);
 
-            const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m4), q4h_0);
-            const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m4), q4h_1);
-            const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m4), q4h_4);
-            const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m4), q4h_5);
             const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 48;
+            const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
             const __m128i q8_4 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 -= 48;
-            __m128i q8s_0 = _mm_maddubs_epi16(m32s, q8_0);
-            __m128i q8s_1 = _mm_maddubs_epi16(m32s, q8_1);
-            __m128i q8s_4 = _mm_maddubs_epi16(m32s, q8_4);
-            __m128i q8s_5 = _mm_maddubs_epi16(m32s, q8_5);
-            __m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0);
-            __m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1);
-            __m128i p16_4 = _mm_maddubs_epi16(q4_4, q8_4);
-            __m128i p16_5 = _mm_maddubs_epi16(q4_5, q8_5);
-            p16_0 = _mm_sub_epi16(p16_0, q8s_0);
-            p16_1 = _mm_sub_epi16(p16_1, q8s_1);
-            p16_4 = _mm_sub_epi16(p16_4, q8s_4);
-            p16_5 = _mm_sub_epi16(p16_5, q8s_5);
+            const __m128i q8_5 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+            const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
+
+            __m128i p16_0 = mul_add_epi8_sse(q4_0, q8_0);
+            __m128i p16_1 = mul_add_epi8_sse(q4_1, q8_1);
+            __m128i p16_2 = mul_add_epi8_sse(q4_2, q8_2);
+            __m128i p16_3 = mul_add_epi8_sse(q4_3, q8_3);
+            __m128i p16_4 = mul_add_epi8_sse(q4_4, q8_4);
+            __m128i p16_5 = mul_add_epi8_sse(q4_5, q8_5);
+            __m128i p16_6 = mul_add_epi8_sse(q4_6, q8_6);
+            __m128i p16_7 = mul_add_epi8_sse(q4_7, q8_7);
 
             const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle);
-            const __m128i scale_2 = _mm_shuffle_epi8(scales, _mm_add_epi8(shuffle, _mm_set1_epi8(4)));
+            shuffle = _mm_add_epi8(shuffle, m2);
+            const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle);
+            shuffle = _mm_add_epi8(shuffle, m2);
+            const __m128i scale_2 = _mm_shuffle_epi8(scales, shuffle);
+            shuffle = _mm_add_epi8(shuffle, m2);
+            const __m128i scale_3 = _mm_shuffle_epi8(scales, shuffle);
+            shuffle = _mm_add_epi8(shuffle, m2);
+
             p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
             p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1);
-            p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4);
-            p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_2, scale_2)), p16_5);
-
-            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_1));
-            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_4, p16_5));
-
-            const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
-            const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
-            const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 2), m3), 4);
-            const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 2), m3), 4);
-            const __m128i q4h_6 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 6), m3), 4);
-            const __m128i q4h_7 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 6), m3), 4);
-            const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m4), q4h_2);
-            const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m4), q4h_3);
-            const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m4), q4h_6);
-            const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m4), q4h_7);
-
-            const __m128i q8_2 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_3 = _mm_loadu_si128((const __m128i*)q8); q8 += 48;
-            const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
-            __m128i q8s_2 = _mm_maddubs_epi16(m32s, q8_2);
-            __m128i q8s_3 = _mm_maddubs_epi16(m32s, q8_3);
-            __m128i q8s_6 = _mm_maddubs_epi16(m32s, q8_6);
-            __m128i q8s_7 = _mm_maddubs_epi16(m32s, q8_7);
-            __m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2);
-            __m128i p16_3 = _mm_maddubs_epi16(q4_3, q8_3);
-            __m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6);
-            __m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7);
-            p16_2 = _mm_sub_epi16(p16_2, q8s_2);
-            p16_3 = _mm_sub_epi16(p16_3, q8s_3);
-            p16_6 = _mm_sub_epi16(p16_6, q8s_6);
-            p16_7 = _mm_sub_epi16(p16_7, q8s_7);
-
-            const __m128i scale_1 = _mm_shuffle_epi8(scales, _mm_add_epi8(shuffle, _mm_set1_epi8(2)));
-            const __m128i scale_3 = _mm_shuffle_epi8(scales, _mm_add_epi8(shuffle, _mm_set1_epi8(6)));
             p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
             p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3);
+            p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4);
+            p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_2, scale_2)), p16_5);
             p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6);
             p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_3, scale_3)), p16_7);
 
-            sumi_2 = _mm_add_epi32(sumi_2, _mm_add_epi32(p16_2, p16_3));
-            sumi_3 = _mm_add_epi32(sumi_3, _mm_add_epi32(p16_6, p16_7));
+            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
+            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
+            sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_4, p16_6));
+            sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_5, p16_7));
 
         }
 
-        __m256i sumi1 = MM256_SET_M128I(sumi_0, sumi_1);
-        __m256i sumi2 = MM256_SET_M128I(sumi_2, sumi_3);
-        acc1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi1)), acc1);
-        acc2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi2)), acc2);
+        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
     }
 
-    *s = hsum_float_8(_mm256_add_ps(acc1, acc2));
+    *s = hsum_float_8(acc);
 
 #elif defined __riscv_v_intrinsic
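Instead of computing q8s_x bias terms and subtracting them from every 16-bit product, patch 2 subtracts 32 from the unpacked weights directly with _mm_sub_epi8 and multiplies with mul_add_epi8_sse, a signed 8-bit dot helper defined elsewhere in ggml-quants.c. _mm_maddubs_epi16 requires an unsigned first operand, so the standard SSSE3 pattern (and roughly what that helper does) moves the sign of x onto y first:

    #include <tmmintrin.h> // SSSE3

    // Sketch of a signed i8 pairwise dot product yielding i16 lanes; see
    // mul_add_epi8_sse in ggml-quants.c for the authoritative version.
    static inline __m128i mul_add_epi8_sse_sketch(const __m128i x, const __m128i y) {
        const __m128i ax = _mm_sign_epi8(x, x); // |x| as unsigned bytes
        const __m128i sy = _mm_sign_epi8(y, x); // y with x's sign folded in
        return _mm_maddubs_epi16(ax, sy);
    }

This trades eight _mm_maddubs_epi16(m32s, ...) plus eight _mm_sub_epi16 per iteration for eight byte subtracts during unpacking, and it also removes the +48/-48 pointer juggling on q8.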
From 0b75215f9d0e46faef0a8918cb4dd21a0f44d3b6 Mon Sep 17 00:00:00 2001
From: Eve <139727413+netrunnereve@users.noreply.github.com>
Date: Wed, 30 Oct 2024 22:11:32 -0400
Subject: [PATCH 3/8] should be theoretically faster

small improvement with shuffle lut, likely because all loads are already
done at that stage
---
 ggml/src/ggml-quants.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index c202775b8ab42..e18c5af83eedd 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -9124,7 +9124,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         __m128i sumi_0 = _mm_setzero_si128();
         __m128i sumi_1 = _mm_setzero_si128();
 
-        __m128i shuffle = _mm_set_epi64x(0x0101010101010101, 0x0000000000000000);
+        int is = 0;
+
         for (int j = 0; j < QK_K/128; ++j) {
 
             const __m128i q4bitsH_0 = _mm_loadu_si128((const __m128i*)qh); qh += 16;
@@ -9171,14 +9172,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
             __m128i p16_6 = mul_add_epi8_sse(q4_6, q8_6);
             __m128i p16_7 = mul_add_epi8_sse(q4_7, q8_7);
 
-            const __m128i scale_0 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi8(shuffle, m2);
-            const __m128i scale_1 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi8(shuffle, m2);
-            const __m128i scale_2 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi8(shuffle, m2);
-            const __m128i scale_3 = _mm_shuffle_epi8(scales, shuffle);
-            shuffle = _mm_add_epi8(shuffle, m2);
+            const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
+            const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
+            const __m128i scale_2 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 2));
+            const __m128i scale_3 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 3));
+            is += 4;
 
             p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
             p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1);
@@ -9197,7 +9195,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         }
 
         __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
-        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_broadcast_ss(&d), _mm256_cvtepi32_ps(sumi)), acc);
+        acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi)), acc);
     }
 
     *s = hsum_float_8(acc);
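The shuffle chain that patch 3 removes was serial: each scale_x vector depended on the previous shuffle update. The replacement indexes an independent lookup per step. The table needed here broadcasts scale byte 2i into the low eight lanes and byte 2i+1 into the high eight, matching the old _mm_set_epi64x starting pattern. A sketch of such a LUT helper follows (get_scale_shuffle already exists in ggml-quants.c; the actual table there may be laid out differently):

    #include <emmintrin.h>
    #include <stdint.h>

    static inline __m128i get_scale_shuffle_sketch(int i) {
        static const uint8_t k_shuffle[128] = {
             0, 0, 0, 0,  0, 0, 0, 0,  1, 1, 1, 1,  1, 1, 1, 1,
             2, 2, 2, 2,  2, 2, 2, 2,  3, 3, 3, 3,  3, 3, 3, 3,
             4, 4, 4, 4,  4, 4, 4, 4,  5, 5, 5, 5,  5, 5, 5, 5,
             6, 6, 6, 6,  6, 6, 6, 6,  7, 7, 7, 7,  7, 7, 7, 7,
             8, 8, 8, 8,  8, 8, 8, 8,  9, 9, 9, 9,  9, 9, 9, 9,
            10,10,10,10, 10,10,10,10, 11,11,11,11, 11,11,11,11,
            12,12,12,12, 12,12,12,12, 13,13,13,13, 13,13,13,13,
            14,14,14,14, 14,14,14,14, 15,15,15,15, 15,15,15,15,
        };
        return _mm_loadu_si128((const __m128i*)k_shuffle + i);
    }

With the dependency chain gone, all four scale_x shuffles can issue back to back once their loads complete, which fits the commit's observation that the win shows up because the loads are already done at that stage.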
From a420e4cd44b62b4e4187d7f9bc1a21013bb01bd4 Mon Sep 17 00:00:00 2001
From: Eve <139727413+netrunnereve@users.noreply.github.com>
Date: Wed, 30 Oct 2024 23:38:00 -0400
Subject: [PATCH 4/8] optimize bit fiddling

---
 ggml/src/ggml-quants.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index e18c5af83eedd..374c469c344cd 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -9104,10 +9104,10 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
 #elif defined __AVX__
 
-    const __m128i m4 = _mm_set1_epi8(0xF);
-    const __m128i m3 = _mm_set1_epi8(3);
-    const __m128i m32s = _mm_set1_epi8(32);
     const __m128i m2 = _mm_set1_epi8(2);
+    const __m128i m3 = _mm_set1_epi8(3);
+    const __m128i m15 = _mm_set1_epi8(15);
+    const __m128i m32 = _mm_set1_epi8(32);
 
     __m256 acc = _mm256_setzero_ps();
 
@@ -9133,26 +9133,26 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
             const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4);
             const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4);
-            const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 2), m3), 4);
-            const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 2), m3), 4);
-            const __m128i q4h_4 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 4), m3), 4);
-            const __m128i q4h_5 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 4), m3), 4);
-            const __m128i q4h_6 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_0, 6), m3), 4);
-            const __m128i q4h_7 = _mm_slli_epi16(_mm_and_si128(_mm_srli_epi16(q4bitsH_1, 6), m3), 4);
+            const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(0x0C)), 2);
+            const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(0x0C)), 2);
+            const __m128i q4h_4 = _mm_and_si128(q4bitsH_0, _mm_set1_epi8(0x30));
+            const __m128i q4h_5 = _mm_and_si128(q4bitsH_1, _mm_set1_epi8(0x30));
+            const __m128i q4h_6 = _mm_srli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(0xC0)), 2);
+            const __m128i q4h_7 = _mm_srli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(0xC0)), 2);
 
             const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
             const __m128i q4bits1_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
             const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
             const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
 
-            const __m128i q4_0 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits1_0, m4), q4h_0), m32s);
-            const __m128i q4_1 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits1_1, m4), q4h_1), m32s);
-            const __m128i q4_2 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits2_0, m4), q4h_2), m32s);
-            const __m128i q4_3 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits2_1, m4), q4h_3), m32s);
-            const __m128i q4_4 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m4), q4h_4), m32s);
-            const __m128i q4_5 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m4), q4h_5), m32s);
-            const __m128i q4_6 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m4), q4h_6), m32s);
-            const __m128i q4_7 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m4), q4h_7), m32s);
+            const __m128i q4_0 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits1_0, m15), q4h_0), m32);
+            const __m128i q4_1 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits1_1, m15), q4h_1), m32);
+            const __m128i q4_2 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits2_0, m15), q4h_2), m32);
+            const __m128i q4_3 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits2_1, m15), q4h_3), m32);
+            const __m128i q4_4 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m15), q4h_4), m32);
+            const __m128i q4_5 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m15), q4h_5), m32);
+            const __m128i q4_6 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m15), q4h_6), m32);
+            const __m128i q4_7 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m15), q4h_7), m32);
 
             const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
             const __m128i q8_1 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
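The bit-fiddling change is a pure strength reduction: rather than shifting the high bits down with _mm_srli_epi16, masking with m3, and shifting back up by four, each two-bit field is masked in place with a pre-shifted constant and moved at most once. The equivalence is easy to check in scalar code (standalone demo, not from the repository):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        for (int v = 0; v < 256; ++v) {
            const uint8_t b = (uint8_t)v;
            // bits 2..3 -> positions 4..5: old form vs. new form
            assert((uint8_t)(((b >> 2) & 3) << 4) == (uint8_t)((b & 0x0C) << 2));
            // bits 4..5 already sit at positions 4..5: a mask alone suffices
            assert((uint8_t)(((b >> 4) & 3) << 4) == (b & 0x30));
            // bits 6..7 -> positions 4..5
            assert((uint8_t)(((b >> 6) & 3) << 4) == (uint8_t)((b & 0xC0) >> 2));
        }
        return 0;
    }

After the rewrite, q4h_4 and q4h_5 need no shift at all and the remaining pairs need one instead of two.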
From 35255d64f6b0450e9bf5cfd0a5215c36312f00e6 Mon Sep 17 00:00:00 2001
From: Eve <139727413+netrunnereve@users.noreply.github.com>
Date: Thu, 31 Oct 2024 18:25:25 -0400
Subject: [PATCH 5/8] handle -32 offset separately. bsums exists for a reason!

---
 ggml/src/ggml-quants.c | 47 ++++++++++++++++++++++++------------------
 1 file changed, 27 insertions(+), 20 deletions(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 374c469c344cd..491e2194571c5 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -9104,10 +9104,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
 #elif defined __AVX__
 
-    const __m128i m2 = _mm_set1_epi8(2);
     const __m128i m3 = _mm_set1_epi8(3);
     const __m128i m15 = _mm_set1_epi8(15);
-    const __m128i m32 = _mm_set1_epi8(32);
 
     __m256 acc = _mm256_setzero_ps();
 
@@ -9119,7 +9117,15 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         const uint8_t * restrict qh = x[i].qh;
         const int8_t  * restrict q8 = y[i].qs;
 
+        // handle the q6_k -32 offset separately using bsums
+        // TODO: tabs, compiler warnings for earlier code
+        const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums);
+        const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)y[i].bsums + 1);
         const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
+        const __m128i scales_16_0 = _mm_cvtepi8_epi16(scales);
+        const __m128i scales_16_1 = _mm_cvtepi8_epi16(_mm_bsrli_si128(scales, 8));
+        const __m128i q8scld_0 = _mm_mullo_epi32(_mm_madd_epi16(q8sums_0, scales_16_0), _mm_set1_epi32(32));
+        const __m128i q8scld_1 = _mm_mullo_epi32(_mm_madd_epi16(q8sums_1, scales_16_1), _mm_set1_epi32(32));
 
         __m128i sumi_0 = _mm_setzero_si128();
         __m128i sumi_1 = _mm_setzero_si128();
@@ -9145,14 +9151,14 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
             const __m128i q4bits2_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
             const __m128i q4bits2_1 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
 
-            const __m128i q4_0 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits1_0, m15), q4h_0), m32);
-            const __m128i q4_1 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits1_1, m15), q4h_1), m32);
-            const __m128i q4_2 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits2_0, m15), q4h_2), m32);
-            const __m128i q4_3 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(q4bits2_1, m15), q4h_3), m32);
-            const __m128i q4_4 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m15), q4h_4), m32);
-            const __m128i q4_5 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m15), q4h_5), m32);
-            const __m128i q4_6 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m15), q4h_6), m32);
-            const __m128i q4_7 = _mm_sub_epi8(_mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m15), q4h_7), m32);
+            const __m128i q4_0 = _mm_or_si128(_mm_and_si128(q4bits1_0, m15), q4h_0);
+            const __m128i q4_1 = _mm_or_si128(_mm_and_si128(q4bits1_1, m15), q4h_1);
+            const __m128i q4_2 = _mm_or_si128(_mm_and_si128(q4bits2_0, m15), q4h_2);
+            const __m128i q4_3 = _mm_or_si128(_mm_and_si128(q4bits2_1, m15), q4h_3);
+            const __m128i q4_4 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_0, 4), m15), q4h_4);
+            const __m128i q4_5 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits1_1, 4), m15), q4h_5);
+            const __m128i q4_6 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_0, 4), m15), q4h_6);
+            const __m128i q4_7 = _mm_or_si128(_mm_and_si128(_mm_srli_epi16(q4bits2_1, 4), m15), q4h_7);
 
             const __m128i q8_0 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
@@ -9163,14 +9169,14 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
             const __m128i q8_6 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
             const __m128i q8_7 = _mm_loadu_si128((const __m128i*)q8); q8 += 16;
 
-            __m128i p16_0 = mul_add_epi8_sse(q4_0, q8_0);
-            __m128i p16_1 = mul_add_epi8_sse(q4_1, q8_1);
-            __m128i p16_2 = mul_add_epi8_sse(q4_2, q8_2);
-            __m128i p16_3 = mul_add_epi8_sse(q4_3, q8_3);
-            __m128i p16_4 = mul_add_epi8_sse(q4_4, q8_4);
-            __m128i p16_5 = mul_add_epi8_sse(q4_5, q8_5);
-            __m128i p16_6 = mul_add_epi8_sse(q4_6, q8_6);
-            __m128i p16_7 = mul_add_epi8_sse(q4_7, q8_7);
+            __m128i p16_0 = _mm_maddubs_epi16(q4_0, q8_0);
+            __m128i p16_1 = _mm_maddubs_epi16(q4_1, q8_1);
+            __m128i p16_2 = _mm_maddubs_epi16(q4_2, q8_2);
+            __m128i p16_3 = _mm_maddubs_epi16(q4_3, q8_3);
+            __m128i p16_4 = _mm_maddubs_epi16(q4_4, q8_4);
+            __m128i p16_5 = _mm_maddubs_epi16(q4_5, q8_5);
+            __m128i p16_6 = _mm_maddubs_epi16(q4_6, q8_6);
+            __m128i p16_7 = _mm_maddubs_epi16(q4_7, q8_7);
 
             const __m128i scale_0 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 0));
             const __m128i scale_1 = _mm_shuffle_epi8(scales, get_scale_shuffle(is + 1));
@@ -9191,10 +9197,11 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
             sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
             sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_4, p16_6));
             sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_5, p16_7));
-
         }
 
-        __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
+        sumi_0 = _mm_sub_epi32(sumi_0, q8scld_0);
+        sumi_1 = _mm_sub_epi32(sumi_1, q8scld_1);
+        const __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi)), acc);
     }
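Patch 5 exploits linearity to pull the constant -32 bias out of the inner loop entirely: q8_K blocks already store bsums, the per-16-element sums of the quantized activations, so the bias's contribution can be computed once per superblock from the scales and bsums and subtracted at the end. A rough scalar model of the accumulation after this patch (function and parameter names are illustrative only):

    #include <stdint.h>

    // q6 holds the biased weights (0..63); bsum[g] is the precomputed sum of
    // the sixteen q8 values in subgroup g; sc holds the 16 signed scales.
    static int32_t q6k_block_sumi(const uint8_t *q6, const int8_t *q8,
                                  const int8_t *sc, const int16_t *bsum) {
        int32_t sumi = 0;
        for (int g = 0; g < 16; ++g) {
            int32_t p = 0;
            for (int k = 0; k < 16; ++k) {
                p += (int32_t)q8[16*g + k] * q6[16*g + k]; // biased products
            }
            sumi += sc[g] * p;            // sum(q8 * (true_w + 32)) * scale
            sumi -= 32 * sc[g] * bsum[g]; // remove the bias contribution once
        }
        return sumi;
    }

Because the weights stay unsigned (0..63), the plain _mm_maddubs_epi16 works again and mul_add_epi8_sse is no longer needed; 63 * 127 * 2 also stays safely below the i16 saturation limit of that instruction.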
From 5b367158c7c58ead9c13dfa49dbfeb3989598a70 Mon Sep 17 00:00:00 2001
From: Eve <139727413+netrunnereve@users.noreply.github.com>
Date: Thu, 31 Oct 2024 20:16:25 -0400
Subject: [PATCH 6/8] use shift

---
 ggml/src/ggml-quants.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 491e2194571c5..bc93d0d7571c3 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -9118,14 +9118,13 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
         const int8_t  * restrict q8 = y[i].qs;
 
         // handle the q6_k -32 offset separately using bsums
-        // TODO: tabs, compiler warnings for earlier code
         const __m128i q8sums_0 = _mm_loadu_si128((const __m128i*)y[i].bsums);
         const __m128i q8sums_1 = _mm_loadu_si128((const __m128i*)y[i].bsums + 1);
         const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales);
         const __m128i scales_16_0 = _mm_cvtepi8_epi16(scales);
         const __m128i scales_16_1 = _mm_cvtepi8_epi16(_mm_bsrli_si128(scales, 8));
-        const __m128i q8scld_0 = _mm_mullo_epi32(_mm_madd_epi16(q8sums_0, scales_16_0), _mm_set1_epi32(32));
-        const __m128i q8scld_1 = _mm_mullo_epi32(_mm_madd_epi16(q8sums_1, scales_16_1), _mm_set1_epi32(32));
+        const __m128i q8sclsub_0 = _mm_slli_epi32(_mm_madd_epi16(q8sums_0, scales_16_0), 5);
+        const __m128i q8sclsub_1 = _mm_slli_epi32(_mm_madd_epi16(q8sums_1, scales_16_1), 5);
 
         __m128i sumi_0 = _mm_setzero_si128();
         __m128i sumi_1 = _mm_setzero_si128();
@@ -9139,12 +9138,12 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
 
             const __m128i q4h_0 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, m3), 4);
             const __m128i q4h_1 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, m3), 4);
-            const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(0x0C)), 2);
-            const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(0x0C)), 2);
-            const __m128i q4h_4 = _mm_and_si128(q4bitsH_0, _mm_set1_epi8(0x30));
-            const __m128i q4h_5 = _mm_and_si128(q4bitsH_1, _mm_set1_epi8(0x30));
-            const __m128i q4h_6 = _mm_srli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(0xC0)), 2);
-            const __m128i q4h_7 = _mm_srli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(0xC0)), 2);
+            const __m128i q4h_2 = _mm_slli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(12)), 2);
+            const __m128i q4h_3 = _mm_slli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(12)), 2);
+            const __m128i q4h_4 = _mm_and_si128(q4bitsH_0, _mm_set1_epi8(48));
+            const __m128i q4h_5 = _mm_and_si128(q4bitsH_1, _mm_set1_epi8(48));
+            const __m128i q4h_6 = _mm_srli_epi16(_mm_and_si128(q4bitsH_0, _mm_set1_epi8(-64)), 2);
+            const __m128i q4h_7 = _mm_srli_epi16(_mm_and_si128(q4bitsH_1, _mm_set1_epi8(-64)), 2);
 
             const __m128i q4bits1_0 = _mm_loadu_si128((const __m128i*)q4); q4 += 16;
@@ -9185,13 +9184,13 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
             is += 4;
 
             p16_0 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_0), p16_0);
-            p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_0, scale_0)), p16_1);
+            p16_1 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_0, 8)), p16_1);
             p16_2 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_1), p16_2);
-            p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_1, scale_1)), p16_3);
+            p16_3 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_1, 8)), p16_3);
             p16_4 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_2), p16_4);
-            p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_2, scale_2)), p16_5);
+            p16_5 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_2, 8)), p16_5);
             p16_6 = _mm_madd_epi16(_mm_cvtepi8_epi16(scale_3), p16_6);
-            p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_unpackhi_epi64(scale_3, scale_3)), p16_7);
+            p16_7 = _mm_madd_epi16(_mm_cvtepi8_epi16(_mm_bsrli_si128(scale_3, 8)), p16_7);
 
             sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_0, p16_2));
             sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
@@ -9199,8 +9198,8 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
             sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_4, p16_6));
             sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_5, p16_7));
         }
 
-        sumi_0 = _mm_sub_epi32(sumi_0, q8scld_0);
-        sumi_1 = _mm_sub_epi32(sumi_1, q8scld_1);
+        sumi_0 = _mm_sub_epi32(sumi_0, q8sclsub_0);
+        sumi_1 = _mm_sub_epi32(sumi_1, q8sclsub_1);
         const __m256i sumi = MM256_SET_M128I(sumi_1, sumi_0);
         acc = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(sumi)), acc);
     }
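Patch 6 is micro-cleanup on the same idea: the multiply by 32 becomes a left shift of 5, the _mm_unpackhi_epi64 upper-half extraction becomes _mm_bsrli_si128, and masks like 0xC0 are spelled as the signed decimals -64/48/12 so _mm_set1_epi8 compiles without narrowing warnings. A minimal before/after of the offset scaling (sketch only; v stands for the _mm_madd_epi16 result):

    #include <smmintrin.h> // SSE4.1 for _mm_mullo_epi32

    static inline __m128i times32_mul(__m128i v) {   // patch 5 form
        return _mm_mullo_epi32(v, _mm_set1_epi32(32));
    }
    static inline __m128i times32_shift(__m128i v) { // patch 6 form
        return _mm_slli_epi32(v, 5); // identical 32-bit lanes for all inputs
    }

The two produce the same lanes for every input, and the shift avoids pmulld, which is comparatively slow on many x86 cores. Patch 7 then just restores a blank line dropped in patch 5, and patch 8 bumps the x86 CI runner because the macos-12 image no longer works.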
From d84c372bbfd7e82cb3aafc13c224c2ba069c9545 Mon Sep 17 00:00:00 2001
From: Eve <139727413+netrunnereve@users.noreply.github.com>
Date: Fri, 1 Nov 2024 01:48:56 +0000
Subject: [PATCH 7/8] Update ggml-quants.c

---
 ggml/src/ggml-quants.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index bc93d0d7571c3..320c9fc9f784b 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -9196,6 +9196,7 @@ void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, size_t bs, const void * r
             sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_1, p16_3));
             sumi_0 = _mm_add_epi32(sumi_0, _mm_add_epi32(p16_4, p16_6));
             sumi_1 = _mm_add_epi32(sumi_1, _mm_add_epi32(p16_5, p16_7));
+
         }
 
         sumi_0 = _mm_sub_epi32(sumi_0, q8sclsub_0);

From f85336e263f5d27b49903a76f6fd9d31ce888198 Mon Sep 17 00:00:00 2001
From: Eve <139727413+netrunnereve@users.noreply.github.com>
Date: Mon, 4 Nov 2024 21:31:02 +0000
Subject: [PATCH 8/8] have to update ci macos version to 13 as 12 doesn't work
 now. 13 is still x86

---
 .github/workflows/build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 423173b975897..02dcee9634f46 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -92,7 +92,7 @@ jobs:
           name: llama-bin-macos-arm64.zip
 
   macOS-latest-cmake-x64:
-    runs-on: macos-12
+    runs-on: macos-13
 
     steps:
       - name: Clone