Commit 5a94188

metal lowbit kernels: optimized 2-bit, 3-bit and 4-bit shaders
1 parent 603d908 commit 5a94188

17 files changed: +865 -45 lines

torchao/experimental/kernels/mps/metal.yaml

Lines changed: 7 additions & 4 deletions
@@ -1,14 +1,17 @@
+- func: Vec4Type
+  file: common.metal
+
 - func: int1mm
-  file: divbit.metal
+  file: int1mm.metal
 
 - func: int2mm
-  file: divbit.metal
+  file: int2mm_opt.metal
 
 - func: int3mm
-  file: int3mm.metal
+  file: int3mm_opt.metal
 
 - func: int4mm
-  file: divbit.metal
+  file: int4mm_opt.metal
 
 - func: int5mm
   file: int5mm.metal
torchao/experimental/kernels/mps/metal/common.metal

Lines changed: 15 additions & 0 deletions
@@ -0,0 +1,15 @@
+template <typename T> struct Vec4Type {};
+
+template <> struct Vec4Type<float> {
+  using type = float4;
+};
+
+template <> struct Vec4Type<half> {
+  using type = half4;
+};
+
+#if __METAL_VERSION__ >= 310
+template <> struct Vec4Type<bfloat> {
+  using type = bfloat4;
+};
+#endif
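
Vec4Type is a small type trait that maps a scalar element type to its 4-wide Metal vector type (float to float4, half to half4, bfloat to bfloat4), which is what lets the optimized kernels below do vectorized loads and stores via `using vecT = typename Vec4Type<T>::type;`. A rough host-side C++ analogue of the same trait pattern, purely to illustrate the mechanism; the `vec4` struct here is a hypothetical stand-in, since float4/half4/bfloat4 are Metal built-ins:

#include <cstdio>

// Hypothetical 4-wide vector stand-in; in Metal, float4/half4/bfloat4 are built-ins.
template <typename T> struct vec4 { T x, y, z, w; };

// Same trait pattern as common.metal: map a scalar element type to its 4-wide vector type.
template <typename T> struct Vec4Type {};
template <> struct Vec4Type<float> { using type = vec4<float>; };
// half and bfloat specializations follow the same pattern on the device side.

int main() {
  using vecT = typename Vec4Type<float>::type; // resolves to vec4<float>
  vecT v{1.f, 2.f, 3.f, 4.f};
  std::printf("%g %g %g %g\n", v.x, v.y, v.z, v.w);
  return 0;
}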

torchao/experimental/kernels/mps/metal/divbit.metal

Lines changed: 0 additions & 1 deletion
@@ -30,7 +30,6 @@ kernel void divbit_mm(
   constant T *A_ptr = A + m * K;
   constant uchar *B_ptr = B;
 
-  constexpr uint8_t zero_shift = 1 << (nbit - 1);
   constexpr uint8_t values_per_byte = 8 / nbit;
   constexpr uint8_t minimask = (1 << nbit) - 1;
 
torchao/experimental/kernels/mps/metal/int1mm.metal

Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
+#include <metal_stdlib>
+using namespace metal;
+
+/**
+ * 1-Bit Quantized Linear.
+ *
+ * @param[A] M x K unquantized input tensor of floating point dtype (Float, Half, BFloat16)
+ * @param[B] Packed & quantized weight tensor of uint8 dtype. Expected shape is N x (K / 8)
+ * @param[scales] 2D tensor containing the scales for each group. Expected shape is #groups x N
+ * @param[zeros] 2D tensor containing the zero points for each group. Expected shape is #groups x N
+ * @param[outputData] M x N output tensor of floating point dtype (same as input)
+ * @param[sizes] The sizes involved in the order: M, K, N
+ *
+ * Dispatched threads: N x M x 1
+ */
+template<typename T, unsigned groupSize>
+kernel void int1pack_mm(
+    constant T * A [[buffer(0)]],
+    constant uchar * B [[buffer(1)]],
+    constant T * scales [[buffer(2)]],
+    constant T * zeros [[buffer(3)]],
+    device T * outputData [[buffer(4)]],
+    constant uint3 & sizes [[buffer(5)]], // M, K, N
+    uint2 thread_index [[thread_position_in_grid]]) {
+  const uint K = sizes.y;
+  const uint N = sizes.z;
+  const uint m = thread_index.y; // 0..M-1
+  const uint n = thread_index.x; // 0..N-1
+  const uint32_t k_block = (K + groupSize - 1) / groupSize;
+  constant T *A_ptr = A + m * K;
+  constant uchar *B_ptr = B + n * K / 8;
+
+  float rc = 0.0;
+  uint k = 0;
+  for (uint32_t kb = 0; kb < k_block; kb++) {
+    const float scale = float(scales[kb * N + n]);
+    const float zero = float(zeros[kb * N + n]);
+    for (uint idx = 0; idx < groupSize && k < K; idx += 8, k += 8) {
+      const auto a_val0 = float(A_ptr[k + 0]);
+      const auto a_val1 = float(A_ptr[k + 1]);
+      const auto a_val2 = float(A_ptr[k + 2]);
+      const auto a_val3 = float(A_ptr[k + 3]);
+      const auto a_val4 = float(A_ptr[k + 4]);
+      const auto a_val5 = float(A_ptr[k + 5]);
+      const auto a_val6 = float(A_ptr[k + 6]);
+      const auto a_val7 = float(A_ptr[k + 7]);
+
+      uchar b0 = B_ptr[(k / 8)];
+
+      uchar w_val0 = b0 & 0x01;
+      uchar w_val1 = (b0 & 0x02) >> 1;
+      uchar w_val2 = (b0 & 0x04) >> 2;
+      uchar w_val3 = (b0 & 0x08) >> 3;
+      uchar w_val4 = (b0 & 0x10) >> 4;
+      uchar w_val5 = (b0 & 0x20) >> 5;
+      uchar w_val6 = (b0 & 0x40) >> 6;
+      uchar w_val7 = (b0 & 0x80) >> 7;
+
+      rc += a_val0 * (scale * float(w_val0) + zero);
+      rc += a_val1 * (scale * float(w_val1) + zero);
+      rc += a_val2 * (scale * float(w_val2) + zero);
+      rc += a_val3 * (scale * float(w_val3) + zero);
+      rc += a_val4 * (scale * float(w_val4) + zero);
+      rc += a_val5 * (scale * float(w_val5) + zero);
+      rc += a_val6 * (scale * float(w_val6) + zero);
+      rc += a_val7 * (scale * float(w_val7) + zero);
+    }
+  }
+  outputData[m * N + n] = T(rc);
+}
+
+#define INSTANTIATE_INT1MM(DTYPE, GSIZE)              \
+  template                                            \
+  [[host_name("int1pack_mm_" #GSIZE "_" #DTYPE)]]     \
+  kernel void int1pack_mm<DTYPE, GSIZE>(              \
+      constant DTYPE * A [[buffer(0)]],               \
+      constant uchar * B [[buffer(1)]],               \
+      constant DTYPE * scales [[buffer(2)]],          \
+      constant DTYPE * zeros [[buffer(3)]],           \
+      device DTYPE * outputData [[buffer(4)]],        \
+      constant uint3 & sizes [[buffer(5)]],           \
+      uint2 thread_index [[thread_position_in_grid]])
+
+INSTANTIATE_INT1MM(float, 32);
+INSTANTIATE_INT1MM(half, 32);
+INSTANTIATE_INT1MM(float, 64);
+INSTANTIATE_INT1MM(half, 64);
+INSTANTIATE_INT1MM(float, 128);
+INSTANTIATE_INT1MM(half, 128);
+INSTANTIATE_INT1MM(float, 256);
+INSTANTIATE_INT1MM(half, 256);
+#if __METAL_VERSION__ >= 310
+INSTANTIATE_INT1MM(bfloat, 32);
+INSTANTIATE_INT1MM(bfloat, 64);
+INSTANTIATE_INT1MM(bfloat, 128);
+INSTANTIATE_INT1MM(bfloat, 256);
+#endif
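
This kernel reads row n of B as K/8 bytes where bit j (least significant first) of byte B[n][k/8] is the 1-bit weight for column k with k % 8 == j, and each output element accumulates a * (scale * w + zero) with one (scale, zero) pair per group along K. A host-side C++ sketch of that packing and of the same scalar accumulation, useful as a reference when preparing inputs; `pack_1bit` and `ref_dot_1bit` are illustrative helpers, not torchao APIs:

#include <cstdint>
#include <cstdio>
#include <vector>

// Pack row-major N x K 1-bit weights so that bit (k % 8) of byte B[n][k / 8]
// holds w[n][k] -- the same layout int1pack_mm unpacks with masks 0x01..0x80.
std::vector<uint8_t> pack_1bit(const std::vector<uint8_t>& w, int N, int K) {
  std::vector<uint8_t> packed(N * K / 8, 0);
  for (int n = 0; n < N; ++n)
    for (int k = 0; k < K; ++k)
      packed[n * (K / 8) + k / 8] |= (w[n * K + k] & 1u) << (k % 8);
  return packed;
}

// Scalar version of the kernel's accumulation for one (m, n) output element:
// rc += a * (scale * w + zero), with one (scale, zero) per group of groupSize along K.
float ref_dot_1bit(const std::vector<float>& A, const std::vector<uint8_t>& packed,
                   const std::vector<float>& scales, const std::vector<float>& zeros,
                   int m, int n, int K, int N, int groupSize) {
  float rc = 0.f;
  for (int k = 0; k < K; ++k) {
    float w = (packed[n * (K / 8) + k / 8] >> (k % 8)) & 1u;
    int g = k / groupSize;
    rc += A[m * K + k] * (scales[g * N + n] * w + zeros[g * N + n]);
  }
  return rc;
}

int main() {
  const int M = 1, K = 32, N = 2, groupSize = 32;
  std::vector<uint8_t> w(N * K);
  std::vector<float> A(M * K);
  for (int i = 0; i < N * K; ++i) w[i] = i % 2;     // toy weights
  for (int i = 0; i < M * K; ++i) A[i] = 0.25f * i; // toy activations
  std::vector<float> scales((K / groupSize) * N, 0.5f);
  std::vector<float> zeros((K / groupSize) * N, -1.f);
  auto packed = pack_1bit(w, N, K);
  std::printf("out[0][0] = %f\n",
              ref_dot_1bit(A, packed, scales, zeros, 0, 0, K, N, groupSize));
  return 0;
}
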
Lines changed: 99 additions & 0 deletions
@@ -0,0 +1,99 @@
+#include <metal_stdlib>
+using namespace metal;
+
+/**
+ * 2-Bit Quantized Linear.
+ *
+ * @param[A] M x K unquantized input tensor of floating point dtype (Float, Half, BFloat16)
+ * @param[B] Packed & quantized weight tensor of uint8 dtype. Expected shape is N x (K / 4)
+ * @param[scales] 2D tensor containing the scales for each group. Expected shape is #groups x N
+ * @param[zeros] 2D tensor containing the zero points for each group. Expected shape is #groups x N
+ * @param[outputData] M x N output tensor of floating point dtype (same as input)
+ * @param[sizes] The sizes involved in the order: M, K, N
+ *
+ * Dispatched threads: N x M x 1
+ */
+template<typename T, unsigned groupSize>
+kernel void int2pack_mm(
+    constant T * A [[buffer(0)]],
+    constant uchar * B [[buffer(1)]],
+    constant T * scales [[buffer(2)]],
+    constant T * zeros [[buffer(3)]],
+    device T * outputData [[buffer(4)]],
+    constant uint3 & sizes [[buffer(5)]], // M, K, N
+    uint2 thread_index [[thread_position_in_grid]]) {
+  const uint K = sizes.y;
+  const uint N = sizes.z;
+  const uint m = thread_index.y; // 0..M-1
+  const uint n = thread_index.x; // 0..N-1
+  const uint32_t k_block = (K + groupSize - 1) / groupSize;
+  constant T *A_ptr = A + m * K;
+  constant uchar *B_ptr = B + n * 2 * K / 8;
+
+  float rc = 0.0;
+  uint k = 0;
+  for (uint32_t kb = 0; kb < k_block; kb++) {
+    const float scale = float(scales[kb * N + n]);
+    const float zero = float(zeros[kb * N + n]);
+    for (uint idx = 0; idx < groupSize && k < K; idx += 8, k += 8) {
+      const auto a_val0 = float(A_ptr[k + 0]);
+      const auto a_val1 = float(A_ptr[k + 1]);
+      const auto a_val2 = float(A_ptr[k + 2]);
+      const auto a_val3 = float(A_ptr[k + 3]);
+      const auto a_val4 = float(A_ptr[k + 4]);
+      const auto a_val5 = float(A_ptr[k + 5]);
+      const auto a_val6 = float(A_ptr[k + 6]);
+      const auto a_val7 = float(A_ptr[k + 7]);
+
+      uchar b0 = B_ptr[2 * (k / 8) + 0];
+      uchar b1 = B_ptr[2 * (k / 8) + 1];
+
+      uchar w_val0 = b0 & 0x03;
+      uchar w_val1 = (b0 & 0x0c) >> 2;
+      uchar w_val2 = (b0 & 0x30) >> 4;
+      uchar w_val3 = (b0 & 0xc0) >> 6;
+
+      uchar w_val4 = b1 & 0x03;
+      uchar w_val5 = (b1 & 0x0c) >> 2;
+      uchar w_val6 = (b1 & 0x30) >> 4;
+      uchar w_val7 = (b1 & 0xc0) >> 6;
+
+      rc += a_val0 * (scale * float(w_val0) + zero);
+      rc += a_val1 * (scale * float(w_val1) + zero);
+      rc += a_val2 * (scale * float(w_val2) + zero);
+      rc += a_val3 * (scale * float(w_val3) + zero);
+      rc += a_val4 * (scale * float(w_val4) + zero);
+      rc += a_val5 * (scale * float(w_val5) + zero);
+      rc += a_val6 * (scale * float(w_val6) + zero);
+      rc += a_val7 * (scale * float(w_val7) + zero);
+    }
+  }
+  outputData[m * N + n] = T(rc);
+}
+
+#define INSTANTIATE_INT2MM(DTYPE, GSIZE)              \
+  template                                            \
+  [[host_name("int2pack_mm_" #GSIZE "_" #DTYPE)]]     \
+  kernel void int2pack_mm<DTYPE, GSIZE>(              \
+      constant DTYPE * A [[buffer(0)]],               \
+      constant uchar * B [[buffer(1)]],               \
+      constant DTYPE * scales [[buffer(2)]],          \
+      constant DTYPE * zeros [[buffer(3)]],           \
+      device DTYPE * outputData [[buffer(4)]],        \
+      constant uint3 & sizes [[buffer(5)]],           \
+      uint2 thread_index [[thread_position_in_grid]])
+
+INSTANTIATE_INT2MM(float, 32);
+INSTANTIATE_INT2MM(half, 32);
+INSTANTIATE_INT2MM(float, 64);
+INSTANTIATE_INT2MM(half, 64);
+INSTANTIATE_INT2MM(float, 128);
+INSTANTIATE_INT2MM(half, 128);
+INSTANTIATE_INT2MM(float, 256);
+INSTANTIATE_INT2MM(half, 256);
+#if __METAL_VERSION__ >= 310
+INSTANTIATE_INT2MM(bfloat, 32);
+INSTANTIATE_INT2MM(bfloat, 64);
+INSTANTIATE_INT2MM(bfloat, 128);
+INSTANTIATE_INT2MM(bfloat, 256);
+#endif
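
The 2-bit variant differs from the 1-bit one only in its packing: each run of 8 values along K occupies two bytes, with byte 2*(k/8) holding k..k+3 in its four 2-bit lanes (least-significant lane first) and byte 2*(k/8)+1 holding k+4..k+7, which is what the 0x03 / 0x0c>>2 / 0x30>>4 / 0xc0>>6 unpacking above reads back. A host-side packing sketch under that assumption; `pack_2bit` is an illustrative helper, not a torchao API:

#include <cstdint>
#include <vector>

// Pack row-major N x K 2-bit weights (values 0..3) into N x (K / 4) bytes in the
// order int2pack_mm expects: two bytes per 8 K-values, low 2-bit lanes first.
std::vector<uint8_t> pack_2bit(const std::vector<uint8_t>& w, int N, int K) {
  std::vector<uint8_t> packed(N * K / 4, 0);
  for (int n = 0; n < N; ++n)
    for (int k = 0; k < K; ++k) {
      int byte_idx = n * (K / 4) + 2 * (k / 8) + (k % 8) / 4; // b0 or b1 of the pair
      int shift = 2 * (k % 4);                                // 2-bit lane within the byte
      packed[byte_idx] |= (w[n * K + k] & 0x3u) << shift;
    }
  return packed;
}
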
torchao/experimental/kernels/mps/metal/int2mm_opt.metal

Lines changed: 138 additions & 0 deletions
@@ -0,0 +1,138 @@
+#include <metal_simdgroup>
+#include <metal_stdlib>
+using namespace metal;
+
+/*
+   This code takes heavy inspiration from MLX:
+   https://github.com/ml-explore/mlx/blob/main/mlx/backend/metal/kernels/quantized.h
+   Specifically:
+     - Multiplying activation by inverse scaling factor to reduce compute
+       boundedness
+     - Handling zero point by accumulating act in separate sum term. Needed with
+       optimization done above. MLX MIT License:
+   https://github.com/ml-explore/mlx/blob/main/LICENSE
+*/
+
+/*
+   @brief This shader implements 2-bit matrix-vector multiplication where A
+   matrix is fp16, bfloat or float and B matrix is a 2-bit groupwise-quantized weight
+   matrix.
+   @param [in] A is activation matrix of size M x K.
+   @param [in] B is weight matrix of size N x K. Each byte contains 4 2-bit
+   values, along K dim, packed together.
+   @param [in] scales_ptr is scales ptr corresponding to each
+   output channel x group. These are packed as [num_groups = ceil(K / group_size), N]. N = output
+   channels.
+   @param [in] zeros_ptr is zero points corresponding to each
+   output channel x group. These are packed as [num_groups = ceil(K / group_size), N]. N = output
+   channels.
+   @param [out] output_data is output matrix of size M x N.
+   @param [in] sizes array contains values of M, K and N.
+   @param [in] thread_index is global thread id.
+   @param [in] tid_in_simdgroup is thread id in simdgroup. e.g. in simdgroup of size 32 it can be in [0-31].
+*/
+template <typename T, unsigned group_size>
+kernel void int2pack_mm(constant T *A [[buffer(0)]],
+                        constant uchar *B [[buffer(1)]],
+                        constant T *scales_ptr [[buffer(2)]],
+                        constant T *zeros_ptr [[buffer(3)]],
+                        device T *output_data [[buffer(4)]],
+                        constant uint3 &sizes [[buffer(5)]], // M, K, N
+                        uint3 thread_index [[thread_position_in_grid]],
+                        uint tid_in_simdgroup [[thread_index_in_simdgroup]]) {
+  constexpr uint threads_per_channel = 32;
+  constexpr uint ks_per_thread = 4;
+  constexpr uint k_pack_factor = 4;
+  const uint K = sizes.y;
+  const uint N = sizes.z;
+  uint n = thread_index.x; // 0..N/4-1
+  uint m = thread_index.z; // 0..M-1
+  n = n / threads_per_channel;
+  n = n * 4;
+  // This is the starting k for each thread, e.g. for thread 1 within a simdgroup
+  // this value will be 4.
+  uint k = (tid_in_simdgroup % threads_per_channel) * ks_per_thread;
+  constexpr int k_jump = threads_per_channel * ks_per_thread;
+
+  using vecT = typename Vec4Type<T>::type;
+  constant vecT *A_ptr = reinterpret_cast<constant vecT *>(A + m * K);
+  constant uchar *B_ptr = B + ((n * K) / k_pack_factor);
+
+  thread float4 result = float4(0.0);
+  // We multiply a group of 4 channels with these scales because the corresponding
+  // values from the weight matrix are effectively left-shifted. This avoids doing
+  // a right shift on those values, which ends up affecting performance.
+  // This is the trick applied in MLX kernels.
+  float4 act_div_scales = {1.f, 1 / 4.f, 1 / 16.f, 1 / 64.f};
+
+  for (; k < K; k += k_jump) {
+    // Find the specific group to which the channels handled by this thread
+    // belong.
+    uint k_block_index = k / group_size;
+    uint scales_group_offset = (k_block_index * N + n);
+
+    vecT scales =
+        (reinterpret_cast<constant vecT *>(scales_ptr + scales_group_offset))[0];
+    // Adding zero point results in 10% perf penalty.
+    vecT zeros =
+        (reinterpret_cast<constant vecT *>(zeros_ptr + scales_group_offset))[0];
+    float4 zeros_float = float4(zeros);
+
+    float4 a_val = float4(A_ptr[k / 4]);
+    // We skip right-shifts of the weights and hence divide by the corresponding factor.
+    float4 a_vec = a_val * act_div_scales;
+    float a_val_sum = a_val[0] + a_val[1] + a_val[2] + a_val[3];
+
+    float4x4 b_mat;
+    ushort b_val0 = (B_ptr + (k + 0 * K) / k_pack_factor)[0];
+    ushort b_val1 = (B_ptr + (k + 1 * K) / k_pack_factor)[0];
+    ushort b_val2 = (B_ptr + (k + 2 * K) / k_pack_factor)[0];
+    ushort b_val3 = (B_ptr + (k + 3 * K) / k_pack_factor)[0];
+    b_mat[0] = scales[0] * float4(float(b_val0 & 0x03), float(b_val0 & 0x0c),
+                                  float(b_val0 & 0x30), float(b_val0 & 0xc0));
+    b_mat[1] = scales[1] * float4(float(b_val1 & 0x03), float(b_val1 & 0x0c),
+                                  float(b_val1 & 0x30), float(b_val1 & 0xc0));
+    b_mat[2] = scales[2] * float4(float(b_val2 & 0x03), float(b_val2 & 0x0c),
+                                  float(b_val2 & 0x30), float(b_val2 & 0xc0));
+    b_mat[3] = scales[3] * float4(float(b_val3 & 0x03), float(b_val3 & 0x0c),
+                                  float(b_val3 & 0x30), float(b_val3 & 0xc0));
+
+    result += a_vec * b_mat;
+    result += a_val_sum * zeros_float;
+  }
+  result += simd_shuffle_down(result, 1);
+  result += simd_shuffle_down(result, 2);
+  result += simd_shuffle_down(result, 4);
+  result += simd_shuffle_down(result, 8);
+  result += simd_shuffle_down(result, 16);
+  if (tid_in_simdgroup % threads_per_channel == 0) {
+    reinterpret_cast<device vecT *>(output_data + m * N)[n / 4] = vecT(result);
+  }
+}
+
+#define INSTANTIATE_INT2MM(DTYPE, GSIZE)                                      \
+  template [[host_name("int2pack_mm_" #GSIZE "_" #DTYPE)]] kernel void        \
+  int2pack_mm<DTYPE, GSIZE>(                                                  \
+      constant DTYPE * A [[buffer(0)]], constant uchar * B [[buffer(1)]],     \
+      constant DTYPE * scales_ptr [[buffer(2)]],                              \
+      constant DTYPE * zeros_ptr [[buffer(3)]],                               \
+      device DTYPE * output_data [[buffer(4)]],                               \
+      constant uint3 & sizes [[buffer(5)]],                                   \
+      uint3 thread_index [[thread_position_in_grid]],                         \
+      uint tid_in_simdgroup [[thread_index_in_simdgroup]])
+
+INSTANTIATE_INT2MM(float, 32);
+INSTANTIATE_INT2MM(half, 32);
+INSTANTIATE_INT2MM(float, 64);
+INSTANTIATE_INT2MM(half, 64);
+INSTANTIATE_INT2MM(float, 128);
+INSTANTIATE_INT2MM(half, 128);
+INSTANTIATE_INT2MM(float, 256);
+INSTANTIATE_INT2MM(half, 256);
+#if __METAL_VERSION__ >= 310
+INSTANTIATE_INT2MM(bfloat, 32);
+INSTANTIATE_INT2MM(bfloat, 64);
+INSTANTIATE_INT2MM(bfloat, 128);
+INSTANTIATE_INT2MM(bfloat, 256);
+#endif
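
Two things make this kernel cheaper than the naive 2-bit loop above. First, the masked 2-bit lanes are never right-shifted: since (b & 0x0c) equals 4*w1, (b & 0x30) equals 16*w2 and (b & 0xc0) equals 64*w3, the activations are pre-multiplied by {1, 1/4, 1/16, 1/64} instead. Second, the zero point is factored out of the per-weight term using sum_k a_k*(s*w_k + z) = s*sum_k a_k*w_k + z*sum_k a_k, so it costs one extra multiply-add per float4 rather than one per weight. The simd_shuffle_down chain then reduces the 32 per-thread partials within a simdgroup, and only lane 0 of each channel group writes its float4 of outputs. Below is a standalone scalar C++ check of that algebra for a single packed byte; it is not torchao code, just a sanity check under the layout described above:

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
  const uint8_t b = 0b10011100;               // 2-bit lanes (w3, w2, w1, w0) = 2, 1, 3, 0
  const float a[4] = {0.5f, -1.25f, 2.0f, 3.0f};
  const float scale = 0.02f, zero = -0.7f;

  // Straightforward dequantize-then-multiply, as in the non-optimized kernel.
  float expected = 0.f;
  for (int j = 0; j < 4; ++j) {
    float w = float((b >> (2 * j)) & 0x3);
    expected += a[j] * (scale * w + zero);
  }

  // Shift-free version: (b & mask_j) == 4^j * w_j, so divide a[j] by 4^j instead,
  // and fold the zero point into a separate activation-sum term.
  const float inv_lane_scale[4] = {1.f, 1 / 4.f, 1 / 16.f, 1 / 64.f};
  const uint8_t mask[4] = {0x03, 0x0c, 0x30, 0xc0};
  float dot = 0.f, a_sum = 0.f;
  for (int j = 0; j < 4; ++j) {
    dot += (a[j] * inv_lane_scale[j]) * float(b & mask[j]);
    a_sum += a[j];
  }
  float optimized = scale * dot + zero * a_sum;

  assert(std::fabs(expected - optimized) < 1e-5f);
  std::printf("expected=%f optimized=%f\n", expected, optimized);
  return 0;
}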
