@@ -204,23 +204,31 @@ typedef void (*ggml_cuda_op_t)(
204204// QR = QK / number of values before dequantization
205205// QI = number of 32 bit integers before dequantization
206206
207+ #define Q4_0DM (1 .0f /8 .0f )
208+ #define Q4_0D (x ) (((x)*Q4_0DM) / 127 .0f )
209+
207210#define QK4_0 32
208211#define QR4_0 2
209212#define QI4_0 (QK4_0 / (4 * QR4_0))
210213typedef struct {
211- half d; // delta
214+ int8_t d; // delta
212215 uint8_t qs[QK4_0 / 2 ]; // nibbles / quants
213216} block_q4_0;
214- static_assert (sizeof (block_q4_0) == sizeof (ggml_fp16_t ) + QK4_0 / 2 , " wrong q4_0 block size/padding" );
217+ static_assert (sizeof (block_q4_0) == sizeof (int8_t ) + QK4_0 / 2 , " wrong q4_0 block size/padding" );
218+
219+ #define Q4_1DM (2 .0f /15 .0f )
220+ #define Q4_1MM (2 .0f )
221+ #define Q4_1D (x ) ( (((x) & 0xFF )*Q4_1DM) / 255 .0f )
222+ #define Q4_1M (x ) (-1 .0f + (((x) >> 8 )*Q4_1MM) / 255 .0f )
215223
216224#define QK4_1 32
217225#define QR4_1 2
218226#define QI4_1 (QK4_1 / (4 * QR4_1))
219227typedef struct {
220- half2 dm; // dm.x = delta, dm.y = min
221- uint8_t qs[QK4_1 / 2 ]; // nibbles / quants
228+ uint16_t dm; // 8-bit delta + 8-bit min (can be adjusted easily)
229+ uint8_t qs[QK4_1 / 2 ]; // nibbles / quants
222230} block_q4_1;
223- static_assert (sizeof (block_q4_1) == sizeof (ggml_fp16_t ) * 2 + QK4_1 / 2 , " wrong q4_1 block size/padding" );
231+ static_assert (sizeof (block_q4_1) == sizeof (uint16_t ) + QK4_1 / 2 , " wrong q4_1 block size/padding" );
224232
225233#define QK5_0 32
226234#define QR5_0 2
@@ -232,15 +240,20 @@ typedef struct {
232240} block_q5_0;
233241static_assert (sizeof (block_q5_0) == sizeof (ggml_fp16_t ) + sizeof (uint32_t ) + QK5_0 / 2 , " wrong q5_0 block size/padding" );
234242
243+ #define Q5_1DM (2 .0f /31 .0f )
244+ #define Q5_1MM (2 .0f )
245+ #define Q5_1D (x ) ( (((x) & 0x0F )*Q5_1DM) / 15 .0f )
246+ #define Q5_1M (x ) (-1 .0f + (((x) >> 4 )*Q5_1MM) / 15 .0f )
247+
235248#define QK5_1 32
236249#define QR5_1 2
237250#define QI5_1 (QK5_1 / (4 * QR5_1))
238251typedef struct {
239- half2 dm; // dm.x = delta, dm.y = min
252+ uint8_t dm; // 4-bit delta + 4-bit min
240253 uint8_t qh[4 ]; // 5-th bit of quants
241254 uint8_t qs[QK5_1 / 2 ]; // nibbles / quants
242255} block_q5_1;
243- static_assert (sizeof (block_q5_1) == 2 * sizeof (ggml_fp16_t ) + sizeof (uint32_t ) + QK5_1 / 2 , " wrong q5_1 block size/padding" );
256+ static_assert (sizeof (block_q5_1) == sizeof (uint8_t ) + sizeof (uint32_t ) + QK5_1 / 2 , " wrong q5_1 block size/padding" );
244257
245258#define QK8_0 32
246259#define QR8_0 1
@@ -506,7 +519,7 @@ static __global__ void rms_norm_f32(const float * x, float * dst, const int ncol
506519static __device__ __forceinline__ void dequantize_q4_0 (const void * vx, const int ib, const int iqs, dfloat2 & v){
507520 const block_q4_0 * x = (const block_q4_0 *) vx;
508521
509- const dfloat d = x[ib].d ;
522+ const dfloat d = Q4_0D ( x[ib].d ) ;
510523
511524 const int vui = x[ib].qs [iqs];
512525
@@ -525,8 +538,8 @@ static __device__ __forceinline__ void dequantize_q4_0(const void * vx, const in
525538static __device__ __forceinline__ void dequantize_q4_1 (const void * vx, const int ib, const int iqs, dfloat2 & v){
526539 const block_q4_1 * x = (const block_q4_1 *) vx;
527540
528- const dfloat d = __low2half (x[ib].dm );
529- const dfloat m = __high2half (x[ib].dm );
541+ const dfloat d = Q4_1D (x[ib].dm );
542+ const dfloat m = Q4_1M (x[ib].dm );
530543
531544 const int vui = x[ib].qs [iqs];
532545
@@ -568,8 +581,8 @@ static __device__ __forceinline__ void dequantize_q5_0(const void * vx, const in
568581static __device__ __forceinline__ void dequantize_q5_1 (const void * vx, const int ib, const int iqs, dfloat2 & v){
569582 const block_q5_1 * x = (const block_q5_1 *) vx;
570583
571- const dfloat d = __low2half (x[ib].dm );
572- const dfloat m = __high2half (x[ib].dm );
584+ const dfloat d = Q5_1D (x[ib].dm );
585+ const dfloat m = Q5_1M (x[ib].dm );
573586
574587 uint32_t qh;
575588 memcpy (&qh, x[ib].qh , sizeof (qh));
@@ -2041,7 +2054,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
20412054 u[2 *i+1 ] = get_int_from_int8_aligned (bq8_1->qs , iqs + i + QI4_0);
20422055 }
20432056
2044- return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, bq4_0->d , bq8_1->ds );
2057+ return vec_dot_q4_0_q8_1_impl<VDR_Q4_0_Q8_1_MMVQ>(v, u, Q4_0D ( bq4_0->d ) , bq8_1->ds );
20452058}
20462059
20472060template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_0 (int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
@@ -2135,7 +2148,12 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
21352148 u[2 *i+1 ] = get_int_from_int8_aligned (bq8_1->qs , iqs + i + QI4_1);
21362149 }
21372150
2138- return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, bq4_1->dm , bq8_1->ds );
2151+ const float d = Q4_1D (bq4_1->dm );
2152+ const float m = Q4_1M (bq4_1->dm );
2153+
2154+ const float2 dm = {d, m};
2155+
2156+ return vec_dot_q4_1_q8_1_impl<VDR_Q4_1_Q8_1_MMVQ>(v, u, dm, bq8_1->ds );
21392157}
21402158
21412159template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q4_1 (int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
@@ -2341,7 +2359,12 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
23412359 u[2 *i+1 ] = get_int_from_int8_aligned (bq8_1->qs , iqs + i + QI5_1);
23422360 }
23432361
2344- return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, bq5_1->dm , bq8_1->ds );
2362+ const float d = Q5_1D (bq4_1->dm );
2363+ const float m = Q5_1M (bq4_1->dm );
2364+
2365+ const float2 dm = {d, m};
2366+
2367+ return vec_dot_q5_1_q8_1_impl<VDR_Q5_1_Q8_1_MMVQ>(vl, vh, u, dm, bq8_1->ds );
23452368}
23462369
23472370template <int mmq_y> static __device__ __forceinline__ void allocate_tiles_q5_1 (int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) {
0 commit comments