@@ -60,6 +60,16 @@ typedef float dfloat; // dequantize float
6060typedef float2 dfloat2;
6161#endif // GGML_CUDA_DMMV_F16
6262
// Load 4 consecutive int8 quant values starting at byte offset 4*i32 and pack
// them into one 32-bit integer, using two 16-bit loads instead of a single
// 32-bit load so that only 2-byte alignment of x8 is required.
// The halves are combined in little-endian order (x16[0] -> low 16 bits),
// which matches a plain 32-bit load on NVIDIA GPUs.
static __device__ __forceinline__ int get_int_from_int8(const int8_t * x8, const int & i32) {
    const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment

    int x32 = 0;
    x32 |= x16[0] <<  0;
    x32 |= x16[1] << 16;

    return x32;
}
72+
6373static __device__ __forceinline__ int get_int_from_uint8 (const uint8_t * x8, const int & i32 ) {
6474 const uint16_t * x16 = (uint16_t *) (x8 + sizeof (int ) * i32 ); // assume at least 2 byte alignment
6575
@@ -1602,27 +1612,30 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat(
16021612 y_qs[j * (2 *WARP_SIZE) + kyqs + (QI8_1/2 )], x_dm[index_bx], y_ds[j * (2 *WARP_SIZE/QI8_1) + 2 *k/QI8_1]);
16031613}
16041614
// Dot product of one 4-byte group of q8_0 quants (vi) with the corresponding
// group of q8_1 quants (ui), scaled by both blocks' scales.
//   vi    - 4 packed int8 quant values from the q8_0 block
//   ui    - 4 packed int8 quant values from the q8_1 block
//   d8_0  - scale of the q8_0 block
//   ds8_1 - scale/sum pair of the q8_1 block; only the scale (.x) is used here
// Requires __dp4a, i.e. __CUDA_ARCH__ >= MIN_CC_DP4A; on older architectures
// this path is never taken and the function returns a dummy 0.0f.
static __device__ __forceinline__ float vec_dot_q8_0_q8_1_impl(
    const int & vi, const int & ui, const half & d8_0, const half2 & ds8_1) {

#if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
    // SIMD dot product of quantized values
    const int sumi = __dp4a(vi, ui, 0);

    return sumi * __half2float(d8_0) * __half2float(ds8_1.x);
#else
    return 0.0f; // only to satisfy the compiler
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
}
16251627
// Dot product of one q8_0 block (passed as opaque vbq) with one q8_1 block,
// for the 4-value group selected by iqs. Unpacks both operands into packed
// 32-bit ints and defers the arithmetic to vec_dot_q8_0_q8_1_impl.
// NOTE(review): get_int_from_int8_aligned is defined elsewhere in this file;
// presumably it assumes 4-byte alignment of bq8_1->qs — confirm at its definition.
static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {

    const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;

    const int vi = get_int_from_int8(bq8_0->qs, iqs);
    const int ui = get_int_from_int8_aligned(bq8_1->qs, iqs);

    return vec_dot_q8_0_q8_1_impl(vi, ui, bq8_0->d, bq8_1->ds);
}
1638+
16261639static __device__ __forceinline__ float vec_dot_q2_K_q8_1 (
16271640 const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
16281641
0 commit comments