@@ -106,6 +106,24 @@ static __device__ void dequantize_q4_1(const void * vx, const int ib, const int
106106 v1 = vi1*d + m;
107107}
108108
109+ static __device__ void dequantize_q5_0 (const void * vx, const int ib, const int iqs, float & v0, float & v1){
110+ const block_q5_0 * x = (const block_q5_0 *) vx;
111+
112+ const float d = x[ib].d ;
113+
114+ uint32_t qh;
115+ memcpy (&qh, x[ib].qh , sizeof (qh));
116+
117+ const uint8_t xh_0 = ((qh >> (iqs + 0 )) << 4 ) & 0x10 ;
118+ const uint8_t xh_1 = ((qh >> (iqs + 12 )) ) & 0x10 ;
119+
120+ const int32_t x0 = ((x[ib].qs [iqs] & 0xf ) | xh_0) - 16 ;
121+ const int32_t x1 = ((x[ib].qs [iqs] >> 4 ) | xh_1) - 16 ;
122+
123+ v0 = x0*d;
124+ v1 = x1*d;
125+ }
126+
109127static __global__ void dequantize_block_q4_0 (const void * vx, float * y) {
110128 static const int qk = QK4_0;
111129
@@ -277,6 +295,11 @@ static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, f
277295 dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK4_1, dequantize_q4_1><<<nrows, CUDA_DMMV_BLOCK_SIZE, 0 , stream>>> (vx, y, dst, ncols);
278296}
279297
298+ static void dequantize_mul_mat_vec_q5_0_cuda (const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
299+ GGML_ASSERT (ncols % CUDA_DMMV_BLOCK_SIZE == 0 );
300+ dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK5_0, dequantize_q5_0><<<nrows, CUDA_DMMV_BLOCK_SIZE, 0 , stream>>> (vx, y, dst, ncols);
301+ }
302+
280303// TODO: optimize
281304static __global__ void convert_fp16_to_fp32 (const void * vx, float * y) {
282305 const half * x = (const half *) vx;
@@ -315,6 +338,8 @@ static dequantize_mul_mat_vec_cuda_t ggml_get_dequantize_mul_mat_vec_cuda(ggml_t
315338 return dequantize_mul_mat_vec_q4_0_cuda;
316339 case GGML_TYPE_Q4_1:
317340 return dequantize_mul_mat_vec_q4_1_cuda;
341+ case GGML_TYPE_Q5_0:
342+ return dequantize_mul_mat_vec_q5_0_cuda;
318343 default :
319344 return nullptr ;
320345 }
0 commit comments