@@ -1303,6 +1303,7 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1_impl(
 
 static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
     const block_q4_0 * bq4_0 = (const block_q4_0 *) vbq;
 
     int vi;
@@ -1313,7 +1314,9 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1(
     return vec_dot_q4_0_q8_1_impl(vi, ui0, ui1, __half2float(bq4_0->d), __half2float(bq8_1->d));
 }
 
-static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+static __device__ __forceinline__ float vec_dot_q4_1_q8_1(
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_1 * bq4_1 = (const block_q4_1 *) vbq;
 
@@ -1340,6 +1343,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1(const void * __restric
 
 static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_0 * bq5_0 = (const block_q5_0 *) vbq;
 
@@ -1376,6 +1380,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1(
 
 static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_1 * bq5_1 = (const block_q5_1 *) vbq;
 
@@ -1411,6 +1416,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1(
 
 static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
     const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
+
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q8_0 * bq8_0 = (const block_q8_0 *) vbq;
 
@@ -1430,7 +1436,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1(
 }
 
 static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q2_K * bq2_K = (const block_q2_K *) vbq;
@@ -1466,7 +1472,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1(
 }
 
 static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q3_K * bq3_K = (const block_q3_K *) vbq;
@@ -1519,7 +1525,7 @@ static __device__ __forceinline__ float vec_dot_q3_K_q8_1(
 }
 
 static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q4_K * bq4_K = (const block_q4_K *) vbq;
@@ -1557,7 +1563,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1(
 }
 
 static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q5_K * bq5_K = (const block_q5_K *) vbq;
@@ -1601,7 +1607,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1(
 }
 
 static __device__ __forceinline__ float vec_dot_q6_K_q8_1(
-    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int iqs) {
+    const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) {
 
 #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics
     const block_q6_K * bq6_K = (const block_q6_K *) vbq;
0 commit comments
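All of these functions guard their fast path with __CUDA_ARCH__ >= MIN_CC_DP4A because it is built on the __dp4a intrinsic, which multiplies four packed int8 lanes and accumulates into a 32-bit integer in a single instruction. A minimal, self-contained sketch of that pattern, assuming MIN_CC_DP4A expands to 610 (compute capability 6.1, the first to ship __dp4a); the helper name dot8 is hypothetical:

#include <cstdint>

#define MIN_CC_DP4A 610 // assumed value: CC 6.1 is the first with __dp4a

// Hypothetical helper illustrating the core of the vec_dot_*_q8_1 fast paths:
// returns acc + the dot product of the four int8 lanes packed into each operand.
static __device__ __forceinline__ int dot8(const int a_packed, const int b_packed, const int acc) {
#if __CUDA_ARCH__ >= MIN_CC_DP4A
    return __dp4a(a_packed, b_packed, acc); // single-instruction 4x int8 multiply-accumulate
#else
    // Fallback for older architectures: unpack each byte as signed int8 and multiply manually.
    int sum = acc;
    for (int i = 0; i < 4; ++i) {
        const int a = (int8_t)((a_packed >> (8*i)) & 0xff);
        const int b = (int8_t)((b_packed >> (8*i)) & 0xff);
        sum += a * b;
    }
    return sum;
#endif
}

In the functions above, the packed operands come from the quantized block's nibbles or bytes and from bq8_1->qs, and the integer result is then scaled by the two per-block scales, as the __half2float(bq4_0->d) and __half2float(bq8_1->d) factors in the q4_0 hunk show.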