@@ -3536,9 +3536,7 @@ static size_t g_scratch_offset = 0;
35363536
35373537static int g_device_count = -1 ;
35383538static int g_main_device = 0 ;
3539- #ifndef GGML_CUDA_FORCE_DMMV
35403539static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
3541- #endif
35423540static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0 };
35433541
35443542static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr };
@@ -3561,9 +3559,7 @@ void ggml_init_cublas() {
35613559 g_tensor_split[id] = total_vram;
35623560 total_vram += prop.totalGlobalMem ;
35633561
3564- #ifndef GGML_CUDA_FORCE_DMMV
35653562 g_compute_capabilities[id] = 100 *prop.major + 10 *prop.minor ;
3566- #endif
35673563 }
35683564 for (int id = 0 ; id < g_device_count; ++id) {
35693565 g_tensor_split[id] /= total_vram;
@@ -3916,6 +3912,7 @@ inline void ggml_cuda_op_mul_mat_vec(
39163912
39173913#ifdef GGML_CUDA_FORCE_DMMV
39183914 const bool use_mul_mat_vec_q = false ;
3915+ (void ) g_compute_capabilities[0 ];
39193916#else
39203917 int id;
39213918 CUDA_CHECK (cudaGetDevice (&id));
@@ -4659,8 +4656,16 @@ void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_
46594656 } else {
46604657#ifdef GGML_CUDA_CUBLAS
46614658 const bool use_mul_mat_q = false ;
4659+ (void ) g_compute_capabilities[0 ];
46624660#else
4663- const bool use_mul_mat_q = ggml_is_quantized (src0->type );
4661+ int min_compute_capability = 1000000 ;
4662+ for (int id = 0 ; id < g_device_count; ++id) {
4663+ if (min_compute_capability > g_compute_capabilities[id]) {
4664+ min_compute_capability = g_compute_capabilities[id];
4665+ }
4666+ }
4667+
4668+ const bool use_mul_mat_q = ggml_is_quantized (src0->type ) && min_compute_capability >= MIN_CC_DP4A;
46644669#endif // GGML_CUDA_CUBLAS
46654670 if (use_mul_mat_q) {
46664671 ggml_cuda_op (src0, src1, dst, ggml_cuda_op_mul_mat_q, false , false );
0 commit comments