@@ -8033,7 +8033,7 @@ static void ggml_compute_forward_mul_mat_f32(
 #if defined(GGML_USE_CUBLAS)
         const float alpha = 1.0f;
         const float beta = 0.0f;
-        const int x_ne = ne01 * ne10;
+        const int x_ne = ne01 * ne00;
         const int y_ne = ne11 * ne10;
         const int d_ne = ne11 * ne01;
 
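For context on the size change: ggml's mul_mat requires src0->ne[0] == src1->ne[0] (ne00 == ne10), so ne01 * ne10 and ne01 * ne00 evaluate to the same value here; the new form simply sizes the src0 staging buffer in terms of src0's own dimensions. A minimal restatement of the shape bookkeeping, with names matching ggml.c (the assert is illustrative, not part of the hunk):

    // per 2-D slice, for dst = ggml_mul_mat(src0, src1):
    //   src0: ne00 x ne01, src1: ne10 x ne11, dst: ne01 x ne11
    GGML_ASSERT(ne00 == ne10);      // precondition enforced by ggml_can_mul_mat
    const int x_ne = ne01 * ne00;   // elements of one src0 slice
    const int y_ne = ne11 * ne10;   // elements of one src1 slice
    const int d_ne = ne11 * ne01;   // elements of one dst slice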
@@ -8235,25 +8235,27 @@ static void ggml_compute_forward_mul_mat_f16_f32(
         }
 
 #if defined(GGML_USE_CUBLAS)
-        ggml_fp16_t * const wdata = params->wdata;
-
         const float alpha = 1.0f;
         const float beta = 0.0f;
-        const int x_ne = ne01 * ne10;
+        const int x_ne = ne01 * ne00;
         const int y_ne = ne11 * ne10;
         const int d_ne = ne11 * ne01;
 
         size_t x_size, y_size, d_size;
-        float * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
-        float * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
-        float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
+        ggml_fp16_t * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
+        ggml_fp16_t * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
+        float       * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
 #else
         float * const wdata = params->wdata;
 #endif
         for (int64_t i03 = 0; i03 < ne03; i03++) {
             for (int64_t i02 = 0; i02 < ne02; i02++) {
 #if defined(GGML_USE_CUBLAS)
+                // copy src0 while converting src1
+                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream));
+
                 // with cuBLAS, instead of converting src0 to fp32, we convert src1 to fp16
+                ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + (ne11 * ne10) * (i03 * ne02 + i02);
                 {
                     size_t id = 0;
                     for (int64_t i01 = 0; i01 < ne11; ++i01) {
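The point of issuing ggml_cuda_h2d_tensor_2d(d_X, ...) before the conversion block is overlap: the upload is asynchronous with respect to the host, so the CPU converts src1 to fp16 while the src0 transfer is in flight (true overlap also assumes the host buffers are pinned). The conversion loop is cut off by the hunk; based on the surrounding code it presumably continues like this (a reconstruction, not part of the diff):

                {
                    size_t id = 0;
                    for (int64_t i01 = 0; i01 < ne11; ++i01) {
                        for (int64_t i00 = 0; i00 < ne10; ++i00) {
                            // read one fp32 value of src1 and store it as fp16
                            wdata[id++] = GGML_FP32_TO_FP16(*(float *)
                                ((char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10));
                        }
                    }
                }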
@@ -8275,11 +8277,9 @@ static void ggml_compute_forward_mul_mat_f16_f32(
 
 #if defined(GGML_USE_CUBLAS)
                 const ggml_fp16_t * y = (ggml_fp16_t *) wdata;
-
                 float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
 
                 // copy data to device
-                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_X, src0, i03, i02, g_cudaStream));
                 CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
 
                 // compute
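Since d_X and d_Y are now fp16 on the device, the matching GEMM (below the excerpt) cannot be the plain cublasSgemm used by the f32 path; it would have to be a mixed-precision cublasGemmEx with fp16 inputs, fp32 output, and fp32 accumulation, along these lines (a sketch consistent with the float alpha/beta declared above, not the verbatim source):

                // fp16 A and B, fp32 C, accumulate in fp32
                CUBLAS_CHECK(
                    cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
                            ne01, ne11, ne10,
                            &alpha, d_X, CUDA_R_16F, ne00,
                                    d_Y, CUDA_R_16F, ne10,
                            &beta,  d_D, CUDA_R_32F, ne01,
                            CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT));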
@@ -8498,39 +8498,19 @@ static void ggml_compute_forward_mul_mat_q_f32(
 #if defined(GGML_USE_CUBLAS)
         const float alpha = 1.0f;
         const float beta = 0.0f;
-        const int x_ne = ne01 * ne10;
+        const int x_ne = ne01 * ne00;
         const int y_ne = ne11 * ne10;
         const int d_ne = ne11 * ne01;
 
         size_t x_size, y_size, d_size, q_size;
-        float * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
-        float * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
-        float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
-        float * d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size);
+        float * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
+        float * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
+        float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
+        void  * d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size);
 
-        void (*dequantize_row_q_cuda)(const void * x, float * y, int k, cudaStream_t stream) = NULL;
-        if (type == GGML_TYPE_Q4_0) {
-            dequantize_row_q_cuda = dequantize_row_q4_0_cuda;
-        }
-        else if (type == GGML_TYPE_Q4_1) {
-            dequantize_row_q_cuda = dequantize_row_q4_1_cuda;
-        }
-        else if (type == GGML_TYPE_Q4_2) {
-            dequantize_row_q_cuda = dequantize_row_q4_2_cuda;
-        }
-        else if (type == GGML_TYPE_Q5_0) {
-            dequantize_row_q_cuda = dequantize_row_q5_0_cuda;
-        }
-        else if (type == GGML_TYPE_Q5_1) {
-            dequantize_row_q_cuda = dequantize_row_q5_1_cuda;
-        }
-        else if (type == GGML_TYPE_Q8_0) {
-            dequantize_row_q_cuda = dequantize_row_q8_0_cuda;
-        }
-        else {
-            GGML_ASSERT(false);
-        }
-#elif !defined(GGML_USE_CLBLAST)
+        const dequantize_row_q_cuda_t dequantize_row_q_cuda = ggml_get_dequantize_row_q_cuda(type);
+        GGML_ASSERT(dequantize_row_q_cuda != NULL);
+#else
         float * const wdata = params->wdata;
         dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
 #endif
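The removed if/else ladder is folded into a lookup helper. Its definition is not part of this diff, but given the dequantize_row_q_cuda_t typedef and the kernel names it replaces, it is presumably a switch of this shape (a sketch, not the verbatim source):

    typedef void (*dequantize_row_q_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);

    static dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(enum ggml_type type) {
        switch (type) {
            case GGML_TYPE_Q4_0: return dequantize_row_q4_0_cuda;
            case GGML_TYPE_Q4_1: return dequantize_row_q4_1_cuda;
            case GGML_TYPE_Q4_2: return dequantize_row_q4_2_cuda;
            case GGML_TYPE_Q5_0: return dequantize_row_q5_0_cuda;
            case GGML_TYPE_Q5_1: return dequantize_row_q5_1_cuda;
            case GGML_TYPE_Q8_0: return dequantize_row_q8_0_cuda;
            default:             return NULL;
        }
    }

Returning NULL instead of asserting inside the helper lets the caller keep the GGML_ASSERT at the call site, as the new code does.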
@@ -8543,10 +8523,11 @@ static void ggml_compute_forward_mul_mat_q_f32(
 
 #if defined(GGML_USE_CUBLAS)
                 // copy and dequantize on device
-                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, src0, i03, i02, g_cudaStream));
+                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q, src0, i03, i02, g_cudaStream2));
 
-                dequantize_row_q_cuda(d_Q, d_X, ne01 * ne00, g_cudaStream);
+                dequantize_row_q_cuda(d_Q, d_X, x_ne, g_cudaStream2);
                 CUDA_CHECK(cudaGetLastError());
+                CUDA_CHECK(cudaEventRecord(g_cudaEvent, g_cudaStream2));
 #elif defined(GGML_USE_CLBLAST)
                 const void * x = (char *) src0->data + i03*nb03 + i02*nb02;
 #else
@@ -8560,11 +8541,13 @@ static void ggml_compute_forward_mul_mat_q_f32(
                 const float * x = wdata;
 #endif
 
-
 #if defined(GGML_USE_CUBLAS)
                 // copy data to device
                 CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream));
 
+                // wait for dequantization
+                CUDA_CHECK(cudaStreamWaitEvent(g_cudaStream, g_cudaEvent, 0));
+
                 // compute
                 CUBLAS_CHECK(
                     cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
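These two hunks split the work across two CUDA streams: the quantized src0 upload and its dequantization go to g_cudaStream2 while the src1 upload runs on g_cudaStream, and the recorded event orders the GEMM after the dequantization without blocking the host. The pattern in isolation (creation of the stream and event globals happens elsewhere in the file; the flags shown here are assumptions):

    cudaStream_t stream, stream2;
    cudaEvent_t  event;
    CUDA_CHECK(cudaStreamCreateWithFlags(&stream,  cudaStreamNonBlocking));
    CUDA_CHECK(cudaStreamCreateWithFlags(&stream2, cudaStreamNonBlocking));
    CUDA_CHECK(cudaEventCreateWithFlags(&event, cudaEventDisableTiming));

    // stream2: upload quantized weights, then launch the dequantize kernel
    // stream : upload the activations concurrently
    CUDA_CHECK(cudaEventRecord(event, stream2));        // marks end of dequantization
    CUDA_CHECK(cudaStreamWaitEvent(stream, event, 0));  // GEMM stream waits for it
    // work issued on `stream` from here on (the cublasSgemm above) runs only
    // after everything recorded on stream2 has completed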
@@ -11588,7 +11571,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                     if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                         node->n_tasks = 1; // TODO: this actually is doing nothing
                                            //       the threads are still spinning
-                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
+                        cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*MAX(ggml_nelements(node->src1), ggml_nelements(node->src0));
                         //printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]);
                         //printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]);
                         //printf("cur = %zu\n", cur);
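The old sizing only accounted for converting src0 to fp32 (nelements(src0) floats). With the cuBLAS f16 path above, wdata instead holds all of src1 converted to fp16, indexed per (i02, i03) slice, so the buffer must also cover nelements(src1); taking MAX of the two in F32 units over-provisions the fp16 case by 2x but safely covers both staging strategies. A worked example with made-up dimensions:

    // hypothetical: 4096 x 4096 fp16 weight, 4096 x 512 fp32 activations
    //   ggml_nelements(src0) = 4096*4096 = 16777216
    //   ggml_nelements(src1) = 4096*512  =  2097152
    // cur = 4 * MAX(2097152, 16777216) = 64 MiB of scratch, enough for either
    // the fp32-converted src0 or the fp16-converted src1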
@@ -11600,6 +11583,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 #endif
                 } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
                     cur = 0;
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+                    if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
+                        node->n_tasks = 1;
+                    }
+#endif
                 } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
                     if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
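For reference, the predicate gating all of these branches, ggml_compute_forward_mul_mat_use_blas, looks roughly like this in ggml.c of the same vintage (quoted from memory, so treat as approximate):

    static bool ggml_compute_forward_mul_mat_use_blas(
            const struct ggml_tensor * src0,
            const struct ggml_tensor * src1,
                  struct ggml_tensor * dst) {
        const int64_t ne10 = src1->ne[0];
        const int64_t ne0  = dst->ne[0];
        const int64_t ne1  = dst->ne[1];

        // BLAS only pays off for large enough, contiguous matrices
        return ggml_is_contiguous(src0) && ggml_is_contiguous(src1) &&
               (ne0 >= 32 && ne1 >= 32 && ne10 >= 32);
    }

The new F32 x F32 branch sets n_tasks = 1 so the extra threads stand down when BLAS takes over, but leaves cur = 0: unlike the f16 and quantized paths, no conversion scratch is needed there.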