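Review: move the GGML_OP_CONV_2D_DW case labels next to GGML_OP_IM2COL in ggml-cuda.cu (switch ordering only), mark the conv2d-dw device helpers __forceinline__, and make the kernel's index locals const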

am17an · am17an · commit d64ba79da8be · 2025-06-19T20:27:24.000+08:00
diff --git a/ggml/src/ggml-cuda/conv2d-dw.cu b/ggml/src/ggml-cuda/conv2d-dw.cu
@@ -15,7 +15,7 @@ struct kernel_bounds {
     int x_min, x_max;
 };
 
-__device__ inline kernel_bounds calculate_kernel_bounds(int out_x, int out_y, const conv_params & params) {
+__device__ __forceinline__ kernel_bounds calculate_kernel_bounds(int out_x, int out_y, const conv_params & params) {
     kernel_bounds bounds;
     bounds.y_min = max(0, (params.padding_y - out_y * params.stride_y + params.dilation_y - 1) / params.dilation_y);
     bounds.y_max =
@@ -28,7 +28,7 @@ __device__ inline kernel_bounds calculate_kernel_bounds(int out_x, int out_y, co
     return bounds;
 }
 
-__device__ inline int calculate_input_coord(int out_coord, int kern_coord, int stride, int dilation, int padding) {
+__device__ __forceinline__ int calculate_input_coord(int out_coord, int kern_coord, int stride, int dilation, int padding) {
     return out_coord * stride + kern_coord * dilation - padding;
 }
 
@@ -84,8 +84,8 @@ __global__ void conv2d_dw_kernel(const T * __restrict__ input, const T * __restr
                                  const int kernel_w, const int kernel_h, const int stride_x, const int stride_y,
                                  const int padding_x, const int padding_y, const int dilation_x, const int dilation_y,
                                  const int channels, const int batches) {
-    int global_idx     = blockIdx.x * blockDim.x + threadIdx.x;
-    int total_elements = batches * channels * out_h * out_w;
+    const int global_idx     = blockIdx.x * blockDim.x + threadIdx.x;
+    const int total_elements = batches * channels * out_h * out_w;
 
     if (global_idx >= total_elements) {
         return;
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -2311,6 +2311,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_IM2COL:
             ggml_cuda_op_im2col(ctx, dst);
             break;
+        case GGML_OP_CONV_2D_DW:
+            ggml_cuda_op_conv2d_dw(ctx, dst);
+            break;
         case GGML_OP_CONV_TRANSPOSE_1D:
             ggml_cuda_op_conv_transpose_1d(ctx,dst);
             break;
@@ -2353,9 +2356,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_OPT_STEP_ADAMW:
             ggml_cuda_opt_step_adamw(ctx, dst);
             break;
-        case GGML_OP_CONV_2D_DW:
-            ggml_cuda_op_conv2d_dw(ctx, dst);
-            break;
         default:
             return false;
     }
@@ -3213,6 +3213,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
             return op->src[0]->nb[0] == ggml_type_size(op->src[0]->type) && ggml_is_contiguous_2(op->src[0]);
         }
         case GGML_OP_IM2COL:
+        case GGML_OP_CONV_2D_DW:
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM:
         case GGML_OP_SUM_ROWS:
@@ -3267,7 +3268,6 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_OP_CROSS_ENTROPY_LOSS:
         case GGML_OP_CROSS_ENTROPY_LOSS_BACK:
         case GGML_OP_OPT_STEP_ADAMW:
-        case GGML_OP_CONV_2D_DW:
             return true;
         default:
             return false;