From f3fb5dea74471a1c2d957c385a011fe221b7b03e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sun, 26 Oct 2025 19:33:58 +0100
Subject: [PATCH 1/2] use fast copy when src and dst are contiguous and same shape

---
 ggml/src/ggml-cuda/cpy.cu | 81 +++++++++++++++++++++++++++++++++------
 1 file changed, 70 insertions(+), 11 deletions(-)

diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index 12d5bf7763c38..d100ce225dbe5 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -112,6 +112,30 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
     cpy_blck(cx + x_offset, cdst + dst_offset);
 }
 
+template<typename src_t, typename dst_t>
+static __global__ void cpy_flt_cont_shape(const char * cx, char * cdst, const int ne) {
+    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= ne) {
+        return;
+    }
+
+    const src_t * x = (const src_t *) cx;
+    dst_t * dst = (dst_t *) cdst;
+
+    dst[i] = ggml_cuda_cast<dst_t>(x[i]);
+}
+
+template<typename src_t, typename dst_t>
+static void ggml_cpy_flt_cont_shape_cuda(
+    const char * cx, char * cdst, const int ne,
+cudaStream_t stream) {
+
+    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_flt_cont_shape<src_t, dst_t><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+        (cx, cdst, ne);
+}
+
 template<typename src_t, typename dst_t>
 static void ggml_cpy_flt_cuda(
     const char * cx, char * cdst, const int ne,
@@ -285,7 +309,10 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
     char * src0_ddc = (char *) src0->data;
     char * src1_ddc = (char *) src1->data;
 
-    if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
+    const bool contiguous_srcs = ggml_is_contiguous(src0) && ggml_is_contiguous(src1);
+    const bool cont_shape_srcs = contiguous_srcs && ggml_are_same_shape(src0, src1);
+
+    if (src0->type == src1->type && contiguous_srcs) {
         GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
 #if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
         if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
@@ -296,11 +323,19 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
             CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
         }
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
-        ggml_cpy_flt_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        if (cont_shape_srcs) {
+            ggml_cpy_flt_cont_shape_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, main_stream);
+        } else {
+            ggml_cpy_flt_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        }
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_flt_cuda<float, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        if (cont_shape_srcs) {
+            ggml_cpy_flt_cont_shape_cuda<float, half> (src0_ddc, src1_ddc, ne, main_stream);
+        } else {
+            ggml_cpy_flt_cuda<float, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        }
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
         ggml_cpy_f32_q8_0_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_Q8_0 && src1->type == GGML_TYPE_F32) {
@@ -327,21 +362,45 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
     } else if (src0->type == GGML_TYPE_Q5_1 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_q5_1_f32_cuda(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_flt_cuda<half, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        ggml_cpy_flt_cuda<half, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_BF16) {
-        ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        if (cont_shape_srcs) {
+            ggml_cpy_flt_cont_shape_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, main_stream);
+        } else {
+            ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        }
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        if (cont_shape_srcs) {
+            ggml_cpy_flt_cont_shape_cuda<half, float> (src0_ddc, src1_ddc, ne, main_stream);
+        } else {
+            ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        }
     } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
         ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
-        ggml_cpy_flt_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        if (cont_shape_srcs) {
+            ggml_cpy_flt_cont_shape_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, main_stream);
+        } else {
+            ggml_cpy_flt_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        }
     } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_flt_cuda<nv_bfloat16, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        if (cont_shape_srcs) {
+            ggml_cpy_flt_cont_shape_cuda<nv_bfloat16, float> (src0_ddc, src1_ddc, ne, main_stream);
+        } else {
+            ggml_cpy_flt_cuda<nv_bfloat16, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        }
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_I32) {
-        ggml_cpy_flt_cuda<float, int32_t> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        if (cont_shape_srcs) {
+            ggml_cpy_flt_cont_shape_cuda<float, int32_t> (src0_ddc, src1_ddc, ne, main_stream);
+        } else {
+            ggml_cpy_flt_cuda<float, int32_t> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        }
     } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_F32) {
-        ggml_cpy_flt_cuda<int32_t, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        if (cont_shape_srcs) {
+            ggml_cpy_flt_cont_shape_cuda<int32_t, float> (src0_ddc, src1_ddc, ne, main_stream);
+        } else {
+            ggml_cpy_flt_cuda<int32_t, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
+        }
     } else {
         GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                    ggml_type_name(src0->type), ggml_type_name(src1->type));

From 6ffdcc5c22ce6b919da60e8768f047b8ee4c8929 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?=
Date: Sun, 26 Oct 2025 20:12:35 +0100
Subject: [PATCH 2/2] use int64_t ne and ignore shape

---
 ggml/src/ggml-cuda/cpy.cu | 43 +++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/ggml/src/ggml-cuda/cpy.cu b/ggml/src/ggml-cuda/cpy.cu
index d100ce225dbe5..c5821acbdeb8a 100644
--- a/ggml/src/ggml-cuda/cpy.cu
+++ b/ggml/src/ggml-cuda/cpy.cu
@@ -113,7 +113,7 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst, const int ne,
 }
 
 template<typename src_t, typename dst_t>
-static __global__ void cpy_flt_cont_shape(const char * cx, char * cdst, const int ne) {
+static __global__ void cpy_flt_contiguous(const char * cx, char * cdst, const int64_t ne) {
     const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= ne) {
@@ -127,12 +127,12 @@ static __global__ void cpy_flt_cont_shape(const char * cx, char * cdst, const in
 }
 
 template<typename src_t, typename dst_t>
-static void ggml_cpy_flt_cont_shape_cuda(
-    const char * cx, char * cdst, const int ne,
+static void ggml_cpy_flt_contiguous_cuda(
+    const char * cx, char * cdst, const int64_t ne,
 cudaStream_t stream) {
 
-    const int num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
-    cpy_flt_cont_shape<src_t, dst_t><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
+    const int64_t num_blocks = (ne + CUDA_CPY_BLOCK_SIZE - 1) / CUDA_CPY_BLOCK_SIZE;
+    cpy_flt_contiguous<src_t, dst_t><<<num_blocks, CUDA_CPY_BLOCK_SIZE, 0, stream>>>
         (cx, cdst, ne);
 }
 
@@ -310,7 +310,6 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
     char * src1_ddc = (char *) src1->data;
 
     const bool contiguous_srcs = ggml_is_contiguous(src0) && ggml_is_contiguous(src1);
-    const bool cont_shape_srcs = contiguous_srcs && ggml_are_same_shape(src0, src1);
 
     if (src0->type == src1->type && contiguous_srcs) {
         GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
@@ -325,14 +324,14 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_flt_cuda<float, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {
-        if (cont_shape_srcs) {
-            ggml_cpy_flt_cont_shape_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, main_stream);
+        if (contiguous_srcs) {
+            ggml_cpy_flt_contiguous_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, main_stream);
         } else {
             ggml_cpy_flt_cuda<float, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
         }
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
-        if (cont_shape_srcs) {
-            ggml_cpy_flt_cont_shape_cuda<float, half> (src0_ddc, src1_ddc, ne, main_stream);
+        if (contiguous_srcs) {
+            ggml_cpy_flt_contiguous_cuda<float, half> (src0_ddc, src1_ddc, ne, main_stream);
         } else {
             ggml_cpy_flt_cuda<float, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
         }
@@ -364,40 +363,40 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
         ggml_cpy_flt_cuda<half, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_BF16) {
-        if (cont_shape_srcs) {
-            ggml_cpy_flt_cont_shape_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, main_stream);
+        if (contiguous_srcs) {
+            ggml_cpy_flt_contiguous_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, main_stream);
         } else {
             ggml_cpy_flt_cuda<half, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
         }
     } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
-        if (cont_shape_srcs) {
-            ggml_cpy_flt_cont_shape_cuda<half, float> (src0_ddc, src1_ddc, ne, main_stream);
+        if (contiguous_srcs) {
+            ggml_cpy_flt_contiguous_cuda<half, float> (src0_ddc, src1_ddc, ne, main_stream);
         } else {
             ggml_cpy_flt_cuda<half, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
         }
     } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16) {
         ggml_cpy_flt_cuda<nv_bfloat16, nv_bfloat16> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
     } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F16) {
-        if (cont_shape_srcs) {
-            ggml_cpy_flt_cont_shape_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, main_stream);
+        if (contiguous_srcs) {
+            ggml_cpy_flt_contiguous_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, main_stream);
         } else {
             ggml_cpy_flt_cuda<nv_bfloat16, half> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
         }
     } else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32) {
-        if (cont_shape_srcs) {
-            ggml_cpy_flt_cont_shape_cuda<nv_bfloat16, float> (src0_ddc, src1_ddc, ne, main_stream);
+        if (contiguous_srcs) {
+            ggml_cpy_flt_contiguous_cuda<nv_bfloat16, float> (src0_ddc, src1_ddc, ne, main_stream);
         } else {
             ggml_cpy_flt_cuda<nv_bfloat16, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
         }
     } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_I32) {
-        if (cont_shape_srcs) {
-            ggml_cpy_flt_cont_shape_cuda<float, int32_t> (src0_ddc, src1_ddc, ne, main_stream);
+        if (contiguous_srcs) {
+            ggml_cpy_flt_contiguous_cuda<float, int32_t> (src0_ddc, src1_ddc, ne, main_stream);
        } else {
             ggml_cpy_flt_cuda<float, int32_t> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
         }
     } else if (src0->type == GGML_TYPE_I32 && src1->type == GGML_TYPE_F32) {
-        if (cont_shape_srcs) {
-            ggml_cpy_flt_cont_shape_cuda<int32_t, float> (src0_ddc, src1_ddc, ne, main_stream);
+        if (contiguous_srcs) {
+            ggml_cpy_flt_contiguous_cuda<int32_t, float> (src0_ddc, src1_ddc, ne, main_stream);
         } else {
             ggml_cpy_flt_cuda<int32_t, float> (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);
         }
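
Illustration (not part of the patch series above): a minimal standalone sketch of the fast path these commits add, i.e. treating a contiguous src/dst pair as a flat 1-D array and converting element-wise in one kernel, skipping the strided index math of the generic copy path. The demo kernel name, the block size of 256, and the plain C-style cast are assumptions for this sketch only; inside ggml the real code uses ggml_cuda_cast and CUDA_CPY_BLOCK_SIZE.

// cpy_contiguous_sketch.cu -- hypothetical standalone demo, not ggml code.
// Converts a contiguous FP32 buffer to FP16 with a flat 1-D kernel, mirroring
// the shape-agnostic contiguous copy path added by the patches above.
// build: nvcc -o cpy_contiguous_sketch cpy_contiguous_sketch.cu
#include <cstdio>
#include <vector>
#include <cuda_fp16.h>

template <typename src_t, typename dst_t>
static __global__ void cpy_flt_contiguous_demo(const src_t * x, dst_t * dst, const int64_t ne) {
    // 64-bit flat index so very large contiguous tensors are handled correctly
    const int64_t i = (int64_t) blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= ne) {
        return;
    }
    dst[i] = (dst_t) x[i]; // plain cast stands in for ggml_cuda_cast<dst_t>
}

int main() {
    const int64_t ne = 1 << 20;           // one million contiguous elements
    std::vector<float> h_src(ne, 1.5f);

    float * d_src = nullptr;
    half  * d_dst = nullptr;
    cudaMalloc(&d_src, ne*sizeof(float));
    cudaMalloc(&d_dst, ne*sizeof(half));
    cudaMemcpy(d_src, h_src.data(), ne*sizeof(float), cudaMemcpyHostToDevice);

    const int block_size = 256;           // assumed; ggml uses CUDA_CPY_BLOCK_SIZE
    const int64_t num_blocks = (ne + block_size - 1) / block_size;
    cpy_flt_contiguous_demo<float, half><<<num_blocks, block_size>>>(d_src, d_dst, ne);

    half h_out;
    cudaMemcpy(&h_out, d_dst, sizeof(half), cudaMemcpyDeviceToHost);
    printf("dst[0] = %f\n", (float) h_out); // expect 1.500000

    cudaFree(d_src);
    cudaFree(d_dst);
    return 0;
}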