From cbefca1818e75a2e49dd896eaf0c60debde13cc8 Mon Sep 17 00:00:00 2001 From: safranowith Date: Sun, 21 Sep 2025 19:49:41 +0300 Subject: [PATCH 01/15] SYCL: Add support for FLOOR,CEIL,ROUND and TRUNC unary operators Clean up unrelated changes from previous commit --- docs/ops.md | 8 +- docs/ops/SYCL.csv | 16 ++++ ggml/include/ggml.h | 37 +++++++- ggml/src/ggml-sycl/element_wise.cpp | 120 +++++++++++++++++++++++++ ggml/src/ggml-sycl/element_wise.hpp | 4 + ggml/src/ggml-sycl/ggml-sycl.cpp | 16 ++++ tests/test-backend-ops.cpp | 132 ++++++++++++++++++++++++++++ 7 files changed, 327 insertions(+), 6 deletions(-) diff --git a/docs/ops.md b/docs/ops.md index 938efac815fc0..226cd935d698a 100644 --- a/docs/ops.md +++ b/docs/ops.md @@ -22,7 +22,7 @@ Legend: | ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | | ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | | ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | -| CEIL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| CEIL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | | CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ | | CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ | | CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ | @@ -42,7 +42,7 @@ Legend: | ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ | | EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ | | FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ | -| FLOOR | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| FLOOR | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | | GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | | GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ | | GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ | @@ -84,7 +84,7 @@ Legend: | ROLL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | | ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | -| ROUND | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| ROUND | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | | RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | | RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | | SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | @@ -111,6 +111,6 @@ Legend: | TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ | | TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | TOPK_MOE | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | -| TRUNC | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| TRUNC | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | | UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | | XIELU | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | diff --git a/docs/ops/SYCL.csv b/docs/ops/SYCL.csv index d7efa43cdf3da..bc6319f51fa8c 100644 --- a/docs/ops/SYCL.csv +++ b/docs/ops/SYCL.csv @@ -31,6 +31,14 @@ "SYCL0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" "SYCL0","XIELU","type=f16,ne_a=[128,2,2,2],v=0","support","0","no","SYCL" "SYCL0","XIELU","type=f16,ne_a=[5,7,11,13],v=0","support","0","no","SYCL" +"SYCL0","FLOOR","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" +"SYCL0","FLOOR","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" +"SYCL0","CEIL","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" +"SYCL0","CEIL","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" +"SYCL0","ROUND","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" +"SYCL0","ROUND","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" +"SYCL0","TRUNC","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" +"SYCL0","TRUNC","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" "SYCL0","ABS","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL" "SYCL0","ABS","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","SYCL" "SYCL0","SGN","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","SYCL" @@ -95,6 +103,14 @@ 
"SYCL0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" "SYCL0","XIELU","type=f32,ne_a=[128,2,2,2],v=0","support","0","no","SYCL" "SYCL0","XIELU","type=f32,ne_a=[5,7,11,13],v=0","support","0","no","SYCL" +"SYCL0","FLOOR","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" +"SYCL0","FLOOR","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" +"SYCL0","CEIL","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" +"SYCL0","CEIL","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" +"SYCL0","ROUND","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" +"SYCL0","ROUND","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" +"SYCL0","TRUNC","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","SYCL" +"SYCL0","TRUNC","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","SYCL" "SYCL0","ABS","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL" "SYCL0","ABS","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","SYCL" "SYCL0","SGN","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","SYCL" diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index d948b00cc7f30..b06fa0c5c82ab 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -581,7 +581,7 @@ extern "C" { GGML_UNARY_OP_CEIL, GGML_UNARY_OP_ROUND, GGML_UNARY_OP_TRUNC, - + GGML_UNARY_OP_COUNT, }; @@ -1103,7 +1103,40 @@ extern "C" { GGML_API struct ggml_tensor * ggml_gelu_inplace( struct ggml_context * ctx, struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_floor( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_floor_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_ceil( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_ceil_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_round( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_round_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_trunc( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_trunc_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // GELU using erf (error function) when possible // some backends may fallback to approximation based on Abramowitz and Stegun formula GGML_API struct ggml_tensor * ggml_gelu_erf( @@ -2154,7 +2187,7 @@ extern "C" { int p1, int p2, int p3); - + GGML_API struct ggml_tensor * ggml_pad_ext( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/ggml/src/ggml-sycl/element_wise.cpp b/ggml/src/ggml-sycl/element_wise.cpp index 58f5125c9cf6e..810995d0cbf74 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -150,6 +150,26 @@ static __dpct_inline__ T op_clamp(T x, float min_val, float max_val) { return x < static_cast(min_val) ? static_cast(min_val) : (x > static_cast(max_val) ? 
static_cast<T>(max_val) : x);
 }
 
+template <typename T>
+static __dpct_inline__ T op_floor(T x) {
+    return sycl::floor(x);
+}
+
+template <typename T>
+static __dpct_inline__ T op_ceil(T x) {
+    return sycl::ceil(x);
+}
+
+template <typename T>
+static __dpct_inline__ T op_round(T x) {
+    return sycl::round(x);
+}
+
+template <typename T>
+static __dpct_inline__ T op_trunc(T x) {
+    return sycl::trunc(x);
+}
+
 template <typename T>
 static void unary_op_sgn_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
     SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
@@ -304,6 +324,34 @@ static void unary_op_clamp_kernel(const T * x, T * dst, const int k, const sycl:
     }
 }
 
+template <typename T>
+static void unary_op_floor_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_floor(x[i]);
+    }
+}
+
+template <typename T>
+static void unary_op_ceil_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_ceil(x[i]);
+    }
+}
+
+template <typename T>
+static void unary_op_round_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_round(x[i]);
+    }
+}
+
+template <typename T>
+static void unary_op_trunc_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_trunc(x[i]);
+    }
+}
+
 template <typename T>
 static void upscale(const T *x, T *dst, const int nb00, const int nb01,
                     const int nb02, const int nb03, const int ne10, const int ne11,
@@ -897,6 +945,58 @@ static inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tens
         }, min_val, max_val);
 }
 
+static inline void ggml_sycl_op_floor(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+            const int num_blocks = ceil_div(k_elements, 256);
+            stream->parallel_for(
+                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
+                                  sycl::range<1>(256)),
+                [=](sycl::nd_item<1> item_ct1) {
+                    unary_op_floor_kernel(src, dst_ptr, k_elements, item_ct1);
+                });
+        });
+}
+
+static inline void ggml_sycl_op_ceil(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+            const int num_blocks = ceil_div(k_elements, 256);
+            stream->parallel_for(
+                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
+                                  sycl::range<1>(256)),
+                [=](sycl::nd_item<1> item_ct1) {
+                    unary_op_ceil_kernel(src, dst_ptr, k_elements, item_ct1);
+                });
+        });
+}
+
+static inline void ggml_sycl_op_round(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+            const int num_blocks = ceil_div(k_elements, 256);
+            stream->parallel_for(
+                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
+                                  sycl::range<1>(256)),
+                [=](sycl::nd_item<1> item_ct1) {
+                    unary_op_round_kernel(src, dst_ptr, k_elements, item_ct1);
+                });
+        });
+}
+
+static inline void ggml_sycl_op_trunc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
+        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
+            const int num_blocks = ceil_div(k_elements, 256);
+            stream->parallel_for(
+                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
+                                  sycl::range<1>(256)),
+                [=](sycl::nd_item<1> item_ct1) {
+                    unary_op_trunc_kernel(src, dst_ptr, k_elements, item_ct1);
+                });
+        });
+}
+
 static inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32);
@@ -1122,3 +1222,23 @@ void ggml_sycl_arange(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/0);
     ggml_sycl_detail::ggml_sycl_op_arange(ctx, dst);
 }
+
+void ggml_sycl_floor(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
+    ggml_sycl_op_floor(ctx, dst);
+}
+
+void ggml_sycl_ceil(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
+    ggml_sycl_op_ceil(ctx, dst);
+}
+
+void ggml_sycl_round(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
+    ggml_sycl_op_round(ctx, dst);
+}
+
+void ggml_sycl_trunc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+    scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1);
+    ggml_sycl_op_trunc(ctx, dst);
+}
diff --git a/ggml/src/ggml-sycl/element_wise.hpp b/ggml/src/ggml-sycl/element_wise.hpp
index ed96c55f75a7a..fcf93295cb215 100644
--- a/ggml/src/ggml-sycl/element_wise.hpp
+++ b/ggml/src/ggml-sycl/element_wise.hpp
@@ -80,6 +80,10 @@ void ggml_sycl_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 void ggml_sycl_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 void ggml_sycl_geglu_erf(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 void ggml_sycl_geglu_quick(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+void ggml_sycl_floor(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+void ggml_sycl_ceil(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+void ggml_sycl_round(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
+void ggml_sycl_trunc(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
 void ggml_sycl_arange(ggml_backend_sycl_context & ctx, ggml_tensor * dst);
 
diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp
index a7e077ec8ebe0..1a007ffe2bca6 100644
--- a/ggml/src/ggml-sycl/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -3698,6 +3698,18 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg
         case GGML_UNARY_OP_ELU:
             ggml_sycl_elu(ctx, dst);
             break;
+        case GGML_UNARY_OP_FLOOR:
+            ggml_sycl_floor(ctx, dst);
+            break;
+        case GGML_UNARY_OP_CEIL:
+            ggml_sycl_ceil(ctx, dst);
+            break;
+        case GGML_UNARY_OP_ROUND:
+            ggml_sycl_round(ctx, dst);
+            break;
+        case GGML_UNARY_OP_TRUNC:
+            ggml_sycl_trunc(ctx, dst);
+            break;
         default:
             return false;
     }
@@ -4262,6 +4274,10 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
         case GGML_UNARY_OP_SGN:
         case GGML_UNARY_OP_ABS:
         case GGML_UNARY_OP_ELU:
+        case GGML_UNARY_OP_FLOOR:
+        case GGML_UNARY_OP_CEIL:
+        case GGML_UNARY_OP_ROUND:
+        case GGML_UNARY_OP_TRUNC:
 #if defined (GGML_SYCL_F16)
             return ggml_is_contiguous(op->src[0]) && (op->type == op->src[0]->type);
 #else
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index 82bb55ea0e184..fa98db2982ce7 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -3759,6 +3759,130 @@ struct test_clamp : public test_case {
     }
 };
 
+// GGML_OP_FLOOR
+struct test_floor : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_floor(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {10, 2, 2, 2})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_floor(ctx, a);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            init_tensor_uniform(t, -10.0f, 10.0f);
+        }
+    }
+};
+
+// GGML_OP_CEIL
+struct test_ceil : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_ceil(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {10, 2, 2, 2})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_ceil(ctx, a);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            init_tensor_uniform(t, -10.0f, 10.0f);
+        }
+    }
+};
+
+// GGML_OP_ROUND
+struct test_round : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_round(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {10, 2, 2, 2})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_round(ctx, a);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            init_tensor_uniform(t, -10.0f, 10.0f);
+        }
+    }
+};
+
+// GGML_OP_TRUNC
+struct test_trunc : public test_case {
+    const ggml_type type;
+    const std::array<int64_t, 4> ne;
+
+    std::string vars() override {
+        return VARS_TO_STR2(type, ne);
+    }
+
+    test_trunc(ggml_type type = GGML_TYPE_F32,
+            std::array<int64_t, 4> ne = {10, 2, 2, 2})
+        : type(type), ne(ne) {}
+
+    ggml_tensor * build_graph(ggml_context * ctx) override {
+        ggml_tensor * a = ggml_new_tensor(ctx, type, 4, ne.data());
+        ggml_set_param(a);
+        ggml_set_name(a, "a");
+
+        ggml_tensor * out = ggml_trunc(ctx, a);
+        ggml_set_name(out, "out");
+
+        return out;
+    }
+
+    void initialize_tensors(ggml_context * ctx) override {
+        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+            init_tensor_uniform(t, -10.0f, 10.0f);
+        }
+    }
+};
+
 // GGML_OP_DIAG_MASK_INF
 struct test_diag_mask_inf : public test_case {
     const ggml_type type;
@@ -6585,6 +6709,10 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         test_cases.emplace_back(new test_cos (type));
         test_cases.emplace_back(new test_clamp (type));
         test_cases.emplace_back(new test_leaky_relu(type));
+        test_cases.emplace_back(new test_floor (type));
+        test_cases.emplace_back(new test_ceil (type));
+        test_cases.emplace_back(new test_round (type));
+        test_cases.emplace_back(new test_trunc (type));
         test_cases.emplace_back(new test_sqr (type, {7, 1, 5, 3}));
         test_cases.emplace_back(new test_sqrt (type, {7, 1, 5, 3}));
test_cases.emplace_back(new test_log (type, {7, 1, 5, 3}));
@@ -6592,6 +6720,10 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
         test_cases.emplace_back(new test_cos (type, {7, 1, 5, 3}));
         test_cases.emplace_back(new test_clamp (type, {7, 1, 5, 3}));
         test_cases.emplace_back(new test_leaky_relu(type, {7, 1, 5, 3}));
+        test_cases.emplace_back(new test_floor (type, {7, 1, 5, 3}));
+        test_cases.emplace_back(new test_ceil (type, {7, 1, 5, 3}));
+        test_cases.emplace_back(new test_round (type, {7, 1, 5, 3}));
+        test_cases.emplace_back(new test_trunc (type, {7, 1, 5, 3}));
     }
 
     test_cases.emplace_back(new test_diag_mask_inf(GGML_TYPE_F32, {10, 10, 1, 1}, 5));

From c4ce488e498ecf65e9f82235217e24d4e106070f Mon Sep 17 00:00:00 2001
From: safranowith
Date: Thu, 16 Oct 2025 15:35:49 +0300
Subject: [PATCH 02/15] Chore: remove empty lines and fix indentation

---
 ggml/include/ggml.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index b06fa0c5c82ab..f34d20f0cd9e2 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -581,7 +581,7 @@ extern "C" {
         GGML_UNARY_OP_CEIL,
         GGML_UNARY_OP_ROUND,
         GGML_UNARY_OP_TRUNC,
-
+
         GGML_UNARY_OP_COUNT,
     };
@@ -2187,10 +2187,11 @@ extern "C" {
             int p1,
             int p2,
             int p3);
-
+
     GGML_API struct ggml_tensor * ggml_pad_ext(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
+            int lp0,
             int rp0,
             int lp1,

From a97a0e0df59057e9ad726d6a6253de907aa6c9be Mon Sep 17 00:00:00 2001
From: safranowith
Date: Thu, 16 Oct 2025 15:45:29 +0300
Subject: [PATCH 03/15] Clean up: remove leftover blank lines and fix spacing

---
 ggml/include/ggml.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index f34d20f0cd9e2..9fdc3d5a997af 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2187,11 +2187,10 @@ extern "C" {
             int p1,
             int p2,
             int p3);
-
+
     GGML_API struct ggml_tensor * ggml_pad_ext(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
-            int lp0,
             int rp0,
             int lp1,

From 7d2a08d429f31c3b3f83a612fea9bc2eaa31d672 Mon Sep 17 00:00:00 2001
From: safranowith
Date: Thu, 16 Oct 2025 16:33:34 +0300
Subject: [PATCH 04/15] chore: fix trailing whitespace and ensure final newline

---
 ggml/include/ggml.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 9fdc3d5a997af..33e3ff385d0ef 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -1103,7 +1103,7 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_gelu_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
-
+
     GGML_API struct ggml_tensor * ggml_floor(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
@@ -1130,13 +1130,13 @@
 
     GGML_API struct ggml_tensor * ggml_trunc(
             struct ggml_context * ctx,
-            struct ggml_tensor * a);
+            struct ggml_tensor * a);
 
     GGML_API struct ggml_tensor * ggml_trunc_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor * a);
-
+
     // GELU using erf (error function) when possible
     // some backends may fallback to approximation based on Abramowitz and Stegun formula
     GGML_API struct ggml_tensor * ggml_gelu_erf(

From 3489bbe8b71c5fcaf725c9819c2a4c703be0dcd6 Mon Sep 17 00:00:00 2001
From: safranowith
Date: Sun, 19 Oct 2025 12:18:23 +0300
Subject: [PATCH 05/15] Cleanup: remove redundant declarations already defined
 in header

---
 ggml/include/ggml.h | 32 --------------------------------
 1 file changed, 32 deletions(-)

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 33e3ff385d0ef..b4015a6dd1ce1 100644
---
a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -1104,38 +1104,6 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); - GGML_API struct ggml_tensor * ggml_floor( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_floor_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_ceil( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_ceil_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_round( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_round_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_trunc( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_trunc_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - // GELU using erf (error function) when possible // some backends may fallback to approximation based on Abramowitz and Stegun formula From d85313eac99047e326d908f8415c9cb27294e93d Mon Sep 17 00:00:00 2001 From: safranowith Date: Sun, 19 Oct 2025 12:36:23 +0300 Subject: [PATCH 06/15] Sync docs/ops.md with updated backend operation support --- ggml/include/ggml.h | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index b4015a6dd1ce1..d948b00cc7f30 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -1104,7 +1104,6 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); - // GELU using erf (error function) when possible // some backends may fallback to approximation based on Abramowitz and Stegun formula GGML_API struct ggml_tensor * ggml_gelu_erf( From afc71a4cccd830c4be4e7a191ccf9a6195609fb6 Mon Sep 17 00:00:00 2001 From: safranowith Date: Sun, 19 Oct 2025 12:52:55 +0300 Subject: [PATCH 07/15] docs: update ops.md after rebase --- docs/ops.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/ops.md b/docs/ops.md index 226cd935d698a..89861b8c2f28b 100644 --- a/docs/ops.md +++ b/docs/ops.md @@ -100,8 +100,8 @@ Legend: | SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | | SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | | SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | ❌ | ❌ | -| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | -| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | +| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | +| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | | STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ | | SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | | SUM | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | From 47aa3a47980c2e0d5b2fee6e93eedaeeffaed9e5 Mon Sep 17 00:00:00 2001 From: safranowith Date: Sun, 19 Oct 2025 13:39:46 +0300 Subject: [PATCH 08/15] docs: update ops.md - Vulkan supports SSM_CONV and SSM_SCAN --- docs/ops.md | 4 ++-- docs/ops/Vulkan.csv | 42 +++++++++++++++++++++--------------------- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/docs/ops.md b/docs/ops.md index 89861b8c2f28b..226cd935d698a 100644 --- a/docs/ops.md +++ b/docs/ops.md @@ -100,8 +100,8 @@ Legend: | SOFT_MAX_BACK | ❌ | ❌ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ✅ | ❌ | | SQR | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | 🟡 | ❌ | | SQRT | ❌ | ✅ | ✅ | ✅ | 🟡 | ❌ | ✅ | ❌ | ❌ | -| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | -| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ | +| SSM_CONV | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | +| SSM_SCAN | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | | STEP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 
| ❌ | ❌ | | SUB | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | ✅ | ❌ | | SUM | ❌ | ✅ | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ❌ | diff --git a/docs/ops/Vulkan.csv b/docs/ops/Vulkan.csv index ea252577280d5..298c2a6ccd5fc 100644 --- a/docs/ops/Vulkan.csv +++ b/docs/ops/Vulkan.csv @@ -3263,27 +3263,27 @@ "Vulkan0","RMS_NORM_MUL_ADD","type=f32,ne=[64,5,4,3],eps=1.000000,broadcast=0","support","1","yes","Vulkan" "Vulkan0","RMS_NORM_MUL_ADD","type=f32,ne=[64,5,4,3],eps=1.000000,broadcast=1","support","1","yes","Vulkan" "Vulkan0","L2_NORM","type=f32,ne=[64,5,4,3]","support","1","yes","Vulkan" -"Vulkan0","SSM_CONV","type=f32,ne_a=[4,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","Vulkan" -"Vulkan0","SSM_CONV","type=f32,ne_a=[8,1024,1,1],ne_b=[3,1024,1,1]","support","0","no","Vulkan" -"Vulkan0","SSM_CONV","type=f32,ne_a=[4,1024,4,1],ne_b=[3,1024,1,1]","support","0","no","Vulkan" -"Vulkan0","SSM_CONV","type=f32,ne_a=[4,1536,1,1],ne_b=[3,1536,1,1]","support","0","no","Vulkan" -"Vulkan0","SSM_CONV","type=f32,ne_a=[8,1536,1,1],ne_b=[3,1536,1,1]","support","0","no","Vulkan" -"Vulkan0","SSM_CONV","type=f32,ne_a=[4,1536,4,1],ne_b=[3,1536,1,1]","support","0","no","Vulkan" -"Vulkan0","SSM_CONV","type=f32,ne_a=[4,2048,1,1],ne_b=[3,2048,1,1]","support","0","no","Vulkan" -"Vulkan0","SSM_CONV","type=f32,ne_a=[8,2048,1,1],ne_b=[3,2048,1,1]","support","0","no","Vulkan" -"Vulkan0","SSM_CONV","type=f32,ne_a=[4,2048,4,1],ne_b=[3,2048,1,1]","support","0","no","Vulkan" -"Vulkan0","SSM_CONV","type=f32,ne_a=[4,1024,1,1],ne_b=[4,1024,1,1]","support","0","no","Vulkan" -"Vulkan0","SSM_CONV","type=f32,ne_a=[8,1024,1,1],ne_b=[4,1024,1,1]","support","0","no","Vulkan" -"Vulkan0","SSM_CONV","type=f32,ne_a=[4,1024,4,1],ne_b=[4,1024,1,1]","support","0","no","Vulkan" -"Vulkan0","SSM_CONV","type=f32,ne_a=[4,1536,1,1],ne_b=[4,1536,1,1]","support","0","no","Vulkan" -"Vulkan0","SSM_CONV","type=f32,ne_a=[8,1536,1,1],ne_b=[4,1536,1,1]","support","0","no","Vulkan" -"Vulkan0","SSM_CONV","type=f32,ne_a=[4,1536,4,1],ne_b=[4,1536,1,1]","support","0","no","Vulkan" -"Vulkan0","SSM_CONV","type=f32,ne_a=[4,2048,1,1],ne_b=[4,2048,1,1]","support","0","no","Vulkan" -"Vulkan0","SSM_CONV","type=f32,ne_a=[8,2048,1,1],ne_b=[4,2048,1,1]","support","0","no","Vulkan" -"Vulkan0","SSM_CONV","type=f32,ne_a=[4,2048,4,1],ne_b=[4,2048,1,1]","support","0","no","Vulkan" -"Vulkan0","SSM_SCAN","type=f32,d_state=16,head_dim=1,n_head=1024,n_group=1,n_seq_tokens=32,n_seqs=4","support","0","no","Vulkan" -"Vulkan0","SSM_SCAN","type=f32,d_state=128,head_dim=64,n_head=16,n_group=2,n_seq_tokens=32,n_seqs=4","support","0","no","Vulkan" -"Vulkan0","SSM_SCAN","type=f32,d_state=256,head_dim=64,n_head=8,n_group=2,n_seq_tokens=32,n_seqs=4","support","0","no","Vulkan" +"Vulkan0","SSM_CONV","type=f32,ne_a=[4,1024,1,1],ne_b=[3,1024,1,1]","support","1","yes","Vulkan" +"Vulkan0","SSM_CONV","type=f32,ne_a=[8,1024,1,1],ne_b=[3,1024,1,1]","support","1","yes","Vulkan" +"Vulkan0","SSM_CONV","type=f32,ne_a=[4,1024,4,1],ne_b=[3,1024,1,1]","support","1","yes","Vulkan" +"Vulkan0","SSM_CONV","type=f32,ne_a=[4,1536,1,1],ne_b=[3,1536,1,1]","support","1","yes","Vulkan" +"Vulkan0","SSM_CONV","type=f32,ne_a=[8,1536,1,1],ne_b=[3,1536,1,1]","support","1","yes","Vulkan" +"Vulkan0","SSM_CONV","type=f32,ne_a=[4,1536,4,1],ne_b=[3,1536,1,1]","support","1","yes","Vulkan" +"Vulkan0","SSM_CONV","type=f32,ne_a=[4,2048,1,1],ne_b=[3,2048,1,1]","support","1","yes","Vulkan" +"Vulkan0","SSM_CONV","type=f32,ne_a=[8,2048,1,1],ne_b=[3,2048,1,1]","support","1","yes","Vulkan" 
+"Vulkan0","SSM_CONV","type=f32,ne_a=[4,2048,4,1],ne_b=[3,2048,1,1]","support","1","yes","Vulkan" +"Vulkan0","SSM_CONV","type=f32,ne_a=[4,1024,1,1],ne_b=[4,1024,1,1]","support","1","yes","Vulkan" +"Vulkan0","SSM_CONV","type=f32,ne_a=[8,1024,1,1],ne_b=[4,1024,1,1]","support","1","yes","Vulkan" +"Vulkan0","SSM_CONV","type=f32,ne_a=[4,1024,4,1],ne_b=[4,1024,1,1]","support","1","yes","Vulkan" +"Vulkan0","SSM_CONV","type=f32,ne_a=[4,1536,1,1],ne_b=[4,1536,1,1]","support","1","yes","Vulkan" +"Vulkan0","SSM_CONV","type=f32,ne_a=[8,1536,1,1],ne_b=[4,1536,1,1]","support","1","yes","Vulkan" +"Vulkan0","SSM_CONV","type=f32,ne_a=[4,1536,4,1],ne_b=[4,1536,1,1]","support","1","yes","Vulkan" +"Vulkan0","SSM_CONV","type=f32,ne_a=[4,2048,1,1],ne_b=[4,2048,1,1]","support","1","yes","Vulkan" +"Vulkan0","SSM_CONV","type=f32,ne_a=[8,2048,1,1],ne_b=[4,2048,1,1]","support","1","yes","Vulkan" +"Vulkan0","SSM_CONV","type=f32,ne_a=[4,2048,4,1],ne_b=[4,2048,1,1]","support","1","yes","Vulkan" +"Vulkan0","SSM_SCAN","type=f32,d_state=16,head_dim=1,n_head=1024,n_group=1,n_seq_tokens=32,n_seqs=4","support","1","yes","Vulkan" +"Vulkan0","SSM_SCAN","type=f32,d_state=128,head_dim=64,n_head=16,n_group=2,n_seq_tokens=32,n_seqs=4","support","1","yes","Vulkan" +"Vulkan0","SSM_SCAN","type=f32,d_state=256,head_dim=64,n_head=8,n_group=2,n_seq_tokens=32,n_seqs=4","support","1","yes","Vulkan" "Vulkan0","RWKV_WKV6","type=f32,head_count=32,head_size=64,n_seq_tokens=1,n_seqs=1","support","1","yes","Vulkan" "Vulkan0","RWKV_WKV6","type=f32,head_count=32,head_size=64,n_seq_tokens=32,n_seqs=1","support","1","yes","Vulkan" "Vulkan0","RWKV_WKV6","type=f32,head_count=32,head_size=64,n_seq_tokens=32,n_seqs=4","support","1","yes","Vulkan" From c6291c0e7dfec6b324789bfd70623c9121031b83 Mon Sep 17 00:00:00 2001 From: safranowith Date: Thu, 16 Oct 2025 17:59:36 +0300 Subject: [PATCH 09/15] SYCL: add support for ROUND, FLOOR, CEIL, TRUNC ops --- ggml/include/ggml.h | 1 + ggml/src/ggml-cuda/ggml-cuda.cu | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index d948b00cc7f30..8ce0d7a6fbc29 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -1943,6 +1943,7 @@ extern "C" { int d1); // dilation dimension 1 GGML_API struct ggml_tensor * ggml_im2col_3d( +<<<<<<< HEAD struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 75fd6db14c514..723bb4eab4862 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -3357,6 +3357,10 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g case GGML_UNARY_OP_TANH: case GGML_UNARY_OP_EXP: case GGML_UNARY_OP_ELU: + case GGML_UNARY_OP_FLOOR: + case GGML_UNARY_OP_CEIL: + case GGML_UNARY_OP_ROUND: + case GGML_UNARY_OP_TRUNC: return ggml_is_contiguous(op->src[0]); default: return false; From 022c43261a9afc142abc10bcd2df2222df87e688 Mon Sep 17 00:00:00 2001 From: safranowith Date: Thu, 25 Sep 2025 22:49:06 +0300 Subject: [PATCH 10/15] change llama.cpp/ggml/src/ggml-cuda$ code ggml-cuda.cu --- ggml/src/ggml-cuda/ggml-cuda.cu | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index 723bb4eab4862..0d22935b462b0 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -2345,6 +2345,17 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg break; case GGML_UNARY_OP_XIELU: 
            ggml_cuda_op_xielu(ctx, dst);
+            break;
+        case GGML_UNARY_OP_FLOOR:
+            ggml_cuda_op_floor(ctx, dst);
+            break;
+        case GGML_UNARY_OP_CEIL:
+            ggml_cuda_op_ceil(ctx, dst);
+            break;
+        case GGML_UNARY_OP_ROUND:
+            ggml_cuda_op_round(ctx, dst);
+            break;
+        case GGML_UNARY_OP_TRUNC:
+            ggml_cuda_op_trunc(ctx, dst);
             break;
         default:
             return false;

From 840cebc3efc8a239a9863bd183fe2bb172ee800f Mon Sep 17 00:00:00 2001
From: safranowith
Date: Sun, 28 Sep 2025 18:09:48 +0300
Subject: [PATCH 11/15] Add CUDA backend support for floor, ceil, round, trunc
 unary ops

---
 docs/ops/CUDA.csv            | 32 ++++++++++++++++++++++++++++++++
 ggml/src/ggml-cuda/unary.cu  | 33 +++++++++++++++++++++++++++++++++
 ggml/src/ggml-cuda/unary.cuh |  8 ++++++++
 ggml/src/ggml.c              |  9 +++++++++
 4 files changed, 82 insertions(+)

diff --git a/docs/ops/CUDA.csv b/docs/ops/CUDA.csv
index 71e47977e31d1..48d0e4ac664df 100644
--- a/docs/ops/CUDA.csv
+++ b/docs/ops/CUDA.csv
@@ -29,6 +29,14 @@
"CUDA0","EXP","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","CUDA"
"CUDA0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","CUDA"
"CUDA0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","CUDA"
+"CUDA0","FLOOR","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","CUDA"
+"CUDA0","FLOOR","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","CUDA"
+"CUDA0","CEIL","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","CUDA"
+"CUDA0","CEIL","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","CUDA"
+"CUDA0","ROUND","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","CUDA"
+"CUDA0","ROUND","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","CUDA"
+"CUDA0","TRUNC","type=f16,ne_a=[128,2,2,2],v=0","support","1","yes","CUDA"
+"CUDA0","TRUNC","type=f16,ne_a=[5,7,11,13],v=0","support","1","yes","CUDA"
"CUDA0","ABS","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","CUDA"
"CUDA0","ABS","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","CUDA"
"CUDA0","SGN","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","CUDA"
@@ -59,6 +67,14 @@
"CUDA0","EXP","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","CUDA"
"CUDA0","GELU_ERF","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","CUDA"
"CUDA0","GELU_ERF","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","CUDA"
+"CUDA0","FLOOR","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","CUDA"
+"CUDA0","FLOOR","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","CUDA"
+"CUDA0","CEIL","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","CUDA"
+"CUDA0","CEIL","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","CUDA"
+"CUDA0","ROUND","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","CUDA"
+"CUDA0","ROUND","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","CUDA"
+"CUDA0","TRUNC","type=f16,ne_a=[128,2,2,2],v=1","support","0","no","CUDA"
+"CUDA0","TRUNC","type=f16,ne_a=[5,7,11,13],v=1","support","0","no","CUDA"
"CUDA0","ABS","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CUDA"
"CUDA0","ABS","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","CUDA"
"CUDA0","SGN","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CUDA"
@@ -89,6 +105,14 @@
"CUDA0","EXP","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","CUDA"
"CUDA0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CUDA"
"CUDA0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","CUDA"
+"CUDA0","FLOOR","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CUDA"
+"CUDA0","FLOOR","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","CUDA"
+"CUDA0","CEIL","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CUDA"
+"CUDA0","CEIL","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","CUDA" +"CUDA0","ROUND","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CUDA" +"CUDA0","ROUND","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","CUDA" +"CUDA0","TRUNC","type=f32,ne_a=[128,2,2,2],v=0","support","1","yes","CUDA" +"CUDA0","TRUNC","type=f32,ne_a=[5,7,11,13],v=0","support","1","yes","CUDA" "CUDA0","ABS","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","CUDA" "CUDA0","ABS","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","CUDA" "CUDA0","SGN","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","CUDA" @@ -118,6 +142,14 @@ "CUDA0","EXP","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","CUDA" "CUDA0","EXP","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","CUDA" "CUDA0","GELU_ERF","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","CUDA" +"CUDA0","FLOOR","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","CUDA" +"CUDA0","FLOOR","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","CUDA" +"CUDA0","CEIL","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","CUDA" +"CUDA0","CEIL","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","CUDA" +"CUDA0","ROUND","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","CUDA" +"CUDA0","ROUND","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","CUDA" +"CUDA0","TRUNC","type=f32,ne_a=[128,2,2,2],v=1","support","0","no","CUDA" +"CUDA0","TRUNC","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","CUDA" "CUDA0","GELU_ERF","type=f32,ne_a=[5,7,11,13],v=1","support","0","no","CUDA" "CUDA0","REGLU","type=f16,ne_a=[128,2,2,2],v=0,swapped=0","support","1","yes","CUDA" "CUDA0","REGLU","type=f16,ne_a=[5,7,11,13],v=0,swapped=0","support","1","yes","CUDA" diff --git a/ggml/src/ggml-cuda/unary.cu b/ggml/src/ggml-cuda/unary.cu index 3c564566a51ff..6b5fe1ff6cea2 100644 --- a/ggml/src/ggml-cuda/unary.cu +++ b/ggml/src/ggml-cuda/unary.cu @@ -88,6 +88,22 @@ static __device__ __forceinline__ float op_elu(float x) { return (x > 0.f) ? 
x : expm1f(x);
 }
 
+static __device__ __forceinline__ float op_floor(float x) {
+    return floorf(x);
+}
+
+static __device__ __forceinline__ float op_ceil(float x) {
+    return ceilf(x);
+}
+
+static __device__ __forceinline__ float op_round(float x) {
+    return roundf(x);
+}
+
+static __device__ __forceinline__ float op_trunc(float x) {
+    return truncf(x);
+}
+
 template <float (*op)(float), typename T>
 static __global__ void unary_op_kernel(const T * x, T * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
@@ -204,6 +220,23 @@ void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
 void ggml_cuda_op_elu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     ggml_cuda_op_unary<op_elu>(ctx, dst);
 }
+
+void ggml_cuda_op_floor(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_floor>(ctx, dst);
+}
+
+void ggml_cuda_op_ceil(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_ceil>(ctx, dst);
+}
+
+void ggml_cuda_op_round(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_round>(ctx, dst);
+}
+
+void ggml_cuda_op_trunc(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    ggml_cuda_op_unary<op_trunc>(ctx, dst);
+}
+
 /* gated ops */
 
 template <float (*op)(float), typename T>
diff --git a/ggml/src/ggml-cuda/unary.cuh b/ggml/src/ggml-cuda/unary.cuh
index 8e7644fcd9a48..8f28f89ab7916 100644
--- a/ggml/src/ggml-cuda/unary.cuh
+++ b/ggml/src/ggml-cuda/unary.cuh
@@ -75,3 +75,11 @@ void ggml_cuda_op_geglu_erf(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_geglu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
 void ggml_cuda_op_xielu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_floor(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_ceil(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_round(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_trunc(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
\ No newline at end of file
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 86f1c31afd7a6..fd778ff82887c 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -6454,6 +6454,15 @@ static void ggml_compute_backward(
                     ggml_add_or_set(ctx, cgraph, isrc0, ggml_mul(ctx, tensor, grad));
                 }
             } break;
+            case GGML_UNARY_OP_FLOOR:
+            case GGML_UNARY_OP_CEIL:
+            case GGML_UNARY_OP_ROUND:
+            case GGML_UNARY_OP_TRUNC: {
+                if (src0_needs_grads) {
+                    ggml_add_or_set(ctx, cgraph, isrc0, ggml_repeat(ctx, ggml_new_f32(ctx, 0.0f), src0));
+                }
+            } break;
+
             default: {
                 fprintf(stderr, "%s: unsupported unary op for backward pass: %s\n",
                         __func__, ggml_unary_op_name(ggml_get_unary_op(tensor)));

From 60bdbfb8deb48347539c09350249495ac64328c1 Mon Sep 17 00:00:00 2001
From: safranowith
Date: Sun, 28 Sep 2025 18:12:31 +0300
Subject: [PATCH 12/15] docs: mark CUDA support for FLOOR, CEIL, ROUND and
 TRUNC in ops.md

---
 docs/ops.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/ops.md b/docs/ops.md
index 226cd935d698a..6fc976ac7b452 100644
--- a/docs/ops.md
+++ b/docs/ops.md
@@ -22,7 +22,7 @@ Legend:
 | ARANGE | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❌ |
 | ARGMAX | ❌ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ |
 | ARGSORT | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ |
-| CEIL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ |
+| CEIL | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ✅ | ❌ | ❌ |
 | CLAMP | ❌ | ✅ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | ❌ |
 | CONCAT | ❌ | ✅ | ✅ | 🟡 | ✅ | 🟡 | 🟡 | ✅ | ❌ |
 | CONT | ❌ | 🟡 | ✅ | ✅ | ✅ | 🟡 | 🟡 | 🟡 | ❌ |
@@ -42,7 +42,7 @@ Legend:
 | ELU | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
 | EXP | ❌ | ✅ | ✅ | 🟡 | 🟡 | ❌ | 🟡 | ❌ | ❌ |
 | FLASH_ATTN_EXT | ❌ | 🟡 | ✅ | 🟡 | 🟡 | ❌ | ❌ | 🟡 | ❌ |
-|
FLOOR | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | +| FLOOR | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ✅ | ❌ | ❌ | | GATED_LINEAR_ATTN | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ✅ | ❌ | ❌ | | GEGLU | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ | | GEGLU_ERF | ❌ | ✅ | ✅ | ✅ | 🟡 | ✅ | ✅ | 🟡 | ❌ | @@ -84,7 +84,7 @@ Legend: | ROLL | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | | ROPE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | ROPE_BACK | ❌ | ❌ | ✅ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | -| ROUND | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | +| ROUND | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ✅ | ❌ | ❌ | | RWKV_WKV6 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | | RWKV_WKV7 | ❌ | ❌ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ❌ | | SCALE | ❌ | 🟡 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | @@ -111,6 +111,6 @@ Legend: | TANH | ❌ | ✅ | ✅ | 🟡 | 🟡 | ✅ | 🟡 | 🟡 | ❌ | | TIMESTEP_EMBEDDING | ❌ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | | TOPK_MOE | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | -| TRUNC | ❌ | ❌ | ✅ | ❌ | ❌ | ❌ | ✅ | ❌ | ❌ | +| TRUNC | ❌ | ❌ | ✅ | 🟡 | ❌ | ❌ | ✅ | ❌ | ❌ | | UPSCALE | ❌ | 🟡 | ✅ | ✅ | 🟡 | ✅ | 🟡 | ✅ | ❌ | | XIELU | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | From 0767a17ab0278bb77b90c2bcf8de405cdd751926 Mon Sep 17 00:00:00 2001 From: safranowith Date: Thu, 16 Oct 2025 17:55:40 +0300 Subject: [PATCH 13/15] fix a bag --- ggml/include/ggml.h | 75 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 8ce0d7a6fbc29..caa1647a78253 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -1104,6 +1104,38 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_floor( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_floor_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_ceil( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_ceil_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_round( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_round_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_trunc( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_trunc_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + // GELU using erf (error function) when possible // some backends may fallback to approximation based on Abramowitz and Stegun formula GGML_API struct ggml_tensor * ggml_gelu_erf( @@ -1943,6 +1975,7 @@ extern "C" { int d1); // dilation dimension 1 GGML_API struct ggml_tensor * ggml_im2col_3d( +<<<<<<< HEAD <<<<<<< HEAD struct ggml_context * ctx, struct ggml_tensor * a, @@ -1960,6 +1993,24 @@ extern "C" { enum ggml_type dst_type); // a: [OC*IC, KD, KH, KW] +======= + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int64_t IC, + int s0, // stride width + int s1, // stride height + int s2, // stride depth + int p0, // padding width + int p1, // padding height + int p2, // padding depth + int d0, // dilation width + int d1, // dilation height + int d2, // dilation depth + enum ggml_type dst_type); + + // a: [OC*IC, KD, KH, KW] +>>>>>>> 1b66dc4a6 (fix a bag) // b: [N*IC, ID, IH, IW] // result: [N*OC, OD, OH, OW] GGML_API struct ggml_tensor * ggml_conv_3d( @@ -1967,6 +2018,7 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b, int64_t IC, +<<<<<<< HEAD int s0, // stride width int s1, // stride height int s2, // stride depth @@ 
-1976,6 +2028,17 @@ extern "C" { int d0, // dilation width int d1, // dilation height int d2 // dilation depth +======= + int s0, // stride width + int s1, // stride height + int s2, // stride depth + int p0, // padding width + int p1, // padding height + int p2, // padding depth + int d0, // dilation width + int d1, // dilation height + int d2 // dilation depth +>>>>>>> 1b66dc4a6 (fix a bag) ); // kernel size is a->ne[0] x a->ne[1] @@ -2155,6 +2218,18 @@ extern "C" { int p1, int p2, int p3); + GGML_API struct ggml_tensor * ggml_pad_ext( + struct ggml_context * ctx, + struct ggml_tensor * a, + int lp0, + int rp0, + int lp1, + int rp1, + int lp2, + int rp2, + int lp3, + int rp3 + ); GGML_API struct ggml_tensor * ggml_pad_ext( struct ggml_context * ctx, From 0dc6d82c1f93ce5e9c0c2e59516cb691539ed43a Mon Sep 17 00:00:00 2001 From: safranowith Date: Thu, 16 Oct 2025 17:59:36 +0300 Subject: [PATCH 14/15] CUDA: Add support for FLOOR,CEIL,ROUND and TRUNC unary operators --- ggml/include/ggml.h | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index caa1647a78253..3944ff84744e7 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -1975,8 +1975,6 @@ extern "C" { int d1); // dilation dimension 1 GGML_API struct ggml_tensor * ggml_im2col_3d( -<<<<<<< HEAD -<<<<<<< HEAD struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, @@ -1993,24 +1991,6 @@ extern "C" { enum ggml_type dst_type); // a: [OC*IC, KD, KH, KW] -======= - struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - int64_t IC, - int s0, // stride width - int s1, // stride height - int s2, // stride depth - int p0, // padding width - int p1, // padding height - int p2, // padding depth - int d0, // dilation width - int d1, // dilation height - int d2, // dilation depth - enum ggml_type dst_type); - - // a: [OC*IC, KD, KH, KW] ->>>>>>> 1b66dc4a6 (fix a bag) // b: [N*IC, ID, IH, IW] // result: [N*OC, OD, OH, OW] GGML_API struct ggml_tensor * ggml_conv_3d( @@ -2018,7 +1998,6 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b, int64_t IC, -<<<<<<< HEAD int s0, // stride width int s1, // stride height int s2, // stride depth @@ -2028,17 +2007,6 @@ extern "C" { int d0, // dilation width int d1, // dilation height int d2 // dilation depth -======= - int s0, // stride width - int s1, // stride height - int s2, // stride depth - int p0, // padding width - int p1, // padding height - int p2, // padding depth - int d0, // dilation width - int d1, // dilation height - int d2 // dilation depth ->>>>>>> 1b66dc4a6 (fix a bag) ); // kernel size is a->ne[0] x a->ne[1] From fe5ae2cede18d7d53f6363e5306cd362c557b01d Mon Sep 17 00:00:00 2001 From: safranowith Date: Mon, 20 Oct 2025 11:08:32 +0300 Subject: [PATCH 15/15] SYCL: Add support for FLOOR,CEIL,ROUND and TRUNC unary operators (#16613) * SYCL: Add support for FLOOR,CEIL,ROUND and TRUNC unary operators Clean up unrelated changes from previous commit * Chore: remove empty lines and fix indentation * Clean up: remove leftover blank lines and fix spacing * chore: fix trailing whitespace and ensure final newline * Cleanup: remove redundant declarations already defined in header * Sync docs/ops.md with updated backend operation support * docs: update ops.md after rebase * docs: update ops.md - Vulkan supports SSM_CONV and SSM_SCAN --- ggml/src/ggml-sycl/element_wise.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/element_wise.cpp 
b/ggml/src/ggml-sycl/element_wise.cpp index 810995d0cbf74..36d681195017d 100644 --- a/ggml/src/ggml-sycl/element_wise.cpp +++ b/ggml/src/ggml-sycl/element_wise.cpp @@ -1241,4 +1241,4 @@ void ggml_sycl_round(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { void ggml_sycl_trunc(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); ggml_sycl_op_trunc(ctx, dst); -} +} \ No newline at end of file
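
For a quick end-to-end check of what the series exposes, here is a minimal usage sketch of the new public API declared in PATCH 01/15. It is illustrative only, not part of the patches or of the PR's test suite: the context size, tensor shape, and input values are arbitrary choices, the graph is computed on the CPU path here (the SYCL and CUDA backends pick these ops up through the dispatch added above), and depending on the tree ggml_graph_compute_with_ctx is declared in ggml.h or in ggml-cpu.h.

#include "ggml.h"
#include "ggml-cpu.h"   // ggml_graph_compute_with_ctx lives here in recent trees
#include <stdio.h>

int main(void) {
    // small scratch context; 16 MiB is an arbitrary size for this sketch
    struct ggml_init_params params = { 16*1024*1024, NULL, false };
    struct ggml_context * ctx = ggml_init(params);

    // contiguous f32 input -- the SYCL/CUDA kernels above require contiguous tensors
    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);

    // ggml_ceil, ggml_round and ggml_trunc have the same signature
    struct ggml_tensor * out = ggml_floor(ctx, a);

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, out);

    float * in = (float *) a->data;
    in[0] = -1.7f; in[1] = -0.5f; in[2] = 0.5f; in[3] = 1.7f;

    ggml_graph_compute_with_ctx(ctx, gf, 1);

    float * res = (float *) out->data;
    for (int i = 0; i < 4; ++i) {
        printf("floor(%.1f) = %.1f\n", in[i], res[i]);   // expect -2.0, -1.0, 0.0, 1.0
    }

    ggml_free(ctx);
    return 0;
}

The same graph can be scheduled onto the SYCL or CUDA backend through the ggml-backend API; ggml_backend_sycl_device_supports_op and ggml_backend_cuda_device_supports_op in the patches above gate this on contiguous inputs (and, for SYCL with GGML_SYCL_F16, matching input/output types).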