From 8d51e183b077f84b4c58e66c7d1884946932e7b0 Mon Sep 17 00:00:00 2001 From: shani-f Date: Thu, 23 Oct 2025 12:38:20 +0300 Subject: [PATCH 1/8] =?UTF-8?q?SYCL=20repeat=5Fback=20v1=20=E2=80=94=20add?= =?UTF-8?q?=20core=20op=20+=20switch=20case?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ggml/src/ggml-sycl/ggml-sycl.cpp | 14 +++++++ ggml/src/ggml-sycl/repeat_back.cpp | 62 ++++++++++++++++++++++++++++++ ggml/src/ggml-sycl/repeat_back.hpp | 9 +++++ 3 files changed, 85 insertions(+) create mode 100644 ggml/src/ggml-sycl/repeat_back.cpp create mode 100644 ggml/src/ggml-sycl/repeat_back.hpp diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index f3407a813d731..b43f37478dc36 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -44,6 +44,7 @@ #include "ggml-sycl/set_rows.hpp" #include "ggml-sycl/sycl_hw.hpp" #include "ggml-sycl/getrows.hpp" +#include "ggml-sycl/repeat_back.hpp" #include "ggml-sycl/quantize.hpp" #include "ggml.h" @@ -2597,6 +2598,10 @@ catch (sycl::exception const &exc) { std::exit(1); } +static void ggml_sycl_repeat_back(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/1); + ggml_sycl_op_repeat_back(ctx, dst); +} static void ggml_sycl_get_rows(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { scope_op_debug_print scope_dbg_print(__func__, dst, /*num_src=*/2); @@ -3616,6 +3621,9 @@ static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct gg case GGML_OP_REPEAT: ggml_sycl_repeat(ctx, dst); break; + case GGML_OP_REPEAT_BACK: + ggml_sycl_repeat_back(ctx, dst); + break; case GGML_OP_GET_ROWS: ggml_sycl_get_rows(ctx, dst); break; @@ -4405,11 +4413,17 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g } return false; } + case GGML_OP_CONCAT: { ggml_type src0_type = op->src[0]->type; return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; } + case GGML_OP_REPEAT_BACK: + { + ggml_type src0_type = op->src[0]->type; + return src0_type == GGML_TYPE_F32; + } case GGML_OP_DUP: case GGML_OP_ARGMAX: case GGML_OP_NONE: diff --git a/ggml/src/ggml-sycl/repeat_back.cpp b/ggml/src/ggml-sycl/repeat_back.cpp new file mode 100644 index 0000000000000..10fef098cdd93 --- /dev/null +++ b/ggml/src/ggml-sycl/repeat_back.cpp @@ -0,0 +1,62 @@ +//בס"ד +#include "repeat_back.hpp" + +#include "common.hpp" + +void ggml_sycl_op_repeat_back(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); + GGML_ASSERT(dst->type == GGML_TYPE_F32); + + const float * src0_dd = (const float *) dst->src[0]->data; + float * dst_dd = (float *) dst->data; + + const int64_t ne0 = dst->ne[0]; + const int64_t ne1 = dst->ne[1]; + const int64_t ne2 = dst->ne[2]; + const int64_t ne3 = dst->ne[3]; + const int64_t ne00 = dst->src[0]->ne[0]; + const int64_t ne01 = dst->src[0]->ne[1]; + const int64_t ne02 = dst->src[0]->ne[2]; + const int64_t ne03 = dst->src[0]->ne[3]; + + const int nr0 = (int) (ne00 / ne0); + const int nr1 = (int) (ne01 / ne1); + const int nr2 = (int) (ne02 / ne2); + const int nr3 = (int) (ne03 / ne3); + + const size_t total = ne0 * ne1 * ne2 * ne3; + const int BLOCK_SIZE = 256; + const int num_blocks = (total + BLOCK_SIZE - 1) / BLOCK_SIZE; + + queue_ptr stream = ctx.stream(); + stream->memset(dst_dd, 0, total * sizeof(float)); + + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks * BLOCK_SIZE), sycl::range<1>(BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + const size_t i = item_ct1.get_global_linear_id(); + if (i >= total) { + return; + } + + const int i0 = i % ne0; + const int i1 = (i / ne0) % ne1; + const int i2 = (i / (ne0 * ne1)) % ne2; + const int i3 = i / (ne0 * ne1 * ne2); + + float acc = 0.0f; + + for (int j3 = 0; j3 < nr3; ++j3) { + for (int j2 = 0; j2 < nr2; ++j2) { + for (int j1 = 0; j1 < nr1; ++j1) { + for (int j0 = 0; j0 < nr0; ++j0) { + acc += src0_dd[(i0 + j0 * ne0) + (i1 + j1 * ne1) * ne00 + (i2 + j2 * ne2) * ne00 * ne01 + + (i3 + j3 * ne3) * ne00 * ne01 * ne02]; + } + } + } + } + + dst_dd[i] = acc; + }); +} diff --git a/ggml/src/ggml-sycl/repeat_back.hpp b/ggml/src/ggml-sycl/repeat_back.hpp new file mode 100644 index 0000000000000..6783dead4fe1c --- /dev/null +++ b/ggml/src/ggml-sycl/repeat_back.hpp @@ -0,0 +1,9 @@ +//בס"ד +#ifndef GGML_SYCL_REPEAT_BACK_HPP +#define GGML_SYCL_REPEAT_BACK_HPP + +#include "common.hpp" + +void ggml_sycl_op_repeat_back(ggml_backend_sycl_context & ctx, ggml_tensor * dst); + +#endif // GGML_SYCL_REPEAT_BACK_HPP From d3e88bc5753e1caa4fc0c26697676e9a6a66d60a Mon Sep 17 00:00:00 2001 From: shani-f Date: Thu, 23 Oct 2025 12:40:43 +0300 Subject: [PATCH 2/8] Implement repeat_back SYCL operation and minor fixes --- ggml/src/ggml-sycl/repeat_back.cpp | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/ggml/src/ggml-sycl/repeat_back.cpp b/ggml/src/ggml-sycl/repeat_back.cpp index 10fef098cdd93..48ae998a27543 100644 --- a/ggml/src/ggml-sycl/repeat_back.cpp +++ b/ggml/src/ggml-sycl/repeat_back.cpp @@ -4,20 +4,16 @@ #include "common.hpp" void ggml_sycl_op_repeat_back(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { + GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); const float * src0_dd = (const float *) dst->src[0]->data; float * dst_dd = (float *) dst->data; - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - const int64_t ne2 = dst->ne[2]; - const int64_t ne3 = dst->ne[3]; - const int64_t ne00 = dst->src[0]->ne[0]; - const int64_t ne01 = dst->src[0]->ne[1]; - const int64_t ne02 = dst->src[0]->ne[2]; - const int64_t ne03 = dst->src[0]->ne[3]; + const int64_t ne0 = dst->ne[0], ne1 = dst->ne[1], ne2 = dst->ne[2], ne3 = dst->ne[3]; + const int64_t ne00 = dst->src[0]->ne[0], ne01 = dst->src[0]->ne[1], ne02 = dst->src[0]->ne[2], + ne03 = dst->src[0]->ne[3]; const int nr0 = (int) (ne00 / ne0); const int nr1 = (int) (ne01 / ne1); @@ -29,7 +25,6 @@ void ggml_sycl_op_repeat_back(ggml_backend_sycl_context & ctx, ggml_tensor * dst const int num_blocks = (total + BLOCK_SIZE - 1) / BLOCK_SIZE; queue_ptr stream = ctx.stream(); - stream->memset(dst_dd, 0, total * sizeof(float)); stream->parallel_for( sycl::nd_range<1>(sycl::range<1>(num_blocks * BLOCK_SIZE), sycl::range<1>(BLOCK_SIZE)), From f7d05e7a945ce6bfd0f214db9dbc628c68a351b3 Mon Sep 17 00:00:00 2001 From: shani-f Date: Thu, 30 Oct 2025 20:00:26 +0200 Subject: [PATCH 3/8] SYCL: optimize repeat_back kernel --- ggml/src/ggml-sycl/repeat_back.cpp | 102 ++++++++++++++++++----------- 1 file changed, 63 insertions(+), 39 deletions(-) diff --git a/ggml/src/ggml-sycl/repeat_back.cpp b/ggml/src/ggml-sycl/repeat_back.cpp index 48ae998a27543..1b99719cc0935 100644 --- a/ggml/src/ggml-sycl/repeat_back.cpp +++ b/ggml/src/ggml-sycl/repeat_back.cpp @@ -3,55 +3,79 @@ #include "common.hpp" -void ggml_sycl_op_repeat_back(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { +#define GGML_ASSERT_TENSOR_FITS_INT(t) \ + GGML_ASSERT((t)->ne[0] < INT_MAX && (t)->ne[1] < INT_MAX && (t)->ne[2] < INT_MAX && (t)->ne[3] < INT_MAX) +void ggml_sycl_op_repeat_back(ggml_backend_sycl_context & ctx, ggml_tensor * dst) { GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); const float * src0_dd = (const float *) dst->src[0]->data; float * dst_dd = (float *) dst->data; - const int64_t ne0 = dst->ne[0], ne1 = dst->ne[1], ne2 = dst->ne[2], ne3 = dst->ne[3]; - const int64_t ne00 = dst->src[0]->ne[0], ne01 = dst->src[0]->ne[1], ne02 = dst->src[0]->ne[2], - ne03 = dst->src[0]->ne[3]; + GGML_ASSERT_TENSOR_FITS_INT(dst); + GGML_ASSERT_TENSOR_FITS_INT(dst->src[0]); + + const int ne0 = dst->ne[0], ne1 = dst->ne[1], ne2 = dst->ne[2], ne3 = dst->ne[3]; + const int ne00 = dst->src[0]->ne[0], ne01 = dst->src[0]->ne[1], ne02 = dst->src[0]->ne[2], + ne03 = dst->src[0]->ne[3]; + + const int nr0 = ne00 / ne0; + const int nr1 = ne01 / ne1; + const int nr2 = ne02 / ne2; + const int nr3 = ne03 / ne3; + + const int nb0 = dst->src[0]->nb[0]; + const int nb1 = dst->src[0]->nb[1]; + const int nb2 = dst->src[0]->nb[2]; + const int nb3 = dst->src[0]->nb[3]; + + const char * base = (const char *) src0_dd; - const int nr0 = (int) (ne00 / ne0); - const int nr1 = (int) (ne01 / ne1); - const int nr2 = (int) (ne02 / ne2); - const int nr3 = (int) (ne03 / ne3); + const size_t total = (size_t) ne0 * ne1 * ne2 * ne3; + constexpr int BLOCK_SIZE = 256; + const int num_blocks = (total + BLOCK_SIZE - 1) / BLOCK_SIZE; - const size_t total = ne0 * ne1 * ne2 * ne3; - const int BLOCK_SIZE = 256; - const int num_blocks = (total + BLOCK_SIZE - 1) / BLOCK_SIZE; + // Precompute inverse sizes to replace integer divisions with multiplications + const float inv_ne0 = 1.0f / ne0; + const float inv_ne_01 = 1.0f / (ne0 * ne1); + const float inv_ne_012 = 1.0f / (ne0 * ne1 * ne2); + const int repeat_count = nr0 * nr1 * nr2 * nr3; queue_ptr stream = ctx.stream(); - stream->parallel_for( - sycl::nd_range<1>(sycl::range<1>(num_blocks * BLOCK_SIZE), sycl::range<1>(BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { - const size_t i = item_ct1.get_global_linear_id(); - if (i >= total) { - return; - } - - const int i0 = i % ne0; - const int i1 = (i / ne0) % ne1; - const int i2 = (i / (ne0 * ne1)) % ne2; - const int i3 = i / (ne0 * ne1 * ne2); - - float acc = 0.0f; - - for (int j3 = 0; j3 < nr3; ++j3) { - for (int j2 = 0; j2 < nr2; ++j2) { - for (int j1 = 0; j1 < nr1; ++j1) { - for (int j0 = 0; j0 < nr0; ++j0) { - acc += src0_dd[(i0 + j0 * ne0) + (i1 + j1 * ne1) * ne00 + (i2 + j2 * ne2) * ne00 * ne01 + - (i3 + j3 * ne3) * ne00 * ne01 * ne02]; - } - } - } - } - - dst_dd[i] = acc; - }); + stream->parallel_for(sycl::nd_range<1>(sycl::range<1>(num_blocks * BLOCK_SIZE), sycl::range<1>(BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + const size_t i = item_ct1.get_global_linear_id(); + + if (i >= total) { + return; + } + + // Compute multidimensional indices (i0,i1,i2,i3) from the flattened linear index i + const int i3 = (int) (i * inv_ne_012); + const int i2 = (int) (i * inv_ne_01) - i3 * ne2; + const int i1 = (int) (i * inv_ne0) - (int) (i * inv_ne_01) * ne1; + const int i0 = i - (int) (i * inv_ne0) * ne0; + + int j0 = 0, j1 = 0, j2 = 0, j3 = 0; + float acc = 0.0f; + + for (int j = 0; j < repeat_count; ++j) { + const float * ptr = (const float *) (base + (i0 + j0 * ne0) * nb0 + (i1 + j1 * ne1) * nb1 + + (i2 + j2 * ne2) * nb2 + (i3 + j3 * ne3) * nb3); + acc += *ptr; + + // Manual carry propagation simulates nested loops efficiently + int carry = (++j0 >= nr0); + j0 -= carry * nr0; + carry = (carry && (++j1 >= nr1)); + j1 -= carry * nr1; + carry = (carry && (++j2 >= nr2)); + j2 -= carry * nr2; + j3 += carry; + } + dst_dd[i] = acc; + }); + } From 20717ac63dff4f3f927c54ee9263a0126b973c80 Mon Sep 17 00:00:00 2001 From: shani-f Date: Thu, 30 Oct 2025 21:06:13 +0200 Subject: [PATCH 4/8] Remove Hebrew comment from repeat_back.cpp --- ggml/src/ggml-sycl/repeat_back.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml-sycl/repeat_back.cpp b/ggml/src/ggml-sycl/repeat_back.cpp index 0a7e12563fe1d..9807e85cb7fb2 100644 --- a/ggml/src/ggml-sycl/repeat_back.cpp +++ b/ggml/src/ggml-sycl/repeat_back.cpp @@ -1,4 +1,3 @@ -//בס"ד #include "repeat_back.hpp" #include "common.hpp" From f2853da020b96daaf5a6a5116e1865add03627e0 Mon Sep 17 00:00:00 2001 From: shani-f Date: Thu, 30 Oct 2025 21:09:35 +0200 Subject: [PATCH 5/8] Remove comments for code clarity Removed comments to clean up the code. --- ggml/src/ggml-sycl/repeat_back.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/ggml/src/ggml-sycl/repeat_back.cpp b/ggml/src/ggml-sycl/repeat_back.cpp index 9807e85cb7fb2..1fc1e8c8c2cef 100644 --- a/ggml/src/ggml-sycl/repeat_back.cpp +++ b/ggml/src/ggml-sycl/repeat_back.cpp @@ -35,7 +35,6 @@ void ggml_sycl_op_repeat_back(ggml_backend_sycl_context & ctx, ggml_tensor * dst constexpr int BLOCK_SIZE = 256; const int num_blocks = (total + BLOCK_SIZE - 1) / BLOCK_SIZE; - // Precompute inverse sizes to replace integer divisions with multiplications const float inv_ne0 = 1.0f / ne0; const float inv_ne_01 = 1.0f / (ne0 * ne1); const float inv_ne_012 = 1.0f / (ne0 * ne1 * ne2); @@ -51,7 +50,6 @@ void ggml_sycl_op_repeat_back(ggml_backend_sycl_context & ctx, ggml_tensor * dst return; } - // Compute multidimensional indices (i0,i1,i2,i3) from the flattened linear index i const int i3 = (int) (i * inv_ne_012); const int i2 = (int) (i * inv_ne_01) - i3 * ne2; const int i1 = (int) (i * inv_ne0) - (int) (i * inv_ne_01) * ne1; @@ -66,7 +64,6 @@ void ggml_sycl_op_repeat_back(ggml_backend_sycl_context & ctx, ggml_tensor * dst (i2 + j2 * ne2) * nb2 + (i3 + j3 * ne3) * nb3); acc += *ptr; - // Manual carry propagation simulates nested loops efficiently int carry = (++j0 >= nr0); j0 -= carry * nr0; carry = (carry && (++j1 >= nr1)); From 0efd7eb84701d433dc4ab511b617bf1bc90dbccc Mon Sep 17 00:00:00 2001 From: shani-f Date: Fri, 31 Oct 2025 16:09:08 +0200 Subject: [PATCH 6/8] Fix formatting in ggml-sycl.cpp --- ggml/src/ggml-sycl/ggml-sycl.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index d9b2878e9a922..c97c5899435b1 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -4534,7 +4534,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g } return false; } - case GGML_OP_CONCAT: { ggml_type src0_type = op->src[0]->type; From 2c8f97726d31da2bd377989302fd774167ca68de Mon Sep 17 00:00:00 2001 From: shani-f Date: Sat, 1 Nov 2025 18:12:40 +0200 Subject: [PATCH 7/8] Formatted lambda according to legacy style. No logic changes --- ggml/src/ggml-sycl/repeat_back.cpp | 64 +++++++++++++++--------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/ggml/src/ggml-sycl/repeat_back.cpp b/ggml/src/ggml-sycl/repeat_back.cpp index 1fc1e8c8c2cef..5f651d208bd46 100644 --- a/ggml/src/ggml-sycl/repeat_back.cpp +++ b/ggml/src/ggml-sycl/repeat_back.cpp @@ -42,36 +42,36 @@ void ggml_sycl_op_repeat_back(ggml_backend_sycl_context & ctx, ggml_tensor * dst queue_ptr stream = ctx.stream(); - stream->parallel_for(sycl::nd_range<1>(sycl::range<1>(num_blocks * BLOCK_SIZE), sycl::range<1>(BLOCK_SIZE)), - [=](sycl::nd_item<1> item_ct1) { - const size_t i = item_ct1.get_global_linear_id(); - - if (i >= total) { - return; - } - - const int i3 = (int) (i * inv_ne_012); - const int i2 = (int) (i * inv_ne_01) - i3 * ne2; - const int i1 = (int) (i * inv_ne0) - (int) (i * inv_ne_01) * ne1; - const int i0 = i - (int) (i * inv_ne0) * ne0; - - int j0 = 0, j1 = 0, j2 = 0, j3 = 0; - float acc = 0.0f; - - for (int j = 0; j < repeat_count; ++j) { - const float * ptr = - (const float *) (base + (i0 + j0 * ne0) * nb0 + (i1 + j1 * ne1) * nb1 + - (i2 + j2 * ne2) * nb2 + (i3 + j3 * ne3) * nb3); - acc += *ptr; - - int carry = (++j0 >= nr0); - j0 -= carry * nr0; - carry = (carry && (++j1 >= nr1)); - j1 -= carry * nr1; - carry = (carry && (++j2 >= nr2)); - j2 -= carry * nr2; - j3 += carry; - } - dst_dd[i] = acc; - }); + stream->parallel_for( + sycl::nd_range<1>(sycl::range<1>(num_blocks * BLOCK_SIZE), sycl::range<1>(BLOCK_SIZE)), + [=](sycl::nd_item<1> item_ct1) { + const size_t i = item_ct1.get_global_linear_id(); + if (i >= total) { + return; + } + + const int i3 = (int) (i * inv_ne_012); + const int i2 = (int) (i * inv_ne_01) - i3 * ne2; + const int i1 = (int) (i * inv_ne0) - (int) (i * inv_ne_01) * ne1; + const int i0 = i - (int) (i * inv_ne0) * ne0; + + int j0 = 0, j1 = 0, j2 = 0, j3 = 0; + float acc = 0.0f; + + for (int j = 0; j < repeat_count; ++j) { + const float * ptr = (const float *) (base + (i0 + j0 * ne0) * nb0 + (i1 + j1 * ne1) * nb1 + + (i2 + j2 * ne2) * nb2 + (i3 + j3 * ne3) * nb3); + acc += *ptr; + + int carry = (++j0 >= nr0); + j0 -= carry * nr0; + carry = (carry && (++j1 >= nr1)); + j1 -= carry * nr1; + carry = (carry && (++j2 >= nr2)); + j2 -= carry * nr2; + j3 += carry; + } + + dst_dd[i] = acc; + }); } From aa20ee1318b3f004d3363d0896ef9b2130611b9b Mon Sep 17 00:00:00 2001 From: shani-f Date: Sat, 1 Nov 2025 20:10:53 +0200 Subject: [PATCH 8/8] Remove blank line in repeat_back.cpp Remove unnecessary blank line before assigning acc to dst_dd. --- ggml/src/ggml-sycl/repeat_back.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/ggml/src/ggml-sycl/repeat_back.cpp b/ggml/src/ggml-sycl/repeat_back.cpp index 5f651d208bd46..845b48468c1d6 100644 --- a/ggml/src/ggml-sycl/repeat_back.cpp +++ b/ggml/src/ggml-sycl/repeat_back.cpp @@ -71,7 +71,6 @@ void ggml_sycl_op_repeat_back(ggml_backend_sycl_context & ctx, ggml_tensor * dst j2 -= carry * nr2; j3 += carry; } - dst_dd[i] = acc; }); }