SYCL: using graphs is configurable by environment variable and compile option #12371
Changes from all commits: 2896987, 4efab98, d02a0d2, 89ac171, d75f29a, 55a49ba, 1693dde
@@ -46,6 +46,7 @@
 static bool g_sycl_loaded = false;
 int g_ggml_sycl_debug = 0;
 int g_ggml_sycl_disable_optimize = 0;
+int g_ggml_sycl_disable_graph = 0;

 static ggml_sycl_device_info ggml_sycl_init() {
     ggml_sycl_device_info info = {};

@@ -95,7 +96,7 @@ const ggml_sycl_device_info & ggml_sycl_info() {
     return info;
 }

-void print_device_detail(int id, sycl::device &device, std::string device_type) {
+static void print_device_detail(int id, sycl::device &device, std::string device_type) {

     dpct::device_info prop;
     SYCL_CHECK(CHECK_TRY_ERROR(
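Most of the hunks above and below only add a `static` qualifier to file-local helpers. A minimal illustration of the effect, using a hypothetical function that is not part of the diff: with internal linkage the helper is visible only inside its own translation unit, so it cannot clash with same-named symbols elsewhere and needs no declaration in a header.

```cpp
#include <cstdio>

// Hypothetical stand-in for a file-local helper. Without 'static' this
// definition has external linkage and could collide with a symbol of the
// same name in another translation unit; with 'static' it is private to
// this .cpp file.
static void print_detail(int id) {
    std::printf("device %d\n", id);
}

int main() {
    print_detail(0);
    return 0;
}
```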
@@ -118,7 +119,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
         global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
 }

-void print_device_opt_feature(int device_count) {
+static void print_device_opt_feature(int device_count) {
     GGML_LOG_INFO("SYCL Optimization Feature:\n");
     GGML_LOG_INFO(
         "|ID| Device Type|Reorder|\n");

@@ -191,10 +192,12 @@ static void ggml_check_sycl() try {
     if (!initialized) {
         g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
         g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
+        g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
         GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
         GGML_LOG_INFO("Running with Environment Variables:\n");
         GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
         GGML_LOG_INFO(" GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
+        GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: %d\n", g_ggml_sycl_disable_graph);
         GGML_LOG_INFO("Build with Macros:\n");
 #if defined(GGML_SYCL_FORCE_MMQ)
         GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: yes\n");
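`GGML_SYCL_DISABLE_GRAPH` defaults to 1 here, so graph execution stays off unless the user opts in at run time. The diff does not show `get_sycl_env` itself; below is a minimal sketch of how such a helper presumably behaves. The name `get_env_int` and the details are assumptions for illustration, not the llama.cpp implementation.

```cpp
#include <cstdlib>
#include <string>

// Assumed behavior: read an integer from the environment, falling back to
// 'default_val' when the variable is unset (non-numeric values would throw).
static int get_env_int(const char * name, int default_val) {
    const char * val = std::getenv(name);
    return val != nullptr ? std::stoi(val) : default_val;
}

int main() {
    // Mirrors the defaults in this hunk: graphs stay disabled unless the
    // user exports GGML_SYCL_DISABLE_GRAPH=0 before launching.
    int disable_graph = get_env_int("GGML_SYCL_DISABLE_GRAPH", 1);
    return disable_graph;
}
```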
@@ -400,7 +403,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }

-void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
+static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
                     const void *ptr_src, size_t size) {
     char *host_buf = (char *)malloc(size);
     q_src.memcpy(host_buf, (const char *)ptr_src, size).wait();

@@ -604,7 +607,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
     return &ggml_backend_sycl_buffer_types[device];
 }

-ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) {
+static ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) {
     GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");

     int device = ctx->device;
@@ -1666,7 +1669,7 @@ static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,

         stream->parallel_for(
             sycl::nd_range<3>(num_blocks * block_size, block_size),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                 quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
             });
     }

@@ -1687,7 +1690,7 @@ static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,

         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                 mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x,
                                      nchannels_y, item_ct1);
             });

@@ -1707,7 +1710,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(

         stream->parallel_for(
             sycl::nd_range<3>(block_nums * block_dims, block_dims),
-            [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+            [=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                 mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
                                        row_stride_x, channel_stride_x,
                                        nchannels_y / nchannels_x, item_ct1);

@@ -1748,7 +1751,7 @@ static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
     const sycl::range<3> block_nums(1, nrows, 1);
     stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
                          [=](sycl::nd_item<3> item_ct1)
-                             [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+                             [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
                              k_sum_rows_f32(x, dst, ncols, item_ct1);
                          });
 }
@@ -2898,7 +2901,7 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
     return false;
 }

-bool ggml_sycl_supports_dmmv(enum ggml_type type) {
+static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
     switch (type) {
         case GGML_TYPE_Q4_0:
         case GGML_TYPE_Q4_1:

Alcpz marked this conversation as resolved.
@@ -3271,7 +3274,7 @@ static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
 }


-void ggml_sycl_set_main_device(const int main_device) try {
+static void ggml_sycl_set_main_device(const int main_device) try {
     if (dpct::get_current_device_id() == static_cast<unsigned int> (main_device)) {
         return;
     }

@@ -3292,7 +3295,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }

-bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * dst) {
+static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * dst) {
     if (!g_sycl_loaded) return false;

     if (dst->src[0] != nullptr && ggml_backend_buffer_is_sycl_split(dst->src[0]->buffer)) {

@@ -3610,7 +3613,7 @@ catch (sycl::exception const &exc) {
   std::exit(1);
 }

-void reorder_qw(char *data_device, const int ncols, const int nrows,
+static void reorder_qw(char *data_device, const int ncols, const int nrows,
         size_t size, size_t offset, dpct::queue_ptr stream) {
     auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
     SYCL_CHECK(
@@ -3624,7 +3627,7 @@ void reorder_qw(char *data_device, const int ncols, const int nrows,

     stream->parallel_for(
         size / sizeof(block_q4_0),
-        [=](auto i) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
+        [=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
             const block_q4_0* x = (const block_q4_0*)tmp_buf;
             const int ib = i;

@@ -3638,7 +3641,7 @@ void reorder_qw(char *data_device, const int ncols, const int nrows,
     sycl::free(tmp_buf, *stream);
 }

-void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
+static void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
     char*data_device = (char*)src0->data;
     size_t ncols = src0->ne[0];
     size_t nrows = src0->ne[1];

@@ -3647,7 +3650,7 @@ void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
     reorder_qw(data_device, ncols, nrows, size, 0, stream);
 }

-void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
+static void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
     ggml_tensor *src0 = dst->src[0];
     ggml_tensor *src1 = dst->src[1];
@@ -3660,7 +3663,7 @@ void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
     }
 }

-void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
+static void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
     dpct::queue_ptr stream = ctx->stream();
     if (ctx->optimized_graph) {
         return;

@@ -3671,10 +3674,9 @@ void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx)
         if (ctx->opt_feature.reorder) opt_for_reorder(cgraph->nodes[i], stream);
     }
 }
-static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
-    ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
-    ggml_sycl_set_main_device(sycl_ctx->device);

+static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph) {
+    ggml_sycl_set_main_device(sycl_ctx->device);
     if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx);

     for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -3696,7 +3698,46 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_
         }
         GGML_ASSERT(ok);
     }
+}

+static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+    auto * sycl_ctx = static_cast<ggml_backend_sycl_context *>(backend->context);
+
+#ifdef GGML_SYCL_GRAPH
+    if (!g_ggml_sycl_disable_graph) {
+        if (!sycl_ctx->exec_graph && !dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_graph)) {
+            GGML_SYCL_DEBUG("[SYCL-GRAPH] can not use graphs on device:%d\n", sycl_ctx->device);
+            ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
+            return GGML_STATUS_SUCCESS;
+        }
+
+        sycl_ex::command_graph model_sycl_graph(*(sycl_ctx->stream()));
+        model_sycl_graph.begin_recording(*(sycl_ctx->stream()));
+        ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
+        model_sycl_graph.end_recording();
+
+        if (!sycl_ctx->exec_graph) {
+            auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}});
+            sycl_ctx->exec_graph = std::make_unique<
+                sycl_ex::command_graph<sycl_ex::graph_state::executable>>(exec_graph);
+        } else {
+            try {
+                sycl_ctx->exec_graph->update(model_sycl_graph);
+                GGML_SYCL_DEBUG("[SYCL-GRAPH] update success\n");
+            } catch (sycl::exception const & e) {
+                GGML_SYCL_DEBUG("[SYCL-GRAPH] Exception when updating graph, %s\n", e.what());
+                auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}});
+                sycl_ctx->exec_graph = std::make_unique<
+                    sycl_ex::command_graph<sycl_ex::graph_state::executable>>(exec_graph);
+            }
+        }
+
+        sycl_ctx->stream()->ext_oneapi_graph(*(sycl_ctx->exec_graph));
+    } else
+#endif
+    {
+        ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
+    }
     return GGML_STATUS_SUCCESS;
 }
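The new compute path is gated twice: at build time by the GGML_SYCL_GRAPH macro and at run time by g_ggml_sycl_disable_graph plus a check of the device's sycl::aspect::ext_oneapi_graph aspect. The pattern it relies on is record, finalize, and replay from the sycl_ext_oneapi_graph extension, with an updatable executable graph. A stripped-down sketch of that pattern outside llama.cpp follows; the queue, buffer, and kernel below are placeholders, and it assumes a device that reports the ext_oneapi_graph aspect.

```cpp
#include <sycl/sycl.hpp>

namespace sycl_ex = sycl::ext::oneapi::experimental;

int main() {
    sycl::queue q{sycl::property::queue::in_order{}};   // assumed in-order queue
    float * data = sycl::malloc_device<float>(1024, q);

    // 1. Record work submitted to the queue into a modifiable graph.
    sycl_ex::command_graph<sycl_ex::graph_state::modifiable> model_graph(q);
    model_graph.begin_recording(q);
    q.parallel_for(sycl::range<1>(1024), [=](sycl::id<1> i) { data[i] = float(i[0]); });
    model_graph.end_recording();

    // 2. Finalize once into an executable graph; the 'updatable' property is what
    //    allows a later recording to be applied via update() instead of re-finalizing.
    auto exec_graph = model_graph.finalize({sycl_ex::property::graph::updatable{}});

    // 3. Replay the whole recorded workload with a single submission.
    q.ext_oneapi_graph(exec_graph);
    q.wait();

    sycl::free(data, q);
    return 0;
}
```

In the PR itself the executable graph is cached in the backend context (sycl_ctx->exec_graph) and updated from a freshly recorded graph on later calls; if update() throws, the code falls back to finalizing a new executable graph.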
@@ -3851,7 +3892,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                     return true;
                 }
                 return false;
-            } break;
+            }
Contributor: After remove so many …
Collaborator: I ran our CI and got no issues, see #12371 (review)
Contributor: These breaks used to show up as unreachable-code warnings from the compiler. I don't know why they went away.
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(op)) {
                 case GGML_UNARY_OP_NEG:
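As the thread above notes, each removed break sits immediately after a return (or after a block from which every path returns), so it can never execute. A small self-contained illustration of the diagnostic being discussed, using a hypothetical function that is not from this file; clang reports it under -Wunreachable-code-break.

```cpp
// clang++ -Wunreachable-code-break -c example.cpp
bool device_supports(int op) {
    switch (op) {
        case 0: {
            return true;
        } break;   // warning: 'break' will never be executed
        default:
            return false;
    }
}
```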
@@ -3869,7 +3910,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
             default:
                 return false;
         }
-        break;
         case GGML_OP_MUL_MAT:
         case GGML_OP_MUL_MAT_ID:
             {

@@ -3900,7 +3940,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                     return false;
                 }
                 return true;
-            } break;
+            }
         case GGML_OP_OUT_PROD:
             return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
         case GGML_OP_GET_ROWS:

@@ -3917,7 +3957,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
             default:
                 return false;
             }
-        } break;
+        }
         case GGML_OP_CPY:
             {
                 ggml_type src0_type = op->src[0]->type;

@@ -3968,12 +4008,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
                 return true;
             }
             return false;
-        } break;
+        }
         case GGML_OP_CONCAT:
             {
                 ggml_type src0_type = op->src[0]->type;
                 return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
-            } break;
+            }
         case GGML_OP_DUP:
         case GGML_OP_ARGMAX:
         case GGML_OP_NONE:
Why is this change needed? Are you using the official oneAPI compiler 2025.0, or an internal compiler?

I agree it's not strictly needed in this PR. I believe `intel::reqd_sub_group_size` will become deprecated in the next compiler release. This works fine with 2025.0, so I don't mind updating this now.

I've been facing a lot of warnings when building llama.cpp, so I find these changes a positive thing.

I am compiling with both: oneAPI and the latest Intel LLVM compiler (the next oneAPI candidate). The latest LLVM issues a deprecation warning, hence the change. See the LLVM release notes: Deprecated `intel::reqd_sub_group_size`; the official SYCL 2020 spelling should be used instead (with the `sycl::` namespace).
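For reference, the change under discussion swaps only the attribute's namespace; kernel behavior is unchanged. A minimal self-contained sketch showing the SYCL 2020 spelling on a kernel lambda follows; it is a standalone example, not taken from the diff, and it assumes the device supports a sub-group size of 32, which stands in for WARP_SIZE.

```cpp
#include <sycl/sycl.hpp>

constexpr int SG_SIZE = 32;   // stand-in for WARP_SIZE in the diff

int main() {
    sycl::queue q;
    q.parallel_for(
         sycl::nd_range<1>(sycl::range<1>(128), sycl::range<1>(32)),
         // Deprecated vendor spelling:  [[intel::reqd_sub_group_size(SG_SIZE)]]
         // SYCL 2020 spelling used after this PR:
         [=](sycl::nd_item<1> it) [[sycl::reqd_sub_group_size(SG_SIZE)]] {
             // empty kernel body: the attribute only pins the sub-group size
             (void) it;
         })
        .wait();
    return 0;
}
```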