4 changes: 3 additions & 1 deletion docs/backend/SYCL.md
@@ -660,8 +660,9 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|--------------------|---------------------------------------|---------------------------------------------|
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.<br>FP32 path - recommended for better performance than FP16 on quantized models|
| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. |
| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
| GGML_SYCL_GRAPH | ON *(default)* \|OFF *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |

@@ -671,6 +672,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimization features based on Intel GPU type, to compare the performance increase |
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through the SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
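
These runtime toggles are ordinary environment variables read once during backend initialization. A minimal sketch of the lookup, mirroring what get_sycl_env in ggml-sycl.cpp does (the helper name here is illustrative):

#include <cstdlib>

// Sketch: read an integer toggle from the environment, falling back to a default.
static int get_env_int(const char * name, int default_val) {
    const char * value = std::getenv(name);
    return value ? std::atoi(value) : default_val;
}

// Example: graphs stay disabled unless GGML_SYCL_DISABLE_GRAPH=0 is exported.
// int disable_graph = get_env_int("GGML_SYCL_DISABLE_GRAPH", 1);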


1 change: 1 addition & 0 deletions ggml/CMakeLists.txt
@@ -186,6 +186,7 @@ option(GGML_OPENMP "ggml: use OpenMP"
option(GGML_RPC "ggml: use RPC" OFF)
option(GGML_SYCL "ggml: use SYCL" OFF)
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON)
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
"ggml: sycl target device")
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
3 changes: 3 additions & 0 deletions ggml/src/ggml-sycl/CMakeLists.txt
@@ -66,6 +66,9 @@ if (WIN32)
find_package(MKL REQUIRED)
target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
else()
if (GGML_SYCL_GRAPH)
add_compile_definitions(GGML_SYCL_GRAPH)
endif()
if (GGML_SYCL_TARGET STREQUAL "INTEL")
target_link_libraries(ggml-sycl PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
5 changes: 5 additions & 0 deletions ggml/src/ggml-sycl/common.hpp
@@ -301,6 +301,7 @@ inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) {
return opt;
}

namespace sycl_ex = sycl::ext::oneapi::experimental;
struct ggml_backend_sycl_context {
int device;
std::string name;
@@ -392,6 +393,10 @@ struct ggml_backend_sycl_context {
return pool(device);
}

#ifdef GGML_SYCL_GRAPH
std::unique_ptr<sycl_ex::command_graph<sycl_ex::graph_state::executable>> exec_graph = nullptr;
#endif

ggml_sycl_pool & host_pool(int device) {
if (host_pools[device] == nullptr) {
host_pools[device] = new_pool_for_host(stream(device, 0), device);
90 changes: 65 additions & 25 deletions ggml/src/ggml-sycl/ggml-sycl.cpp
@@ -46,6 +46,7 @@
static bool g_sycl_loaded = false;
int g_ggml_sycl_debug = 0;
int g_ggml_sycl_disable_optimize = 0;
int g_ggml_sycl_disable_graph = 0;

static ggml_sycl_device_info ggml_sycl_init() {
ggml_sycl_device_info info = {};
@@ -95,7 +96,7 @@ const ggml_sycl_device_info & ggml_sycl_info() {
return info;
}

void print_device_detail(int id, sycl::device &device, std::string device_type) {
static void print_device_detail(int id, sycl::device &device, std::string device_type) {

dpct::device_info prop;
SYCL_CHECK(CHECK_TRY_ERROR(
@@ -118,7 +119,7 @@ void print_device_detail(int id, sycl::device &device, std::string device_type)
global_mem_size, device.get_info<sycl::info::device::driver_version>().c_str());
}

void print_device_opt_feature(int device_count) {
static void print_device_opt_feature(int device_count) {
GGML_LOG_INFO("SYCL Optimization Feature:\n");
GGML_LOG_INFO(
"|ID| Device Type|Reorder|\n");
@@ -191,10 +192,12 @@ static void ggml_check_sycl() try {
if (!initialized) {
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
GGML_LOG_INFO("Running with Environment Variables:\n");
GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
GGML_LOG_INFO(" GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: %d\n", g_ggml_sycl_disable_graph);
GGML_LOG_INFO("Build with Macros:\n");
#if defined(GGML_SYCL_FORCE_MMQ)
GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: yes\n");
@@ -400,7 +403,7 @@ catch (sycl::exception const &exc) {
std::exit(1);
}

void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
const void *ptr_src, size_t size) {
char *host_buf = (char *)malloc(size);
q_src.memcpy(host_buf, (const char *)ptr_src, size).wait();
Expand Down Expand Up @@ -604,7 +607,7 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) {
return &ggml_backend_sycl_buffer_types[device];
}

ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) {
static ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(ggml_backend_sycl_context * ctx) {
GGML_SYCL_DEBUG("[SYCL] call ggml_backend_sycl_buffer_type\n");

int device = ctx->device;
@@ -1666,7 +1669,7 @@ static void quantize_row_q8_1_sycl(const float *x, void *vy, const int kx,

stream->parallel_for(
sycl::nd_range<3>(num_blocks * block_size, block_size),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
Review thread on this change:

Contributor: Why does this need to change? Are you using the official oneAPI compiler 2025.0, or an internal compiler?

Collaborator: I agree it's not strictly needed in this PR. I believe intel::reqd_sub_group_size will become deprecated in the next compiler release. This works fine with 2025.0 so I don't mind updating this now.

Contributor: I've been facing a lot of warnings when building llama.cpp, so I find these changes a positive thing.

Contributor (Author): I am compiling with both oneAPI and the latest intel llvm compiler (the next oneAPI candidate). The latest llvm issues a deprecation warning, hence the change. See the llvm release notes: "Deprecated intel::reqd_sub_group_size, the official SYCL 2020 spelling should be used instead (with sycl:: namespace)."

quantize_q8_1<QUANT_BLOCK_TILE>(x, vy, kx, kx_padded, item_ct1);
});
}
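
For reference, a minimal self-contained sketch of the SYCL 2020 attribute spelling adopted above (the queue, ranges, and WARP_SIZE value here are placeholders, not llama.cpp's):

#include <sycl/sycl.hpp>

constexpr int WARP_SIZE = 32; // placeholder; llama.cpp defines its own

void launch_example(sycl::queue & q) {
    q.parallel_for(
        sycl::nd_range<1>(sycl::range<1>(1024), sycl::range<1>(256)),
        // SYCL 2020 spelling; [[intel::reqd_sub_group_size(...)]] is the
        // deprecated vendor form this PR migrates away from.
        [=](sycl::nd_item<1> item) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
            (void) item; // kernel body elided
        });
}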
@@ -1687,7 +1690,7 @@ static void ggml_mul_mat_p021_f16_f32_sycl(const void *vx, const float *y,

stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_p021_f16_f32(vx, y, dst, ncols_x, nrows_x, nchannels_x,
nchannels_y, item_ct1);
});
@@ -1707,7 +1710,7 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl(

stream->parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
[=](sycl::nd_item<3> item_ct1) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
mul_mat_vec_nc_f16_f32(vx, y, dst, ncols_x, nrows_x,
row_stride_x, channel_stride_x,
nchannels_y / nchannels_x, item_ct1);
@@ -1748,7 +1751,7 @@ static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
const sycl::range<3> block_nums(1, nrows, 1);
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1)
[[intel::reqd_sub_group_size(WARP_SIZE)]] {
[[sycl::reqd_sub_group_size(WARP_SIZE)]] {
k_sum_rows_f32(x, dst, ncols, item_ct1);
});
}
@@ -2898,7 +2901,7 @@ inline bool ggml_sycl_supports_mmq(enum ggml_type type) {
return false;
}

bool ggml_sycl_supports_dmmv(enum ggml_type type) {
static bool ggml_sycl_supports_dmmv(enum ggml_type type) {
switch (type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
@@ -3271,7 +3274,7 @@ static void ggml_sycl_argmax(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
}


void ggml_sycl_set_main_device(const int main_device) try {
static void ggml_sycl_set_main_device(const int main_device) try {
if (dpct::get_current_device_id() == static_cast<unsigned int> (main_device)) {
return;
}
@@ -3292,7 +3295,7 @@ catch (sycl::exception const &exc) {
std::exit(1);
}

bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * dst) {
static bool ggml_sycl_compute_forward(ggml_backend_sycl_context & ctx, struct ggml_tensor * dst) {
if (!g_sycl_loaded) return false;

if (dst->src[0] != nullptr && ggml_backend_buffer_is_sycl_split(dst->src[0]->buffer)) {
@@ -3610,7 +3613,7 @@ catch (sycl::exception const &exc) {
std::exit(1);
}

void reorder_qw(char *data_device, const int ncols, const int nrows,
static void reorder_qw(char *data_device, const int ncols, const int nrows,
size_t size, size_t offset, dpct::queue_ptr stream) {
auto tmp_buf = sycl::malloc_shared<char>(size, *stream);
SYCL_CHECK(
@@ -3624,7 +3627,7 @@ void reorder_qw(char *data_device, const int ncols, const int nrows,

stream->parallel_for(
size / sizeof(block_q4_0),
[=](auto i) [[intel::reqd_sub_group_size(WARP_SIZE)]] {
[=](auto i) [[sycl::reqd_sub_group_size(WARP_SIZE)]] {
const block_q4_0* x = (const block_q4_0*)tmp_buf;
const int ib = i;

@@ -3638,7 +3641,7 @@ void reorder_qw(char *data_device, const int ncols, const int nrows,
sycl::free(tmp_buf, *stream);
}

void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
static void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
char*data_device = (char*)src0->data;
size_t ncols = src0->ne[0];
size_t nrows = src0->ne[1];
@@ -3647,7 +3650,7 @@ void reorder_qw(ggml_tensor * src0, dpct::queue_ptr stream) {
reorder_qw(data_device, ncols, nrows, size, 0, stream);
}

void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
static void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
ggml_tensor *src0 = dst->src[0];
ggml_tensor *src1 = dst->src[1];

@@ -3660,7 +3663,7 @@ void opt_for_reorder(ggml_tensor * dst, dpct::queue_ptr stream) {
}
}

void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
static void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx) {
dpct::queue_ptr stream = ctx->stream();
if (ctx->optimized_graph) {
return;
@@ -3671,10 +3674,9 @@ void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context * ctx)
if (ctx->opt_feature.reorder) opt_for_reorder(cgraph->nodes[i], stream);
}
}
static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
ggml_sycl_set_main_device(sycl_ctx->device);

static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph) {
ggml_sycl_set_main_device(sycl_ctx->device);
if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx);

for (int i = 0; i < cgraph->n_nodes; i++) {
@@ -3696,7 +3698,46 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_
}
GGML_ASSERT(ok);
}
}

static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
auto * sycl_ctx = static_cast<ggml_backend_sycl_context *>(backend->context);

#ifdef GGML_SYCL_GRAPH
if (!g_ggml_sycl_disable_graph) {
if (!sycl_ctx->exec_graph && !dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_graph)) {
GGML_SYCL_DEBUG("[SYCL-GRAPH] can not use graphs on device:%d\n", sycl_ctx->device);
ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
return GGML_STATUS_SUCCESS;
}

sycl_ex::command_graph model_sycl_graph(*(sycl_ctx->stream()));
model_sycl_graph.begin_recording(*(sycl_ctx->stream()));
ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
model_sycl_graph.end_recording();

if (!sycl_ctx->exec_graph) {
auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}});
sycl_ctx->exec_graph = std::make_unique<
sycl_ex::command_graph<sycl_ex::graph_state::executable>>(exec_graph);
} else {
try {
sycl_ctx->exec_graph->update(model_sycl_graph);
GGML_SYCL_DEBUG("[SYCL-GRAPH] update success\n");
} catch (sycl::exception const & e) {
GGML_SYCL_DEBUG("[SYCL-GRAPH] Exception when updating graph, %s\n", e.what());
auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}});
sycl_ctx->exec_graph = std::make_unique<
sycl_ex::command_graph<sycl_ex::graph_state::executable>>(exec_graph);
}
}

sycl_ctx->stream()->ext_oneapi_graph(*(sycl_ctx->exec_graph));
} else
#endif
{
ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
}
return GGML_STATUS_SUCCESS;
}
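
The compute path above is the record-and-replay pattern from the sycl_ext_oneapi_graph extension; stripped of the ggml plumbing, the lifecycle looks roughly like this sketch (the workload is a placeholder):

#include <sycl/sycl.hpp>
namespace sycl_ex = sycl::ext::oneapi::experimental;

void run_with_graph(sycl::queue & q) {
    // 1. Record: capture submissions into a modifiable graph instead of running them.
    sycl_ex::command_graph graph(q);
    graph.begin_recording(q);
    q.single_task([] { /* placeholder workload */ });
    graph.end_recording();

    // 2. Finalize once; `updatable` permits exec.update(...) on later recordings,
    //    which is how the code above avoids re-finalizing on every compute call.
    auto exec = graph.finalize({sycl_ex::property::graph::updatable{}});

    // 3. Replay the captured work with a single submission.
    q.ext_oneapi_graph(exec);
    q.wait();
}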

@@ -3851,7 +3892,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
return true;
}
return false;
} break;
}
Review thread on the removed break statements:

Contributor: After removing so many breaks, the code path will be changed. Have you tested with the CI? This change will impact the CI test. Suggest running the CI locally.

Collaborator: I ran our CI and got no issues, see #12371 (review). The breaks are not needed as there is always a return before them.

Contributor: These breaks used to show up as unreachable-code warnings from the compiler. I don't know why they went away.

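To illustrate the point settled in the thread above (hypothetical names, not PR code): when every path inside a case block returns, a trailing break is dead code, which is what the compiler used to warn about.

enum demo_op { DEMO_OP_CONCAT, DEMO_OP_OTHER };

static bool demo_supports_op(demo_op op, bool type_ok) {
    switch (op) {
        case DEMO_OP_CONCAT: {
            return type_ok;
        } // a `break;` here could never execute: the block always returns
        default:
            return false;
    }
}
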
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
case GGML_UNARY_OP_NEG:
@@ -3869,7 +3910,6 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
default:
return false;
}
break;
case GGML_OP_MUL_MAT:
case GGML_OP_MUL_MAT_ID:
{
@@ -3900,7 +3940,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
return false;
}
return true;
} break;
}
case GGML_OP_OUT_PROD:
return op->type == GGML_TYPE_F32 && op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32 && op->ne[2] == 1 && op->ne[3] == 1;
case GGML_OP_GET_ROWS:
@@ -3917,7 +3957,7 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
default:
return false;
}
} break;
}
case GGML_OP_CPY:
{
ggml_type src0_type = op->src[0]->type;
@@ -3968,12 +4008,12 @@ static bool ggml_backend_sycl_device_supports_op(ggml_backend_dev_t dev, const g
return true;
}
return false;
} break;
}
case GGML_OP_CONCAT:
{
ggml_type src0_type = op->src[0]->type;
return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16;
} break;
}
case GGML_OP_DUP:
case GGML_OP_ARGMAX:
case GGML_OP_NONE: