@@ -6261,7 +6261,8 @@ static void norm_f32_sycl(const float *x, float *dst, const int ncols,
                     });
         });
     } else {
-        const int work_group_size = g_work_group_size;
+        // FIXME: 1024 from cuda
+        const int work_group_size = 1024;
         const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:17: The work-group size passed to the SYCL kernel may exceed
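
The FIXME above marks 1024 as a value carried over from the CUDA backend rather than queried from the device; the DPCT1049 note in the surrounding context spells out the portable alternative. A minimal sketch of that query, not part of this commit and with a hypothetical helper name:

#include <algorithm>
#include <sycl/sycl.hpp>

// Hypothetical helper: clamp the CUDA-derived cap of 1024 to the limit the
// SYCL device actually reports, as the DPCT1049 note recommends.
static int capped_work_group_size(sycl::queue & q) {
    const size_t dev_max =
        q.get_device().get_info<sycl::info::device::max_work_group_size>();
    return (int) std::min<size_t>(1024, dev_max);
}

The same hard-coded 1024 recurs in the group_norm, rms_norm, and soft_max hunks below.
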
@@ -6307,7 +6308,7 @@ static void group_norm_f32_sycl(const float *x, float *dst,
                     });
         });
     } else {
-        const int work_group_size = g_work_group_size;
+        const int work_group_size = 1024;
         const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:18: The work-group size passed to the SYCL kernel may exceed
@@ -6396,7 +6397,7 @@ static void rms_norm_f32_sycl(const float *x, float *dst, const int ncols,
                     });
         });
     } else {
-        const int work_group_size = g_work_group_size;
+        const int work_group_size = 1024;
         const sycl::range<3> block_dims(1, 1, work_group_size);
         /*
         DPCT1049:19: The work-group size passed to the SYCL kernel may exceed
@@ -9246,7 +9247,7 @@ static void soft_max_f32_sycl(const float * x, const float * mask,
                               const int nrows_y, const float scale, const float max_bias,
                               queue_ptr stream) {
     int nth = WARP_SIZE;
-    int max_block_size = g_work_group_size;
+    int max_block_size = 1024;
     while (nth < ncols_x && nth < max_block_size) nth *= 2;
     if (nth>max_block_size) nth = max_block_size;

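
The two lines kept above choose the launch width by doubling from WARP_SIZE until it covers ncols_x or reaches the (now hard-coded) cap. A standalone sketch of that selection, assuming WARP_SIZE is 32:

#include <cstdio>

// Round nth up in powers of two from the warp size, clamped to max_block_size.
static int pick_nth(int ncols_x, int warp_size = 32, int max_block_size = 1024) {
    int nth = warp_size;
    while (nth < ncols_x && nth < max_block_size) nth *= 2;
    if (nth > max_block_size) nth = max_block_size;
    return nth;
}

int main() {
    printf("%d\n", pick_nth(300));  // 512: first power of two >= 300
    printf("%d\n", pick_nth(5000)); // 1024: clamped to max_block_size
    return 0;
}
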
@@ -11452,14 +11453,9 @@ static void ggml_sycl_mul_mat_vec_p021(ggml_backend_sycl_context & ctx, const gg
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     queue_ptr main_stream = ctx.stream();

-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[ctx.device];
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[ctx.device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[ctx.device];
+    void * src0_ddq = src0->data;
+    float * src1_ddf = (float *) src1->data;
+    float * dst_ddf = (float *) dst->data;

     ggml_mul_mat_p021_f16_f32_sycl(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
 }
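
This hunk and the next apply the same simplification: once a tensor is allocated through a ggml-backend device buffer, tensor->data already holds the device pointer, so the per-device ggml_tensor_extra_gpu lookup becomes redundant. A hedged sketch of that contract; the helper is hypothetical, while ggml_backend_buffer_is_host() is part of the real ggml-backend API:

#include "ggml.h"
#include "ggml-backend.h"

// Hypothetical helper illustrating the assumption behind these hunks: for a
// tensor in a non-host backend buffer, ->data is the device pointer itself.
static inline void * sycl_device_ptr(const ggml_tensor * t) {
    GGML_ASSERT(t->buffer != nullptr && !ggml_backend_buffer_is_host(t->buffer));
    return t->data;
}
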
@@ -11490,15 +11486,10 @@ static void ggml_sycl_mul_mat_vec_nc(ggml_backend_sycl_context & ctx, const ggml

     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     queue_ptr main_stream = ctx.stream();
-
-    ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    void * src0_ddq = src0_extra->data_device[ctx.device];
-
-    ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-    float * src1_ddf = (float *) src1_extra->data_device[ctx.device];
-
-    ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
-    float * dst_ddf = (float *) dst_extra->data_device[ctx.device];
+
+    void * src0_ddq = src0->data;
+    float * src1_ddf = (float *) src1->data;
+    float * dst_ddf = (float *) dst->data;

     const int64_t row_stride_x = nb01 / sizeof(sycl::half);
     const int64_t channel_stride_x = nb02 / sizeof(sycl::half);
@@ -12042,9 +12033,6 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
     const int64_t ne = ggml_nelements(src0);
     GGML_ASSERT(ne == ggml_nelements(src1));

-    GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU);
-    GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU);
-
     GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
     GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);

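
The two asserts removed above tested the deprecated per-tensor backend field; under ggml-backend, placement is a property of the buffer a tensor lives in, so a buffer-based check like the sycl_device_ptr sketch earlier would be the modern equivalent if a guard were still wanted.
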
@@ -12053,11 +12041,8 @@ static void ggml_sycl_cpy(ggml_backend_sycl_context & ctx, const ggml_tensor *sr
     SYCL_CHECK(ggml_sycl_set_device(ctx.device));
     queue_ptr main_stream = ctx.stream();

-    const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
-    const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
-
-    char * src0_ddc = (char *) src0_extra->data_device[ctx.device];
-    char * src1_ddc = (char *) src1_extra->data_device[ctx.device];
+    char * src0_ddc = (char *) src0->data;
+    char * src1_ddc = (char *) src1->data;

     if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
         ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream);