Skip to content

Commit dddd1bb

Browse files
committed
resolve merge conflicts and remove group_norm
1 parent 344cf20 commit dddd1bb

File tree

9 files changed

+1212
-353
lines changed

9 files changed

+1212
-353
lines changed

ggml/src/ggml-opencl/CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,14 +55,17 @@ endfunction()
5555

5656
set(GGML_OPENCL_KERNELS
5757
add
58+
argsort
5859
clamp
5960
cpy
6061
cvt
6162
diag_mask_inf
63+
div
6264
gelu
6365
gemv_noshuffle_general
6466
gemv_noshuffle
6567
get_rows
68+
group_norm
6669
im2col_f32
6770
im2col_f16
6871
mul_mat_Ab_Bi_8x4
@@ -83,11 +86,14 @@ set(GGML_OPENCL_KERNELS
8386
rms_norm
8487
rope
8588
scale
89+
sigmoid
8690
silu
8791
softmax_4_f32
8892
softmax_4_f16
8993
softmax_f32
9094
softmax_f16
95+
sub
96+
sum_rows
9197
transpose
9298
concat
9399
tsembd

ggml/src/ggml-opencl/ggml-opencl.cpp

Lines changed: 836 additions & 289 deletions
Large diffs are not rendered by default.
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
2+
3+
#ifdef cl_intel_subgroups
4+
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
5+
#else
6+
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
7+
#endif
8+
9+
#ifdef cl_intel_required_subgroup_size
10+
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
11+
#define INTEL_GPU 1
12+
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
13+
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
14+
#elif defined(cl_qcom_reqd_sub_group_size)
15+
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
16+
#define ADRENO_GPU 1
17+
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
18+
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
19+
#endif
20+
21+
#define SWAP(x, y, T) { T tmp = (x); (x) = (y); (y) = tmp; }
22+
23+
enum ggml_sort_order {
24+
GGML_SORT_ORDER_ASC,
25+
GGML_SORT_ORDER_DESC,
26+
};
27+
28+
kernel void kernel_argsort_f32_i32(
29+
global float * src0,
30+
ulong offset0,
31+
global int * dst,
32+
ulong offsetd,
33+
const int ne00,
34+
const int ne00_pad,
35+
const int order,
36+
local int * dst_row
37+
) {
38+
// bitonic sort
39+
int col = get_local_id(0);
40+
int row = get_group_id(1);
41+
42+
if (col >= ne00_pad) {
43+
return;
44+
}
45+
46+
src0 = (global char *)((global char *)src0 + offset0);
47+
dst = (global float *)((global char *)dst + offsetd);
48+
49+
global float * x_row = src0 + row * ne00;
50+
51+
// initialize indices
52+
dst_row[col] = col;
53+
54+
barrier(CLK_LOCAL_MEM_FENCE);
55+
56+
for (int k = 2; k <= ne00_pad; k *= 2) {
57+
for (int j = k / 2; j > 0; j /= 2) {
58+
int ixj = col ^ j;
59+
if (ixj > col) {
60+
if ((col & k) == 0) {
61+
if (dst_row[col] >= ne00 ||
62+
(dst_row[ixj] < ne00 && (order == GGML_SORT_ORDER_ASC ?
63+
x_row[dst_row[col]] > x_row[dst_row[ixj]] :
64+
x_row[dst_row[col]] < x_row[dst_row[ixj]]))
65+
) {
66+
SWAP(dst_row[col], dst_row[ixj], int);
67+
}
68+
} else {
69+
if (dst_row[ixj] >= ne00 ||
70+
(dst_row[col] < ne00 && (order == GGML_SORT_ORDER_ASC ?
71+
x_row[dst_row[col]] < x_row[dst_row[ixj]] :
72+
x_row[dst_row[col]] > x_row[dst_row[ixj]]))
73+
) {
74+
SWAP(dst_row[col], dst_row[ixj], int);
75+
}
76+
}
77+
}
78+
barrier(CLK_LOCAL_MEM_FENCE);
79+
}
80+
}
81+
82+
// copy the result to dst without the padding
83+
if (col < ne00) {
84+
dst[row * ne00 + col] = dst_row[col];
85+
}
86+
}

ggml/src/ggml-opencl/kernels/div.cl

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
2+
3+
//------------------------------------------------------------------------------
4+
// div
5+
//------------------------------------------------------------------------------
6+
kernel void kernel_div(
7+
global char * src0,
8+
ulong offset0,
9+
global char * src1,
10+
ulong offset1,
11+
global char * dst,
12+
ulong offsetd,
13+
ulong nb00,
14+
ulong nb01,
15+
ulong nb02,
16+
ulong nb03,
17+
int ne10,
18+
int ne11,
19+
int ne12,
20+
int ne13,
21+
ulong nb10,
22+
ulong nb11,
23+
ulong nb12,
24+
ulong nb13,
25+
int ne0,
26+
ulong nb0,
27+
ulong nb1,
28+
ulong nb2,
29+
ulong nb3
30+
) {
31+
src0 = src0 + offset0;
32+
src1 = src1 + offset1;
33+
dst = dst + offsetd;
34+
35+
int i03 = get_group_id(2);
36+
int i02 = get_group_id(1);
37+
int i01 = get_group_id(0);
38+
39+
int i13 = i03 % ne13;
40+
int i12 = i02 % ne12;
41+
int i11 = i01 % ne11;
42+
43+
global char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01;
44+
global char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
45+
global char * dst_ptr = dst + i03*nb3 + i02*nb2 + i01*nb1;
46+
47+
for (int i0 = get_local_id(0); i0 < ne0; i0 += get_local_size(0)) {
48+
const int i10 = i0 % ne10;
49+
*((global float *)(dst_ptr + i0*nb0)) = *((global float *)(src0_ptr + i0*nb00)) / *((global float *)(src1_ptr + i10*nb10));
50+
}
51+
}
52+
53+
// assumption: src1 is a row
54+
// broadcast src1 into src0
55+
kernel void kernel_div_row(
56+
global float4 * src0,
57+
ulong offset0,
58+
global float4 * src1,
59+
ulong offset1,
60+
global float4 * dst,
61+
ulong offsetd,
62+
int ne
63+
) {
64+
src0 = (global float4*)((global char*)src0 + offset0);
65+
src1 = (global float4*)((global char*)src1 + offset1);
66+
dst = (global float4*)((global char*)dst + offsetd);
67+
68+
// This performs better than using %.
69+
uint gid = get_global_id(0);
70+
uint idx1 = gid - (gid/ne)*ne; // get_global_id(0) % ne
71+
dst[gid] = src0[gid] / src1[idx1];
72+
}
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
2+
3+
#ifdef cl_intel_subgroups
4+
#pragma OPENCL EXTENSION cl_intel_subgroups : enable
5+
#else
6+
#pragma OPENCL EXTENSION cl_khr_subgroups : enable
7+
#endif
8+
9+
#ifdef cl_intel_required_subgroup_size
10+
#pragma OPENCL EXTENSION cl_intel_required_subgroup_size : enable
11+
#define INTEL_GPU 1
12+
#define REQD_SUBGROUP_SIZE_16 __attribute__((intel_reqd_sub_group_size(16)))
13+
#define REQD_SUBGROUP_SIZE_32 __attribute__((intel_reqd_sub_group_size(32)))
14+
#elif defined(cl_qcom_reqd_sub_group_size)
15+
#pragma OPENCL EXTENSION cl_qcom_reqd_sub_group_size : enable
16+
#define ADRENO_GPU 1
17+
#define REQD_SUBGROUP_SIZE_64 __attribute__((qcom_reqd_sub_group_size("half")))
18+
#define REQD_SUBGROUP_SIZE_128 __attribute__((qcom_reqd_sub_group_size("full")))
19+
#endif
20+
21+
// Workgroup must be a subgroup
22+
#ifdef INTEL_GPU
23+
REQD_SUBGROUP_SIZE_32
24+
#elif defined (ADRENO_GPU)
25+
REQD_SUBGROUP_SIZE_64
26+
#endif
27+
kernel void kernel_group_norm(
28+
global float * src0,
29+
ulong offset0,
30+
global float * dst,
31+
ulong offsetd,
32+
int ne,
33+
int group_size,
34+
float eps
35+
) {
36+
src0 = (global float *)((global char *)src0 + offset0);
37+
dst = (global float *)((global char *)dst + offsetd);
38+
39+
int start = get_group_id(0) * group_size;
40+
int end = start + group_size;
41+
42+
start += get_local_id(0);
43+
44+
if (end >= ne) {
45+
end = ne;
46+
}
47+
48+
float tmp = 0.0f;
49+
50+
for (int j = start; j < end; j += get_local_size(0)) {
51+
tmp += src0[j];
52+
}
53+
54+
tmp = sub_group_reduce_add(tmp);
55+
56+
const float mean = tmp / group_size;
57+
tmp = 0.0f;
58+
59+
for (int j = start; j < end; j += get_local_size(0)) {
60+
float xi = src0[j] - mean;
61+
dst[j] = xi;
62+
tmp += xi * xi;
63+
}
64+
65+
tmp = sub_group_reduce_add(tmp);
66+
67+
const float variance = tmp / group_size;
68+
const float scale = 1.0f/sqrt(variance + eps);
69+
for (int j = start; j < end; j += get_local_size(0)) {
70+
dst[j] *= scale;
71+
}
72+
}

ggml/src/ggml-opencl/kernels/norm.cl

Lines changed: 0 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -79,67 +79,3 @@ kernel void kernel_norm(
7979
y[i00] = y[i00] * scale;
8080
}
8181
}
82-
kernel void kernel_group_norm(
83-
global void * p_src0,
84-
ulong off_src0,
85-
global void * p_dst,
86-
ulong off_dst,
87-
int elements_per_group_arg,
88-
int total_elements_in_tensor,
89-
float eps,
90-
local float * sdata
91-
) {
92-
global const float * x_tensor_base = (global const float *)((global char *)p_src0 + off_src0);
93-
global float * dst_tensor_base = (global float *)((global char *)p_dst + off_dst);
94-
95-
int group_instance_idx = get_group_id(0);
96-
int tid = get_local_id(0);
97-
int lws = get_local_size(0); // Local work size
98-
99-
int group_data_global_start_idx = group_instance_idx * elements_per_group_arg;
100-
101-
int N = elements_per_group_arg;
102-
if (group_data_global_start_idx + N > total_elements_in_tensor) {
103-
N = total_elements_in_tensor - group_data_global_start_idx;
104-
}
105-
106-
if (N <= 0) {
107-
return;
108-
}
109-
110-
sdata[tid] = 0.0f;
111-
112-
for (int i = tid; i < N; i += lws) {
113-
sdata[tid] += x_tensor_base[group_data_global_start_idx + i];
114-
}
115-
116-
barrier(CLK_LOCAL_MEM_FENCE);
117-
for (uint k = lws / 2; k > 0; k /= 2) {
118-
if (tid < k) {
119-
sdata[tid] += sdata[tid + k];
120-
}
121-
barrier(CLK_LOCAL_MEM_FENCE); // Synchronize after each step of reduction
122-
}
123-
float mean = sdata[0] / N;
124-
125-
sdata[tid] = 0.0f; // Reset local memory for sum of squares.
126-
for (int i = tid; i < N; i += lws) {
127-
float val = x_tensor_base[group_data_global_start_idx + i] - mean;
128-
dst_tensor_base[group_data_global_start_idx + i] = val; // Store recentered value
129-
sdata[tid] += val * val;
130-
}
131-
132-
barrier(CLK_LOCAL_MEM_FENCE);
133-
for (uint k = lws / 2; k > 0; k /= 2) {
134-
if (tid < k) {
135-
sdata[tid] += sdata[tid + k];
136-
}
137-
barrier(CLK_LOCAL_MEM_FENCE);
138-
}
139-
float variance = sdata[0] / N;
140-
float scale = rsqrt(variance + eps);
141-
142-
for (int i = tid; i < N; i += lws) {
143-
dst_tensor_base[group_data_global_start_idx + i] *= scale;
144-
}
145-
}
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
2+
3+
//------------------------------------------------------------------------------
4+
// sigmoid
5+
//------------------------------------------------------------------------------
6+
7+
kernel void kernel_sigmoid_f32(
8+
global float * src0,
9+
ulong offset0,
10+
global float * dst,
11+
ulong offsetd
12+
) {
13+
src0 = (global float*)((global char*)src0 + offset0);
14+
dst = (global float*)((global char*)dst + offsetd);
15+
16+
dst[get_global_id(0)] = 1.0f / (1.0f + exp(-src0[get_global_id(0)]));
17+
}
18+
19+
kernel void kernel_sigmoid_f16(
20+
global half * src0,
21+
ulong offset0,
22+
global half * dst,
23+
ulong offsetd
24+
) {
25+
src0 = (global half*)((global char*)src0 + offset0);
26+
dst = (global half*)((global char*)dst + offsetd);
27+
28+
dst[get_global_id(0)] = 1.0f / (1.0f + exp(-src0[get_global_id(0)]));
29+
}

0 commit comments

Comments
 (0)