Skip to content

Commit 432d276

Browse files
author
Diptorup Deb
authored
Merge pull request #278 from AlexanderKalistratov/kmeans
Update kmeans
2 parents 5daec03 + e90212f commit 432d276

File tree

11 files changed

+820
-426
lines changed

11 files changed

+820
-426
lines changed

dpbench/benchmarks/kmeans/kmeans_initialize.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,9 @@ def initialize(npoints, niters, seed, ndims, ncentroids, types_dict):
1616

1717
arrayP = default_rng.uniform(XL, XH, (npoints, ndims)).astype(f_dtype)
1818
arrayPclusters = np.ones(npoints, dtype=i_dtype)
19-
arrayC = np.ones((ncentroids, ndims), dtype=f_dtype)
20-
arrayCsum = np.ones((ncentroids, ndims), dtype=f_dtype)
19+
arrayC = np.empty((ncentroids, ndims), dtype=f_dtype)
2120
arrayCnumpoint = np.ones(ncentroids, dtype=i_dtype)
2221

23-
return (arrayP, arrayPclusters, arrayC, arrayCsum, arrayCnumpoint)
22+
arrayC[:] = arrayP[:ncentroids]
23+
24+
return (arrayP, arrayPclusters, arrayC, arrayCnumpoint)

dpbench/benchmarks/kmeans/kmeans_numba_dpex_k.py

Lines changed: 212 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -2,106 +2,243 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5+
from functools import lru_cache
56
from math import sqrt
67

78
import numba_dpex as dpex
9+
import numpy
10+
from dpctl import tensor as dpt
11+
from numba_dpex import NdRange
812

913

10-
@dpex.kernel
11-
def groupByCluster(arrayP, arrayPcluster, arrayC, num_points, num_centroids):
12-
idx = dpex.get_global_id(0)
13-
# if idx < num_points: # why it was removed??
14-
dtype = arrayC.dtype
15-
minor_distance = dtype.type(-1)
16-
for i in range(num_centroids):
17-
dx = arrayP[idx, 0] - arrayC[i, 0]
18-
dy = arrayP[idx, 1] - arrayC[i, 1]
19-
my_distance = sqrt(dx * dx + dy * dy)
20-
if minor_distance > my_distance or minor_distance == -1:
21-
minor_distance = my_distance
22-
arrayPcluster[idx] = i
14+
def DivUp(numerator, denominator):
15+
return (numerator + denominator - 1) // denominator
2316

2417

25-
@dpex.kernel
26-
def calCentroidsSum1(arrayCsum, arrayCnumpoint):
27-
i = dpex.get_global_id(0)
28-
arrayCsum[i, 0] = 0
29-
arrayCsum[i, 1] = 0
30-
arrayCnumpoint[i] = 0
18+
def Align(value, base):
19+
return base * DivUp(value, base)
3120

3221

33-
@dpex.kernel
34-
def calCentroidsSum2(arrayP, arrayPcluster, arrayCsum, arrayCnumpoint):
35-
i = dpex.get_global_id(0)
36-
ci = arrayPcluster[i]
37-
dpex.atomic.add(arrayCsum, (ci, 0), arrayP[i, 0])
38-
dpex.atomic.add(arrayCsum, (ci, 1), arrayP[i, 1])
39-
dpex.atomic.add(arrayCnumpoint, ci, 1)
22+
@lru_cache(maxsize=1)
23+
def getGroupByCluster( # noqa: C901
24+
dims, num_centroids, dtyp, WorkPI, local_size_
25+
):
26+
local_copies = min(4, max(1, DivUp(local_size_, num_centroids)))
27+
28+
@dpex.kernel
29+
def groupByCluster(
30+
arrayP, arrayPcluster, arrayC, NewCentroids, NewCount, last
31+
):
32+
numpoints = arrayP.shape[0]
33+
localCentroids = dpex.local.array((dims, num_centroids), dtype=dtyp)
34+
localNewCentroids = dpex.local.array(
35+
(local_copies, dims, num_centroids), dtype=dtyp
36+
)
37+
localNewCount = dpex.local.array(
38+
(local_copies, num_centroids), dtype=dpt.int32
39+
)
4040

41+
grid = dpex.get_group_id(0)
42+
lid = dpex.get_local_id(0)
43+
local_size = dpex.get_local_size(0)
4144

42-
@dpex.kernel
43-
def updateCentroids(arrayC, arrayCsum, arrayCnumpoint, num_centroids):
44-
i = dpex.get_global_id(0)
45-
dtype = arrayC.dtype
46-
arrayC[i, 0] = arrayCsum[i, 0] / dtype.type(arrayCnumpoint[i])
47-
arrayC[i, 1] = arrayCsum[i, 1] / dtype.type(arrayCnumpoint[i])
45+
for i in range(lid, num_centroids * dims, local_size):
46+
localCentroids[i % dims, i // dims] = arrayC[i // dims, i % dims]
47+
for lc in range(local_copies):
48+
localNewCentroids[lc, i % dims, i // dims] = 0
4849

50+
for c in range(lid, num_centroids, local_size):
51+
for lc in range(local_copies):
52+
localNewCount[lc, c] = 0
4953

50-
@dpex.kernel
51-
def copy_arrayC(arrayC, arrayP):
52-
i = dpex.get_global_id(0)
53-
arrayC[i, 0] = arrayP[i, 0]
54-
arrayC[i, 1] = arrayP[i, 1]
54+
dpex.barrier(dpex.LOCAL_MEM_FENCE)
5555

56+
for i in range(WorkPI):
57+
point_id = grid * WorkPI * local_size + i * local_size + lid
58+
if point_id < numpoints:
59+
localP = dpex.private.array(dims, dtyp)
60+
for d in range(dims):
61+
localP[d] = arrayP[point_id, d]
5662

57-
def kmeans_kernel(
58-
arrayP,
59-
arrayPcluster,
60-
arrayC,
61-
arrayCsum,
62-
arrayCnumpoint,
63-
niters,
64-
num_points,
65-
num_centroids,
66-
):
67-
copy_arrayC[num_centroids,](arrayC, arrayP)
63+
minimal_distance = dtyp.type(dpt.inf)
64+
nearest_centroid = int(0)
65+
for c in range(num_centroids):
66+
sq_sum = dtyp.type(0)
67+
for d in range(dims):
68+
sq_sum += (localP[d] - localCentroids[d, c]) ** 2
6869

69-
for i in range(niters):
70-
groupByCluster[num_points,](
71-
arrayP, arrayPcluster, arrayC, num_points, num_centroids
72-
)
70+
if sq_sum < minimal_distance:
71+
nearest_centroid = c
72+
minimal_distance = sq_sum
7373

74-
calCentroidsSum1[num_centroids,](arrayCsum, arrayCnumpoint)
74+
lc = lid % local_copies
75+
for d in range(dims):
76+
dpex.atomic.add(
77+
localNewCentroids, (lc, d, nearest_centroid), localP[d]
78+
)
7579

76-
calCentroidsSum2[num_points,](
77-
arrayP, arrayPcluster, arrayCsum, arrayCnumpoint
78-
)
80+
dpex.atomic.add(localNewCount, (lc, nearest_centroid), 1)
7981

80-
updateCentroids[num_centroids,](
81-
arrayC, arrayCsum, arrayCnumpoint, num_centroids
82-
)
82+
if last:
83+
arrayPcluster[point_id] = nearest_centroid
84+
85+
dpex.barrier(dpex.LOCAL_MEM_FENCE)
86+
87+
for i in range(lid, num_centroids * dims, local_size):
88+
local_centroid_d = dtyp.type(0)
89+
for lc in range(local_copies):
90+
local_centroid_d += localNewCentroids[lc, i % dims, i // dims]
91+
92+
dpex.atomic.add(
93+
NewCentroids,
94+
(i // dims, i % dims),
95+
local_centroid_d,
96+
)
97+
98+
for c in range(lid, num_centroids, local_size):
99+
local_centroid_npoints = dpt.int32.type(0)
100+
for lc in range(local_copies):
101+
local_centroid_npoints += localNewCount[lc, c]
102+
103+
dpex.atomic.add(NewCount, c, local_centroid_npoints)
104+
105+
return groupByCluster
106+
107+
108+
@lru_cache(maxsize=1)
109+
def getUpdateCentroids(dims, num_centroids, dtyp, local_size_):
110+
@dpex.kernel
111+
def updateCentroids(diff, arrayC, arrayCnumpoint, NewCentroids, NewCount):
112+
lid = dpex.get_local_id(0)
113+
local_size = dpex.get_local_size(0)
114+
115+
local_distance = dpex.local.array(local_size_, dtype=dtyp)
116+
117+
max_distance = dtyp.type(0)
118+
for c in range(lid, num_centroids, local_size):
119+
numpoints = NewCount[c]
120+
arrayCnumpoint[c] = numpoints
121+
NewCount[c] = 0
122+
distance = dtyp.type(0)
123+
124+
for d in range(dims):
125+
d0 = arrayC[c, d]
126+
d1 = NewCentroids[c, d]
127+
NewCentroids[c, d] = 0
128+
129+
d1 = d1 / dtyp.type(numpoints) if numpoints > 0 else d0
130+
arrayC[c, d] = d1
131+
132+
distance += d0 * d0 - d1 * d1
83133

84-
return arrayC, arrayCsum, arrayCnumpoint
134+
max_distance = max(max_distance, distance)
135+
local_distance[c] = max_distance
85136

137+
dpex.barrier(dpex.LOCAL_MEM_FENCE)
86138

87-
def kmeans(
139+
if lid == 0:
140+
for c in range(local_size):
141+
max_distance = max(max_distance, local_distance[c])
142+
143+
diff[0] = sqrt(max_distance)
144+
145+
return updateCentroids
146+
147+
148+
@lru_cache(maxsize=1)
149+
def getUpdateLabels(dims, num_centroids, dtyp, WorkPI):
150+
@dpex.kernel
151+
def updateLabels(arrayP, arrayPcluster, arrayC):
152+
numpoints = arrayP.shape[0]
153+
localCentroids = dpex.local.array((dims, num_centroids), dtype=dtyp)
154+
155+
grid = dpex.get_group_id(0)
156+
lid = dpex.get_local_id(0)
157+
local_size = dpex.get_local_size(0)
158+
159+
for i in range(lid, num_centroids * dims, local_size):
160+
localCentroids[i % dims, i // dims] = arrayC[i // dims, i % dims]
161+
162+
dpex.barrier(dpex.LOCAL_MEM_FENCE)
163+
164+
for i in range(WorkPI):
165+
point_id = grid * WorkPI * local_size + i * local_size + lid
166+
if point_id < numpoints:
167+
localP = dpex.private.array(dims, dtyp)
168+
for d in range(dims):
169+
localP[d] = arrayP[point_id, d]
170+
171+
minimal_distance = dtyp.type(dpt.inf)
172+
nearest_centroid = int(0)
173+
for c in range(num_centroids):
174+
sq_sum = dtyp.type(0)
175+
for d in range(dims):
176+
sq_sum += (localP[d] - localCentroids[d, c]) ** 2
177+
178+
if sq_sum < minimal_distance:
179+
nearest_centroid = c
180+
minimal_distance = sq_sum
181+
182+
arrayPcluster[point_id] = nearest_centroid
183+
184+
return updateLabels
185+
186+
187+
@lru_cache(maxsize=1)
188+
def getKernels(dims, num_centroids, dtyp, WorkPI, local_size_):
189+
groupByCluster = getGroupByCluster(
190+
dims, num_centroids, dtyp, WorkPI, local_size_
191+
)
192+
updateCentroids = getUpdateCentroids(dims, num_centroids, dtyp, local_size_)
193+
updateLabels = getUpdateLabels(dims, num_centroids, dtyp, WorkPI)
194+
195+
return groupByCluster, updateCentroids, updateLabels
196+
197+
198+
def kmeans_kernel(
88199
arrayP,
89-
arrayPclusters,
200+
arrayPcluster,
90201
arrayC,
91-
arrayCsum,
92202
arrayCnumpoint,
93203
niters,
94-
npoints,
95-
ndims,
96-
ncentroids,
97204
):
98-
arrayC, arrayCsum, arrayCnumpoint = kmeans_kernel(
99-
arrayP,
100-
arrayPclusters,
101-
arrayC,
102-
arrayCsum,
103-
arrayCnumpoint,
104-
niters,
105-
npoints,
106-
ncentroids,
205+
num_points = arrayP.shape[0]
206+
num_centroids = arrayC.shape[0]
207+
dims = arrayC.shape[1]
208+
device = arrayP.device
209+
210+
NewCentroids = dpt.zeros_like(arrayC._array_obj)
211+
NewCount = dpt.zeros_like(arrayCnumpoint._array_obj)
212+
diff = dpt.zeros(1, dtype=arrayP.dtype, device=device)
213+
diff_host = dpt.inf
214+
215+
tolerance = 0
216+
WorkPI = 8
217+
local_size = min(1024, device.sycl_device.max_work_group_size)
218+
global_size = Align(DivUp(num_points, WorkPI), local_size)
219+
220+
groupByCluster, updateCentroids, updateLabels = getKernels(
221+
dims, num_centroids, arrayP.dtype, WorkPI, local_size
107222
)
223+
224+
for i in range(niters):
225+
last = i == (niters - 1)
226+
if diff_host < tolerance:
227+
updateLabels[NdRange((global_size,), (local_size,))](
228+
arrayP, arrayPcluster, arrayC
229+
)
230+
break
231+
232+
groupByCluster[NdRange((global_size,), (local_size,))](
233+
arrayP, arrayPcluster, arrayC, NewCentroids, NewCount, last
234+
)
235+
236+
update_centroid_size = min(num_centroids, local_size)
237+
updateCentroids[
238+
NdRange((update_centroid_size,), (update_centroid_size,))
239+
](diff, arrayC, arrayCnumpoint, NewCentroids, NewCount)
240+
diff_host = dpt.asnumpy(diff)[0]
241+
242+
243+
def kmeans(arrayP, arrayPclusters, arrayC, arrayCnumpoint, niters):
244+
kmeans_kernel(arrayP, arrayPclusters, arrayC, arrayCnumpoint, niters)

dpbench/benchmarks/kmeans/kmeans_numba_dpex_p.py

Lines changed: 6 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -70,16 +70,10 @@ def copy_arrayC(arrayC, arrayP, num_centroids):
7070
arrayC[i, 1] = arrayP[i, 1]
7171

7272

73-
def kmeans_numba(
74-
arrayP,
75-
arrayPcluster,
76-
arrayC,
77-
arrayCsum,
78-
arrayCnumpoint,
79-
niters,
80-
num_points,
81-
num_centroids,
82-
):
73+
def kmeans_numba(arrayP, arrayPcluster, arrayC, arrayCnumpoint, niters):
74+
num_points = arrayP.shape[0]
75+
num_centroids = arrayC.shape[0]
76+
arrayCsum = np.zeros_like(arrayC)
8377
for i in range(niters):
8478
groupByCluster(arrayP, arrayPcluster, arrayC, num_points, num_centroids)
8579

@@ -106,26 +100,7 @@ def kmeans_numba(
106100
return arrayC, arrayCsum, arrayCnumpoint
107101

108102

109-
def kmeans(
110-
arrayP,
111-
arrayPclusters,
112-
arrayC,
113-
arrayCsum,
114-
arrayCnumpoint,
115-
niters,
116-
npoints,
117-
ndims,
118-
ncentroids,
119-
):
120-
copy_arrayC(arrayC, arrayP, ncentroids)
121-
103+
def kmeans(arrayP, arrayPclusters, arrayC, arrayCnumpoint, niters):
122104
arrayC, arrayCsum, arrayCnumpoint = kmeans_numba(
123-
arrayP,
124-
arrayPclusters,
125-
arrayC,
126-
arrayCsum,
127-
arrayCnumpoint,
128-
niters,
129-
npoints,
130-
ncentroids,
105+
arrayP, arrayPclusters, arrayC, arrayCnumpoint, niters
131106
)

0 commit comments

Comments
 (0)