Skip to content

Commit f5e452b

Browse files
authored
Merge pull request #308 from AlexanderKalistratov/fp32_support_for_sycl_impl
Add support of fp32 for sycl implementations
2 parents fa9de07 + b6d1bac commit f5e452b

File tree

11 files changed

+101
-80
lines changed

11 files changed

+101
-80
lines changed

.github/workflows/build_and_run.yml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ jobs:
2323
python: ["3.9", "3.10", "3.11"]
2424
sycl: ["sycl","no-sycl"]
2525
install: ["pip", "setup.py"]
26+
precision: ["single", "double"]
2627
exclude:
2728
# setuptools<64 + scikit-build produce 'UNKNOWN' package name for
2829
# python 3.11. Could not find exact reference for that issue.
@@ -174,10 +175,12 @@ jobs:
174175
echo "OCL_ICD_FILENAMES=$env:CONDA_PREFIX\Library\lib\intelocl64.dll" >> $env:GITHUB_ENV
175176
176177
- name: Run benchmarks
177-
run: dpbench -i ${{env.WORKLOADS}} run -r2 --no-print-results || exit 1
178+
run: dpbench -i ${{env.WORKLOADS}} run -r2 --no-print-results --precision=${{matrix.precision}} || exit 1
178179

179180
- name: Run rodinia benchmarks
180-
run: dpbench -i ${{env.WORKLOADS}} --last-run run -r2 --no-print-results --rodinia --no-dpbench|| exit 1
181+
# TODO: fix rodinia benchmark: https://github.com/IntelPython/dpbench/issues/316
182+
if: matrix.precision != 'single'
183+
run: dpbench -i ${{env.WORKLOADS}} --last-run run -r2 --no-print-results --rodinia --no-dpbench --precision=${{matrix.precision}} || exit 1
181184

182185
- name: Generate report
183186
run: dpbench -i ${{env.WORKLOADS}} report || exit 1

.github/workflows/conda-package.yml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ jobs:
112112
matrix:
113113
python: ['3.9', '3.10', '3.11']
114114
os: [ubuntu-20.04, ubuntu-latest, windows-latest]
115+
precision: ["single", "double"]
115116
experimental: [false]
116117

117118
continue-on-error: ${{ matrix.experimental }}
@@ -193,11 +194,13 @@ jobs:
193194
# we want to make sure that configuration files are getting populated
194195
- name: Run npbench benchmark
195196
run: |
196-
dpbench -i numpy -b azimint_hist run --npbench
197+
dpbench -i numpy -b azimint_hist run --npbench --precision=${{matrix.precision}}
197198
198199
- name: Run rodinia benchmark
200+
# TODO: fix rodinia benchmark: https://github.com/IntelPython/dpbench/issues/316
201+
if: matrix.precision != 'single'
199202
run: |
200-
dpbench run --rodinia --no-dpbench --no-validate -r 1
203+
dpbench run --rodinia --no-dpbench --no-validate -r 1 --precision=${{matrix.precision}}
201204
202205
upload_anaconda:
203206
name: Upload dppy/label/dev ['${{ matrix.os }}', python='${{ matrix.python }}']

dpbench/benchmarks/black_scholes/black_scholes_sycl_native_ext/black_scholes_sycl/_black_scholes_kernel.hpp

Lines changed: 15 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -6,30 +6,10 @@
66
#include <stdlib.h>
77
#include <type_traits>
88

9-
#ifdef __DO_FLOAT__
10-
#define EXP(x) expf(x)
11-
#define LOG(x) logf(x)
12-
#define SQRT(x) sqrtf(x)
13-
#define ERF(x) erff(x)
14-
#define INVSQRT(x) 1.0f / sqrtf(x)
15-
16-
#define QUARTER 0.25f
17-
#define HALF 0.5f
18-
#define TWO 2.0f
19-
#else
20-
#define EXP(x) sycl::exp(x)
21-
#define LOG(x) sycl::log(x)
22-
#define SQRT(x) sycl::sqrt(x)
23-
#define ERF(x) sycl::erf(x)
24-
#define INVSQRT(x) 1.0 / sycl::sqrt(x)
25-
26-
#define QUARTER 0.25
27-
#define HALF 0.5
28-
#define TWO 2.0
29-
#endif
30-
319
using namespace sycl;
3210

11+
template <typename FpTy> class BlackScholesKernel;
12+
3313
template <typename FpTy>
3414
void black_scholes_impl(queue Queue,
3515
size_t nopt,
@@ -41,27 +21,30 @@ void black_scholes_impl(queue Queue,
4121
FpTy *call,
4222
FpTy *put)
4323
{
24+
constexpr FpTy _0_25 = 0.25;
25+
constexpr FpTy _0_5 = 0.5;
26+
4427
auto e = Queue.submit([&](handler &h) {
45-
h.parallel_for<class BlackScholesKernel>(
28+
h.parallel_for<BlackScholesKernel<FpTy>>(
4629
range<1>{nopt}, [=](id<1> myID) {
4730
FpTy mr = -rate;
48-
FpTy sig_sig_two = volatility * volatility * TWO;
31+
FpTy sig_sig_two = volatility * volatility * 2;
4932
int i = myID[0];
5033
FpTy a, b, c, y, z, e;
5134
FpTy d1, d2, w1, w2;
5235

53-
a = LOG(price[i] / strike[i]);
36+
a = sycl::log(price[i] / strike[i]);
5437
b = t[i] * mr;
5538
z = t[i] * sig_sig_two;
56-
c = QUARTER * z;
57-
y = INVSQRT(z);
39+
c = _0_25 * z;
40+
y = sycl::rsqrt(z);
5841
w1 = (a - b + c) * y;
5942
w2 = (a - b - c) * y;
60-
d1 = ERF(w1);
61-
d2 = ERF(w2);
62-
d1 = HALF + HALF * d1;
63-
d2 = HALF + HALF * d2;
64-
e = EXP(b);
43+
d1 = sycl::erf(w1);
44+
d2 = sycl::erf(w2);
45+
d1 = _0_5 + _0_5 * d1;
46+
d2 = _0_5 + _0_5 * d2;
47+
e = sycl::exp(b);
6548
call[i] = price[i] * d1 - strike[i] * e * d2;
6649
put[i] = call[i] - price[i] + strike[i] * e;
6750
});

dpbench/benchmarks/black_scholes/black_scholes_sycl_native_ext/black_scholes_sycl/_black_scholes_sycl.cpp

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -64,14 +64,22 @@ void black_scholes_sync(size_t /**/,
6464
if (!ensure_compatibility(price, strike, t, call, put))
6565
throw std::runtime_error("Input arrays are not acceptable.");
6666

67-
if (typenum != UAR_DOUBLE) {
68-
throw std::runtime_error("Expected a double precision FP array.");
67+
if (typenum == UAR_FLOAT) {
68+
black_scholes_impl<float>(Queue, nopt, price.get_data<float>(),
69+
strike.get_data<float>(), t.get_data<float>(),
70+
rate, volatility, call.get_data<float>(),
71+
put.get_data<float>());
72+
}
73+
else if (typenum == UAR_DOUBLE) {
74+
black_scholes_impl<double>(
75+
Queue, nopt, price.get_data<double>(), strike.get_data<double>(),
76+
t.get_data<double>(), rate, volatility, call.get_data<double>(),
77+
put.get_data<double>());
78+
}
79+
else {
80+
throw std::runtime_error(
81+
"Expected a double or single precision FP array.");
6982
}
70-
71-
black_scholes_impl(Queue, nopt, price.get_data<double>(),
72-
strike.get_data<double>(), t.get_data<double>(), rate,
73-
volatility, call.get_data<double>(),
74-
put.get_data<double>());
7583
}
7684

7785
PYBIND11_MODULE(_black_scholes_sycl, m)

dpbench/benchmarks/dbscan/dbscan_sycl_native_ext/dbscan_sycl/_dbscan_kernel.hpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -106,6 +106,8 @@ void getNeighborhood(size_t n,
106106
}
107107
}
108108

109+
template <typename FpTy> class DBScanKernel;
110+
109111
template <typename FpTy>
110112
size_t dbscan_impl(queue q,
111113
size_t n_samples,
@@ -126,14 +128,13 @@ size_t dbscan_impl(queue q,
126128
q.wait();
127129

128130
auto e = q.submit([&](handler &h) {
129-
h.parallel_for<class DBScanKernel>(
131+
h.parallel_for<DBScanKernel<FpTy>>(
130132
range<1>{n_samples}, [=](id<1> myID) {
131133
size_t i1 = myID[0];
132134
size_t i2 = (i1 + 1 == n_samples ? n_samples : i1 + 1);
133-
getNeighborhood<double>(n_samples, n_features, data, i2 - i1,
134-
data + i1 * n_features, eps,
135-
d_indices + i1 * n_samples,
136-
d_sizes + i1);
135+
getNeighborhood<FpTy>(n_samples, n_features, data, i2 - i1,
136+
data + i1 * n_features, eps,
137+
d_indices + i1 * n_samples, d_sizes + i1);
137138
});
138139
});
139140

dpbench/benchmarks/dbscan/dbscan_sycl_native_ext/dbscan_sycl/_dbscan_sycl.cpp

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -38,16 +38,21 @@ size_t dbscan_sync(size_t n_samples,
3838
size_t min_pts)
3939
{
4040
auto queue = data.get_queue();
41+
auto typenum = data.get_typenum();
4142

4243
if (!ensure_compatibility(data))
4344
throw std::runtime_error("Input arrays are not acceptable.");
4445

45-
if (data.get_typenum() != UAR_DOUBLE) {
46-
throw std::runtime_error("Expected a double precision FP array.");
46+
if (typenum == UAR_FLOAT) {
47+
return dbscan_impl<float>(queue, n_samples, n_features,
48+
data.get_data<float>(), eps, min_pts);
49+
}
50+
else if (typenum == UAR_DOUBLE) {
51+
return dbscan_impl<double>(queue, n_samples, n_features,
52+
data.get_data<double>(), eps, min_pts);
4753
}
4854

49-
return dbscan_impl<double>(queue, n_samples, n_features,
50-
data.get_data<double>(), eps, min_pts);
55+
throw std::runtime_error("Expected a double or single precision FP array.");
5156
}
5257

5358
PYBIND11_MODULE(_dbscan_sycl, m)

dpbench/benchmarks/l2_norm/l2_norm_sycl_native_ext/l2_norm_sycl/_l2_norm_kernel.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99

1010
using namespace sycl;
1111

12+
template <typename FpTy> class theKernel;
13+
1214
template <typename FpTy>
1315
void l2_norm_impl(queue Queue,
1416
size_t npoints,
@@ -18,7 +20,7 @@ void l2_norm_impl(queue Queue,
1820
{
1921
Queue
2022
.submit([&](handler &h) {
21-
h.parallel_for<class theKernel>(range<1>{npoints}, [=](id<1> myID) {
23+
h.parallel_for<theKernel<FpTy>>(range<1>{npoints}, [=](id<1> myID) {
2224
size_t i = myID[0];
2325
for (size_t k = 0; k < dims; k++) {
2426
d[i] += a[i * dims + k] * a[i * dims + k];

dpbench/benchmarks/l2_norm/l2_norm_sycl_native_ext/l2_norm_sycl/_l2_norm_sycl.cpp

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,20 @@ void l2_norm_sync(dpctl::tensor::usm_ndarray a, dpctl::tensor::usm_ndarray d)
1919

2020
auto dims = 3;
2121
auto npoints = a.get_size() / dims;
22+
auto typenum = a.get_typenum();
2223

23-
if (a.get_typenum() != UAR_DOUBLE) {
24-
throw std::runtime_error("Expected a double precision FP array.");
24+
if (typenum == UAR_FLOAT) {
25+
l2_norm_impl(Queue, npoints, dims, a.get_data<float>(),
26+
d.get_data<float>());
27+
}
28+
else if (typenum == UAR_DOUBLE) {
29+
l2_norm_impl(Queue, npoints, dims, a.get_data<double>(),
30+
d.get_data<double>());
31+
}
32+
else {
33+
throw std::runtime_error(
34+
"Expected a double or single precision FP array.");
2535
}
26-
27-
l2_norm_impl(Queue, npoints, dims, a.get_data<double>(),
28-
d.get_data<double>());
2936
}
3037

3138
PYBIND11_MODULE(_l2_norm_sycl, m)

dpbench/benchmarks/pca/pca_numba_n.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def pca(data, dims_rescaled_data=2):
3939
evectors = evectors[:, :dims_rescaled_data]
4040

4141
# carry out the transformation on the data using eigenvectors
42-
tdata = np.dot(evectors.T, data.T).T
42+
tdata = np.dot(evectors.T.astype(data.dtype), data.T).T
4343

4444
# return the transformed data, eigenvalues, and eigenvectors
4545
return tdata, evalues, evectors

dpbench/benchmarks/rambo/rambo_sycl_native_ext/rambo_sycl/_rambo_kernel.hpp

Lines changed: 10 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,10 @@
1010
#include <stdlib.h>
1111
#include <type_traits>
1212

13-
#define SIN(x) sycl::sin(x)
14-
#define COS(x) sycl::cos(x)
15-
#define SQRT(x) sycl::sqrt(x)
16-
#define LOG(x) sycl::log(x)
17-
1813
using namespace sycl;
1914

15+
template <typename FpTy> class RamboKernel;
16+
2017
template <typename FpTy>
2118
event rambo_impl(queue Queue,
2219
size_t nevts,
@@ -26,20 +23,21 @@ event rambo_impl(queue Queue,
2623
const FpTy *usmQ1,
2724
FpTy *usmOutput)
2825
{
26+
constexpr FpTy pi_v = M_PI;
2927
return Queue.submit([&](handler &h) {
30-
h.parallel_for<class RamboKernel>(range<1>{nevts}, [=](id<1> myID) {
28+
h.parallel_for<RamboKernel<FpTy>>(range<1>{nevts}, [=](id<1> myID) {
3129
for (size_t j = 0; j < nout; j++) {
3230
int i = myID[0];
3331
size_t idx = i * nout + j;
3432

35-
FpTy C = 2.0 * usmC1[idx] - 1.0;
36-
FpTy S = SQRT(1 - C * C);
37-
FpTy F = 2.0 * M_PI * usmF1[idx];
38-
FpTy Q = -LOG(usmQ1[idx]);
33+
FpTy C = 2 * usmC1[idx] - 1;
34+
FpTy S = sycl::sqrt(1 - C * C);
35+
FpTy F = 2 * pi_v * usmF1[idx];
36+
FpTy Q = -sycl::log(usmQ1[idx]);
3937

4038
usmOutput[idx * 4] = Q;
41-
usmOutput[idx * 4 + 1] = Q * S * SIN(F);
42-
usmOutput[idx * 4 + 2] = Q * S * COS(F);
39+
usmOutput[idx * 4 + 1] = Q * S * sycl::sin(F);
40+
usmOutput[idx * 4 + 2] = Q * S * sycl::cos(F);
4341
usmOutput[idx * 4 + 3] = Q * C;
4442
}
4543
});

0 commit comments

Comments
 (0)