From 5318bfde4ce068147f09dea89a871a286d69c74c Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Thu, 26 Oct 2023 06:15:11 +0200 Subject: [PATCH 1/2] Add support of fp32 for sycl implementations --- .../_black_scholes_kernel.hpp | 47 ++++++------------- .../_black_scholes_sycl.cpp | 22 ++++++--- .../dbscan_sycl/_dbscan_kernel.hpp | 11 +++-- .../dbscan_sycl/_dbscan_sycl.cpp | 13 +++-- .../l2_norm_sycl/_l2_norm_kernel.hpp | 4 +- .../l2_norm_sycl/_l2_norm_sycl.cpp | 17 +++++-- dpbench/benchmarks/pca/pca_numba_n.py | 2 +- .../rambo_sycl/_rambo_kernel.hpp | 22 ++++----- .../rambo_sycl/_rambo_sycl.cpp | 29 ++++++++---- 9 files changed, 91 insertions(+), 76 deletions(-) diff --git a/dpbench/benchmarks/black_scholes/black_scholes_sycl_native_ext/black_scholes_sycl/_black_scholes_kernel.hpp b/dpbench/benchmarks/black_scholes/black_scholes_sycl_native_ext/black_scholes_sycl/_black_scholes_kernel.hpp index 59132c50..ad99f5f4 100644 --- a/dpbench/benchmarks/black_scholes/black_scholes_sycl_native_ext/black_scholes_sycl/_black_scholes_kernel.hpp +++ b/dpbench/benchmarks/black_scholes/black_scholes_sycl_native_ext/black_scholes_sycl/_black_scholes_kernel.hpp @@ -6,30 +6,10 @@ #include #include -#ifdef __DO_FLOAT__ -#define EXP(x) expf(x) -#define LOG(x) logf(x) -#define SQRT(x) sqrtf(x) -#define ERF(x) erff(x) -#define INVSQRT(x) 1.0f / sqrtf(x) - -#define QUARTER 0.25f -#define HALF 0.5f -#define TWO 2.0f -#else -#define EXP(x) sycl::exp(x) -#define LOG(x) sycl::log(x) -#define SQRT(x) sycl::sqrt(x) -#define ERF(x) sycl::erf(x) -#define INVSQRT(x) 1.0 / sycl::sqrt(x) - -#define QUARTER 0.25 -#define HALF 0.5 -#define TWO 2.0 -#endif - using namespace sycl; +template class BlackScholesKernel; + template void black_scholes_impl(queue Queue, size_t nopt, @@ -41,27 +21,30 @@ void black_scholes_impl(queue Queue, FpTy *call, FpTy *put) { + constexpr FpTy _0_25 = 0.25; + constexpr FpTy _0_5 = 0.5; + auto e = Queue.submit([&](handler &h) { - h.parallel_for( + h.parallel_for>( range<1>{nopt}, [=](id<1> myID) { FpTy mr = -rate; - FpTy sig_sig_two = volatility * volatility * TWO; + FpTy sig_sig_two = volatility * volatility * 2; int i = myID[0]; FpTy a, b, c, y, z, e; FpTy d1, d2, w1, w2; - a = LOG(price[i] / strike[i]); + a = sycl::log(price[i] / strike[i]); b = t[i] * mr; z = t[i] * sig_sig_two; - c = QUARTER * z; - y = INVSQRT(z); + c = _0_25 * z; + y = sycl::rsqrt(z); w1 = (a - b + c) * y; w2 = (a - b - c) * y; - d1 = ERF(w1); - d2 = ERF(w2); - d1 = HALF + HALF * d1; - d2 = HALF + HALF * d2; - e = EXP(b); + d1 = sycl::erf(w1); + d2 = sycl::erf(w2); + d1 = _0_5 + _0_5 * d1; + d2 = _0_5 + _0_5 * d2; + e = sycl::exp(b); call[i] = price[i] * d1 - strike[i] * e * d2; put[i] = call[i] - price[i] + strike[i] * e; }); diff --git a/dpbench/benchmarks/black_scholes/black_scholes_sycl_native_ext/black_scholes_sycl/_black_scholes_sycl.cpp b/dpbench/benchmarks/black_scholes/black_scholes_sycl_native_ext/black_scholes_sycl/_black_scholes_sycl.cpp index f0bb0d5b..96a49547 100644 --- a/dpbench/benchmarks/black_scholes/black_scholes_sycl_native_ext/black_scholes_sycl/_black_scholes_sycl.cpp +++ b/dpbench/benchmarks/black_scholes/black_scholes_sycl_native_ext/black_scholes_sycl/_black_scholes_sycl.cpp @@ -64,14 +64,22 @@ void black_scholes_sync(size_t /**/, if (!ensure_compatibility(price, strike, t, call, put)) throw std::runtime_error("Input arrays are not acceptable."); - if (typenum != UAR_DOUBLE) { - throw std::runtime_error("Expected a double precision FP array."); + if (typenum == UAR_FLOAT) { + black_scholes_impl(Queue, nopt, price.get_data(), + strike.get_data(), t.get_data(), + rate, volatility, call.get_data(), + put.get_data()); + } + else if (typenum == UAR_DOUBLE) { + black_scholes_impl( + Queue, nopt, price.get_data(), strike.get_data(), + t.get_data(), rate, volatility, call.get_data(), + put.get_data()); + } + else { + throw std::runtime_error( + "Expected a double or single precision FP array."); } - - black_scholes_impl(Queue, nopt, price.get_data(), - strike.get_data(), t.get_data(), rate, - volatility, call.get_data(), - put.get_data()); } PYBIND11_MODULE(_black_scholes_sycl, m) diff --git a/dpbench/benchmarks/dbscan/dbscan_sycl_native_ext/dbscan_sycl/_dbscan_kernel.hpp b/dpbench/benchmarks/dbscan/dbscan_sycl_native_ext/dbscan_sycl/_dbscan_kernel.hpp index 725e6923..2c236f2c 100644 --- a/dpbench/benchmarks/dbscan/dbscan_sycl_native_ext/dbscan_sycl/_dbscan_kernel.hpp +++ b/dpbench/benchmarks/dbscan/dbscan_sycl_native_ext/dbscan_sycl/_dbscan_kernel.hpp @@ -106,6 +106,8 @@ void getNeighborhood(size_t n, } } +template class DBScanKernel; + template size_t dbscan_impl(queue q, size_t n_samples, @@ -126,14 +128,13 @@ size_t dbscan_impl(queue q, q.wait(); auto e = q.submit([&](handler &h) { - h.parallel_for( + h.parallel_for>( range<1>{n_samples}, [=](id<1> myID) { size_t i1 = myID[0]; size_t i2 = (i1 + 1 == n_samples ? n_samples : i1 + 1); - getNeighborhood(n_samples, n_features, data, i2 - i1, - data + i1 * n_features, eps, - d_indices + i1 * n_samples, - d_sizes + i1); + getNeighborhood(n_samples, n_features, data, i2 - i1, + data + i1 * n_features, eps, + d_indices + i1 * n_samples, d_sizes + i1); }); }); diff --git a/dpbench/benchmarks/dbscan/dbscan_sycl_native_ext/dbscan_sycl/_dbscan_sycl.cpp b/dpbench/benchmarks/dbscan/dbscan_sycl_native_ext/dbscan_sycl/_dbscan_sycl.cpp index efe4f4bd..8dd05aa3 100644 --- a/dpbench/benchmarks/dbscan/dbscan_sycl_native_ext/dbscan_sycl/_dbscan_sycl.cpp +++ b/dpbench/benchmarks/dbscan/dbscan_sycl_native_ext/dbscan_sycl/_dbscan_sycl.cpp @@ -38,16 +38,21 @@ size_t dbscan_sync(size_t n_samples, size_t min_pts) { auto queue = data.get_queue(); + auto typenum = data.get_typenum(); if (!ensure_compatibility(data)) throw std::runtime_error("Input arrays are not acceptable."); - if (data.get_typenum() != UAR_DOUBLE) { - throw std::runtime_error("Expected a double precision FP array."); + if (typenum == UAR_FLOAT) { + return dbscan_impl(queue, n_samples, n_features, + data.get_data(), eps, min_pts); + } + else if (typenum == UAR_DOUBLE) { + return dbscan_impl(queue, n_samples, n_features, + data.get_data(), eps, min_pts); } - return dbscan_impl(queue, n_samples, n_features, - data.get_data(), eps, min_pts); + throw std::runtime_error("Expected a double or single precision FP array."); } PYBIND11_MODULE(_dbscan_sycl, m) diff --git a/dpbench/benchmarks/l2_norm/l2_norm_sycl_native_ext/l2_norm_sycl/_l2_norm_kernel.hpp b/dpbench/benchmarks/l2_norm/l2_norm_sycl_native_ext/l2_norm_sycl/_l2_norm_kernel.hpp index daf1d57b..8f13d972 100644 --- a/dpbench/benchmarks/l2_norm/l2_norm_sycl_native_ext/l2_norm_sycl/_l2_norm_kernel.hpp +++ b/dpbench/benchmarks/l2_norm/l2_norm_sycl_native_ext/l2_norm_sycl/_l2_norm_kernel.hpp @@ -9,6 +9,8 @@ using namespace sycl; +template class theKernel; + template void l2_norm_impl(queue Queue, size_t npoints, @@ -18,7 +20,7 @@ void l2_norm_impl(queue Queue, { Queue .submit([&](handler &h) { - h.parallel_for(range<1>{npoints}, [=](id<1> myID) { + h.parallel_for>(range<1>{npoints}, [=](id<1> myID) { size_t i = myID[0]; for (size_t k = 0; k < dims; k++) { d[i] += a[i * dims + k] * a[i * dims + k]; diff --git a/dpbench/benchmarks/l2_norm/l2_norm_sycl_native_ext/l2_norm_sycl/_l2_norm_sycl.cpp b/dpbench/benchmarks/l2_norm/l2_norm_sycl_native_ext/l2_norm_sycl/_l2_norm_sycl.cpp index 90f1a026..ab8b9e06 100644 --- a/dpbench/benchmarks/l2_norm/l2_norm_sycl_native_ext/l2_norm_sycl/_l2_norm_sycl.cpp +++ b/dpbench/benchmarks/l2_norm/l2_norm_sycl_native_ext/l2_norm_sycl/_l2_norm_sycl.cpp @@ -19,13 +19,20 @@ void l2_norm_sync(dpctl::tensor::usm_ndarray a, dpctl::tensor::usm_ndarray d) auto dims = 3; auto npoints = a.get_size() / dims; + auto typenum = a.get_typenum(); - if (a.get_typenum() != UAR_DOUBLE) { - throw std::runtime_error("Expected a double precision FP array."); + if (typenum == UAR_FLOAT) { + l2_norm_impl(Queue, npoints, dims, a.get_data(), + d.get_data()); + } + else if (typenum == UAR_DOUBLE) { + l2_norm_impl(Queue, npoints, dims, a.get_data(), + d.get_data()); + } + else { + throw std::runtime_error( + "Expected a double or single precision FP array."); } - - l2_norm_impl(Queue, npoints, dims, a.get_data(), - d.get_data()); } PYBIND11_MODULE(_l2_norm_sycl, m) diff --git a/dpbench/benchmarks/pca/pca_numba_n.py b/dpbench/benchmarks/pca/pca_numba_n.py index 538222bb..347e2136 100644 --- a/dpbench/benchmarks/pca/pca_numba_n.py +++ b/dpbench/benchmarks/pca/pca_numba_n.py @@ -39,7 +39,7 @@ def pca(data, dims_rescaled_data=2): evectors = evectors[:, :dims_rescaled_data] # carry out the transformation on the data using eigenvectors - tdata = np.dot(evectors.T, data.T).T + tdata = np.dot(evectors.T.astype(data.dtype), data.T).T # return the transformed data, eigenvalues, and eigenvectors return tdata, evalues, evectors diff --git a/dpbench/benchmarks/rambo/rambo_sycl_native_ext/rambo_sycl/_rambo_kernel.hpp b/dpbench/benchmarks/rambo/rambo_sycl_native_ext/rambo_sycl/_rambo_kernel.hpp index c8c71fc0..c090a72c 100644 --- a/dpbench/benchmarks/rambo/rambo_sycl_native_ext/rambo_sycl/_rambo_kernel.hpp +++ b/dpbench/benchmarks/rambo/rambo_sycl_native_ext/rambo_sycl/_rambo_kernel.hpp @@ -10,13 +10,10 @@ #include #include -#define SIN(x) sycl::sin(x) -#define COS(x) sycl::cos(x) -#define SQRT(x) sycl::sqrt(x) -#define LOG(x) sycl::log(x) - using namespace sycl; +template class RamboKernel; + template event rambo_impl(queue Queue, size_t nevts, @@ -26,20 +23,21 @@ event rambo_impl(queue Queue, const FpTy *usmQ1, FpTy *usmOutput) { + constexpr FpTy pi_v = M_PI; return Queue.submit([&](handler &h) { - h.parallel_for(range<1>{nevts}, [=](id<1> myID) { + h.parallel_for>(range<1>{nevts}, [=](id<1> myID) { for (size_t j = 0; j < nout; j++) { int i = myID[0]; size_t idx = i * nout + j; - FpTy C = 2.0 * usmC1[idx] - 1.0; - FpTy S = SQRT(1 - C * C); - FpTy F = 2.0 * M_PI * usmF1[idx]; - FpTy Q = -LOG(usmQ1[idx]); + FpTy C = 2 * usmC1[idx] - 1; + FpTy S = sycl::sqrt(1 - C * C); + FpTy F = 2 * pi_v * usmF1[idx]; + FpTy Q = -sycl::log(usmQ1[idx]); usmOutput[idx * 4] = Q; - usmOutput[idx * 4 + 1] = Q * S * SIN(F); - usmOutput[idx * 4 + 2] = Q * S * COS(F); + usmOutput[idx * 4 + 1] = Q * S * sycl::sin(F); + usmOutput[idx * 4 + 2] = Q * S * sycl::cos(F); usmOutput[idx * 4 + 3] = Q * C; } }); diff --git a/dpbench/benchmarks/rambo/rambo_sycl_native_ext/rambo_sycl/_rambo_sycl.cpp b/dpbench/benchmarks/rambo/rambo_sycl_native_ext/rambo_sycl/_rambo_sycl.cpp index 8357fa68..c6a8e2ff 100644 --- a/dpbench/benchmarks/rambo/rambo_sycl_native_ext/rambo_sycl/_rambo_sycl.cpp +++ b/dpbench/benchmarks/rambo/rambo_sycl_native_ext/rambo_sycl/_rambo_sycl.cpp @@ -55,16 +55,27 @@ void rambo_sync(size_t nevts, if (!ensure_compatibility(C1, F1, Q1)) throw std::runtime_error("Input arrays are not acceptable."); - if (C1.get_typenum() != UAR_DOUBLE || F1.get_typenum() != UAR_DOUBLE || - Q1.get_typenum() != UAR_DOUBLE || output.get_typenum() != UAR_DOUBLE) - { - throw std::runtime_error("Expected a double precision FP array."); - } + if (output.get_typenum() != C1.get_typenum()) + throw std::runtime_error("Input arrays are not acceptable."); + + auto typenum = C1.get_typenum(); - auto e = rambo_impl(Queue, nevts, nout, C1.get_data(), - F1.get_data(), Q1.get_data(), - output.get_data()); - e.wait(); + if (typenum == UAR_FLOAT) { + auto e = rambo_impl(Queue, nevts, nout, C1.get_data(), + F1.get_data(), Q1.get_data(), + output.get_data()); + e.wait(); + } + else if (typenum == UAR_DOUBLE) { + auto e = rambo_impl(Queue, nevts, nout, C1.get_data(), + F1.get_data(), Q1.get_data(), + output.get_data()); + e.wait(); + } + else { + throw std::runtime_error( + "Expected a double or single precision FP array."); + } } PYBIND11_MODULE(_rambo_sycl, m) From b6d1bacf52291318d4002c650a58d29981ce97cf Mon Sep 17 00:00:00 2001 From: Alexander Kalistratov Date: Mon, 6 Nov 2023 14:01:40 +0100 Subject: [PATCH 2/2] Add fp32 precision to matrix --- .github/workflows/build_and_run.yml | 7 +++++-- .github/workflows/conda-package.yml | 7 +++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build_and_run.yml b/.github/workflows/build_and_run.yml index eee84754..543f2058 100644 --- a/.github/workflows/build_and_run.yml +++ b/.github/workflows/build_and_run.yml @@ -23,6 +23,7 @@ jobs: python: ["3.9", "3.10", "3.11"] sycl: ["sycl","no-sycl"] install: ["pip", "setup.py"] + precision: ["single", "double"] exclude: # setuptools<64 + scikit-build produce 'UNKOWN' package name for # python 3.11. Could not find exact reference for that issue. @@ -174,10 +175,12 @@ jobs: echo "OCL_ICD_FILENAMES=$env:CONDA_PREFIX\Library\lib\intelocl64.dll" >> $env:GITHUB_ENV - name: Run benchmarks - run: dpbench -i ${{env.WORKLOADS}} run -r2 --no-print-results || exit 1 + run: dpbench -i ${{env.WORKLOADS}} run -r2 --no-print-results --precision=${{matrix.precision}} || exit 1 - name: Run rodinia benchmarks - run: dpbench -i ${{env.WORKLOADS}} --last-run run -r2 --no-print-results --rodinia --no-dpbench|| exit 1 + # TODO: fix rodinia benchmark: https://github.com/IntelPython/dpbench/issues/316 + if: matrix.precision != 'single' + run: dpbench -i ${{env.WORKLOADS}} --last-run run -r2 --no-print-results --rodinia --no-dpbench --precision=${{matrix.precision}} || exit 1 - name: Generate report run: dpbench -i ${{env.WORKLOADS}} report || exit 1 diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml index 26bb59e2..2c4f2b17 100644 --- a/.github/workflows/conda-package.yml +++ b/.github/workflows/conda-package.yml @@ -112,6 +112,7 @@ jobs: matrix: python: ['3.9', '3.10', '3.11'] os: [ubuntu-20.04, ubuntu-latest, windows-latest] + precision: ["single", "double"] experimental: [false] continue-on-error: ${{ matrix.experimental }} @@ -193,11 +194,13 @@ jobs: # we want to make sure that configuration files are geting populated - name: Run npbench benchmark run: | - dpbench -i numpy -b azimint_hist run --npbench + dpbench -i numpy -b azimint_hist run --npbench --precision=${{matrix.precision}} - name: Run rodinia benchmark + # TODO: fix rodinia benchmark: https://github.com/IntelPython/dpbench/issues/316 + if: matrix.precision != 'single' run: | - dpbench run --rodinia --no-dpbench --no-validate -r 1 + dpbench run --rodinia --no-dpbench --no-validate -r 1 --precision=${{matrix.precision}} upload_anaconda: name: Upload dppy/label/dev ['${{ matrix.os }}', python='${{ matrix.python }}']