From 5318bfde4ce068147f09dea89a871a286d69c74c Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov <alexander.kalistratov@intel.com>
Date: Thu, 26 Oct 2023 06:15:11 +0200
Subject: [PATCH 1/2] Add support of fp32 for sycl implementations

---
 .../_black_scholes_kernel.hpp                 | 47 ++++++-------------
 .../_black_scholes_sycl.cpp                   | 22 ++++++---
 .../dbscan_sycl/_dbscan_kernel.hpp            | 11 +++--
 .../dbscan_sycl/_dbscan_sycl.cpp              | 13 +++--
 .../l2_norm_sycl/_l2_norm_kernel.hpp          |  4 +-
 .../l2_norm_sycl/_l2_norm_sycl.cpp            | 17 +++++--
 dpbench/benchmarks/pca/pca_numba_n.py         |  2 +-
 .../rambo_sycl/_rambo_kernel.hpp              | 22 ++++-----
 .../rambo_sycl/_rambo_sycl.cpp                | 29 ++++++++----
 9 files changed, 91 insertions(+), 76 deletions(-)
diff --git a/dpbench/benchmarks/black_scholes/black_scholes_sycl_native_ext/black_scholes_sycl/_black_scholes_kernel.hpp b/dpbench/benchmarks/black_scholes/black_scholes_sycl_native_ext/black_scholes_sycl/_black_scholes_kernel.hpp
index 59132c50..ad99f5f4 100644
--- a/dpbench/benchmarks/black_scholes/black_scholes_sycl_native_ext/black_scholes_sycl/_black_scholes_kernel.hpp
+++ b/dpbench/benchmarks/black_scholes/black_scholes_sycl_native_ext/black_scholes_sycl/_black_scholes_kernel.hpp
@@ -6,30 +6,10 @@
 #include <stdlib.h>
 #include <type_traits>
 
-#ifdef __DO_FLOAT__
-#define EXP(x) expf(x)
-#define LOG(x) logf(x)
-#define SQRT(x) sqrtf(x)
-#define ERF(x) erff(x)
-#define INVSQRT(x) 1.0f / sqrtf(x)
-
-#define QUARTER 0.25f
-#define HALF 0.5f
-#define TWO 2.0f
-#else
-#define EXP(x) sycl::exp(x)
-#define LOG(x) sycl::log(x)
-#define SQRT(x) sycl::sqrt(x)
-#define ERF(x) sycl::erf(x)
-#define INVSQRT(x) 1.0 / sycl::sqrt(x)
-
-#define QUARTER 0.25
-#define HALF 0.5
-#define TWO 2.0
-#endif
-
 using namespace sycl;
 
+template <typename FpTy> class BlackScholesKernel;
+
 template <typename FpTy>
 void black_scholes_impl(queue Queue,
                         size_t nopt,
@@ -41,27 +21,30 @@ void black_scholes_impl(queue Queue,
                         FpTy *call,
                         FpTy *put)
 {
+    constexpr FpTy _0_25 = 0.25;
+    constexpr FpTy _0_5 = 0.5;
+
     auto e = Queue.submit([&](handler &h) {
-        h.parallel_for<class BlackScholesKernel>(
+        h.parallel_for<BlackScholesKernel<FpTy>>(
             range<1>{nopt}, [=](id<1> myID) {
                 FpTy mr = -rate;
-                FpTy sig_sig_two = volatility * volatility * TWO;
+                FpTy sig_sig_two = volatility * volatility * 2;
                 int i = myID[0];
                 FpTy a, b, c, y, z, e;
                 FpTy d1, d2, w1, w2;
 
-                a = LOG(price[i] / strike[i]);
+                a = sycl::log(price[i] / strike[i]);
                 b = t[i] * mr;
                 z = t[i] * sig_sig_two;
-                c = QUARTER * z;
-                y = INVSQRT(z);
+                c = _0_25 * z;
+                y = sycl::rsqrt(z);
                 w1 = (a - b + c) * y;
                 w2 = (a - b - c) * y;
-                d1 = ERF(w1);
-                d2 = ERF(w2);
-                d1 = HALF + HALF * d1;
-                d2 = HALF + HALF * d2;
-                e = EXP(b);
+                d1 = sycl::erf(w1);
+                d2 = sycl::erf(w2);
+                d1 = _0_5 + _0_5 * d1;
+                d2 = _0_5 + _0_5 * d2;
+                e = sycl::exp(b);
                 call[i] = price[i] * d1 - strike[i] * e * d2;
                 put[i] = call[i] - price[i] + strike[i] * e;
             });
diff --git a/dpbench/benchmarks/black_scholes/black_scholes_sycl_native_ext/black_scholes_sycl/_black_scholes_sycl.cpp b/dpbench/benchmarks/black_scholes/black_scholes_sycl_native_ext/black_scholes_sycl/_black_scholes_sycl.cpp
index f0bb0d5b..96a49547 100644
--- a/dpbench/benchmarks/black_scholes/black_scholes_sycl_native_ext/black_scholes_sycl/_black_scholes_sycl.cpp
+++ b/dpbench/benchmarks/black_scholes/black_scholes_sycl_native_ext/black_scholes_sycl/_black_scholes_sycl.cpp
@@ -64,14 +64,22 @@ void black_scholes_sync(size_t /**/,
     if (!ensure_compatibility(price, strike, t, call, put))
         throw std::runtime_error("Input arrays are not acceptable.");
 
-    if (typenum != UAR_DOUBLE) {
-        throw std::runtime_error("Expected a double precision FP array.");
+    if (typenum == UAR_FLOAT) {
+        black_scholes_impl<float>(Queue, nopt, price.get_data<float>(),
+                                  strike.get_data<float>(), t.get_data<float>(),
+                                  rate, volatility, call.get_data<float>(),
+                                  put.get_data<float>());
+    }
+    else if (typenum == UAR_DOUBLE) {
+        black_scholes_impl<double>(
+            Queue, nopt, price.get_data<double>(), strike.get_data<double>(),
+            t.get_data<double>(), rate, volatility, call.get_data<double>(),
+            put.get_data<double>());
+    }
+    else {
+        throw std::runtime_error(
+            "Expected a double or single precision FP array.");
     }
-
-    black_scholes_impl(Queue, nopt, price.get_data<double>(),
-                       strike.get_data<double>(), t.get_data<double>(), rate,
-                       volatility, call.get_data<double>(),
-                       put.get_data<double>());
 }
 
 PYBIND11_MODULE(_black_scholes_sycl, m)
diff --git a/dpbench/benchmarks/dbscan/dbscan_sycl_native_ext/dbscan_sycl/_dbscan_kernel.hpp b/dpbench/benchmarks/dbscan/dbscan_sycl_native_ext/dbscan_sycl/_dbscan_kernel.hpp
index 725e6923..2c236f2c 100644
--- a/dpbench/benchmarks/dbscan/dbscan_sycl_native_ext/dbscan_sycl/_dbscan_kernel.hpp
+++ b/dpbench/benchmarks/dbscan/dbscan_sycl_native_ext/dbscan_sycl/_dbscan_kernel.hpp
@@ -106,6 +106,8 @@ void getNeighborhood(size_t n,
     }
 }
 
+template <typename FpTy> class DBScanKernel;
+
 template <typename FpTy>
 size_t dbscan_impl(queue q,
                    size_t n_samples,
@@ -126,14 +128,13 @@ size_t dbscan_impl(queue q,
     q.wait();
 
     auto e = q.submit([&](handler &h) {
-        h.parallel_for<class DBScanKernel>(
+        h.parallel_for<DBScanKernel<FpTy>>(
             range<1>{n_samples}, [=](id<1> myID) {
                 size_t i1 = myID[0];
                 size_t i2 = (i1 + 1 == n_samples ? n_samples : i1 + 1);
-                getNeighborhood<double>(n_samples, n_features, data, i2 - i1,
-                                        data + i1 * n_features, eps,
-                                        d_indices + i1 * n_samples,
-                                        d_sizes + i1);
+                getNeighborhood<FpTy>(n_samples, n_features, data, i2 - i1,
+                                      data + i1 * n_features, eps,
+                                      d_indices + i1 * n_samples, d_sizes + i1);
             });
     });
 
diff --git a/dpbench/benchmarks/dbscan/dbscan_sycl_native_ext/dbscan_sycl/_dbscan_sycl.cpp b/dpbench/benchmarks/dbscan/dbscan_sycl_native_ext/dbscan_sycl/_dbscan_sycl.cpp
index efe4f4bd..8dd05aa3 100644
--- a/dpbench/benchmarks/dbscan/dbscan_sycl_native_ext/dbscan_sycl/_dbscan_sycl.cpp
+++ b/dpbench/benchmarks/dbscan/dbscan_sycl_native_ext/dbscan_sycl/_dbscan_sycl.cpp
@@ -38,16 +38,21 @@ size_t dbscan_sync(size_t n_samples,
                    size_t min_pts)
 {
     auto queue = data.get_queue();
+    auto typenum = data.get_typenum();
 
     if (!ensure_compatibility(data))
         throw std::runtime_error("Input arrays are not acceptable.");
 
-    if (data.get_typenum() != UAR_DOUBLE) {
-        throw std::runtime_error("Expected a double precision FP array.");
+    if (typenum == UAR_FLOAT) {
+        return dbscan_impl<float>(queue, n_samples, n_features,
+                                  data.get_data<float>(), eps, min_pts);
+    }
+    else if (typenum == UAR_DOUBLE) {
+        return dbscan_impl<double>(queue, n_samples, n_features,
+                                   data.get_data<double>(), eps, min_pts);
     }
 
-    return dbscan_impl<double>(queue, n_samples, n_features,
-                               data.get_data<double>(), eps, min_pts);
+    throw std::runtime_error("Expected a double or single precision FP array.");
 }
 
 PYBIND11_MODULE(_dbscan_sycl, m)
diff --git a/dpbench/benchmarks/l2_norm/l2_norm_sycl_native_ext/l2_norm_sycl/_l2_norm_kernel.hpp b/dpbench/benchmarks/l2_norm/l2_norm_sycl_native_ext/l2_norm_sycl/_l2_norm_kernel.hpp
index daf1d57b..8f13d972 100644
--- a/dpbench/benchmarks/l2_norm/l2_norm_sycl_native_ext/l2_norm_sycl/_l2_norm_kernel.hpp
+++ b/dpbench/benchmarks/l2_norm/l2_norm_sycl_native_ext/l2_norm_sycl/_l2_norm_kernel.hpp
@@ -9,6 +9,8 @@
 
 using namespace sycl;
 
+template <typename FpTy> class theKernel;
+
 template <typename FpTy>
 void l2_norm_impl(queue Queue,
                   size_t npoints,
@@ -18,7 +20,7 @@ void l2_norm_impl(queue Queue,
 {
     Queue
         .submit([&](handler &h) {
-            h.parallel_for<class theKernel>(range<1>{npoints}, [=](id<1> myID) {
+            h.parallel_for<theKernel<FpTy>>(range<1>{npoints}, [=](id<1> myID) {
                 size_t i = myID[0];
                 for (size_t k = 0; k < dims; k++) {
                     d[i] += a[i * dims + k] * a[i * dims + k];
diff --git a/dpbench/benchmarks/l2_norm/l2_norm_sycl_native_ext/l2_norm_sycl/_l2_norm_sycl.cpp b/dpbench/benchmarks/l2_norm/l2_norm_sycl_native_ext/l2_norm_sycl/_l2_norm_sycl.cpp
index 90f1a026..ab8b9e06 100644
--- a/dpbench/benchmarks/l2_norm/l2_norm_sycl_native_ext/l2_norm_sycl/_l2_norm_sycl.cpp
+++ b/dpbench/benchmarks/l2_norm/l2_norm_sycl_native_ext/l2_norm_sycl/_l2_norm_sycl.cpp
@@ -19,13 +19,20 @@ void l2_norm_sync(dpctl::tensor::usm_ndarray a, dpctl::tensor::usm_ndarray d)
 
     auto dims = 3;
     auto npoints = a.get_size() / dims;
+    auto typenum = a.get_typenum();
 
-    if (a.get_typenum() != UAR_DOUBLE) {
-        throw std::runtime_error("Expected a double precision FP array.");
+    if (typenum == UAR_FLOAT) {
+        l2_norm_impl(Queue, npoints, dims, a.get_data<float>(),
+                     d.get_data<float>());
+    }
+    else if (typenum == UAR_DOUBLE) {
+        l2_norm_impl(Queue, npoints, dims, a.get_data<double>(),
+                     d.get_data<double>());
+    }
+    else {
+        throw std::runtime_error(
+            "Expected a double or single precision FP array.");
     }
-
-    l2_norm_impl(Queue, npoints, dims, a.get_data<double>(),
-                 d.get_data<double>());
 }
 
 PYBIND11_MODULE(_l2_norm_sycl, m)
diff --git a/dpbench/benchmarks/pca/pca_numba_n.py b/dpbench/benchmarks/pca/pca_numba_n.py
index 538222bb..347e2136 100644
--- a/dpbench/benchmarks/pca/pca_numba_n.py
+++ b/dpbench/benchmarks/pca/pca_numba_n.py
@@ -39,7 +39,7 @@ def pca(data, dims_rescaled_data=2):
     evectors = evectors[:, :dims_rescaled_data]
 
     # carry out the transformation on the data using eigenvectors
-    tdata = np.dot(evectors.T, data.T).T
+    tdata = np.dot(evectors.T.astype(data.dtype), data.T).T
 
     # return the transformed data, eigenvalues, and eigenvectors
     return tdata, evalues, evectors
diff --git a/dpbench/benchmarks/rambo/rambo_sycl_native_ext/rambo_sycl/_rambo_kernel.hpp b/dpbench/benchmarks/rambo/rambo_sycl_native_ext/rambo_sycl/_rambo_kernel.hpp
index c8c71fc0..c090a72c 100644
--- a/dpbench/benchmarks/rambo/rambo_sycl_native_ext/rambo_sycl/_rambo_kernel.hpp
+++ b/dpbench/benchmarks/rambo/rambo_sycl_native_ext/rambo_sycl/_rambo_kernel.hpp
@@ -10,13 +10,10 @@
 #include <stdlib.h>
 #include <type_traits>
 
-#define SIN(x) sycl::sin(x)
-#define COS(x) sycl::cos(x)
-#define SQRT(x) sycl::sqrt(x)
-#define LOG(x) sycl::log(x)
-
 using namespace sycl;
 
+template <typename FpTy> class RamboKernel;
+
 template <typename FpTy>
 event rambo_impl(queue Queue,
                  size_t nevts,
@@ -26,20 +23,21 @@ event rambo_impl(queue Queue,
                  const FpTy *usmQ1,
                  FpTy *usmOutput)
 {
+    constexpr FpTy pi_v = M_PI;
     return Queue.submit([&](handler &h) {
-        h.parallel_for<class RamboKernel>(range<1>{nevts}, [=](id<1> myID) {
+        h.parallel_for<RamboKernel<FpTy>>(range<1>{nevts}, [=](id<1> myID) {
             for (size_t j = 0; j < nout; j++) {
                 int i = myID[0];
                 size_t idx = i * nout + j;
 
-                FpTy C = 2.0 * usmC1[idx] - 1.0;
-                FpTy S = SQRT(1 - C * C);
-                FpTy F = 2.0 * M_PI * usmF1[idx];
-                FpTy Q = -LOG(usmQ1[idx]);
+                FpTy C = 2 * usmC1[idx] - 1;
+                FpTy S = sycl::sqrt(1 - C * C);
+                FpTy F = 2 * pi_v * usmF1[idx];
+                FpTy Q = -sycl::log(usmQ1[idx]);
 
                 usmOutput[idx * 4] = Q;
-                usmOutput[idx * 4 + 1] = Q * S * SIN(F);
-                usmOutput[idx * 4 + 2] = Q * S * COS(F);
+                usmOutput[idx * 4 + 1] = Q * S * sycl::sin(F);
+                usmOutput[idx * 4 + 2] = Q * S * sycl::cos(F);
                 usmOutput[idx * 4 + 3] = Q * C;
             }
         });
diff --git a/dpbench/benchmarks/rambo/rambo_sycl_native_ext/rambo_sycl/_rambo_sycl.cpp b/dpbench/benchmarks/rambo/rambo_sycl_native_ext/rambo_sycl/_rambo_sycl.cpp
index 8357fa68..c6a8e2ff 100644
--- a/dpbench/benchmarks/rambo/rambo_sycl_native_ext/rambo_sycl/_rambo_sycl.cpp
+++ b/dpbench/benchmarks/rambo/rambo_sycl_native_ext/rambo_sycl/_rambo_sycl.cpp
@@ -55,16 +55,27 @@ void rambo_sync(size_t nevts,
     if (!ensure_compatibility(C1, F1, Q1))
         throw std::runtime_error("Input arrays are not acceptable.");
 
-    if (C1.get_typenum() != UAR_DOUBLE || F1.get_typenum() != UAR_DOUBLE ||
-        Q1.get_typenum() != UAR_DOUBLE || output.get_typenum() != UAR_DOUBLE)
-    {
-        throw std::runtime_error("Expected a double precision FP array.");
-    }
+    if (output.get_typenum() != C1.get_typenum())
+        throw std::runtime_error("Input arrays are not acceptable.");
+
+    auto typenum = C1.get_typenum();
 
-    auto e = rambo_impl(Queue, nevts, nout, C1.get_data<double>(),
-                        F1.get_data<double>(), Q1.get_data<double>(),
-                        output.get_data<double>());
-    e.wait();
+    if (typenum == UAR_FLOAT) {
+        auto e = rambo_impl(Queue, nevts, nout, C1.get_data<float>(),
+                            F1.get_data<float>(), Q1.get_data<float>(),
+                            output.get_data<float>());
+        e.wait();
+    }
+    else if (typenum == UAR_DOUBLE) {
+        auto e = rambo_impl(Queue, nevts, nout, C1.get_data<double>(),
+                            F1.get_data<double>(), Q1.get_data<double>(),
+                            output.get_data<double>());
+        e.wait();
+    }
+    else {
+        throw std::runtime_error(
+            "Expected a double or single precision FP array.");
+    }
 }
 
 PYBIND11_MODULE(_rambo_sycl, m)

From b6d1bacf52291318d4002c650a58d29981ce97cf Mon Sep 17 00:00:00 2001
From: Alexander Kalistratov <alexander.kalistratov@intel.com>
Date: Mon, 6 Nov 2023 14:01:40 +0100
Subject: [PATCH 2/2] Add fp32 precision to matrix

---
 .github/workflows/build_and_run.yml | 7 +++++--
 .github/workflows/conda-package.yml | 7 +++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build_and_run.yml b/.github/workflows/build_and_run.yml
index eee84754..543f2058 100644
--- a/.github/workflows/build_and_run.yml
+++ b/.github/workflows/build_and_run.yml
@@ -23,6 +23,7 @@ jobs:
         python: ["3.9", "3.10", "3.11"]
         sycl: ["sycl","no-sycl"]
         install: ["pip", "setup.py"]
+        precision: ["single", "double"]
         exclude:
           # setuptools<64 + scikit-build produce 'UNKOWN' package name for
           # python 3.11. Could not find exact reference for that issue.
@@ -174,10 +175,12 @@ jobs:
           echo "OCL_ICD_FILENAMES=$env:CONDA_PREFIX\Library\lib\intelocl64.dll" >> $env:GITHUB_ENV
 
       - name: Run benchmarks
-        run: dpbench -i ${{env.WORKLOADS}} run -r2 --no-print-results || exit 1
+        run: dpbench -i ${{env.WORKLOADS}} run -r2 --no-print-results --precision=${{matrix.precision}} || exit 1
 
       - name: Run rodinia benchmarks
-        run: dpbench -i ${{env.WORKLOADS}} --last-run run -r2 --no-print-results --rodinia --no-dpbench|| exit 1
+        # TODO: fix rodinia benchmark: https://github.com/IntelPython/dpbench/issues/316
+        if: matrix.precision != 'single'
+        run: dpbench -i ${{env.WORKLOADS}} --last-run run -r2 --no-print-results --rodinia --no-dpbench --precision=${{matrix.precision}} || exit 1
 
       - name: Generate report
         run: dpbench -i ${{env.WORKLOADS}} report || exit 1
diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index 26bb59e2..2c4f2b17 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -112,6 +112,7 @@ jobs:
       matrix:
         python: ['3.9', '3.10', '3.11']
         os: [ubuntu-20.04, ubuntu-latest, windows-latest]
+        precision: ["single", "double"]
         experimental: [false]
 
     continue-on-error: ${{ matrix.experimental }}
@@ -193,11 +194,13 @@ jobs:
       # we want to make sure that configuration files are geting populated
       - name: Run npbench benchmark
         run: |
-          dpbench -i numpy -b azimint_hist run --npbench
+          dpbench -i numpy -b azimint_hist run --npbench --precision=${{matrix.precision}}
 
       - name: Run rodinia benchmark
+        # TODO: fix rodinia benchmark: https://github.com/IntelPython/dpbench/issues/316
+        if: matrix.precision != 'single'
         run: |
-          dpbench run --rodinia --no-dpbench --no-validate -r 1
+          dpbench run --rodinia --no-dpbench --no-validate -r 1 --precision=${{matrix.precision}}
 
   upload_anaconda:
     name: Upload dppy/label/dev ['${{ matrix.os }}', python='${{ matrix.python }}']