From 3895a4f02fa20fbbb09f51b52877159e5860aca2 Mon Sep 17 00:00:00 2001 From: Diptorup Deb Date: Mon, 27 Feb 2023 20:03:28 -0600 Subject: [PATCH 01/21] Adds two new helper function to dpexrt_python. - DPEXRTQueue_CreateFromFilterString: to help create a DPCTLSyclQueueRef object from a filter string. - DpexrtQueue_SubmitRange: to submit a ranged kernel. --- numba_dpex/core/runtime/_dpexrt_python.c | 236 +++++++++++++++-------- numba_dpex/core/runtime/context.py | 89 ++++++++- 2 files changed, 245 insertions(+), 80 deletions(-) diff --git a/numba_dpex/core/runtime/_dpexrt_python.c b/numba_dpex/core/runtime/_dpexrt_python.c index fc13f2954e..3712704bec 100644 --- a/numba_dpex/core/runtime/_dpexrt_python.c +++ b/numba_dpex/core/runtime/_dpexrt_python.c @@ -21,6 +21,22 @@ #include "numba/_arraystruct.h" +/* Debugging facilities - enabled at compile-time */ +/* #undef NDEBUG */ +#if 0 +#include +#define DPEXRT_DEBUG(X) \ + { \ + X; \ + fflush(stdout); \ + } +#else +#define DPEXRT_DEBUG(X) \ + if (0) { \ + X; \ + } +#endif + // forward declarations static struct PyUSMArrayObject *PyUSMNdArray_ARRAYOBJ(PyObject *obj); static npy_intp product_of_shape(npy_intp *shape, npy_intp ndim); @@ -30,6 +46,7 @@ static void *usm_host_malloc(size_t size, void *opaque_data); static void usm_free(void *data, void *opaque_data); static NRT_ExternalAllocator * NRT_ExternalAllocator_new_for_usm(DPCTLSyclQueueRef qref, size_t usm_type); +static void *DPEXRTQueue_CreateFromFilterString(const char *device); static MemInfoDtorInfo *MemInfoDtorInfo_new(NRT_MemInfo *mi, PyObject *owner); static NRT_MemInfo *NRT_MemInfo_new_from_usmndarray(PyObject *ndarrobj, void *data, @@ -106,6 +123,91 @@ static void usm_free(void *data, void *opaque_data) DPCTLfree_with_queue(data, qref); } +/*----------------------------------------------------------------------------*/ +/*--------- Functions for dpctl libsyclinterface/sycl gluing ---------*/ +/*----------------------------------------------------------------------------*/ + +/*! + * @brief Creates and returns a DPCTLSyclQueueRef from a filter string. + * + * @param device A sycl::oneapi_ext::filter_string + * @return {DPCTLSyclQueueRef} A DPCTLSyclQueueRef object as void*. 
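+ *                          The caller is responsible for freeing the
+ *                          returned queue reference, e.g., using
+ *                          DPCTLQueue_Delete. NULL is returned if the
+ *                          queue could not be created.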
+ */ +static void *DPEXRTQueue_CreateFromFilterString(const char *device) +{ + DPCTLSyclDeviceSelectorRef dselector = NULL; + DPCTLSyclDeviceRef dref = NULL; + DPCTLSyclQueueRef qref = NULL; + + DPEXRT_DEBUG(nrt_debug_print( + "DPEXRT-DEBUG: Inside DPEXRT_get_sycl_queue %s, line %d\n", __FILE__, + __LINE__)); + + if (!(dselector = DPCTLFilterSelector_Create(device))) { + DPEXRT_DEBUG(nrt_debug_print( + "DPEXRT-ERROR: Could not create a sycl::device_selector from " + "filter string: %s at %s %d.\n", + device, __FILE__, __LINE__)); + goto error; + } + + if (!(dref = DPCTLDevice_CreateFromSelector(dselector))) + goto error; + + if (!(qref = DPCTLQueue_CreateForDevice(dref, NULL, 0))) + goto error; + + DPCTLDeviceSelector_Delete(dselector); + DPCTLDevice_Delete(dref); + + DPEXRT_DEBUG(nrt_debug_print( + "DPEXRT-DEBUG: Created sycl::queue on device %s at %s, line %d\n", + device, __FILE__, __LINE__)); + + return (void *)qref; + +error: + DPCTLDeviceSelector_Delete(dselector); + DPCTLDevice_Delete(dref); + + return NULL; +} + +static void DpexrtQueue_SubmitRange(const void *KRef, + const void *QRef, + void **Args, + const DPCTLKernelArgType *ArgTypes, + size_t NArgs, + const size_t Range[3], + size_t NRange, + const void *DepEvents, + size_t NDepEvents) +{ + DPCTLSyclEventRef eref = NULL; + DPCTLSyclQueueRef qref = NULL; + + DPEXRT_DEBUG(nrt_debug_print( + "DPEXRT-DEBUG: Inside DpexrtQueue_SubmitRange %s, line %d\n", __FILE__, + __LINE__)); + + qref = (DPCTLSyclQueueRef)QRef; + + eref = DPCTLQueue_SubmitRange( + (DPCTLSyclKernelRef)KRef, qref, Args, (DPCTLKernelArgType *)ArgTypes, + NArgs, Range, NRange, (DPCTLSyclEventRef *)DepEvents, NDepEvents); + DPCTLQueue_Wait(qref); + DPCTLEvent_Wait(eref); + DPCTLEvent_Delete(eref); + + DPEXRT_DEBUG(nrt_debug_print( + "DPEXRT-DEBUG: Done with DpexrtQueue_SubmitRange %s, line %d\n", + __FILE__, __LINE__)); +} + +/*----------------------------------------------------------------------------*/ +/*---------------------- Functions for NRT_MemInfo allocation ----------------*/ +/*----------------------------------------------------------------------------*/ + /*! * @brief Creates a new NRT_ExternalAllocator object tied to a SYCL USM * allocator. 
@@ -128,13 +230,15 @@ NRT_ExternalAllocator_new_for_usm(DPCTLSyclQueueRef qref, size_t usm_type) allocator = (NRT_ExternalAllocator *)malloc(sizeof(NRT_ExternalAllocator)); if (allocator == NULL) { - NRT_Debug(nrt_debug_print("DPEXRT-ERROR: failed to allocate memory for " - "NRT_ExternalAllocator at %s, line %d.\n", - __FILE__, __LINE__)); + DPEXRT_DEBUG( + nrt_debug_print("DPEXRT-ERROR: failed to allocate memory for " + "NRT_ExternalAllocator at %s, line %d.\n", + __FILE__, __LINE__)); goto error; } - NRT_Debug(nrt_debug_print("DPEXRT-DEBUG: usm type = %d at %s, line %d.\n", - usm_type, __FILE__, __LINE__)); + DPEXRT_DEBUG( + nrt_debug_print("DPEXRT-DEBUG: usm type = %d at %s, line %d.\n", + usm_type, __FILE__, __LINE__)); switch (usm_type) { case 1: @@ -147,9 +251,9 @@ NRT_ExternalAllocator_new_for_usm(DPCTLSyclQueueRef qref, size_t usm_type) allocator->malloc = usm_host_malloc; break; default: - NRT_Debug(nrt_debug_print("DPEXRT-ERROR: Encountered an unknown usm " - "allocation type (%d) at %s, line %d\n", - usm_type, __FILE__, __LINE__)); + DPEXRT_DEBUG(nrt_debug_print("DPEXRT-ERROR: Encountered an unknown usm " + "allocation type (%d) at %s, line %d\n", + usm_type, __FILE__, __LINE__)); goto error; } @@ -193,7 +297,7 @@ static void usmndarray_meminfo_dtor(void *ptr, size_t size, void *info) // Sanity-check to make sure the mi_dtor_info is an actual pointer. if (!(mi_dtor_info = (MemInfoDtorInfo *)info)) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: MemInfoDtorInfo object might be corrupted. Aborting " "MemInfo destruction at %s, line %d\n", __FILE__, __LINE__)); @@ -248,9 +352,9 @@ static MemInfoDtorInfo *MemInfoDtorInfo_new(NRT_MemInfo *mi, PyObject *owner) MemInfoDtorInfo *mi_dtor_info = NULL; if (!(mi_dtor_info = (MemInfoDtorInfo *)malloc(sizeof(MemInfoDtorInfo)))) { - NRT_Debug(nrt_debug_print("DPEXRT-ERROR: Could not allocate a new " - "MemInfoDtorInfo object at %s, line %d\n", - __FILE__, __LINE__)); + DPEXRT_DEBUG(nrt_debug_print("DPEXRT-ERROR: Could not allocate a new " + "MemInfoDtorInfo object at %s, line %d\n", + __FILE__, __LINE__)); return NULL; } mi_dtor_info->mi = mi; @@ -283,7 +387,7 @@ static NRT_MemInfo *NRT_MemInfo_new_from_usmndarray(PyObject *ndarrobj, // Allocate a new NRT_MemInfo object if (!(mi = (NRT_MemInfo *)malloc(sizeof(NRT_MemInfo)))) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: Could not allocate a new NRT_MemInfo " "object at %s, line %d\n", __FILE__, __LINE__)); @@ -291,7 +395,7 @@ static NRT_MemInfo *NRT_MemInfo_new_from_usmndarray(PyObject *ndarrobj, } if (!(cref = DPCTLQueue_GetContext(qref))) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: Could not get the DPCTLSyclContext from " "the queue object at %s, line %d\n", __FILE__, __LINE__)); @@ -303,7 +407,7 @@ static NRT_MemInfo *NRT_MemInfo_new_from_usmndarray(PyObject *ndarrobj, // Allocate a new NRT_ExternalAllocator if (!(ext_alloca = NRT_ExternalAllocator_new_for_usm(qref, usm_type))) { - NRT_Debug( + DPEXRT_DEBUG( nrt_debug_print("DPEXRT-ERROR: Could not allocate a new " "NRT_ExternalAllocator object at %s, line %d\n", __FILE__, __LINE__)); @@ -312,9 +416,9 @@ static NRT_MemInfo *NRT_MemInfo_new_from_usmndarray(PyObject *ndarrobj, // Allocate a new MemInfoDtorInfo if (!(midtor_info = MemInfoDtorInfo_new(mi, ndarrobj))) { - NRT_Debug(nrt_debug_print("DPEXRT-ERROR: Could not allocate a new " - "MemInfoDtorInfo object at %s, line %d\n", - __FILE__, __LINE__)); + 
DPEXRT_DEBUG(nrt_debug_print("DPEXRT-ERROR: Could not allocate a new " + "MemInfoDtorInfo object at %s, line %d\n", + __FILE__, __LINE__)); goto error; } @@ -326,14 +430,14 @@ static NRT_MemInfo *NRT_MemInfo_new_from_usmndarray(PyObject *ndarrobj, mi->size = nitems * itemsize; mi->external_allocator = ext_alloca; - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-DEBUG: NRT_MemInfo_init mi=%p external_allocator=%p\n", mi, ext_alloca)); return mi; error: - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: Failed inside NRT_MemInfo_new_from_usmndarray clean up " "and return NULL at %s, line %d\n", __FILE__, __LINE__)); @@ -359,37 +463,27 @@ DPEXRT_MemInfo_alloc(npy_intp size, size_t usm_type, const char *device) NRT_MemInfo *mi = NULL; NRT_ExternalAllocator *ext_alloca = NULL; MemInfoDtorInfo *midtor_info = NULL; - DPCTLSyclDeviceSelectorRef dselector = NULL; - DPCTLSyclDeviceRef dref = NULL; DPCTLSyclQueueRef qref = NULL; - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-DEBUG: Inside DPEXRT_MemInfo_alloc %s, line %d\n", __FILE__, __LINE__)); // Allocate a new NRT_MemInfo object if (!(mi = (NRT_MemInfo *)malloc(sizeof(NRT_MemInfo)))) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: Could not allocate a new NRT_MemInfo object.\n")); goto error; } - if (!(dselector = DPCTLFilterSelector_Create(device))) { - NRT_Debug(nrt_debug_print( - "DPEXRT-ERROR: Could not create a sycl::device_selector from " - "filter string: %s at %s %d.\n", - device, __FILE__, __LINE__)); + if (!(qref = (DPCTLSyclQueueRef)DPEXRTQueue_CreateFromFilterString(device))) + { + DPEXRT_DEBUG( + nrt_debug_print("DPEXRT-ERROR: Could not create a sycl::queue from " + "filter string: %s at %s %d.\n", + device, __FILE__, __LINE__)); goto error; } - if (!(dref = DPCTLDevice_CreateFromSelector(dselector))) - goto error; - - if (!(qref = DPCTLQueue_CreateForDevice(dref, NULL, 0))) - goto error; - - DPCTLDeviceSelector_Delete(dselector); - DPCTLDevice_Delete(dref); - // Allocate a new NRT_ExternalAllocator if (!(ext_alloca = NRT_ExternalAllocator_new_for_usm(qref, usm_type))) goto error; @@ -407,7 +501,7 @@ DPEXRT_MemInfo_alloc(npy_intp size, size_t usm_type, const char *device) mi->size = size; mi->external_allocator = ext_alloca; - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-DEBUG: DPEXRT_MemInfo_alloc mi=%p " "external_allocator=%p for usm_type %zu on device %s, %s at %d\n", mi, ext_alloca, usm_type, device, __FILE__, __LINE__)); @@ -418,8 +512,6 @@ DPEXRT_MemInfo_alloc(npy_intp size, size_t usm_type, const char *device) free(mi); free(ext_alloca); free(midtor_info); - DPCTLDeviceSelector_Delete(dselector); - DPCTLDevice_Delete(dref); return NULL; } @@ -444,8 +536,6 @@ static NRT_MemInfo *DPEXRT_MemInfo_fill(NRT_MemInfo *mi, uint8_t value, const char *device) { - DPCTLSyclDeviceSelectorRef dselector = NULL; - DPCTLSyclDeviceRef dref = NULL; DPCTLSyclQueueRef qref = NULL; DPCTLSyclEventRef eref = NULL; size_t count = 0, size = 0, exp = 0; @@ -455,34 +545,20 @@ static NRT_MemInfo *DPEXRT_MemInfo_fill(NRT_MemInfo *mi, exp++; count = (unsigned int)(size >> exp); - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-DEBUG: mi->size = %u, itemsize = %u, count = %u, " "value = %u, Inside DPEXRT_MemInfo_fill %s, line %d\n", mi->size, itemsize << exp, count, value, __FILE__, __LINE__)); if (mi->data == NULL) { - NRT_Debug(nrt_debug_print("DPEXRT-DEBUG: mi->data is NULL, " - "Inside DPEXRT_MemInfo_fill %s, line 
%d\n", - __FILE__, __LINE__)); + DPEXRT_DEBUG(nrt_debug_print("DPEXRT-DEBUG: mi->data is NULL, " + "Inside DPEXRT_MemInfo_fill %s, line %d\n", + __FILE__, __LINE__)); goto error; } - if (!(dselector = DPCTLFilterSelector_Create(device))) { - NRT_Debug(nrt_debug_print( - "DPEXRT-ERROR: Could not create a sycl::device_selector from " - "filter string: %s at %s %d.\n", - device, __FILE__, __LINE__)); + if (!(qref = (DPCTLSyclQueueRef)DPEXRTQueue_CreateFromFilterString(device))) goto error; - } - - if (!(dref = DPCTLDevice_CreateFromSelector(dselector))) - goto error; - - if (!(qref = DPCTLQueue_CreateForDevice(dref, NULL, 0))) - goto error; - - DPCTLDeviceSelector_Delete(dselector); - DPCTLDevice_Delete(dref); switch (exp) { case 3: @@ -535,8 +611,6 @@ static NRT_MemInfo *DPEXRT_MemInfo_fill(NRT_MemInfo *mi, error: DPCTLQueue_Delete(qref); DPCTLEvent_Delete(eref); - DPCTLDeviceSelector_Delete(dselector); - DPCTLDevice_Delete(dref); return NULL; } @@ -616,20 +690,20 @@ static int DPEXRT_sycl_usm_ndarray_from_python(PyObject *obj, // collecting the array. Py_IncRef(obj); - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-DEBUG: In DPEXRT_sycl_usm_ndarray_from_python.\n")); // Check if the PyObject obj has an _array_obj attribute that is of // dpctl.tensor.usm_ndarray type. if (!(arrayobj = PyUSMNdArray_ARRAYOBJ(obj))) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: PyUSMNdArray_ARRAYOBJ check failed %d\n", __FILE__, __LINE__)); goto error; } if (!(ndim = UsmNDArray_GetNDim(arrayobj))) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: UsmNDArray_GetNDim returned 0 at %s, line %d\n", __FILE__, __LINE__)); goto error; @@ -640,7 +714,7 @@ static int DPEXRT_sycl_usm_ndarray_from_python(PyObject *obj, nitems = product_of_shape(shape, ndim); itemsize = (npy_intp)UsmNDArray_GetElementSize(arrayobj); if (!(qref = UsmNDArray_GetQueueRef(arrayobj))) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: UsmNDArray_GetQueueRef returned NULL at " "%s, line %d.\n", __FILE__, __LINE__)); @@ -650,7 +724,7 @@ static int DPEXRT_sycl_usm_ndarray_from_python(PyObject *obj, if (!(arystruct->meminfo = NRT_MemInfo_new_from_usmndarray( obj, data, nitems, itemsize, qref))) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: NRT_MemInfo_new_from_usmndarray failed " "at %s, line %d.\n", __FILE__, __LINE__)); @@ -698,7 +772,7 @@ static int DPEXRT_sycl_usm_ndarray_from_python(PyObject *obj, // code of -1. // Decref the Pyobject of the array // ensure the GIL - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: Failed to unbox dpnp ndarray into a Numba " "arraystruct at %s, line %d\n", __FILE__, __LINE__)); @@ -732,7 +806,7 @@ static PyObject *box_from_arystruct_parent(arystruct_t *arystruct, struct PyUSMArrayObject *arrayobj = NULL; npy_intp itemsize = 0; - NRT_Debug(nrt_debug_print("DPEXRT-DEBUG: In try_to_return_parent.\n")); + DPEXRT_DEBUG(nrt_debug_print("DPEXRT-DEBUG: In try_to_return_parent.\n")); if (!(arrayobj = PyUSMNdArray_ARRAYOBJ(arystruct->parent))) return NULL; @@ -778,7 +852,7 @@ static PyObject *box_from_arystruct_parent(arystruct_t *arystruct, // parent, we need to increment the reference count of the parent here. 
Py_IncRef(array); - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-DEBUG: try_to_return_parent found a valid parent.\n")); /* Yes, it is the same array return a new reference */ @@ -818,7 +892,7 @@ DPEXRT_sycl_usm_ndarray_to_python_acqref(arystruct_t *arystruct, int exp = 0; npy_intp itemsize = 0; - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-DEBUG: In DPEXRT_sycl_usm_ndarray_to_python_acqref.\n")); if (descr == NULL) { @@ -837,7 +911,7 @@ DPEXRT_sycl_usm_ndarray_to_python_acqref(arystruct_t *arystruct, // If the arystruct has a parent attribute, try to box the parent and // return it. if (arystruct->parent) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-DEBUG: arystruct has a parent, therefore " "trying to box and return the parent at %s, line %d\n", __FILE__, __LINE__)); @@ -875,8 +949,8 @@ DPEXRT_sycl_usm_ndarray_to_python_acqref(arystruct_t *arystruct, NRT_MemInfo_acquire(arystruct->meminfo); status = MemInfo_init(miobj, args, NULL); if (status != 0) { - NRT_Debug(nrt_debug_print("MemInfo_init failed at %s, line %d\n", - __FILE__, __LINE__)); + DPEXRT_DEBUG(nrt_debug_print("MemInfo_init failed at %s, line %d\n", + __FILE__, __LINE__)); Py_DECREF(args); PyErr_Format(PyExc_ValueError, "In 'DPEXRT_sycl_usm_ndarray_to_python_acqref', " @@ -960,7 +1034,7 @@ DPEXRT_sycl_usm_ndarray_to_python_acqref(arystruct_t *arystruct, return (PyObject *)NULL; } - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "Returning from DPEXRT_sycl_usm_ndarray_to_python_acqref " "at %s, line %d\n", __FILE__, __LINE__)); @@ -994,6 +1068,9 @@ static PyObject *build_c_helpers_dict(void) &DPEXRT_sycl_usm_ndarray_from_python); _declpointer("DPEXRT_sycl_usm_ndarray_to_python_acqref", &DPEXRT_sycl_usm_ndarray_to_python_acqref); + _declpointer("DPEXRTQueue_CreateFromFilterString", + &DPEXRTQueue_CreateFromFilterString); + _declpointer("DpexrtQueue_SubmitRange", &DpexrtQueue_SubmitRange); _declpointer("DPEXRT_MemInfo_alloc", &DPEXRT_MemInfo_alloc); _declpointer("DPEXRT_MemInfo_fill", &DPEXRT_MemInfo_fill); _declpointer("NRT_ExternalAllocator_new_for_usm", @@ -1044,6 +1121,11 @@ MOD_INIT(_dpexrt_python) PyModule_AddObject( m, "DPEXRT_sycl_usm_ndarray_to_python_acqref", PyLong_FromVoidPtr(&DPEXRT_sycl_usm_ndarray_to_python_acqref)); + + PyModule_AddObject(m, "DPEXRTQueue_CreateFromFilterString", + PyLong_FromVoidPtr(&DPEXRTQueue_CreateFromFilterString)); + PyModule_AddObject(m, "DpexrtQueue_SubmitRange", + PyLong_FromVoidPtr(&DpexrtQueue_SubmitRange)); PyModule_AddObject(m, "DPEXRT_MemInfo_alloc", PyLong_FromVoidPtr(&DPEXRT_MemInfo_alloc)); PyModule_AddObject(m, "DPEXRT_MemInfo_fill", diff --git a/numba_dpex/core/runtime/context.py b/numba_dpex/core/runtime/context.py index 8e70baef67..3a23a819c0 100644 --- a/numba_dpex/core/runtime/context.py +++ b/numba_dpex/core/runtime/context.py @@ -129,7 +129,7 @@ def arraystruct_from_python(self, pyapi, obj, ptr): return self.error def usm_ndarray_to_python_acqref(self, pyapi, aryty, ary, dtypeptr): - """_summary_ + """Boxes a DpnpNdArray native object into a Python dpnp.ndarray. 
        Args:
            pyapi (_type_): _description_
@@ -140,14 +140,14 @@ def usm_ndarray_to_python_acqref(self, pyapi, aryty, ary, dtypeptr):
        Returns:
            _type_: _description_
        """
-        args = [
+        argtys = [
             pyapi.voidptr,
             pyapi.pyobj,
             llvmir.IntType(32),
             llvmir.IntType(32),
             pyapi.pyobj,
         ]
-        fnty = llvmir.FunctionType(pyapi.pyobj, args)
+        fnty = llvmir.FunctionType(pyapi.pyobj, argtys)
         fn = pyapi._get_function(
             fnty, "DPEXRT_sycl_usm_ndarray_to_python_acqref"
         )
@@ -165,3 +165,86 @@ def usm_ndarray_to_python_acqref(self, pyapi, aryty, ary, dtypeptr):
         args = [ptr, serial_aryty_pytype, ndim, writable, dtypeptr]

         return pyapi.builder.call(fn, args)
+
+    def get_queue_from_filter_string(self, builder, device):
+        """Calls DPEXRTQueue_CreateFromFilterString to create a new sycl::queue
+        from a given filter string.
+
+        Args:
+            device (llvmlite.ir.values.FormattedConstant): An LLVM ArrayType
+                storing a const string for a DPC++ filter selector string.
+
+        Returns: A DPCTLSyclQueueRef pointer.
+        """
+        mod = builder.module
+        fnty = llvmir.FunctionType(
+            cgutils.voidptr_t,
+            [cgutils.voidptr_t],
+        )
+        fn = cgutils.get_or_insert_function(
+            mod, fnty, "DPEXRTQueue_CreateFromFilterString"
+        )
+        fn.return_value.add_attribute("noalias")
+
+        ret = builder.call(fn, [device])
+
+        return ret
+
+    def submit_range(
+        self,
+        builder,
+        kref,
+        qref,
+        args,
+        argtys,
+        nargs,
+        range,
+        nrange,
+        depevents,
+        ndepevents,
+    ):
+        """Calls DpexrtQueue_SubmitRange to submit a kernel to a SYCL queue
+        for synchronous execution over an N-dimensional range.
+
+        Args:
+            kref: An LLVM value storing a pointer to the SYCL kernel
+                (DPCTLSyclKernelRef) to be submitted.
+            qref: An LLVM value storing a pointer to the SYCL queue
+                (DPCTLSyclQueueRef) on which the kernel is submitted.
+            args: An LLVM array of void* pointers to the kernel arguments.
+            argtys: An LLVM array of DPCTLKernelArgType enum values.
+            nargs: The number of kernel arguments.
+            range: An LLVM array storing the extents of the global range.
+            nrange: The number of dimensions of the global range.
+            depevents: An LLVM array of dependent events.
+            ndepevents: The number of dependent events.
+        """
+        mod = builder.module
+        fnty = llvmir.FunctionType(
+            llvmir.types.VoidType(),
+            [
+                cgutils.voidptr_t,
+                cgutils.voidptr_t,
+                cgutils.voidptr_t.as_pointer(),
+                cgutils.int32_t.as_pointer(),
+                llvmir.IntType(64),
+                llvmir.IntType(64).as_pointer(),
+                llvmir.IntType(64),
+                cgutils.voidptr_t,
+                llvmir.IntType(64),
+            ],
+        )
+        fn = cgutils.get_or_insert_function(
+            mod, fnty, "DpexrtQueue_SubmitRange"
+        )
+        # fn.return_value.add_attribute("noalias")
+
+        ret = builder.call(
+            fn,
+            [
+                kref,
+                qref,
+                args,
+                argtys,
+                nargs,
+                range,
+                nrange,
+                depevents,
+                ndepevents,
+            ],
+        )
+
+        return ret

From 129bd51889133c56f24daa2adfc8473be6a31e9d Mon Sep 17 00:00:00 2001
From: Diptorup Deb
Date: Sat, 4 Mar 2023 12:34:07 -0600
Subject: [PATCH 02/21] Adds a new UnsupportedParforError exception.

---
 numba_dpex/core/exceptions.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/numba_dpex/core/exceptions.py b/numba_dpex/core/exceptions.py
index 1d75dd6f40..55b3f7e2de 100644
--- a/numba_dpex/core/exceptions.py
+++ b/numba_dpex/core/exceptions.py
@@ -221,7 +221,7 @@ def __init__(
                 f"Arguments {ndarray_args} are non-usm arrays, "
                 f"and arguments {usmarray_args} are usm arrays."
) - elif usmarray_argnum_list: + elif usmarray_argnum_list is not None: usmarray_args = ",".join([str(i) for i in usmarray_argnum_list]) self.message = ( f'Execution queue for kernel "{kernel_name}" could ' @@ -433,3 +433,13 @@ def __init__(self, kernel_name, argtypes) -> None: ) super().__init__(self.message) + + +class UnsupportedParforError(Exception): + """Exception raised when a parfor node could not be lowered by Numba-dpex""" + + def __init__(self, extra_msg=None) -> None: + self.message = "Expression cannot be offloaded" + if extra_msg: + self.message += " due to " + extra_msg + super().__init__(self.message) From d2c0632442e1288f90574e8ca332405e1ae0372d Mon Sep 17 00:00:00 2001 From: Diptorup Deb Date: Sat, 4 Mar 2023 10:19:26 -0600 Subject: [PATCH 03/21] Adds a new pass to lower parfors to dpex kernels. The code is based on the existing DpexLowerer in core/passes/lowerer, but refactored and cleaned up a bit. - Add the ParforLoweringPass - Utility modules kernel_builder and kernel_launcher --- .../core/passes/parfor_lowering_pass.py | 394 +++++++++++ numba_dpex/core/utils/kernel_builder.py | 616 ++++++++++++++++++ numba_dpex/core/utils/kernel_launcher.py | 361 ++++++++++ 3 files changed, 1371 insertions(+) create mode 100644 numba_dpex/core/passes/parfor_lowering_pass.py create mode 100644 numba_dpex/core/utils/kernel_builder.py create mode 100644 numba_dpex/core/utils/kernel_launcher.py diff --git a/numba_dpex/core/passes/parfor_lowering_pass.py b/numba_dpex/core/passes/parfor_lowering_pass.py new file mode 100644 index 0000000000..1afd0b16f8 --- /dev/null +++ b/numba_dpex/core/passes/parfor_lowering_pass.py @@ -0,0 +1,394 @@ +# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import copy + +from numba.core import funcdesc, ir, types +from numba.core.compiler_machinery import LoweringPass, register_pass +from numba.core.lowering import Lower +from numba.parfors.parfor_lowering import ( + _lower_parfor_parallel as _lower_parfor_parallel_std, +) + +from numba_dpex import config +from numba_dpex.core.utils.kernel_launcher import KernelLaunchIRBuilder + +from ..exceptions import UnsupportedParforError +from ..types.dpnp_ndarray_type import DpnpNdArray +from ..utils.kernel_builder import create_kernel_for_parfor +from .parfor import Parfor, find_potential_aliases_parfor, get_parfor_outputs + +# A global list of kernels to keep the objects alive indefinitely. +keep_alive_kernels = [] + + +def _getvar_or_none(lowerer, x): + try: + return lowerer.getvar(x) + except: + return None + + +def _load_range(lowerer, value): + if isinstance(value, ir.Var): + return lowerer.loadvar(value.name) + else: + return lowerer.context.get_constant(types.uintp, value) + + +class ParforLowerImpl: + """Provides a custom lowerer for parfor nodes that generates a SYCL kernel + for a parfor and submits it to a queue. + """ + + def _submit_parfor_kernel( + self, + lowerer, + kernel_fn, + loop_ranges, + ): + """ + Adds the call to the dpex kernel function from the main function. + """ + keep_alive_kernels.append(kernel_fn.kernel) + + # Helper class that generates the LLVM IR values inside the current LLVM + # module that are needed to submit the kernel to a queue. + ir_builder = KernelLaunchIRBuilder(lowerer, kernel_fn.kernel) + + # Create a local variable storing a pointer to a DPCTLSyclQueueRef + # pointer. + curr_queue = ir_builder.get_queue(exec_queue=kernel_fn.queue) + + num_flattened_args = 0 + + # Compute number of args to be passed to the kernel. 
Note that the + # actual number of kernel arguments is greater than the count of + # kernel_fn.kernel_args as arrays get flattened. + for arg_type in kernel_fn.kernel_arg_types: + if isinstance(arg_type, DpnpNdArray): + # FIXME: Remove magic constants + num_flattened_args += 5 + (2 * arg_type.ndim) + else: + num_flattened_args += 1 + + # Create LLVM values for the kernel args list and kernel arg types list + args_list = ir_builder.allocate_kernel_arg_array(num_flattened_args) + args_ty_list = ir_builder.allocate_kernel_arg_ty_array( + num_flattened_args + ) + + # Populate the args_list and the args_ty_list LLVM arrays + kernel_arg_num = 0 + for arg_num, arg in enumerate(kernel_fn.kernel_args): + argtype = kernel_fn.kernel_arg_types[arg_num] + llvm_val = _getvar_or_none(lowerer, arg) + if not llvm_val: + raise AssertionError + if isinstance(argtype, DpnpNdArray): + ir_builder.build_array_arg( + array_val=llvm_val, + array_rank=argtype.ndim, + arg_list=args_list, + args_ty_list=args_ty_list, + arg_num=kernel_arg_num, + ) + # FIXME: Get rid of magic constants + kernel_arg_num += 5 + (2 * argtype.ndim) + else: + ir_builder.build_arg( + llvm_val, argtype, args_list, args_ty_list, kernel_arg_num + ) + kernel_arg_num += 1 + + # Create a global range over which to submit the kernel based on the + # loop_ranges of the parfor + global_range = [] + # SYCL ranges can have at max 3 dimension. If the parfor is of a higher + # dimension then the indexing for the higher dimensions is done inside + # the kernel. + global_range_rank = len(loop_ranges) if len(loop_ranges) < 3 else 3 + for i in range(global_range_rank): + start, stop, step = loop_ranges[i] + stop = _load_range(lowerer, stop) + if step != 1: + raise UnsupportedParforError( + "non-unit strides are not yet supported." + ) + global_range.append(stop) + + # Submit a synchronous kernel + ir_builder.submit_sync_ranged_kernel( + global_range, curr_queue, kernel_arg_num, args_list, args_ty_list + ) + + # At this point we can free the DPCTLSyclQueueRef (curr_queue) + ir_builder.free_queue(sycl_queue_val=curr_queue) + + def _lower_parfor_as_kernel(self, lowerer, parfor): + """Lowers a parfor node created by the dpjit compiler to a kernel. + + The general approach is as follows: + + - The code from the parfor's init block is lowered normally + in the context of the current function. + - The body of the parfor is transformed into a kernel function. + - Dpctl runtime calls to submit the kernel are added. + + """ + # We copy the typemap here because for race condition variable we'll + # update their type to array so they can be updated by the kernel. + orig_typemap = lowerer.fndesc.typemap + + # replace original typemap with copy and restore the original at the + # end. + lowerer.fndesc.typemap = copy.copy(orig_typemap) + + if config.DEBUG_ARRAY_OPT: + print("lowerer.fndesc", lowerer.fndesc, type(lowerer.fndesc)) + + typemap = lowerer.fndesc.typemap + varmap = lowerer.varmap + + loc = parfor.init_block.loc + scope = parfor.init_block.scope + + # Lower the init block of the parfor. 
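+        # The init block holds the one-time setup statements of the parfor,
+        # e.g., the computation of loop bounds and the allocation of the
+        # output arrays, and runs on the host before the kernel is launched.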
+        for instr in parfor.init_block.body:
+            lowerer.lower_inst(instr)
+
+        for racevar in parfor.races:
+            if racevar not in varmap:
+                rvtyp = typemap[racevar]
+                rv = ir.Var(scope, racevar, loc)
+                lowerer._alloca_var(rv.name, rvtyp)
+
+        alias_map = {}
+        arg_aliases = {}
+
+        find_potential_aliases_parfor(
+            parfor,
+            parfor.params,
+            typemap,
+            lowerer.func_ir,
+            alias_map,
+            arg_aliases,
+        )
+
+        # run get_parfor_outputs() and get_parfor_reductions() before
+        # kernel creation since Jumps are modified so CFG of loop_body
+        # dict will become invalid
+        if parfor.params is None:
+            raise AssertionError
+
+        parfor_output_arrays = get_parfor_outputs(parfor, parfor.params)
+
+        # compile parfor body as a separate dpex kernel function
+        flags = copy.copy(parfor.flags)
+        flags.error_model = "numpy"
+
+        # Can't get here unless
+        # flags.set('auto_parallel', ParallelOptions(True))
+        index_var_typ = typemap[parfor.loop_nests[0].index_variable.name]
+
+        # index variables should have the same type, check rest of indices
+        for loop_nest in parfor.loop_nests[1:]:
+            if typemap[loop_nest.index_variable.name] != index_var_typ:
+                raise AssertionError
+
+        loop_ranges = [
+            (loop_nest.start, loop_nest.stop, loop_nest.step)
+            for loop_nest in parfor.loop_nests
+        ]
+
+        try:
+            parfor_kernel = create_kernel_for_parfor(
+                lowerer,
+                parfor,
+                typemap,
+                flags,
+                loop_ranges,
+                bool(alias_map),
+                parfor.races,
+                parfor_output_arrays,
+            )
+        except Exception:
+            # FIXME: Make the exception more informative
+            raise UnsupportedParforError
+
+        # Finally submit the kernel
+        self._submit_parfor_kernel(lowerer, parfor_kernel, loop_ranges)
+
+        # TODO: free the kernel at this point
+
+        # Restore the original typemap of the function that was replaced
+        # temporarily at the beginning of this function.
+        lowerer.fndesc.typemap = orig_typemap
+
+
+class ParforLowerFactory:
+    """A pseudo-factory class that maps a device filter string to a lowering
+    function.
+
+    Each parfor can have a "lowerer" attribute that determines how the parfor
+    node is to be lowered to LLVM IR. The factory class maintains a static map
+    from every device type (filter string) encountered so far to a lowerer
+    function for that device type. At this point numba-dpex does not generate
+    device-specific code and the lowerer is always the same for all devices.
+    By generating different instances we make sure parfors that will execute
+    on distinct devices, as determined by the compute-follows-data
+    programming model, are never fused together.
+    """
+
+    device_to_lowerer_map = {}
+
+    @classmethod
+    def get_lowerer(cls, device):
+        try:
+            lowerer = ParforLowerFactory.device_to_lowerer_map[device]
+        except KeyError:
+            lowerer = ParforLowerImpl()._lower_parfor_as_kernel
+            ParforLowerFactory.device_to_lowerer_map[device] = lowerer
+
+        return lowerer
+
+
+class WrapperDefaultLower(Lower):
+    @property
+    def _disable_sroa_like_opt(self):
+        """We always return True."""
+        return True
+
+
+def lower_parfor_dpex(lowerer, parfor):
+    parfor.lowerer = ParforLowerImpl()._lower_parfor_as_kernel
+    if parfor.lowerer is None:
+        _lower_parfor_parallel_std(lowerer, parfor)
+    else:
+        parfor.lowerer(lowerer, parfor)
+
+
+class _ParforLower(Lower):
+    """Extends standard lowering to accommodate parfor.Parfor nodes that may
+    have the `lowerer` attribute set.
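+
+    Parfor nodes whose `lowerer` attribute is set are lowered using that
+    lowerer (see lower_parfor_dpex); all other parfors fall back to Numba's
+    default parallel lowering.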
+ """ + + def __init__(self, context, library, fndesc, func_ir, metadata=None): + Lower.__init__(self, context, library, fndesc, func_ir, metadata) + self.dpex_lower = self._lower( + context, library, fndesc, func_ir, metadata + ) + + def _lower(self, context, library, fndesc, func_ir, metadata): + """Create Lower with changed linkageName in debug info""" + lower = WrapperDefaultLower(context, library, fndesc, func_ir, metadata) + + # Debuginfo + if context.enable_debuginfo: + from numba.core.funcdesc import default_mangler, qualifying_prefix + + from numba_dpex.debuginfo import DpexDIBuilder + + qualprefix = qualifying_prefix(fndesc.modname, fndesc.qualname) + mangled_qualname = default_mangler(qualprefix, fndesc.argtypes) + + lower.debuginfo = DpexDIBuilder( + module=lower.module, + filepath=func_ir.loc.filename, + linkage_name=mangled_qualname, + cgctx=context, + ) + + return lower + + def lower(self): + context = self.dpex_lower.context + + # Only Numba's CPUContext has the `lower_extension` attribute + context.lower_extensions[Parfor] = lower_parfor_dpex + self.dpex_lower.lower() + self.base_lower = self.dpex_lower + + self.env = self.base_lower.env + self.call_helper = self.base_lower.call_helper + + def create_cpython_wrapper(self, release_gil=False): + return self.base_lower.create_cpython_wrapper(release_gil) + + +@register_pass(mutates_CFG=True, analysis_only=False) +class ParforLoweringPass(LoweringPass): + """A custom lowering pass that does dpex-specific lowering of parfor + nodes. + + FIXME: Redesign once numba-dpex supports Numba 0.57 + """ + + _name = "dpjit_lowering" + + def __init__(self): + LoweringPass.__init__(self) + + def run_pass(self, state): + if state.library is None: + codegen = state.targetctx.codegen() + state.library = codegen.create_library(state.func_id.func_qualname) + # Enable object caching upfront, so that the library can + # be later serialized. + state.library.enable_object_caching() + + targetctx = state.targetctx + + library = state.library + interp = state.func_ir + typemap = state.typemap + restype = state.return_type + calltypes = state.calltypes + flags = state.flags + metadata = state.metadata + + kwargs = {} + + # for support numba 0.54 and <=0.55.0dev0=*_469 + if hasattr(flags, "get_mangle_string"): + kwargs["abi_tags"] = flags.get_mangle_string() + # Lowering + fndesc = funcdesc.PythonFunctionDescriptor.from_specialized_function( + interp, + typemap, + restype, + calltypes, + mangler=targetctx.mangler, + inline=flags.forceinline, + noalias=flags.noalias, + **kwargs, + ) + + with targetctx.push_code_library(library): + lower = _ParforLower( + targetctx, library, fndesc, interp, metadata=metadata + ) + lower.lower() + if not flags.no_cpython_wrapper: + lower.create_cpython_wrapper(flags.release_gil) + + env = lower.env + call_helper = lower.call_helper + del lower + + from numba.core.compiler import _LowerResult # TODO: move this + + if flags.no_compile: + state["cr"] = _LowerResult(fndesc, call_helper, cfunc=None, env=env) + else: + # Prepare for execution + cfunc = targetctx.get_executable(library, fndesc, env) + # Insert native function for use by other jitted-functions. + # We also register its library to allow for inlining. 
+ targetctx.insert_user_function(cfunc, fndesc, [library]) + state["cr"] = _LowerResult( + fndesc, call_helper, cfunc=cfunc, env=env + ) + + return True diff --git a/numba_dpex/core/utils/kernel_builder.py b/numba_dpex/core/utils/kernel_builder.py new file mode 100644 index 0000000000..84326539ae --- /dev/null +++ b/numba_dpex/core/utils/kernel_builder.py @@ -0,0 +1,616 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import copy +import sys +import warnings + +import dpctl.program as dpctl_prog +import dpnp +import numba +from numba.core import compiler, ir, types +from numba.core.errors import NumbaParallelSafetyWarning +from numba.core.ir_utils import ( + add_offset_to_labels, + get_name_var_table, + get_unused_var_name, + legalize_names, + mk_unique_var, + remove_dead, + remove_dels, + rename_labels, + replace_var_names, +) +from numba.core.typing import signature + +import numba_dpex as dpex +from numba_dpex import config + +from ..descriptor import dpex_kernel_target +from ..passes import parfor +from ..types.dpnp_ndarray_type import DpnpNdArray + + +class GufuncKernel: + def __init__( + self, + name, + kernel, + signature, + kernel_args, + kernel_arg_types, + queue, + ): + self.name = name + self.kernel = kernel + self.signature = signature + self.kernel_args = kernel_args + self.kernel_arg_types = kernel_arg_types + self.queue = queue + + +def _print_block(block): + for i, inst in enumerate(block.body): + print(" ", i, inst) + + +def _print_body(body_dict): + """Pretty-print a set of IR blocks.""" + for label, block in body_dict.items(): + print("label: ", label) + _print_block(block) + + +def _compile_kernel_parfor( + sycl_queue, kernel_name, func_ir, argtypes, debug=False +): + # Create a SPIRVKernel object + kernel = dpex.core.kernel_interface.spirv_kernel.SpirvKernel( + func_ir, kernel_name + ) + + # compile the kernel + kernel.compile( + args=argtypes, + typing_ctx=dpex_kernel_target.typing_context, + target_ctx=dpex_kernel_target.target_context, + debug=debug, + compile_flags=None, + ) + + dpctl_create_program_from_spirv_flags = [] + if debug or config.OPT == 0: + # if debug is ON we need to pass additional flags to igc. + dpctl_create_program_from_spirv_flags = ["-g", "-cl-opt-disable"] + + # create a program + kernel_bundle = dpctl_prog.create_program_from_spirv( + sycl_queue, + kernel.device_driver_ir_module, + " ".join(dpctl_create_program_from_spirv_flags), + ) + # create a kernel + sycl_kernel = kernel_bundle.get_sycl_kernel(kernel.module_name) + + return sycl_kernel + + +def _legalize_names_with_typemap(names, typemap): + """Replace illegal characters in Numba IR var names. + + We use ir_utils.legalize_names to replace internal IR variable names + containing illegal characters (e.g. period) with a legal character + (underscore) so as to create legal variable names. The original variable + names are in the typemap so we also need to add the legalized name to the + typemap as well. + """ + outdict = legalize_names(names) + # For each pair in the dict of legalized names... + for x, y in outdict.items(): + # If the name had some legalization change to it... + if x != y: + # Set the type of the new name the same as the type of the old name. 
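+            # For example, typemap["a_b"] is set to typemap["a.b"].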
+ typemap[y] = typemap[x] + return outdict + + +def _to_scalar_from_0d(x): + if isinstance(x, types.ArrayCompatible) and x.ndim == 0: + return x.dtype + else: + return x + + +def _replace_var_with_array_in_block(vars, block, typemap, calltypes): + new_block = [] + for inst in block.body: + if isinstance(inst, ir.Assign) and inst.target.name in vars: + const_node = ir.Const(0, inst.loc) + const_var = ir.Var( + inst.target.scope, mk_unique_var("$const_ind_0"), inst.loc + ) + typemap[const_var.name] = types.uintp + const_assign = ir.Assign(const_node, const_var, inst.loc) + new_block.append(const_assign) + + setitem_node = ir.SetItem( + inst.target, const_var, inst.value, inst.loc + ) + calltypes[setitem_node] = signature( + types.none, + types.npytypes.Array(typemap[inst.target.name], 1, "C"), + types.intp, + typemap[inst.target.name], + ) + new_block.append(setitem_node) + continue + elif isinstance(inst, parfor.Parfor): + _replace_var_with_array_internal( + vars, {0: inst.init_block}, typemap, calltypes + ) + _replace_var_with_array_internal( + vars, inst.loop_body, typemap, calltypes + ) + + new_block.append(inst) + return new_block + + +def _replace_var_with_array_internal(vars, loop_body, typemap, calltypes): + for label, block in loop_body.items(): + block.body = _replace_var_with_array_in_block( + vars, block, typemap, calltypes + ) + + +def _replace_var_with_array(vars, loop_body, typemap, calltypes): + _replace_var_with_array_internal(vars, loop_body, typemap, calltypes) + for v in vars: + el_typ = typemap[v] + typemap.pop(v, None) + typemap[v] = types.npytypes.Array(el_typ, 1, "C") + + +def _dbgprint_after_each_array_assignments(lowerer, loop_body, typemap): + for label, block in loop_body.items(): + new_block = block.copy() + new_block.clear() + loc = block.loc + scope = block.scope + for inst in block.body: + new_block.append(inst) + # Append print after assignment + if isinstance(inst, ir.Assign): + # Only apply to numbers + if typemap[inst.target.name] not in types.number_domain: + continue + + # Make constant string + strval = "{} =".format(inst.target.name) + strconsttyp = types.StringLiteral(strval) + + lhs = ir.Var(scope, mk_unique_var("str_const"), loc) + assign_lhs = ir.Assign( + value=ir.Const(value=strval, loc=loc), target=lhs, loc=loc + ) + typemap[lhs.name] = strconsttyp + new_block.append(assign_lhs) + + # Make print node + print_node = ir.Print( + args=[lhs, inst.target], vararg=None, loc=loc + ) + new_block.append(print_node) + sig = signature( + types.none, typemap[lhs.name], typemap[inst.target.name] + ) + lowerer.fndesc.calltypes[print_node] = sig + loop_body[label] = new_block + + +def _generate_kernel_stub_as_string( + kernel_name, + parfor_params, + parfor_dim, + legal_loop_indices, + loop_ranges, + param_dict, + has_reduction, + redvars, + typemap, + redvars_dict, + sentinel_name, +): + """Generates a stub dpex kernel for the parfor. + + Returns: + str: A string representing a stub kernel function for the parfor. + """ + kernel_txt = "" + + # Create the dpex kernel function. 
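+    # Schematically, for a one-dimensional parfor with parameters (a, b, c)
+    # the generated stub looks like (index names shown for illustration):
+    #
+    #   def __numba_parfor_kernel_<id>(a, b, c):
+    #       i = dpex.get_global_id(0)
+    #       __sentinel__ = 0
+    #       return None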
+ kernel_txt += "def " + kernel_name + kernel_txt += "(" + (", ".join(parfor_params)) + "):\n" + global_id_dim = 0 + for_loop_dim = parfor_dim + + if parfor_dim > 3: + raise NotImplementedError + global_id_dim = 3 + else: + global_id_dim = parfor_dim + + for eachdim in range(global_id_dim): + kernel_txt += ( + " " + + legal_loop_indices[eachdim] + + " = " + + "dpex.get_global_id(" + + str(eachdim) + + ")\n" + ) + + for eachdim in range(global_id_dim, for_loop_dim): + for indent in range(1 + (eachdim - global_id_dim)): + kernel_txt += " " + + start, stop, step = loop_ranges[eachdim] + start = param_dict.get(str(start), start) + stop = param_dict.get(str(stop), stop) + kernel_txt += ( + "for " + + legal_loop_indices[eachdim] + + " in range(" + + str(start) + + ", " + + str(stop) + + " + 1):\n" + ) + + for eachdim in range(global_id_dim, for_loop_dim): + for indent in range(1 + (eachdim - global_id_dim)): + kernel_txt += " " + + # Add the sentinel assignment so that we can find the loop body position + # in the IR. + kernel_txt += " " + kernel_txt += sentinel_name + " = 0\n" + + # A kernel function does not return anything + kernel_txt += " return None\n" + + return kernel_txt + + +def _wrap_loop_body(loop_body): + blocks = loop_body.copy() # shallow copy is enough + first_label = min(blocks.keys()) + last_label = max(blocks.keys()) + loc = blocks[last_label].loc + blocks[last_label].body.append(ir.Jump(first_label, loc)) + return blocks + + +def _unwrap_loop_body(loop_body): + last_label = max(loop_body.keys()) + loop_body[last_label].body = loop_body[last_label].body[:-1] + + +def _find_setitems_block(setitems, block, typemap): + for inst in block.body: + if isinstance(inst, ir.StaticSetItem) or isinstance(inst, ir.SetItem): + setitems.add(inst.target.name) + elif isinstance(inst, parfor.Parfor): + _find_setitems_block(setitems, inst.init_block, typemap) + _find_setitems_body(setitems, inst.loop_body, typemap) + + +def _find_setitems_body(setitems, loop_body, typemap): + """ + Find the arrays that are written into (goes into setitems) + """ + for label, block in loop_body.items(): + _find_setitems_block(setitems, block, typemap) + + +def _replace_sentinel_with_parfor_body(kernel_ir, sentinel_name, loop_body): + # new label for splitting sentinel block + new_label = max(loop_body.keys()) + 1 + + # Search all the block in the kernel function for the sentinel assignment. + for label, block in kernel_ir.blocks.items(): + for i, inst in enumerate(block.body): + if ( + isinstance(inst, ir.Assign) + and inst.target.name == sentinel_name + ): + # We found the sentinel assignment. + loc = inst.loc + scope = block.scope + # split block across __sentinel__ + # A new block is allocated for the statements prior to the + # sentinel but the new block maintains the current block label. + prev_block = ir.Block(scope, loc) + prev_block.body = block.body[:i] + + # The current block is used for statements after the sentinel. + block.body = block.body[i + 1 :] # noqa: E203 + # But the current block gets a new label. + body_first_label = min(loop_body.keys()) + + # The previous block jumps to the minimum labelled block of the + # parfor body. + prev_block.append(ir.Jump(body_first_label, loc)) + # Add all the parfor loop body blocks to the kernel function's + # IR. 
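+                # The loop body labels were offset earlier via
+                # add_offset_to_labels (see create_kernel_for_parfor), so
+                # they cannot clash with labels already in kernel_ir.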
+ for loop, b in loop_body.items(): + kernel_ir.blocks[loop] = b + body_last_label = max(loop_body.keys()) + kernel_ir.blocks[new_label] = block + kernel_ir.blocks[label] = prev_block + # Add a jump from the last parfor body block to the block + # containing statements after the sentinel. + kernel_ir.blocks[body_last_label].append( + ir.Jump(new_label, loc) + ) + break + else: + continue + break + + +def create_kernel_for_parfor( + lowerer, + parfor_node, + typemap, + flags, + loop_ranges, + has_aliases, + races, + parfor_outputs, +): + """ + Creates a numba_dpex.kernel function for a parfor node. + + There are two parts to this function: + + 1) Code to iterate across the iteration space as defined by + the schedule. + 2) The parfor body that does the work for a single point in + the iteration space. + + Part 1 is created as Python text for simplicity with a sentinel + assignment to mark the point in the IR where the parfor body + should be added. This Python text is 'exec'ed into existence and its + IR retrieved with run_frontend. The IR is scanned for the sentinel + assignment where that basic block is split and the IR for the parfor + body inserted. + """ + loc = parfor_node.init_block.loc + + # The parfor body and the main function body share ir.Var nodes. + # We have to do some replacements of Var names in the parfor body + # to make them legal parameter names. If we don't copy then the + # Vars in the main function also would incorrectly change their name. + loop_body = copy.copy(parfor_node.loop_body) + remove_dels(loop_body) + + parfor_dim = len(parfor_node.loop_nests) + loop_indices = [ + loop_nest.index_variable.name for loop_nest in parfor_node.loop_nests + ] + + # Get all the parfor params. + parfor_params = parfor_node.params + + # Get all parfor reduction vars, and operators. + typemap = lowerer.fndesc.typemap + + parfor_redvars, parfor_reddict = parfor.get_parfor_reductions( + lowerer.func_ir, parfor_node, parfor_params, lowerer.fndesc.calltypes + ) + has_reduction = False if len(parfor_redvars) == 0 else True + + if has_reduction: + raise NotImplementedError + + # Compute just the parfor inputs as a set difference. + parfor_inputs = sorted(list(set(parfor_params) - set(parfor_outputs))) + + for race in races: + msg = ( + "Variable %s used in parallel loop may be written " + "to simultaneously by multiple workers and may result " + "in non-deterministic or unintended results." % race + ) + warnings.warn(NumbaParallelSafetyWarning(msg, loc)) + + _replace_var_with_array(races, loop_body, typemap, lowerer.fndesc.calltypes) + + # Reorder all the params so that inputs go first then outputs. + parfor_params = parfor_inputs + parfor_outputs + + # Some Var and loop_indices may not have legal parameter names so create a + # dict of potentially illegal param name to guaranteed legal name. + param_dict = _legalize_names_with_typemap(parfor_params, typemap) + ind_dict = _legalize_names_with_typemap(loop_indices, typemap) + redvars_dict = legalize_names(parfor_redvars) + + # Compute a new list of legal loop index names. + legal_loop_indices = [ind_dict[v] for v in loop_indices] + # Get the types of each parameter. + param_types = [_to_scalar_from_0d(typemap[v]) for v in parfor_params] + # Calculate types of args passed to the kernel function. + func_arg_types = [typemap[v] for v in (parfor_inputs + parfor_outputs)] + + # Replace illegal parameter names in the loop body with legal ones. 
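+    # For example, an IR name such as "a.b" is rewritten to a legal
+    # identifier like "a_b" using the mapping in param_dict.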
+ replace_var_names(loop_body, param_dict) + + # remember the name before legalizing as the actual arguments + parfor_args = parfor_params + # Change parfor_params to be legal names. + parfor_params = [param_dict[v] for v in parfor_params] + parfor_params_orig = parfor_params + + parfor_params = [] + ascontig = False + for pindex in range(len(parfor_params_orig)): + if ( + ascontig + and pindex < len(parfor_inputs) + and isinstance(param_types[pindex], types.npytypes.Array) + ): + parfor_params.append(parfor_params_orig[pindex] + "param") + else: + parfor_params.append(parfor_params_orig[pindex]) + + # Change parfor body to replace illegal loop index vars with legal ones. + replace_var_names(loop_body, ind_dict) + + loop_body_var_table = get_name_var_table(loop_body) + sentinel_name = get_unused_var_name("__sentinel__", loop_body_var_table) + + if config.DEBUG_ARRAY_OPT >= 1: + print("legal parfor_params = ", parfor_params, type(parfor_params)) + + # Determine the unique names of the kernel functions. + kernel_name = "__numba_parfor_kernel_%s" % (parfor_node.id) + + kernel_fn_txt = _generate_kernel_stub_as_string( + kernel_name, + parfor_params, + parfor_dim, + legal_loop_indices, + loop_ranges, + param_dict, + has_reduction, + parfor_redvars, + typemap, + redvars_dict, + sentinel_name, + ) + + if config.DEBUG_ARRAY_OPT: + print("kernel_fn_txt = ", type(kernel_fn_txt), "\n", kernel_fn_txt) + sys.stdout.flush() + + # Exec the kernel_fn_txt string into existence. + globls = {"dpnp": dpnp, "numba": numba, "dpex": dpex} + locls = {} + exec(kernel_fn_txt, globls, locls) + + kernel_fn = locls[kernel_name] + + if config.DEBUG_ARRAY_OPT: + print("kernel_fn = ", type(kernel_fn), "\n", kernel_fn) + # Get the IR for the kernel_fn dpex kernel + kernel_ir = compiler.run_frontend(kernel_fn) + + if config.DEBUG_ARRAY_OPT: + print("kernel_ir dump ", type(kernel_ir)) + kernel_ir.dump() + print("loop_body dump ", type(loop_body)) + _print_body(loop_body) + + # rename all variables in kernel_ir afresh + var_table = get_name_var_table(kernel_ir.blocks) + new_var_dict = {} + reserved_names = ( + [sentinel_name] + list(param_dict.values()) + legal_loop_indices + ) + for name, var in var_table.items(): + if not (name in reserved_names): + new_var_dict[name] = mk_unique_var(name) + replace_var_names(kernel_ir.blocks, new_var_dict) + if config.DEBUG_ARRAY_OPT: + print("kernel_ir dump after renaming ") + kernel_ir.dump() + + gufunc_param_types = param_types + + if config.DEBUG_ARRAY_OPT: + print( + "gufunc_param_types = ", + type(gufunc_param_types), + "\n", + gufunc_param_types, + ) + + gufunc_stub_last_label = max(kernel_ir.blocks.keys()) + 1 + + # Add gufunc stub last label to each parfor.loop_body label to prevent + # label conflicts. + loop_body = add_offset_to_labels(loop_body, gufunc_stub_last_label) + + # If enabled, add a print statement after every assignment. 
+    if config.DEBUG_ARRAY_OPT_RUNTIME:
+        _dbgprint_after_each_array_assignments(lowerer, loop_body, typemap)
+
+    _replace_sentinel_with_parfor_body(kernel_ir, sentinel_name, loop_body)
+
+    if config.DEBUG_ARRAY_OPT:
+        print("kernel_ir last dump before renaming")
+        kernel_ir.dump()
+
+    kernel_ir.blocks = rename_labels(kernel_ir.blocks)
+    remove_dels(kernel_ir.blocks)
+
+    old_alias = flags.noalias
+    if not has_aliases:
+        if config.DEBUG_ARRAY_OPT:
+            print("No aliases found so adding noalias flag.")
+        flags.noalias = True
+
+    remove_dead(kernel_ir.blocks, kernel_ir.arg_names, kernel_ir, typemap)
+
+    if config.DEBUG_ARRAY_OPT:
+        print("kernel_ir after remove dead")
+        kernel_ir.dump()
+
+    kernel_sig = signature(types.none, *gufunc_param_types)
+
+    if config.DEBUG_ARRAY_OPT:
+        sys.stdout.flush()
+
+    if config.DEBUG_ARRAY_OPT:
+        print("final kernel_ir dump".center(80, "-"))
+        kernel_ir.dump()
+
+    # The ParforLegalizeCFD pass has already ensured that the LHS and RHS
+    # arrays are on the same device. We can take the queue from the first
+    # input array and use that to compile the kernel.
+
+    exec_queue = None
+
+    for arg in parfor_args:
+        obj = typemap[arg]
+        if isinstance(obj, DpnpNdArray):
+            exec_queue = obj.queue
+
+    if not exec_queue:
+        raise AssertionError(
+            "No execution queue found for parfor. No way to compile the "
+            "kernel!"
+        )
+
+    sycl_kernel = _compile_kernel_parfor(
+        exec_queue,
+        kernel_name,
+        kernel_ir,
+        gufunc_param_types,
+        debug=flags.debuginfo,
+    )
+
+    flags.noalias = old_alias
+
+    if config.DEBUG_ARRAY_OPT:
+        print("kernel_sig = ", kernel_sig)
+
+    return GufuncKernel(
+        name=kernel_name,
+        kernel=sycl_kernel,
+        signature=kernel_sig,
+        kernel_args=parfor_args,
+        kernel_arg_types=func_arg_types,
+        queue=exec_queue,
+    )
diff --git a/numba_dpex/core/utils/kernel_launcher.py b/numba_dpex/core/utils/kernel_launcher.py
new file mode 100644
index 0000000000..d9652a744a
--- /dev/null
+++ b/numba_dpex/core/utils/kernel_launcher.py
@@ -0,0 +1,361 @@
+# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from numba.core import cgutils, types
+
+from numba_dpex import utils
+from numba_dpex.core.runtime.context import DpexRTContext
+from numba_dpex.dpctl_iface import DpctlCAPIFnBuilder
+from numba_dpex.dpctl_iface._helpers import numba_type_to_dpctl_typenum
+
+
+class KernelLaunchIRBuilder:
+    """
+    KernelLaunchIRBuilder(lowerer, kernel)
+
+    Helper class to build the LLVM IR for the submission of a kernel.
+
+    The class generates the LLVM IR inside the current LLVM module that is
+    needed for submitting a kernel to a SYCL queue.
+    """
+
+    def __init__(self, lowerer, kernel):
+        """Create a KernelLaunchIRBuilder for the specified kernel.
+
+        Args:
+            lowerer: The Numba Lowerer that will be used to generate the code.
+            kernel: The SYCL kernel for which we are generating the code.
+        """
+        self.lowerer = lowerer
+        self.context = self.lowerer.context
+        self.builder = self.lowerer.builder
+        self.kernel = kernel
+        self.kernel_addr = self.kernel.addressof_ref()
+        self.rtctx = DpexRTContext(self.context)
+
+    def _build_nullptr(self):
+        """Builds the LLVM IR to represent a null pointer.
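+
+        The null pointer is passed in place of the meminfo and parent fields
+        when a DpnpNdArray argument is unpacked into individual kernel
+        arguments (see build_array_arg).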
+ + Returns: An LLVM Value storing a null pointer + """ + zero = cgutils.alloca_once(self.builder, utils.LLVMTypes.int64_t) + self.builder.store(self.context.get_constant(types.int64, 0), zero) + return self.builder.bitcast( + zero, utils.get_llvm_type(context=self.context, type=types.voidptr) + ) + + def _build_array_attr_arg( + self, + array_val, + array_attr_pos, + array_attr_ty, + arg_list, + args_ty_list, + arg_num, + ): + array_attr = self.builder.gep( + array_val, + [ + self.context.get_constant(types.int32, 0), + self.context.get_constant(types.int32, array_attr_pos), + ], + ) + + # FIXME: If pointer arg then load it to some value and pass that value. + # We also most likely need an address space cast + if isinstance(array_attr_ty, types.misc.RawPointer): + array_attr = self.builder.load(array_attr) + + self.build_arg( + val=array_attr, + ty=array_attr_ty, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num, + ) + + def _build_flattened_array_args( + self, array_val, array_attr_pos, ndims, arg_list, args_ty_list, arg_num + ): + array_attr = self.builder.gep( + array_val, + [ + self.context.get_constant(types.int32, 0), + self.context.get_constant(types.int32, array_attr_pos), + ], + ) + + for ndim in range(ndims): + self._build_array_attr_arg( + array_val=array_attr, + array_attr_pos=ndim, + array_attr_ty=types.int64, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num + ndim, + ) + + def build_arg(self, val, ty, arg_list, args_ty_list, arg_num): + """Stores the kernel arguments and the kernel argument types into + arrays that will be passed to DPCTLQueue_SubmitRange. + + Args: + val: An LLVM IR Value that will be stored into the arguments array + ty: A Numba type that will be converted to a DPCTLKernelArgType + enum and stored into the argument types list array + arg_list: An LLVM IR Value array that stores the kernel arguments + args_ty_list: An LLVM IR Value array that stores the + DPCTLKernelArgType enum value for each kernel argument + arg_num: The index position at which the arg_list and args_ty_list + need to be updated. + """ + kernel_arg_dst = self.builder.gep( + arg_list, + [self.context.get_constant(types.int32, arg_num)], + ) + kernel_arg_ty_dst = self.builder.gep( + args_ty_list, + [self.context.get_constant(types.int32, arg_num)], + ) + val = self.builder.bitcast( + val, + utils.get_llvm_type(context=self.context, type=types.voidptr), + ) + self.builder.store(val, kernel_arg_dst) + self.builder.store( + numba_type_to_dpctl_typenum(self.context, ty), kernel_arg_ty_dst + ) + + def build_array_arg( + self, array_val, array_rank, arg_list, args_ty_list, arg_num + ): + """Creates a list of LLVM Values for an unpacked DpnpNdArray kernel + argument. 
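+
+        An array of rank ``r`` is unpacked into ``5 + 2*r`` kernel arguments:
+        a null pointer for the meminfo field, a null pointer for the parent
+        field, the number of items, the itemsize, the data pointer, ``r``
+        shape extents, and ``r`` stride values.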
+ + The steps performed here are the same as in + numba_dpex.core.kernel_interface.arg_pack_unpacker._unpack_array_helper + """ + # Argument 1: Null pointer for the NRT_MemInfo attribute of the array + nullptr = self._build_nullptr() + self.build_arg( + val=nullptr, + ty=types.int64, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num, + ) + arg_num += 1 + # Argument 2: Null pointer for the Parent attribute of the array + nullptr = self._build_nullptr() + self.build_arg( + val=nullptr, + ty=types.int64, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num, + ) + arg_num += 1 + # Argument 3: Array size + self._build_array_attr_arg( + array_val=array_val, + array_attr_pos=2, + array_attr_ty=types.int64, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num, + ) + arg_num += 1 + # Argument 4: itemsize + self._build_array_attr_arg( + array_val=array_val, + array_attr_pos=3, + array_attr_ty=types.int64, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num, + ) + arg_num += 1 + # Argument 5: data pointer + self._build_array_attr_arg( + array_val=array_val, + array_attr_pos=4, + array_attr_ty=types.voidptr, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num, + ) + arg_num += 1 + # Arguments for flattened shape + self._build_flattened_array_args( + array_val=array_val, + array_attr_pos=5, + ndims=array_rank, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num, + ) + arg_num += array_rank + # Arguments for flattened stride + self._build_flattened_array_args( + array_val=array_val, + array_attr_pos=6, + ndims=array_rank, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num, + ) + arg_num += array_rank + + def get_queue(self, exec_queue): + """Allocates memory on the stack to store a DPCTLSyclQueueRef. + + Returns: A LLVM Value storing the pointer to the SYCL queue created + using the filter string for the Python exec_queue (dpctl.SyclQueue). + """ + + # Allocate a stack var to store the queue created from the filter string + sycl_queue_val = cgutils.alloca_once( + self.builder, + utils.get_llvm_type(context=self.context, type=types.voidptr), + ) + # Insert a global constant to store the filter string + device = self.context.insert_const_string( + self.builder.module, exec_queue.sycl_device.filter_string + ) + # Store the queue returned by DPEXRTQueue_CreateFromFilterString in a + # local variable + self.builder.store( + self.rtctx.get_queue_from_filter_string( + builder=self.builder, device=device + ), + sycl_queue_val, + ) + return sycl_queue_val + + def free_queue(self, sycl_queue_val): + """ + Frees the ``DPCTLSyclQueueRef`` pointer that was used to launch the + kernels. + + Args: + sycl_queue_val: The SYCL queue pointer to be freed. + """ + fn = DpctlCAPIFnBuilder.get_dpctl_queue_delete( + builder=self.builder, context=self.context + ) + self.builder.call(fn, [self.builder.load(sycl_queue_val)]) + + def allocate_kernel_arg_array(self, num_kernel_args): + """Allocates an array to store the LLVM Value for every kernel argument. + + Args: + num_kernel_args (int): The number of kernel arguments that + determines the size of args array to allocate. + + Returns: An LLVM IR value pointing to an array to store the kernel + arguments. 
+        """
+        args_list = cgutils.alloca_once(
+            self.builder,
+            utils.get_llvm_type(context=self.context, type=types.voidptr),
+            size=self.context.get_constant(types.uintp, num_kernel_args),
+        )
+
+        return args_list
+
+    def allocate_kernel_arg_ty_array(self, num_kernel_args):
+        """Allocates an array to store the LLVM Value for the typenum of
+        every kernel argument.
+
+        Args:
+            num_kernel_args (int): The number of kernel arguments that
+                determines the size of the array to allocate.
+
+        Returns: An LLVM IR value pointing to an array to store the kernel
+        argument typenums as defined in dpctl.
+        """
+        args_ty_list = cgutils.alloca_once(
+            self.builder,
+            utils.LLVMTypes.int32_t,
+            size=self.context.get_constant(types.uintp, num_kernel_args),
+        )
+
+        return args_ty_list
+
+    def _create_sycl_range(self, idx_range):
+        """Allocates an array to store the extents of the kernel's global
+        range and returns a pointer to that array.
+
+        Args:
+            idx_range: A list of LLVM IR Values storing the extent of each
+                dimension of the kernel's index range.
+
+        Returns: An LLVM IR Value storing a pointer to the allocated range
+        array.
+        """
+        intp_t = utils.get_llvm_type(context=self.context, type=types.intp)
+        intp_ptr_t = utils.get_llvm_ptr_type(intp_t)
+        num_dim = len(idx_range)
+
+        # form the global range
+        global_range = cgutils.alloca_once(
+            self.builder,
+            utils.get_llvm_type(context=self.context, type=types.uintp),
+            size=self.context.get_constant(types.uintp, num_dim),
+        )
+
+        for i in range(num_dim):
+            rext = idx_range[i]
+            if rext.type != utils.LLVMTypes.int64_t:
+                rext = self.builder.sext(rext, utils.LLVMTypes.int64_t)
+
+            # We reverse the global range to account for how the SYCL and
+            # OpenCL range definitions differ.
+            self.builder.store(
+                rext,
+                self.builder.gep(
+                    global_range,
+                    [self.context.get_constant(types.uintp, (num_dim - 1) - i)],
+                ),
+            )
+
+        return self.builder.bitcast(global_range, intp_ptr_t)
+
+    def submit_sync_ranged_kernel(
+        self,
+        idx_range,
+        sycl_queue_val,
+        total_kernel_args,
+        arg_list,
+        arg_ty_list,
+    ):
+        """Submits the kernel to the specified queue, waits for completion,
+        and then copies back any results to the host.
+
+        Args:
+            idx_range: Tuple specifying the range over which the kernel is
+                to be submitted.
+            sycl_queue_val: The SYCL queue on which the kernel is
+                submitted.
+            total_kernel_args: The number of kernel arguments.
+            arg_list: An LLVM IR Value array that stores the kernel
+                arguments.
+            arg_ty_list: An LLVM IR Value array that stores the
+                DPCTLKernelArgType enum value for each kernel argument.
+        """
+        gr = self._create_sycl_range(idx_range)
+        args = [
+            self.builder.inttoptr(
+                self.context.get_constant(types.uintp, self.kernel_addr),
+                utils.get_llvm_type(context=self.context, type=types.voidptr),
+            ),
+            self.builder.load(sycl_queue_val),
+            arg_list,
+            arg_ty_list,
+            self.context.get_constant(types.uintp, total_kernel_args),
+            gr,
+            self.context.get_constant(types.uintp, len(idx_range)),
+            self.builder.bitcast(
+                utils.create_null_ptr(
+                    builder=self.builder, context=self.context
+                ),
+                utils.get_llvm_type(context=self.context, type=types.voidptr),
+            ),
+            self.context.get_constant(types.uintp, 0),
+        ]
+
+        self.rtctx.submit_range(self.builder, *args)

From f3b7ad6c884a79552c3b8e1825e90393a2ba06f9 Mon Sep 17 00:00:00 2001
From: Diptorup Deb
Date: Sat, 4 Mar 2023 11:04:57 -0600
Subject: [PATCH 04/21] Adds a new pass for compute follows data legalization.

- Dpnp expressions that rely on the __array_ufunc__ method of DpnpNdArray
  do not infer the usm_type and device type of the LHS of the expression.
  The pass is an incomplete implementation of an algorithm that traverses
  all the basic blocks, checks CFD compliance, and fixes the LHS of all
  parfors created from dpnp array expressions and pranges.
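
As an illustration of the rule the pass enforces (a sketch only; the
variable names below are hypothetical and not part of this patch):

    import dpnp

    a = dpnp.ones(10, usm_type="device", device="gpu")
    b = dpnp.ones(10, usm_type="shared", device="gpu")
    c = a + b  # legal: both inputs are on the same device; the LHS is
               # typed with usm_type="device" per the precedence rule
               # device > shared > host

    d = dpnp.ones(10, device="cpu")
    e = a + d  # mixed devices: raises ComputeFollowsDataInferenceError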
--- .../core/passes/parfor_legalize_cfd_pass.py | 299 ++++++++++++++++++ 1 file changed, 299 insertions(+) create mode 100644 numba_dpex/core/passes/parfor_legalize_cfd_pass.py diff --git a/numba_dpex/core/passes/parfor_legalize_cfd_pass.py b/numba_dpex/core/passes/parfor_legalize_cfd_pass.py new file mode 100644 index 0000000000..bf9b5c61b0 --- /dev/null +++ b/numba_dpex/core/passes/parfor_legalize_cfd_pass.py @@ -0,0 +1,299 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +from numba.core import ir, types +from numba.core.compiler_machinery import FunctionPass, register_pass +from numba.core.ir_utils import find_topo_order + +from numba_dpex.core.exceptions import ComputeFollowsDataInferenceError +from numba_dpex.core.passes.parfor_lowering_pass import ParforLowerFactory +from numba_dpex.core.types.dpnp_ndarray_type import DpnpNdArray + +from .parfor import ( + Parfor, + ParforPassStates, + get_parfor_outputs, + get_parfor_params, +) + + +class ParforLegalizeCFDPassImpl(ParforPassStates): + + """Legalizes the compute-follows-data based device attribute for parfor + nodes. + + DpnpNdArray array-expressions populate the type of the left-hand-side (LHS) + of each expression as a default DpnpNdArray instance derived from the + __array_ufunc__ method of DpnpNdArray class. The pass fixes the LHS type by + properly applying compute follows data programming model. The pass first + checks if the right-hand-side (RHS) DpnpNdArray arguments are on the same + device, else raising a ComputeFollowsDataInferenceError. Once the RHS has + been validated, the LHS type is updated. + + The pass also updated the usm_type of the LHS based on a USM type + propagation rule: device > shared > host. Thus, if the usm_type attribute of + the RHS arrays are "device" and "shared" respectively, the LHS array's + usm_type attribute will be "device". + + Once the pass has identified a parfor with DpnpNdArrays and legalized it, + the "lowerer" attribute of the parfor is set to + ``numba_dpex.core.passes.parfor_lowering_pass._lower_parfor_as_kernel`` so + that the parfor node is lowered using Dpex's lowerer. + + """ + + inputUsmTypeStrToInt = {"device": 3, "shared": 2, "host": 1} + inputUsmTypeIntToStr = {3: "device", 2: "shared", 1: "host"} + + def _check_if_dpnp_empty_call(self, call_stmt, block): + func_def = block.find_variable_assignment(call_stmt.name) + if not ( + isinstance(func_def, ir.Assign) + and isinstance(func_def.value, ir.Expr) + and func_def.value.op == "getattr" + ): + raise AssertionError + + module_name = block.find_variable_assignment( + func_def.value.list_vars()[0].name + ).value.value.__name__ + + if func_def.value.attr == "empty" and module_name == "dpnp": + return True + else: + return False + + def _check_cfd_parfor_params(self, parfor, checklist): + deviceTypes = set() + usmTypes = [] + + for para in checklist: + if not isinstance(self.typemap[para], DpnpNdArray): + continue + argty = self.typemap[para] + deviceTypes.add(argty.device) + try: + usmTypes.append( + ParforLegalizeCFDPassImpl.inputUsmTypeStrToInt[ + argty.usm_type + ] + ) + except KeyError: + raise ValueError( + "Unknown USM type encountered. Supported " + "usm types are: device, shared and host." 
+ ) + # Check compute follows data on the dpnp arrays in checklist + if len(deviceTypes) > 1: + raise ComputeFollowsDataInferenceError( + kernel_name=parfor.loc.short(), + usmarray_argnum_list=[], + ) + # Derive the usm_type based on usm allocator precedence rule: + # device > shared > host + conforming_usm_ty = max(usmTypes) + conforming_device_ty = deviceTypes.pop() + + # FIXME: Changed to namedtuple + return (conforming_usm_ty, conforming_device_ty) + + def _legalize_dpnp_empty_call(self, required_arrty, call_stmt, block): + args = call_stmt.args + sigargs = self.calltypes[call_stmt].args + sigargs_new = list(sigargs) + # Update the RHS usm_type, device, dtype attributes + for idx, arg in enumerate(args): + argdef = block.find_variable_assignment(arg.name) + if argdef: + attribute = argdef.target.name + if "usm_type" in attribute: + self.typemap.update( + {attribute: types.literal(required_arrty.usm_type)} + ) + sigargs_new[idx] = types.literal(required_arrty.usm_type) + elif "device" in attribute: + self.typemap.update( + {attribute: types.literal(required_arrty.device)} + ) + sigargs_new[idx] = types.literal(required_arrty.device) + sigargs = tuple(sigargs_new) + new_sig = self.typingctx.resolve_function_type( + self.typemap[call_stmt.func.name], sigargs, {} + ) + self.calltypes.update({call_stmt: new_sig}) + + def _legalize_array_attrs( + self, arrattr, legalized_device_ty, legalized_usm_ty + ): + modified = False + if self.typemap[arrattr].device != legalized_device_ty: + self.typemap[arrattr].device = legalized_device_ty + modified = True + + if self.typemap[arrattr].usm_type != legalized_usm_ty: + self.typemap[arrattr].usm_type = legalized_usm_ty + modified = True + + return modified + + def _legalize_parfor_params(self, parfor): + """Checks the parfor params for compute follows data compliance and + returns the conforming device for the parfor. + + Args: + parfor: Parfor node to be analyzed + + Returns: + str: The device filter string for the parfor if the parfor is + compute follows data conforming. + """ + if parfor.params is None: + return + outputParams = get_parfor_outputs(parfor, parfor.params) + + checklist = sorted(list(set(parfor.params) - set(outputParams))) + + # Check if any output param was defined outside the parfor + for para in outputParams: + if ( + isinstance(self.typemap[para], DpnpNdArray) + and para in self._seen_array_set + ): + checklist.append(para) + + # Check params in checklist for CFD compliance and derive the common + # usm allocator and device based on the checklist params + usm_ty, device_ty = self._check_cfd_parfor_params(parfor, checklist) + + # Update any outputs that are generated in the parfor + for para in outputParams: + if not isinstance(self.typemap[para], DpnpNdArray): + continue + # Legalize LHS. Skip if we already updated the type before and no + # further legalization is needed. + if self._legalize_array_attrs( + para, + device_ty, + ParforLegalizeCFDPassImpl.inputUsmTypeIntToStr[usm_ty], + ): + # Keep track of vars that have been updated + self._cfd_updated_values.add(para) + else: + try: + self._cfd_updated_values.remove(para) + except KeyError: + pass + + return device_ty + + def _legalize_cfd_parfor_blocks(self, parfor): + """Legalize the parfor params based on the compute follows data + programming model and usm allocator precedence rule. 
+ """ + conforming_device_ty = self._legalize_parfor_params(parfor) + + # Update the parfor's lowerer attribute + parfor.lowerer = ParforLowerFactory.get_lowerer(conforming_device_ty) + + init_block = parfor.init_block + blocks = parfor.loop_body + + for stmt in init_block.body: + self._legalize_stmt(stmt, init_block, inparfor=True) + + for block in blocks.values(): + for stmt in block.body: + self._legalize_stmt(stmt, block, inparfor=True) + + def _legalize_expr(self, stmt, lhs, lhsty, parent_block, inparfor=False): + rhs = stmt.value + if rhs.op != "call": + # The assumption is all other expr types are by now + # either parfors or are benign like setattr, getattr, + # getitem, etc. and we do not need to do CFD + # legalization. + self._seen_array_set.add(lhs) + else: + if self._check_if_dpnp_empty_call(rhs.func, parent_block): + if inparfor and lhs in self._cfd_updated_values: + self._legalize_dpnp_empty_call(lhsty, rhs, parent_block) + self._seen_array_set.add(lhs) + # TODO: If any other array constructor that does not take + # args, just add to self._seen_array_set + else: + for ele in rhs.list_vars(): + if ele.name in self._cfd_updated_values: + # TODO: Resolve function type with new argument + raise NotImplementedError( + "Compute follows data is not currently " + "supported for function calls." + ) + + def _legalize_stmt(self, stmt, parent_block, inparfor=False): + if isinstance(stmt, ir.Assign): + lhs = stmt.target.name + lhsty = self.typemap[lhs] + if isinstance(lhsty, DpnpNdArray): + if isinstance(stmt.value, ir.Arg): + self._seen_array_set.add(lhs) + elif isinstance(stmt.value, ir.Expr): + self._legalize_expr( + stmt, lhs, lhsty, parent_block, inparfor + ) + elif isinstance(stmt, Parfor): + self._legalize_cfd_parfor_blocks(stmt) + + def run(self): + # The get_parfor_params needs to be run here to initialize the parfor + # nodes prior to using them. + _, _ = get_parfor_params( + self.func_ir.blocks, + self.options.fusion, + self.nested_fusion_info, + ) + + self._cfd_updated_values = set() + self._seen_array_set = set() + + # FIXME: Traversing the blocks in topological order is not sufficient. + # The traversal should be converted to a backward data flow traversal of + # the CFG. The algorithm needs to then become a fixed-point work list + # algorithm. + topo_order = find_topo_order(self.func_ir.blocks) + + # Apply CFD legalization to parfor nodes and dpnp_empty calls + for label in topo_order: + block = self.func_ir.blocks[label] + for stmt in block.body: + self._legalize_stmt(stmt, block) + + +@register_pass(mutates_CFG=True, analysis_only=False) +class ParforLegalizeCFDPass(FunctionPass): + _name = "parfor_Legalize_CFD_pass" + + def __init__(self): + FunctionPass.__init__(self) + + def run_pass(self, state): + """ + Legalize CFD of parfor nodes. + """ + # Ensure we have an IR and type information. 
+ assert state.func_ir + parfor_pass = ParforLegalizeCFDPassImpl( + state.func_ir, + state.typemap, + state.calltypes, + state.return_type, + state.typingctx, + state.targetctx, + state.flags.auto_parallel, + state.flags, + state.metadata, + state.parfor_diagnostics, + ) + parfor_pass.run() + + return True From 85bb5ce77acca362e24a831290b488e82bbe9c61 Mon Sep 17 00:00:00 2001 From: Diptorup Deb Date: Sat, 4 Mar 2023 12:35:21 -0600 Subject: [PATCH 05/21] Add passes to init --- numba_dpex/core/passes/__init__.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/numba_dpex/core/passes/__init__.py b/numba_dpex/core/passes/__init__.py index 00fec5f515..5d50252a3b 100644 --- a/numba_dpex/core/passes/__init__.py +++ b/numba_dpex/core/passes/__init__.py @@ -1,3 +1,27 @@ # SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 + +from .parfor_legalize_cfd_pass import ParforLegalizeCFDPass +from .parfor_lowering_pass import ParforLoweringPass +from .passes import ( + DumpParforDiagnostics, + NoPythonBackend, + ParforFusionPass, + ParforPass, + ParforPreLoweringPass, + PreParforPass, + SplitParforPass, +) + +__all__ = [ + "DumpParforDiagnostics", + "ParforLoweringPass", + "ParforLegalizeCFDPass", + "ParforFusionPass", + "ParforPreLoweringPass", + "ParforPass", + "PreParforPass", + "SplitParforPass", + "NoPythonBackend", +] From ca8fde2023c2ebb1d86372a8a67cc0984dc01f60 Mon Sep 17 00:00:00 2001 From: Diptorup Deb Date: Sat, 4 Mar 2023 11:15:57 -0600 Subject: [PATCH 06/21] Add a new compiler pipeline that uses dpex's parfor passes. - The new DpjitPipeline uses the backported parfor passes with the new parfor lowerer and parfor compute follows data legalization pass. - dpjit decorator now used DpjitCompiler. --- numba_dpex/core/pipelines/dpjit_compiler.py | 118 ++++++++++++++++++++ numba_dpex/decorators.py | 5 +- 2 files changed, 121 insertions(+), 2 deletions(-) create mode 100644 numba_dpex/core/pipelines/dpjit_compiler.py diff --git a/numba_dpex/core/pipelines/dpjit_compiler.py b/numba_dpex/core/pipelines/dpjit_compiler.py new file mode 100644 index 0000000000..433dfd9a27 --- /dev/null +++ b/numba_dpex/core/pipelines/dpjit_compiler.py @@ -0,0 +1,118 @@ +# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +from numba.core.compiler import CompilerBase, DefaultPassBuilder +from numba.core.compiler_machinery import PassManager +from numba.core.typed_passes import ( + AnnotateTypes, + InlineOverloads, + IRLegalization, + NopythonRewrites, + NoPythonSupportedFeatureValidation, + NopythonTypeInference, + PreLowerStripPhis, + PreParforPass, +) + +from numba_dpex.core.exceptions import UnsupportedCompilationModeError +from numba_dpex.core.passes import ( + DumpParforDiagnostics, + NoPythonBackend, + ParforFusionPass, + ParforLegalizeCFDPass, + ParforLoweringPass, + ParforPreLoweringPass, + SplitParforPass, +) +from numba_dpex.parfor_diagnostics import ExtendedParforDiagnostics + + +class _DpjitPassBuilder(object): + """ + A pass builder for dpex's DpjitCompiler that adds supports for + offloading dpnp array expressions and dpnp library calls to a SYCL device. + + The pass builder does not implement pipelines for objectmode or interpreted + execution. 
+ """ + + @staticmethod + def define_typed_pipeline(state, name="dpex_dpjit_typed"): + """Returns the typed part of the nopython pipeline""" + pm = PassManager(name) + # typing + pm.add_pass(NopythonTypeInference, "nopython frontend") + # Annotate only once legalized + pm.add_pass(AnnotateTypes, "annotate types") + # strip phis + pm.add_pass(PreLowerStripPhis, "remove phis nodes") + + # optimization + pm.add_pass(InlineOverloads, "inline overloaded functions") + pm.add_pass(PreParforPass, "Preprocessing for parfors") + if not state.flags.no_rewrites: + pm.add_pass(NopythonRewrites, "nopython rewrites") + pm.add_pass(SplitParforPass, "convert to parfors") + pm.add_pass( + ParforLegalizeCFDPass, "Legalize parfors for compute follows data" + ) + pm.add_pass(ParforFusionPass, "fuse parfors") + pm.add_pass(ParforPreLoweringPass, "parfor prelowering") + + pm.finalize() + return pm + + @staticmethod + def define_nopython_lowering_pipeline(state, name="dpex_dpjit_lowering"): + """Returns an nopython mode pipeline based PassManager""" + pm = PassManager(name) + + # legalize + pm.add_pass( + NoPythonSupportedFeatureValidation, + "ensure features that are in use are in a valid form", + ) + pm.add_pass(IRLegalization, "ensure IR is legal prior to lowering") + + # lower + pm.add_pass(ParforLoweringPass, "Custom lowerer for dpex parfor nodes") + pm.add_pass(NoPythonBackend, "nopython mode backend") + pm.add_pass(DumpParforDiagnostics, "dump parfor diagnostics") + + pm.finalize() + return pm + + @staticmethod + def define_nopython_pipeline(state, name="dpex_dpjit_nopython"): + """Returns an nopython mode pipeline based PassManager""" + # compose pipeline from untyped, typed and lowering parts + dpb = _DpjitPassBuilder + pm = PassManager(name) + untyped_passes = DefaultPassBuilder.define_untyped_pipeline(state) + pm.passes.extend(untyped_passes.passes) + + typed_passes = dpb.define_typed_pipeline(state) + pm.passes.extend(typed_passes.passes) + + lowering_passes = dpb.define_nopython_lowering_pipeline(state) + pm.passes.extend(lowering_passes.passes) + + pm.finalize() + return pm + + +class DpjitCompiler(CompilerBase): + """Dpex's compiler pipeline to offload parfor nodes into SYCL kernels.""" + + def define_pipelines(self): + pms = [] + self.state.parfor_diagnostics = ExtendedParforDiagnostics() + self.state.metadata[ + "parfor_diagnostics" + ] = self.state.parfor_diagnostics + if not self.state.flags.force_pyobject: + pms.append(_DpjitPassBuilder.define_nopython_pipeline(self.state)) + if self.state.status.can_fallback or self.state.flags.force_pyobject: + raise UnsupportedCompilationModeError() + return pms diff --git a/numba_dpex/decorators.py b/numba_dpex/decorators.py index 1c05dfe495..8b27c0d358 100644 --- a/numba_dpex/decorators.py +++ b/numba_dpex/decorators.py @@ -16,7 +16,7 @@ compile_func, compile_func_template, ) -from numba_dpex.core.pipelines.offload_compiler import OffloadCompiler +from numba_dpex.core.pipelines.dpjit_compiler import DpjitCompiler def kernel( @@ -165,7 +165,8 @@ def dpjit(*args, **kws): ) del kws["forceobj"] kws.update({"nopython": True}) - kws.update({"pipeline_class": OffloadCompiler}) + kws.update({"parallel": True}) + kws.update({"pipeline_class": DpjitCompiler}) # FIXME: When trying to use dpex's target context, overloads do not work # properly. We will turn on dpex target once the issue is fixed. 
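
A note on usage: with the pipeline switch above, a minimal dpjit example
looks as follows (an illustrative sketch, not part of the patch):

    import dpnp
    from numba_dpex import dpjit

    @dpjit
    def vecadd(a, b):
        return a + b  # compiled as a parfor by the DpjitCompiler pipeline

    a = dpnp.ones(1024, device="gpu")
    b = dpnp.ones(1024, device="gpu")
    c = vecadd(a, b)  # c inherits the device and usm_type of the inputs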
From 004ac77119f1c0dc34e7e5208e5631c210c762de Mon Sep 17 00:00:00 2001
From: Diptorup Deb
Date: Sat, 4 Mar 2023 11:27:10 -0600
Subject: [PATCH 07/21] Unit tests for CFD legalization passes.

---
 .../passes/test_parfor_legalize_cfd_pass.py | 82 +++++++++++++++
 1 file changed, 82 insertions(+)
 create mode 100644 numba_dpex/tests/passes/test_parfor_legalize_cfd_pass.py

diff --git a/numba_dpex/tests/passes/test_parfor_legalize_cfd_pass.py b/numba_dpex/tests/passes/test_parfor_legalize_cfd_pass.py
new file mode 100644
index 0000000000..451eb2392b
--- /dev/null
+++ b/numba_dpex/tests/passes/test_parfor_legalize_cfd_pass.py
@@ -0,0 +1,82 @@
+# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Tests checking the enforcement of CFD in the parfor pass.
+"""
+
+
+import dpctl
+import dpnp
+import numba as nb
+import pytest
+
+from numba_dpex import dpjit
+from numba_dpex.core.exceptions import ComputeFollowsDataInferenceError
+from numba_dpex.tests._helper import skip_no_opencl_gpu
+
+shapes = [10, (2, 5)]
+dtypes = [dpnp.int32, dpnp.int64, dpnp.float32, dpnp.float64]
+usm_types = ["device"]
+devices = ["gpu"]
+
+
+@dpjit
+def func1(a, b):
+    c = a + b
+    return c
+
+
+@skip_no_opencl_gpu
+@pytest.mark.parametrize("shape", shapes)
+@pytest.mark.parametrize("dtype", dtypes)
+@pytest.mark.parametrize("usm_type", usm_types)
+@pytest.mark.parametrize("device", devices)
+def test_parfor_legalize_cfd_pass(shape, dtype, usm_type, device):
+    a = dpnp.zeros(shape=shape, dtype=dtype, usm_type=usm_type, device=device)
+    b = dpnp.ones(shape=shape, dtype=dtype, usm_type=usm_type, device=device)
+    try:
+        c = func1(a, b)
+    except Exception:
+        pytest.fail("Running Parfor CFD Pass check failed")
+
+    if len(c.shape) == 1:
+        assert c.shape[0] == shape
+    else:
+        assert c.shape == shape
+
+    assert c.dtype == dtype
+    assert c.usm_type == usm_type
+    if device != "unknown":
+        assert (
+            c.sycl_device.filter_string
+            == dpctl.SyclDevice(device).filter_string
+        )
+    else:
+        assert (
+            c.sycl_device.filter_string
+            == dpctl.SyclDevice().filter_string
+        )
+
+
+@skip_no_opencl_gpu
+def test_parfor_legalize_cfd_pass_raise():
+    a = dpnp.zeros(shape=10, device="cpu")
+    b = dpnp.ones(shape=10, device="gpu")
+
+    with pytest.raises(ComputeFollowsDataInferenceError):
+        func1(a, b)
+
+
+@skip_no_opencl_gpu
+def test_cfd_error_due_to_lhs():
+    a = dpnp.zeros(shape=10, device="cpu")
+    b = dpnp.ones(shape=10, device="cpu")
+
+    @dpjit
+    def vecadd_prange(a, b):
+        c = dpnp.empty(a.shape, dtype=a.dtype, device="gpu")
+        for idx in nb.prange(a.size):
+            c[idx] = a[idx] + b[idx]
+        return c
+
+    with pytest.raises(ComputeFollowsDataInferenceError):
+        vecadd_prange(a, b)

From 61411d23a052e68eb2cc093d32a13600ad1b4304 Mon Sep 17 00:00:00 2001
From: Diptorup Deb
Date: Sat, 4 Mar 2023 11:28:33 -0600
Subject: [PATCH 08/21] Unit tests for parfor with built-in ops.
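
The tests compile each operator function with dpjit and compare the
result against the corresponding native dpnp computation. The pattern, in
sketch form:

    @dpjit
    def parfor_exponent(a, b):
        return a**b  # ** maps to dpnp.power and is lowered as a parfor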
---
 .../dpjit_tests/parfors/test_builtin_ops.py | 122 ++++++++++++++++++
 1 file changed, 122 insertions(+)
 create mode 100644 numba_dpex/tests/dpjit_tests/parfors/test_builtin_ops.py

diff --git a/numba_dpex/tests/dpjit_tests/parfors/test_builtin_ops.py b/numba_dpex/tests/dpjit_tests/parfors/test_builtin_ops.py
new file mode 100644
index 0000000000..d793317f64
--- /dev/null
+++ b/numba_dpex/tests/dpjit_tests/parfors/test_builtin_ops.py
@@ -0,0 +1,122 @@
+# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import dpctl
+import dpnp
+import numpy
+import pytest
+
+from numba_dpex import dpjit
+
+
+def parfor_add(a, b):
+    return a + b
+
+
+def parfor_sub(a, b):
+    return a - b
+
+
+def parfor_mult(a, b):
+    return a * b
+
+
+def parfor_divide(a, b):
+    return a / b
+
+
+def parfor_modulus(a, b):
+    return a % b
+
+
+def parfor_exponent(a, b):
+    return a**b
+
+
+def parfor_floor(a, b):
+    return a // b
+
+
+shapes = [100, (25, 4)]
+dtypes = [dpnp.int32, dpnp.int64, dpnp.float32, dpnp.float64]
+usm_types = ["device"]
+funcs = [
+    parfor_add,
+    parfor_sub,
+    parfor_mult,
+    parfor_divide,
+    parfor_modulus,
+    parfor_exponent,
+]
+
+
+@pytest.mark.parametrize("shape", shapes)
+@pytest.mark.parametrize("dtype", dtypes)
+@pytest.mark.parametrize("usm_type", usm_types)
+@pytest.mark.parametrize("func", funcs)
+def test_built_in_operators1(shape, dtype, usm_type, func):
+    queue = dpctl.SyclQueue()
+    a = dpnp.zeros(
+        shape=shape, dtype=dtype, usm_type=usm_type, sycl_queue=queue
+    )
+    b = dpnp.ones(shape=shape, dtype=dtype, usm_type=usm_type, sycl_queue=queue)
+    try:
+        op = dpjit(func)
+        c = op(a, b)
+        del op
+    except Exception:
+        pytest.fail("Failed to compile.")
+
+    if len(c.shape) == 1:
+        assert c.shape[0] == shape
+    else:
+        assert c.shape == shape
+
+    if func != parfor_divide:
+        assert c.dtype == dtype
+    assert c.usm_type == usm_type
+
+    assert c.sycl_device.filter_string == queue.sycl_device.filter_string
+
+    expected = dpnp.asnumpy(func(a, b))
+    nc = dpnp.asnumpy(c)
+
+    assert numpy.allclose(nc, expected)
+
+
+usm_types = ["host", "shared"]
+
+
+@pytest.mark.parametrize("shape", shapes)
+@pytest.mark.parametrize("dtype", dtypes)
+@pytest.mark.parametrize("usm_type", usm_types)
+@pytest.mark.parametrize("func", funcs)
+def test_built_in_operators2(shape, dtype, usm_type, func):
+    queue = dpctl.SyclQueue()
+    a = dpnp.zeros(
+        shape=shape, dtype=dtype, usm_type=usm_type, sycl_queue=queue
+    )
+    b = dpnp.ones(shape=shape, dtype=dtype, usm_type=usm_type, sycl_queue=queue)
+    try:
+        op = dpjit(func)
+        c = op(a, b)
+        del op
+    except Exception:
+        pytest.fail("Failed to compile.")
+
+    if len(c.shape) == 1:
+        assert c.shape[0] == shape
+    else:
+        assert c.shape == shape
+
+    if func != parfor_divide:
+        assert c.dtype == dtype
+    assert c.usm_type == usm_type
+
+    assert c.sycl_device.filter_string == queue.sycl_device.filter_string
+
+    expected = dpnp.asnumpy(func(a, b))
+    nc = dpnp.asnumpy(c)
+
+    assert numpy.allclose(nc, expected)

From d5c60b88aae0d9d66a25e02ff9e5f2bb53de1134 Mon Sep 17 00:00:00 2001
From: "Wang, Mingjie1"
Date: Tue, 28 Feb 2023 13:54:27 -0600
Subject: [PATCH 09/21] Enabled parfor ufunc.

Added numba_dpex/core/typing/dpnpdecl.py.
Added numba_dpex/dpnp_iface/dpnp_ufunc_db.py.
Added numba_dpex/dpnp_iface/dpnpimpl.py.
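
The enabling step, in rough outline, is to make dpnp callables look like
NumPy ufuncs to Numba's typing and lowering machinery. A simplified
sketch of what dpnp_ufunc_db.py does for each supported ufunc:

    import dpnp
    import numpy as np

    op, npop = dpnp.add, np.add
    op.nin, op.nout, op.nargs = npop.nin, npop.nout, npop.nargs
    op.types = npop.types  # reuse NumPy's loop type signatures
    op.is_dpnp_ufunc = True  # marker checked during parfor conversion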
--- numba_dpex/core/passes/parfor.py | 95 ++++++++++----- numba_dpex/core/passes/passes.py | 1 + numba_dpex/core/typing/dpnpdecl.py | 161 +++++++++++++++++++++++++ numba_dpex/dpnp_iface/dpnp_ufunc_db.py | 40 ++++++ numba_dpex/dpnp_iface/dpnpimpl.py | 58 +++++++++ 5 files changed, 326 insertions(+), 29 deletions(-) create mode 100644 numba_dpex/core/typing/dpnpdecl.py create mode 100644 numba_dpex/dpnp_iface/dpnp_ufunc_db.py create mode 100644 numba_dpex/dpnp_iface/dpnpimpl.py diff --git a/numba_dpex/core/passes/parfor.py b/numba_dpex/core/passes/parfor.py index 9e162c90f1..454d63d350 100644 --- a/numba_dpex/core/passes/parfor.py +++ b/numba_dpex/core/passes/parfor.py @@ -2421,7 +2421,6 @@ def _arrayexpr_to_parfor(self, equiv_set, lhs, arrayexpr, avail_vars): expr = arrayexpr.expr arr_typ = pass_states.typemap[lhs.name] el_typ = arr_typ.dtype - # generate loopnests and size variables from lhs correlations size_vars = equiv_set.get_shape(lhs) index_vars, loopnests = _mk_parfor_loops( @@ -3788,6 +3787,46 @@ def _get_call_arg_types(expr, typemap): return tuple(new_arg_typs), new_kw_types +def _ufunc_to_parfor_instr( + typemap, + op, + avail_vars, + loc, + scope, + func_ir, + out_ir, + arg_vars, + typingctx, + calltypes, + expr_out_var, +): + func_var_name = _find_func_var(typemap, op, avail_vars, loc=loc) + func_var = ir.Var(scope, mk_unique_var(func_var_name), loc) + typemap[func_var.name] = typemap[func_var_name] + func_var_def = copy.deepcopy(func_ir.get_definition(func_var_name)) + if ( + isinstance(func_var_def, ir.Expr) + and func_var_def.op == "getattr" + and func_var_def.attr == "sqrt" + ): + g_math_var = ir.Var(scope, mk_unique_var("$math_g_var"), loc) + typemap[g_math_var.name] = types.misc.Module(math) + g_math = ir.Global("math", math, loc) + g_math_assign = ir.Assign(g_math, g_math_var, loc) + func_var_def = ir.Expr.getattr(g_math_var, "sqrt", loc) + out_ir.append(g_math_assign) + # out_ir.append(func_var_def) + ir_expr = ir.Expr.call(func_var, arg_vars, (), loc) + call_typ = typemap[func_var.name].get_call_type( + typingctx, tuple(typemap[a.name] for a in arg_vars), {} + ) + calltypes[ir_expr] = call_typ + el_typ = call_typ.return_type + # signature(el_typ, el_typ) + out_ir.append(ir.Assign(func_var_def, func_var, loc)) + out_ir.append(ir.Assign(ir_expr, expr_out_var, loc)) + + def _arrayexpr_tree_to_ir( func_ir, typingctx, @@ -3852,35 +3891,33 @@ def _arrayexpr_tree_to_ir( # elif isinstance(op, (np.ufunc, DUFunc)): # function calls are stored in variables which are not removed # op is typing_key to the variables type - func_var_name = _find_func_var(typemap, op, avail_vars, loc=loc) - func_var = ir.Var(scope, mk_unique_var(func_var_name), loc) - typemap[func_var.name] = typemap[func_var_name] - func_var_def = copy.deepcopy( - func_ir.get_definition(func_var_name) - ) - if ( - isinstance(func_var_def, ir.Expr) - and func_var_def.op == "getattr" - and func_var_def.attr == "sqrt" - ): - g_math_var = ir.Var( - scope, mk_unique_var("$math_g_var"), loc - ) - typemap[g_math_var.name] = types.misc.Module(math) - g_math = ir.Global("math", math, loc) - g_math_assign = ir.Assign(g_math, g_math_var, loc) - func_var_def = ir.Expr.getattr(g_math_var, "sqrt", loc) - out_ir.append(g_math_assign) - # out_ir.append(func_var_def) - ir_expr = ir.Expr.call(func_var, arg_vars, (), loc) - call_typ = typemap[func_var.name].get_call_type( - typingctx, tuple(typemap[a.name] for a in arg_vars), {} + _ufunc_to_parfor_instr( + typemap, + op, + avail_vars, + loc, + scope, + func_ir, + out_ir, + arg_vars, + 
typingctx, + calltypes, + expr_out_var, ) - calltypes[ir_expr] = call_typ - el_typ = call_typ.return_type - # signature(el_typ, el_typ) - out_ir.append(ir.Assign(func_var_def, func_var, loc)) - out_ir.append(ir.Assign(ir_expr, expr_out_var, loc)) + if hasattr(op, "is_dpnp_ufunc"): + _ufunc_to_parfor_instr( + typemap, + op, + avail_vars, + loc, + scope, + func_ir, + out_ir, + arg_vars, + typingctx, + calltypes, + expr_out_var, + ) elif isinstance(expr, ir.Var): var_typ = typemap[expr.name] if isinstance(var_typ, types.Array): diff --git a/numba_dpex/core/passes/passes.py b/numba_dpex/core/passes/passes.py index ccf2640ac6..c5ab12271d 100644 --- a/numba_dpex/core/passes/passes.py +++ b/numba_dpex/core/passes/passes.py @@ -152,6 +152,7 @@ def run_pass(self, state): # Ensure we have an IR and type information. assert state.func_ir functions_map = swap_functions_map.copy() + # FIXME: remove once reduction is implemented functions_map.pop(("dot", "numpy"), None) functions_map.pop(("sum", "numpy"), None) functions_map.pop(("prod", "numpy"), None) diff --git a/numba_dpex/core/typing/dpnpdecl.py b/numba_dpex/core/typing/dpnpdecl.py new file mode 100644 index 0000000000..b837471ab0 --- /dev/null +++ b/numba_dpex/core/typing/dpnpdecl.py @@ -0,0 +1,161 @@ +# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpnp +from numba.core import types +from numba.core.typing.npydecl import Numpy_rules_ufunc, infer_global + +# list of unary ufuncs to register + +_math_operations = [ + "add", + "subtract", + "multiply", + "floor_divide", + "negative", + "power", + "remainder", + "fmod", + "absolute", + "sign", + "conjugate", + "exp", + "exp2", + "log", + "log2", + "log10", + "expm1", + "log1p", + "sqrt", + "square", + "cbrt", + "reciprocal", + "divide", + "mod", + "abs", + "fabs", +] + +_trigonometric_functions = [ + "sin", + "cos", + "tan", + "arcsin", + "arccos", + "arctan", + "arctan2", + "hypot", + "sinh", + "cosh", + "tanh", + "arcsinh", + "arccosh", + "arctanh", + "deg2rad", + "rad2deg", + "degrees", + "radians", +] + +_bit_twiddling_functions = [ + "bitwise_and", + "bitwise_or", + "bitwise_xor", + "invert", + "left_shift", + "right_shift", + "bitwise_not", +] + +_comparison_functions = [ + "greater", + "greater_equal", + "less", + "less_equal", + "not_equal", + "equal", + "logical_and", + "logical_or", + "logical_xor", + "logical_not", + "maximum", + "minimum", + "fmax", + "fmin", +] + +_floating_functions = [ + "isfinite", + "isinf", + "isnan", + "copysign", + "modf", + "frexp", + "floor", + "ceil", + "trunc", +] + +_logic_functions = [] + + +# This is a set of the ufuncs that are not yet supported by Lowering. In order +# to trigger no-python mode we must not register them until their Lowering is +# implemented. +# +# It also works as a nice TODO list for ufunc support :) +_unsupported = set( + [ + "frexp", + "modf", + "logaddexp", + "logaddexp2", + "true_divide", + "positive", + "float_power", + "rint", + "divmod", + "gcd", + "lcm", + "signbit", + "nextafter", + "ldexp", + "spacing", + "isnat", + ] +) + +# A list of ufuncs that are in fact aliases of other ufuncs. 
They need to insert the +# resolve method, but not register the ufunc itself +_aliases = set(["bitwise_not", "mod", "abs"]) + +all_ufuncs = sum( + [ + _math_operations, + _trigonometric_functions, + _bit_twiddling_functions, + _comparison_functions, + _floating_functions, + _logic_functions, + ], + [], +) + +supported_ufuncs = [x for x in all_ufuncs if x not in _unsupported] + + +def _dpnp_ufunc(name): + func = getattr(dpnp, name) + + class typing_class(Numpy_rules_ufunc): + key = func + + typing_class.__name__ = "resolve_{0}".format(name) + + if name not in _aliases: # if not name in _aliases + infer_global(func, types.Function(typing_class)) + + +for func in supported_ufuncs: + _dpnp_ufunc(func) diff --git a/numba_dpex/dpnp_iface/dpnp_ufunc_db.py b/numba_dpex/dpnp_iface/dpnp_ufunc_db.py new file mode 100644 index 0000000000..2b692843ce --- /dev/null +++ b/numba_dpex/dpnp_iface/dpnp_ufunc_db.py @@ -0,0 +1,40 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + + +import dpnp +import numpy as np + +from numba_dpex.core.typing import dpnpdecl + + +def get_ufuncs(): + """obtain a list of supported ufuncs in the db""" + + from numba.np.ufunc_db import _lazy_init_db + + _lazy_init_db() + from numba.np.ufunc_db import _ufunc_db + + _fill_ufunc_db_with_dpnp_ufuncs(_ufunc_db) + + return _ufunc_db.keys() + + +def _fill_ufunc_db_with_dpnp_ufuncs(ufunc_db): + """Monkey patching dpnp for missing attributes.""" + # FIXME: add more docstring + + for ufuncop in dpnpdecl.supported_ufuncs: + op = getattr(dpnp, ufuncop) + npop = getattr(np, ufuncop) + op.nin = npop.nin + op.nout = npop.nout + op.nargs = npop.nargs + op.types = npop.types + op.is_dpnp_ufunc = True + ufunc_db.update({op: ufunc_db[npop]}) + for key in list(ufunc_db[op].keys()): + if "FF->" in key or "DD->" in key or "F->" in key or "D->" in key: + ufunc_db[op].pop(key) diff --git a/numba_dpex/dpnp_iface/dpnpimpl.py b/numba_dpex/dpnp_iface/dpnpimpl.py new file mode 100644 index 0000000000..7873bbc5b5 --- /dev/null +++ b/numba_dpex/dpnp_iface/dpnpimpl.py @@ -0,0 +1,58 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpnp +from numba.np import npyimpl + +from numba_dpex.core.typing.dpnpdecl import _unsupported +from numba_dpex.dpnp_iface import dpnp_ufunc_db + + +def _register_dpnp_ufuncs(): + kernels = {} + # NOTE: Assuming ufunc implementation for the CPUContext. 
+ for ufunc in dpnp_ufunc_db.get_ufuncs(): + kernels[ufunc] = npyimpl.register_ufunc_kernel( + ufunc, npyimpl._ufunc_db_function(ufunc) + ) + + for _op_map in ( + npyimpl.npydecl.NumpyRulesUnaryArrayOperator._op_map, + npyimpl.npydecl.NumpyRulesArrayOperator._op_map, + ): + for operator, ufunc_name in _op_map.items(): + if ufunc_name in _unsupported: + continue + ufunc = getattr(dpnp, ufunc_name) + kernel = kernels[ufunc] + if ufunc.nin == 1: + npyimpl.register_unary_operator_kernel(operator, ufunc, kernel) + elif ufunc.nin == 2: + npyimpl.register_binary_operator_kernel(operator, ufunc, kernel) + else: + raise RuntimeError( + "There shouldn't be any non-unary or binary operators" + ) + + for _op_map in (npyimpl.npydecl.NumpyRulesInplaceArrayOperator._op_map,): + for operator, ufunc_name in _op_map.items(): + if ufunc_name in _unsupported: + continue + ufunc = getattr(dpnp, ufunc_name) + kernel = kernels[ufunc] + if ufunc.nin == 1: + npyimpl.register_unary_operator_kernel( + operator, ufunc, kernel, inplace=True + ) + elif ufunc.nin == 2: + npyimpl.register_binary_operator_kernel( + operator, ufunc, kernel, inplace=True + ) + else: + raise RuntimeError( + "There shouldn't be any non-unary or binary operators" + ) + + +_register_dpnp_ufuncs() From afff21244e75bf9ab8b901fc8ae407a725e27fb9 Mon Sep 17 00:00:00 2001 From: "Wang, Mingjie1" Date: Sat, 4 Mar 2023 11:38:23 -0600 Subject: [PATCH 10/21] Add unit tests for dpnp ufuncs lowered as parfors. --- .../parfors/test_dpnp_bitwise_ops.py | 96 +++++++++++++ .../parfors/test_dpnp_logic_ops.py | 106 ++++++++++++++ .../test_dpnp_transcedental_functions.py | 131 ++++++++++++++++++ .../test_dpnp_trigonometric_functions.py | 110 +++++++++++++++ 4 files changed, 443 insertions(+) create mode 100644 numba_dpex/tests/dpjit_tests/parfors/test_dpnp_bitwise_ops.py create mode 100644 numba_dpex/tests/dpjit_tests/parfors/test_dpnp_logic_ops.py create mode 100644 numba_dpex/tests/dpjit_tests/parfors/test_dpnp_transcedental_functions.py create mode 100644 numba_dpex/tests/dpjit_tests/parfors/test_dpnp_trigonometric_functions.py diff --git a/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_bitwise_ops.py b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_bitwise_ops.py new file mode 100644 index 0000000000..7662df8afc --- /dev/null +++ b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_bitwise_ops.py @@ -0,0 +1,96 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + + +import dpctl.tensor as dpt +import dpnp +import numpy as np +import pytest + +from numba_dpex import dpjit +from numba_dpex.tests._helper import assert_auto_offloading, filter_strings + +list_of_binary_ops = [ + "bitwise_and", + "bitwise_or", + "bitwise_xor", + "left_shift", + "right_shift", +] + + +@pytest.fixture(params=list_of_binary_ops) +def binary_op(request): + return request.param + + +list_of_unary_ops = [ + "bitwise_not", + "invert", +] + + +@pytest.fixture(params=list_of_unary_ops) +def unary_op(request): + return request.param + + +list_of_dtypes = [ + dpnp.int32, + dpnp.int64, +] + + +@pytest.fixture(params=list_of_dtypes) +def input_arrays(request): + # The size of input and out arrays to be used + N = 2048 + a = dpnp.array(dpnp.random.random(N), request.param) + b = dpnp.array(dpnp.random.random(N), request.param) + return a, b + + +@pytest.mark.parametrize("filter_str", filter_strings) +def test_binary_ops(filter_str, binary_op, input_arrays): + a, b = input_arrays + binop = getattr(dpnp, binary_op) + actual = dpnp.empty(shape=a.shape, 
dtype=a.dtype) + expected = dpnp.empty(shape=a.shape, dtype=a.dtype) + + @dpjit + def f(a, b): + return binop(a, b) + + actual = f(a, b) + + expected = binop(a, b) + np.testing.assert_allclose( + dpt.asnumpy(actual._array_obj), + dpt.asnumpy(expected._array_obj), + rtol=1e-5, + atol=0, + ) + + +@pytest.mark.parametrize("filter_str", filter_strings) +def test_unary_ops(filter_str, unary_op, input_arrays): + a = input_arrays[0] + uop = getattr(dpnp, unary_op) + actual = np.empty(shape=a.shape, dtype=a.dtype) + expected = np.empty(shape=a.shape, dtype=a.dtype) + + @dpjit + def f(a): + return uop(a) + + actual = f(a) + + expected = uop(a) + + np.testing.assert_allclose( + dpt.asnumpy(actual._array_obj), + dpt.asnumpy(expected._array_obj), + rtol=1e-5, + atol=0, + ) diff --git a/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_logic_ops.py b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_logic_ops.py new file mode 100644 index 0000000000..eb5bac45f9 --- /dev/null +++ b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_logic_ops.py @@ -0,0 +1,106 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpctl +import dpctl.tensor as dpt +import dpnp +import numpy as np +import pytest + +from numba_dpex import dpjit +from numba_dpex.tests._helper import assert_auto_offloading, filter_strings + +""" Following cases, dpnp raises NotImplementedError""" + +list_of_binary_ops = [ + "greater", + "greater_equal", + "less", + "less_equal", + "not_equal", + "equal", + "logical_and", + "logical_or", + "logical_xor", +] + + +@pytest.fixture(params=list_of_binary_ops) +def binary_op(request): + return request.param + + +list_of_unary_ops = [ + "isinf", + "isfinite", + "isnan", +] + + +@pytest.fixture(params=list_of_unary_ops) +def unary_op(request): + return request.param + + +list_of_dtypes = [ + dpnp.int32, + dpnp.int64, + dpnp.float32, + dpnp.float64, +] + + +@pytest.fixture(params=list_of_dtypes) +def input_arrays(request): + # The size of input and out arrays to be used + N = 2048 + a = dpnp.array(dpnp.random.random(N), request.param) + b = dpnp.array(dpnp.random.random(N), request.param) + return a, b + + +@pytest.mark.xfail +@pytest.mark.parametrize("filter_str", filter_strings) +def test_binary_ops(filter_str, binary_op, input_arrays): + a, b = input_arrays + binop = getattr(dpnp, binary_op) + actual = dpnp.empty(shape=a.shape, dtype=a.dtype) + expected = dpnp.empty(shape=a.shape, dtype=a.dtype) + + @dpjit + def f(a, b): + return binop(a, b) + + actual = f(a, b) + + expected = binop(a, b) + np.testing.assert_allclose( + dpt.asnumpy(actual._array_obj), + dpt.asnumpy(expected._array_obj), + rtol=1e-5, + atol=0, + ) + + +@pytest.mark.xfail +@pytest.mark.parametrize("filter_str", filter_strings) +def test_unary_ops(filter_str, unary_op, input_arrays): + a = input_arrays[0] + uop = getattr(dpnp, unary_op) + actual = dpnp.empty(shape=a.shape, dtype=a.dtype) + expected = dpnp.empty(shape=a.shape, dtype=a.dtype) + + @dpjit + def f(a): + return uop(a) + + actual = f(a) + + expected = uop(a) + np.testing.assert_allclose( + dpt.asnumpy(actual._array_obj), + dpt.asnumpy(expected._array_obj), + rtol=1e-5, + atol=0, + ) diff --git a/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_transcedental_functions.py b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_transcedental_functions.py new file mode 100644 index 0000000000..8685b14e84 --- /dev/null +++ b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_transcedental_functions.py @@ -0,0 +1,131 @@ +# SPDX-FileCopyrightText: 
2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpctl +import dpctl.tensor as dpt +import dpnp +import numpy as np +import pytest + +from numba_dpex import dpjit +from numba_dpex.tests._helper import ( + assert_auto_offloading, + filter_strings, + is_gen12, +) + +"""dpnp raise error on : mod, abs and remainder(float32)""" +list_of_binary_ops = [ + "add", + "subtract", + "multiply", + "divide", + "power", + # "remainder", + # "mod", + "hypot", + "maximum", + "minimum", + "fmax", + "fmin", +] + + +@pytest.fixture(params=list_of_binary_ops) +def binary_op(request): + return request.param + + +list_of_unary_ops = [ + "negative", + "abs", + "absolute", + "fabs", + "sign", + "conj", + "exp", + "exp2", + "log", + "log2", + "log10", + "expm1", + "log1p", + "sqrt", + "square", + "reciprocal", + "conjugate", + "floor", + "ceil", + "trunc", +] + + +@pytest.fixture(params=list_of_unary_ops) +def unary_op(request): + return request.param + + +list_of_dtypes = [ + dpnp.float32, + dpnp.float64, +] + + +@pytest.fixture(params=list_of_dtypes) +def input_arrays(request): + # The size of input and out arrays to be used + N = 2048 + a = dpnp.array(dpnp.random.random(N), request.param) + b = dpnp.array(dpnp.random.random(N), request.param) + return a, b + + +@pytest.mark.parametrize("filter_str", filter_strings) +def test_binary_ops(filter_str, binary_op, input_arrays): + a, b = input_arrays + binop = getattr(dpnp, binary_op) + actual = dpnp.empty(shape=a.shape, dtype=a.dtype) + expected = dpnp.empty(shape=a.shape, dtype=a.dtype) + + @dpjit + def f(a, b): + return binop(a, b) + + actual = f(a, b) + + expected = binop(a, b) + + np.testing.assert_allclose( + dpt.asnumpy(actual._array_obj), + dpt.asnumpy(expected._array_obj), + rtol=1e-5, + atol=0, + ) + + +@pytest.mark.parametrize("filter_str", filter_strings) +def test_unary_ops(filter_str, unary_op, input_arrays): + skip_ops = ["abs", "sign", "log", "log2", "log10", "expm1"] + if unary_op in skip_ops and is_gen12(filter_str): + pytest.skip() + + a = input_arrays[0] + uop = getattr(dpnp, unary_op) + actual = dpnp.empty(shape=a.shape, dtype=a.dtype) + expected = dpnp.empty(shape=a.shape, dtype=a.dtype) + + @dpjit + def f(a): + return uop(a) + + actual = f(a) + + expected = uop(a) + + np.testing.assert_allclose( + dpt.asnumpy(actual._array_obj), + dpt.asnumpy(expected._array_obj), + rtol=1e-5, + atol=0, + ) diff --git a/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_trigonometric_functions.py b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_trigonometric_functions.py new file mode 100644 index 0000000000..73ef3b4f5d --- /dev/null +++ b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_trigonometric_functions.py @@ -0,0 +1,110 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpctl +import dpctl.tensor as dpt +import dpnp +import numpy as np +import pytest + +from numba_dpex import dpjit +from numba_dpex.tests._helper import ( + assert_auto_offloading, + filter_strings, + is_gen12, +) + +list_of_filter_strs = [ + "opencl:gpu:0", + "level_zero:gpu:0", + "opencl:cpu:0", +] + + +@pytest.fixture(params=list_of_filter_strs) +def filter_str(request): + return request.param + + +list_of_trig_ops = [ + "sin", + "cos", + "tan", + "arcsin", + "arccos", + "arctan", + "arctan2", + "sinh", + "cosh", + "tanh", + "arcsinh", + "arccosh", + "arctanh", + "deg2rad", + "rad2deg", + "degrees", + "radians", +] + + +@pytest.fixture(params=list_of_trig_ops) +def trig_op(request): + return 
request.param + + +list_of_dtypes = [ + dpnp.float32, + dpnp.float64, +] + + +@pytest.fixture(params=list_of_trig_ops) +def dtype(request): + return request.param + + +@pytest.fixture(params=list_of_dtypes) +def input_arrays(request): + # The size of input and out arrays to be used + N = 2048 + + a = dpnp.array(dpnp.random.random(N), request.param) + b = dpnp.array(dpnp.random.random(N), request.param) + return a, b + + +@pytest.mark.parametrize("filter_str", filter_strings) +def test_trigonometric_fn(filter_str, trig_op, input_arrays): + # FIXME: Why does archcosh fail on Gen12 discrete graphics card? + if trig_op == "arccosh" and is_gen12(filter_str): + pytest.skip() + + a, b = input_arrays + trig_fn = getattr(dpnp, trig_op) + actual = dpnp.empty(shape=a.shape, dtype=a.dtype) + expected = dpnp.empty(shape=a.shape, dtype=a.dtype) + + if trig_op == "arctan2": + + @dpjit + def f(a, b): + return trig_fn(a, b) + + actual = f(a, b) + expected = trig_fn(a, b) + else: + + @dpjit + def f(a): + return trig_fn(a) + + actual = f(a) + expected = trig_fn(a) + + np.testing.assert_allclose( + dpt.asnumpy(actual._array_obj), + dpt.asnumpy(expected._array_obj), + rtol=1e-5, + atol=0, + ) From c8ab2b28a7bc1e94c941d37dc489fa77a752af36 Mon Sep 17 00:00:00 2001 From: Diptorup Deb Date: Sat, 4 Mar 2023 11:45:20 -0600 Subject: [PATCH 11/21] Add support for built-in operators and true_divide. - The commit makes expressions such as A*2, where A is a dpnp.ndarray work. - Adds support for dpnp.true_divide. --- numba_dpex/core/typing/dpnpdecl.py | 50 +++++++++++++++++++++++++----- 1 file changed, 43 insertions(+), 7 deletions(-) diff --git a/numba_dpex/core/typing/dpnpdecl.py b/numba_dpex/core/typing/dpnpdecl.py index b837471ab0..8fb20ba242 100644 --- a/numba_dpex/core/typing/dpnpdecl.py +++ b/numba_dpex/core/typing/dpnpdecl.py @@ -3,11 +3,42 @@ # SPDX-License-Identifier: Apache-2.0 import dpnp +import numpy as np from numba.core import types -from numba.core.typing.npydecl import Numpy_rules_ufunc, infer_global +from numba.core.typing.npydecl import ( + Numpy_rules_ufunc, + NumpyRulesArrayOperator, + NumpyRulesInplaceArrayOperator, + NumpyRulesUnaryArrayOperator, + infer_global, +) + + +class DpnpRulesArrayOperator(NumpyRulesArrayOperator): + @property + def ufunc(self): + try: + op = getattr(dpnp, self._op_map[self.key]) + npop = getattr(np, self._op_map[self.key]) + op.nin = npop.nin + op.nout = npop.nout + op.nargs = npop.nargs + op.types = npop.types + op.is_dpnp_ufunc = True + return op + except: + pass + + +class DpnpRulesInplaceArrayOperator(NumpyRulesInplaceArrayOperator): + pass -# list of unary ufuncs to register +class DpnpRulesUnaryArrayOperator(NumpyRulesUnaryArrayOperator): + pass + + +# list of unary ufuncs to register _math_operations = [ "add", "subtract", @@ -32,6 +63,7 @@ "cbrt", "reciprocal", "divide", + "true_divide", "mod", "abs", "fabs", @@ -107,11 +139,10 @@ # It also works as a nice TODO list for ufunc support :) _unsupported = set( [ - "frexp", - "modf", + "frexp", # Not supported by Numba + "modf", # Not supported by Numba "logaddexp", "logaddexp2", - "true_divide", "positive", "float_power", "rint", @@ -126,8 +157,8 @@ ] ) -# A list of ufuncs that are in fact aliases of other ufuncs. They need to insert the -# resolve method, but not register the ufunc itself +# A list of ufuncs that are in fact aliases of other ufuncs. 
They need to insert +# the resolve method, but not register the ufunc itself _aliases = set(["bitwise_not", "mod", "abs"]) all_ufuncs = sum( @@ -159,3 +190,8 @@ class typing_class(Numpy_rules_ufunc): for func in supported_ufuncs: _dpnp_ufunc(func) + + +DpnpRulesArrayOperator.install_operations() +DpnpRulesInplaceArrayOperator.install_operations() +DpnpRulesUnaryArrayOperator.install_operations() From 61669694de44ee0f4f0d166fb983a0538cbb6e7d Mon Sep 17 00:00:00 2001 From: Diptorup Deb Date: Sat, 4 Mar 2023 13:46:48 -0600 Subject: [PATCH 12/21] Improve the check for NoneType queue in build_dpnp_ndarray --- numba_dpex/dpnp_iface/arrayobj.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/numba_dpex/dpnp_iface/arrayobj.py b/numba_dpex/dpnp_iface/arrayobj.py index e906176d6a..90e3f922ad 100644 --- a/numba_dpex/dpnp_iface/arrayobj.py +++ b/numba_dpex/dpnp_iface/arrayobj.py @@ -50,6 +50,22 @@ class for number classes. return _dtype +def _parse_layout(layout): + if isinstance(layout, types.StringLiteral): + layout_type_str = layout.literal_value + if layout_type_str not in ["C", "F", "A"]: + msg = f"Invalid layout specified: '{layout_type_str}'" + raise errors.NumbaValueError(msg) + return layout_type_str + elif isinstance(layout, str): + return layout + else: + raise TypeError( + "The parameter 'layout' is neither of " + + "'str' nor 'types.StringLiteral'" + ) + + def _parse_usm_type(usm_type): """Parse usm_type parameter. @@ -152,8 +168,7 @@ def build_dpnp_ndarray( The type has the same structure as USMNdArray used to represent dpctl.tensor.usm_ndarray. """ - - if queue: + if queue and not isinstance(queue, types.misc.Omitted): raise errors.TypingError( "The sycl_queue keyword is not yet supported by " "dpnp.empty(), dpnp.zeros(), dpnp.ones(), dpnp.empty_like(), " @@ -162,7 +177,7 @@ def build_dpnp_ndarray( ) # If a dtype value was passed in, then try to convert it to the - # coresponding Numba type. If None was passed, the default, then pass None + # corresponding Numba type. If None was passed, the default, then pass None # to the DpnpNdArray constructor. The default dtype will be derived based # on the behavior defined in dpctl.tensor.usm_ndarray. 
@@ -231,6 +246,7 @@ def ol_dpnp_empty( _ndim = _ty_parse_shape(shape) _dtype = _parse_dtype(dtype) + _layout = _parse_layout(order) _usm_type = _parse_usm_type(usm_type) if usm_type is not None else "device" _device = ( _parse_device_filter_string(device) if device is not None else "unknown" @@ -238,7 +254,7 @@ def ol_dpnp_empty( if _ndim: ret_ty = build_dpnp_ndarray( _ndim, - layout=order, + layout=_layout, dtype=_dtype, usm_type=_usm_type, device=_device, From d2e22ac360f9986df11fede399f0a6f5a1c9d348 Mon Sep 17 00:00:00 2001 From: Diptorup Deb Date: Sat, 4 Mar 2023 13:52:46 -0600 Subject: [PATCH 13/21] Call dpnp.empty with all args inside DpnpNdArray.__allocate__ --- numba_dpex/core/types/dpnp_ndarray_type.py | 48 ++++++++++++++++++++-- 1 file changed, 45 insertions(+), 3 deletions(-) diff --git a/numba_dpex/core/types/dpnp_ndarray_type.py b/numba_dpex/core/types/dpnp_ndarray_type.py index 23df5d9477..75d77141c4 100644 --- a/numba_dpex/core/types/dpnp_ndarray_type.py +++ b/numba_dpex/core/types/dpnp_ndarray_type.py @@ -108,7 +108,7 @@ def __allocate__( # Assume str(dtype) returns a valid type dtype_str = str(dtype) # alloc call: lhs = empty_attr(size_var, typ_var) - typ_var = ir.Var(scope, mk_unique_var("$np_typ_var"), loc) + typ_var = ir.Var(scope, mk_unique_var("$np_dtype_var"), loc) if typemap: typemap[typ_var.name] = types.functions.NumberClass(dtype) # If dtype is a datetime/timedelta with a unit, @@ -126,11 +126,53 @@ def __allocate__( dtype_str = "bool_" np_typ_getattr = ir.Expr.getattr(g_np_var, dtype_str, loc) typ_var_assign = ir.Assign(np_typ_getattr, typ_var, loc) - alloc_call = ir.Expr.call(attr_var, [size_var, typ_var], (), loc) + + # A default usm_type arg added as a placeholder + layout_var = ir.Var(scope, mk_unique_var("$layout_var"), loc) + usm_typ_var = ir.Var(scope, mk_unique_var("$np_usm_type_var"), loc) + # A default device string arg added as a placeholder + device_typ_var = ir.Var(scope, mk_unique_var("$np_device_var"), loc) + + if typemap: + typemap[layout_var.name] = types.literal(lhs_typ.layout) + typemap[usm_typ_var.name] = types.literal(lhs_typ.usm_type) + typemap[device_typ_var.name] = types.literal(lhs_typ.device) + + layout_var_assign = ir.Assign( + ir.Const(lhs_typ.layout, loc), layout_var, loc + ) + usm_typ_var_assign = ir.Assign( + ir.Const(lhs_typ.usm_type, loc), usm_typ_var, loc + ) + device_typ_var_assign = ir.Assign( + ir.Const(lhs_typ.device, loc), device_typ_var, loc + ) + + out.extend( + [layout_var_assign, usm_typ_var_assign, device_typ_var_assign] + ) + + alloc_call = ir.Expr.call( + attr_var, + [size_var, typ_var, layout_var, device_typ_var, usm_typ_var], + (), + loc, + ) if calltypes: cac = typemap[attr_var.name].get_call_type( - typingctx, [size_typ, types.functions.NumberClass(dtype)], {} + typingctx, + [ + typemap[x.name] + for x in [ + size_var, + typ_var, + layout_var, + device_typ_var, + usm_typ_var, + ] + ], + {}, ) # By default, all calls to "empty" are typed as returning a standard # NumPy ndarray. If we are allocating a ndarray subclass here then From 5a9ac14d7b5047ce884927df4826a049f61e19fd Mon Sep 17 00:00:00 2001 From: Diptorup Deb Date: Sat, 4 Mar 2023 15:17:42 -0600 Subject: [PATCH 14/21] Update the blackscholes example to use dpjit. 
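
The rewritten kernel computes the cumulative normal distribution inline
instead of through the vectorized cndf2 helper. As a note on the algebra:
with z = 2 * t * sigma^2 and y = 1 / sqrt(z), the term
w1 = (log(S / K) + r * t + 0.25 * z) * y equals d1 / sqrt(2), so
0.5 + 0.5 * erf(w1) evaluates the standard normal CDF N(d1) of the
Black-Scholes formula.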
--- numba_dpex/examples/blacksholes_njit.py | 85 +++++++++++++------------ 1 file changed, 46 insertions(+), 39 deletions(-) diff --git a/numba_dpex/examples/blacksholes_njit.py b/numba_dpex/examples/blacksholes_njit.py index ad7e602c18..375f17e044 100644 --- a/numba_dpex/examples/blacksholes_njit.py +++ b/numba_dpex/examples/blacksholes_njit.py @@ -6,19 +6,20 @@ import math import time -import dpctl +import dpctl.tensor as dpt +import dpnp import numba -import numpy as np +import numba_dpex as dpex -@numba.vectorize(nopython=True) -def cndf2(inp): - out = 0.5 + 0.5 * math.erf((math.sqrt(2.0) / 2.0) * inp) - return out +# @numba.vectorize(nopython=True) +# def cndf2(inp): +# out = 0.5 + 0.5 * math.erf((math.sqrt(2.0) / 2.0) * inp) +# return out -@numba.njit(parallel=True, fastmath=True) -def blackscholes(sptprice, strike, rate, volatility, timev): +@dpex.dpjit +def blackscholes(sptprice, strike, timev, rate, volatility): """ A simple implementation of the Black-Scholes formula using the automatic offload feature of numba_dpex. In this example, each NumPy array @@ -26,26 +27,46 @@ def blackscholes(sptprice, strike, rate, volatility, timev): generate a single SYCL kernel. The kernel is automatically offloaded to the device specified where the function is invoked. """ - logterm = np.log(sptprice / strike) - powterm = 0.5 * volatility * volatility - den = volatility * np.sqrt(timev) - d1 = (((rate + powterm) * timev) + logterm) / den - d2 = d1 - den - NofXd1 = cndf2(d1) - NofXd2 = cndf2(d2) - futureValue = strike * np.exp(-rate * timev) - c1 = futureValue * NofXd2 - call = sptprice * NofXd1 - c1 - put = call - futureValue + sptprice + NofXd1 = dpnp.empty_like(sptprice) + NofXd2 = dpnp.empty_like(sptprice) + + a = dpnp.log(sptprice / strike) + b = timev * -rate + z = timev * volatility * volatility * 2 + c = 0.25 * z + y = dpnp.true_divide(1.0, dpnp.sqrt(z)) + w1 = (a - b + c) * y + w2 = (a - b - c) * y + + for idx in numba.prange(sptprice.shape[0]): + NofXd1[idx] = 0.5 + 0.5 * math.erf(w1[idx]) + NofXd2[idx] = 0.5 + 0.5 * math.erf(w2[idx]) + + futureValue = strike * dpnp.exp(b) + call = sptprice * NofXd1 - futureValue * NofXd2 + put = call - sptprice + futureValue return put +@dpex.dpjit +def init_initStrike(size, initStrike): + for idx in numba.prange(initStrike.size): + initStrike[idx] = 40 + (initStrike[idx] + 1.0) / size + return initStrike + + def run(iterations): - sptprice = np.full((iterations,), 42.0) - initStrike = 40 + (np.arange(iterations) + 1.0) / iterations - rate = np.full((iterations,), 0.5) - volatility = np.full((iterations,), 0.2) - timev = np.full((iterations,), 0.5) + dpt_sptprice = dpt.full((iterations,), 42.0) + dpt_range_arr = dpt.arange(iterations) + dpt_full_arr_05 = dpt.full((iterations,), 0.5) + dpt_volatility = dpt.full((iterations,), 0.2) + + sptprice = dpnp.ndarray(shape=dpt_sptprice.shape, buffer=dpt_sptprice) + rate = dpnp.ndarray(shape=dpt_full_arr_05.shape, buffer=dpt_full_arr_05) + volatility = dpnp.ndarray(shape=dpt_volatility.shape, buffer=dpt_volatility) + timev = dpnp.ndarray(shape=dpt_full_arr_05.shape, buffer=dpt_full_arr_05) + initStrike = dpnp.ndarray(shape=dpt_range_arr.shape, buffer=dpt_range_arr) + initStrike = init_initStrike(iterations, initStrike) t1 = time.time() put = blackscholes(sptprice, initStrike, rate, volatility, timev) @@ -55,21 +76,7 @@ def run(iterations): def main(): - parser = argparse.ArgumentParser(description="Black-Scholes") - parser.add_argument("--iter", dest="iter", type=int, default=10) - args = parser.parse_args() - iter 
= args.iter
-
-    # Use the environment variable SYCL_DEVICE_FILTER to change the default device.
-    # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter.
-    device = dpctl.select_default_device()
-    print("Using device ...")
-    device.print_device_info()
-
-    with dpctl.device_context(device):
-        run(iter)
-
-    print("Done...")
+    run(10)


 if __name__ == "__main__":

From b763d45ef4fe5b70fb93a1f9786504d08d483c2e Mon Sep 17 00:00:00 2001
From: Diptorup Deb
Date: Sat, 4 Mar 2023 18:43:23 -0600
Subject: [PATCH 15/21] Silence performance warnings.

---
 numba_dpex/_patches.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/numba_dpex/_patches.py b/numba_dpex/_patches.py
index 0c74df084e..cb996a1a9f 100644
--- a/numba_dpex/_patches.py
+++ b/numba_dpex/_patches.py
@@ -258,13 +258,11 @@ def _empty_nd_impl(context, builder, arrtype, shapes):
         )
         from numba_dpex.decorators import dpjit

-        numba_config.DISABLE_PERFORMANCE_WARNINGS = 0
         op = dpjit(_call_usm_allocator)
         fnop = context.typing_context.resolve_value_type(op)
         # The _call_usm_allocator function will be compiled and added to registry
         # when the get_call_type function is invoked.
         fnop.get_call_type(context.typing_context, sig.args, {})
-        numba_config.DISABLE_PERFORMANCE_WARNINGS = 1
         eqfn = context.get_function(fnop, sig)
         meminfo = eqfn(builder, args)
     else:
@@ -309,11 +307,17 @@ def impl(cls, allocsize, usm_type, device):
     return impl


+numba_config.DISABLE_PERFORMANCE_WARNINGS = 0
+
+
 def _call_usm_allocator(arrtype, size, usm_type, device):
     """Trampoline to call the intrinsic used for allocation"""
     return arrtype._usm_allocate(size, usm_type, device)


+numba_config.DISABLE_PERFORMANCE_WARNINGS = 1
+
+
 @intrinsic
 def intrin_usm_alloc(typingctx, allocsize, usm_type, device):
     """Intrinsic to call into the allocator for Array"""

From 745fbac38aa02f3e9bdb27e24ecfddb0ef9f8c05 Mon Sep 17 00:00:00 2001
From: Diptorup Deb
Date: Sat, 4 Mar 2023 18:52:40 -0600
Subject: [PATCH 16/21] Generate fully qualified filter strings for devices.

---
 numba_dpex/core/types/usm_ndarray_type.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/numba_dpex/core/types/usm_ndarray_type.py b/numba_dpex/core/types/usm_ndarray_type.py
index 6fa1ff79c3..3ae6052b38 100644
--- a/numba_dpex/core/types/usm_ndarray_type.py
+++ b/numba_dpex/core/types/usm_ndarray_type.py
@@ -60,7 +60,7 @@ def __init__(
                     "a SYCL filter selector"
                 )
             self.queue = dpctl.SyclQueue(device)
-            self.device = device
+            self.device = self.queue.sycl_device.filter_string
         elif queue is not None and device == "unknown":
             if not isinstance(queue, dpctl.SyclQueue):
                 raise TypeError(

From 22eda0deba9eb2103e57647b1b765817e139ef65 Mon Sep 17 00:00:00 2001
From: Diptorup Deb
Date: Sat, 4 Mar 2023 22:31:59 -0600
Subject: [PATCH 17/21] Add missing import for dpnp ufunc support.

---
 numba_dpex/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/numba_dpex/__init__.py b/numba_dpex/__init__.py
index 073f57a1ff..76abc05fff 100644
--- a/numba_dpex/__init__.py
+++ b/numba_dpex/__init__.py
@@ -92,6 +92,7 @@ def load_dpctl_sycl_interface():

 # Re-export all type names
 from numba_dpex.core.types import *  # noqa E402
+from numba_dpex.dpnp_iface import dpnpimpl  # noqa E402
 from numba_dpex.retarget import offload_to_sycl_device  # noqa E402

 if config.HAS_NON_HOST_DEVICE:

From cfb105b5239df35cc306d0f67c2c365e3655dbe6 Mon Sep 17 00:00:00 2001
From: "Wang, Mingjie1"
Date: Sat, 4 Mar 2023 23:00:19 -0600
Subject: [PATCH 18/21] Added support for dpnp.erf ufunc.
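For the erf entry there is no NumPy counterpart to clone ufunc metadata
from, so the diff below fills in nin/nout/nargs/types by hand and maps each
type signature directly to the OpenCL lowering from mathimpl. The following
is a minimal, self-contained sketch of that registration pattern; plain
Python stands in for Numba's ufunc_db and lowering machinery, and every
name in it is hypothetical:

    import math

    class UfuncEntry:
        # Stand-in for a ufunc object: a unary ufunc carries nin/nout/nargs
        # metadata plus one type-signature string per supported precision.
        def __init__(self, name, nin, nout, types):
            self.name = name
            self.nin = nin
            self.nout = nout
            self.nargs = nin + nout
            self.types = types

    def _erf_f32(x):  # stands in for the "f->f" lowering implementation
        return math.erf(x)

    def _erf_f64(x):  # stands in for the "d->d" lowering implementation
        return math.erf(x)

    erf = UfuncEntry("erf", nin=1, nout=1, types=["f->f", "d->d"])
    ufunc_db = {erf: {"f->f": _erf_f32, "d->d": _erf_f64}}

    # The lowering machinery picks an implementation by type signature:
    print(ufunc_db[erf]["d->d"](1.0))  # ~0.8427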
---
 numba_dpex/core/typing/dpnpdecl.py            |  1 +
 numba_dpex/dpnp_iface/dpnp_ufunc_db.py        | 45 ++++++++++++-----
 numba_dpex/ocl/_declare_function.py           | 49 +++++++++++++++++++
 numba_dpex/ocl/mathimpl.py                    |  2 +-
 numba_dpex/ocl/oclimpl.py                     | 38 +--------------
 .../test_dpnp_transcedental_functions.py      |  3 +-
 6 files changed, 88 insertions(+), 50 deletions(-)
 create mode 100644 numba_dpex/ocl/_declare_function.py

diff --git a/numba_dpex/core/typing/dpnpdecl.py b/numba_dpex/core/typing/dpnpdecl.py
index 8fb20ba242..7c8cf817b5 100644
--- a/numba_dpex/core/typing/dpnpdecl.py
+++ b/numba_dpex/core/typing/dpnpdecl.py
@@ -67,6 +67,7 @@ class DpnpRulesUnaryArrayOperator(NumpyRulesUnaryArrayOperator):
         "mod",
         "abs",
         "fabs",
+        "erf",
     ]

     _trigonometric_functions = [
diff --git a/numba_dpex/dpnp_iface/dpnp_ufunc_db.py b/numba_dpex/dpnp_iface/dpnp_ufunc_db.py
index 2b692843ce..6a6d3d6d50 100644
--- a/numba_dpex/dpnp_iface/dpnp_ufunc_db.py
+++ b/numba_dpex/dpnp_iface/dpnp_ufunc_db.py
@@ -5,9 +5,12 @@

 import dpnp
 import numpy as np
+from numba.core import types

 from numba_dpex.core.typing import dpnpdecl

+from ..ocl import mathimpl
+

 def get_ufuncs():
     """obtain a list of supported ufuncs in the db"""
@@ -27,14 +30,34 @@ def _fill_ufunc_db_with_dpnp_ufuncs(ufunc_db):
     # FIXME: add more docstring

     for ufuncop in dpnpdecl.supported_ufuncs:
-        op = getattr(dpnp, ufuncop)
-        npop = getattr(np, ufuncop)
-        op.nin = npop.nin
-        op.nout = npop.nout
-        op.nargs = npop.nargs
-        op.types = npop.types
-        op.is_dpnp_ufunc = True
-        ufunc_db.update({op: ufunc_db[npop]})
-        for key in list(ufunc_db[op].keys()):
-            if "FF->" in key or "DD->" in key or "F->" in key or "D->" in key:
-                ufunc_db[op].pop(key)
+        if ufuncop == "erf":
+            op = getattr(dpnp, "erf")
+            op.nin = 1
+            op.nout = 1
+            op.nargs = 2
+            op.types = ["f->f", "d->d"]
+            op.is_dpnp_ufunc = True
+
+            _unary_d_d = types.float64(types.float64)
+            _unary_f_f = types.float32(types.float32)
+            ufunc_db[op] = {
+                "f->f": mathimpl.lower_ocl_impl[("erf", (_unary_f_f))],
+                "d->d": mathimpl.lower_ocl_impl[("erf", (_unary_d_d))],
+            }
+        else:
+            op = getattr(dpnp, ufuncop)
+            npop = getattr(np, ufuncop)
+            op.nin = npop.nin
+            op.nout = npop.nout
+            op.nargs = npop.nargs
+            op.types = npop.types
+            op.is_dpnp_ufunc = True
+            ufunc_db.update({op: ufunc_db[npop]})
+            for key in list(ufunc_db[op].keys()):
+                if (
+                    "FF->" in key
+                    or "DD->" in key
+                    or "F->" in key
+                    or "D->" in key
+                ):
+                    ufunc_db[op].pop(key)
diff --git a/numba_dpex/ocl/_declare_function.py b/numba_dpex/ocl/_declare_function.py
new file mode 100644
index 0000000000..7fc2396e17
--- /dev/null
+++ b/numba_dpex/ocl/_declare_function.py
@@ -0,0 +1,49 @@
+# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+
+from llvmlite import ir as llvmir
+from numba.core import cgutils, types
+
+from numba_dpex.core.itanium_mangler import mangle_c
+
+# -----------------------------------------------------------------------------
+
+
+def _declare_function(context, builder, name, sig, cargs, mangler=mangle_c):
+    """Insert declaration for an OpenCL builtin function.
+    Uses the Itanium mangler.
+ + Args + ---- + context: target context + + builder: llvm builder + + name: str + symbol name + + sig: signature + function signature of the symbol being declared + + cargs: sequence of str + C type names for the arguments + + mangler: a mangler function + function to use to mangle the symbol + + """ + mod = builder.module + if sig.return_type == types.void: + llretty = llvmir.VoidType() + else: + llretty = context.get_value_type(sig.return_type) + llargs = [context.get_value_type(t) for t in sig.args] + fnty = llvmir.FunctionType(llretty, llargs) + mangled = mangler(name, cargs) + fn = cgutils.get_or_insert_function(mod, fnty, mangled) + from numba_dpex import kernel_target + + fn.calling_convention = kernel_target.CC_SPIR_FUNC + return fn diff --git a/numba_dpex/ocl/mathimpl.py b/numba_dpex/ocl/mathimpl.py index b67ffe2128..5c1a37ca77 100644 --- a/numba_dpex/ocl/mathimpl.py +++ b/numba_dpex/ocl/mathimpl.py @@ -11,7 +11,7 @@ from numba_dpex.core.itanium_mangler import mangle -from .oclimpl import _declare_function +from ._declare_function import _declare_function registry = Registry() lower = registry.lower diff --git a/numba_dpex/ocl/oclimpl.py b/numba_dpex/ocl/oclimpl.py index 80f81a7c97..ff84b5bb93 100644 --- a/numba_dpex/ocl/oclimpl.py +++ b/numba_dpex/ocl/oclimpl.py @@ -14,12 +14,12 @@ from numba_dpex import config, kernel_target from numba_dpex.core.codegen import SPIR_DATA_LAYOUT -from numba_dpex.core.itanium_mangler import mangle, mangle_c, mangle_type from numba_dpex.core.types import Array from numba_dpex.ocl.atomics import atomic_helper from numba_dpex.utils import address_space from . import stubs +from ._declare_function import _declare_function registry = Registry() lower = registry.lower @@ -30,42 +30,6 @@ # ----------------------------------------------------------------------------- -def _declare_function(context, builder, name, sig, cargs, mangler=mangle_c): - """Insert declaration for a opencl builtin function. - Uses the Itanium mangler. 
- - Args - ---- - context: target context - - builder: llvm builder - - name: str - symbol name - - sig: signature - function signature of the symbol being declared - - cargs: sequence of str - C type names for the arguments - - mangler: a mangler function - function to use to mangle the symbol - - """ - mod = builder.module - if sig.return_type == types.void: - llretty = llvmir.VoidType() - else: - llretty = context.get_value_type(sig.return_type) - llargs = [context.get_value_type(t) for t in sig.args] - fnty = llvmir.FunctionType(llretty, llargs) - mangled = mangler(name, cargs) - fn = cgutils.get_or_insert_function(mod, fnty, mangled) - fn.calling_convention = kernel_target.CC_SPIR_FUNC - return fn - - @lower(stubs.get_global_id, types.uint32) def get_global_id_impl(context, builder, sig, args): [dim] = args diff --git a/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_transcedental_functions.py b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_transcedental_functions.py index 8685b14e84..f43d59752f 100644 --- a/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_transcedental_functions.py +++ b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_transcedental_functions.py @@ -39,7 +39,7 @@ def binary_op(request): list_of_unary_ops = [ "negative", - "abs", + # "abs", "absolute", "fabs", "sign", @@ -58,6 +58,7 @@ def binary_op(request): "floor", "ceil", "trunc", + "erf", ] From 2e087a16382ad7da96e65cf97949d76b384f2a7f Mon Sep 17 00:00:00 2001 From: "Wang, Mingjie1" Date: Sat, 4 Mar 2023 23:18:51 -0600 Subject: [PATCH 19/21] Updated blackscholes example to use dpnp.erf. --- numba_dpex/examples/blacksholes_njit.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/numba_dpex/examples/blacksholes_njit.py b/numba_dpex/examples/blacksholes_njit.py index 375f17e044..9095f79987 100644 --- a/numba_dpex/examples/blacksholes_njit.py +++ b/numba_dpex/examples/blacksholes_njit.py @@ -27,8 +27,6 @@ def blackscholes(sptprice, strike, timev, rate, volatility): generate a single SYCL kernel. The kernel is automatically offloaded to the device specified where the function is invoked. 
""" - NofXd1 = dpnp.empty_like(sptprice) - NofXd2 = dpnp.empty_like(sptprice) a = dpnp.log(sptprice / strike) b = timev * -rate @@ -38,9 +36,8 @@ def blackscholes(sptprice, strike, timev, rate, volatility): w1 = (a - b + c) * y w2 = (a - b - c) * y - for idx in numba.prange(sptprice.shape[0]): - NofXd1[idx] = 0.5 + 0.5 * math.erf(w1[idx]) - NofXd2[idx] = 0.5 + 0.5 * math.erf(w2[idx]) + NofXd1 = 0.5 + 0.5 * dpnp.erf(w1) + NofXd2 = 0.5 + 0.5 * dpnp.erf(w2) futureValue = strike * dpnp.exp(b) call = sptprice * NofXd1 - futureValue * NofXd2 @@ -71,7 +68,8 @@ def run(iterations): t1 = time.time() put = blackscholes(sptprice, initStrike, rate, volatility, timev) t = time.time() - t1 - print("checksum: ", sum(put)) + # print("checksum: ", sum(put)) + print(put) print("SELFTIMED ", t) From 6ec07bce11b5e519fa404fe05a1cdc3b5ead583d Mon Sep 17 00:00:00 2001 From: "akmkhale@ansatnuc04" Date: Sun, 5 Mar 2023 16:33:29 -0600 Subject: [PATCH 20/21] prange alias in __init__.py and new test_prange.py --- numba_dpex/__init__.py | 1 + numba_dpex/tests/test_prange.py | 337 ++++++++++++++++++-------------- 2 files changed, 192 insertions(+), 146 deletions(-) diff --git a/numba_dpex/__init__.py b/numba_dpex/__init__.py index 76abc05fff..4da923e6b6 100644 --- a/numba_dpex/__init__.py +++ b/numba_dpex/__init__.py @@ -74,6 +74,7 @@ def load_dpctl_sycl_interface(): f"dpctl={dpctl_version} may cause unexpected behavior" ) +from numba import prange # noqa E402 import numba_dpex.core.dpjit_dispatcher # noqa E402 import numba_dpex.core.offload_dispatcher # noqa E402 diff --git a/numba_dpex/tests/test_prange.py b/numba_dpex/tests/test_prange.py index d62ac0039e..9b669cbc94 100644 --- a/numba_dpex/tests/test_prange.py +++ b/numba_dpex/tests/test_prange.py @@ -5,151 +5,196 @@ # SPDX-License-Identifier: Apache-2.0 import dpctl +import dpnp import numpy as np import pytest -from numba import njit, prange - -from numba_dpex.tests._helper import assert_auto_offloading, skip_no_opencl_gpu - - -class TestPrange: - def test_one_prange(self): - @njit - def f(a, b): - for i in prange(4): - b[i, 0] = a[i, 0] * 10 - - m = 8 - n = 8 - a = np.ones((m, n)) - b = np.ones((m, n)) - - device = dpctl.select_default_device() - with assert_auto_offloading(), dpctl.device_context(device): - f(a, b) - - for i in range(4): - assert b[i, 0] == a[i, 0] * 10 - - def test_nested_prange(self): - @njit - def f(a, b): - # dimensions must be provided as scalar - m, n = a.shape - for i in prange(m): - for j in prange(n): - b[i, j] = a[i, j] * 10 - - m = 8 - n = 8 - a = np.ones((m, n)) - b = np.ones((m, n)) - - device = dpctl.select_default_device() - with assert_auto_offloading(), dpctl.device_context(device): - f(a, b) - - assert np.all(b == 10) - - @pytest.mark.skip - def test_multiple_prange(self): - @njit - def f(a, b): - # dimensions must be provided as scalar - m, n = a.shape - for i in prange(m): - val = 10 - for j in prange(n): - b[i, j] = a[i, j] * val - - for i in prange(m): - for j in prange(n): - a[i, j] = a[i, j] * 10 - - m = 8 - n = 8 - a = np.ones((m, n)) - b = np.ones((m, n)) - - device = dpctl.select_default_device() - with assert_auto_offloading(parfor_offloaded=2), dpctl.device_context( - device - ): - f(a, b) - - assert np.all(b == 10) - assert np.all(a == 10) - - def test_three_prange(self): - @njit - def f(a, b): - # dimensions must be provided as scalar - m, n, o = a.shape - for i in prange(m): - val = 10 - for j in prange(n): - constant = 2 - for k in prange(o): - b[i, j, k] = a[i, j, k] * (val + constant) - - m = 8 - n = 8 - o = 8 
- a = np.ones((m, n, o)) - b = np.ones((m, n, o)) - - device = dpctl.select_default_device() - with assert_auto_offloading(parfor_offloaded=1), dpctl.device_context( - device - ): - f(a, b) - - assert np.all(b == 12) - - @pytest.mark.skip(reason="numba-dpex issue 110") - def test_two_consequent_prange(self): - def prange_example(): - n = 10 - a = np.ones((n), dtype=np.float64) - b = np.ones((n), dtype=np.float64) - c = np.ones((n), dtype=np.float64) - for i in prange(n // 2): - a[i] = b[i] + c[i] - - return a - - jitted = njit(prange_example) - - device = dpctl.select_default_device() - with assert_auto_offloading(parfor_offloaded=2), dpctl.device_context( - device - ): - jitted_res = jitted() - - res = prange_example() - - np.testing.assert_equal(res, jitted_res) - - @pytest.mark.skip(reason="NRT required but not enabled") - def test_2d_arrays(self): - def prange_example(): - n = 10 - a = np.ones((n, n), dtype=np.float64) - b = np.ones((n, n), dtype=np.float64) - c = np.ones((n, n), dtype=np.float64) - for i in prange(n // 2): - a[i] = b[i] + c[i] - - return a - - jitted = njit(prange_example) - - device = dpctl.select_default_device() - with assert_auto_offloading(parfor_offloaded=2), dpctl.device_context( - device - ): - jitted_res = jitted() - - res = prange_example() - - np.testing.assert_equal(res, jitted_res) +from numba import njit + +from numba_dpex import dpjit, prange + + +def test_one_prange_mul(): + @dpjit + def f(a, b): + for i in prange(4): + b[i, 0] = a[i, 0] * 10 + return + + device = dpctl.select_default_device() + + m = 8 + n = 8 + a = dpnp.ones((m, n), device=device) + b = dpnp.ones((m, n), device=device) + + f(a, b) + + for i in range(4): + assert b[i, 0] == a[i, 0] * 10 + + +@pytest.mark.skip(reason="dpnp.add() doesn't support variable + scalar.") +def test_one_prange_add_scalar(): + @dpjit + def f(a, b): + for i in prange(4): + b[i, 0] = a[i, 0] + 10 + return + + device = dpctl.select_default_device() + + m = 8 + n = 8 + a = dpnp.ones((m, n), device=device) + b = dpnp.ones((m, n), device=device) + + f(a, b) + + for i in range(4): + assert b[i, 0] == a[i, 0] + 10 + + +@pytest.mark.skip(reason="[i,:] like indexing is not supported yet.") +def test_prange_2d_array(): + device = dpctl.select_default_device() + n = 10 + + @dpjit + def f(a, b, c): + for i in prange(n): + c[i, :] = a[i, :] + b[i, :] + return + + a = dpnp.ones((n, n), dtype=dpnp.int32, device=device) + b = dpnp.ones((n, n), dtype=dpnp.int32, device=device) + c = dpnp.ones((n, n), dtype=dpnp.int32, device=device) + + f(a, b, c) + + np.testing.assert_equal(c.asnumpy(), np.ones((n, n), dtype=np.int32) * 2) + + +@pytest.mark.skip(reason="Nested prange is not supported yet.") +def test_nested_prange(): + @dpjit + def f(a, b): + # dimensions must be provided as scalar + m, n = a.shape + for i in prange(m): + for j in prange(n): + b[i, j] = a[i, j] * 10 + return + + device = dpctl.select_default_device() + + m = 8 + n = 8 + a = dpnp.ones((m, n), device=device) + b = dpnp.ones((m, n), device=device) + + f(a, b) + + assert np.all(b.asnumpy() == 10) + + +@pytest.mark.skip(reason="Nested prange is not supported yet.") +def test_multiple_prange(): + @dpjit + def f(a, b): + # dimensions must be provided as scalar + m, n = a.shape + for i in prange(m): + val = 10 + for j in prange(n): + b[i, j] = a[i, j] * val + + for i in prange(m): + for j in prange(n): + a[i, j] = a[i, j] * 10 + return + + device = dpctl.select_default_device() + + m = 8 + n = 8 + a = dpnp.ones((m, n), device=device) + b = dpnp.ones((m, n), 
device=device) + + f(a, b) + + assert np.all(b.asnumpy() == 10) + assert np.all(a.asnumpy() == 10) + + +@pytest.mark.skip(reason="Nested prange is not supported yet.") +def test_three_prange(): + @dpjit + def f(a, b): + # dimensions must be provided as scalar + m, n, o = a.shape + for i in prange(m): + val = 10 + for j in prange(n): + constant = 2 + for k in prange(o): + b[i, j, k] = a[i, j, k] * (val + constant) + return + + device = dpctl.select_default_device() + + m = 8 + n = 8 + o = 8 + a = dpnp.ones((m, n, o), device=device) + b = dpnp.ones((m, n, o), device=device) + + f(a, b) + + assert np.all(b.asnumpy() == 12) + + +def test_two_consecutive_prange(): + @dpjit + def prange_example(a, b, c, d): + for i in prange(n): + c[i] = a[i] + b[i] + for i in prange(n): + d[i] = a[i] - b[i] + return + + device = dpctl.select_default_device() + + n = 10 + a = dpnp.ones((n), dtype=dpnp.float64, device=device) + b = dpnp.ones((n), dtype=dpnp.float64, device=device) + c = dpnp.zeros((n), dtype=dpnp.float64, device=device) + d = dpnp.zeros((n), dtype=dpnp.float64, device=device) + + prange_example(a, b, c, d) + + np.testing.assert_equal(c.asnumpy(), np.ones((n), dtype=np.float64) * 2) + np.testing.assert_equal(d.asnumpy(), np.zeros((n), dtype=np.float64)) + + +@pytest.mark.skip(reason="[i,:] like indexing is not supported yet.") +def test_two_consecutive_prange_2d(): + @dpjit + def prange_example(a, b, c, d): + for i in prange(n): + c[i, :] = a[i, :] + b[i, :] + for i in prange(n): + d[i, :] = a[i, :] - b[i, :] + return + + device = dpctl.select_default_device() + + n = 10 + a = dpnp.ones((n, n), dtype=dpnp.int32, device=device) + b = dpnp.ones((n, n), dtype=dpnp.int32, device=device) + c = dpnp.ones((n, n), dtype=dpnp.int32, device=device) + d = dpnp.ones((n, n), dtype=dpnp.int32, device=device) + + prange_example(a, b, c, d) + + np.testing.assert_equal(c.asnumpy(), np.ones((n, n), dtype=np.int32) * 2) + np.testing.assert_equal(d.asnumpy(), np.zeros((n, n), dtype=np.int32)) From 0a343be090bdbdc12f2f43e115a8fde37529de7e Mon Sep 17 00:00:00 2001 From: Diptorup Deb Date: Sun, 5 Mar 2023 17:47:44 -0600 Subject: [PATCH 21/21] Improvements to CFD legalization pass. --- .../core/passes/parfor_legalize_cfd_pass.py | 116 +++++++++++------- numba_dpex/core/types/usm_ndarray_type.py | 4 +- 2 files changed, 72 insertions(+), 48 deletions(-) diff --git a/numba_dpex/core/passes/parfor_legalize_cfd_pass.py b/numba_dpex/core/passes/parfor_legalize_cfd_pass.py index bf9b5c61b0..264e62f181 100644 --- a/numba_dpex/core/passes/parfor_legalize_cfd_pass.py +++ b/numba_dpex/core/passes/parfor_legalize_cfd_pass.py @@ -12,13 +12,13 @@ from .parfor import ( Parfor, - ParforPassStates, + ParforDiagnostics, get_parfor_outputs, get_parfor_params, ) -class ParforLegalizeCFDPassImpl(ParforPassStates): +class ParforLegalizeCFDPassImpl: """Legalizes the compute-follows-data based device attribute for parfor nodes. 
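Compute follows data (CFD) is the rule being legalized here: an operation
executes on the device on which its array operands were allocated, and its
result inherits that placement; when usm types differ, the result takes the
highest-ranked one, matching the device(3) > shared(2) > host(1) ordering of
the inputUsmTypeStrToInt table below. A hedged, runnable illustration of the
rule at the dpnp level, assuming a dpnp build that supports the usm_type
keyword and a working SYCL device:

    import dpnp

    a = dpnp.ones(10, usm_type="device")
    b = dpnp.ones(10, usm_type="host")

    # The result follows the data: it is allocated on the queue derived
    # from the operands, and its usm_type is coerced from theirs.
    c = a + b
    print(c.usm_type)     # expected: "device"
    print(c.sycl_device)  # the device a and b were allocated on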
@@ -46,6 +46,13 @@ class ParforLegalizeCFDPassImpl(ParforPassStates): inputUsmTypeStrToInt = {"device": 3, "shared": 2, "host": 1} inputUsmTypeIntToStr = {3: "device", 2: "shared", 1: "host"} + def __init__(self, state) -> None: + self._state = state + self._cfd_updated_values = set() + self._seen_array_set = set() + diagnostics = ParforDiagnostics() + self.nested_fusion_info = diagnostics.nested_fusion_info + def _check_if_dpnp_empty_call(self, call_stmt, block): func_def = block.find_variable_assignment(call_stmt.name) if not ( @@ -69,9 +76,9 @@ def _check_cfd_parfor_params(self, parfor, checklist): usmTypes = [] for para in checklist: - if not isinstance(self.typemap[para], DpnpNdArray): + if not isinstance(self._state.typemap[para], DpnpNdArray): continue - argty = self.typemap[para] + argty = self._state.typemap[para] deviceTypes.add(argty.device) try: usmTypes.append( @@ -100,41 +107,49 @@ def _check_cfd_parfor_params(self, parfor, checklist): def _legalize_dpnp_empty_call(self, required_arrty, call_stmt, block): args = call_stmt.args - sigargs = self.calltypes[call_stmt].args + sigargs = self._state.calltypes[call_stmt].args sigargs_new = list(sigargs) - # Update the RHS usm_type, device, dtype attributes + # Update the RHS usm_type, device, attributes for idx, arg in enumerate(args): argdef = block.find_variable_assignment(arg.name) if argdef: attribute = argdef.target.name if "usm_type" in attribute: - self.typemap.update( + self._state.typemap.update( {attribute: types.literal(required_arrty.usm_type)} ) sigargs_new[idx] = types.literal(required_arrty.usm_type) elif "device" in attribute: - self.typemap.update( + self._state.typemap.update( {attribute: types.literal(required_arrty.device)} ) sigargs_new[idx] = types.literal(required_arrty.device) sigargs = tuple(sigargs_new) - new_sig = self.typingctx.resolve_function_type( - self.typemap[call_stmt.func.name], sigargs, {} + new_sig = self._state.typingctx.resolve_function_type( + self._state.typemap[call_stmt.func.name], sigargs, {} ) - self.calltypes.update({call_stmt: new_sig}) + self._state.calltypes.update({call_stmt: new_sig}) def _legalize_array_attrs( self, arrattr, legalized_device_ty, legalized_usm_ty ): modified = False - if self.typemap[arrattr].device != legalized_device_ty: - self.typemap[arrattr].device = legalized_device_ty + updated_device = None + updated_usm_ty = None + + if self._state.typemap[arrattr].device != legalized_device_ty: + updated_device = legalized_device_ty modified = True - if self.typemap[arrattr].usm_type != legalized_usm_ty: - self.typemap[arrattr].usm_type = legalized_usm_ty + if self._state.typemap[arrattr].usm_type != legalized_usm_ty: + updated_usm_ty = legalized_usm_ty modified = True + if modified: + ty = self._state.typemap[arrattr] + new_ty = ty.copy(device=updated_device, usm_type=updated_usm_ty) + self._state.typemap.update({arrattr: new_ty}) + return modified def _legalize_parfor_params(self, parfor): @@ -150,14 +165,14 @@ def _legalize_parfor_params(self, parfor): """ if parfor.params is None: return - outputParams = get_parfor_outputs(parfor, parfor.params) + outputParams = get_parfor_outputs(parfor, parfor.params) checklist = sorted(list(set(parfor.params) - set(outputParams))) # Check if any output param was defined outside the parfor for para in outputParams: if ( - isinstance(self.typemap[para], DpnpNdArray) + isinstance(self._state.typemap[para], DpnpNdArray) and para in self._seen_array_set ): checklist.append(para) @@ -168,7 +183,7 @@ def _legalize_parfor_params(self, 
parfor):
         # Update any outputs that are generated in the parfor
         for para in outputParams:
-            if not isinstance(self.typemap[para], DpnpNdArray):
+            if not isinstance(self._state.typemap[para], DpnpNdArray):
                 continue
             # Legalize LHS. Skip if we already updated the type before and no
             # further legalization is needed.
@@ -208,13 +223,7 @@ def _legalize_cfd_parfor_blocks(self, parfor):

     def _legalize_expr(self, stmt, lhs, lhsty, parent_block, inparfor=False):
         rhs = stmt.value
-        if rhs.op != "call":
-            # The assumption is all other expr types are by now
-            # either parfors or are benign like setattr, getattr,
-            # getitem, etc. and we do not need to do CFD
-            # legalization.
-            self._seen_array_set.add(lhs)
-        else:
+        if rhs.op == "call":
             if self._check_if_dpnp_empty_call(rhs.func, parent_block):
                 if inparfor and lhs in self._cfd_updated_values:
                     self._legalize_dpnp_empty_call(lhsty, rhs, parent_block)
@@ -229,11 +238,30 @@ def _legalize_expr(self, stmt, lhs, lhsty, parent_block, inparfor=False):
                     "Compute follows data is not currently "
                     "supported for function calls."
                 )
+        elif rhs.op == "cast" and rhs.value.name in self._cfd_updated_values:
+            device_ty = self._state.typemap[rhs.value.name].device
+            usm_ty = self._state.typemap[rhs.value.name].usm_type
+            if self._legalize_array_attrs(
+                arrattr=lhs,
+                legalized_device_ty=device_ty,
+                legalized_usm_ty=usm_ty,
+            ):
+                self._cfd_updated_values.add(lhs)
+            else:
+                try:
+                    self._cfd_updated_values.remove(lhs)
+                except KeyError:
+                    pass
+        else:
+            # The assumption is that all other expr types are by now either
+            # parfors or instructions like setattr, getattr, getitem, etc.
+            # that do not need CFD legalization.
+            self._seen_array_set.add(lhs)

     def _legalize_stmt(self, stmt, parent_block, inparfor=False):
         if isinstance(stmt, ir.Assign):
             lhs = stmt.target.name
-            lhsty = self.typemap[lhs]
+            lhsty = self._state.typemap[lhs]
             if isinstance(lhsty, DpnpNdArray):
                 if isinstance(stmt.value, ir.Arg):
                     self._seen_array_set.add(lhs)
@@ -243,28 +271,35 @@ def _legalize_stmt(self, stmt, parent_block, inparfor=False):
                 )
         elif isinstance(stmt, Parfor):
             self._legalize_cfd_parfor_blocks(stmt)
+        elif isinstance(stmt, ir.Return):
+            # Check if the return value is a DpnpNdArray and was changed by
+            # compute-follows-data legalization.
+            retty = self._state.typemap[stmt.value.name]
+            if (
+                isinstance(retty, DpnpNdArray)
+                and stmt.value.name in self._cfd_updated_values
+                and self._state.return_type != retty
+            ):
+                self._state.return_type = retty

     def run(self):
         # The get_parfor_params needs to be run here to initialize the parfor
         # nodes prior to using them.
         _, _ = get_parfor_params(
-            self.func_ir.blocks,
-            self.options.fusion,
+            self._state.func_ir.blocks,
+            self._state.flags.auto_parallel,
             self.nested_fusion_info,
         )

-        self._cfd_updated_values = set()
-        self._seen_array_set = set()
-
         # FIXME: Traversing the blocks in topological order is not sufficient.
         # The traversal should be converted to a backward data flow traversal of
         # the CFG. The algorithm needs to then become a fixed-point work list
         # algorithm.
-        topo_order = find_topo_order(self.func_ir.blocks)
+        topo_order = find_topo_order(self._state.func_ir.blocks)

         # Apply CFD legalization to parfor nodes and dpnp_empty calls
         for label in topo_order:
-            block = self.func_ir.blocks[label]
+            block = self._state.func_ir.blocks[label]
             for stmt in block.body:
                 self._legalize_stmt(stmt, block)

@@ -282,18 +317,7 @@ def run_pass(self, state):
         """
         # Ensure we have an IR and type information.
assert state.func_ir - parfor_pass = ParforLegalizeCFDPassImpl( - state.func_ir, - state.typemap, - state.calltypes, - state.return_type, - state.typingctx, - state.targetctx, - state.flags.auto_parallel, - state.flags, - state.metadata, - state.parfor_diagnostics, - ) - parfor_pass.run() + cfd_legalizer = ParforLegalizeCFDPassImpl(state) + cfd_legalizer.run() return True diff --git a/numba_dpex/core/types/usm_ndarray_type.py b/numba_dpex/core/types/usm_ndarray_type.py index 3ae6052b38..c9eb97c40d 100644 --- a/numba_dpex/core/types/usm_ndarray_type.py +++ b/numba_dpex/core/types/usm_ndarray_type.py @@ -136,7 +136,7 @@ def copy( device = self.device if usm_type is None: usm_type = self.usm_type - return USMNdArray( + return type(self)( dtype=dtype, ndim=ndim, layout=layout, @@ -169,7 +169,7 @@ def unify(self, typingctx, other): layout = "A" readonly = not (self.mutable and other.mutable) aligned = self.aligned and other.aligned - return USMNdArray( + return type(self)( dtype=self.dtype, ndim=self.ndim, layout=layout,
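The hunks above change copy() and unify() to construct type(self)(...)
rather than hard-coding USMNdArray, so a subclass such as DpnpNdArray keeps
its identity when its type is copied or unified. A minimal sketch of why
that matters, using hypothetical stand-in classes rather than the real
Numba types:

    class USMNdArray:
        def __init__(self, ndim):
            self.ndim = ndim

        def copy(self, ndim=None):
            # type(self) preserves the dynamic class; returning
            # USMNdArray(...) here would silently downcast subclasses.
            return type(self)(self.ndim if ndim is None else ndim)

    class DpnpNdArray(USMNdArray):
        pass

    arr = DpnpNdArray(2)
    print(type(arr.copy(ndim=1)).__name__)  # DpnpNdArray, not USMNdArray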