diff --git a/numba_dpex/__init__.py b/numba_dpex/__init__.py index 073f57a1ff..4da923e6b6 100644 --- a/numba_dpex/__init__.py +++ b/numba_dpex/__init__.py @@ -74,6 +74,7 @@ def load_dpctl_sycl_interface(): f"dpctl={dpctl_version} may cause unexpected behavior" ) +from numba import prange # noqa E402 import numba_dpex.core.dpjit_dispatcher # noqa E402 import numba_dpex.core.offload_dispatcher # noqa E402 @@ -92,6 +93,7 @@ def load_dpctl_sycl_interface(): # Re-export all type names from numba_dpex.core.types import * # noqa E402 +from numba_dpex.dpnp_iface import dpnpimpl # noqa E402 from numba_dpex.retarget import offload_to_sycl_device # noqa E402 if config.HAS_NON_HOST_DEVICE: diff --git a/numba_dpex/_patches.py b/numba_dpex/_patches.py index 0c74df084e..cb996a1a9f 100644 --- a/numba_dpex/_patches.py +++ b/numba_dpex/_patches.py @@ -258,13 +258,11 @@ def _empty_nd_impl(context, builder, arrtype, shapes): ) from numba_dpex.decorators import dpjit - numba_config.DISABLE_PERFORMANCE_WARNINGS = 0 op = dpjit(_call_usm_allocator) fnop = context.typing_context.resolve_value_type(op) # The _call_usm_allocator function will be compiled and added to registry # when the get_call_type function is invoked. fnop.get_call_type(context.typing_context, sig.args, {}) - numba_config.DISABLE_PERFORMANCE_WARNINGS = 1 eqfn = context.get_function(fnop, sig) meminfo = eqfn(builder, args) else: @@ -309,11 +307,17 @@ def impl(cls, allocsize, usm_type, device): return impl +numba_config.DISABLE_PERFORMANCE_WARNINGS = 0 + + def _call_usm_allocator(arrtype, size, usm_type, device): """Trampoline to call the intrinsic used for allocation""" return arrtype._usm_allocate(size, usm_type, device) +numba_config.DISABLE_PERFORMANCE_WARNINGS = 1 + + @intrinsic def intrin_usm_alloc(typingctx, allocsize, usm_type, device): """Intrinsic to call into the allocator for Array""" diff --git a/numba_dpex/core/exceptions.py b/numba_dpex/core/exceptions.py index 1d75dd6f40..55b3f7e2de 100644 --- a/numba_dpex/core/exceptions.py +++ b/numba_dpex/core/exceptions.py @@ -221,7 +221,7 @@ def __init__( f"Arguments {ndarray_args} are non-usm arrays, " f"and arguments {usmarray_args} are usm arrays." 
) - elif usmarray_argnum_list: + elif usmarray_argnum_list is not None: usmarray_args = ",".join([str(i) for i in usmarray_argnum_list]) self.message = ( f'Execution queue for kernel "{kernel_name}" could ' @@ -433,3 +433,13 @@ def __init__(self, kernel_name, argtypes) -> None: ) super().__init__(self.message) + + +class UnsupportedParforError(Exception): + """Exception raised when a parfor node could not be lowered by Numba-dpex""" + + def __init__(self, extra_msg=None) -> None: + self.message = "Expression cannot be offloaded" + if extra_msg: + self.message += " due to " + extra_msg + super().__init__(self.message) diff --git a/numba_dpex/core/passes/__init__.py b/numba_dpex/core/passes/__init__.py index 00fec5f515..5d50252a3b 100644 --- a/numba_dpex/core/passes/__init__.py +++ b/numba_dpex/core/passes/__init__.py @@ -1,3 +1,27 @@ # SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 + +from .parfor_legalize_cfd_pass import ParforLegalizeCFDPass +from .parfor_lowering_pass import ParforLoweringPass +from .passes import ( + DumpParforDiagnostics, + NoPythonBackend, + ParforFusionPass, + ParforPass, + ParforPreLoweringPass, + PreParforPass, + SplitParforPass, +) + +__all__ = [ + "DumpParforDiagnostics", + "ParforLoweringPass", + "ParforLegalizeCFDPass", + "ParforFusionPass", + "ParforPreLoweringPass", + "ParforPass", + "PreParforPass", + "SplitParforPass", + "NoPythonBackend", +] diff --git a/numba_dpex/core/passes/parfor.py b/numba_dpex/core/passes/parfor.py index 9e162c90f1..454d63d350 100644 --- a/numba_dpex/core/passes/parfor.py +++ b/numba_dpex/core/passes/parfor.py @@ -2421,7 +2421,6 @@ def _arrayexpr_to_parfor(self, equiv_set, lhs, arrayexpr, avail_vars): expr = arrayexpr.expr arr_typ = pass_states.typemap[lhs.name] el_typ = arr_typ.dtype - # generate loopnests and size variables from lhs correlations size_vars = equiv_set.get_shape(lhs) index_vars, loopnests = _mk_parfor_loops( @@ -3788,6 +3787,46 @@ def _get_call_arg_types(expr, typemap): return tuple(new_arg_typs), new_kw_types +def _ufunc_to_parfor_instr( + typemap, + op, + avail_vars, + loc, + scope, + func_ir, + out_ir, + arg_vars, + typingctx, + calltypes, + expr_out_var, +): + func_var_name = _find_func_var(typemap, op, avail_vars, loc=loc) + func_var = ir.Var(scope, mk_unique_var(func_var_name), loc) + typemap[func_var.name] = typemap[func_var_name] + func_var_def = copy.deepcopy(func_ir.get_definition(func_var_name)) + if ( + isinstance(func_var_def, ir.Expr) + and func_var_def.op == "getattr" + and func_var_def.attr == "sqrt" + ): + g_math_var = ir.Var(scope, mk_unique_var("$math_g_var"), loc) + typemap[g_math_var.name] = types.misc.Module(math) + g_math = ir.Global("math", math, loc) + g_math_assign = ir.Assign(g_math, g_math_var, loc) + func_var_def = ir.Expr.getattr(g_math_var, "sqrt", loc) + out_ir.append(g_math_assign) + # out_ir.append(func_var_def) + ir_expr = ir.Expr.call(func_var, arg_vars, (), loc) + call_typ = typemap[func_var.name].get_call_type( + typingctx, tuple(typemap[a.name] for a in arg_vars), {} + ) + calltypes[ir_expr] = call_typ + el_typ = call_typ.return_type + # signature(el_typ, el_typ) + out_ir.append(ir.Assign(func_var_def, func_var, loc)) + out_ir.append(ir.Assign(ir_expr, expr_out_var, loc)) + + def _arrayexpr_tree_to_ir( func_ir, typingctx, @@ -3852,35 +3891,33 @@ def _arrayexpr_tree_to_ir( # elif isinstance(op, (np.ufunc, DUFunc)): # function calls are stored in variables which are not removed # op is typing_key to the variables type - func_var_name 
= _find_func_var(typemap, op, avail_vars, loc=loc) - func_var = ir.Var(scope, mk_unique_var(func_var_name), loc) - typemap[func_var.name] = typemap[func_var_name] - func_var_def = copy.deepcopy( - func_ir.get_definition(func_var_name) - ) - if ( - isinstance(func_var_def, ir.Expr) - and func_var_def.op == "getattr" - and func_var_def.attr == "sqrt" - ): - g_math_var = ir.Var( - scope, mk_unique_var("$math_g_var"), loc - ) - typemap[g_math_var.name] = types.misc.Module(math) - g_math = ir.Global("math", math, loc) - g_math_assign = ir.Assign(g_math, g_math_var, loc) - func_var_def = ir.Expr.getattr(g_math_var, "sqrt", loc) - out_ir.append(g_math_assign) - # out_ir.append(func_var_def) - ir_expr = ir.Expr.call(func_var, arg_vars, (), loc) - call_typ = typemap[func_var.name].get_call_type( - typingctx, tuple(typemap[a.name] for a in arg_vars), {} + _ufunc_to_parfor_instr( + typemap, + op, + avail_vars, + loc, + scope, + func_ir, + out_ir, + arg_vars, + typingctx, + calltypes, + expr_out_var, ) - calltypes[ir_expr] = call_typ - el_typ = call_typ.return_type - # signature(el_typ, el_typ) - out_ir.append(ir.Assign(func_var_def, func_var, loc)) - out_ir.append(ir.Assign(ir_expr, expr_out_var, loc)) + if hasattr(op, "is_dpnp_ufunc"): + _ufunc_to_parfor_instr( + typemap, + op, + avail_vars, + loc, + scope, + func_ir, + out_ir, + arg_vars, + typingctx, + calltypes, + expr_out_var, + ) elif isinstance(expr, ir.Var): var_typ = typemap[expr.name] if isinstance(var_typ, types.Array): diff --git a/numba_dpex/core/passes/parfor_legalize_cfd_pass.py b/numba_dpex/core/passes/parfor_legalize_cfd_pass.py new file mode 100644 index 0000000000..264e62f181 --- /dev/null +++ b/numba_dpex/core/passes/parfor_legalize_cfd_pass.py @@ -0,0 +1,323 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +from numba.core import ir, types +from numba.core.compiler_machinery import FunctionPass, register_pass +from numba.core.ir_utils import find_topo_order + +from numba_dpex.core.exceptions import ComputeFollowsDataInferenceError +from numba_dpex.core.passes.parfor_lowering_pass import ParforLowerFactory +from numba_dpex.core.types.dpnp_ndarray_type import DpnpNdArray + +from .parfor import ( + Parfor, + ParforDiagnostics, + get_parfor_outputs, + get_parfor_params, +) + + +class ParforLegalizeCFDPassImpl: + + """Legalizes the compute-follows-data based device attribute for parfor + nodes. + + DpnpNdArray array-expressions populate the type of the left-hand-side (LHS) + of each expression as a default DpnpNdArray instance derived from the + __array_ufunc__ method of DpnpNdArray class. The pass fixes the LHS type by + properly applying compute follows data programming model. The pass first + checks if the right-hand-side (RHS) DpnpNdArray arguments are on the same + device, else raising a ComputeFollowsDataInferenceError. Once the RHS has + been validated, the LHS type is updated. + + The pass also updated the usm_type of the LHS based on a USM type + propagation rule: device > shared > host. Thus, if the usm_type attribute of + the RHS arrays are "device" and "shared" respectively, the LHS array's + usm_type attribute will be "device". + + Once the pass has identified a parfor with DpnpNdArrays and legalized it, + the "lowerer" attribute of the parfor is set to + ``numba_dpex.core.passes.parfor_lowering_pass._lower_parfor_as_kernel`` so + that the parfor node is lowered using Dpex's lowerer. 
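+
+    For example (a hypothetical sketch of code compiled with dpjit; the dpnp
+    keyword arguments are assumed to follow the dpctl.tensor conventions)::
+
+        a = dpnp.ones(10, usm_type="device", device="gpu")
+        b = dpnp.ones(10, usm_type="shared", device="gpu")
+        c = a + b  # LHS legalized to usm_type="device", device="gpu"
+
+        d = dpnp.ones(10, device="cpu")
+        e = a + d  # raises ComputeFollowsDataInferenceError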
+ + """ + + inputUsmTypeStrToInt = {"device": 3, "shared": 2, "host": 1} + inputUsmTypeIntToStr = {3: "device", 2: "shared", 1: "host"} + + def __init__(self, state) -> None: + self._state = state + self._cfd_updated_values = set() + self._seen_array_set = set() + diagnostics = ParforDiagnostics() + self.nested_fusion_info = diagnostics.nested_fusion_info + + def _check_if_dpnp_empty_call(self, call_stmt, block): + func_def = block.find_variable_assignment(call_stmt.name) + if not ( + isinstance(func_def, ir.Assign) + and isinstance(func_def.value, ir.Expr) + and func_def.value.op == "getattr" + ): + raise AssertionError + + module_name = block.find_variable_assignment( + func_def.value.list_vars()[0].name + ).value.value.__name__ + + if func_def.value.attr == "empty" and module_name == "dpnp": + return True + else: + return False + + def _check_cfd_parfor_params(self, parfor, checklist): + deviceTypes = set() + usmTypes = [] + + for para in checklist: + if not isinstance(self._state.typemap[para], DpnpNdArray): + continue + argty = self._state.typemap[para] + deviceTypes.add(argty.device) + try: + usmTypes.append( + ParforLegalizeCFDPassImpl.inputUsmTypeStrToInt[ + argty.usm_type + ] + ) + except KeyError: + raise ValueError( + "Unknown USM type encountered. Supported " + "usm types are: device, shared and host." + ) + # Check compute follows data on the dpnp arrays in checklist + if len(deviceTypes) > 1: + raise ComputeFollowsDataInferenceError( + kernel_name=parfor.loc.short(), + usmarray_argnum_list=[], + ) + # Derive the usm_type based on usm allocator precedence rule: + # device > shared > host + conforming_usm_ty = max(usmTypes) + conforming_device_ty = deviceTypes.pop() + + # FIXME: Changed to namedtuple + return (conforming_usm_ty, conforming_device_ty) + + def _legalize_dpnp_empty_call(self, required_arrty, call_stmt, block): + args = call_stmt.args + sigargs = self._state.calltypes[call_stmt].args + sigargs_new = list(sigargs) + # Update the RHS usm_type, device, attributes + for idx, arg in enumerate(args): + argdef = block.find_variable_assignment(arg.name) + if argdef: + attribute = argdef.target.name + if "usm_type" in attribute: + self._state.typemap.update( + {attribute: types.literal(required_arrty.usm_type)} + ) + sigargs_new[idx] = types.literal(required_arrty.usm_type) + elif "device" in attribute: + self._state.typemap.update( + {attribute: types.literal(required_arrty.device)} + ) + sigargs_new[idx] = types.literal(required_arrty.device) + sigargs = tuple(sigargs_new) + new_sig = self._state.typingctx.resolve_function_type( + self._state.typemap[call_stmt.func.name], sigargs, {} + ) + self._state.calltypes.update({call_stmt: new_sig}) + + def _legalize_array_attrs( + self, arrattr, legalized_device_ty, legalized_usm_ty + ): + modified = False + updated_device = None + updated_usm_ty = None + + if self._state.typemap[arrattr].device != legalized_device_ty: + updated_device = legalized_device_ty + modified = True + + if self._state.typemap[arrattr].usm_type != legalized_usm_ty: + updated_usm_ty = legalized_usm_ty + modified = True + + if modified: + ty = self._state.typemap[arrattr] + new_ty = ty.copy(device=updated_device, usm_type=updated_usm_ty) + self._state.typemap.update({arrattr: new_ty}) + + return modified + + def _legalize_parfor_params(self, parfor): + """Checks the parfor params for compute follows data compliance and + returns the conforming device for the parfor. 
+ + Args: + parfor: Parfor node to be analyzed + + Returns: + str: The device filter string for the parfor if the parfor is + compute follows data conforming. + """ + if parfor.params is None: + return + + outputParams = get_parfor_outputs(parfor, parfor.params) + checklist = sorted(list(set(parfor.params) - set(outputParams))) + + # Check if any output param was defined outside the parfor + for para in outputParams: + if ( + isinstance(self._state.typemap[para], DpnpNdArray) + and para in self._seen_array_set + ): + checklist.append(para) + + # Check params in checklist for CFD compliance and derive the common + # usm allocator and device based on the checklist params + usm_ty, device_ty = self._check_cfd_parfor_params(parfor, checklist) + + # Update any outputs that are generated in the parfor + for para in outputParams: + if not isinstance(self._state.typemap[para], DpnpNdArray): + continue + # Legalize LHS. Skip if we already updated the type before and no + # further legalization is needed. + if self._legalize_array_attrs( + para, + device_ty, + ParforLegalizeCFDPassImpl.inputUsmTypeIntToStr[usm_ty], + ): + # Keep track of vars that have been updated + self._cfd_updated_values.add(para) + else: + try: + self._cfd_updated_values.remove(para) + except KeyError: + pass + + return device_ty + + def _legalize_cfd_parfor_blocks(self, parfor): + """Legalize the parfor params based on the compute follows data + programming model and usm allocator precedence rule. + """ + conforming_device_ty = self._legalize_parfor_params(parfor) + + # Update the parfor's lowerer attribute + parfor.lowerer = ParforLowerFactory.get_lowerer(conforming_device_ty) + + init_block = parfor.init_block + blocks = parfor.loop_body + + for stmt in init_block.body: + self._legalize_stmt(stmt, init_block, inparfor=True) + + for block in blocks.values(): + for stmt in block.body: + self._legalize_stmt(stmt, block, inparfor=True) + + def _legalize_expr(self, stmt, lhs, lhsty, parent_block, inparfor=False): + rhs = stmt.value + if rhs.op == "call": + if self._check_if_dpnp_empty_call(rhs.func, parent_block): + if inparfor and lhs in self._cfd_updated_values: + self._legalize_dpnp_empty_call(lhsty, rhs, parent_block) + self._seen_array_set.add(lhs) + # TODO: If any other array constructor that does not take + # args, just add to self._seen_array_set + else: + for ele in rhs.list_vars(): + if ele.name in self._cfd_updated_values: + # TODO: Resolve function type with new argument + raise NotImplementedError( + "Compute follows data is not currently " + "supported for function calls." + ) + elif rhs.op == "cast" and rhs.value.name in self._cfd_updated_values: + device_ty = self._state.typemap[rhs.value.name].device + usm_ty = self._state.typemap[rhs.value.name].usm_type + if self._legalize_array_attrs( + arrattr=lhs, + legalized_device_ty=device_ty, + legalized_usm_ty=usm_ty, + ): + self._cfd_updated_values.add(lhs) + else: + try: + self._cfd_updated_values.remove(lhs) + except KeyError: + pass + else: + # The assumption is all other expr types are by now either parfors + # or are insts like setattr, getattr, # getitem, etc. and do not + # need CFD legalization. 
+ self._seen_array_set.add(lhs) + + def _legalize_stmt(self, stmt, parent_block, inparfor=False): + if isinstance(stmt, ir.Assign): + lhs = stmt.target.name + lhsty = self._state.typemap[lhs] + if isinstance(lhsty, DpnpNdArray): + if isinstance(stmt.value, ir.Arg): + self._seen_array_set.add(lhs) + elif isinstance(stmt.value, ir.Expr): + self._legalize_expr( + stmt, lhs, lhsty, parent_block, inparfor + ) + elif isinstance(stmt, Parfor): + self._legalize_cfd_parfor_blocks(stmt) + elif isinstance(stmt, ir.Return): + # Check if the return value is a DpnpNdArray and was changed by + # compute follows data legalization + retty = self._state.typemap[stmt.value.name] + if ( + isinstance(retty, DpnpNdArray) + and stmt.value.name in self._cfd_updated_values + and self._state.return_type != retty + ): + self._state.return_type = retty + + def run(self): + # The get_parfor_params needs to be run here to initialize the parfor + # nodes prior to using them. + _, _ = get_parfor_params( + self._state.func_ir.blocks, + self._state.flags.auto_parallel, + self.nested_fusion_info, + ) + + # FIXME: Traversing the blocks in topological order is not sufficient. + # The traversal should be converted to a backward data flow traversal of + # the CFG. The algorithm needs to then become a fixed-point work list + # algorithm. + topo_order = find_topo_order(self._state.func_ir.blocks) + + # Apply CFD legalization to parfor nodes and dpnp_empty calls + for label in topo_order: + block = self._state.func_ir.blocks[label] + for stmt in block.body: + self._legalize_stmt(stmt, block) + + +@register_pass(mutates_CFG=True, analysis_only=False) +class ParforLegalizeCFDPass(FunctionPass): + _name = "parfor_Legalize_CFD_pass" + + def __init__(self): + FunctionPass.__init__(self) + + def run_pass(self, state): + """ + Legalize CFD of parfor nodes. + """ + # Ensure we have an IR and type information. + assert state.func_ir + cfd_legalizer = ParforLegalizeCFDPassImpl(state) + cfd_legalizer.run() + + return True diff --git a/numba_dpex/core/passes/parfor_lowering_pass.py b/numba_dpex/core/passes/parfor_lowering_pass.py new file mode 100644 index 0000000000..1afd0b16f8 --- /dev/null +++ b/numba_dpex/core/passes/parfor_lowering_pass.py @@ -0,0 +1,394 @@ +# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import copy + +from numba.core import funcdesc, ir, types +from numba.core.compiler_machinery import LoweringPass, register_pass +from numba.core.lowering import Lower +from numba.parfors.parfor_lowering import ( + _lower_parfor_parallel as _lower_parfor_parallel_std, +) + +from numba_dpex import config +from numba_dpex.core.utils.kernel_launcher import KernelLaunchIRBuilder + +from ..exceptions import UnsupportedParforError +from ..types.dpnp_ndarray_type import DpnpNdArray +from ..utils.kernel_builder import create_kernel_for_parfor +from .parfor import Parfor, find_potential_aliases_parfor, get_parfor_outputs + +# A global list of kernels to keep the objects alive indefinitely. +keep_alive_kernels = [] + + +def _getvar_or_none(lowerer, x): + try: + return lowerer.getvar(x) + except: + return None + + +def _load_range(lowerer, value): + if isinstance(value, ir.Var): + return lowerer.loadvar(value.name) + else: + return lowerer.context.get_constant(types.uintp, value) + + +class ParforLowerImpl: + """Provides a custom lowerer for parfor nodes that generates a SYCL kernel + for a parfor and submits it to a queue. 
+ """ + + def _submit_parfor_kernel( + self, + lowerer, + kernel_fn, + loop_ranges, + ): + """ + Adds the call to the dpex kernel function from the main function. + """ + keep_alive_kernels.append(kernel_fn.kernel) + + # Helper class that generates the LLVM IR values inside the current LLVM + # module that are needed to submit the kernel to a queue. + ir_builder = KernelLaunchIRBuilder(lowerer, kernel_fn.kernel) + + # Create a local variable storing a pointer to a DPCTLSyclQueueRef + # pointer. + curr_queue = ir_builder.get_queue(exec_queue=kernel_fn.queue) + + num_flattened_args = 0 + + # Compute number of args to be passed to the kernel. Note that the + # actual number of kernel arguments is greater than the count of + # kernel_fn.kernel_args as arrays get flattened. + for arg_type in kernel_fn.kernel_arg_types: + if isinstance(arg_type, DpnpNdArray): + # FIXME: Remove magic constants + num_flattened_args += 5 + (2 * arg_type.ndim) + else: + num_flattened_args += 1 + + # Create LLVM values for the kernel args list and kernel arg types list + args_list = ir_builder.allocate_kernel_arg_array(num_flattened_args) + args_ty_list = ir_builder.allocate_kernel_arg_ty_array( + num_flattened_args + ) + + # Populate the args_list and the args_ty_list LLVM arrays + kernel_arg_num = 0 + for arg_num, arg in enumerate(kernel_fn.kernel_args): + argtype = kernel_fn.kernel_arg_types[arg_num] + llvm_val = _getvar_or_none(lowerer, arg) + if not llvm_val: + raise AssertionError + if isinstance(argtype, DpnpNdArray): + ir_builder.build_array_arg( + array_val=llvm_val, + array_rank=argtype.ndim, + arg_list=args_list, + args_ty_list=args_ty_list, + arg_num=kernel_arg_num, + ) + # FIXME: Get rid of magic constants + kernel_arg_num += 5 + (2 * argtype.ndim) + else: + ir_builder.build_arg( + llvm_val, argtype, args_list, args_ty_list, kernel_arg_num + ) + kernel_arg_num += 1 + + # Create a global range over which to submit the kernel based on the + # loop_ranges of the parfor + global_range = [] + # SYCL ranges can have at max 3 dimension. If the parfor is of a higher + # dimension then the indexing for the higher dimensions is done inside + # the kernel. + global_range_rank = len(loop_ranges) if len(loop_ranges) < 3 else 3 + for i in range(global_range_rank): + start, stop, step = loop_ranges[i] + stop = _load_range(lowerer, stop) + if step != 1: + raise UnsupportedParforError( + "non-unit strides are not yet supported." + ) + global_range.append(stop) + + # Submit a synchronous kernel + ir_builder.submit_sync_ranged_kernel( + global_range, curr_queue, kernel_arg_num, args_list, args_ty_list + ) + + # At this point we can free the DPCTLSyclQueueRef (curr_queue) + ir_builder.free_queue(sycl_queue_val=curr_queue) + + def _lower_parfor_as_kernel(self, lowerer, parfor): + """Lowers a parfor node created by the dpjit compiler to a kernel. + + The general approach is as follows: + + - The code from the parfor's init block is lowered normally + in the context of the current function. + - The body of the parfor is transformed into a kernel function. + - Dpctl runtime calls to submit the kernel are added. + + """ + # We copy the typemap here because for race condition variable we'll + # update their type to array so they can be updated by the kernel. + orig_typemap = lowerer.fndesc.typemap + + # replace original typemap with copy and restore the original at the + # end. 
+        lowerer.fndesc.typemap = copy.copy(orig_typemap)
+
+        if config.DEBUG_ARRAY_OPT:
+            print("lowerer.fndesc", lowerer.fndesc, type(lowerer.fndesc))
+
+        typemap = lowerer.fndesc.typemap
+        varmap = lowerer.varmap
+
+        loc = parfor.init_block.loc
+        scope = parfor.init_block.scope
+
+        # Lower the init block of the parfor.
+        for instr in parfor.init_block.body:
+            lowerer.lower_inst(instr)
+
+        for racevar in parfor.races:
+            if racevar not in varmap:
+                rvtyp = typemap[racevar]
+                rv = ir.Var(scope, racevar, loc)
+                lowerer._alloca_var(rv.name, rvtyp)
+
+        alias_map = {}
+        arg_aliases = {}
+
+        find_potential_aliases_parfor(
+            parfor,
+            parfor.params,
+            typemap,
+            lowerer.func_ir,
+            alias_map,
+            arg_aliases,
+        )
+
+        # run get_parfor_outputs() and get_parfor_reductions() before
+        # kernel creation since Jumps are modified so CFG of loop_body
+        # dict will become invalid
+        if parfor.params is None:
+            raise AssertionError
+
+        parfor_output_arrays = get_parfor_outputs(parfor, parfor.params)
+
+        # compile parfor body as a separate dpex kernel function
+        flags = copy.copy(parfor.flags)
+        flags.error_model = "numpy"
+
+        # Can't get here unless
+        # flags.set('auto_parallel', ParallelOptions(True))
+        index_var_typ = typemap[parfor.loop_nests[0].index_variable.name]
+
+        # index variables should have the same type, check rest of indices
+        for loop_nest in parfor.loop_nests[1:]:
+            if typemap[loop_nest.index_variable.name] != index_var_typ:
+                raise AssertionError
+
+        loop_ranges = [
+            (loop_nest.start, loop_nest.stop, loop_nest.step)
+            for loop_nest in parfor.loop_nests
+        ]
+
+        try:
+            parfor_kernel = create_kernel_for_parfor(
+                lowerer,
+                parfor,
+                typemap,
+                flags,
+                loop_ranges,
+                bool(alias_map),
+                parfor.races,
+                parfor_output_arrays,
+            )
+        except Exception:
+            # FIXME: Make the exception more informative
+            raise UnsupportedParforError
+
+        # Finally submit the kernel
+        self._submit_parfor_kernel(lowerer, parfor_kernel, loop_ranges)
+
+        # TODO: free the kernel at this point
+
+        # Restore the original typemap of the function that was replaced
+        # temporarily at the beginning of this function.
+        lowerer.fndesc.typemap = orig_typemap
+
+
+class ParforLowerFactory:
+    """A pseudo-factory class that maps a device filter string to a lowering
+    function.
+
+    Each parfor can have a "lowerer" attribute that determines how the parfor
+    node is to be lowered to LLVM IR. The factory class maintains a static map
+    from every device type (filter string) encountered so far to the lowerer
+    function for that device type. At this point numba-dpex does not generate
+    device-specific code, so the lowerer is always the same for all devices.
+    By handing out a distinct lowerer instance per device we make sure parfors
+    that will execute on distinct devices, as determined by the
+    compute-follows-data programming model, are never fused together.
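+
+    A minimal usage sketch (the device filter string below is hypothetical and
+    ``parfor_node`` is a placeholder for a Numba parfor IR node; in practice
+    ParforLegalizeCFDPass selects the device during legalization)::
+
+        lowerer_fn = ParforLowerFactory.get_lowerer("level_zero:gpu:0")
+        parfor_node.lowerer = lowerer_fn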
+ """ + + device_to_lowerer_map = {} + + @classmethod + def get_lowerer(cls, device): + try: + lowerer = ParforLowerFactory.device_to_lowerer_map[device] + except KeyError: + lowerer = ParforLowerImpl()._lower_parfor_as_kernel + ParforLowerFactory.device_to_lowerer_map[device] = lowerer + + return lowerer + + +class WrapperDefaultLower(Lower): + @property + def _disable_sroa_like_opt(self): + """We always return True.""" + return True + + +def lower_parfor_dpex(lowerer, parfor): + parfor.lowerer = ParforLowerImpl()._lower_parfor_as_kernel + if parfor.lowerer is None: + _lower_parfor_parallel_std(lowerer, parfor) + else: + parfor.lowerer(lowerer, parfor) + + +class _ParforLower(Lower): + """Extends standard lowering to accommodate parfor.Parfor nodes that may + have the `lowerer` attribute set. + """ + + def __init__(self, context, library, fndesc, func_ir, metadata=None): + Lower.__init__(self, context, library, fndesc, func_ir, metadata) + self.dpex_lower = self._lower( + context, library, fndesc, func_ir, metadata + ) + + def _lower(self, context, library, fndesc, func_ir, metadata): + """Create Lower with changed linkageName in debug info""" + lower = WrapperDefaultLower(context, library, fndesc, func_ir, metadata) + + # Debuginfo + if context.enable_debuginfo: + from numba.core.funcdesc import default_mangler, qualifying_prefix + + from numba_dpex.debuginfo import DpexDIBuilder + + qualprefix = qualifying_prefix(fndesc.modname, fndesc.qualname) + mangled_qualname = default_mangler(qualprefix, fndesc.argtypes) + + lower.debuginfo = DpexDIBuilder( + module=lower.module, + filepath=func_ir.loc.filename, + linkage_name=mangled_qualname, + cgctx=context, + ) + + return lower + + def lower(self): + context = self.dpex_lower.context + + # Only Numba's CPUContext has the `lower_extension` attribute + context.lower_extensions[Parfor] = lower_parfor_dpex + self.dpex_lower.lower() + self.base_lower = self.dpex_lower + + self.env = self.base_lower.env + self.call_helper = self.base_lower.call_helper + + def create_cpython_wrapper(self, release_gil=False): + return self.base_lower.create_cpython_wrapper(release_gil) + + +@register_pass(mutates_CFG=True, analysis_only=False) +class ParforLoweringPass(LoweringPass): + """A custom lowering pass that does dpex-specific lowering of parfor + nodes. + + FIXME: Redesign once numba-dpex supports Numba 0.57 + """ + + _name = "dpjit_lowering" + + def __init__(self): + LoweringPass.__init__(self) + + def run_pass(self, state): + if state.library is None: + codegen = state.targetctx.codegen() + state.library = codegen.create_library(state.func_id.func_qualname) + # Enable object caching upfront, so that the library can + # be later serialized. 
+ state.library.enable_object_caching() + + targetctx = state.targetctx + + library = state.library + interp = state.func_ir + typemap = state.typemap + restype = state.return_type + calltypes = state.calltypes + flags = state.flags + metadata = state.metadata + + kwargs = {} + + # for support numba 0.54 and <=0.55.0dev0=*_469 + if hasattr(flags, "get_mangle_string"): + kwargs["abi_tags"] = flags.get_mangle_string() + # Lowering + fndesc = funcdesc.PythonFunctionDescriptor.from_specialized_function( + interp, + typemap, + restype, + calltypes, + mangler=targetctx.mangler, + inline=flags.forceinline, + noalias=flags.noalias, + **kwargs, + ) + + with targetctx.push_code_library(library): + lower = _ParforLower( + targetctx, library, fndesc, interp, metadata=metadata + ) + lower.lower() + if not flags.no_cpython_wrapper: + lower.create_cpython_wrapper(flags.release_gil) + + env = lower.env + call_helper = lower.call_helper + del lower + + from numba.core.compiler import _LowerResult # TODO: move this + + if flags.no_compile: + state["cr"] = _LowerResult(fndesc, call_helper, cfunc=None, env=env) + else: + # Prepare for execution + cfunc = targetctx.get_executable(library, fndesc, env) + # Insert native function for use by other jitted-functions. + # We also register its library to allow for inlining. + targetctx.insert_user_function(cfunc, fndesc, [library]) + state["cr"] = _LowerResult( + fndesc, call_helper, cfunc=cfunc, env=env + ) + + return True diff --git a/numba_dpex/core/passes/passes.py b/numba_dpex/core/passes/passes.py index ccf2640ac6..c5ab12271d 100644 --- a/numba_dpex/core/passes/passes.py +++ b/numba_dpex/core/passes/passes.py @@ -152,6 +152,7 @@ def run_pass(self, state): # Ensure we have an IR and type information. assert state.func_ir functions_map = swap_functions_map.copy() + # FIXME: remove once reduction is implemented functions_map.pop(("dot", "numpy"), None) functions_map.pop(("sum", "numpy"), None) functions_map.pop(("prod", "numpy"), None) diff --git a/numba_dpex/core/pipelines/dpjit_compiler.py b/numba_dpex/core/pipelines/dpjit_compiler.py new file mode 100644 index 0000000000..433dfd9a27 --- /dev/null +++ b/numba_dpex/core/pipelines/dpjit_compiler.py @@ -0,0 +1,118 @@ +# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +from numba.core.compiler import CompilerBase, DefaultPassBuilder +from numba.core.compiler_machinery import PassManager +from numba.core.typed_passes import ( + AnnotateTypes, + InlineOverloads, + IRLegalization, + NopythonRewrites, + NoPythonSupportedFeatureValidation, + NopythonTypeInference, + PreLowerStripPhis, + PreParforPass, +) + +from numba_dpex.core.exceptions import UnsupportedCompilationModeError +from numba_dpex.core.passes import ( + DumpParforDiagnostics, + NoPythonBackend, + ParforFusionPass, + ParforLegalizeCFDPass, + ParforLoweringPass, + ParforPreLoweringPass, + SplitParforPass, +) +from numba_dpex.parfor_diagnostics import ExtendedParforDiagnostics + + +class _DpjitPassBuilder(object): + """ + A pass builder for dpex's DpjitCompiler that adds supports for + offloading dpnp array expressions and dpnp library calls to a SYCL device. + + The pass builder does not implement pipelines for objectmode or interpreted + execution. 
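+
+    A sketch of how this pipeline is typically reached from user code
+    (illustrative only; the ``dpjit`` decorator is assumed to select the
+    DpjitCompiler pipeline through its dispatcher)::
+
+        import dpnp
+        from numba_dpex.decorators import dpjit
+
+        @dpjit
+        def vecadd(a, b):
+            return a + b
+
+        c = vecadd(dpnp.ones(1024), dpnp.ones(1024))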
+ """ + + @staticmethod + def define_typed_pipeline(state, name="dpex_dpjit_typed"): + """Returns the typed part of the nopython pipeline""" + pm = PassManager(name) + # typing + pm.add_pass(NopythonTypeInference, "nopython frontend") + # Annotate only once legalized + pm.add_pass(AnnotateTypes, "annotate types") + # strip phis + pm.add_pass(PreLowerStripPhis, "remove phis nodes") + + # optimization + pm.add_pass(InlineOverloads, "inline overloaded functions") + pm.add_pass(PreParforPass, "Preprocessing for parfors") + if not state.flags.no_rewrites: + pm.add_pass(NopythonRewrites, "nopython rewrites") + pm.add_pass(SplitParforPass, "convert to parfors") + pm.add_pass( + ParforLegalizeCFDPass, "Legalize parfors for compute follows data" + ) + pm.add_pass(ParforFusionPass, "fuse parfors") + pm.add_pass(ParforPreLoweringPass, "parfor prelowering") + + pm.finalize() + return pm + + @staticmethod + def define_nopython_lowering_pipeline(state, name="dpex_dpjit_lowering"): + """Returns an nopython mode pipeline based PassManager""" + pm = PassManager(name) + + # legalize + pm.add_pass( + NoPythonSupportedFeatureValidation, + "ensure features that are in use are in a valid form", + ) + pm.add_pass(IRLegalization, "ensure IR is legal prior to lowering") + + # lower + pm.add_pass(ParforLoweringPass, "Custom lowerer for dpex parfor nodes") + pm.add_pass(NoPythonBackend, "nopython mode backend") + pm.add_pass(DumpParforDiagnostics, "dump parfor diagnostics") + + pm.finalize() + return pm + + @staticmethod + def define_nopython_pipeline(state, name="dpex_dpjit_nopython"): + """Returns an nopython mode pipeline based PassManager""" + # compose pipeline from untyped, typed and lowering parts + dpb = _DpjitPassBuilder + pm = PassManager(name) + untyped_passes = DefaultPassBuilder.define_untyped_pipeline(state) + pm.passes.extend(untyped_passes.passes) + + typed_passes = dpb.define_typed_pipeline(state) + pm.passes.extend(typed_passes.passes) + + lowering_passes = dpb.define_nopython_lowering_pipeline(state) + pm.passes.extend(lowering_passes.passes) + + pm.finalize() + return pm + + +class DpjitCompiler(CompilerBase): + """Dpex's compiler pipeline to offload parfor nodes into SYCL kernels.""" + + def define_pipelines(self): + pms = [] + self.state.parfor_diagnostics = ExtendedParforDiagnostics() + self.state.metadata[ + "parfor_diagnostics" + ] = self.state.parfor_diagnostics + if not self.state.flags.force_pyobject: + pms.append(_DpjitPassBuilder.define_nopython_pipeline(self.state)) + if self.state.status.can_fallback or self.state.flags.force_pyobject: + raise UnsupportedCompilationModeError() + return pms diff --git a/numba_dpex/core/runtime/_dpexrt_python.c b/numba_dpex/core/runtime/_dpexrt_python.c index fc13f2954e..3712704bec 100644 --- a/numba_dpex/core/runtime/_dpexrt_python.c +++ b/numba_dpex/core/runtime/_dpexrt_python.c @@ -21,6 +21,22 @@ #include "numba/_arraystruct.h" +/* Debugging facilities - enabled at compile-time */ +/* #undef NDEBUG */ +#if 0 +#include +#define DPEXRT_DEBUG(X) \ + { \ + X; \ + fflush(stdout); \ + } +#else +#define DPEXRT_DEBUG(X) \ + if (0) { \ + X; \ + } +#endif + // forward declarations static struct PyUSMArrayObject *PyUSMNdArray_ARRAYOBJ(PyObject *obj); static npy_intp product_of_shape(npy_intp *shape, npy_intp ndim); @@ -30,6 +46,7 @@ static void *usm_host_malloc(size_t size, void *opaque_data); static void usm_free(void *data, void *opaque_data); static NRT_ExternalAllocator * NRT_ExternalAllocator_new_for_usm(DPCTLSyclQueueRef qref, size_t usm_type); +static 
void *DPEXRTQueue_CreateFromFilterString(const char *device); static MemInfoDtorInfo *MemInfoDtorInfo_new(NRT_MemInfo *mi, PyObject *owner); static NRT_MemInfo *NRT_MemInfo_new_from_usmndarray(PyObject *ndarrobj, void *data, @@ -106,6 +123,91 @@ static void usm_free(void *data, void *opaque_data) DPCTLfree_with_queue(data, qref); } +/*----------------------------------------------------------------------------*/ +/*--------- Functions for dpctl libsyclinterface/sycl gluing ---------*/ +/*----------------------------------------------------------------------------*/ + +/*! + * @brief Creates and returns a DPCTLSyclQueueRef from a filter string. + * + * @param device A sycl::oneapi_ext::filter_string + * @return {DPCTLSyclQueueRef} A DPCTLSyclQueueRef object as void*. + */ +static void *DPEXRTQueue_CreateFromFilterString(const char *device) +{ + DPCTLSyclDeviceSelectorRef dselector = NULL; + DPCTLSyclDeviceRef dref = NULL; + DPCTLSyclQueueRef qref = NULL; + + DPEXRT_DEBUG(nrt_debug_print( + "DPEXRT-DEBUG: Inside DPEXRT_get_sycl_queue %s, line %d\n", __FILE__, + __LINE__)); + + if (!(dselector = DPCTLFilterSelector_Create(device))) { + DPEXRT_DEBUG(nrt_debug_print( + "DPEXRT-ERROR: Could not create a sycl::device_selector from " + "filter string: %s at %s %d.\n", + device, __FILE__, __LINE__)); + goto error; + } + + if (!(dref = DPCTLDevice_CreateFromSelector(dselector))) + goto error; + + if (!(qref = DPCTLQueue_CreateForDevice(dref, NULL, 0))) + goto error; + + DPCTLDeviceSelector_Delete(dselector); + DPCTLDevice_Delete(dref); + + DPEXRT_DEBUG(nrt_debug_print( + "DPEXRT-DEBUG: Created sycl::queue on device %s at %s, line %d\n", + device, __FILE__, __LINE__)); + + return (void *)qref; + +error: + DPCTLDeviceSelector_Delete(dselector); + DPCTLDevice_Delete(dref); + + return NULL; +} + +static void DpexrtQueue_SubmitRange(const void *KRef, + const void *QRef, + void **Args, + const DPCTLKernelArgType *ArgTypes, + size_t NArgs, + const size_t Range[3], + size_t NRange, + const void *DepEvents, + size_t NDepEvents) +{ + DPCTLSyclEventRef eref = NULL; + DPCTLSyclQueueRef qref = NULL; + + DPEXRT_DEBUG(nrt_debug_print( + "DPEXRT-DEBUG: Inside DpexrtQueue_SubmitRange %s, line %d\n", __FILE__, + __LINE__)); + + qref = (DPCTLSyclQueueRef)QRef; + + eref = DPCTLQueue_SubmitRange( + (DPCTLSyclKernelRef)KRef, qref, Args, (DPCTLKernelArgType *)ArgTypes, + NArgs, Range, NRange, (DPCTLSyclEventRef *)DepEvents, NDepEvents); + DPCTLQueue_Wait(qref); + DPCTLEvent_Wait(eref); + DPCTLEvent_Delete(eref); + + DPEXRT_DEBUG(nrt_debug_print( + "DPEXRT-DEBUG: Done with DpexrtQueue_SubmitRange %s, line %d\n", + __FILE__, __LINE__)); +} + +/*----------------------------------------------------------------------------*/ +/*---------------------- Functions for NRT_MemInfo allocation ----------------*/ +/*----------------------------------------------------------------------------*/ + /*! * @brief Creates a new NRT_ExternalAllocator object tied to a SYCL USM * allocator. 
@@ -128,13 +230,15 @@ NRT_ExternalAllocator_new_for_usm(DPCTLSyclQueueRef qref, size_t usm_type) allocator = (NRT_ExternalAllocator *)malloc(sizeof(NRT_ExternalAllocator)); if (allocator == NULL) { - NRT_Debug(nrt_debug_print("DPEXRT-ERROR: failed to allocate memory for " - "NRT_ExternalAllocator at %s, line %d.\n", - __FILE__, __LINE__)); + DPEXRT_DEBUG( + nrt_debug_print("DPEXRT-ERROR: failed to allocate memory for " + "NRT_ExternalAllocator at %s, line %d.\n", + __FILE__, __LINE__)); goto error; } - NRT_Debug(nrt_debug_print("DPEXRT-DEBUG: usm type = %d at %s, line %d.\n", - usm_type, __FILE__, __LINE__)); + DPEXRT_DEBUG( + nrt_debug_print("DPEXRT-DEBUG: usm type = %d at %s, line %d.\n", + usm_type, __FILE__, __LINE__)); switch (usm_type) { case 1: @@ -147,9 +251,9 @@ NRT_ExternalAllocator_new_for_usm(DPCTLSyclQueueRef qref, size_t usm_type) allocator->malloc = usm_host_malloc; break; default: - NRT_Debug(nrt_debug_print("DPEXRT-ERROR: Encountered an unknown usm " - "allocation type (%d) at %s, line %d\n", - usm_type, __FILE__, __LINE__)); + DPEXRT_DEBUG(nrt_debug_print("DPEXRT-ERROR: Encountered an unknown usm " + "allocation type (%d) at %s, line %d\n", + usm_type, __FILE__, __LINE__)); goto error; } @@ -193,7 +297,7 @@ static void usmndarray_meminfo_dtor(void *ptr, size_t size, void *info) // Sanity-check to make sure the mi_dtor_info is an actual pointer. if (!(mi_dtor_info = (MemInfoDtorInfo *)info)) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: MemInfoDtorInfo object might be corrupted. Aborting " "MemInfo destruction at %s, line %d\n", __FILE__, __LINE__)); @@ -248,9 +352,9 @@ static MemInfoDtorInfo *MemInfoDtorInfo_new(NRT_MemInfo *mi, PyObject *owner) MemInfoDtorInfo *mi_dtor_info = NULL; if (!(mi_dtor_info = (MemInfoDtorInfo *)malloc(sizeof(MemInfoDtorInfo)))) { - NRT_Debug(nrt_debug_print("DPEXRT-ERROR: Could not allocate a new " - "MemInfoDtorInfo object at %s, line %d\n", - __FILE__, __LINE__)); + DPEXRT_DEBUG(nrt_debug_print("DPEXRT-ERROR: Could not allocate a new " + "MemInfoDtorInfo object at %s, line %d\n", + __FILE__, __LINE__)); return NULL; } mi_dtor_info->mi = mi; @@ -283,7 +387,7 @@ static NRT_MemInfo *NRT_MemInfo_new_from_usmndarray(PyObject *ndarrobj, // Allocate a new NRT_MemInfo object if (!(mi = (NRT_MemInfo *)malloc(sizeof(NRT_MemInfo)))) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: Could not allocate a new NRT_MemInfo " "object at %s, line %d\n", __FILE__, __LINE__)); @@ -291,7 +395,7 @@ static NRT_MemInfo *NRT_MemInfo_new_from_usmndarray(PyObject *ndarrobj, } if (!(cref = DPCTLQueue_GetContext(qref))) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: Could not get the DPCTLSyclContext from " "the queue object at %s, line %d\n", __FILE__, __LINE__)); @@ -303,7 +407,7 @@ static NRT_MemInfo *NRT_MemInfo_new_from_usmndarray(PyObject *ndarrobj, // Allocate a new NRT_ExternalAllocator if (!(ext_alloca = NRT_ExternalAllocator_new_for_usm(qref, usm_type))) { - NRT_Debug( + DPEXRT_DEBUG( nrt_debug_print("DPEXRT-ERROR: Could not allocate a new " "NRT_ExternalAllocator object at %s, line %d\n", __FILE__, __LINE__)); @@ -312,9 +416,9 @@ static NRT_MemInfo *NRT_MemInfo_new_from_usmndarray(PyObject *ndarrobj, // Allocate a new MemInfoDtorInfo if (!(midtor_info = MemInfoDtorInfo_new(mi, ndarrobj))) { - NRT_Debug(nrt_debug_print("DPEXRT-ERROR: Could not allocate a new " - "MemInfoDtorInfo object at %s, line %d\n", - __FILE__, __LINE__)); + 
DPEXRT_DEBUG(nrt_debug_print("DPEXRT-ERROR: Could not allocate a new " + "MemInfoDtorInfo object at %s, line %d\n", + __FILE__, __LINE__)); goto error; } @@ -326,14 +430,14 @@ static NRT_MemInfo *NRT_MemInfo_new_from_usmndarray(PyObject *ndarrobj, mi->size = nitems * itemsize; mi->external_allocator = ext_alloca; - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-DEBUG: NRT_MemInfo_init mi=%p external_allocator=%p\n", mi, ext_alloca)); return mi; error: - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: Failed inside NRT_MemInfo_new_from_usmndarray clean up " "and return NULL at %s, line %d\n", __FILE__, __LINE__)); @@ -359,37 +463,27 @@ DPEXRT_MemInfo_alloc(npy_intp size, size_t usm_type, const char *device) NRT_MemInfo *mi = NULL; NRT_ExternalAllocator *ext_alloca = NULL; MemInfoDtorInfo *midtor_info = NULL; - DPCTLSyclDeviceSelectorRef dselector = NULL; - DPCTLSyclDeviceRef dref = NULL; DPCTLSyclQueueRef qref = NULL; - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-DEBUG: Inside DPEXRT_MemInfo_alloc %s, line %d\n", __FILE__, __LINE__)); // Allocate a new NRT_MemInfo object if (!(mi = (NRT_MemInfo *)malloc(sizeof(NRT_MemInfo)))) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: Could not allocate a new NRT_MemInfo object.\n")); goto error; } - if (!(dselector = DPCTLFilterSelector_Create(device))) { - NRT_Debug(nrt_debug_print( - "DPEXRT-ERROR: Could not create a sycl::device_selector from " - "filter string: %s at %s %d.\n", - device, __FILE__, __LINE__)); + if (!(qref = (DPCTLSyclQueueRef)DPEXRTQueue_CreateFromFilterString(device))) + { + DPEXRT_DEBUG( + nrt_debug_print("DPEXRT-ERROR: Could not create a sycl::queue from " + "filter string: %s at %s %d.\n", + device, __FILE__, __LINE__)); goto error; } - if (!(dref = DPCTLDevice_CreateFromSelector(dselector))) - goto error; - - if (!(qref = DPCTLQueue_CreateForDevice(dref, NULL, 0))) - goto error; - - DPCTLDeviceSelector_Delete(dselector); - DPCTLDevice_Delete(dref); - // Allocate a new NRT_ExternalAllocator if (!(ext_alloca = NRT_ExternalAllocator_new_for_usm(qref, usm_type))) goto error; @@ -407,7 +501,7 @@ DPEXRT_MemInfo_alloc(npy_intp size, size_t usm_type, const char *device) mi->size = size; mi->external_allocator = ext_alloca; - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-DEBUG: DPEXRT_MemInfo_alloc mi=%p " "external_allocator=%p for usm_type %zu on device %s, %s at %d\n", mi, ext_alloca, usm_type, device, __FILE__, __LINE__)); @@ -418,8 +512,6 @@ DPEXRT_MemInfo_alloc(npy_intp size, size_t usm_type, const char *device) free(mi); free(ext_alloca); free(midtor_info); - DPCTLDeviceSelector_Delete(dselector); - DPCTLDevice_Delete(dref); return NULL; } @@ -444,8 +536,6 @@ static NRT_MemInfo *DPEXRT_MemInfo_fill(NRT_MemInfo *mi, uint8_t value, const char *device) { - DPCTLSyclDeviceSelectorRef dselector = NULL; - DPCTLSyclDeviceRef dref = NULL; DPCTLSyclQueueRef qref = NULL; DPCTLSyclEventRef eref = NULL; size_t count = 0, size = 0, exp = 0; @@ -455,34 +545,20 @@ static NRT_MemInfo *DPEXRT_MemInfo_fill(NRT_MemInfo *mi, exp++; count = (unsigned int)(size >> exp); - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-DEBUG: mi->size = %u, itemsize = %u, count = %u, " "value = %u, Inside DPEXRT_MemInfo_fill %s, line %d\n", mi->size, itemsize << exp, count, value, __FILE__, __LINE__)); if (mi->data == NULL) { - NRT_Debug(nrt_debug_print("DPEXRT-DEBUG: mi->data is NULL, " - "Inside DPEXRT_MemInfo_fill %s, line 
%d\n", - __FILE__, __LINE__)); + DPEXRT_DEBUG(nrt_debug_print("DPEXRT-DEBUG: mi->data is NULL, " + "Inside DPEXRT_MemInfo_fill %s, line %d\n", + __FILE__, __LINE__)); goto error; } - if (!(dselector = DPCTLFilterSelector_Create(device))) { - NRT_Debug(nrt_debug_print( - "DPEXRT-ERROR: Could not create a sycl::device_selector from " - "filter string: %s at %s %d.\n", - device, __FILE__, __LINE__)); + if (!(qref = (DPCTLSyclQueueRef)DPEXRTQueue_CreateFromFilterString(device))) goto error; - } - - if (!(dref = DPCTLDevice_CreateFromSelector(dselector))) - goto error; - - if (!(qref = DPCTLQueue_CreateForDevice(dref, NULL, 0))) - goto error; - - DPCTLDeviceSelector_Delete(dselector); - DPCTLDevice_Delete(dref); switch (exp) { case 3: @@ -535,8 +611,6 @@ static NRT_MemInfo *DPEXRT_MemInfo_fill(NRT_MemInfo *mi, error: DPCTLQueue_Delete(qref); DPCTLEvent_Delete(eref); - DPCTLDeviceSelector_Delete(dselector); - DPCTLDevice_Delete(dref); return NULL; } @@ -616,20 +690,20 @@ static int DPEXRT_sycl_usm_ndarray_from_python(PyObject *obj, // collecting the array. Py_IncRef(obj); - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-DEBUG: In DPEXRT_sycl_usm_ndarray_from_python.\n")); // Check if the PyObject obj has an _array_obj attribute that is of // dpctl.tensor.usm_ndarray type. if (!(arrayobj = PyUSMNdArray_ARRAYOBJ(obj))) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: PyUSMNdArray_ARRAYOBJ check failed %d\n", __FILE__, __LINE__)); goto error; } if (!(ndim = UsmNDArray_GetNDim(arrayobj))) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: UsmNDArray_GetNDim returned 0 at %s, line %d\n", __FILE__, __LINE__)); goto error; @@ -640,7 +714,7 @@ static int DPEXRT_sycl_usm_ndarray_from_python(PyObject *obj, nitems = product_of_shape(shape, ndim); itemsize = (npy_intp)UsmNDArray_GetElementSize(arrayobj); if (!(qref = UsmNDArray_GetQueueRef(arrayobj))) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: UsmNDArray_GetQueueRef returned NULL at " "%s, line %d.\n", __FILE__, __LINE__)); @@ -650,7 +724,7 @@ static int DPEXRT_sycl_usm_ndarray_from_python(PyObject *obj, if (!(arystruct->meminfo = NRT_MemInfo_new_from_usmndarray( obj, data, nitems, itemsize, qref))) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: NRT_MemInfo_new_from_usmndarray failed " "at %s, line %d.\n", __FILE__, __LINE__)); @@ -698,7 +772,7 @@ static int DPEXRT_sycl_usm_ndarray_from_python(PyObject *obj, // code of -1. // Decref the Pyobject of the array // ensure the GIL - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: Failed to unbox dpnp ndarray into a Numba " "arraystruct at %s, line %d\n", __FILE__, __LINE__)); @@ -732,7 +806,7 @@ static PyObject *box_from_arystruct_parent(arystruct_t *arystruct, struct PyUSMArrayObject *arrayobj = NULL; npy_intp itemsize = 0; - NRT_Debug(nrt_debug_print("DPEXRT-DEBUG: In try_to_return_parent.\n")); + DPEXRT_DEBUG(nrt_debug_print("DPEXRT-DEBUG: In try_to_return_parent.\n")); if (!(arrayobj = PyUSMNdArray_ARRAYOBJ(arystruct->parent))) return NULL; @@ -778,7 +852,7 @@ static PyObject *box_from_arystruct_parent(arystruct_t *arystruct, // parent, we need to increment the reference count of the parent here. 
Py_IncRef(array); - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-DEBUG: try_to_return_parent found a valid parent.\n")); /* Yes, it is the same array return a new reference */ @@ -818,7 +892,7 @@ DPEXRT_sycl_usm_ndarray_to_python_acqref(arystruct_t *arystruct, int exp = 0; npy_intp itemsize = 0; - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-DEBUG: In DPEXRT_sycl_usm_ndarray_to_python_acqref.\n")); if (descr == NULL) { @@ -837,7 +911,7 @@ DPEXRT_sycl_usm_ndarray_to_python_acqref(arystruct_t *arystruct, // If the arystruct has a parent attribute, try to box the parent and // return it. if (arystruct->parent) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-DEBUG: arystruct has a parent, therefore " "trying to box and return the parent at %s, line %d\n", __FILE__, __LINE__)); @@ -875,8 +949,8 @@ DPEXRT_sycl_usm_ndarray_to_python_acqref(arystruct_t *arystruct, NRT_MemInfo_acquire(arystruct->meminfo); status = MemInfo_init(miobj, args, NULL); if (status != 0) { - NRT_Debug(nrt_debug_print("MemInfo_init failed at %s, line %d\n", - __FILE__, __LINE__)); + DPEXRT_DEBUG(nrt_debug_print("MemInfo_init failed at %s, line %d\n", + __FILE__, __LINE__)); Py_DECREF(args); PyErr_Format(PyExc_ValueError, "In 'DPEXRT_sycl_usm_ndarray_to_python_acqref', " @@ -960,7 +1034,7 @@ DPEXRT_sycl_usm_ndarray_to_python_acqref(arystruct_t *arystruct, return (PyObject *)NULL; } - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "Returning from DPEXRT_sycl_usm_ndarray_to_python_acqref " "at %s, line %d\n", __FILE__, __LINE__)); @@ -994,6 +1068,9 @@ static PyObject *build_c_helpers_dict(void) &DPEXRT_sycl_usm_ndarray_from_python); _declpointer("DPEXRT_sycl_usm_ndarray_to_python_acqref", &DPEXRT_sycl_usm_ndarray_to_python_acqref); + _declpointer("DPEXRTQueue_CreateFromFilterString", + &DPEXRTQueue_CreateFromFilterString); + _declpointer("DpexrtQueue_SubmitRange", &DpexrtQueue_SubmitRange); _declpointer("DPEXRT_MemInfo_alloc", &DPEXRT_MemInfo_alloc); _declpointer("DPEXRT_MemInfo_fill", &DPEXRT_MemInfo_fill); _declpointer("NRT_ExternalAllocator_new_for_usm", @@ -1044,6 +1121,11 @@ MOD_INIT(_dpexrt_python) PyModule_AddObject( m, "DPEXRT_sycl_usm_ndarray_to_python_acqref", PyLong_FromVoidPtr(&DPEXRT_sycl_usm_ndarray_to_python_acqref)); + + PyModule_AddObject(m, "DPEXRTQueue_CreateFromFilterString", + PyLong_FromVoidPtr(&DPEXRTQueue_CreateFromFilterString)); + PyModule_AddObject(m, "DpexrtQueue_SubmitRange", + PyLong_FromVoidPtr(&DpexrtQueue_SubmitRange)); PyModule_AddObject(m, "DPEXRT_MemInfo_alloc", PyLong_FromVoidPtr(&DPEXRT_MemInfo_alloc)); PyModule_AddObject(m, "DPEXRT_MemInfo_fill", diff --git a/numba_dpex/core/runtime/context.py b/numba_dpex/core/runtime/context.py index 8e70baef67..3a23a819c0 100644 --- a/numba_dpex/core/runtime/context.py +++ b/numba_dpex/core/runtime/context.py @@ -129,7 +129,7 @@ def arraystruct_from_python(self, pyapi, obj, ptr): return self.error def usm_ndarray_to_python_acqref(self, pyapi, aryty, ary, dtypeptr): - """_summary_ + """Boxes a DpnpNdArray native object into a Python dpnp.ndarray. 
Args: pyapi (_type_): _description_ @@ -140,14 +140,14 @@ def usm_ndarray_to_python_acqref(self, pyapi, aryty, ary, dtypeptr): Returns: _type_: _description_ """ - args = [ + argtys = [ pyapi.voidptr, pyapi.pyobj, llvmir.IntType(32), llvmir.IntType(32), pyapi.pyobj, ] - fnty = llvmir.FunctionType(pyapi.pyobj, args) + fnty = llvmir.FunctionType(pyapi.pyobj, argtys) fn = pyapi._get_function( fnty, "DPEXRT_sycl_usm_ndarray_to_python_acqref" ) @@ -165,3 +165,86 @@ def usm_ndarray_to_python_acqref(self, pyapi, aryty, ary, dtypeptr): args = [ptr, serial_aryty_pytype, ndim, writable, dtypeptr] return pyapi.builder.call(fn, args) + + def get_queue_from_filter_string(self, builder, device): + """Calls DPEXRTQueue_CreateFromFilterString to create a new sycl::queue + from a given filter string. + + Args: + device (llvmlite.ir.values.FormattedConstant): An LLVM ArrayType + storing a const string for a DPC++ filter selector string. + + Returns: A DPCTLSyclQueueRef pointer. + """ + mod = builder.module + fnty = llvmir.FunctionType( + cgutils.voidptr_t, + [cgutils.voidptr_t], + ) + fn = cgutils.get_or_insert_function( + mod, fnty, "DPEXRTQueue_CreateFromFilterString" + ) + fn.return_value.add_attribute("noalias") + + ret = builder.call(fn, [device]) + + return ret + + def submit_range( + self, + builder, + kref, + qref, + args, + argtys, + nargs, + range, + nrange, + depevents, + ndepevents, + ): + """Calls DPEXRTQueue_CreateFromFilterString to create a new sycl::queue + from a given filter string. + + Args: + device (llvmlite.ir.values.FormattedConstant): An LLVM ArrayType + storing a const string for a DPC++ filter selector string. + + Returns: A DPCTLSyclQueueRef pointer. + """ + mod = builder.module + fnty = llvmir.FunctionType( + llvmir.types.VoidType(), + [ + cgutils.voidptr_t, + cgutils.voidptr_t, + cgutils.voidptr_t.as_pointer(), + cgutils.int32_t.as_pointer(), + llvmir.IntType(64), + llvmir.IntType(64).as_pointer(), + llvmir.IntType(64), + cgutils.voidptr_t, + llvmir.IntType(64), + ], + ) + fn = cgutils.get_or_insert_function( + mod, fnty, "DpexrtQueue_SubmitRange" + ) + # fn.return_value.add_attribute("noalias") + + ret = builder.call( + fn, + [ + kref, + qref, + args, + argtys, + nargs, + range, + nrange, + depevents, + ndepevents, + ], + ) + + return ret diff --git a/numba_dpex/core/types/dpnp_ndarray_type.py b/numba_dpex/core/types/dpnp_ndarray_type.py index 23df5d9477..75d77141c4 100644 --- a/numba_dpex/core/types/dpnp_ndarray_type.py +++ b/numba_dpex/core/types/dpnp_ndarray_type.py @@ -108,7 +108,7 @@ def __allocate__( # Assume str(dtype) returns a valid type dtype_str = str(dtype) # alloc call: lhs = empty_attr(size_var, typ_var) - typ_var = ir.Var(scope, mk_unique_var("$np_typ_var"), loc) + typ_var = ir.Var(scope, mk_unique_var("$np_dtype_var"), loc) if typemap: typemap[typ_var.name] = types.functions.NumberClass(dtype) # If dtype is a datetime/timedelta with a unit, @@ -126,11 +126,53 @@ def __allocate__( dtype_str = "bool_" np_typ_getattr = ir.Expr.getattr(g_np_var, dtype_str, loc) typ_var_assign = ir.Assign(np_typ_getattr, typ_var, loc) - alloc_call = ir.Expr.call(attr_var, [size_var, typ_var], (), loc) + + # A default usm_type arg added as a placeholder + layout_var = ir.Var(scope, mk_unique_var("$layout_var"), loc) + usm_typ_var = ir.Var(scope, mk_unique_var("$np_usm_type_var"), loc) + # A default device string arg added as a placeholder + device_typ_var = ir.Var(scope, mk_unique_var("$np_device_var"), loc) + + if typemap: + typemap[layout_var.name] = types.literal(lhs_typ.layout) + 
typemap[usm_typ_var.name] = types.literal(lhs_typ.usm_type) + typemap[device_typ_var.name] = types.literal(lhs_typ.device) + + layout_var_assign = ir.Assign( + ir.Const(lhs_typ.layout, loc), layout_var, loc + ) + usm_typ_var_assign = ir.Assign( + ir.Const(lhs_typ.usm_type, loc), usm_typ_var, loc + ) + device_typ_var_assign = ir.Assign( + ir.Const(lhs_typ.device, loc), device_typ_var, loc + ) + + out.extend( + [layout_var_assign, usm_typ_var_assign, device_typ_var_assign] + ) + + alloc_call = ir.Expr.call( + attr_var, + [size_var, typ_var, layout_var, device_typ_var, usm_typ_var], + (), + loc, + ) if calltypes: cac = typemap[attr_var.name].get_call_type( - typingctx, [size_typ, types.functions.NumberClass(dtype)], {} + typingctx, + [ + typemap[x.name] + for x in [ + size_var, + typ_var, + layout_var, + device_typ_var, + usm_typ_var, + ] + ], + {}, ) # By default, all calls to "empty" are typed as returning a standard # NumPy ndarray. If we are allocating a ndarray subclass here then diff --git a/numba_dpex/core/types/usm_ndarray_type.py b/numba_dpex/core/types/usm_ndarray_type.py index 6fa1ff79c3..c9eb97c40d 100644 --- a/numba_dpex/core/types/usm_ndarray_type.py +++ b/numba_dpex/core/types/usm_ndarray_type.py @@ -60,7 +60,7 @@ def __init__( "a SYCL filter selector" ) self.queue = dpctl.SyclQueue(device) - self.device = device + self.device = self.queue.sycl_device.filter_string elif queue is not None and device == "unknown": if not isinstance(queue, dpctl.SyclQueue): raise TypeError( @@ -136,7 +136,7 @@ def copy( device = self.device if usm_type is None: usm_type = self.usm_type - return USMNdArray( + return type(self)( dtype=dtype, ndim=ndim, layout=layout, @@ -169,7 +169,7 @@ def unify(self, typingctx, other): layout = "A" readonly = not (self.mutable and other.mutable) aligned = self.aligned and other.aligned - return USMNdArray( + return type(self)( dtype=self.dtype, ndim=self.ndim, layout=layout, diff --git a/numba_dpex/core/typing/dpnpdecl.py b/numba_dpex/core/typing/dpnpdecl.py new file mode 100644 index 0000000000..7c8cf817b5 --- /dev/null +++ b/numba_dpex/core/typing/dpnpdecl.py @@ -0,0 +1,198 @@ +# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpnp +import numpy as np +from numba.core import types +from numba.core.typing.npydecl import ( + Numpy_rules_ufunc, + NumpyRulesArrayOperator, + NumpyRulesInplaceArrayOperator, + NumpyRulesUnaryArrayOperator, + infer_global, +) + + +class DpnpRulesArrayOperator(NumpyRulesArrayOperator): + @property + def ufunc(self): + try: + op = getattr(dpnp, self._op_map[self.key]) + npop = getattr(np, self._op_map[self.key]) + op.nin = npop.nin + op.nout = npop.nout + op.nargs = npop.nargs + op.types = npop.types + op.is_dpnp_ufunc = True + return op + except: + pass + + +class DpnpRulesInplaceArrayOperator(NumpyRulesInplaceArrayOperator): + pass + + +class DpnpRulesUnaryArrayOperator(NumpyRulesUnaryArrayOperator): + pass + + +# list of unary ufuncs to register +_math_operations = [ + "add", + "subtract", + "multiply", + "floor_divide", + "negative", + "power", + "remainder", + "fmod", + "absolute", + "sign", + "conjugate", + "exp", + "exp2", + "log", + "log2", + "log10", + "expm1", + "log1p", + "sqrt", + "square", + "cbrt", + "reciprocal", + "divide", + "true_divide", + "mod", + "abs", + "fabs", + "erf", +] + +_trigonometric_functions = [ + "sin", + "cos", + "tan", + "arcsin", + "arccos", + "arctan", + "arctan2", + "hypot", + "sinh", + "cosh", + "tanh", + "arcsinh", + "arccosh", + "arctanh", + 
"deg2rad", + "rad2deg", + "degrees", + "radians", +] + +_bit_twiddling_functions = [ + "bitwise_and", + "bitwise_or", + "bitwise_xor", + "invert", + "left_shift", + "right_shift", + "bitwise_not", +] + +_comparison_functions = [ + "greater", + "greater_equal", + "less", + "less_equal", + "not_equal", + "equal", + "logical_and", + "logical_or", + "logical_xor", + "logical_not", + "maximum", + "minimum", + "fmax", + "fmin", +] + +_floating_functions = [ + "isfinite", + "isinf", + "isnan", + "copysign", + "modf", + "frexp", + "floor", + "ceil", + "trunc", +] + +_logic_functions = [] + + +# This is a set of the ufuncs that are not yet supported by Lowering. In order +# to trigger no-python mode we must not register them until their Lowering is +# implemented. +# +# It also works as a nice TODO list for ufunc support :) +_unsupported = set( + [ + "frexp", # Not supported by Numba + "modf", # Not supported by Numba + "logaddexp", + "logaddexp2", + "positive", + "float_power", + "rint", + "divmod", + "gcd", + "lcm", + "signbit", + "nextafter", + "ldexp", + "spacing", + "isnat", + ] +) + +# A list of ufuncs that are in fact aliases of other ufuncs. They need to insert +# the resolve method, but not register the ufunc itself +_aliases = set(["bitwise_not", "mod", "abs"]) + +all_ufuncs = sum( + [ + _math_operations, + _trigonometric_functions, + _bit_twiddling_functions, + _comparison_functions, + _floating_functions, + _logic_functions, + ], + [], +) + +supported_ufuncs = [x for x in all_ufuncs if x not in _unsupported] + + +def _dpnp_ufunc(name): + func = getattr(dpnp, name) + + class typing_class(Numpy_rules_ufunc): + key = func + + typing_class.__name__ = "resolve_{0}".format(name) + + if name not in _aliases: # if not name in _aliases + infer_global(func, types.Function(typing_class)) + + +for func in supported_ufuncs: + _dpnp_ufunc(func) + + +DpnpRulesArrayOperator.install_operations() +DpnpRulesInplaceArrayOperator.install_operations() +DpnpRulesUnaryArrayOperator.install_operations() diff --git a/numba_dpex/core/utils/kernel_builder.py b/numba_dpex/core/utils/kernel_builder.py new file mode 100644 index 0000000000..84326539ae --- /dev/null +++ b/numba_dpex/core/utils/kernel_builder.py @@ -0,0 +1,616 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import copy +import sys +import warnings + +import dpctl.program as dpctl_prog +import dpnp +import numba +from numba.core import compiler, ir, types +from numba.core.errors import NumbaParallelSafetyWarning +from numba.core.ir_utils import ( + add_offset_to_labels, + get_name_var_table, + get_unused_var_name, + legalize_names, + mk_unique_var, + remove_dead, + remove_dels, + rename_labels, + replace_var_names, +) +from numba.core.typing import signature + +import numba_dpex as dpex +from numba_dpex import config + +from ..descriptor import dpex_kernel_target +from ..passes import parfor +from ..types.dpnp_ndarray_type import DpnpNdArray + + +class GufuncKernel: + def __init__( + self, + name, + kernel, + signature, + kernel_args, + kernel_arg_types, + queue, + ): + self.name = name + self.kernel = kernel + self.signature = signature + self.kernel_args = kernel_args + self.kernel_arg_types = kernel_arg_types + self.queue = queue + + +def _print_block(block): + for i, inst in enumerate(block.body): + print(" ", i, inst) + + +def _print_body(body_dict): + """Pretty-print a set of IR blocks.""" + for label, block in body_dict.items(): + print("label: ", label) + _print_block(block) + + +def 
_compile_kernel_parfor( + sycl_queue, kernel_name, func_ir, argtypes, debug=False +): + # Create a SPIRVKernel object + kernel = dpex.core.kernel_interface.spirv_kernel.SpirvKernel( + func_ir, kernel_name + ) + + # compile the kernel + kernel.compile( + args=argtypes, + typing_ctx=dpex_kernel_target.typing_context, + target_ctx=dpex_kernel_target.target_context, + debug=debug, + compile_flags=None, + ) + + dpctl_create_program_from_spirv_flags = [] + if debug or config.OPT == 0: + # if debug is ON we need to pass additional flags to igc. + dpctl_create_program_from_spirv_flags = ["-g", "-cl-opt-disable"] + + # create a program + kernel_bundle = dpctl_prog.create_program_from_spirv( + sycl_queue, + kernel.device_driver_ir_module, + " ".join(dpctl_create_program_from_spirv_flags), + ) + # create a kernel + sycl_kernel = kernel_bundle.get_sycl_kernel(kernel.module_name) + + return sycl_kernel + + +def _legalize_names_with_typemap(names, typemap): + """Replace illegal characters in Numba IR var names. + + We use ir_utils.legalize_names to replace internal IR variable names + containing illegal characters (e.g. period) with a legal character + (underscore) so as to create legal variable names. The original variable + names are in the typemap so we also need to add the legalized name to the + typemap as well. + """ + outdict = legalize_names(names) + # For each pair in the dict of legalized names... + for x, y in outdict.items(): + # If the name had some legalization change to it... + if x != y: + # Set the type of the new name the same as the type of the old name. + typemap[y] = typemap[x] + return outdict + + +def _to_scalar_from_0d(x): + if isinstance(x, types.ArrayCompatible) and x.ndim == 0: + return x.dtype + else: + return x + + +def _replace_var_with_array_in_block(vars, block, typemap, calltypes): + new_block = [] + for inst in block.body: + if isinstance(inst, ir.Assign) and inst.target.name in vars: + const_node = ir.Const(0, inst.loc) + const_var = ir.Var( + inst.target.scope, mk_unique_var("$const_ind_0"), inst.loc + ) + typemap[const_var.name] = types.uintp + const_assign = ir.Assign(const_node, const_var, inst.loc) + new_block.append(const_assign) + + setitem_node = ir.SetItem( + inst.target, const_var, inst.value, inst.loc + ) + calltypes[setitem_node] = signature( + types.none, + types.npytypes.Array(typemap[inst.target.name], 1, "C"), + types.intp, + typemap[inst.target.name], + ) + new_block.append(setitem_node) + continue + elif isinstance(inst, parfor.Parfor): + _replace_var_with_array_internal( + vars, {0: inst.init_block}, typemap, calltypes + ) + _replace_var_with_array_internal( + vars, inst.loop_body, typemap, calltypes + ) + + new_block.append(inst) + return new_block + + +def _replace_var_with_array_internal(vars, loop_body, typemap, calltypes): + for label, block in loop_body.items(): + block.body = _replace_var_with_array_in_block( + vars, block, typemap, calltypes + ) + + +def _replace_var_with_array(vars, loop_body, typemap, calltypes): + _replace_var_with_array_internal(vars, loop_body, typemap, calltypes) + for v in vars: + el_typ = typemap[v] + typemap.pop(v, None) + typemap[v] = types.npytypes.Array(el_typ, 1, "C") + + +def _dbgprint_after_each_array_assignments(lowerer, loop_body, typemap): + for label, block in loop_body.items(): + new_block = block.copy() + new_block.clear() + loc = block.loc + scope = block.scope + for inst in block.body: + new_block.append(inst) + # Append print after assignment + if isinstance(inst, ir.Assign): + # Only apply to numbers 
+ if typemap[inst.target.name] not in types.number_domain: + continue + + # Make constant string + strval = "{} =".format(inst.target.name) + strconsttyp = types.StringLiteral(strval) + + lhs = ir.Var(scope, mk_unique_var("str_const"), loc) + assign_lhs = ir.Assign( + value=ir.Const(value=strval, loc=loc), target=lhs, loc=loc + ) + typemap[lhs.name] = strconsttyp + new_block.append(assign_lhs) + + # Make print node + print_node = ir.Print( + args=[lhs, inst.target], vararg=None, loc=loc + ) + new_block.append(print_node) + sig = signature( + types.none, typemap[lhs.name], typemap[inst.target.name] + ) + lowerer.fndesc.calltypes[print_node] = sig + loop_body[label] = new_block + + +def _generate_kernel_stub_as_string( + kernel_name, + parfor_params, + parfor_dim, + legal_loop_indices, + loop_ranges, + param_dict, + has_reduction, + redvars, + typemap, + redvars_dict, + sentinel_name, +): + """Generates a stub dpex kernel for the parfor. + + Returns: + str: A string representing a stub kernel function for the parfor. + """ + kernel_txt = "" + + # Create the dpex kernel function. + kernel_txt += "def " + kernel_name + kernel_txt += "(" + (", ".join(parfor_params)) + "):\n" + global_id_dim = 0 + for_loop_dim = parfor_dim + + if parfor_dim > 3: + raise NotImplementedError + global_id_dim = 3 + else: + global_id_dim = parfor_dim + + for eachdim in range(global_id_dim): + kernel_txt += ( + " " + + legal_loop_indices[eachdim] + + " = " + + "dpex.get_global_id(" + + str(eachdim) + + ")\n" + ) + + for eachdim in range(global_id_dim, for_loop_dim): + for indent in range(1 + (eachdim - global_id_dim)): + kernel_txt += " " + + start, stop, step = loop_ranges[eachdim] + start = param_dict.get(str(start), start) + stop = param_dict.get(str(stop), stop) + kernel_txt += ( + "for " + + legal_loop_indices[eachdim] + + " in range(" + + str(start) + + ", " + + str(stop) + + " + 1):\n" + ) + + for eachdim in range(global_id_dim, for_loop_dim): + for indent in range(1 + (eachdim - global_id_dim)): + kernel_txt += " " + + # Add the sentinel assignment so that we can find the loop body position + # in the IR. + kernel_txt += " " + kernel_txt += sentinel_name + " = 0\n" + + # A kernel function does not return anything + kernel_txt += " return None\n" + + return kernel_txt + + +def _wrap_loop_body(loop_body): + blocks = loop_body.copy() # shallow copy is enough + first_label = min(blocks.keys()) + last_label = max(blocks.keys()) + loc = blocks[last_label].loc + blocks[last_label].body.append(ir.Jump(first_label, loc)) + return blocks + + +def _unwrap_loop_body(loop_body): + last_label = max(loop_body.keys()) + loop_body[last_label].body = loop_body[last_label].body[:-1] + + +def _find_setitems_block(setitems, block, typemap): + for inst in block.body: + if isinstance(inst, ir.StaticSetItem) or isinstance(inst, ir.SetItem): + setitems.add(inst.target.name) + elif isinstance(inst, parfor.Parfor): + _find_setitems_block(setitems, inst.init_block, typemap) + _find_setitems_body(setitems, inst.loop_body, typemap) + + +def _find_setitems_body(setitems, loop_body, typemap): + """ + Find the arrays that are written into (goes into setitems) + """ + for label, block in loop_body.items(): + _find_setitems_block(setitems, block, typemap) + + +def _replace_sentinel_with_parfor_body(kernel_ir, sentinel_name, loop_body): + # new label for splitting sentinel block + new_label = max(loop_body.keys()) + 1 + + # Search all the block in the kernel function for the sentinel assignment. 
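+    # A sketch of the rewrite done below (labels are illustrative): a block of
+    # the form
+    #     B0: [stmts_before; __sentinel__ = 0; stmts_after]
+    # becomes
+    #     B0: [stmts_before] -> parfor loop-body blocks -> B_new: [stmts_after]
+    # i.e. the parfor body is spliced in at the sentinel's position.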
+ for label, block in kernel_ir.blocks.items(): + for i, inst in enumerate(block.body): + if ( + isinstance(inst, ir.Assign) + and inst.target.name == sentinel_name + ): + # We found the sentinel assignment. + loc = inst.loc + scope = block.scope + # split block across __sentinel__ + # A new block is allocated for the statements prior to the + # sentinel but the new block maintains the current block label. + prev_block = ir.Block(scope, loc) + prev_block.body = block.body[:i] + + # The current block is used for statements after the sentinel. + block.body = block.body[i + 1 :] # noqa: E203 + # But the current block gets a new label. + body_first_label = min(loop_body.keys()) + + # The previous block jumps to the minimum labelled block of the + # parfor body. + prev_block.append(ir.Jump(body_first_label, loc)) + # Add all the parfor loop body blocks to the kernel function's + # IR. + for loop, b in loop_body.items(): + kernel_ir.blocks[loop] = b + body_last_label = max(loop_body.keys()) + kernel_ir.blocks[new_label] = block + kernel_ir.blocks[label] = prev_block + # Add a jump from the last parfor body block to the block + # containing statements after the sentinel. + kernel_ir.blocks[body_last_label].append( + ir.Jump(new_label, loc) + ) + break + else: + continue + break + + +def create_kernel_for_parfor( + lowerer, + parfor_node, + typemap, + flags, + loop_ranges, + has_aliases, + races, + parfor_outputs, +): + """ + Creates a numba_dpex.kernel function for a parfor node. + + There are two parts to this function: + + 1) Code to iterate across the iteration space as defined by + the schedule. + 2) The parfor body that does the work for a single point in + the iteration space. + + Part 1 is created as Python text for simplicity with a sentinel + assignment to mark the point in the IR where the parfor body + should be added. This Python text is 'exec'ed into existence and its + IR retrieved with run_frontend. The IR is scanned for the sentinel + assignment where that basic block is split and the IR for the parfor + body inserted. + """ + loc = parfor_node.init_block.loc + + # The parfor body and the main function body share ir.Var nodes. + # We have to do some replacements of Var names in the parfor body + # to make them legal parameter names. If we don't copy then the + # Vars in the main function also would incorrectly change their name. + loop_body = copy.copy(parfor_node.loop_body) + remove_dels(loop_body) + + parfor_dim = len(parfor_node.loop_nests) + loop_indices = [ + loop_nest.index_variable.name for loop_nest in parfor_node.loop_nests + ] + + # Get all the parfor params. + parfor_params = parfor_node.params + + # Get all parfor reduction vars, and operators. + typemap = lowerer.fndesc.typemap + + parfor_redvars, parfor_reddict = parfor.get_parfor_reductions( + lowerer.func_ir, parfor_node, parfor_params, lowerer.fndesc.calltypes + ) + has_reduction = False if len(parfor_redvars) == 0 else True + + if has_reduction: + raise NotImplementedError + + # Compute just the parfor inputs as a set difference. + parfor_inputs = sorted(list(set(parfor_params) - set(parfor_outputs))) + + for race in races: + msg = ( + "Variable %s used in parallel loop may be written " + "to simultaneously by multiple workers and may result " + "in non-deterministic or unintended results." % race + ) + warnings.warn(NumbaParallelSafetyWarning(msg, loc)) + + _replace_var_with_array(races, loop_body, typemap, lowerer.fndesc.calltypes) + + # Reorder all the params so that inputs go first then outputs. 
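+    # The same inputs-then-outputs order is recorded in the returned
+    # GufuncKernel (kernel_args / kernel_arg_types), so the kernel launch
+    # arguments can later be packed in a matching order.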
+ parfor_params = parfor_inputs + parfor_outputs + + # Some Var and loop_indices may not have legal parameter names so create a + # dict of potentially illegal param name to guaranteed legal name. + param_dict = _legalize_names_with_typemap(parfor_params, typemap) + ind_dict = _legalize_names_with_typemap(loop_indices, typemap) + redvars_dict = legalize_names(parfor_redvars) + + # Compute a new list of legal loop index names. + legal_loop_indices = [ind_dict[v] for v in loop_indices] + # Get the types of each parameter. + param_types = [_to_scalar_from_0d(typemap[v]) for v in parfor_params] + # Calculate types of args passed to the kernel function. + func_arg_types = [typemap[v] for v in (parfor_inputs + parfor_outputs)] + + # Replace illegal parameter names in the loop body with legal ones. + replace_var_names(loop_body, param_dict) + + # remember the name before legalizing as the actual arguments + parfor_args = parfor_params + # Change parfor_params to be legal names. + parfor_params = [param_dict[v] for v in parfor_params] + parfor_params_orig = parfor_params + + parfor_params = [] + ascontig = False + for pindex in range(len(parfor_params_orig)): + if ( + ascontig + and pindex < len(parfor_inputs) + and isinstance(param_types[pindex], types.npytypes.Array) + ): + parfor_params.append(parfor_params_orig[pindex] + "param") + else: + parfor_params.append(parfor_params_orig[pindex]) + + # Change parfor body to replace illegal loop index vars with legal ones. + replace_var_names(loop_body, ind_dict) + + loop_body_var_table = get_name_var_table(loop_body) + sentinel_name = get_unused_var_name("__sentinel__", loop_body_var_table) + + if config.DEBUG_ARRAY_OPT >= 1: + print("legal parfor_params = ", parfor_params, type(parfor_params)) + + # Determine the unique names of the kernel functions. + kernel_name = "__numba_parfor_kernel_%s" % (parfor_node.id) + + kernel_fn_txt = _generate_kernel_stub_as_string( + kernel_name, + parfor_params, + parfor_dim, + legal_loop_indices, + loop_ranges, + param_dict, + has_reduction, + parfor_redvars, + typemap, + redvars_dict, + sentinel_name, + ) + + if config.DEBUG_ARRAY_OPT: + print("kernel_fn_txt = ", type(kernel_fn_txt), "\n", kernel_fn_txt) + sys.stdout.flush() + + # Exec the kernel_fn_txt string into existence. 
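+    # For illustration only (parameter and index names are hypothetical,
+    # post-legalization): for a 1-D parfor over parameters (a, b, c) the stub
+    # generated above looks like
+    #
+    #     def __numba_parfor_kernel_0(a, b, c):
+    #         i = dpex.get_global_id(0)
+    #         __sentinel__ = 0
+    #         return None
+    #
+    # The sentinel assignment is replaced with the actual parfor body further
+    # below.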
+ globls = {"dpnp": dpnp, "numba": numba, "dpex": dpex} + locls = {} + exec(kernel_fn_txt, globls, locls) + + kernel_fn = locls[kernel_name] + + if config.DEBUG_ARRAY_OPT: + print("kernel_fn = ", type(kernel_fn), "\n", kernel_fn) + # Get the IR for the kernel_fn dpex kernel + kernel_ir = compiler.run_frontend(kernel_fn) + + if config.DEBUG_ARRAY_OPT: + print("kernel_ir dump ", type(kernel_ir)) + kernel_ir.dump() + print("loop_body dump ", type(loop_body)) + _print_body(loop_body) + + # rename all variables in kernel_ir afresh + var_table = get_name_var_table(kernel_ir.blocks) + new_var_dict = {} + reserved_names = ( + [sentinel_name] + list(param_dict.values()) + legal_loop_indices + ) + for name, var in var_table.items(): + if not (name in reserved_names): + new_var_dict[name] = mk_unique_var(name) + replace_var_names(kernel_ir.blocks, new_var_dict) + if config.DEBUG_ARRAY_OPT: + print("kernel_ir dump after renaming ") + kernel_ir.dump() + + gufunc_param_types = param_types + + if config.DEBUG_ARRAY_OPT: + print( + "gufunc_param_types = ", + type(gufunc_param_types), + "\n", + gufunc_param_types, + ) + + gufunc_stub_last_label = max(kernel_ir.blocks.keys()) + 1 + + # Add gufunc stub last label to each parfor.loop_body label to prevent + # label conflicts. + loop_body = add_offset_to_labels(loop_body, gufunc_stub_last_label) + + # If enabled, add a print statement after every assignment. + if config.DEBUG_ARRAY_OPT_RUNTIME: + _dbgprint_after_each_array_assignments(lowerer, loop_body, typemap) + + _replace_sentinel_with_parfor_body(kernel_ir, sentinel_name, loop_body) + + if config.DEBUG_ARRAY_OPT: + print("kernel_ir last dump before renaming") + kernel_ir.dump() + + kernel_ir.blocks = rename_labels(kernel_ir.blocks) + remove_dels(kernel_ir.blocks) + + old_alias = flags.noalias + if not has_aliases: + if config.DEBUG_ARRAY_OPT: + print("No aliases found so adding noalias flag.") + flags.noalias = True + + remove_dead(kernel_ir.blocks, kernel_ir.arg_names, kernel_ir, typemap) + + if config.DEBUG_ARRAY_OPT: + print("gufunc_ir after remove dead") + kernel_ir.dump() + + kernel_sig = signature(types.none, *gufunc_param_types) + + if config.DEBUG_ARRAY_OPT: + sys.stdout.flush() + + if config.DEBUG_ARRAY_OPT: + print("after DUFunc inline".center(80, "-")) + kernel_ir.dump() + + # The ParforLegalizeCFD pass has already ensured that the LHS and RHS + # arrays are on same device. We can take the queue from the first input + # array and use that to compile the kernel. + + exec_queue = None + + for arg in parfor_args: + obj = typemap[arg] + if isinstance(obj, DpnpNdArray): + exec_queue = obj.queue + + if not exec_queue: + raise AssertionError( + "No execution found for parfor. No way to compile the kernel!" 
+ ) + + sycl_kernel = _compile_kernel_parfor( + exec_queue, + kernel_name, + kernel_ir, + gufunc_param_types, + debug=flags.debuginfo, + ) + + flags.noalias = old_alias + + if config.DEBUG_ARRAY_OPT: + print("kernel_sig = ", kernel_sig) + + return GufuncKernel( + name=kernel_name, + kernel=sycl_kernel, + signature=kernel_sig, + kernel_args=parfor_args, + kernel_arg_types=func_arg_types, + queue=exec_queue, + ) diff --git a/numba_dpex/core/utils/kernel_launcher.py b/numba_dpex/core/utils/kernel_launcher.py new file mode 100644 index 0000000000..d9652a744a --- /dev/null +++ b/numba_dpex/core/utils/kernel_launcher.py @@ -0,0 +1,361 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +from numba.core import cgutils, types + +from numba_dpex import utils +from numba_dpex.core.runtime.context import DpexRTContext +from numba_dpex.dpctl_iface import DpctlCAPIFnBuilder +from numba_dpex.dpctl_iface._helpers import numba_type_to_dpctl_typenum + + +class KernelLaunchIRBuilder: + """ + KernelLaunchIRBuilder(lowerer, cres) + Helper class to build the LLVM IR for the submission of a kernel. + + The class generates LLVM IR inside the current LLVM module that is needed + for submitting kernels. The LLVM Values that + """ + + def __init__(self, lowerer, kernel): + """Create a KernelLauncher for the specified kernel. + + Args: + lowerer: The Numba Lowerer that will be used to generate the code. + kernel: The SYCL kernel for which we are generating the code. + num_inputs: The number of arguments to the kernels. + """ + self.lowerer = lowerer + self.context = self.lowerer.context + self.builder = self.lowerer.builder + self.kernel = kernel + self.kernel_addr = self.kernel.addressof_ref() + self.rtctx = DpexRTContext(self.context) + + def _build_nullptr(self): + """Builds the LLVM IR to represent a null pointer. + + Returns: An LLVM Value storing a null pointer + """ + zero = cgutils.alloca_once(self.builder, utils.LLVMTypes.int64_t) + self.builder.store(self.context.get_constant(types.int64, 0), zero) + return self.builder.bitcast( + zero, utils.get_llvm_type(context=self.context, type=types.voidptr) + ) + + def _build_array_attr_arg( + self, + array_val, + array_attr_pos, + array_attr_ty, + arg_list, + args_ty_list, + arg_num, + ): + array_attr = self.builder.gep( + array_val, + [ + self.context.get_constant(types.int32, 0), + self.context.get_constant(types.int32, array_attr_pos), + ], + ) + + # FIXME: If pointer arg then load it to some value and pass that value. + # We also most likely need an address space cast + if isinstance(array_attr_ty, types.misc.RawPointer): + array_attr = self.builder.load(array_attr) + + self.build_arg( + val=array_attr, + ty=array_attr_ty, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num, + ) + + def _build_flattened_array_args( + self, array_val, array_attr_pos, ndims, arg_list, args_ty_list, arg_num + ): + array_attr = self.builder.gep( + array_val, + [ + self.context.get_constant(types.int32, 0), + self.context.get_constant(types.int32, array_attr_pos), + ], + ) + + for ndim in range(ndims): + self._build_array_attr_arg( + array_val=array_attr, + array_attr_pos=ndim, + array_attr_ty=types.int64, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num + ndim, + ) + + def build_arg(self, val, ty, arg_list, args_ty_list, arg_num): + """Stores the kernel arguments and the kernel argument types into + arrays that will be passed to DPCTLQueue_SubmitRange. 
+ + Args: + val: An LLVM IR Value that will be stored into the arguments array + ty: A Numba type that will be converted to a DPCTLKernelArgType + enum and stored into the argument types list array + arg_list: An LLVM IR Value array that stores the kernel arguments + args_ty_list: An LLVM IR Value array that stores the + DPCTLKernelArgType enum value for each kernel argument + arg_num: The index position at which the arg_list and args_ty_list + need to be updated. + """ + kernel_arg_dst = self.builder.gep( + arg_list, + [self.context.get_constant(types.int32, arg_num)], + ) + kernel_arg_ty_dst = self.builder.gep( + args_ty_list, + [self.context.get_constant(types.int32, arg_num)], + ) + val = self.builder.bitcast( + val, + utils.get_llvm_type(context=self.context, type=types.voidptr), + ) + self.builder.store(val, kernel_arg_dst) + self.builder.store( + numba_type_to_dpctl_typenum(self.context, ty), kernel_arg_ty_dst + ) + + def build_array_arg( + self, array_val, array_rank, arg_list, args_ty_list, arg_num + ): + """Creates a list of LLVM Values for an unpacked DpnpNdArray kernel + argument. + + The steps performed here are the same as in + numba_dpex.core.kernel_interface.arg_pack_unpacker._unpack_array_helper + """ + # Argument 1: Null pointer for the NRT_MemInfo attribute of the array + nullptr = self._build_nullptr() + self.build_arg( + val=nullptr, + ty=types.int64, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num, + ) + arg_num += 1 + # Argument 2: Null pointer for the Parent attribute of the array + nullptr = self._build_nullptr() + self.build_arg( + val=nullptr, + ty=types.int64, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num, + ) + arg_num += 1 + # Argument 3: Array size + self._build_array_attr_arg( + array_val=array_val, + array_attr_pos=2, + array_attr_ty=types.int64, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num, + ) + arg_num += 1 + # Argument 4: itemsize + self._build_array_attr_arg( + array_val=array_val, + array_attr_pos=3, + array_attr_ty=types.int64, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num, + ) + arg_num += 1 + # Argument 5: data pointer + self._build_array_attr_arg( + array_val=array_val, + array_attr_pos=4, + array_attr_ty=types.voidptr, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num, + ) + arg_num += 1 + # Arguments for flattened shape + self._build_flattened_array_args( + array_val=array_val, + array_attr_pos=5, + ndims=array_rank, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num, + ) + arg_num += array_rank + # Arguments for flattened stride + self._build_flattened_array_args( + array_val=array_val, + array_attr_pos=6, + ndims=array_rank, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num, + ) + arg_num += array_rank + + def get_queue(self, exec_queue): + """Allocates memory on the stack to store a DPCTLSyclQueueRef. + + Returns: A LLVM Value storing the pointer to the SYCL queue created + using the filter string for the Python exec_queue (dpctl.SyclQueue). 
+        """
+
+        # Allocate a stack var to store the queue created from the filter string
+        sycl_queue_val = cgutils.alloca_once(
+            self.builder,
+            utils.get_llvm_type(context=self.context, type=types.voidptr),
+        )
+        # Insert a global constant to store the filter string
+        device = self.context.insert_const_string(
+            self.builder.module, exec_queue.sycl_device.filter_string
+        )
+        # Store the queue returned by DPEXRTQueue_CreateFromFilterString in a
+        # local variable
+        self.builder.store(
+            self.rtctx.get_queue_from_filter_string(
+                builder=self.builder, device=device
+            ),
+            sycl_queue_val,
+        )
+        return sycl_queue_val
+
+    def free_queue(self, sycl_queue_val):
+        """
+        Frees the ``DPCTLSyclQueueRef`` pointer that was used to launch the
+        kernels.
+
+        Args:
+            sycl_queue_val: The SYCL queue pointer to be freed.
+        """
+        fn = DpctlCAPIFnBuilder.get_dpctl_queue_delete(
+            builder=self.builder, context=self.context
+        )
+        self.builder.call(fn, [self.builder.load(sycl_queue_val)])
+
+    def allocate_kernel_arg_array(self, num_kernel_args):
+        """Allocates an array to store the LLVM Value for every kernel argument.
+
+        Args:
+            num_kernel_args (int): The number of kernel arguments that
+                determines the size of args array to allocate.
+
+        Returns: An LLVM IR value pointing to an array to store the kernel
+            arguments.
+        """
+        args_list = cgutils.alloca_once(
+            self.builder,
+            utils.get_llvm_type(context=self.context, type=types.voidptr),
+            size=self.context.get_constant(types.uintp, num_kernel_args),
+        )
+
+        return args_list
+
+    def allocate_kernel_arg_ty_array(self, num_kernel_args):
+        """Allocates an array to store the LLVM Value for the typenum for
+        every kernel argument.
+
+        Args:
+            num_kernel_args (int): The number of kernel arguments that
+                determines the size of args array to allocate.
+
+        Returns: An LLVM IR value pointing to an array to store the kernel
+            arguments typenums as defined in dpctl.
+        """
+        args_ty_list = cgutils.alloca_once(
+            self.builder,
+            utils.LLVMTypes.int32_t,
+            size=self.context.get_constant(types.uintp, num_kernel_args),
+        )
+
+        return args_ty_list
+
+    def _create_sycl_range(self, idx_range):
+        """Allocates an LLVM array value to hold the kernel's global range.
+
+        Args:
+            idx_range: A sequence of LLVM values, one extent per dimension
+                of the kernel's global iteration range.
+        """
+        intp_t = utils.get_llvm_type(context=self.context, type=types.intp)
+        intp_ptr_t = utils.get_llvm_ptr_type(intp_t)
+        num_dim = len(idx_range)
+
+        # form the global range
+        global_range = cgutils.alloca_once(
+            self.builder,
+            utils.get_llvm_type(context=self.context, type=types.uintp),
+            size=self.context.get_constant(types.uintp, num_dim),
+        )
+
+        for i in range(num_dim):
+            rext = idx_range[i]
+            if rext.type != utils.LLVMTypes.int64_t:
+                rext = self.builder.sext(rext, utils.LLVMTypes.int64_t)
+
+            # We reverse the global range to account for how SYCL and OpenCL
+            # ranges differ.
+            self.builder.store(
+                rext,
+                self.builder.gep(
+                    global_range,
+                    [self.context.get_constant(types.uintp, (num_dim - 1) - i)],
+                ),
+            )
+
+        return self.builder.bitcast(global_range, intp_ptr_t)
+
+    def submit_sync_ranged_kernel(
+        self,
+        idx_range,
+        sycl_queue_val,
+        total_kernel_args,
+        arg_list,
+        arg_ty_list,
+    ):
+        """
+        submit_sync_ranged_kernel(idx_range, sycl_queue_val)
+        Submits the kernel to the specified queue, waits and then copies
+        back any results to the host.
+
+        Args:
+            idx_range: Tuple specifying the range over which the kernel is
+                to be submitted.
+            sycl_queue_val: The SYCL queue on which the kernel is
+                submitted.
+ """ + gr = self._create_sycl_range(idx_range) + args = [ + self.builder.inttoptr( + self.context.get_constant(types.uintp, self.kernel_addr), + utils.get_llvm_type(context=self.context, type=types.voidptr), + ), + self.builder.load(sycl_queue_val), + arg_list, + arg_ty_list, + self.context.get_constant(types.uintp, total_kernel_args), + gr, + self.context.get_constant(types.uintp, len(idx_range)), + self.builder.bitcast( + utils.create_null_ptr( + builder=self.builder, context=self.context + ), + utils.get_llvm_type(context=self.context, type=types.voidptr), + ), + self.context.get_constant(types.uintp, 0), + ] + + self.rtctx.submit_range(self.builder, *args) diff --git a/numba_dpex/decorators.py b/numba_dpex/decorators.py index 1c05dfe495..8b27c0d358 100644 --- a/numba_dpex/decorators.py +++ b/numba_dpex/decorators.py @@ -16,7 +16,7 @@ compile_func, compile_func_template, ) -from numba_dpex.core.pipelines.offload_compiler import OffloadCompiler +from numba_dpex.core.pipelines.dpjit_compiler import DpjitCompiler def kernel( @@ -165,7 +165,8 @@ def dpjit(*args, **kws): ) del kws["forceobj"] kws.update({"nopython": True}) - kws.update({"pipeline_class": OffloadCompiler}) + kws.update({"parallel": True}) + kws.update({"pipeline_class": DpjitCompiler}) # FIXME: When trying to use dpex's target context, overloads do not work # properly. We will turn on dpex target once the issue is fixed. diff --git a/numba_dpex/dpnp_iface/arrayobj.py b/numba_dpex/dpnp_iface/arrayobj.py index e906176d6a..90e3f922ad 100644 --- a/numba_dpex/dpnp_iface/arrayobj.py +++ b/numba_dpex/dpnp_iface/arrayobj.py @@ -50,6 +50,22 @@ class for number classes. return _dtype +def _parse_layout(layout): + if isinstance(layout, types.StringLiteral): + layout_type_str = layout.literal_value + if layout_type_str not in ["C", "F", "A"]: + msg = f"Invalid layout specified: '{layout_type_str}'" + raise errors.NumbaValueError(msg) + return layout_type_str + elif isinstance(layout, str): + return layout + else: + raise TypeError( + "The parameter 'layout' is neither of " + + "'str' nor 'types.StringLiteral'" + ) + + def _parse_usm_type(usm_type): """Parse usm_type parameter. @@ -152,8 +168,7 @@ def build_dpnp_ndarray( The type has the same structure as USMNdArray used to represent dpctl.tensor.usm_ndarray. """ - - if queue: + if queue and not isinstance(queue, types.misc.Omitted): raise errors.TypingError( "The sycl_queue keyword is not yet supported by " "dpnp.empty(), dpnp.zeros(), dpnp.ones(), dpnp.empty_like(), " @@ -162,7 +177,7 @@ def build_dpnp_ndarray( ) # If a dtype value was passed in, then try to convert it to the - # coresponding Numba type. If None was passed, the default, then pass None + # corresponding Numba type. If None was passed, the default, then pass None # to the DpnpNdArray constructor. The default dtype will be derived based # on the behavior defined in dpctl.tensor.usm_ndarray. 
@@ -231,6 +246,7 @@ def ol_dpnp_empty( _ndim = _ty_parse_shape(shape) _dtype = _parse_dtype(dtype) + _layout = _parse_layout(order) _usm_type = _parse_usm_type(usm_type) if usm_type is not None else "device" _device = ( _parse_device_filter_string(device) if device is not None else "unknown" @@ -238,7 +254,7 @@ def ol_dpnp_empty( if _ndim: ret_ty = build_dpnp_ndarray( _ndim, - layout=order, + layout=_layout, dtype=_dtype, usm_type=_usm_type, device=_device, diff --git a/numba_dpex/dpnp_iface/dpnp_ufunc_db.py b/numba_dpex/dpnp_iface/dpnp_ufunc_db.py new file mode 100644 index 0000000000..6a6d3d6d50 --- /dev/null +++ b/numba_dpex/dpnp_iface/dpnp_ufunc_db.py @@ -0,0 +1,63 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + + +import dpnp +import numpy as np +from numba.core import types + +from numba_dpex.core.typing import dpnpdecl + +from ..ocl import mathimpl + + +def get_ufuncs(): + """obtain a list of supported ufuncs in the db""" + + from numba.np.ufunc_db import _lazy_init_db + + _lazy_init_db() + from numba.np.ufunc_db import _ufunc_db + + _fill_ufunc_db_with_dpnp_ufuncs(_ufunc_db) + + return _ufunc_db.keys() + + +def _fill_ufunc_db_with_dpnp_ufuncs(ufunc_db): + """Monkey patching dpnp for missing attributes.""" + # FIXME: add more docstring + + for ufuncop in dpnpdecl.supported_ufuncs: + if ufuncop == "erf": + op = getattr(dpnp, "erf") + op.nin = 1 + op.nout = 1 + op.nargs = 2 + op.types = ["f->f", "d->d"] + op.is_dpnp_ufunc = True + + _unary_d_d = types.float64(types.float64) + _unary_f_f = types.float32(types.float32) + ufunc_db[op] = { + "f->f": mathimpl.lower_ocl_impl[("erf", (_unary_f_f))], + "d->d": mathimpl.lower_ocl_impl[("erf", (_unary_d_d))], + } + else: + op = getattr(dpnp, ufuncop) + npop = getattr(np, ufuncop) + op.nin = npop.nin + op.nout = npop.nout + op.nargs = npop.nargs + op.types = npop.types + op.is_dpnp_ufunc = True + ufunc_db.update({op: ufunc_db[npop]}) + for key in list(ufunc_db[op].keys()): + if ( + "FF->" in key + or "DD->" in key + or "F->" in key + or "D->" in key + ): + ufunc_db[op].pop(key) diff --git a/numba_dpex/dpnp_iface/dpnpimpl.py b/numba_dpex/dpnp_iface/dpnpimpl.py new file mode 100644 index 0000000000..7873bbc5b5 --- /dev/null +++ b/numba_dpex/dpnp_iface/dpnpimpl.py @@ -0,0 +1,58 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpnp +from numba.np import npyimpl + +from numba_dpex.core.typing.dpnpdecl import _unsupported +from numba_dpex.dpnp_iface import dpnp_ufunc_db + + +def _register_dpnp_ufuncs(): + kernels = {} + # NOTE: Assuming ufunc implementation for the CPUContext. 
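+    # Illustrative effect of the registrations below, assuming "add" remains a
+    # supported ufunc: dpnp.add gets a lowering kernel for direct calls, and
+    # because "add" also appears in Numba's operator _op_maps it additionally
+    # backs the binary `+` and in-place `+=` operators in dpjit-compiled code.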
+ for ufunc in dpnp_ufunc_db.get_ufuncs(): + kernels[ufunc] = npyimpl.register_ufunc_kernel( + ufunc, npyimpl._ufunc_db_function(ufunc) + ) + + for _op_map in ( + npyimpl.npydecl.NumpyRulesUnaryArrayOperator._op_map, + npyimpl.npydecl.NumpyRulesArrayOperator._op_map, + ): + for operator, ufunc_name in _op_map.items(): + if ufunc_name in _unsupported: + continue + ufunc = getattr(dpnp, ufunc_name) + kernel = kernels[ufunc] + if ufunc.nin == 1: + npyimpl.register_unary_operator_kernel(operator, ufunc, kernel) + elif ufunc.nin == 2: + npyimpl.register_binary_operator_kernel(operator, ufunc, kernel) + else: + raise RuntimeError( + "There shouldn't be any non-unary or binary operators" + ) + + for _op_map in (npyimpl.npydecl.NumpyRulesInplaceArrayOperator._op_map,): + for operator, ufunc_name in _op_map.items(): + if ufunc_name in _unsupported: + continue + ufunc = getattr(dpnp, ufunc_name) + kernel = kernels[ufunc] + if ufunc.nin == 1: + npyimpl.register_unary_operator_kernel( + operator, ufunc, kernel, inplace=True + ) + elif ufunc.nin == 2: + npyimpl.register_binary_operator_kernel( + operator, ufunc, kernel, inplace=True + ) + else: + raise RuntimeError( + "There shouldn't be any non-unary or binary operators" + ) + + +_register_dpnp_ufuncs() diff --git a/numba_dpex/examples/blacksholes_njit.py b/numba_dpex/examples/blacksholes_njit.py index ad7e602c18..9095f79987 100644 --- a/numba_dpex/examples/blacksholes_njit.py +++ b/numba_dpex/examples/blacksholes_njit.py @@ -6,19 +6,20 @@ import math import time -import dpctl +import dpctl.tensor as dpt +import dpnp import numba -import numpy as np +import numba_dpex as dpex -@numba.vectorize(nopython=True) -def cndf2(inp): - out = 0.5 + 0.5 * math.erf((math.sqrt(2.0) / 2.0) * inp) - return out +# @numba.vectorize(nopython=True) +# def cndf2(inp): +# out = 0.5 + 0.5 * math.erf((math.sqrt(2.0) / 2.0) * inp) +# return out -@numba.njit(parallel=True, fastmath=True) -def blackscholes(sptprice, strike, rate, volatility, timev): +@dpex.dpjit +def blackscholes(sptprice, strike, timev, rate, volatility): """ A simple implementation of the Black-Scholes formula using the automatic offload feature of numba_dpex. In this example, each NumPy array @@ -26,50 +27,54 @@ def blackscholes(sptprice, strike, rate, volatility, timev): generate a single SYCL kernel. The kernel is automatically offloaded to the device specified where the function is invoked. 
""" - logterm = np.log(sptprice / strike) - powterm = 0.5 * volatility * volatility - den = volatility * np.sqrt(timev) - d1 = (((rate + powterm) * timev) + logterm) / den - d2 = d1 - den - NofXd1 = cndf2(d1) - NofXd2 = cndf2(d2) - futureValue = strike * np.exp(-rate * timev) - c1 = futureValue * NofXd2 - call = sptprice * NofXd1 - c1 - put = call - futureValue + sptprice + + a = dpnp.log(sptprice / strike) + b = timev * -rate + z = timev * volatility * volatility * 2 + c = 0.25 * z + y = dpnp.true_divide(1.0, dpnp.sqrt(z)) + w1 = (a - b + c) * y + w2 = (a - b - c) * y + + NofXd1 = 0.5 + 0.5 * dpnp.erf(w1) + NofXd2 = 0.5 + 0.5 * dpnp.erf(w2) + + futureValue = strike * dpnp.exp(b) + call = sptprice * NofXd1 - futureValue * NofXd2 + put = call - sptprice + futureValue return put +@dpex.dpjit +def init_initStrike(size, initStrike): + for idx in numba.prange(initStrike.size): + initStrike[idx] = 40 + (initStrike[idx] + 1.0) / size + return initStrike + + def run(iterations): - sptprice = np.full((iterations,), 42.0) - initStrike = 40 + (np.arange(iterations) + 1.0) / iterations - rate = np.full((iterations,), 0.5) - volatility = np.full((iterations,), 0.2) - timev = np.full((iterations,), 0.5) + dpt_sptprice = dpt.full((iterations,), 42.0) + dpt_range_arr = dpt.arange(iterations) + dpt_full_arr_05 = dpt.full((iterations,), 0.5) + dpt_volatility = dpt.full((iterations,), 0.2) + + sptprice = dpnp.ndarray(shape=dpt_sptprice.shape, buffer=dpt_sptprice) + rate = dpnp.ndarray(shape=dpt_full_arr_05.shape, buffer=dpt_full_arr_05) + volatility = dpnp.ndarray(shape=dpt_volatility.shape, buffer=dpt_volatility) + timev = dpnp.ndarray(shape=dpt_full_arr_05.shape, buffer=dpt_full_arr_05) + initStrike = dpnp.ndarray(shape=dpt_range_arr.shape, buffer=dpt_range_arr) + initStrike = init_initStrike(iterations, initStrike) t1 = time.time() put = blackscholes(sptprice, initStrike, rate, volatility, timev) t = time.time() - t1 - print("checksum: ", sum(put)) + # print("checksum: ", sum(put)) + print(put) print("SELFTIMED ", t) def main(): - parser = argparse.ArgumentParser(description="Black-Scholes") - parser.add_argument("--iter", dest="iter", type=int, default=10) - args = parser.parse_args() - iter = args.iter - - # Use the environment variable SYCL_DEVICE_FILTER to change the default device. - # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. - device = dpctl.select_default_device() - print("Using device ...") - device.print_device_info() - - with dpctl.device_context(device): - run(iter) - - print("Done...") + run(10) if __name__ == "__main__": diff --git a/numba_dpex/ocl/_declare_function.py b/numba_dpex/ocl/_declare_function.py new file mode 100644 index 0000000000..7fc2396e17 --- /dev/null +++ b/numba_dpex/ocl/_declare_function.py @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + + +from llvmlite import ir as llvmir +from numba.core import cgutils, types + +from numba_dpex.core.itanium_mangler import mangle_c + +# ----------------------------------------------------------------------------- + + +def _declare_function(context, builder, name, sig, cargs, mangler=mangle_c): + """Insert declaration for a opencl builtin function. + Uses the Itanium mangler. 
+ + Args + ---- + context: target context + + builder: llvm builder + + name: str + symbol name + + sig: signature + function signature of the symbol being declared + + cargs: sequence of str + C type names for the arguments + + mangler: a mangler function + function to use to mangle the symbol + + """ + mod = builder.module + if sig.return_type == types.void: + llretty = llvmir.VoidType() + else: + llretty = context.get_value_type(sig.return_type) + llargs = [context.get_value_type(t) for t in sig.args] + fnty = llvmir.FunctionType(llretty, llargs) + mangled = mangler(name, cargs) + fn = cgutils.get_or_insert_function(mod, fnty, mangled) + from numba_dpex import kernel_target + + fn.calling_convention = kernel_target.CC_SPIR_FUNC + return fn diff --git a/numba_dpex/ocl/mathimpl.py b/numba_dpex/ocl/mathimpl.py index b67ffe2128..5c1a37ca77 100644 --- a/numba_dpex/ocl/mathimpl.py +++ b/numba_dpex/ocl/mathimpl.py @@ -11,7 +11,7 @@ from numba_dpex.core.itanium_mangler import mangle -from .oclimpl import _declare_function +from ._declare_function import _declare_function registry = Registry() lower = registry.lower diff --git a/numba_dpex/ocl/oclimpl.py b/numba_dpex/ocl/oclimpl.py index 80f81a7c97..ff84b5bb93 100644 --- a/numba_dpex/ocl/oclimpl.py +++ b/numba_dpex/ocl/oclimpl.py @@ -14,12 +14,12 @@ from numba_dpex import config, kernel_target from numba_dpex.core.codegen import SPIR_DATA_LAYOUT -from numba_dpex.core.itanium_mangler import mangle, mangle_c, mangle_type from numba_dpex.core.types import Array from numba_dpex.ocl.atomics import atomic_helper from numba_dpex.utils import address_space from . import stubs +from ._declare_function import _declare_function registry = Registry() lower = registry.lower @@ -30,42 +30,6 @@ # ----------------------------------------------------------------------------- -def _declare_function(context, builder, name, sig, cargs, mangler=mangle_c): - """Insert declaration for a opencl builtin function. - Uses the Itanium mangler. 
- - Args - ---- - context: target context - - builder: llvm builder - - name: str - symbol name - - sig: signature - function signature of the symbol being declared - - cargs: sequence of str - C type names for the arguments - - mangler: a mangler function - function to use to mangle the symbol - - """ - mod = builder.module - if sig.return_type == types.void: - llretty = llvmir.VoidType() - else: - llretty = context.get_value_type(sig.return_type) - llargs = [context.get_value_type(t) for t in sig.args] - fnty = llvmir.FunctionType(llretty, llargs) - mangled = mangler(name, cargs) - fn = cgutils.get_or_insert_function(mod, fnty, mangled) - fn.calling_convention = kernel_target.CC_SPIR_FUNC - return fn - - @lower(stubs.get_global_id, types.uint32) def get_global_id_impl(context, builder, sig, args): [dim] = args diff --git a/numba_dpex/tests/dpjit_tests/parfors/test_builtin_ops.py b/numba_dpex/tests/dpjit_tests/parfors/test_builtin_ops.py new file mode 100644 index 0000000000..d793317f64 --- /dev/null +++ b/numba_dpex/tests/dpjit_tests/parfors/test_builtin_ops.py @@ -0,0 +1,122 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpctl +import dpnp +import numpy +import pytest + +from numba_dpex import dpjit + + +def parfor_add(a, b): + return a + b + + +def parfor_sub(a, b): + return a - b + + +def parfor_mult(a, b): + return a * b + + +def parfor_divide(a, b): + return a / b + + +def parfor_modulus(a, b): + return a % b + + +def parfor_exponent(a, b): + return a**b + + +shapes = [100, (25, 4)] +dtypes = [dpnp.int32, dpnp.int64, dpnp.float32, dpnp.float64] +usm_types = ["device"] +funcs = [ + parfor_add, + parfor_sub, + parfor_mult, + parfor_divide, + parfor_modulus, + parfor_exponent, +] + + +def parfor_floor(a, b): + return a // b + + +@pytest.mark.parametrize("shape", shapes) +@pytest.mark.parametrize("dtype", dtypes) +@pytest.mark.parametrize("usm_type", usm_types) +@pytest.mark.parametrize("func", funcs) +def test_built_in_operators1(shape, dtype, usm_type, func): + queue = dpctl.SyclQueue() + a = dpnp.zeros( + shape=shape, dtype=dtype, usm_type=usm_type, sycl_queue=queue + ) + b = dpnp.ones(shape=shape, dtype=dtype, usm_type=usm_type, sycl_queue=queue) + try: + op = dpjit(func) + c = op(a, b) + del op + except Exception: + pytest.fail("Failed to compile.") + + if len(c.shape) == 1: + assert c.shape[0] == shape + else: + assert c.shape == shape + + if func != parfor_divide: + assert c.dtype == dtype + assert c.usm_type == usm_type + + assert c.sycl_device.filter_string == queue.sycl_device.filter_string + + expected = dpnp.asnumpy(func(a, b)) + nc = dpnp.asnumpy(c) + + numpy.allclose(nc, expected) + + +usm_types = ["host", "shared"] + + +@pytest.mark.parametrize("shape", shapes) +@pytest.mark.parametrize("dtype", dtypes) +@pytest.mark.parametrize("usm_type", usm_types) +@pytest.mark.parametrize("func", funcs) +def test_built_in_operators2(shape, dtype, usm_type, func): + queue = dpctl.SyclQueue() + a = dpnp.zeros( + shape=shape, dtype=dtype, usm_type=usm_type, sycl_queue=queue + ) + b = dpnp.ones(shape=shape, dtype=dtype, usm_type=usm_type, sycl_queue=queue) + try: + op = dpjit(func) + c = op(a, b) + del op + except Exception: + pytest.fail("Failed to compile.") + + if len(c.shape) == 1: + assert c.shape[0] == shape + else: + assert c.shape == shape + + if func != parfor_divide: + assert c.dtype == dtype + assert c.usm_type == usm_type + + assert c.sycl_device.filter_string == queue.sycl_device.filter_string + + expected = 
dpnp.asnumpy(func(a, b)) + nc = dpnp.asnumpy(c) + + numpy.allclose(nc, expected) diff --git a/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_bitwise_ops.py b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_bitwise_ops.py new file mode 100644 index 0000000000..7662df8afc --- /dev/null +++ b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_bitwise_ops.py @@ -0,0 +1,96 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + + +import dpctl.tensor as dpt +import dpnp +import numpy as np +import pytest + +from numba_dpex import dpjit +from numba_dpex.tests._helper import assert_auto_offloading, filter_strings + +list_of_binary_ops = [ + "bitwise_and", + "bitwise_or", + "bitwise_xor", + "left_shift", + "right_shift", +] + + +@pytest.fixture(params=list_of_binary_ops) +def binary_op(request): + return request.param + + +list_of_unary_ops = [ + "bitwise_not", + "invert", +] + + +@pytest.fixture(params=list_of_unary_ops) +def unary_op(request): + return request.param + + +list_of_dtypes = [ + dpnp.int32, + dpnp.int64, +] + + +@pytest.fixture(params=list_of_dtypes) +def input_arrays(request): + # The size of input and out arrays to be used + N = 2048 + a = dpnp.array(dpnp.random.random(N), request.param) + b = dpnp.array(dpnp.random.random(N), request.param) + return a, b + + +@pytest.mark.parametrize("filter_str", filter_strings) +def test_binary_ops(filter_str, binary_op, input_arrays): + a, b = input_arrays + binop = getattr(dpnp, binary_op) + actual = dpnp.empty(shape=a.shape, dtype=a.dtype) + expected = dpnp.empty(shape=a.shape, dtype=a.dtype) + + @dpjit + def f(a, b): + return binop(a, b) + + actual = f(a, b) + + expected = binop(a, b) + np.testing.assert_allclose( + dpt.asnumpy(actual._array_obj), + dpt.asnumpy(expected._array_obj), + rtol=1e-5, + atol=0, + ) + + +@pytest.mark.parametrize("filter_str", filter_strings) +def test_unary_ops(filter_str, unary_op, input_arrays): + a = input_arrays[0] + uop = getattr(dpnp, unary_op) + actual = np.empty(shape=a.shape, dtype=a.dtype) + expected = np.empty(shape=a.shape, dtype=a.dtype) + + @dpjit + def f(a): + return uop(a) + + actual = f(a) + + expected = uop(a) + + np.testing.assert_allclose( + dpt.asnumpy(actual._array_obj), + dpt.asnumpy(expected._array_obj), + rtol=1e-5, + atol=0, + ) diff --git a/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_logic_ops.py b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_logic_ops.py new file mode 100644 index 0000000000..eb5bac45f9 --- /dev/null +++ b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_logic_ops.py @@ -0,0 +1,106 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpctl +import dpctl.tensor as dpt +import dpnp +import numpy as np +import pytest + +from numba_dpex import dpjit +from numba_dpex.tests._helper import assert_auto_offloading, filter_strings + +""" Following cases, dpnp raises NotImplementedError""" + +list_of_binary_ops = [ + "greater", + "greater_equal", + "less", + "less_equal", + "not_equal", + "equal", + "logical_and", + "logical_or", + "logical_xor", +] + + +@pytest.fixture(params=list_of_binary_ops) +def binary_op(request): + return request.param + + +list_of_unary_ops = [ + "isinf", + "isfinite", + "isnan", +] + + +@pytest.fixture(params=list_of_unary_ops) +def unary_op(request): + return request.param + + +list_of_dtypes = [ + dpnp.int32, + dpnp.int64, + dpnp.float32, + dpnp.float64, +] + + +@pytest.fixture(params=list_of_dtypes) +def input_arrays(request): + # The size of input 
and out arrays to be used + N = 2048 + a = dpnp.array(dpnp.random.random(N), request.param) + b = dpnp.array(dpnp.random.random(N), request.param) + return a, b + + +@pytest.mark.xfail +@pytest.mark.parametrize("filter_str", filter_strings) +def test_binary_ops(filter_str, binary_op, input_arrays): + a, b = input_arrays + binop = getattr(dpnp, binary_op) + actual = dpnp.empty(shape=a.shape, dtype=a.dtype) + expected = dpnp.empty(shape=a.shape, dtype=a.dtype) + + @dpjit + def f(a, b): + return binop(a, b) + + actual = f(a, b) + + expected = binop(a, b) + np.testing.assert_allclose( + dpt.asnumpy(actual._array_obj), + dpt.asnumpy(expected._array_obj), + rtol=1e-5, + atol=0, + ) + + +@pytest.mark.xfail +@pytest.mark.parametrize("filter_str", filter_strings) +def test_unary_ops(filter_str, unary_op, input_arrays): + a = input_arrays[0] + uop = getattr(dpnp, unary_op) + actual = dpnp.empty(shape=a.shape, dtype=a.dtype) + expected = dpnp.empty(shape=a.shape, dtype=a.dtype) + + @dpjit + def f(a): + return uop(a) + + actual = f(a) + + expected = uop(a) + np.testing.assert_allclose( + dpt.asnumpy(actual._array_obj), + dpt.asnumpy(expected._array_obj), + rtol=1e-5, + atol=0, + ) diff --git a/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_transcedental_functions.py b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_transcedental_functions.py new file mode 100644 index 0000000000..f43d59752f --- /dev/null +++ b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_transcedental_functions.py @@ -0,0 +1,132 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpctl +import dpctl.tensor as dpt +import dpnp +import numpy as np +import pytest + +from numba_dpex import dpjit +from numba_dpex.tests._helper import ( + assert_auto_offloading, + filter_strings, + is_gen12, +) + +"""dpnp raise error on : mod, abs and remainder(float32)""" +list_of_binary_ops = [ + "add", + "subtract", + "multiply", + "divide", + "power", + # "remainder", + # "mod", + "hypot", + "maximum", + "minimum", + "fmax", + "fmin", +] + + +@pytest.fixture(params=list_of_binary_ops) +def binary_op(request): + return request.param + + +list_of_unary_ops = [ + "negative", + # "abs", + "absolute", + "fabs", + "sign", + "conj", + "exp", + "exp2", + "log", + "log2", + "log10", + "expm1", + "log1p", + "sqrt", + "square", + "reciprocal", + "conjugate", + "floor", + "ceil", + "trunc", + "erf", +] + + +@pytest.fixture(params=list_of_unary_ops) +def unary_op(request): + return request.param + + +list_of_dtypes = [ + dpnp.float32, + dpnp.float64, +] + + +@pytest.fixture(params=list_of_dtypes) +def input_arrays(request): + # The size of input and out arrays to be used + N = 2048 + a = dpnp.array(dpnp.random.random(N), request.param) + b = dpnp.array(dpnp.random.random(N), request.param) + return a, b + + +@pytest.mark.parametrize("filter_str", filter_strings) +def test_binary_ops(filter_str, binary_op, input_arrays): + a, b = input_arrays + binop = getattr(dpnp, binary_op) + actual = dpnp.empty(shape=a.shape, dtype=a.dtype) + expected = dpnp.empty(shape=a.shape, dtype=a.dtype) + + @dpjit + def f(a, b): + return binop(a, b) + + actual = f(a, b) + + expected = binop(a, b) + + np.testing.assert_allclose( + dpt.asnumpy(actual._array_obj), + dpt.asnumpy(expected._array_obj), + rtol=1e-5, + atol=0, + ) + + +@pytest.mark.parametrize("filter_str", filter_strings) +def test_unary_ops(filter_str, unary_op, input_arrays): + skip_ops = ["abs", "sign", "log", "log2", "log10", "expm1"] + if unary_op in skip_ops and 
is_gen12(filter_str):
+        pytest.skip()
+
+    a = input_arrays[0]
+    uop = getattr(dpnp, unary_op)
+    actual = dpnp.empty(shape=a.shape, dtype=a.dtype)
+    expected = dpnp.empty(shape=a.shape, dtype=a.dtype)
+
+    @dpjit
+    def f(a):
+        return uop(a)
+
+    actual = f(a)
+
+    expected = uop(a)
+
+    np.testing.assert_allclose(
+        dpt.asnumpy(actual._array_obj),
+        dpt.asnumpy(expected._array_obj),
+        rtol=1e-5,
+        atol=0,
+    )
diff --git a/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_trigonometric_functions.py b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_trigonometric_functions.py
new file mode 100644
index 0000000000..73ef3b4f5d
--- /dev/null
+++ b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_trigonometric_functions.py
@@ -0,0 +1,110 @@
+# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import dpctl
+import dpctl.tensor as dpt
+import dpnp
+import numpy as np
+import pytest
+
+from numba_dpex import dpjit
+from numba_dpex.tests._helper import (
+    assert_auto_offloading,
+    filter_strings,
+    is_gen12,
+)
+
+list_of_filter_strs = [
+    "opencl:gpu:0",
+    "level_zero:gpu:0",
+    "opencl:cpu:0",
+]
+
+
+@pytest.fixture(params=list_of_filter_strs)
+def filter_str(request):
+    return request.param
+
+
+list_of_trig_ops = [
+    "sin",
+    "cos",
+    "tan",
+    "arcsin",
+    "arccos",
+    "arctan",
+    "arctan2",
+    "sinh",
+    "cosh",
+    "tanh",
+    "arcsinh",
+    "arccosh",
+    "arctanh",
+    "deg2rad",
+    "rad2deg",
+    "degrees",
+    "radians",
+]
+
+
+@pytest.fixture(params=list_of_trig_ops)
+def trig_op(request):
+    return request.param
+
+
+list_of_dtypes = [
+    dpnp.float32,
+    dpnp.float64,
+]
+
+
+@pytest.fixture(params=list_of_dtypes)
+def dtype(request):
+    return request.param
+
+
+@pytest.fixture(params=list_of_dtypes)
+def input_arrays(request):
+    # The size of the input and output arrays to be used
+    N = 2048
+
+    a = dpnp.array(dpnp.random.random(N), request.param)
+    b = dpnp.array(dpnp.random.random(N), request.param)
+    return a, b
+
+
+@pytest.mark.parametrize("filter_str", filter_strings)
+def test_trigonometric_fn(filter_str, trig_op, input_arrays):
+    # FIXME: Why does arccosh fail on Gen12 discrete graphics cards?
+    if trig_op == "arccosh" and is_gen12(filter_str):
+        pytest.skip()
+
+    a, b = input_arrays
+    trig_fn = getattr(dpnp, trig_op)
+    actual = dpnp.empty(shape=a.shape, dtype=a.dtype)
+    expected = dpnp.empty(shape=a.shape, dtype=a.dtype)
+
+    if trig_op == "arctan2":
+
+        @dpjit
+        def f(a, b):
+            return trig_fn(a, b)
+
+        actual = f(a, b)
+        expected = trig_fn(a, b)
+    else:
+
+        @dpjit
+        def f(a):
+            return trig_fn(a)
+
+        actual = f(a)
+        expected = trig_fn(a)
+
+    np.testing.assert_allclose(
+        dpt.asnumpy(actual._array_obj),
+        dpt.asnumpy(expected._array_obj),
+        rtol=1e-5,
+        atol=0,
+    )
diff --git a/numba_dpex/tests/passes/test_parfor_legalize_cfd_pass.py b/numba_dpex/tests/passes/test_parfor_legalize_cfd_pass.py
new file mode 100644
index 0000000000..451eb2392b
--- /dev/null
+++ b/numba_dpex/tests/passes/test_parfor_legalize_cfd_pass.py
@@ -0,0 +1,82 @@
+# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Tests for checking that CFD is enforced in the parfor pass.
+""" + + +import dpctl +import dpnp +import numba as nb +import pytest + +from numba_dpex import dpjit +from numba_dpex.core.exceptions import ComputeFollowsDataInferenceError +from numba_dpex.tests._helper import skip_no_opencl_gpu + +shapes = [10, (2, 5)] +dtypes = [dpnp.int32, dpnp.int64, dpnp.float32, dpnp.float64] +usm_types = ["device"] +devices = ["gpu"] + + +@dpjit +def func1(a, b): + c = a + b + return c + + +@skip_no_opencl_gpu +@pytest.mark.parametrize("shape", shapes) +@pytest.mark.parametrize("dtype", dtypes) +@pytest.mark.parametrize("usm_type", usm_types) +@pytest.mark.parametrize("device", devices) +def test_parfor_legalize_cfd_pass(shape, dtype, usm_type, device): + a = dpnp.zeros(shape=shape, dtype=dtype, usm_type=usm_type, device=device) + b = dpnp.ones(shape=shape, dtype=dtype, usm_type=usm_type, device=device) + try: + c = func1(a, b) + except Exception: + pytest.fail("Running Parfor CFD Pass check failed") + + if len(c.shape) == 1: + assert c.shape[0] == shape + else: + assert c.shape == shape + + assert c.dtype == dtype + assert c.usm_type == usm_type + if device != "unknown": + assert ( + c.sycl_device.filter_string + == dpctl.SyclDevice(device).filter_string + ) + else: + c.sycl_device.filter_string == dpctl.SyclDevice().filter_string + + +@skip_no_opencl_gpu +def test_parfor_legalize_cfd_pass_raise(): + a = dpnp.zeros(shape=10, device="cpu") + b = dpnp.ones(shape=10, device="gpu") + + with pytest.raises(ComputeFollowsDataInferenceError): + func1(a, b) + + +@skip_no_opencl_gpu +def test_cfd_error_due_to_lhs(): + a = dpnp.zeros(shape=10, device="cpu") + b = dpnp.ones(shape=10, device="cpu") + + @dpjit + def vecadd_prange(a, b): + c = dpnp.empty(a.shape, dtype=a.dtype, device="gpu") + for idx in nb.prange(a.size): + c[idx] = a[idx] + b[idx] + return c + + with pytest.raises(ComputeFollowsDataInferenceError): + vecadd_prange(a, b) diff --git a/numba_dpex/tests/test_prange.py b/numba_dpex/tests/test_prange.py index d62ac0039e..9b669cbc94 100644 --- a/numba_dpex/tests/test_prange.py +++ b/numba_dpex/tests/test_prange.py @@ -5,151 +5,196 @@ # SPDX-License-Identifier: Apache-2.0 import dpctl +import dpnp import numpy as np import pytest -from numba import njit, prange - -from numba_dpex.tests._helper import assert_auto_offloading, skip_no_opencl_gpu - - -class TestPrange: - def test_one_prange(self): - @njit - def f(a, b): - for i in prange(4): - b[i, 0] = a[i, 0] * 10 - - m = 8 - n = 8 - a = np.ones((m, n)) - b = np.ones((m, n)) - - device = dpctl.select_default_device() - with assert_auto_offloading(), dpctl.device_context(device): - f(a, b) - - for i in range(4): - assert b[i, 0] == a[i, 0] * 10 - - def test_nested_prange(self): - @njit - def f(a, b): - # dimensions must be provided as scalar - m, n = a.shape - for i in prange(m): - for j in prange(n): - b[i, j] = a[i, j] * 10 - - m = 8 - n = 8 - a = np.ones((m, n)) - b = np.ones((m, n)) - - device = dpctl.select_default_device() - with assert_auto_offloading(), dpctl.device_context(device): - f(a, b) - - assert np.all(b == 10) - - @pytest.mark.skip - def test_multiple_prange(self): - @njit - def f(a, b): - # dimensions must be provided as scalar - m, n = a.shape - for i in prange(m): - val = 10 - for j in prange(n): - b[i, j] = a[i, j] * val - - for i in prange(m): - for j in prange(n): - a[i, j] = a[i, j] * 10 - - m = 8 - n = 8 - a = np.ones((m, n)) - b = np.ones((m, n)) - - device = dpctl.select_default_device() - with assert_auto_offloading(parfor_offloaded=2), dpctl.device_context( - device - ): - f(a, b) - - 
-        assert np.all(b == 10)
-        assert np.all(a == 10)
-
-    def test_three_prange(self):
-        @njit
-        def f(a, b):
-            # dimensions must be provided as scalar
-            m, n, o = a.shape
-            for i in prange(m):
-                val = 10
-                for j in prange(n):
-                    constant = 2
-                    for k in prange(o):
-                        b[i, j, k] = a[i, j, k] * (val + constant)
-
-        m = 8
-        n = 8
-        o = 8
-        a = np.ones((m, n, o))
-        b = np.ones((m, n, o))
-
-        device = dpctl.select_default_device()
-        with assert_auto_offloading(parfor_offloaded=1), dpctl.device_context(
-            device
-        ):
-            f(a, b)
-
-        assert np.all(b == 12)
-
-    @pytest.mark.skip(reason="numba-dpex issue 110")
-    def test_two_consequent_prange(self):
-        def prange_example():
-            n = 10
-            a = np.ones((n), dtype=np.float64)
-            b = np.ones((n), dtype=np.float64)
-            c = np.ones((n), dtype=np.float64)
-            for i in prange(n // 2):
-                a[i] = b[i] + c[i]
-
-            return a
-
-        jitted = njit(prange_example)
-
-        device = dpctl.select_default_device()
-        with assert_auto_offloading(parfor_offloaded=2), dpctl.device_context(
-            device
-        ):
-            jitted_res = jitted()
-
-        res = prange_example()
-
-        np.testing.assert_equal(res, jitted_res)
-
-    @pytest.mark.skip(reason="NRT required but not enabled")
-    def test_2d_arrays(self):
-        def prange_example():
-            n = 10
-            a = np.ones((n, n), dtype=np.float64)
-            b = np.ones((n, n), dtype=np.float64)
-            c = np.ones((n, n), dtype=np.float64)
-            for i in prange(n // 2):
-                a[i] = b[i] + c[i]
-
-            return a
-
-        jitted = njit(prange_example)
-
-        device = dpctl.select_default_device()
-        with assert_auto_offloading(parfor_offloaded=2), dpctl.device_context(
-            device
-        ):
-            jitted_res = jitted()
-
-        res = prange_example()
-
-        np.testing.assert_equal(res, jitted_res)
+from numba import njit
+
+from numba_dpex import dpjit, prange
+
+
+def test_one_prange_mul():
+    @dpjit
+    def f(a, b):
+        for i in prange(4):
+            b[i, 0] = a[i, 0] * 10
+        return
+
+    device = dpctl.select_default_device()
+
+    m = 8
+    n = 8
+    a = dpnp.ones((m, n), device=device)
+    b = dpnp.ones((m, n), device=device)
+
+    f(a, b)
+
+    for i in range(4):
+        assert b[i, 0] == a[i, 0] * 10
+
+
+@pytest.mark.skip(reason="dpnp.add() doesn't support variable + scalar.")
+def test_one_prange_add_scalar():
+    @dpjit
+    def f(a, b):
+        for i in prange(4):
+            b[i, 0] = a[i, 0] + 10
+        return
+
+    device = dpctl.select_default_device()
+
+    m = 8
+    n = 8
+    a = dpnp.ones((m, n), device=device)
+    b = dpnp.ones((m, n), device=device)
+
+    f(a, b)
+
+    for i in range(4):
+        assert b[i, 0] == a[i, 0] + 10
+
+
+@pytest.mark.skip(reason="[i,:] like indexing is not supported yet.")
+def test_prange_2d_array():
+    device = dpctl.select_default_device()
+    n = 10
+
+    @dpjit
+    def f(a, b, c):
+        for i in prange(n):
+            c[i, :] = a[i, :] + b[i, :]
+        return
+
+    a = dpnp.ones((n, n), dtype=dpnp.int32, device=device)
+    b = dpnp.ones((n, n), dtype=dpnp.int32, device=device)
+    c = dpnp.ones((n, n), dtype=dpnp.int32, device=device)
+
+    f(a, b, c)
+
+    np.testing.assert_equal(c.asnumpy(), np.ones((n, n), dtype=np.int32) * 2)
+
+
+@pytest.mark.skip(reason="Nested prange is not supported yet.")
+def test_nested_prange():
+    @dpjit
+    def f(a, b):
+        # dimensions must be provided as scalar
+        m, n = a.shape
+        for i in prange(m):
+            for j in prange(n):
+                b[i, j] = a[i, j] * 10
+        return
+
+    device = dpctl.select_default_device()
+
+    m = 8
+    n = 8
+    a = dpnp.ones((m, n), device=device)
+    b = dpnp.ones((m, n), device=device)
+
+    f(a, b)
+
+    assert np.all(b.asnumpy() == 10)
+
+
+@pytest.mark.skip(reason="Nested prange is not supported yet.")
+def test_multiple_prange():
+    @dpjit
+    def f(a, b):
+        # dimensions must be provided as scalar
+        m, n = a.shape
+        for i in prange(m):
+            val = 10
+            for j in prange(n):
+                b[i, j] = a[i, j] * val
+
+        for i in prange(m):
+            for j in prange(n):
+                a[i, j] = a[i, j] * 10
+        return
+
+    device = dpctl.select_default_device()
+
+    m = 8
+    n = 8
+    a = dpnp.ones((m, n), device=device)
+    b = dpnp.ones((m, n), device=device)
+
+    f(a, b)
+
+    assert np.all(b.asnumpy() == 10)
+    assert np.all(a.asnumpy() == 10)
+
+
+@pytest.mark.skip(reason="Nested prange is not supported yet.")
+def test_three_prange():
+    @dpjit
+    def f(a, b):
+        # dimensions must be provided as scalar
+        m, n, o = a.shape
+        for i in prange(m):
+            val = 10
+            for j in prange(n):
+                constant = 2
+                for k in prange(o):
+                    b[i, j, k] = a[i, j, k] * (val + constant)
+        return
+
+    device = dpctl.select_default_device()
+
+    m = 8
+    n = 8
+    o = 8
+    a = dpnp.ones((m, n, o), device=device)
+    b = dpnp.ones((m, n, o), device=device)
+
+    f(a, b)
+
+    assert np.all(b.asnumpy() == 12)
+
+
+def test_two_consecutive_prange():
+    @dpjit
+    def prange_example(a, b, c, d):
+        for i in prange(n):
+            c[i] = a[i] + b[i]
+        for i in prange(n):
+            d[i] = a[i] - b[i]
+        return
+
+    device = dpctl.select_default_device()
+
+    n = 10
+    a = dpnp.ones((n), dtype=dpnp.float64, device=device)
+    b = dpnp.ones((n), dtype=dpnp.float64, device=device)
+    c = dpnp.zeros((n), dtype=dpnp.float64, device=device)
+    d = dpnp.zeros((n), dtype=dpnp.float64, device=device)
+
+    prange_example(a, b, c, d)
+
+    np.testing.assert_equal(c.asnumpy(), np.ones((n), dtype=np.float64) * 2)
+    np.testing.assert_equal(d.asnumpy(), np.zeros((n), dtype=np.float64))
+
+
+@pytest.mark.skip(reason="[i,:] like indexing is not supported yet.")
+def test_two_consecutive_prange_2d():
+    @dpjit
+    def prange_example(a, b, c, d):
+        for i in prange(n):
+            c[i, :] = a[i, :] + b[i, :]
+        for i in prange(n):
+            d[i, :] = a[i, :] - b[i, :]
+        return
+
+    device = dpctl.select_default_device()
+
+    n = 10
+    a = dpnp.ones((n, n), dtype=dpnp.int32, device=device)
+    b = dpnp.ones((n, n), dtype=dpnp.int32, device=device)
+    c = dpnp.ones((n, n), dtype=dpnp.int32, device=device)
+    d = dpnp.ones((n, n), dtype=dpnp.int32, device=device)
+
+    prange_example(a, b, c, d)
+
+    np.testing.assert_equal(c.asnumpy(), np.ones((n, n), dtype=np.int32) * 2)
+    np.testing.assert_equal(d.asnumpy(), np.zeros((n, n), dtype=np.int32))
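
The new tests above all exercise one pattern: allocate dpnp arrays on a SYCL device, compile the function with dpjit, and iterate with numba_dpex.prange so the resulting parfor is offloaded to the device the data lives on (compute follows data). The snippet below is a minimal, illustrative sketch of that pattern, not part of the patch itself; it assumes a working dpctl/dpnp installation and the dpjit/prange exports introduced by this change.

import dpctl
import dpnp
import numpy as np

from numba_dpex import dpjit, prange


@dpjit
def vecadd(a, b, c):
    # Each prange loop is turned into a parfor and offloaded to the device
    # on which the dpnp arrays were allocated (compute follows data).
    for i in prange(a.size):
        c[i] = a[i] + b[i]
    return


device = dpctl.select_default_device()
n = 1024
a = dpnp.ones(n, dtype=dpnp.float32, device=device)
b = dpnp.ones(n, dtype=dpnp.float32, device=device)
c = dpnp.zeros(n, dtype=dpnp.float32, device=device)

vecadd(a, b, c)
np.testing.assert_equal(c.asnumpy(), np.ones(n, dtype=np.float32) * 2)

Mixing arrays from different devices, or writing into an array on a different device than the inputs (as in test_cfd_error_due_to_lhs above), is expected to raise ComputeFollowsDataInferenceError rather than trigger an implicit copy.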