diff --git a/numba_dpex/__init__.py b/numba_dpex/__init__.py index 073f57a1ff..4da923e6b6 100644 --- a/numba_dpex/__init__.py +++ b/numba_dpex/__init__.py @@ -74,6 +74,7 @@ def load_dpctl_sycl_interface(): f"dpctl={dpctl_version} may cause unexpected behavior" ) +from numba import prange # noqa E402 import numba_dpex.core.dpjit_dispatcher # noqa E402 import numba_dpex.core.offload_dispatcher # noqa E402 @@ -92,6 +93,7 @@ def load_dpctl_sycl_interface(): # Re-export all type names from numba_dpex.core.types import * # noqa E402 +from numba_dpex.dpnp_iface import dpnpimpl # noqa E402 from numba_dpex.retarget import offload_to_sycl_device # noqa E402 if config.HAS_NON_HOST_DEVICE: diff --git a/numba_dpex/_patches.py b/numba_dpex/_patches.py index 0c74df084e..cb996a1a9f 100644 --- a/numba_dpex/_patches.py +++ b/numba_dpex/_patches.py @@ -258,13 +258,11 @@ def _empty_nd_impl(context, builder, arrtype, shapes): ) from numba_dpex.decorators import dpjit - numba_config.DISABLE_PERFORMANCE_WARNINGS = 0 op = dpjit(_call_usm_allocator) fnop = context.typing_context.resolve_value_type(op) # The _call_usm_allocator function will be compiled and added to registry # when the get_call_type function is invoked. fnop.get_call_type(context.typing_context, sig.args, {}) - numba_config.DISABLE_PERFORMANCE_WARNINGS = 1 eqfn = context.get_function(fnop, sig) meminfo = eqfn(builder, args) else: @@ -309,11 +307,17 @@ def impl(cls, allocsize, usm_type, device): return impl +numba_config.DISABLE_PERFORMANCE_WARNINGS = 0 + + def _call_usm_allocator(arrtype, size, usm_type, device): """Trampoline to call the intrinsic used for allocation""" return arrtype._usm_allocate(size, usm_type, device) +numba_config.DISABLE_PERFORMANCE_WARNINGS = 1 + + @intrinsic def intrin_usm_alloc(typingctx, allocsize, usm_type, device): """Intrinsic to call into the allocator for Array""" diff --git a/numba_dpex/core/exceptions.py b/numba_dpex/core/exceptions.py index 1d75dd6f40..55b3f7e2de 100644 --- a/numba_dpex/core/exceptions.py +++ b/numba_dpex/core/exceptions.py @@ -221,7 +221,7 @@ def __init__( f"Arguments {ndarray_args} are non-usm arrays, " f"and arguments {usmarray_args} are usm arrays." 
) - elif usmarray_argnum_list: + elif usmarray_argnum_list is not None: usmarray_args = ",".join([str(i) for i in usmarray_argnum_list]) self.message = ( f'Execution queue for kernel "{kernel_name}" could ' @@ -433,3 +433,13 @@ def __init__(self, kernel_name, argtypes) -> None: ) super().__init__(self.message) + + +class UnsupportedParforError(Exception): + """Exception raised when a parfor node could not be lowered by Numba-dpex""" + + def __init__(self, extra_msg=None) -> None: + self.message = "Expression cannot be offloaded" + if extra_msg: + self.message += " due to " + extra_msg + super().__init__(self.message) diff --git a/numba_dpex/core/passes/__init__.py b/numba_dpex/core/passes/__init__.py index 00fec5f515..5d50252a3b 100644 --- a/numba_dpex/core/passes/__init__.py +++ b/numba_dpex/core/passes/__init__.py @@ -1,3 +1,27 @@ # SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation # # SPDX-License-Identifier: Apache-2.0 + +from .parfor_legalize_cfd_pass import ParforLegalizeCFDPass +from .parfor_lowering_pass import ParforLoweringPass +from .passes import ( + DumpParforDiagnostics, + NoPythonBackend, + ParforFusionPass, + ParforPass, + ParforPreLoweringPass, + PreParforPass, + SplitParforPass, +) + +__all__ = [ + "DumpParforDiagnostics", + "ParforLoweringPass", + "ParforLegalizeCFDPass", + "ParforFusionPass", + "ParforPreLoweringPass", + "ParforPass", + "PreParforPass", + "SplitParforPass", + "NoPythonBackend", +] diff --git a/numba_dpex/core/passes/parfor.py b/numba_dpex/core/passes/parfor.py index 9e162c90f1..454d63d350 100644 --- a/numba_dpex/core/passes/parfor.py +++ b/numba_dpex/core/passes/parfor.py @@ -2421,7 +2421,6 @@ def _arrayexpr_to_parfor(self, equiv_set, lhs, arrayexpr, avail_vars): expr = arrayexpr.expr arr_typ = pass_states.typemap[lhs.name] el_typ = arr_typ.dtype - # generate loopnests and size variables from lhs correlations size_vars = equiv_set.get_shape(lhs) index_vars, loopnests = _mk_parfor_loops( @@ -3788,6 +3787,46 @@ def _get_call_arg_types(expr, typemap): return tuple(new_arg_typs), new_kw_types +def _ufunc_to_parfor_instr( + typemap, + op, + avail_vars, + loc, + scope, + func_ir, + out_ir, + arg_vars, + typingctx, + calltypes, + expr_out_var, +): + func_var_name = _find_func_var(typemap, op, avail_vars, loc=loc) + func_var = ir.Var(scope, mk_unique_var(func_var_name), loc) + typemap[func_var.name] = typemap[func_var_name] + func_var_def = copy.deepcopy(func_ir.get_definition(func_var_name)) + if ( + isinstance(func_var_def, ir.Expr) + and func_var_def.op == "getattr" + and func_var_def.attr == "sqrt" + ): + g_math_var = ir.Var(scope, mk_unique_var("$math_g_var"), loc) + typemap[g_math_var.name] = types.misc.Module(math) + g_math = ir.Global("math", math, loc) + g_math_assign = ir.Assign(g_math, g_math_var, loc) + func_var_def = ir.Expr.getattr(g_math_var, "sqrt", loc) + out_ir.append(g_math_assign) + # out_ir.append(func_var_def) + ir_expr = ir.Expr.call(func_var, arg_vars, (), loc) + call_typ = typemap[func_var.name].get_call_type( + typingctx, tuple(typemap[a.name] for a in arg_vars), {} + ) + calltypes[ir_expr] = call_typ + el_typ = call_typ.return_type + # signature(el_typ, el_typ) + out_ir.append(ir.Assign(func_var_def, func_var, loc)) + out_ir.append(ir.Assign(ir_expr, expr_out_var, loc)) + + def _arrayexpr_tree_to_ir( func_ir, typingctx, @@ -3852,35 +3891,33 @@ def _arrayexpr_tree_to_ir( # elif isinstance(op, (np.ufunc, DUFunc)): # function calls are stored in variables which are not removed # op is typing_key to the variables type - func_var_name 
= _find_func_var(typemap, op, avail_vars, loc=loc) - func_var = ir.Var(scope, mk_unique_var(func_var_name), loc) - typemap[func_var.name] = typemap[func_var_name] - func_var_def = copy.deepcopy( - func_ir.get_definition(func_var_name) - ) - if ( - isinstance(func_var_def, ir.Expr) - and func_var_def.op == "getattr" - and func_var_def.attr == "sqrt" - ): - g_math_var = ir.Var( - scope, mk_unique_var("$math_g_var"), loc - ) - typemap[g_math_var.name] = types.misc.Module(math) - g_math = ir.Global("math", math, loc) - g_math_assign = ir.Assign(g_math, g_math_var, loc) - func_var_def = ir.Expr.getattr(g_math_var, "sqrt", loc) - out_ir.append(g_math_assign) - # out_ir.append(func_var_def) - ir_expr = ir.Expr.call(func_var, arg_vars, (), loc) - call_typ = typemap[func_var.name].get_call_type( - typingctx, tuple(typemap[a.name] for a in arg_vars), {} + _ufunc_to_parfor_instr( + typemap, + op, + avail_vars, + loc, + scope, + func_ir, + out_ir, + arg_vars, + typingctx, + calltypes, + expr_out_var, ) - calltypes[ir_expr] = call_typ - el_typ = call_typ.return_type - # signature(el_typ, el_typ) - out_ir.append(ir.Assign(func_var_def, func_var, loc)) - out_ir.append(ir.Assign(ir_expr, expr_out_var, loc)) + if hasattr(op, "is_dpnp_ufunc"): + _ufunc_to_parfor_instr( + typemap, + op, + avail_vars, + loc, + scope, + func_ir, + out_ir, + arg_vars, + typingctx, + calltypes, + expr_out_var, + ) elif isinstance(expr, ir.Var): var_typ = typemap[expr.name] if isinstance(var_typ, types.Array): diff --git a/numba_dpex/core/passes/parfor_legalize_cfd_pass.py b/numba_dpex/core/passes/parfor_legalize_cfd_pass.py new file mode 100644 index 0000000000..264e62f181 --- /dev/null +++ b/numba_dpex/core/passes/parfor_legalize_cfd_pass.py @@ -0,0 +1,323 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +from numba.core import ir, types +from numba.core.compiler_machinery import FunctionPass, register_pass +from numba.core.ir_utils import find_topo_order + +from numba_dpex.core.exceptions import ComputeFollowsDataInferenceError +from numba_dpex.core.passes.parfor_lowering_pass import ParforLowerFactory +from numba_dpex.core.types.dpnp_ndarray_type import DpnpNdArray + +from .parfor import ( + Parfor, + ParforDiagnostics, + get_parfor_outputs, + get_parfor_params, +) + + +class ParforLegalizeCFDPassImpl: + + """Legalizes the compute-follows-data based device attribute for parfor + nodes. + + DpnpNdArray array-expressions populate the type of the left-hand-side (LHS) + of each expression as a default DpnpNdArray instance derived from the + __array_ufunc__ method of DpnpNdArray class. The pass fixes the LHS type by + properly applying compute follows data programming model. The pass first + checks if the right-hand-side (RHS) DpnpNdArray arguments are on the same + device, else raising a ComputeFollowsDataInferenceError. Once the RHS has + been validated, the LHS type is updated. + + The pass also updated the usm_type of the LHS based on a USM type + propagation rule: device > shared > host. Thus, if the usm_type attribute of + the RHS arrays are "device" and "shared" respectively, the LHS array's + usm_type attribute will be "device". + + Once the pass has identified a parfor with DpnpNdArrays and legalized it, + the "lowerer" attribute of the parfor is set to + ``numba_dpex.core.passes.parfor_lowering_pass._lower_parfor_as_kernel`` so + that the parfor node is lowered using Dpex's lowerer. 
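+
+    For example (a hypothetical sketch of code compiled with dpjit; the dpnp
+    keyword arguments are assumed to follow the dpctl.tensor conventions)::
+
+        a = dpnp.ones(10, usm_type="device", device="gpu")
+        b = dpnp.ones(10, usm_type="shared", device="gpu")
+        c = a + b  # LHS legalized to usm_type="device", device="gpu"
+
+        d = dpnp.ones(10, device="cpu")
+        e = a + d  # raises ComputeFollowsDataInferenceError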
+ + """ + + inputUsmTypeStrToInt = {"device": 3, "shared": 2, "host": 1} + inputUsmTypeIntToStr = {3: "device", 2: "shared", 1: "host"} + + def __init__(self, state) -> None: + self._state = state + self._cfd_updated_values = set() + self._seen_array_set = set() + diagnostics = ParforDiagnostics() + self.nested_fusion_info = diagnostics.nested_fusion_info + + def _check_if_dpnp_empty_call(self, call_stmt, block): + func_def = block.find_variable_assignment(call_stmt.name) + if not ( + isinstance(func_def, ir.Assign) + and isinstance(func_def.value, ir.Expr) + and func_def.value.op == "getattr" + ): + raise AssertionError + + module_name = block.find_variable_assignment( + func_def.value.list_vars()[0].name + ).value.value.__name__ + + if func_def.value.attr == "empty" and module_name == "dpnp": + return True + else: + return False + + def _check_cfd_parfor_params(self, parfor, checklist): + deviceTypes = set() + usmTypes = [] + + for para in checklist: + if not isinstance(self._state.typemap[para], DpnpNdArray): + continue + argty = self._state.typemap[para] + deviceTypes.add(argty.device) + try: + usmTypes.append( + ParforLegalizeCFDPassImpl.inputUsmTypeStrToInt[ + argty.usm_type + ] + ) + except KeyError: + raise ValueError( + "Unknown USM type encountered. Supported " + "usm types are: device, shared and host." + ) + # Check compute follows data on the dpnp arrays in checklist + if len(deviceTypes) > 1: + raise ComputeFollowsDataInferenceError( + kernel_name=parfor.loc.short(), + usmarray_argnum_list=[], + ) + # Derive the usm_type based on usm allocator precedence rule: + # device > shared > host + conforming_usm_ty = max(usmTypes) + conforming_device_ty = deviceTypes.pop() + + # FIXME: Changed to namedtuple + return (conforming_usm_ty, conforming_device_ty) + + def _legalize_dpnp_empty_call(self, required_arrty, call_stmt, block): + args = call_stmt.args + sigargs = self._state.calltypes[call_stmt].args + sigargs_new = list(sigargs) + # Update the RHS usm_type, device, attributes + for idx, arg in enumerate(args): + argdef = block.find_variable_assignment(arg.name) + if argdef: + attribute = argdef.target.name + if "usm_type" in attribute: + self._state.typemap.update( + {attribute: types.literal(required_arrty.usm_type)} + ) + sigargs_new[idx] = types.literal(required_arrty.usm_type) + elif "device" in attribute: + self._state.typemap.update( + {attribute: types.literal(required_arrty.device)} + ) + sigargs_new[idx] = types.literal(required_arrty.device) + sigargs = tuple(sigargs_new) + new_sig = self._state.typingctx.resolve_function_type( + self._state.typemap[call_stmt.func.name], sigargs, {} + ) + self._state.calltypes.update({call_stmt: new_sig}) + + def _legalize_array_attrs( + self, arrattr, legalized_device_ty, legalized_usm_ty + ): + modified = False + updated_device = None + updated_usm_ty = None + + if self._state.typemap[arrattr].device != legalized_device_ty: + updated_device = legalized_device_ty + modified = True + + if self._state.typemap[arrattr].usm_type != legalized_usm_ty: + updated_usm_ty = legalized_usm_ty + modified = True + + if modified: + ty = self._state.typemap[arrattr] + new_ty = ty.copy(device=updated_device, usm_type=updated_usm_ty) + self._state.typemap.update({arrattr: new_ty}) + + return modified + + def _legalize_parfor_params(self, parfor): + """Checks the parfor params for compute follows data compliance and + returns the conforming device for the parfor. 
+ + Args: + parfor: Parfor node to be analyzed + + Returns: + str: The device filter string for the parfor if the parfor is + compute follows data conforming. + """ + if parfor.params is None: + return + + outputParams = get_parfor_outputs(parfor, parfor.params) + checklist = sorted(list(set(parfor.params) - set(outputParams))) + + # Check if any output param was defined outside the parfor + for para in outputParams: + if ( + isinstance(self._state.typemap[para], DpnpNdArray) + and para in self._seen_array_set + ): + checklist.append(para) + + # Check params in checklist for CFD compliance and derive the common + # usm allocator and device based on the checklist params + usm_ty, device_ty = self._check_cfd_parfor_params(parfor, checklist) + + # Update any outputs that are generated in the parfor + for para in outputParams: + if not isinstance(self._state.typemap[para], DpnpNdArray): + continue + # Legalize LHS. Skip if we already updated the type before and no + # further legalization is needed. + if self._legalize_array_attrs( + para, + device_ty, + ParforLegalizeCFDPassImpl.inputUsmTypeIntToStr[usm_ty], + ): + # Keep track of vars that have been updated + self._cfd_updated_values.add(para) + else: + try: + self._cfd_updated_values.remove(para) + except KeyError: + pass + + return device_ty + + def _legalize_cfd_parfor_blocks(self, parfor): + """Legalize the parfor params based on the compute follows data + programming model and usm allocator precedence rule. + """ + conforming_device_ty = self._legalize_parfor_params(parfor) + + # Update the parfor's lowerer attribute + parfor.lowerer = ParforLowerFactory.get_lowerer(conforming_device_ty) + + init_block = parfor.init_block + blocks = parfor.loop_body + + for stmt in init_block.body: + self._legalize_stmt(stmt, init_block, inparfor=True) + + for block in blocks.values(): + for stmt in block.body: + self._legalize_stmt(stmt, block, inparfor=True) + + def _legalize_expr(self, stmt, lhs, lhsty, parent_block, inparfor=False): + rhs = stmt.value + if rhs.op == "call": + if self._check_if_dpnp_empty_call(rhs.func, parent_block): + if inparfor and lhs in self._cfd_updated_values: + self._legalize_dpnp_empty_call(lhsty, rhs, parent_block) + self._seen_array_set.add(lhs) + # TODO: If any other array constructor that does not take + # args, just add to self._seen_array_set + else: + for ele in rhs.list_vars(): + if ele.name in self._cfd_updated_values: + # TODO: Resolve function type with new argument + raise NotImplementedError( + "Compute follows data is not currently " + "supported for function calls." + ) + elif rhs.op == "cast" and rhs.value.name in self._cfd_updated_values: + device_ty = self._state.typemap[rhs.value.name].device + usm_ty = self._state.typemap[rhs.value.name].usm_type + if self._legalize_array_attrs( + arrattr=lhs, + legalized_device_ty=device_ty, + legalized_usm_ty=usm_ty, + ): + self._cfd_updated_values.add(lhs) + else: + try: + self._cfd_updated_values.remove(lhs) + except KeyError: + pass + else: + # The assumption is all other expr types are by now either parfors + # or are insts like setattr, getattr, # getitem, etc. and do not + # need CFD legalization. 
+ self._seen_array_set.add(lhs) + + def _legalize_stmt(self, stmt, parent_block, inparfor=False): + if isinstance(stmt, ir.Assign): + lhs = stmt.target.name + lhsty = self._state.typemap[lhs] + if isinstance(lhsty, DpnpNdArray): + if isinstance(stmt.value, ir.Arg): + self._seen_array_set.add(lhs) + elif isinstance(stmt.value, ir.Expr): + self._legalize_expr( + stmt, lhs, lhsty, parent_block, inparfor + ) + elif isinstance(stmt, Parfor): + self._legalize_cfd_parfor_blocks(stmt) + elif isinstance(stmt, ir.Return): + # Check if the return value is a DpnpNdArray and was changed by + # compute follows data legalization + retty = self._state.typemap[stmt.value.name] + if ( + isinstance(retty, DpnpNdArray) + and stmt.value.name in self._cfd_updated_values + and self._state.return_type != retty + ): + self._state.return_type = retty + + def run(self): + # The get_parfor_params needs to be run here to initialize the parfor + # nodes prior to using them. + _, _ = get_parfor_params( + self._state.func_ir.blocks, + self._state.flags.auto_parallel, + self.nested_fusion_info, + ) + + # FIXME: Traversing the blocks in topological order is not sufficient. + # The traversal should be converted to a backward data flow traversal of + # the CFG. The algorithm needs to then become a fixed-point work list + # algorithm. + topo_order = find_topo_order(self._state.func_ir.blocks) + + # Apply CFD legalization to parfor nodes and dpnp_empty calls + for label in topo_order: + block = self._state.func_ir.blocks[label] + for stmt in block.body: + self._legalize_stmt(stmt, block) + + +@register_pass(mutates_CFG=True, analysis_only=False) +class ParforLegalizeCFDPass(FunctionPass): + _name = "parfor_Legalize_CFD_pass" + + def __init__(self): + FunctionPass.__init__(self) + + def run_pass(self, state): + """ + Legalize CFD of parfor nodes. + """ + # Ensure we have an IR and type information. + assert state.func_ir + cfd_legalizer = ParforLegalizeCFDPassImpl(state) + cfd_legalizer.run() + + return True diff --git a/numba_dpex/core/passes/parfor_lowering_pass.py b/numba_dpex/core/passes/parfor_lowering_pass.py new file mode 100644 index 0000000000..1afd0b16f8 --- /dev/null +++ b/numba_dpex/core/passes/parfor_lowering_pass.py @@ -0,0 +1,394 @@ +# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import copy + +from numba.core import funcdesc, ir, types +from numba.core.compiler_machinery import LoweringPass, register_pass +from numba.core.lowering import Lower +from numba.parfors.parfor_lowering import ( + _lower_parfor_parallel as _lower_parfor_parallel_std, +) + +from numba_dpex import config +from numba_dpex.core.utils.kernel_launcher import KernelLaunchIRBuilder + +from ..exceptions import UnsupportedParforError +from ..types.dpnp_ndarray_type import DpnpNdArray +from ..utils.kernel_builder import create_kernel_for_parfor +from .parfor import Parfor, find_potential_aliases_parfor, get_parfor_outputs + +# A global list of kernels to keep the objects alive indefinitely. +keep_alive_kernels = [] + + +def _getvar_or_none(lowerer, x): + try: + return lowerer.getvar(x) + except: + return None + + +def _load_range(lowerer, value): + if isinstance(value, ir.Var): + return lowerer.loadvar(value.name) + else: + return lowerer.context.get_constant(types.uintp, value) + + +class ParforLowerImpl: + """Provides a custom lowerer for parfor nodes that generates a SYCL kernel + for a parfor and submits it to a queue. 
+ """ + + def _submit_parfor_kernel( + self, + lowerer, + kernel_fn, + loop_ranges, + ): + """ + Adds the call to the dpex kernel function from the main function. + """ + keep_alive_kernels.append(kernel_fn.kernel) + + # Helper class that generates the LLVM IR values inside the current LLVM + # module that are needed to submit the kernel to a queue. + ir_builder = KernelLaunchIRBuilder(lowerer, kernel_fn.kernel) + + # Create a local variable storing a pointer to a DPCTLSyclQueueRef + # pointer. + curr_queue = ir_builder.get_queue(exec_queue=kernel_fn.queue) + + num_flattened_args = 0 + + # Compute number of args to be passed to the kernel. Note that the + # actual number of kernel arguments is greater than the count of + # kernel_fn.kernel_args as arrays get flattened. + for arg_type in kernel_fn.kernel_arg_types: + if isinstance(arg_type, DpnpNdArray): + # FIXME: Remove magic constants + num_flattened_args += 5 + (2 * arg_type.ndim) + else: + num_flattened_args += 1 + + # Create LLVM values for the kernel args list and kernel arg types list + args_list = ir_builder.allocate_kernel_arg_array(num_flattened_args) + args_ty_list = ir_builder.allocate_kernel_arg_ty_array( + num_flattened_args + ) + + # Populate the args_list and the args_ty_list LLVM arrays + kernel_arg_num = 0 + for arg_num, arg in enumerate(kernel_fn.kernel_args): + argtype = kernel_fn.kernel_arg_types[arg_num] + llvm_val = _getvar_or_none(lowerer, arg) + if not llvm_val: + raise AssertionError + if isinstance(argtype, DpnpNdArray): + ir_builder.build_array_arg( + array_val=llvm_val, + array_rank=argtype.ndim, + arg_list=args_list, + args_ty_list=args_ty_list, + arg_num=kernel_arg_num, + ) + # FIXME: Get rid of magic constants + kernel_arg_num += 5 + (2 * argtype.ndim) + else: + ir_builder.build_arg( + llvm_val, argtype, args_list, args_ty_list, kernel_arg_num + ) + kernel_arg_num += 1 + + # Create a global range over which to submit the kernel based on the + # loop_ranges of the parfor + global_range = [] + # SYCL ranges can have at max 3 dimension. If the parfor is of a higher + # dimension then the indexing for the higher dimensions is done inside + # the kernel. + global_range_rank = len(loop_ranges) if len(loop_ranges) < 3 else 3 + for i in range(global_range_rank): + start, stop, step = loop_ranges[i] + stop = _load_range(lowerer, stop) + if step != 1: + raise UnsupportedParforError( + "non-unit strides are not yet supported." + ) + global_range.append(stop) + + # Submit a synchronous kernel + ir_builder.submit_sync_ranged_kernel( + global_range, curr_queue, kernel_arg_num, args_list, args_ty_list + ) + + # At this point we can free the DPCTLSyclQueueRef (curr_queue) + ir_builder.free_queue(sycl_queue_val=curr_queue) + + def _lower_parfor_as_kernel(self, lowerer, parfor): + """Lowers a parfor node created by the dpjit compiler to a kernel. + + The general approach is as follows: + + - The code from the parfor's init block is lowered normally + in the context of the current function. + - The body of the parfor is transformed into a kernel function. + - Dpctl runtime calls to submit the kernel are added. + + """ + # We copy the typemap here because for race condition variable we'll + # update their type to array so they can be updated by the kernel. + orig_typemap = lowerer.fndesc.typemap + + # replace original typemap with copy and restore the original at the + # end. 
+        lowerer.fndesc.typemap = copy.copy(orig_typemap)
+
+        if config.DEBUG_ARRAY_OPT:
+            print("lowerer.fndesc", lowerer.fndesc, type(lowerer.fndesc))
+
+        typemap = lowerer.fndesc.typemap
+        varmap = lowerer.varmap
+
+        loc = parfor.init_block.loc
+        scope = parfor.init_block.scope
+
+        # Lower the init block of the parfor.
+        for instr in parfor.init_block.body:
+            lowerer.lower_inst(instr)
+
+        for racevar in parfor.races:
+            if racevar not in varmap:
+                rvtyp = typemap[racevar]
+                rv = ir.Var(scope, racevar, loc)
+                lowerer._alloca_var(rv.name, rvtyp)
+
+        alias_map = {}
+        arg_aliases = {}
+
+        find_potential_aliases_parfor(
+            parfor,
+            parfor.params,
+            typemap,
+            lowerer.func_ir,
+            alias_map,
+            arg_aliases,
+        )
+
+        # run get_parfor_outputs() and get_parfor_reductions() before
+        # kernel creation since Jumps are modified so CFG of loop_body
+        # dict will become invalid
+        if parfor.params is None:
+            raise AssertionError
+
+        parfor_output_arrays = get_parfor_outputs(parfor, parfor.params)
+
+        # compile parfor body as a separate dpex kernel function
+        flags = copy.copy(parfor.flags)
+        flags.error_model = "numpy"
+
+        # Can't get here unless
+        # flags.set('auto_parallel', ParallelOptions(True))
+        index_var_typ = typemap[parfor.loop_nests[0].index_variable.name]
+
+        # index variables should have the same type, check rest of indices
+        for loop_nest in parfor.loop_nests[1:]:
+            if typemap[loop_nest.index_variable.name] != index_var_typ:
+                raise AssertionError
+
+        loop_ranges = [
+            (loop_nest.start, loop_nest.stop, loop_nest.step)
+            for loop_nest in parfor.loop_nests
+        ]
+
+        try:
+            parfor_kernel = create_kernel_for_parfor(
+                lowerer,
+                parfor,
+                typemap,
+                flags,
+                loop_ranges,
+                bool(alias_map),
+                parfor.races,
+                parfor_output_arrays,
+            )
+        except Exception:
+            # FIXME: Make the exception more informative
+            raise UnsupportedParforError
+
+        # Finally submit the kernel
+        self._submit_parfor_kernel(lowerer, parfor_kernel, loop_ranges)
+
+        # TODO: free the kernel at this point
+
+        # Restore the original typemap of the function that was replaced
+        # temporarily at the beginning of this function.
+        lowerer.fndesc.typemap = orig_typemap
+
+
+class ParforLowerFactory:
+    """A pseudo-factory class that maps a device filter string to a lowering
+    function.
+
+    Each parfor can have a "lowerer" attribute that determines how the parfor
+    node is to be lowered to LLVM IR. The factory class maintains a static map
+    from every device type (filter string) encountered so far to the lowerer
+    function for that device type. At this point numba-dpex does not generate
+    device-specific code, so the lowerer is always the same for all devices.
+    By handing out a distinct lowerer instance per device we make sure parfors
+    that will execute on distinct devices, as determined by the
+    compute-follows-data programming model, are never fused together.
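+
+    A minimal usage sketch (the device filter string below is hypothetical and
+    ``parfor_node`` is a placeholder for a Numba parfor IR node; in practice
+    ParforLegalizeCFDPass selects the device during legalization)::
+
+        lowerer_fn = ParforLowerFactory.get_lowerer("level_zero:gpu:0")
+        parfor_node.lowerer = lowerer_fn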
+ """ + + device_to_lowerer_map = {} + + @classmethod + def get_lowerer(cls, device): + try: + lowerer = ParforLowerFactory.device_to_lowerer_map[device] + except KeyError: + lowerer = ParforLowerImpl()._lower_parfor_as_kernel + ParforLowerFactory.device_to_lowerer_map[device] = lowerer + + return lowerer + + +class WrapperDefaultLower(Lower): + @property + def _disable_sroa_like_opt(self): + """We always return True.""" + return True + + +def lower_parfor_dpex(lowerer, parfor): + parfor.lowerer = ParforLowerImpl()._lower_parfor_as_kernel + if parfor.lowerer is None: + _lower_parfor_parallel_std(lowerer, parfor) + else: + parfor.lowerer(lowerer, parfor) + + +class _ParforLower(Lower): + """Extends standard lowering to accommodate parfor.Parfor nodes that may + have the `lowerer` attribute set. + """ + + def __init__(self, context, library, fndesc, func_ir, metadata=None): + Lower.__init__(self, context, library, fndesc, func_ir, metadata) + self.dpex_lower = self._lower( + context, library, fndesc, func_ir, metadata + ) + + def _lower(self, context, library, fndesc, func_ir, metadata): + """Create Lower with changed linkageName in debug info""" + lower = WrapperDefaultLower(context, library, fndesc, func_ir, metadata) + + # Debuginfo + if context.enable_debuginfo: + from numba.core.funcdesc import default_mangler, qualifying_prefix + + from numba_dpex.debuginfo import DpexDIBuilder + + qualprefix = qualifying_prefix(fndesc.modname, fndesc.qualname) + mangled_qualname = default_mangler(qualprefix, fndesc.argtypes) + + lower.debuginfo = DpexDIBuilder( + module=lower.module, + filepath=func_ir.loc.filename, + linkage_name=mangled_qualname, + cgctx=context, + ) + + return lower + + def lower(self): + context = self.dpex_lower.context + + # Only Numba's CPUContext has the `lower_extension` attribute + context.lower_extensions[Parfor] = lower_parfor_dpex + self.dpex_lower.lower() + self.base_lower = self.dpex_lower + + self.env = self.base_lower.env + self.call_helper = self.base_lower.call_helper + + def create_cpython_wrapper(self, release_gil=False): + return self.base_lower.create_cpython_wrapper(release_gil) + + +@register_pass(mutates_CFG=True, analysis_only=False) +class ParforLoweringPass(LoweringPass): + """A custom lowering pass that does dpex-specific lowering of parfor + nodes. + + FIXME: Redesign once numba-dpex supports Numba 0.57 + """ + + _name = "dpjit_lowering" + + def __init__(self): + LoweringPass.__init__(self) + + def run_pass(self, state): + if state.library is None: + codegen = state.targetctx.codegen() + state.library = codegen.create_library(state.func_id.func_qualname) + # Enable object caching upfront, so that the library can + # be later serialized. 
+ state.library.enable_object_caching() + + targetctx = state.targetctx + + library = state.library + interp = state.func_ir + typemap = state.typemap + restype = state.return_type + calltypes = state.calltypes + flags = state.flags + metadata = state.metadata + + kwargs = {} + + # for support numba 0.54 and <=0.55.0dev0=*_469 + if hasattr(flags, "get_mangle_string"): + kwargs["abi_tags"] = flags.get_mangle_string() + # Lowering + fndesc = funcdesc.PythonFunctionDescriptor.from_specialized_function( + interp, + typemap, + restype, + calltypes, + mangler=targetctx.mangler, + inline=flags.forceinline, + noalias=flags.noalias, + **kwargs, + ) + + with targetctx.push_code_library(library): + lower = _ParforLower( + targetctx, library, fndesc, interp, metadata=metadata + ) + lower.lower() + if not flags.no_cpython_wrapper: + lower.create_cpython_wrapper(flags.release_gil) + + env = lower.env + call_helper = lower.call_helper + del lower + + from numba.core.compiler import _LowerResult # TODO: move this + + if flags.no_compile: + state["cr"] = _LowerResult(fndesc, call_helper, cfunc=None, env=env) + else: + # Prepare for execution + cfunc = targetctx.get_executable(library, fndesc, env) + # Insert native function for use by other jitted-functions. + # We also register its library to allow for inlining. + targetctx.insert_user_function(cfunc, fndesc, [library]) + state["cr"] = _LowerResult( + fndesc, call_helper, cfunc=cfunc, env=env + ) + + return True diff --git a/numba_dpex/core/passes/passes.py b/numba_dpex/core/passes/passes.py index ccf2640ac6..c5ab12271d 100644 --- a/numba_dpex/core/passes/passes.py +++ b/numba_dpex/core/passes/passes.py @@ -152,6 +152,7 @@ def run_pass(self, state): # Ensure we have an IR and type information. assert state.func_ir functions_map = swap_functions_map.copy() + # FIXME: remove once reduction is implemented functions_map.pop(("dot", "numpy"), None) functions_map.pop(("sum", "numpy"), None) functions_map.pop(("prod", "numpy"), None) diff --git a/numba_dpex/core/pipelines/dpjit_compiler.py b/numba_dpex/core/pipelines/dpjit_compiler.py new file mode 100644 index 0000000000..433dfd9a27 --- /dev/null +++ b/numba_dpex/core/pipelines/dpjit_compiler.py @@ -0,0 +1,118 @@ +# SPDX-FileCopyrightText: 2020 - 2022 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +from numba.core.compiler import CompilerBase, DefaultPassBuilder +from numba.core.compiler_machinery import PassManager +from numba.core.typed_passes import ( + AnnotateTypes, + InlineOverloads, + IRLegalization, + NopythonRewrites, + NoPythonSupportedFeatureValidation, + NopythonTypeInference, + PreLowerStripPhis, + PreParforPass, +) + +from numba_dpex.core.exceptions import UnsupportedCompilationModeError +from numba_dpex.core.passes import ( + DumpParforDiagnostics, + NoPythonBackend, + ParforFusionPass, + ParforLegalizeCFDPass, + ParforLoweringPass, + ParforPreLoweringPass, + SplitParforPass, +) +from numba_dpex.parfor_diagnostics import ExtendedParforDiagnostics + + +class _DpjitPassBuilder(object): + """ + A pass builder for dpex's DpjitCompiler that adds supports for + offloading dpnp array expressions and dpnp library calls to a SYCL device. + + The pass builder does not implement pipelines for objectmode or interpreted + execution. 
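+
+    A sketch of how this pipeline is typically reached from user code
+    (illustrative only; the ``dpjit`` decorator is assumed to select the
+    DpjitCompiler pipeline through its dispatcher)::
+
+        import dpnp
+        from numba_dpex.decorators import dpjit
+
+        @dpjit
+        def vecadd(a, b):
+            return a + b
+
+        c = vecadd(dpnp.ones(1024), dpnp.ones(1024))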
+ """ + + @staticmethod + def define_typed_pipeline(state, name="dpex_dpjit_typed"): + """Returns the typed part of the nopython pipeline""" + pm = PassManager(name) + # typing + pm.add_pass(NopythonTypeInference, "nopython frontend") + # Annotate only once legalized + pm.add_pass(AnnotateTypes, "annotate types") + # strip phis + pm.add_pass(PreLowerStripPhis, "remove phis nodes") + + # optimization + pm.add_pass(InlineOverloads, "inline overloaded functions") + pm.add_pass(PreParforPass, "Preprocessing for parfors") + if not state.flags.no_rewrites: + pm.add_pass(NopythonRewrites, "nopython rewrites") + pm.add_pass(SplitParforPass, "convert to parfors") + pm.add_pass( + ParforLegalizeCFDPass, "Legalize parfors for compute follows data" + ) + pm.add_pass(ParforFusionPass, "fuse parfors") + pm.add_pass(ParforPreLoweringPass, "parfor prelowering") + + pm.finalize() + return pm + + @staticmethod + def define_nopython_lowering_pipeline(state, name="dpex_dpjit_lowering"): + """Returns an nopython mode pipeline based PassManager""" + pm = PassManager(name) + + # legalize + pm.add_pass( + NoPythonSupportedFeatureValidation, + "ensure features that are in use are in a valid form", + ) + pm.add_pass(IRLegalization, "ensure IR is legal prior to lowering") + + # lower + pm.add_pass(ParforLoweringPass, "Custom lowerer for dpex parfor nodes") + pm.add_pass(NoPythonBackend, "nopython mode backend") + pm.add_pass(DumpParforDiagnostics, "dump parfor diagnostics") + + pm.finalize() + return pm + + @staticmethod + def define_nopython_pipeline(state, name="dpex_dpjit_nopython"): + """Returns an nopython mode pipeline based PassManager""" + # compose pipeline from untyped, typed and lowering parts + dpb = _DpjitPassBuilder + pm = PassManager(name) + untyped_passes = DefaultPassBuilder.define_untyped_pipeline(state) + pm.passes.extend(untyped_passes.passes) + + typed_passes = dpb.define_typed_pipeline(state) + pm.passes.extend(typed_passes.passes) + + lowering_passes = dpb.define_nopython_lowering_pipeline(state) + pm.passes.extend(lowering_passes.passes) + + pm.finalize() + return pm + + +class DpjitCompiler(CompilerBase): + """Dpex's compiler pipeline to offload parfor nodes into SYCL kernels.""" + + def define_pipelines(self): + pms = [] + self.state.parfor_diagnostics = ExtendedParforDiagnostics() + self.state.metadata[ + "parfor_diagnostics" + ] = self.state.parfor_diagnostics + if not self.state.flags.force_pyobject: + pms.append(_DpjitPassBuilder.define_nopython_pipeline(self.state)) + if self.state.status.can_fallback or self.state.flags.force_pyobject: + raise UnsupportedCompilationModeError() + return pms diff --git a/numba_dpex/core/runtime/_dpexrt_python.c b/numba_dpex/core/runtime/_dpexrt_python.c index fc13f2954e..3712704bec 100644 --- a/numba_dpex/core/runtime/_dpexrt_python.c +++ b/numba_dpex/core/runtime/_dpexrt_python.c @@ -21,6 +21,22 @@ #include "numba/_arraystruct.h" +/* Debugging facilities - enabled at compile-time */ +/* #undef NDEBUG */ +#if 0 +#include +#define DPEXRT_DEBUG(X) \ + { \ + X; \ + fflush(stdout); \ + } +#else +#define DPEXRT_DEBUG(X) \ + if (0) { \ + X; \ + } +#endif + // forward declarations static struct PyUSMArrayObject *PyUSMNdArray_ARRAYOBJ(PyObject *obj); static npy_intp product_of_shape(npy_intp *shape, npy_intp ndim); @@ -30,6 +46,7 @@ static void *usm_host_malloc(size_t size, void *opaque_data); static void usm_free(void *data, void *opaque_data); static NRT_ExternalAllocator * NRT_ExternalAllocator_new_for_usm(DPCTLSyclQueueRef qref, size_t usm_type); +static 
void *DPEXRTQueue_CreateFromFilterString(const char *device); static MemInfoDtorInfo *MemInfoDtorInfo_new(NRT_MemInfo *mi, PyObject *owner); static NRT_MemInfo *NRT_MemInfo_new_from_usmndarray(PyObject *ndarrobj, void *data, @@ -106,6 +123,91 @@ static void usm_free(void *data, void *opaque_data) DPCTLfree_with_queue(data, qref); } +/*----------------------------------------------------------------------------*/ +/*--------- Functions for dpctl libsyclinterface/sycl gluing ---------*/ +/*----------------------------------------------------------------------------*/ + +/*! + * @brief Creates and returns a DPCTLSyclQueueRef from a filter string. + * + * @param device A sycl::oneapi_ext::filter_string + * @return {DPCTLSyclQueueRef} A DPCTLSyclQueueRef object as void*. + */ +static void *DPEXRTQueue_CreateFromFilterString(const char *device) +{ + DPCTLSyclDeviceSelectorRef dselector = NULL; + DPCTLSyclDeviceRef dref = NULL; + DPCTLSyclQueueRef qref = NULL; + + DPEXRT_DEBUG(nrt_debug_print( + "DPEXRT-DEBUG: Inside DPEXRT_get_sycl_queue %s, line %d\n", __FILE__, + __LINE__)); + + if (!(dselector = DPCTLFilterSelector_Create(device))) { + DPEXRT_DEBUG(nrt_debug_print( + "DPEXRT-ERROR: Could not create a sycl::device_selector from " + "filter string: %s at %s %d.\n", + device, __FILE__, __LINE__)); + goto error; + } + + if (!(dref = DPCTLDevice_CreateFromSelector(dselector))) + goto error; + + if (!(qref = DPCTLQueue_CreateForDevice(dref, NULL, 0))) + goto error; + + DPCTLDeviceSelector_Delete(dselector); + DPCTLDevice_Delete(dref); + + DPEXRT_DEBUG(nrt_debug_print( + "DPEXRT-DEBUG: Created sycl::queue on device %s at %s, line %d\n", + device, __FILE__, __LINE__)); + + return (void *)qref; + +error: + DPCTLDeviceSelector_Delete(dselector); + DPCTLDevice_Delete(dref); + + return NULL; +} + +static void DpexrtQueue_SubmitRange(const void *KRef, + const void *QRef, + void **Args, + const DPCTLKernelArgType *ArgTypes, + size_t NArgs, + const size_t Range[3], + size_t NRange, + const void *DepEvents, + size_t NDepEvents) +{ + DPCTLSyclEventRef eref = NULL; + DPCTLSyclQueueRef qref = NULL; + + DPEXRT_DEBUG(nrt_debug_print( + "DPEXRT-DEBUG: Inside DpexrtQueue_SubmitRange %s, line %d\n", __FILE__, + __LINE__)); + + qref = (DPCTLSyclQueueRef)QRef; + + eref = DPCTLQueue_SubmitRange( + (DPCTLSyclKernelRef)KRef, qref, Args, (DPCTLKernelArgType *)ArgTypes, + NArgs, Range, NRange, (DPCTLSyclEventRef *)DepEvents, NDepEvents); + DPCTLQueue_Wait(qref); + DPCTLEvent_Wait(eref); + DPCTLEvent_Delete(eref); + + DPEXRT_DEBUG(nrt_debug_print( + "DPEXRT-DEBUG: Done with DpexrtQueue_SubmitRange %s, line %d\n", + __FILE__, __LINE__)); +} + +/*----------------------------------------------------------------------------*/ +/*---------------------- Functions for NRT_MemInfo allocation ----------------*/ +/*----------------------------------------------------------------------------*/ + /*! * @brief Creates a new NRT_ExternalAllocator object tied to a SYCL USM * allocator. 
@@ -128,13 +230,15 @@ NRT_ExternalAllocator_new_for_usm(DPCTLSyclQueueRef qref, size_t usm_type) allocator = (NRT_ExternalAllocator *)malloc(sizeof(NRT_ExternalAllocator)); if (allocator == NULL) { - NRT_Debug(nrt_debug_print("DPEXRT-ERROR: failed to allocate memory for " - "NRT_ExternalAllocator at %s, line %d.\n", - __FILE__, __LINE__)); + DPEXRT_DEBUG( + nrt_debug_print("DPEXRT-ERROR: failed to allocate memory for " + "NRT_ExternalAllocator at %s, line %d.\n", + __FILE__, __LINE__)); goto error; } - NRT_Debug(nrt_debug_print("DPEXRT-DEBUG: usm type = %d at %s, line %d.\n", - usm_type, __FILE__, __LINE__)); + DPEXRT_DEBUG( + nrt_debug_print("DPEXRT-DEBUG: usm type = %d at %s, line %d.\n", + usm_type, __FILE__, __LINE__)); switch (usm_type) { case 1: @@ -147,9 +251,9 @@ NRT_ExternalAllocator_new_for_usm(DPCTLSyclQueueRef qref, size_t usm_type) allocator->malloc = usm_host_malloc; break; default: - NRT_Debug(nrt_debug_print("DPEXRT-ERROR: Encountered an unknown usm " - "allocation type (%d) at %s, line %d\n", - usm_type, __FILE__, __LINE__)); + DPEXRT_DEBUG(nrt_debug_print("DPEXRT-ERROR: Encountered an unknown usm " + "allocation type (%d) at %s, line %d\n", + usm_type, __FILE__, __LINE__)); goto error; } @@ -193,7 +297,7 @@ static void usmndarray_meminfo_dtor(void *ptr, size_t size, void *info) // Sanity-check to make sure the mi_dtor_info is an actual pointer. if (!(mi_dtor_info = (MemInfoDtorInfo *)info)) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: MemInfoDtorInfo object might be corrupted. Aborting " "MemInfo destruction at %s, line %d\n", __FILE__, __LINE__)); @@ -248,9 +352,9 @@ static MemInfoDtorInfo *MemInfoDtorInfo_new(NRT_MemInfo *mi, PyObject *owner) MemInfoDtorInfo *mi_dtor_info = NULL; if (!(mi_dtor_info = (MemInfoDtorInfo *)malloc(sizeof(MemInfoDtorInfo)))) { - NRT_Debug(nrt_debug_print("DPEXRT-ERROR: Could not allocate a new " - "MemInfoDtorInfo object at %s, line %d\n", - __FILE__, __LINE__)); + DPEXRT_DEBUG(nrt_debug_print("DPEXRT-ERROR: Could not allocate a new " + "MemInfoDtorInfo object at %s, line %d\n", + __FILE__, __LINE__)); return NULL; } mi_dtor_info->mi = mi; @@ -283,7 +387,7 @@ static NRT_MemInfo *NRT_MemInfo_new_from_usmndarray(PyObject *ndarrobj, // Allocate a new NRT_MemInfo object if (!(mi = (NRT_MemInfo *)malloc(sizeof(NRT_MemInfo)))) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: Could not allocate a new NRT_MemInfo " "object at %s, line %d\n", __FILE__, __LINE__)); @@ -291,7 +395,7 @@ static NRT_MemInfo *NRT_MemInfo_new_from_usmndarray(PyObject *ndarrobj, } if (!(cref = DPCTLQueue_GetContext(qref))) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: Could not get the DPCTLSyclContext from " "the queue object at %s, line %d\n", __FILE__, __LINE__)); @@ -303,7 +407,7 @@ static NRT_MemInfo *NRT_MemInfo_new_from_usmndarray(PyObject *ndarrobj, // Allocate a new NRT_ExternalAllocator if (!(ext_alloca = NRT_ExternalAllocator_new_for_usm(qref, usm_type))) { - NRT_Debug( + DPEXRT_DEBUG( nrt_debug_print("DPEXRT-ERROR: Could not allocate a new " "NRT_ExternalAllocator object at %s, line %d\n", __FILE__, __LINE__)); @@ -312,9 +416,9 @@ static NRT_MemInfo *NRT_MemInfo_new_from_usmndarray(PyObject *ndarrobj, // Allocate a new MemInfoDtorInfo if (!(midtor_info = MemInfoDtorInfo_new(mi, ndarrobj))) { - NRT_Debug(nrt_debug_print("DPEXRT-ERROR: Could not allocate a new " - "MemInfoDtorInfo object at %s, line %d\n", - __FILE__, __LINE__)); + 
DPEXRT_DEBUG(nrt_debug_print("DPEXRT-ERROR: Could not allocate a new " + "MemInfoDtorInfo object at %s, line %d\n", + __FILE__, __LINE__)); goto error; } @@ -326,14 +430,14 @@ static NRT_MemInfo *NRT_MemInfo_new_from_usmndarray(PyObject *ndarrobj, mi->size = nitems * itemsize; mi->external_allocator = ext_alloca; - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-DEBUG: NRT_MemInfo_init mi=%p external_allocator=%p\n", mi, ext_alloca)); return mi; error: - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: Failed inside NRT_MemInfo_new_from_usmndarray clean up " "and return NULL at %s, line %d\n", __FILE__, __LINE__)); @@ -359,37 +463,27 @@ DPEXRT_MemInfo_alloc(npy_intp size, size_t usm_type, const char *device) NRT_MemInfo *mi = NULL; NRT_ExternalAllocator *ext_alloca = NULL; MemInfoDtorInfo *midtor_info = NULL; - DPCTLSyclDeviceSelectorRef dselector = NULL; - DPCTLSyclDeviceRef dref = NULL; DPCTLSyclQueueRef qref = NULL; - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-DEBUG: Inside DPEXRT_MemInfo_alloc %s, line %d\n", __FILE__, __LINE__)); // Allocate a new NRT_MemInfo object if (!(mi = (NRT_MemInfo *)malloc(sizeof(NRT_MemInfo)))) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: Could not allocate a new NRT_MemInfo object.\n")); goto error; } - if (!(dselector = DPCTLFilterSelector_Create(device))) { - NRT_Debug(nrt_debug_print( - "DPEXRT-ERROR: Could not create a sycl::device_selector from " - "filter string: %s at %s %d.\n", - device, __FILE__, __LINE__)); + if (!(qref = (DPCTLSyclQueueRef)DPEXRTQueue_CreateFromFilterString(device))) + { + DPEXRT_DEBUG( + nrt_debug_print("DPEXRT-ERROR: Could not create a sycl::queue from " + "filter string: %s at %s %d.\n", + device, __FILE__, __LINE__)); goto error; } - if (!(dref = DPCTLDevice_CreateFromSelector(dselector))) - goto error; - - if (!(qref = DPCTLQueue_CreateForDevice(dref, NULL, 0))) - goto error; - - DPCTLDeviceSelector_Delete(dselector); - DPCTLDevice_Delete(dref); - // Allocate a new NRT_ExternalAllocator if (!(ext_alloca = NRT_ExternalAllocator_new_for_usm(qref, usm_type))) goto error; @@ -407,7 +501,7 @@ DPEXRT_MemInfo_alloc(npy_intp size, size_t usm_type, const char *device) mi->size = size; mi->external_allocator = ext_alloca; - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-DEBUG: DPEXRT_MemInfo_alloc mi=%p " "external_allocator=%p for usm_type %zu on device %s, %s at %d\n", mi, ext_alloca, usm_type, device, __FILE__, __LINE__)); @@ -418,8 +512,6 @@ DPEXRT_MemInfo_alloc(npy_intp size, size_t usm_type, const char *device) free(mi); free(ext_alloca); free(midtor_info); - DPCTLDeviceSelector_Delete(dselector); - DPCTLDevice_Delete(dref); return NULL; } @@ -444,8 +536,6 @@ static NRT_MemInfo *DPEXRT_MemInfo_fill(NRT_MemInfo *mi, uint8_t value, const char *device) { - DPCTLSyclDeviceSelectorRef dselector = NULL; - DPCTLSyclDeviceRef dref = NULL; DPCTLSyclQueueRef qref = NULL; DPCTLSyclEventRef eref = NULL; size_t count = 0, size = 0, exp = 0; @@ -455,34 +545,20 @@ static NRT_MemInfo *DPEXRT_MemInfo_fill(NRT_MemInfo *mi, exp++; count = (unsigned int)(size >> exp); - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-DEBUG: mi->size = %u, itemsize = %u, count = %u, " "value = %u, Inside DPEXRT_MemInfo_fill %s, line %d\n", mi->size, itemsize << exp, count, value, __FILE__, __LINE__)); if (mi->data == NULL) { - NRT_Debug(nrt_debug_print("DPEXRT-DEBUG: mi->data is NULL, " - "Inside DPEXRT_MemInfo_fill %s, line 
%d\n", - __FILE__, __LINE__)); + DPEXRT_DEBUG(nrt_debug_print("DPEXRT-DEBUG: mi->data is NULL, " + "Inside DPEXRT_MemInfo_fill %s, line %d\n", + __FILE__, __LINE__)); goto error; } - if (!(dselector = DPCTLFilterSelector_Create(device))) { - NRT_Debug(nrt_debug_print( - "DPEXRT-ERROR: Could not create a sycl::device_selector from " - "filter string: %s at %s %d.\n", - device, __FILE__, __LINE__)); + if (!(qref = (DPCTLSyclQueueRef)DPEXRTQueue_CreateFromFilterString(device))) goto error; - } - - if (!(dref = DPCTLDevice_CreateFromSelector(dselector))) - goto error; - - if (!(qref = DPCTLQueue_CreateForDevice(dref, NULL, 0))) - goto error; - - DPCTLDeviceSelector_Delete(dselector); - DPCTLDevice_Delete(dref); switch (exp) { case 3: @@ -535,8 +611,6 @@ static NRT_MemInfo *DPEXRT_MemInfo_fill(NRT_MemInfo *mi, error: DPCTLQueue_Delete(qref); DPCTLEvent_Delete(eref); - DPCTLDeviceSelector_Delete(dselector); - DPCTLDevice_Delete(dref); return NULL; } @@ -616,20 +690,20 @@ static int DPEXRT_sycl_usm_ndarray_from_python(PyObject *obj, // collecting the array. Py_IncRef(obj); - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-DEBUG: In DPEXRT_sycl_usm_ndarray_from_python.\n")); // Check if the PyObject obj has an _array_obj attribute that is of // dpctl.tensor.usm_ndarray type. if (!(arrayobj = PyUSMNdArray_ARRAYOBJ(obj))) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: PyUSMNdArray_ARRAYOBJ check failed %d\n", __FILE__, __LINE__)); goto error; } if (!(ndim = UsmNDArray_GetNDim(arrayobj))) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: UsmNDArray_GetNDim returned 0 at %s, line %d\n", __FILE__, __LINE__)); goto error; @@ -640,7 +714,7 @@ static int DPEXRT_sycl_usm_ndarray_from_python(PyObject *obj, nitems = product_of_shape(shape, ndim); itemsize = (npy_intp)UsmNDArray_GetElementSize(arrayobj); if (!(qref = UsmNDArray_GetQueueRef(arrayobj))) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: UsmNDArray_GetQueueRef returned NULL at " "%s, line %d.\n", __FILE__, __LINE__)); @@ -650,7 +724,7 @@ static int DPEXRT_sycl_usm_ndarray_from_python(PyObject *obj, if (!(arystruct->meminfo = NRT_MemInfo_new_from_usmndarray( obj, data, nitems, itemsize, qref))) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: NRT_MemInfo_new_from_usmndarray failed " "at %s, line %d.\n", __FILE__, __LINE__)); @@ -698,7 +772,7 @@ static int DPEXRT_sycl_usm_ndarray_from_python(PyObject *obj, // code of -1. // Decref the Pyobject of the array // ensure the GIL - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-ERROR: Failed to unbox dpnp ndarray into a Numba " "arraystruct at %s, line %d\n", __FILE__, __LINE__)); @@ -732,7 +806,7 @@ static PyObject *box_from_arystruct_parent(arystruct_t *arystruct, struct PyUSMArrayObject *arrayobj = NULL; npy_intp itemsize = 0; - NRT_Debug(nrt_debug_print("DPEXRT-DEBUG: In try_to_return_parent.\n")); + DPEXRT_DEBUG(nrt_debug_print("DPEXRT-DEBUG: In try_to_return_parent.\n")); if (!(arrayobj = PyUSMNdArray_ARRAYOBJ(arystruct->parent))) return NULL; @@ -778,7 +852,7 @@ static PyObject *box_from_arystruct_parent(arystruct_t *arystruct, // parent, we need to increment the reference count of the parent here. 
Py_IncRef(array); - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-DEBUG: try_to_return_parent found a valid parent.\n")); /* Yes, it is the same array return a new reference */ @@ -818,7 +892,7 @@ DPEXRT_sycl_usm_ndarray_to_python_acqref(arystruct_t *arystruct, int exp = 0; npy_intp itemsize = 0; - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-DEBUG: In DPEXRT_sycl_usm_ndarray_to_python_acqref.\n")); if (descr == NULL) { @@ -837,7 +911,7 @@ DPEXRT_sycl_usm_ndarray_to_python_acqref(arystruct_t *arystruct, // If the arystruct has a parent attribute, try to box the parent and // return it. if (arystruct->parent) { - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "DPEXRT-DEBUG: arystruct has a parent, therefore " "trying to box and return the parent at %s, line %d\n", __FILE__, __LINE__)); @@ -875,8 +949,8 @@ DPEXRT_sycl_usm_ndarray_to_python_acqref(arystruct_t *arystruct, NRT_MemInfo_acquire(arystruct->meminfo); status = MemInfo_init(miobj, args, NULL); if (status != 0) { - NRT_Debug(nrt_debug_print("MemInfo_init failed at %s, line %d\n", - __FILE__, __LINE__)); + DPEXRT_DEBUG(nrt_debug_print("MemInfo_init failed at %s, line %d\n", + __FILE__, __LINE__)); Py_DECREF(args); PyErr_Format(PyExc_ValueError, "In 'DPEXRT_sycl_usm_ndarray_to_python_acqref', " @@ -960,7 +1034,7 @@ DPEXRT_sycl_usm_ndarray_to_python_acqref(arystruct_t *arystruct, return (PyObject *)NULL; } - NRT_Debug(nrt_debug_print( + DPEXRT_DEBUG(nrt_debug_print( "Returning from DPEXRT_sycl_usm_ndarray_to_python_acqref " "at %s, line %d\n", __FILE__, __LINE__)); @@ -994,6 +1068,9 @@ static PyObject *build_c_helpers_dict(void) &DPEXRT_sycl_usm_ndarray_from_python); _declpointer("DPEXRT_sycl_usm_ndarray_to_python_acqref", &DPEXRT_sycl_usm_ndarray_to_python_acqref); + _declpointer("DPEXRTQueue_CreateFromFilterString", + &DPEXRTQueue_CreateFromFilterString); + _declpointer("DpexrtQueue_SubmitRange", &DpexrtQueue_SubmitRange); _declpointer("DPEXRT_MemInfo_alloc", &DPEXRT_MemInfo_alloc); _declpointer("DPEXRT_MemInfo_fill", &DPEXRT_MemInfo_fill); _declpointer("NRT_ExternalAllocator_new_for_usm", @@ -1044,6 +1121,11 @@ MOD_INIT(_dpexrt_python) PyModule_AddObject( m, "DPEXRT_sycl_usm_ndarray_to_python_acqref", PyLong_FromVoidPtr(&DPEXRT_sycl_usm_ndarray_to_python_acqref)); + + PyModule_AddObject(m, "DPEXRTQueue_CreateFromFilterString", + PyLong_FromVoidPtr(&DPEXRTQueue_CreateFromFilterString)); + PyModule_AddObject(m, "DpexrtQueue_SubmitRange", + PyLong_FromVoidPtr(&DpexrtQueue_SubmitRange)); PyModule_AddObject(m, "DPEXRT_MemInfo_alloc", PyLong_FromVoidPtr(&DPEXRT_MemInfo_alloc)); PyModule_AddObject(m, "DPEXRT_MemInfo_fill", diff --git a/numba_dpex/core/runtime/context.py b/numba_dpex/core/runtime/context.py index 8e70baef67..3a23a819c0 100644 --- a/numba_dpex/core/runtime/context.py +++ b/numba_dpex/core/runtime/context.py @@ -129,7 +129,7 @@ def arraystruct_from_python(self, pyapi, obj, ptr): return self.error def usm_ndarray_to_python_acqref(self, pyapi, aryty, ary, dtypeptr): - """_summary_ + """Boxes a DpnpNdArray native object into a Python dpnp.ndarray. 
Args: pyapi (_type_): _description_ @@ -140,14 +140,14 @@ def usm_ndarray_to_python_acqref(self, pyapi, aryty, ary, dtypeptr): Returns: _type_: _description_ """ - args = [ + argtys = [ pyapi.voidptr, pyapi.pyobj, llvmir.IntType(32), llvmir.IntType(32), pyapi.pyobj, ] - fnty = llvmir.FunctionType(pyapi.pyobj, args) + fnty = llvmir.FunctionType(pyapi.pyobj, argtys) fn = pyapi._get_function( fnty, "DPEXRT_sycl_usm_ndarray_to_python_acqref" ) @@ -165,3 +165,86 @@ def usm_ndarray_to_python_acqref(self, pyapi, aryty, ary, dtypeptr): args = [ptr, serial_aryty_pytype, ndim, writable, dtypeptr] return pyapi.builder.call(fn, args) + + def get_queue_from_filter_string(self, builder, device): + """Calls DPEXRTQueue_CreateFromFilterString to create a new sycl::queue + from a given filter string. + + Args: + device (llvmlite.ir.values.FormattedConstant): An LLVM ArrayType + storing a const string for a DPC++ filter selector string. + + Returns: A DPCTLSyclQueueRef pointer. + """ + mod = builder.module + fnty = llvmir.FunctionType( + cgutils.voidptr_t, + [cgutils.voidptr_t], + ) + fn = cgutils.get_or_insert_function( + mod, fnty, "DPEXRTQueue_CreateFromFilterString" + ) + fn.return_value.add_attribute("noalias") + + ret = builder.call(fn, [device]) + + return ret + + def submit_range( + self, + builder, + kref, + qref, + args, + argtys, + nargs, + range, + nrange, + depevents, + ndepevents, + ): + """Calls DPEXRTQueue_CreateFromFilterString to create a new sycl::queue + from a given filter string. + + Args: + device (llvmlite.ir.values.FormattedConstant): An LLVM ArrayType + storing a const string for a DPC++ filter selector string. + + Returns: A DPCTLSyclQueueRef pointer. + """ + mod = builder.module + fnty = llvmir.FunctionType( + llvmir.types.VoidType(), + [ + cgutils.voidptr_t, + cgutils.voidptr_t, + cgutils.voidptr_t.as_pointer(), + cgutils.int32_t.as_pointer(), + llvmir.IntType(64), + llvmir.IntType(64).as_pointer(), + llvmir.IntType(64), + cgutils.voidptr_t, + llvmir.IntType(64), + ], + ) + fn = cgutils.get_or_insert_function( + mod, fnty, "DpexrtQueue_SubmitRange" + ) + # fn.return_value.add_attribute("noalias") + + ret = builder.call( + fn, + [ + kref, + qref, + args, + argtys, + nargs, + range, + nrange, + depevents, + ndepevents, + ], + ) + + return ret diff --git a/numba_dpex/core/types/dpnp_ndarray_type.py b/numba_dpex/core/types/dpnp_ndarray_type.py index 23df5d9477..75d77141c4 100644 --- a/numba_dpex/core/types/dpnp_ndarray_type.py +++ b/numba_dpex/core/types/dpnp_ndarray_type.py @@ -108,7 +108,7 @@ def __allocate__( # Assume str(dtype) returns a valid type dtype_str = str(dtype) # alloc call: lhs = empty_attr(size_var, typ_var) - typ_var = ir.Var(scope, mk_unique_var("$np_typ_var"), loc) + typ_var = ir.Var(scope, mk_unique_var("$np_dtype_var"), loc) if typemap: typemap[typ_var.name] = types.functions.NumberClass(dtype) # If dtype is a datetime/timedelta with a unit, @@ -126,11 +126,53 @@ def __allocate__( dtype_str = "bool_" np_typ_getattr = ir.Expr.getattr(g_np_var, dtype_str, loc) typ_var_assign = ir.Assign(np_typ_getattr, typ_var, loc) - alloc_call = ir.Expr.call(attr_var, [size_var, typ_var], (), loc) + + # A default usm_type arg added as a placeholder + layout_var = ir.Var(scope, mk_unique_var("$layout_var"), loc) + usm_typ_var = ir.Var(scope, mk_unique_var("$np_usm_type_var"), loc) + # A default device string arg added as a placeholder + device_typ_var = ir.Var(scope, mk_unique_var("$np_device_var"), loc) + + if typemap: + typemap[layout_var.name] = types.literal(lhs_typ.layout) + 
typemap[usm_typ_var.name] = types.literal(lhs_typ.usm_type) + typemap[device_typ_var.name] = types.literal(lhs_typ.device) + + layout_var_assign = ir.Assign( + ir.Const(lhs_typ.layout, loc), layout_var, loc + ) + usm_typ_var_assign = ir.Assign( + ir.Const(lhs_typ.usm_type, loc), usm_typ_var, loc + ) + device_typ_var_assign = ir.Assign( + ir.Const(lhs_typ.device, loc), device_typ_var, loc + ) + + out.extend( + [layout_var_assign, usm_typ_var_assign, device_typ_var_assign] + ) + + alloc_call = ir.Expr.call( + attr_var, + [size_var, typ_var, layout_var, device_typ_var, usm_typ_var], + (), + loc, + ) if calltypes: cac = typemap[attr_var.name].get_call_type( - typingctx, [size_typ, types.functions.NumberClass(dtype)], {} + typingctx, + [ + typemap[x.name] + for x in [ + size_var, + typ_var, + layout_var, + device_typ_var, + usm_typ_var, + ] + ], + {}, ) # By default, all calls to "empty" are typed as returning a standard # NumPy ndarray. If we are allocating a ndarray subclass here then diff --git a/numba_dpex/core/types/usm_ndarray_type.py b/numba_dpex/core/types/usm_ndarray_type.py index 6fa1ff79c3..c9eb97c40d 100644 --- a/numba_dpex/core/types/usm_ndarray_type.py +++ b/numba_dpex/core/types/usm_ndarray_type.py @@ -60,7 +60,7 @@ def __init__( "a SYCL filter selector" ) self.queue = dpctl.SyclQueue(device) - self.device = device + self.device = self.queue.sycl_device.filter_string elif queue is not None and device == "unknown": if not isinstance(queue, dpctl.SyclQueue): raise TypeError( @@ -136,7 +136,7 @@ def copy( device = self.device if usm_type is None: usm_type = self.usm_type - return USMNdArray( + return type(self)( dtype=dtype, ndim=ndim, layout=layout, @@ -169,7 +169,7 @@ def unify(self, typingctx, other): layout = "A" readonly = not (self.mutable and other.mutable) aligned = self.aligned and other.aligned - return USMNdArray( + return type(self)( dtype=self.dtype, ndim=self.ndim, layout=layout, diff --git a/numba_dpex/core/typing/dpnpdecl.py b/numba_dpex/core/typing/dpnpdecl.py new file mode 100644 index 0000000000..7c8cf817b5 --- /dev/null +++ b/numba_dpex/core/typing/dpnpdecl.py @@ -0,0 +1,198 @@ +# SPDX-FileCopyrightText: 2022 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpnp +import numpy as np +from numba.core import types +from numba.core.typing.npydecl import ( + Numpy_rules_ufunc, + NumpyRulesArrayOperator, + NumpyRulesInplaceArrayOperator, + NumpyRulesUnaryArrayOperator, + infer_global, +) + + +class DpnpRulesArrayOperator(NumpyRulesArrayOperator): + @property + def ufunc(self): + try: + op = getattr(dpnp, self._op_map[self.key]) + npop = getattr(np, self._op_map[self.key]) + op.nin = npop.nin + op.nout = npop.nout + op.nargs = npop.nargs + op.types = npop.types + op.is_dpnp_ufunc = True + return op + except: + pass + + +class DpnpRulesInplaceArrayOperator(NumpyRulesInplaceArrayOperator): + pass + + +class DpnpRulesUnaryArrayOperator(NumpyRulesUnaryArrayOperator): + pass + + +# list of unary ufuncs to register +_math_operations = [ + "add", + "subtract", + "multiply", + "floor_divide", + "negative", + "power", + "remainder", + "fmod", + "absolute", + "sign", + "conjugate", + "exp", + "exp2", + "log", + "log2", + "log10", + "expm1", + "log1p", + "sqrt", + "square", + "cbrt", + "reciprocal", + "divide", + "true_divide", + "mod", + "abs", + "fabs", + "erf", +] + +_trigonometric_functions = [ + "sin", + "cos", + "tan", + "arcsin", + "arccos", + "arctan", + "arctan2", + "hypot", + "sinh", + "cosh", + "tanh", + "arcsinh", + "arccosh", + "arctanh", + 
"deg2rad", + "rad2deg", + "degrees", + "radians", +] + +_bit_twiddling_functions = [ + "bitwise_and", + "bitwise_or", + "bitwise_xor", + "invert", + "left_shift", + "right_shift", + "bitwise_not", +] + +_comparison_functions = [ + "greater", + "greater_equal", + "less", + "less_equal", + "not_equal", + "equal", + "logical_and", + "logical_or", + "logical_xor", + "logical_not", + "maximum", + "minimum", + "fmax", + "fmin", +] + +_floating_functions = [ + "isfinite", + "isinf", + "isnan", + "copysign", + "modf", + "frexp", + "floor", + "ceil", + "trunc", +] + +_logic_functions = [] + + +# This is a set of the ufuncs that are not yet supported by Lowering. In order +# to trigger no-python mode we must not register them until their Lowering is +# implemented. +# +# It also works as a nice TODO list for ufunc support :) +_unsupported = set( + [ + "frexp", # Not supported by Numba + "modf", # Not supported by Numba + "logaddexp", + "logaddexp2", + "positive", + "float_power", + "rint", + "divmod", + "gcd", + "lcm", + "signbit", + "nextafter", + "ldexp", + "spacing", + "isnat", + ] +) + +# A list of ufuncs that are in fact aliases of other ufuncs. They need to insert +# the resolve method, but not register the ufunc itself +_aliases = set(["bitwise_not", "mod", "abs"]) + +all_ufuncs = sum( + [ + _math_operations, + _trigonometric_functions, + _bit_twiddling_functions, + _comparison_functions, + _floating_functions, + _logic_functions, + ], + [], +) + +supported_ufuncs = [x for x in all_ufuncs if x not in _unsupported] + + +def _dpnp_ufunc(name): + func = getattr(dpnp, name) + + class typing_class(Numpy_rules_ufunc): + key = func + + typing_class.__name__ = "resolve_{0}".format(name) + + if name not in _aliases: # if not name in _aliases + infer_global(func, types.Function(typing_class)) + + +for func in supported_ufuncs: + _dpnp_ufunc(func) + + +DpnpRulesArrayOperator.install_operations() +DpnpRulesInplaceArrayOperator.install_operations() +DpnpRulesUnaryArrayOperator.install_operations() diff --git a/numba_dpex/core/utils/kernel_builder.py b/numba_dpex/core/utils/kernel_builder.py new file mode 100644 index 0000000000..84326539ae --- /dev/null +++ b/numba_dpex/core/utils/kernel_builder.py @@ -0,0 +1,616 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import copy +import sys +import warnings + +import dpctl.program as dpctl_prog +import dpnp +import numba +from numba.core import compiler, ir, types +from numba.core.errors import NumbaParallelSafetyWarning +from numba.core.ir_utils import ( + add_offset_to_labels, + get_name_var_table, + get_unused_var_name, + legalize_names, + mk_unique_var, + remove_dead, + remove_dels, + rename_labels, + replace_var_names, +) +from numba.core.typing import signature + +import numba_dpex as dpex +from numba_dpex import config + +from ..descriptor import dpex_kernel_target +from ..passes import parfor +from ..types.dpnp_ndarray_type import DpnpNdArray + + +class GufuncKernel: + def __init__( + self, + name, + kernel, + signature, + kernel_args, + kernel_arg_types, + queue, + ): + self.name = name + self.kernel = kernel + self.signature = signature + self.kernel_args = kernel_args + self.kernel_arg_types = kernel_arg_types + self.queue = queue + + +def _print_block(block): + for i, inst in enumerate(block.body): + print(" ", i, inst) + + +def _print_body(body_dict): + """Pretty-print a set of IR blocks.""" + for label, block in body_dict.items(): + print("label: ", label) + _print_block(block) + + +def 
_compile_kernel_parfor( + sycl_queue, kernel_name, func_ir, argtypes, debug=False +): + # Create a SPIRVKernel object + kernel = dpex.core.kernel_interface.spirv_kernel.SpirvKernel( + func_ir, kernel_name + ) + + # compile the kernel + kernel.compile( + args=argtypes, + typing_ctx=dpex_kernel_target.typing_context, + target_ctx=dpex_kernel_target.target_context, + debug=debug, + compile_flags=None, + ) + + dpctl_create_program_from_spirv_flags = [] + if debug or config.OPT == 0: + # if debug is ON we need to pass additional flags to igc. + dpctl_create_program_from_spirv_flags = ["-g", "-cl-opt-disable"] + + # create a program + kernel_bundle = dpctl_prog.create_program_from_spirv( + sycl_queue, + kernel.device_driver_ir_module, + " ".join(dpctl_create_program_from_spirv_flags), + ) + # create a kernel + sycl_kernel = kernel_bundle.get_sycl_kernel(kernel.module_name) + + return sycl_kernel + + +def _legalize_names_with_typemap(names, typemap): + """Replace illegal characters in Numba IR var names. + + We use ir_utils.legalize_names to replace internal IR variable names + containing illegal characters (e.g. period) with a legal character + (underscore) so as to create legal variable names. The original variable + names are in the typemap so we also need to add the legalized name to the + typemap as well. + """ + outdict = legalize_names(names) + # For each pair in the dict of legalized names... + for x, y in outdict.items(): + # If the name had some legalization change to it... + if x != y: + # Set the type of the new name the same as the type of the old name. + typemap[y] = typemap[x] + return outdict + + +def _to_scalar_from_0d(x): + if isinstance(x, types.ArrayCompatible) and x.ndim == 0: + return x.dtype + else: + return x + + +def _replace_var_with_array_in_block(vars, block, typemap, calltypes): + new_block = [] + for inst in block.body: + if isinstance(inst, ir.Assign) and inst.target.name in vars: + const_node = ir.Const(0, inst.loc) + const_var = ir.Var( + inst.target.scope, mk_unique_var("$const_ind_0"), inst.loc + ) + typemap[const_var.name] = types.uintp + const_assign = ir.Assign(const_node, const_var, inst.loc) + new_block.append(const_assign) + + setitem_node = ir.SetItem( + inst.target, const_var, inst.value, inst.loc + ) + calltypes[setitem_node] = signature( + types.none, + types.npytypes.Array(typemap[inst.target.name], 1, "C"), + types.intp, + typemap[inst.target.name], + ) + new_block.append(setitem_node) + continue + elif isinstance(inst, parfor.Parfor): + _replace_var_with_array_internal( + vars, {0: inst.init_block}, typemap, calltypes + ) + _replace_var_with_array_internal( + vars, inst.loop_body, typemap, calltypes + ) + + new_block.append(inst) + return new_block + + +def _replace_var_with_array_internal(vars, loop_body, typemap, calltypes): + for label, block in loop_body.items(): + block.body = _replace_var_with_array_in_block( + vars, block, typemap, calltypes + ) + + +def _replace_var_with_array(vars, loop_body, typemap, calltypes): + _replace_var_with_array_internal(vars, loop_body, typemap, calltypes) + for v in vars: + el_typ = typemap[v] + typemap.pop(v, None) + typemap[v] = types.npytypes.Array(el_typ, 1, "C") + + +def _dbgprint_after_each_array_assignments(lowerer, loop_body, typemap): + for label, block in loop_body.items(): + new_block = block.copy() + new_block.clear() + loc = block.loc + scope = block.scope + for inst in block.body: + new_block.append(inst) + # Append print after assignment + if isinstance(inst, ir.Assign): + # Only apply to numbers 
+ if typemap[inst.target.name] not in types.number_domain: + continue + + # Make constant string + strval = "{} =".format(inst.target.name) + strconsttyp = types.StringLiteral(strval) + + lhs = ir.Var(scope, mk_unique_var("str_const"), loc) + assign_lhs = ir.Assign( + value=ir.Const(value=strval, loc=loc), target=lhs, loc=loc + ) + typemap[lhs.name] = strconsttyp + new_block.append(assign_lhs) + + # Make print node + print_node = ir.Print( + args=[lhs, inst.target], vararg=None, loc=loc + ) + new_block.append(print_node) + sig = signature( + types.none, typemap[lhs.name], typemap[inst.target.name] + ) + lowerer.fndesc.calltypes[print_node] = sig + loop_body[label] = new_block + + +def _generate_kernel_stub_as_string( + kernel_name, + parfor_params, + parfor_dim, + legal_loop_indices, + loop_ranges, + param_dict, + has_reduction, + redvars, + typemap, + redvars_dict, + sentinel_name, +): + """Generates a stub dpex kernel for the parfor. + + Returns: + str: A string representing a stub kernel function for the parfor. + """ + kernel_txt = "" + + # Create the dpex kernel function. + kernel_txt += "def " + kernel_name + kernel_txt += "(" + (", ".join(parfor_params)) + "):\n" + global_id_dim = 0 + for_loop_dim = parfor_dim + + if parfor_dim > 3: + raise NotImplementedError + global_id_dim = 3 + else: + global_id_dim = parfor_dim + + for eachdim in range(global_id_dim): + kernel_txt += ( + " " + + legal_loop_indices[eachdim] + + " = " + + "dpex.get_global_id(" + + str(eachdim) + + ")\n" + ) + + for eachdim in range(global_id_dim, for_loop_dim): + for indent in range(1 + (eachdim - global_id_dim)): + kernel_txt += " " + + start, stop, step = loop_ranges[eachdim] + start = param_dict.get(str(start), start) + stop = param_dict.get(str(stop), stop) + kernel_txt += ( + "for " + + legal_loop_indices[eachdim] + + " in range(" + + str(start) + + ", " + + str(stop) + + " + 1):\n" + ) + + for eachdim in range(global_id_dim, for_loop_dim): + for indent in range(1 + (eachdim - global_id_dim)): + kernel_txt += " " + + # Add the sentinel assignment so that we can find the loop body position + # in the IR. + kernel_txt += " " + kernel_txt += sentinel_name + " = 0\n" + + # A kernel function does not return anything + kernel_txt += " return None\n" + + return kernel_txt + + +def _wrap_loop_body(loop_body): + blocks = loop_body.copy() # shallow copy is enough + first_label = min(blocks.keys()) + last_label = max(blocks.keys()) + loc = blocks[last_label].loc + blocks[last_label].body.append(ir.Jump(first_label, loc)) + return blocks + + +def _unwrap_loop_body(loop_body): + last_label = max(loop_body.keys()) + loop_body[last_label].body = loop_body[last_label].body[:-1] + + +def _find_setitems_block(setitems, block, typemap): + for inst in block.body: + if isinstance(inst, ir.StaticSetItem) or isinstance(inst, ir.SetItem): + setitems.add(inst.target.name) + elif isinstance(inst, parfor.Parfor): + _find_setitems_block(setitems, inst.init_block, typemap) + _find_setitems_body(setitems, inst.loop_body, typemap) + + +def _find_setitems_body(setitems, loop_body, typemap): + """ + Find the arrays that are written into (goes into setitems) + """ + for label, block in loop_body.items(): + _find_setitems_block(setitems, block, typemap) + + +def _replace_sentinel_with_parfor_body(kernel_ir, sentinel_name, loop_body): + # new label for splitting sentinel block + new_label = max(loop_body.keys()) + 1 + + # Search all the block in the kernel function for the sentinel assignment. 
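+    # A sketch of the rewrite done below (labels are illustrative): a block of
+    # the form
+    #     B0: [stmts_before; __sentinel__ = 0; stmts_after]
+    # becomes
+    #     B0: [stmts_before] -> parfor loop-body blocks -> B_new: [stmts_after]
+    # i.e. the parfor body is spliced in at the sentinel's position.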
+ for label, block in kernel_ir.blocks.items(): + for i, inst in enumerate(block.body): + if ( + isinstance(inst, ir.Assign) + and inst.target.name == sentinel_name + ): + # We found the sentinel assignment. + loc = inst.loc + scope = block.scope + # split block across __sentinel__ + # A new block is allocated for the statements prior to the + # sentinel but the new block maintains the current block label. + prev_block = ir.Block(scope, loc) + prev_block.body = block.body[:i] + + # The current block is used for statements after the sentinel. + block.body = block.body[i + 1 :] # noqa: E203 + # But the current block gets a new label. + body_first_label = min(loop_body.keys()) + + # The previous block jumps to the minimum labelled block of the + # parfor body. + prev_block.append(ir.Jump(body_first_label, loc)) + # Add all the parfor loop body blocks to the kernel function's + # IR. + for loop, b in loop_body.items(): + kernel_ir.blocks[loop] = b + body_last_label = max(loop_body.keys()) + kernel_ir.blocks[new_label] = block + kernel_ir.blocks[label] = prev_block + # Add a jump from the last parfor body block to the block + # containing statements after the sentinel. + kernel_ir.blocks[body_last_label].append( + ir.Jump(new_label, loc) + ) + break + else: + continue + break + + +def create_kernel_for_parfor( + lowerer, + parfor_node, + typemap, + flags, + loop_ranges, + has_aliases, + races, + parfor_outputs, +): + """ + Creates a numba_dpex.kernel function for a parfor node. + + There are two parts to this function: + + 1) Code to iterate across the iteration space as defined by + the schedule. + 2) The parfor body that does the work for a single point in + the iteration space. + + Part 1 is created as Python text for simplicity with a sentinel + assignment to mark the point in the IR where the parfor body + should be added. This Python text is 'exec'ed into existence and its + IR retrieved with run_frontend. The IR is scanned for the sentinel + assignment where that basic block is split and the IR for the parfor + body inserted. + """ + loc = parfor_node.init_block.loc + + # The parfor body and the main function body share ir.Var nodes. + # We have to do some replacements of Var names in the parfor body + # to make them legal parameter names. If we don't copy then the + # Vars in the main function also would incorrectly change their name. + loop_body = copy.copy(parfor_node.loop_body) + remove_dels(loop_body) + + parfor_dim = len(parfor_node.loop_nests) + loop_indices = [ + loop_nest.index_variable.name for loop_nest in parfor_node.loop_nests + ] + + # Get all the parfor params. + parfor_params = parfor_node.params + + # Get all parfor reduction vars, and operators. + typemap = lowerer.fndesc.typemap + + parfor_redvars, parfor_reddict = parfor.get_parfor_reductions( + lowerer.func_ir, parfor_node, parfor_params, lowerer.fndesc.calltypes + ) + has_reduction = False if len(parfor_redvars) == 0 else True + + if has_reduction: + raise NotImplementedError + + # Compute just the parfor inputs as a set difference. + parfor_inputs = sorted(list(set(parfor_params) - set(parfor_outputs))) + + for race in races: + msg = ( + "Variable %s used in parallel loop may be written " + "to simultaneously by multiple workers and may result " + "in non-deterministic or unintended results." % race + ) + warnings.warn(NumbaParallelSafetyWarning(msg, loc)) + + _replace_var_with_array(races, loop_body, typemap, lowerer.fndesc.calltypes) + + # Reorder all the params so that inputs go first then outputs. 
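+    # The same inputs-then-outputs order is recorded in the returned
+    # GufuncKernel (kernel_args / kernel_arg_types), so the kernel launch
+    # arguments can later be packed in a matching order.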
+ parfor_params = parfor_inputs + parfor_outputs + + # Some Var and loop_indices may not have legal parameter names so create a + # dict of potentially illegal param name to guaranteed legal name. + param_dict = _legalize_names_with_typemap(parfor_params, typemap) + ind_dict = _legalize_names_with_typemap(loop_indices, typemap) + redvars_dict = legalize_names(parfor_redvars) + + # Compute a new list of legal loop index names. + legal_loop_indices = [ind_dict[v] for v in loop_indices] + # Get the types of each parameter. + param_types = [_to_scalar_from_0d(typemap[v]) for v in parfor_params] + # Calculate types of args passed to the kernel function. + func_arg_types = [typemap[v] for v in (parfor_inputs + parfor_outputs)] + + # Replace illegal parameter names in the loop body with legal ones. + replace_var_names(loop_body, param_dict) + + # remember the name before legalizing as the actual arguments + parfor_args = parfor_params + # Change parfor_params to be legal names. + parfor_params = [param_dict[v] for v in parfor_params] + parfor_params_orig = parfor_params + + parfor_params = [] + ascontig = False + for pindex in range(len(parfor_params_orig)): + if ( + ascontig + and pindex < len(parfor_inputs) + and isinstance(param_types[pindex], types.npytypes.Array) + ): + parfor_params.append(parfor_params_orig[pindex] + "param") + else: + parfor_params.append(parfor_params_orig[pindex]) + + # Change parfor body to replace illegal loop index vars with legal ones. + replace_var_names(loop_body, ind_dict) + + loop_body_var_table = get_name_var_table(loop_body) + sentinel_name = get_unused_var_name("__sentinel__", loop_body_var_table) + + if config.DEBUG_ARRAY_OPT >= 1: + print("legal parfor_params = ", parfor_params, type(parfor_params)) + + # Determine the unique names of the kernel functions. + kernel_name = "__numba_parfor_kernel_%s" % (parfor_node.id) + + kernel_fn_txt = _generate_kernel_stub_as_string( + kernel_name, + parfor_params, + parfor_dim, + legal_loop_indices, + loop_ranges, + param_dict, + has_reduction, + parfor_redvars, + typemap, + redvars_dict, + sentinel_name, + ) + + if config.DEBUG_ARRAY_OPT: + print("kernel_fn_txt = ", type(kernel_fn_txt), "\n", kernel_fn_txt) + sys.stdout.flush() + + # Exec the kernel_fn_txt string into existence. 
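+    # For illustration only (parameter and index names are hypothetical,
+    # post-legalization): for a 1-D parfor over parameters (a, b, c) the stub
+    # generated above looks like
+    #
+    #     def __numba_parfor_kernel_0(a, b, c):
+    #         i = dpex.get_global_id(0)
+    #         __sentinel__ = 0
+    #         return None
+    #
+    # The sentinel assignment is replaced with the actual parfor body further
+    # below.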
+ globls = {"dpnp": dpnp, "numba": numba, "dpex": dpex} + locls = {} + exec(kernel_fn_txt, globls, locls) + + kernel_fn = locls[kernel_name] + + if config.DEBUG_ARRAY_OPT: + print("kernel_fn = ", type(kernel_fn), "\n", kernel_fn) + # Get the IR for the kernel_fn dpex kernel + kernel_ir = compiler.run_frontend(kernel_fn) + + if config.DEBUG_ARRAY_OPT: + print("kernel_ir dump ", type(kernel_ir)) + kernel_ir.dump() + print("loop_body dump ", type(loop_body)) + _print_body(loop_body) + + # rename all variables in kernel_ir afresh + var_table = get_name_var_table(kernel_ir.blocks) + new_var_dict = {} + reserved_names = ( + [sentinel_name] + list(param_dict.values()) + legal_loop_indices + ) + for name, var in var_table.items(): + if not (name in reserved_names): + new_var_dict[name] = mk_unique_var(name) + replace_var_names(kernel_ir.blocks, new_var_dict) + if config.DEBUG_ARRAY_OPT: + print("kernel_ir dump after renaming ") + kernel_ir.dump() + + gufunc_param_types = param_types + + if config.DEBUG_ARRAY_OPT: + print( + "gufunc_param_types = ", + type(gufunc_param_types), + "\n", + gufunc_param_types, + ) + + gufunc_stub_last_label = max(kernel_ir.blocks.keys()) + 1 + + # Add gufunc stub last label to each parfor.loop_body label to prevent + # label conflicts. + loop_body = add_offset_to_labels(loop_body, gufunc_stub_last_label) + + # If enabled, add a print statement after every assignment. + if config.DEBUG_ARRAY_OPT_RUNTIME: + _dbgprint_after_each_array_assignments(lowerer, loop_body, typemap) + + _replace_sentinel_with_parfor_body(kernel_ir, sentinel_name, loop_body) + + if config.DEBUG_ARRAY_OPT: + print("kernel_ir last dump before renaming") + kernel_ir.dump() + + kernel_ir.blocks = rename_labels(kernel_ir.blocks) + remove_dels(kernel_ir.blocks) + + old_alias = flags.noalias + if not has_aliases: + if config.DEBUG_ARRAY_OPT: + print("No aliases found so adding noalias flag.") + flags.noalias = True + + remove_dead(kernel_ir.blocks, kernel_ir.arg_names, kernel_ir, typemap) + + if config.DEBUG_ARRAY_OPT: + print("gufunc_ir after remove dead") + kernel_ir.dump() + + kernel_sig = signature(types.none, *gufunc_param_types) + + if config.DEBUG_ARRAY_OPT: + sys.stdout.flush() + + if config.DEBUG_ARRAY_OPT: + print("after DUFunc inline".center(80, "-")) + kernel_ir.dump() + + # The ParforLegalizeCFD pass has already ensured that the LHS and RHS + # arrays are on same device. We can take the queue from the first input + # array and use that to compile the kernel. + + exec_queue = None + + for arg in parfor_args: + obj = typemap[arg] + if isinstance(obj, DpnpNdArray): + exec_queue = obj.queue + + if not exec_queue: + raise AssertionError( + "No execution found for parfor. No way to compile the kernel!" 
+ ) + + sycl_kernel = _compile_kernel_parfor( + exec_queue, + kernel_name, + kernel_ir, + gufunc_param_types, + debug=flags.debuginfo, + ) + + flags.noalias = old_alias + + if config.DEBUG_ARRAY_OPT: + print("kernel_sig = ", kernel_sig) + + return GufuncKernel( + name=kernel_name, + kernel=sycl_kernel, + signature=kernel_sig, + kernel_args=parfor_args, + kernel_arg_types=func_arg_types, + queue=exec_queue, + ) diff --git a/numba_dpex/core/utils/kernel_launcher.py b/numba_dpex/core/utils/kernel_launcher.py new file mode 100644 index 0000000000..d9652a744a --- /dev/null +++ b/numba_dpex/core/utils/kernel_launcher.py @@ -0,0 +1,361 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +from numba.core import cgutils, types + +from numba_dpex import utils +from numba_dpex.core.runtime.context import DpexRTContext +from numba_dpex.dpctl_iface import DpctlCAPIFnBuilder +from numba_dpex.dpctl_iface._helpers import numba_type_to_dpctl_typenum + + +class KernelLaunchIRBuilder: + """ + KernelLaunchIRBuilder(lowerer, cres) + Helper class to build the LLVM IR for the submission of a kernel. + + The class generates LLVM IR inside the current LLVM module that is needed + for submitting kernels. The LLVM Values that + """ + + def __init__(self, lowerer, kernel): + """Create a KernelLauncher for the specified kernel. + + Args: + lowerer: The Numba Lowerer that will be used to generate the code. + kernel: The SYCL kernel for which we are generating the code. + num_inputs: The number of arguments to the kernels. + """ + self.lowerer = lowerer + self.context = self.lowerer.context + self.builder = self.lowerer.builder + self.kernel = kernel + self.kernel_addr = self.kernel.addressof_ref() + self.rtctx = DpexRTContext(self.context) + + def _build_nullptr(self): + """Builds the LLVM IR to represent a null pointer. + + Returns: An LLVM Value storing a null pointer + """ + zero = cgutils.alloca_once(self.builder, utils.LLVMTypes.int64_t) + self.builder.store(self.context.get_constant(types.int64, 0), zero) + return self.builder.bitcast( + zero, utils.get_llvm_type(context=self.context, type=types.voidptr) + ) + + def _build_array_attr_arg( + self, + array_val, + array_attr_pos, + array_attr_ty, + arg_list, + args_ty_list, + arg_num, + ): + array_attr = self.builder.gep( + array_val, + [ + self.context.get_constant(types.int32, 0), + self.context.get_constant(types.int32, array_attr_pos), + ], + ) + + # FIXME: If pointer arg then load it to some value and pass that value. + # We also most likely need an address space cast + if isinstance(array_attr_ty, types.misc.RawPointer): + array_attr = self.builder.load(array_attr) + + self.build_arg( + val=array_attr, + ty=array_attr_ty, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num, + ) + + def _build_flattened_array_args( + self, array_val, array_attr_pos, ndims, arg_list, args_ty_list, arg_num + ): + array_attr = self.builder.gep( + array_val, + [ + self.context.get_constant(types.int32, 0), + self.context.get_constant(types.int32, array_attr_pos), + ], + ) + + for ndim in range(ndims): + self._build_array_attr_arg( + array_val=array_attr, + array_attr_pos=ndim, + array_attr_ty=types.int64, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num + ndim, + ) + + def build_arg(self, val, ty, arg_list, args_ty_list, arg_num): + """Stores the kernel arguments and the kernel argument types into + arrays that will be passed to DPCTLQueue_SubmitRange. 
+ + Args: + val: An LLVM IR Value that will be stored into the arguments array + ty: A Numba type that will be converted to a DPCTLKernelArgType + enum and stored into the argument types list array + arg_list: An LLVM IR Value array that stores the kernel arguments + args_ty_list: An LLVM IR Value array that stores the + DPCTLKernelArgType enum value for each kernel argument + arg_num: The index position at which the arg_list and args_ty_list + need to be updated. + """ + kernel_arg_dst = self.builder.gep( + arg_list, + [self.context.get_constant(types.int32, arg_num)], + ) + kernel_arg_ty_dst = self.builder.gep( + args_ty_list, + [self.context.get_constant(types.int32, arg_num)], + ) + val = self.builder.bitcast( + val, + utils.get_llvm_type(context=self.context, type=types.voidptr), + ) + self.builder.store(val, kernel_arg_dst) + self.builder.store( + numba_type_to_dpctl_typenum(self.context, ty), kernel_arg_ty_dst + ) + + def build_array_arg( + self, array_val, array_rank, arg_list, args_ty_list, arg_num + ): + """Creates a list of LLVM Values for an unpacked DpnpNdArray kernel + argument. + + The steps performed here are the same as in + numba_dpex.core.kernel_interface.arg_pack_unpacker._unpack_array_helper + """ + # Argument 1: Null pointer for the NRT_MemInfo attribute of the array + nullptr = self._build_nullptr() + self.build_arg( + val=nullptr, + ty=types.int64, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num, + ) + arg_num += 1 + # Argument 2: Null pointer for the Parent attribute of the array + nullptr = self._build_nullptr() + self.build_arg( + val=nullptr, + ty=types.int64, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num, + ) + arg_num += 1 + # Argument 3: Array size + self._build_array_attr_arg( + array_val=array_val, + array_attr_pos=2, + array_attr_ty=types.int64, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num, + ) + arg_num += 1 + # Argument 4: itemsize + self._build_array_attr_arg( + array_val=array_val, + array_attr_pos=3, + array_attr_ty=types.int64, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num, + ) + arg_num += 1 + # Argument 5: data pointer + self._build_array_attr_arg( + array_val=array_val, + array_attr_pos=4, + array_attr_ty=types.voidptr, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num, + ) + arg_num += 1 + # Arguments for flattened shape + self._build_flattened_array_args( + array_val=array_val, + array_attr_pos=5, + ndims=array_rank, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num, + ) + arg_num += array_rank + # Arguments for flattened stride + self._build_flattened_array_args( + array_val=array_val, + array_attr_pos=6, + ndims=array_rank, + arg_list=arg_list, + args_ty_list=args_ty_list, + arg_num=arg_num, + ) + arg_num += array_rank + + def get_queue(self, exec_queue): + """Allocates memory on the stack to store a DPCTLSyclQueueRef. + + Returns: A LLVM Value storing the pointer to the SYCL queue created + using the filter string for the Python exec_queue (dpctl.SyclQueue). 
+        """
+
+        # Allocate a stack var to store the queue created from the filter string
+        sycl_queue_val = cgutils.alloca_once(
+            self.builder,
+            utils.get_llvm_type(context=self.context, type=types.voidptr),
+        )
+        # Insert a global constant to store the filter string
+        device = self.context.insert_const_string(
+            self.builder.module, exec_queue.sycl_device.filter_string
+        )
+        # Store the queue returned by DPEXRTQueue_CreateFromFilterString in a
+        # local variable
+        self.builder.store(
+            self.rtctx.get_queue_from_filter_string(
+                builder=self.builder, device=device
+            ),
+            sycl_queue_val,
+        )
+        return sycl_queue_val
+
+    def free_queue(self, sycl_queue_val):
+        """
+        Frees the ``DPCTLSyclQueueRef`` pointer that was used to launch the
+        kernels.
+
+        Args:
+            sycl_queue_val: The SYCL queue pointer to be freed.
+        """
+        fn = DpctlCAPIFnBuilder.get_dpctl_queue_delete(
+            builder=self.builder, context=self.context
+        )
+        self.builder.call(fn, [self.builder.load(sycl_queue_val)])
+
+    def allocate_kernel_arg_array(self, num_kernel_args):
+        """Allocates an array to store the LLVM Value for every kernel argument.
+
+        Args:
+            num_kernel_args (int): The number of kernel arguments that
+                determines the size of args array to allocate.
+
+        Returns: An LLVM IR value pointing to an array to store the kernel
+            arguments.
+        """
+        args_list = cgutils.alloca_once(
+            self.builder,
+            utils.get_llvm_type(context=self.context, type=types.voidptr),
+            size=self.context.get_constant(types.uintp, num_kernel_args),
+        )
+
+        return args_list
+
+    def allocate_kernel_arg_ty_array(self, num_kernel_args):
+        """Allocates an array to store the LLVM Value for the typenum for
+        every kernel argument.
+
+        Args:
+            num_kernel_args (int): The number of kernel arguments that
+                determines the size of args array to allocate.
+
+        Returns: An LLVM IR value pointing to an array to store the kernel
+            arguments typenums as defined in dpctl.
+        """
+        args_ty_list = cgutils.alloca_once(
+            self.builder,
+            utils.LLVMTypes.int32_t,
+            size=self.context.get_constant(types.uintp, num_kernel_args),
+        )
+
+        return args_ty_list
+
+    def _create_sycl_range(self, idx_range):
+        """Allocates an LLVM array value to hold the kernel's global range.
+
+        Args:
+            idx_range: A sequence of LLVM values, one extent per dimension
+                of the kernel's global iteration range.
+        """
+        intp_t = utils.get_llvm_type(context=self.context, type=types.intp)
+        intp_ptr_t = utils.get_llvm_ptr_type(intp_t)
+        num_dim = len(idx_range)
+
+        # form the global range
+        global_range = cgutils.alloca_once(
+            self.builder,
+            utils.get_llvm_type(context=self.context, type=types.uintp),
+            size=self.context.get_constant(types.uintp, num_dim),
+        )
+
+        for i in range(num_dim):
+            rext = idx_range[i]
+            if rext.type != utils.LLVMTypes.int64_t:
+                rext = self.builder.sext(rext, utils.LLVMTypes.int64_t)
+
+            # We reverse the global range to account for how SYCL and OpenCL
+            # ranges differ.
+            self.builder.store(
+                rext,
+                self.builder.gep(
+                    global_range,
+                    [self.context.get_constant(types.uintp, (num_dim - 1) - i)],
+                ),
+            )
+
+        return self.builder.bitcast(global_range, intp_ptr_t)
+
+    def submit_sync_ranged_kernel(
+        self,
+        idx_range,
+        sycl_queue_val,
+        total_kernel_args,
+        arg_list,
+        arg_ty_list,
+    ):
+        """
+        submit_sync_ranged_kernel(idx_range, sycl_queue_val)
+        Submits the kernel to the specified queue, waits and then copies
+        back any results to the host.
+
+        Args:
+            idx_range: Tuple specifying the range over which the kernel is
+                to be submitted.
+            sycl_queue_val: The SYCL queue on which the kernel is
+                submitted.
+ """ + gr = self._create_sycl_range(idx_range) + args = [ + self.builder.inttoptr( + self.context.get_constant(types.uintp, self.kernel_addr), + utils.get_llvm_type(context=self.context, type=types.voidptr), + ), + self.builder.load(sycl_queue_val), + arg_list, + arg_ty_list, + self.context.get_constant(types.uintp, total_kernel_args), + gr, + self.context.get_constant(types.uintp, len(idx_range)), + self.builder.bitcast( + utils.create_null_ptr( + builder=self.builder, context=self.context + ), + utils.get_llvm_type(context=self.context, type=types.voidptr), + ), + self.context.get_constant(types.uintp, 0), + ] + + self.rtctx.submit_range(self.builder, *args) diff --git a/numba_dpex/decorators.py b/numba_dpex/decorators.py index 1c05dfe495..8b27c0d358 100644 --- a/numba_dpex/decorators.py +++ b/numba_dpex/decorators.py @@ -16,7 +16,7 @@ compile_func, compile_func_template, ) -from numba_dpex.core.pipelines.offload_compiler import OffloadCompiler +from numba_dpex.core.pipelines.dpjit_compiler import DpjitCompiler def kernel( @@ -165,7 +165,8 @@ def dpjit(*args, **kws): ) del kws["forceobj"] kws.update({"nopython": True}) - kws.update({"pipeline_class": OffloadCompiler}) + kws.update({"parallel": True}) + kws.update({"pipeline_class": DpjitCompiler}) # FIXME: When trying to use dpex's target context, overloads do not work # properly. We will turn on dpex target once the issue is fixed. diff --git a/numba_dpex/dpnp_iface/arrayobj.py b/numba_dpex/dpnp_iface/arrayobj.py index e906176d6a..90e3f922ad 100644 --- a/numba_dpex/dpnp_iface/arrayobj.py +++ b/numba_dpex/dpnp_iface/arrayobj.py @@ -50,6 +50,22 @@ class for number classes. return _dtype +def _parse_layout(layout): + if isinstance(layout, types.StringLiteral): + layout_type_str = layout.literal_value + if layout_type_str not in ["C", "F", "A"]: + msg = f"Invalid layout specified: '{layout_type_str}'" + raise errors.NumbaValueError(msg) + return layout_type_str + elif isinstance(layout, str): + return layout + else: + raise TypeError( + "The parameter 'layout' is neither of " + + "'str' nor 'types.StringLiteral'" + ) + + def _parse_usm_type(usm_type): """Parse usm_type parameter. @@ -152,8 +168,7 @@ def build_dpnp_ndarray( The type has the same structure as USMNdArray used to represent dpctl.tensor.usm_ndarray. """ - - if queue: + if queue and not isinstance(queue, types.misc.Omitted): raise errors.TypingError( "The sycl_queue keyword is not yet supported by " "dpnp.empty(), dpnp.zeros(), dpnp.ones(), dpnp.empty_like(), " @@ -162,7 +177,7 @@ def build_dpnp_ndarray( ) # If a dtype value was passed in, then try to convert it to the - # coresponding Numba type. If None was passed, the default, then pass None + # corresponding Numba type. If None was passed, the default, then pass None # to the DpnpNdArray constructor. The default dtype will be derived based # on the behavior defined in dpctl.tensor.usm_ndarray. 
@@ -231,6 +246,7 @@ def ol_dpnp_empty( _ndim = _ty_parse_shape(shape) _dtype = _parse_dtype(dtype) + _layout = _parse_layout(order) _usm_type = _parse_usm_type(usm_type) if usm_type is not None else "device" _device = ( _parse_device_filter_string(device) if device is not None else "unknown" @@ -238,7 +254,7 @@ def ol_dpnp_empty( if _ndim: ret_ty = build_dpnp_ndarray( _ndim, - layout=order, + layout=_layout, dtype=_dtype, usm_type=_usm_type, device=_device, diff --git a/numba_dpex/dpnp_iface/dpnp_ufunc_db.py b/numba_dpex/dpnp_iface/dpnp_ufunc_db.py new file mode 100644 index 0000000000..6a6d3d6d50 --- /dev/null +++ b/numba_dpex/dpnp_iface/dpnp_ufunc_db.py @@ -0,0 +1,63 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + + +import dpnp +import numpy as np +from numba.core import types + +from numba_dpex.core.typing import dpnpdecl + +from ..ocl import mathimpl + + +def get_ufuncs(): + """obtain a list of supported ufuncs in the db""" + + from numba.np.ufunc_db import _lazy_init_db + + _lazy_init_db() + from numba.np.ufunc_db import _ufunc_db + + _fill_ufunc_db_with_dpnp_ufuncs(_ufunc_db) + + return _ufunc_db.keys() + + +def _fill_ufunc_db_with_dpnp_ufuncs(ufunc_db): + """Monkey patching dpnp for missing attributes.""" + # FIXME: add more docstring + + for ufuncop in dpnpdecl.supported_ufuncs: + if ufuncop == "erf": + op = getattr(dpnp, "erf") + op.nin = 1 + op.nout = 1 + op.nargs = 2 + op.types = ["f->f", "d->d"] + op.is_dpnp_ufunc = True + + _unary_d_d = types.float64(types.float64) + _unary_f_f = types.float32(types.float32) + ufunc_db[op] = { + "f->f": mathimpl.lower_ocl_impl[("erf", (_unary_f_f))], + "d->d": mathimpl.lower_ocl_impl[("erf", (_unary_d_d))], + } + else: + op = getattr(dpnp, ufuncop) + npop = getattr(np, ufuncop) + op.nin = npop.nin + op.nout = npop.nout + op.nargs = npop.nargs + op.types = npop.types + op.is_dpnp_ufunc = True + ufunc_db.update({op: ufunc_db[npop]}) + for key in list(ufunc_db[op].keys()): + if ( + "FF->" in key + or "DD->" in key + or "F->" in key + or "D->" in key + ): + ufunc_db[op].pop(key) diff --git a/numba_dpex/dpnp_iface/dpnpimpl.py b/numba_dpex/dpnp_iface/dpnpimpl.py new file mode 100644 index 0000000000..7873bbc5b5 --- /dev/null +++ b/numba_dpex/dpnp_iface/dpnpimpl.py @@ -0,0 +1,58 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpnp +from numba.np import npyimpl + +from numba_dpex.core.typing.dpnpdecl import _unsupported +from numba_dpex.dpnp_iface import dpnp_ufunc_db + + +def _register_dpnp_ufuncs(): + kernels = {} + # NOTE: Assuming ufunc implementation for the CPUContext. 
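+    # Illustrative effect of the registrations below, assuming "add" remains a
+    # supported ufunc: dpnp.add gets a lowering kernel for direct calls, and
+    # because "add" also appears in Numba's operator _op_maps it additionally
+    # backs the binary `+` and in-place `+=` operators in dpjit-compiled code.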
+ for ufunc in dpnp_ufunc_db.get_ufuncs(): + kernels[ufunc] = npyimpl.register_ufunc_kernel( + ufunc, npyimpl._ufunc_db_function(ufunc) + ) + + for _op_map in ( + npyimpl.npydecl.NumpyRulesUnaryArrayOperator._op_map, + npyimpl.npydecl.NumpyRulesArrayOperator._op_map, + ): + for operator, ufunc_name in _op_map.items(): + if ufunc_name in _unsupported: + continue + ufunc = getattr(dpnp, ufunc_name) + kernel = kernels[ufunc] + if ufunc.nin == 1: + npyimpl.register_unary_operator_kernel(operator, ufunc, kernel) + elif ufunc.nin == 2: + npyimpl.register_binary_operator_kernel(operator, ufunc, kernel) + else: + raise RuntimeError( + "There shouldn't be any non-unary or binary operators" + ) + + for _op_map in (npyimpl.npydecl.NumpyRulesInplaceArrayOperator._op_map,): + for operator, ufunc_name in _op_map.items(): + if ufunc_name in _unsupported: + continue + ufunc = getattr(dpnp, ufunc_name) + kernel = kernels[ufunc] + if ufunc.nin == 1: + npyimpl.register_unary_operator_kernel( + operator, ufunc, kernel, inplace=True + ) + elif ufunc.nin == 2: + npyimpl.register_binary_operator_kernel( + operator, ufunc, kernel, inplace=True + ) + else: + raise RuntimeError( + "There shouldn't be any non-unary or binary operators" + ) + + +_register_dpnp_ufuncs() diff --git a/numba_dpex/examples/blacksholes_njit.py b/numba_dpex/examples/blacksholes_njit.py index ad7e602c18..9095f79987 100644 --- a/numba_dpex/examples/blacksholes_njit.py +++ b/numba_dpex/examples/blacksholes_njit.py @@ -6,19 +6,20 @@ import math import time -import dpctl +import dpctl.tensor as dpt +import dpnp import numba -import numpy as np +import numba_dpex as dpex -@numba.vectorize(nopython=True) -def cndf2(inp): - out = 0.5 + 0.5 * math.erf((math.sqrt(2.0) / 2.0) * inp) - return out +# @numba.vectorize(nopython=True) +# def cndf2(inp): +# out = 0.5 + 0.5 * math.erf((math.sqrt(2.0) / 2.0) * inp) +# return out -@numba.njit(parallel=True, fastmath=True) -def blackscholes(sptprice, strike, rate, volatility, timev): +@dpex.dpjit +def blackscholes(sptprice, strike, timev, rate, volatility): """ A simple implementation of the Black-Scholes formula using the automatic offload feature of numba_dpex. In this example, each NumPy array @@ -26,50 +27,54 @@ def blackscholes(sptprice, strike, rate, volatility, timev): generate a single SYCL kernel. The kernel is automatically offloaded to the device specified where the function is invoked. 
""" - logterm = np.log(sptprice / strike) - powterm = 0.5 * volatility * volatility - den = volatility * np.sqrt(timev) - d1 = (((rate + powterm) * timev) + logterm) / den - d2 = d1 - den - NofXd1 = cndf2(d1) - NofXd2 = cndf2(d2) - futureValue = strike * np.exp(-rate * timev) - c1 = futureValue * NofXd2 - call = sptprice * NofXd1 - c1 - put = call - futureValue + sptprice + + a = dpnp.log(sptprice / strike) + b = timev * -rate + z = timev * volatility * volatility * 2 + c = 0.25 * z + y = dpnp.true_divide(1.0, dpnp.sqrt(z)) + w1 = (a - b + c) * y + w2 = (a - b - c) * y + + NofXd1 = 0.5 + 0.5 * dpnp.erf(w1) + NofXd2 = 0.5 + 0.5 * dpnp.erf(w2) + + futureValue = strike * dpnp.exp(b) + call = sptprice * NofXd1 - futureValue * NofXd2 + put = call - sptprice + futureValue return put +@dpex.dpjit +def init_initStrike(size, initStrike): + for idx in numba.prange(initStrike.size): + initStrike[idx] = 40 + (initStrike[idx] + 1.0) / size + return initStrike + + def run(iterations): - sptprice = np.full((iterations,), 42.0) - initStrike = 40 + (np.arange(iterations) + 1.0) / iterations - rate = np.full((iterations,), 0.5) - volatility = np.full((iterations,), 0.2) - timev = np.full((iterations,), 0.5) + dpt_sptprice = dpt.full((iterations,), 42.0) + dpt_range_arr = dpt.arange(iterations) + dpt_full_arr_05 = dpt.full((iterations,), 0.5) + dpt_volatility = dpt.full((iterations,), 0.2) + + sptprice = dpnp.ndarray(shape=dpt_sptprice.shape, buffer=dpt_sptprice) + rate = dpnp.ndarray(shape=dpt_full_arr_05.shape, buffer=dpt_full_arr_05) + volatility = dpnp.ndarray(shape=dpt_volatility.shape, buffer=dpt_volatility) + timev = dpnp.ndarray(shape=dpt_full_arr_05.shape, buffer=dpt_full_arr_05) + initStrike = dpnp.ndarray(shape=dpt_range_arr.shape, buffer=dpt_range_arr) + initStrike = init_initStrike(iterations, initStrike) t1 = time.time() put = blackscholes(sptprice, initStrike, rate, volatility, timev) t = time.time() - t1 - print("checksum: ", sum(put)) + # print("checksum: ", sum(put)) + print(put) print("SELFTIMED ", t) def main(): - parser = argparse.ArgumentParser(description="Black-Scholes") - parser.add_argument("--iter", dest="iter", type=int, default=10) - args = parser.parse_args() - iter = args.iter - - # Use the environment variable SYCL_DEVICE_FILTER to change the default device. - # See https://github.com/intel/llvm/blob/sycl/sycl/doc/EnvironmentVariables.md#sycl_device_filter. - device = dpctl.select_default_device() - print("Using device ...") - device.print_device_info() - - with dpctl.device_context(device): - run(iter) - - print("Done...") + run(10) if __name__ == "__main__": diff --git a/numba_dpex/ocl/_declare_function.py b/numba_dpex/ocl/_declare_function.py new file mode 100644 index 0000000000..7fc2396e17 --- /dev/null +++ b/numba_dpex/ocl/_declare_function.py @@ -0,0 +1,49 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + + +from llvmlite import ir as llvmir +from numba.core import cgutils, types + +from numba_dpex.core.itanium_mangler import mangle_c + +# ----------------------------------------------------------------------------- + + +def _declare_function(context, builder, name, sig, cargs, mangler=mangle_c): + """Insert declaration for a opencl builtin function. + Uses the Itanium mangler. 
+ + Args + ---- + context: target context + + builder: llvm builder + + name: str + symbol name + + sig: signature + function signature of the symbol being declared + + cargs: sequence of str + C type names for the arguments + + mangler: a mangler function + function to use to mangle the symbol + + """ + mod = builder.module + if sig.return_type == types.void: + llretty = llvmir.VoidType() + else: + llretty = context.get_value_type(sig.return_type) + llargs = [context.get_value_type(t) for t in sig.args] + fnty = llvmir.FunctionType(llretty, llargs) + mangled = mangler(name, cargs) + fn = cgutils.get_or_insert_function(mod, fnty, mangled) + from numba_dpex import kernel_target + + fn.calling_convention = kernel_target.CC_SPIR_FUNC + return fn diff --git a/numba_dpex/ocl/mathimpl.py b/numba_dpex/ocl/mathimpl.py index b67ffe2128..5c1a37ca77 100644 --- a/numba_dpex/ocl/mathimpl.py +++ b/numba_dpex/ocl/mathimpl.py @@ -11,7 +11,7 @@ from numba_dpex.core.itanium_mangler import mangle -from .oclimpl import _declare_function +from ._declare_function import _declare_function registry = Registry() lower = registry.lower diff --git a/numba_dpex/ocl/oclimpl.py b/numba_dpex/ocl/oclimpl.py index 80f81a7c97..ff84b5bb93 100644 --- a/numba_dpex/ocl/oclimpl.py +++ b/numba_dpex/ocl/oclimpl.py @@ -14,12 +14,12 @@ from numba_dpex import config, kernel_target from numba_dpex.core.codegen import SPIR_DATA_LAYOUT -from numba_dpex.core.itanium_mangler import mangle, mangle_c, mangle_type from numba_dpex.core.types import Array from numba_dpex.ocl.atomics import atomic_helper from numba_dpex.utils import address_space from . import stubs +from ._declare_function import _declare_function registry = Registry() lower = registry.lower @@ -30,42 +30,6 @@ # ----------------------------------------------------------------------------- -def _declare_function(context, builder, name, sig, cargs, mangler=mangle_c): - """Insert declaration for a opencl builtin function. - Uses the Itanium mangler. 
- - Args - ---- - context: target context - - builder: llvm builder - - name: str - symbol name - - sig: signature - function signature of the symbol being declared - - cargs: sequence of str - C type names for the arguments - - mangler: a mangler function - function to use to mangle the symbol - - """ - mod = builder.module - if sig.return_type == types.void: - llretty = llvmir.VoidType() - else: - llretty = context.get_value_type(sig.return_type) - llargs = [context.get_value_type(t) for t in sig.args] - fnty = llvmir.FunctionType(llretty, llargs) - mangled = mangler(name, cargs) - fn = cgutils.get_or_insert_function(mod, fnty, mangled) - fn.calling_convention = kernel_target.CC_SPIR_FUNC - return fn - - @lower(stubs.get_global_id, types.uint32) def get_global_id_impl(context, builder, sig, args): [dim] = args diff --git a/numba_dpex/tests/dpjit_tests/parfors/test_builtin_ops.py b/numba_dpex/tests/dpjit_tests/parfors/test_builtin_ops.py new file mode 100644 index 0000000000..d793317f64 --- /dev/null +++ b/numba_dpex/tests/dpjit_tests/parfors/test_builtin_ops.py @@ -0,0 +1,122 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpctl +import dpnp +import numpy +import pytest + +from numba_dpex import dpjit + + +def parfor_add(a, b): + return a + b + + +def parfor_sub(a, b): + return a - b + + +def parfor_mult(a, b): + return a * b + + +def parfor_divide(a, b): + return a / b + + +def parfor_modulus(a, b): + return a % b + + +def parfor_exponent(a, b): + return a**b + + +shapes = [100, (25, 4)] +dtypes = [dpnp.int32, dpnp.int64, dpnp.float32, dpnp.float64] +usm_types = ["device"] +funcs = [ + parfor_add, + parfor_sub, + parfor_mult, + parfor_divide, + parfor_modulus, + parfor_exponent, +] + + +def parfor_floor(a, b): + return a // b + + +@pytest.mark.parametrize("shape", shapes) +@pytest.mark.parametrize("dtype", dtypes) +@pytest.mark.parametrize("usm_type", usm_types) +@pytest.mark.parametrize("func", funcs) +def test_built_in_operators1(shape, dtype, usm_type, func): + queue = dpctl.SyclQueue() + a = dpnp.zeros( + shape=shape, dtype=dtype, usm_type=usm_type, sycl_queue=queue + ) + b = dpnp.ones(shape=shape, dtype=dtype, usm_type=usm_type, sycl_queue=queue) + try: + op = dpjit(func) + c = op(a, b) + del op + except Exception: + pytest.fail("Failed to compile.") + + if len(c.shape) == 1: + assert c.shape[0] == shape + else: + assert c.shape == shape + + if func != parfor_divide: + assert c.dtype == dtype + assert c.usm_type == usm_type + + assert c.sycl_device.filter_string == queue.sycl_device.filter_string + + expected = dpnp.asnumpy(func(a, b)) + nc = dpnp.asnumpy(c) + + numpy.allclose(nc, expected) + + +usm_types = ["host", "shared"] + + +@pytest.mark.parametrize("shape", shapes) +@pytest.mark.parametrize("dtype", dtypes) +@pytest.mark.parametrize("usm_type", usm_types) +@pytest.mark.parametrize("func", funcs) +def test_built_in_operators2(shape, dtype, usm_type, func): + queue = dpctl.SyclQueue() + a = dpnp.zeros( + shape=shape, dtype=dtype, usm_type=usm_type, sycl_queue=queue + ) + b = dpnp.ones(shape=shape, dtype=dtype, usm_type=usm_type, sycl_queue=queue) + try: + op = dpjit(func) + c = op(a, b) + del op + except Exception: + pytest.fail("Failed to compile.") + + if len(c.shape) == 1: + assert c.shape[0] == shape + else: + assert c.shape == shape + + if func != parfor_divide: + assert c.dtype == dtype + assert c.usm_type == usm_type + + assert c.sycl_device.filter_string == queue.sycl_device.filter_string + + expected = 
dpnp.asnumpy(func(a, b)) + nc = dpnp.asnumpy(c) + + numpy.allclose(nc, expected) diff --git a/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_bitwise_ops.py b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_bitwise_ops.py new file mode 100644 index 0000000000..7662df8afc --- /dev/null +++ b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_bitwise_ops.py @@ -0,0 +1,96 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + + +import dpctl.tensor as dpt +import dpnp +import numpy as np +import pytest + +from numba_dpex import dpjit +from numba_dpex.tests._helper import assert_auto_offloading, filter_strings + +list_of_binary_ops = [ + "bitwise_and", + "bitwise_or", + "bitwise_xor", + "left_shift", + "right_shift", +] + + +@pytest.fixture(params=list_of_binary_ops) +def binary_op(request): + return request.param + + +list_of_unary_ops = [ + "bitwise_not", + "invert", +] + + +@pytest.fixture(params=list_of_unary_ops) +def unary_op(request): + return request.param + + +list_of_dtypes = [ + dpnp.int32, + dpnp.int64, +] + + +@pytest.fixture(params=list_of_dtypes) +def input_arrays(request): + # The size of input and out arrays to be used + N = 2048 + a = dpnp.array(dpnp.random.random(N), request.param) + b = dpnp.array(dpnp.random.random(N), request.param) + return a, b + + +@pytest.mark.parametrize("filter_str", filter_strings) +def test_binary_ops(filter_str, binary_op, input_arrays): + a, b = input_arrays + binop = getattr(dpnp, binary_op) + actual = dpnp.empty(shape=a.shape, dtype=a.dtype) + expected = dpnp.empty(shape=a.shape, dtype=a.dtype) + + @dpjit + def f(a, b): + return binop(a, b) + + actual = f(a, b) + + expected = binop(a, b) + np.testing.assert_allclose( + dpt.asnumpy(actual._array_obj), + dpt.asnumpy(expected._array_obj), + rtol=1e-5, + atol=0, + ) + + +@pytest.mark.parametrize("filter_str", filter_strings) +def test_unary_ops(filter_str, unary_op, input_arrays): + a = input_arrays[0] + uop = getattr(dpnp, unary_op) + actual = np.empty(shape=a.shape, dtype=a.dtype) + expected = np.empty(shape=a.shape, dtype=a.dtype) + + @dpjit + def f(a): + return uop(a) + + actual = f(a) + + expected = uop(a) + + np.testing.assert_allclose( + dpt.asnumpy(actual._array_obj), + dpt.asnumpy(expected._array_obj), + rtol=1e-5, + atol=0, + ) diff --git a/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_logic_ops.py b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_logic_ops.py new file mode 100644 index 0000000000..eb5bac45f9 --- /dev/null +++ b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_logic_ops.py @@ -0,0 +1,106 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpctl +import dpctl.tensor as dpt +import dpnp +import numpy as np +import pytest + +from numba_dpex import dpjit +from numba_dpex.tests._helper import assert_auto_offloading, filter_strings + +""" Following cases, dpnp raises NotImplementedError""" + +list_of_binary_ops = [ + "greater", + "greater_equal", + "less", + "less_equal", + "not_equal", + "equal", + "logical_and", + "logical_or", + "logical_xor", +] + + +@pytest.fixture(params=list_of_binary_ops) +def binary_op(request): + return request.param + + +list_of_unary_ops = [ + "isinf", + "isfinite", + "isnan", +] + + +@pytest.fixture(params=list_of_unary_ops) +def unary_op(request): + return request.param + + +list_of_dtypes = [ + dpnp.int32, + dpnp.int64, + dpnp.float32, + dpnp.float64, +] + + +@pytest.fixture(params=list_of_dtypes) +def input_arrays(request): + # The size of input 
and out arrays to be used + N = 2048 + a = dpnp.array(dpnp.random.random(N), request.param) + b = dpnp.array(dpnp.random.random(N), request.param) + return a, b + + +@pytest.mark.xfail +@pytest.mark.parametrize("filter_str", filter_strings) +def test_binary_ops(filter_str, binary_op, input_arrays): + a, b = input_arrays + binop = getattr(dpnp, binary_op) + actual = dpnp.empty(shape=a.shape, dtype=a.dtype) + expected = dpnp.empty(shape=a.shape, dtype=a.dtype) + + @dpjit + def f(a, b): + return binop(a, b) + + actual = f(a, b) + + expected = binop(a, b) + np.testing.assert_allclose( + dpt.asnumpy(actual._array_obj), + dpt.asnumpy(expected._array_obj), + rtol=1e-5, + atol=0, + ) + + +@pytest.mark.xfail +@pytest.mark.parametrize("filter_str", filter_strings) +def test_unary_ops(filter_str, unary_op, input_arrays): + a = input_arrays[0] + uop = getattr(dpnp, unary_op) + actual = dpnp.empty(shape=a.shape, dtype=a.dtype) + expected = dpnp.empty(shape=a.shape, dtype=a.dtype) + + @dpjit + def f(a): + return uop(a) + + actual = f(a) + + expected = uop(a) + np.testing.assert_allclose( + dpt.asnumpy(actual._array_obj), + dpt.asnumpy(expected._array_obj), + rtol=1e-5, + atol=0, + ) diff --git a/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_transcedental_functions.py b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_transcedental_functions.py new file mode 100644 index 0000000000..f43d59752f --- /dev/null +++ b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_transcedental_functions.py @@ -0,0 +1,132 @@ +# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation +# +# SPDX-License-Identifier: Apache-2.0 + +import dpctl +import dpctl.tensor as dpt +import dpnp +import numpy as np +import pytest + +from numba_dpex import dpjit +from numba_dpex.tests._helper import ( + assert_auto_offloading, + filter_strings, + is_gen12, +) + +"""dpnp raise error on : mod, abs and remainder(float32)""" +list_of_binary_ops = [ + "add", + "subtract", + "multiply", + "divide", + "power", + # "remainder", + # "mod", + "hypot", + "maximum", + "minimum", + "fmax", + "fmin", +] + + +@pytest.fixture(params=list_of_binary_ops) +def binary_op(request): + return request.param + + +list_of_unary_ops = [ + "negative", + # "abs", + "absolute", + "fabs", + "sign", + "conj", + "exp", + "exp2", + "log", + "log2", + "log10", + "expm1", + "log1p", + "sqrt", + "square", + "reciprocal", + "conjugate", + "floor", + "ceil", + "trunc", + "erf", +] + + +@pytest.fixture(params=list_of_unary_ops) +def unary_op(request): + return request.param + + +list_of_dtypes = [ + dpnp.float32, + dpnp.float64, +] + + +@pytest.fixture(params=list_of_dtypes) +def input_arrays(request): + # The size of input and out arrays to be used + N = 2048 + a = dpnp.array(dpnp.random.random(N), request.param) + b = dpnp.array(dpnp.random.random(N), request.param) + return a, b + + +@pytest.mark.parametrize("filter_str", filter_strings) +def test_binary_ops(filter_str, binary_op, input_arrays): + a, b = input_arrays + binop = getattr(dpnp, binary_op) + actual = dpnp.empty(shape=a.shape, dtype=a.dtype) + expected = dpnp.empty(shape=a.shape, dtype=a.dtype) + + @dpjit + def f(a, b): + return binop(a, b) + + actual = f(a, b) + + expected = binop(a, b) + + np.testing.assert_allclose( + dpt.asnumpy(actual._array_obj), + dpt.asnumpy(expected._array_obj), + rtol=1e-5, + atol=0, + ) + + +@pytest.mark.parametrize("filter_str", filter_strings) +def test_unary_ops(filter_str, unary_op, input_arrays): + skip_ops = ["abs", "sign", "log", "log2", "log10", "expm1"] + if unary_op in skip_ops and 
is_gen12(filter_str):
+        pytest.skip()
+
+    a = input_arrays[0]
+    uop = getattr(dpnp, unary_op)
+    actual = dpnp.empty(shape=a.shape, dtype=a.dtype)
+    expected = dpnp.empty(shape=a.shape, dtype=a.dtype)
+
+    @dpjit
+    def f(a):
+        return uop(a)
+
+    actual = f(a)
+
+    expected = uop(a)
+
+    np.testing.assert_allclose(
+        dpt.asnumpy(actual._array_obj),
+        dpt.asnumpy(expected._array_obj),
+        rtol=1e-5,
+        atol=0,
+    )
diff --git a/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_trigonometric_functions.py b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_trigonometric_functions.py
new file mode 100644
index 0000000000..73ef3b4f5d
--- /dev/null
+++ b/numba_dpex/tests/dpjit_tests/parfors/test_dpnp_trigonometric_functions.py
@@ -0,0 +1,110 @@
+# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+import dpctl
+import dpctl.tensor as dpt
+import dpnp
+import numpy as np
+import pytest
+
+from numba_dpex import dpjit
+from numba_dpex.tests._helper import (
+    assert_auto_offloading,
+    filter_strings,
+    is_gen12,
+)
+
+list_of_filter_strs = [
+    "opencl:gpu:0",
+    "level_zero:gpu:0",
+    "opencl:cpu:0",
+]
+
+
+@pytest.fixture(params=list_of_filter_strs)
+def filter_str(request):
+    return request.param
+
+
+list_of_trig_ops = [
+    "sin",
+    "cos",
+    "tan",
+    "arcsin",
+    "arccos",
+    "arctan",
+    "arctan2",
+    "sinh",
+    "cosh",
+    "tanh",
+    "arcsinh",
+    "arccosh",
+    "arctanh",
+    "deg2rad",
+    "rad2deg",
+    "degrees",
+    "radians",
+]
+
+
+@pytest.fixture(params=list_of_trig_ops)
+def trig_op(request):
+    return request.param
+
+
+list_of_dtypes = [
+    dpnp.float32,
+    dpnp.float64,
+]
+
+
+@pytest.fixture(params=list_of_dtypes)
+def dtype(request):
+    return request.param
+
+
+@pytest.fixture(params=list_of_dtypes)
+def input_arrays(request):
+    # The size of the input and output arrays to be used
+    N = 2048
+
+    a = dpnp.array(dpnp.random.random(N), request.param)
+    b = dpnp.array(dpnp.random.random(N), request.param)
+    return a, b
+
+
+@pytest.mark.parametrize("filter_str", filter_strings)
+def test_trigonometric_fn(filter_str, trig_op, input_arrays):
+    # FIXME: Why does arccosh fail on Gen12 discrete graphics cards?
+    if trig_op == "arccosh" and is_gen12(filter_str):
+        pytest.skip()
+
+    a, b = input_arrays
+    trig_fn = getattr(dpnp, trig_op)
+    actual = dpnp.empty(shape=a.shape, dtype=a.dtype)
+    expected = dpnp.empty(shape=a.shape, dtype=a.dtype)
+
+    if trig_op == "arctan2":
+
+        @dpjit
+        def f(a, b):
+            return trig_fn(a, b)
+
+        actual = f(a, b)
+        expected = trig_fn(a, b)
+    else:
+
+        @dpjit
+        def f(a):
+            return trig_fn(a)
+
+        actual = f(a)
+        expected = trig_fn(a)
+
+    np.testing.assert_allclose(
+        dpt.asnumpy(actual._array_obj),
+        dpt.asnumpy(expected._array_obj),
+        rtol=1e-5,
+        atol=0,
+    )
diff --git a/numba_dpex/tests/passes/test_parfor_legalize_cfd_pass.py b/numba_dpex/tests/passes/test_parfor_legalize_cfd_pass.py
new file mode 100644
index 0000000000..451eb2392b
--- /dev/null
+++ b/numba_dpex/tests/passes/test_parfor_legalize_cfd_pass.py
@@ -0,0 +1,82 @@
+# SPDX-FileCopyrightText: 2020 - 2023 Intel Corporation
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+Tests for checking that CFD is enforced in the parfor pass.
+""" + + +import dpctl +import dpnp +import numba as nb +import pytest + +from numba_dpex import dpjit +from numba_dpex.core.exceptions import ComputeFollowsDataInferenceError +from numba_dpex.tests._helper import skip_no_opencl_gpu + +shapes = [10, (2, 5)] +dtypes = [dpnp.int32, dpnp.int64, dpnp.float32, dpnp.float64] +usm_types = ["device"] +devices = ["gpu"] + + +@dpjit +def func1(a, b): + c = a + b + return c + + +@skip_no_opencl_gpu +@pytest.mark.parametrize("shape", shapes) +@pytest.mark.parametrize("dtype", dtypes) +@pytest.mark.parametrize("usm_type", usm_types) +@pytest.mark.parametrize("device", devices) +def test_parfor_legalize_cfd_pass(shape, dtype, usm_type, device): + a = dpnp.zeros(shape=shape, dtype=dtype, usm_type=usm_type, device=device) + b = dpnp.ones(shape=shape, dtype=dtype, usm_type=usm_type, device=device) + try: + c = func1(a, b) + except Exception: + pytest.fail("Running Parfor CFD Pass check failed") + + if len(c.shape) == 1: + assert c.shape[0] == shape + else: + assert c.shape == shape + + assert c.dtype == dtype + assert c.usm_type == usm_type + if device != "unknown": + assert ( + c.sycl_device.filter_string + == dpctl.SyclDevice(device).filter_string + ) + else: + c.sycl_device.filter_string == dpctl.SyclDevice().filter_string + + +@skip_no_opencl_gpu +def test_parfor_legalize_cfd_pass_raise(): + a = dpnp.zeros(shape=10, device="cpu") + b = dpnp.ones(shape=10, device="gpu") + + with pytest.raises(ComputeFollowsDataInferenceError): + func1(a, b) + + +@skip_no_opencl_gpu +def test_cfd_error_due_to_lhs(): + a = dpnp.zeros(shape=10, device="cpu") + b = dpnp.ones(shape=10, device="cpu") + + @dpjit + def vecadd_prange(a, b): + c = dpnp.empty(a.shape, dtype=a.dtype, device="gpu") + for idx in nb.prange(a.size): + c[idx] = a[idx] + b[idx] + return c + + with pytest.raises(ComputeFollowsDataInferenceError): + vecadd_prange(a, b) diff --git a/numba_dpex/tests/test_prange.py b/numba_dpex/tests/test_prange.py index d62ac0039e..9b669cbc94 100644 --- a/numba_dpex/tests/test_prange.py +++ b/numba_dpex/tests/test_prange.py @@ -5,151 +5,196 @@ # SPDX-License-Identifier: Apache-2.0 import dpctl +import dpnp import numpy as np import pytest -from numba import njit, prange - -from numba_dpex.tests._helper import assert_auto_offloading, skip_no_opencl_gpu - - -class TestPrange: - def test_one_prange(self): - @njit - def f(a, b): - for i in prange(4): - b[i, 0] = a[i, 0] * 10 - - m = 8 - n = 8 - a = np.ones((m, n)) - b = np.ones((m, n)) - - device = dpctl.select_default_device() - with assert_auto_offloading(), dpctl.device_context(device): - f(a, b) - - for i in range(4): - assert b[i, 0] == a[i, 0] * 10 - - def test_nested_prange(self): - @njit - def f(a, b): - # dimensions must be provided as scalar - m, n = a.shape - for i in prange(m): - for j in prange(n): - b[i, j] = a[i, j] * 10 - - m = 8 - n = 8 - a = np.ones((m, n)) - b = np.ones((m, n)) - - device = dpctl.select_default_device() - with assert_auto_offloading(), dpctl.device_context(device): - f(a, b) - - assert np.all(b == 10) - - @pytest.mark.skip - def test_multiple_prange(self): - @njit - def f(a, b): - # dimensions must be provided as scalar - m, n = a.shape - for i in prange(m): - val = 10 - for j in prange(n): - b[i, j] = a[i, j] * val - - for i in prange(m): - for j in prange(n): - a[i, j] = a[i, j] * 10 - - m = 8 - n = 8 - a = np.ones((m, n)) - b = np.ones((m, n)) - - device = dpctl.select_default_device() - with assert_auto_offloading(parfor_offloaded=2), dpctl.device_context( - device - ): - f(a, b) - - 
-        assert np.all(b == 10)
-        assert np.all(a == 10)
-
-    def test_three_prange(self):
-        @njit
-        def f(a, b):
-            # dimensions must be provided as scalar
-            m, n, o = a.shape
-            for i in prange(m):
-                val = 10
-                for j in prange(n):
-                    constant = 2
-                    for k in prange(o):
-                        b[i, j, k] = a[i, j, k] * (val + constant)
-
-        m = 8
-        n = 8
-        o = 8
-        a = np.ones((m, n, o))
-        b = np.ones((m, n, o))
-
-        device = dpctl.select_default_device()
-        with assert_auto_offloading(parfor_offloaded=1), dpctl.device_context(
-            device
-        ):
-            f(a, b)
-
-        assert np.all(b == 12)
-
-    @pytest.mark.skip(reason="numba-dpex issue 110")
-    def test_two_consequent_prange(self):
-        def prange_example():
-            n = 10
-            a = np.ones((n), dtype=np.float64)
-            b = np.ones((n), dtype=np.float64)
-            c = np.ones((n), dtype=np.float64)
-            for i in prange(n // 2):
-                a[i] = b[i] + c[i]
-
-            return a
-
-        jitted = njit(prange_example)
-
-        device = dpctl.select_default_device()
-        with assert_auto_offloading(parfor_offloaded=2), dpctl.device_context(
-            device
-        ):
-            jitted_res = jitted()
-
-        res = prange_example()
-
-        np.testing.assert_equal(res, jitted_res)
-
-    @pytest.mark.skip(reason="NRT required but not enabled")
-    def test_2d_arrays(self):
-        def prange_example():
-            n = 10
-            a = np.ones((n, n), dtype=np.float64)
-            b = np.ones((n, n), dtype=np.float64)
-            c = np.ones((n, n), dtype=np.float64)
-            for i in prange(n // 2):
-                a[i] = b[i] + c[i]
-
-            return a
-
-        jitted = njit(prange_example)
-
-        device = dpctl.select_default_device()
-        with assert_auto_offloading(parfor_offloaded=2), dpctl.device_context(
-            device
-        ):
-            jitted_res = jitted()
-
-        res = prange_example()
-
-        np.testing.assert_equal(res, jitted_res)
+from numba import njit
+
+from numba_dpex import dpjit, prange
+
+
+def test_one_prange_mul():
+    @dpjit
+    def f(a, b):
+        for i in prange(4):
+            b[i, 0] = a[i, 0] * 10
+        return
+
+    device = dpctl.select_default_device()
+
+    m = 8
+    n = 8
+    a = dpnp.ones((m, n), device=device)
+    b = dpnp.ones((m, n), device=device)
+
+    f(a, b)
+
+    for i in range(4):
+        assert b[i, 0] == a[i, 0] * 10
+
+
+@pytest.mark.skip(reason="dpnp.add() doesn't support variable + scalar.")
+def test_one_prange_add_scalar():
+    @dpjit
+    def f(a, b):
+        for i in prange(4):
+            b[i, 0] = a[i, 0] + 10
+        return
+
+    device = dpctl.select_default_device()
+
+    m = 8
+    n = 8
+    a = dpnp.ones((m, n), device=device)
+    b = dpnp.ones((m, n), device=device)
+
+    f(a, b)
+
+    for i in range(4):
+        assert b[i, 0] == a[i, 0] + 10
+
+
+@pytest.mark.skip(reason="[i,:] like indexing is not supported yet.")
+def test_prange_2d_array():
+    device = dpctl.select_default_device()
+    n = 10
+
+    @dpjit
+    def f(a, b, c):
+        for i in prange(n):
+            c[i, :] = a[i, :] + b[i, :]
+        return
+
+    a = dpnp.ones((n, n), dtype=dpnp.int32, device=device)
+    b = dpnp.ones((n, n), dtype=dpnp.int32, device=device)
+    c = dpnp.ones((n, n), dtype=dpnp.int32, device=device)
+
+    f(a, b, c)
+
+    np.testing.assert_equal(c.asnumpy(), np.ones((n, n), dtype=np.int32) * 2)
+
+
+@pytest.mark.skip(reason="Nested prange is not supported yet.")
+def test_nested_prange():
+    @dpjit
+    def f(a, b):
+        # dimensions must be provided as scalar
+        m, n = a.shape
+        for i in prange(m):
+            for j in prange(n):
+                b[i, j] = a[i, j] * 10
+        return
+
+    device = dpctl.select_default_device()
+
+    m = 8
+    n = 8
+    a = dpnp.ones((m, n), device=device)
+    b = dpnp.ones((m, n), device=device)
+
+    f(a, b)
+
+    assert np.all(b.asnumpy() == 10)
+
+
+@pytest.mark.skip(reason="Nested prange is not supported yet.")
+def test_multiple_prange():
+    @dpjit
+    def f(a, b):
+        # dimensions must be provided as scalar
+        m, n = a.shape
+        for i in prange(m):
+            val = 10
+            for j in prange(n):
+                b[i, j] = a[i, j] * val
+
+        for i in prange(m):
+            for j in prange(n):
+                a[i, j] = a[i, j] * 10
+        return
+
+    device = dpctl.select_default_device()
+
+    m = 8
+    n = 8
+    a = dpnp.ones((m, n), device=device)
+    b = dpnp.ones((m, n), device=device)
+
+    f(a, b)
+
+    assert np.all(b.asnumpy() == 10)
+    assert np.all(a.asnumpy() == 10)
+
+
+@pytest.mark.skip(reason="Nested prange is not supported yet.")
+def test_three_prange():
+    @dpjit
+    def f(a, b):
+        # dimensions must be provided as scalar
+        m, n, o = a.shape
+        for i in prange(m):
+            val = 10
+            for j in prange(n):
+                constant = 2
+                for k in prange(o):
+                    b[i, j, k] = a[i, j, k] * (val + constant)
+        return
+
+    device = dpctl.select_default_device()
+
+    m = 8
+    n = 8
+    o = 8
+    a = dpnp.ones((m, n, o), device=device)
+    b = dpnp.ones((m, n, o), device=device)
+
+    f(a, b)
+
+    assert np.all(b.asnumpy() == 12)
+
+
+def test_two_consecutive_prange():
+    @dpjit
+    def prange_example(a, b, c, d):
+        for i in prange(n):
+            c[i] = a[i] + b[i]
+        for i in prange(n):
+            d[i] = a[i] - b[i]
+        return
+
+    device = dpctl.select_default_device()
+
+    n = 10
+    a = dpnp.ones((n), dtype=dpnp.float64, device=device)
+    b = dpnp.ones((n), dtype=dpnp.float64, device=device)
+    c = dpnp.zeros((n), dtype=dpnp.float64, device=device)
+    d = dpnp.zeros((n), dtype=dpnp.float64, device=device)
+
+    prange_example(a, b, c, d)
+
+    np.testing.assert_equal(c.asnumpy(), np.ones((n), dtype=np.float64) * 2)
+    np.testing.assert_equal(d.asnumpy(), np.zeros((n), dtype=np.float64))
+
+
+@pytest.mark.skip(reason="[i,:] like indexing is not supported yet.")
+def test_two_consecutive_prange_2d():
+    @dpjit
+    def prange_example(a, b, c, d):
+        for i in prange(n):
+            c[i, :] = a[i, :] + b[i, :]
+        for i in prange(n):
+            d[i, :] = a[i, :] - b[i, :]
+        return
+
+    device = dpctl.select_default_device()
+
+    n = 10
+    a = dpnp.ones((n, n), dtype=dpnp.int32, device=device)
+    b = dpnp.ones((n, n), dtype=dpnp.int32, device=device)
+    c = dpnp.ones((n, n), dtype=dpnp.int32, device=device)
+    d = dpnp.ones((n, n), dtype=dpnp.int32, device=device)
+
+    prange_example(a, b, c, d)
+
+    np.testing.assert_equal(c.asnumpy(), np.ones((n, n), dtype=np.int32) * 2)
+    np.testing.assert_equal(d.asnumpy(), np.zeros((n, n), dtype=np.int32))
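
The new tests above all exercise one pattern: allocate dpnp arrays on a SYCL device, compile the function with dpjit, and iterate with numba_dpex.prange so the resulting parfor is offloaded to the device the data lives on (compute follows data). The snippet below is a minimal, illustrative sketch of that pattern, not part of the patch itself; it assumes a working dpctl/dpnp installation and the dpjit/prange exports introduced by this change.

import dpctl
import dpnp
import numpy as np

from numba_dpex import dpjit, prange


@dpjit
def vecadd(a, b, c):
    # Each prange loop is turned into a parfor and offloaded to the device
    # on which the dpnp arrays were allocated (compute follows data).
    for i in prange(a.size):
        c[i] = a[i] + b[i]
    return


device = dpctl.select_default_device()
n = 1024
a = dpnp.ones(n, dtype=dpnp.float32, device=device)
b = dpnp.ones(n, dtype=dpnp.float32, device=device)
c = dpnp.zeros(n, dtype=dpnp.float32, device=device)

vecadd(a, b, c)
np.testing.assert_equal(c.asnumpy(), np.ones(n, dtype=np.float32) * 2)

Mixing arrays from different devices, or writing into an array on a different device than the inputs (as in test_cfd_error_due_to_lhs above), is expected to raise ComputeFollowsDataInferenceError rather than trigger an implicit copy.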