From 47a939c01f5368d80b10a0978db9934a5081cbe4 Mon Sep 17 00:00:00 2001 From: Chris Perkins Date: Fri, 21 Aug 2020 15:54:05 -0700 Subject: [PATCH 1/5] Improvements to SYCL_PI_TRACE rather than blind size_t *, introduce some simple data structures for passing Origin and Region arguments to plugin interface. This improves code readability, can be dispatched by PI_TRACE improving output, and allows us to more easily see bugs/confusion in how certain 2D/3D arguments aren't being passed correctly to existing Rect/Image operations Signed-off-by: Chris Perkins --- sycl/include/CL/sycl/detail/pi.h | 62 +++-- sycl/include/CL/sycl/detail/pi.hpp | 20 ++ sycl/plugins/cuda/pi_cuda.cpp | 53 ++--- sycl/plugins/level_zero/pi_level_zero.cpp | 121 +++++----- sycl/source/detail/memory_manager.cpp | 57 +++-- sycl/test/plugins/enqueue-arg-order.cpp | 261 ++++++++++++++++++++++ 6 files changed, 455 insertions(+), 119 deletions(-) create mode 100644 sycl/test/plugins/enqueue-arg-order.cpp diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index 741e80a10615..f9035d5ffbc7 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -706,13 +706,45 @@ struct pi_device_binary_struct { }; using pi_device_binary = pi_device_binary_struct *; -// pi_buffer_region structure repeats cl_buffer_region +// pi_buffer_region structure repeats cl_buffer_region, used for sub buffers. struct pi_buffer_region_struct { size_t origin; size_t size; }; using pi_buffer_region = pi_buffer_region_struct *; +// pi_buff_rect_offset structure is 3D offset argument passed to buffer rect operations (piEnqueuReadBufferRect, etc). +struct pi_buff_rect_offset_struct { + size_t x_bytes; + size_t y_scalar; + size_t z_scalar; +}; +using pi_buff_rect_offset = pi_buff_rect_offset_struct *; + +// pi_buff_rect_region structure represents size of 3D region passed to buffer rect operations (piEnqueuReadBufferRect, etc). +struct pi_buff_rect_region_struct { + size_t width_bytes; + size_t height_scalar; + size_t depth_scalar; +}; +using pi_buff_rect_region = pi_buff_rect_region_struct *; + +// pi_image_offset structure is 3D offset argument passed to image operations (piEnqueueMemImageRead, etc). +struct pi_image_offset_struct { + size_t x; + size_t y; + size_t z; +}; +using pi_image_offset = pi_image_offset_struct *; + +// pi_image_region structure represents size of 3D region passed to image operations (piEnqueueMemImageRead, etc). +struct pi_image_region_struct { + size_t width; + size_t height; + size_t depth; +}; +using pi_image_region = pi_image_region_struct *; + // Offload binaries descriptor version supported by this library. static const uint16_t PI_DEVICE_BINARIES_VERSION = 1; @@ -1261,11 +1293,11 @@ __SYCL_EXPORT pi_result piEnqueueMemBufferRead( __SYCL_EXPORT pi_result piEnqueueMemBufferReadRect( pi_queue command_queue, pi_mem buffer, pi_bool blocking_read, - const size_t *buffer_offset, const size_t *host_offset, - const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch, - size_t host_row_pitch, size_t host_slice_pitch, void *ptr, - pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, - pi_event *event); + pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, + pi_buff_rect_region region, size_t buffer_row_pitch, + size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, + void *ptr, pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, pi_event *event); __SYCL_EXPORT pi_result piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, @@ -1275,11 +1307,11 @@ piEnqueueMemBufferWrite(pi_queue command_queue, pi_mem buffer, __SYCL_EXPORT pi_result piEnqueueMemBufferWriteRect( pi_queue command_queue, pi_mem buffer, pi_bool blocking_write, - const size_t *buffer_offset, const size_t *host_offset, - const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch, - size_t host_row_pitch, size_t host_slice_pitch, const void *ptr, - pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, - pi_event *event); + pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, + pi_buff_rect_region region, size_t buffer_row_pitch, + size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, + const void *ptr, pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, pi_event *event); __SYCL_EXPORT pi_result piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, @@ -1289,7 +1321,7 @@ piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, __SYCL_EXPORT pi_result piEnqueueMemBufferCopyRect( pi_queue command_queue, pi_mem src_buffer, pi_mem dst_buffer, - const size_t *src_origin, const size_t *dst_origin, const size_t *region, + pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, size_t dst_slice_pitch, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event); @@ -1302,20 +1334,20 @@ piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, __SYCL_EXPORT pi_result piEnqueueMemImageRead( pi_queue command_queue, pi_mem image, pi_bool blocking_read, - const size_t *origin, const size_t *region, size_t row_pitch, + pi_image_offset origin, pi_image_region region, size_t row_pitch, size_t slice_pitch, void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event); __SYCL_EXPORT pi_result piEnqueueMemImageWrite( pi_queue command_queue, pi_mem image, pi_bool blocking_write, - const size_t *origin, const size_t *region, size_t input_row_pitch, + pi_image_offset origin, pi_image_region region, size_t input_row_pitch, size_t input_slice_pitch, const void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event); __SYCL_EXPORT pi_result piEnqueueMemImageCopy( pi_queue command_queue, pi_mem src_image, pi_mem dst_image, - const size_t *src_origin, const size_t *dst_origin, const size_t *region, + pi_image_offset src_origin, pi_image_offset dst_origin, pi_image_region region, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event); diff --git a/sycl/include/CL/sycl/detail/pi.hpp b/sycl/include/CL/sycl/detail/pi.hpp index da3864a31c17..25fdb26a0a55 100644 --- a/sycl/include/CL/sycl/detail/pi.hpp +++ b/sycl/include/CL/sycl/detail/pi.hpp @@ -180,6 +180,26 @@ template <> inline void print<>(PiPlatform val) { std::cout << "pi_platform : " << val << std::endl; } +template <> inline void print<>(pi_buffer_region rgn){ + std::cout << "pi_buffer_region origin/size : " << rgn->origin << "/" << rgn->size << std::endl; +} + +template <> inline void print<>(pi_buff_rect_region rgn){ + std::cout << "pi_buff_rect_region width_bytes/height/depth : " << rgn->width_bytes << "/" << rgn->height_scalar << "/" << rgn->depth_scalar << std::endl; +} + +template <> inline void print<>(pi_buff_rect_offset off){ + std::cout << "pi_buff_rect_offset x_bytes/y/z : " << off->x_bytes << "/" << off->y_scalar << "/" << off->z_scalar << std::endl; +} + +template <> inline void print<>(pi_image_region rgn){ + std::cout << "pi_image_region width/height/depth : " << rgn->width << "/" << rgn->height << "/" << rgn->depth << std::endl; +} + +template <> inline void print<>(pi_image_offset off){ + std::cout << "pi_image_offset x/y/z : " << off->x << "/" << off->y << "/" << off->z << std::endl; +} + template <> inline void print<>(PiResult val) { std::cout << "pi_result : "; if (val == PI_SUCCESS) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index d4a37cb1d0fb..f34122c8cb89 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -3351,10 +3351,10 @@ pi_result cuda_piSamplerRelease(pi_sampler sampler) { /// If the source and/or destination is on the device, src_ptr and/or dst_ptr /// must be a pointer to a CUdeviceptr static pi_result commonEnqueueMemBufferCopyRect( - CUstream cu_stream, const size_t *region, const void *src_ptr, - const CUmemorytype_enum src_type, const size_t *src_offset, + CUstream cu_stream, pi_buff_rect_region region, const void *src_ptr, + const CUmemorytype_enum src_type, pi_buff_rect_offset src_offset, size_t src_row_pitch, size_t src_slice_pitch, void *dst_ptr, - const CUmemorytype_enum dst_type, const size_t *dst_offset, + const CUmemorytype_enum dst_type, pi_buff_rect_offset dst_offset, size_t dst_row_pitch, size_t dst_slice_pitch) { assert(region != nullptr); @@ -3364,27 +3364,27 @@ static pi_result commonEnqueueMemBufferCopyRect( assert(src_type == CU_MEMORYTYPE_DEVICE || src_type == CU_MEMORYTYPE_HOST); assert(dst_type == CU_MEMORYTYPE_DEVICE || dst_type == CU_MEMORYTYPE_HOST); - src_row_pitch = (!src_row_pitch) ? region[0] : src_row_pitch; - src_slice_pitch = - (!src_slice_pitch) ? (region[1] * src_row_pitch) : src_slice_pitch; - dst_row_pitch = (!dst_row_pitch) ? region[0] : dst_row_pitch; - dst_slice_pitch = - (!dst_slice_pitch) ? (region[1] * dst_row_pitch) : dst_slice_pitch; + src_row_pitch = (!src_row_pitch) ? region->width_bytes : src_row_pitch; + src_slice_pitch = (!src_slice_pitch) ? (region->height_scalar * src_row_pitch) + : src_slice_pitch; + dst_row_pitch = (!dst_row_pitch) ? region->width_bytes : dst_row_pitch; + dst_slice_pitch = (!dst_slice_pitch) ? (region->height_scalar * dst_row_pitch) + : dst_slice_pitch; CUDA_MEMCPY3D params = {0}; - params.WidthInBytes = region[0]; - params.Height = region[1]; - params.Depth = region[2]; + params.WidthInBytes = region->width_bytes; + params.Height = region->height_scalar; + params.Depth = region->depth_scalar; params.srcMemoryType = src_type; params.srcDevice = src_type == CU_MEMORYTYPE_DEVICE ? *static_cast(src_ptr) : 0; params.srcHost = src_type == CU_MEMORYTYPE_HOST ? src_ptr : nullptr; - params.srcXInBytes = src_offset[0]; - params.srcY = src_offset[1]; - params.srcZ = src_offset[2]; + params.srcXInBytes = src_offset->x_bytes; + params.srcY = src_offset->y_scalar; + params.srcZ = src_offset->z_scalar; params.srcPitch = src_row_pitch; params.srcHeight = src_slice_pitch / src_row_pitch; @@ -3393,9 +3393,9 @@ static pi_result commonEnqueueMemBufferCopyRect( ? *static_cast(dst_ptr) : 0; params.dstHost = dst_type == CU_MEMORYTYPE_HOST ? dst_ptr : nullptr; - params.dstXInBytes = dst_offset[0]; - params.dstY = dst_offset[1]; - params.dstZ = dst_offset[2]; + params.dstXInBytes = dst_offset->x_bytes; + params.dstY = dst_offset->y_scalar; + params.dstZ = dst_offset->z_scalar; params.dstPitch = dst_row_pitch; params.dstHeight = dst_slice_pitch / dst_row_pitch; @@ -3404,8 +3404,8 @@ static pi_result commonEnqueueMemBufferCopyRect( pi_result cuda_piEnqueueMemBufferReadRect( pi_queue command_queue, pi_mem buffer, pi_bool blocking_read, - const size_t *buffer_offset, const size_t *host_offset, - const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch, + pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, + pi_buff_rect_region region, size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { @@ -3455,8 +3455,8 @@ pi_result cuda_piEnqueueMemBufferReadRect( pi_result cuda_piEnqueueMemBufferWriteRect( pi_queue command_queue, pi_mem buffer, pi_bool blocking_write, - const size_t *buffer_offset, const size_t *host_offset, - const size_t *region, size_t buffer_row_pitch, size_t buffer_slice_pitch, + pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, + pi_buff_rect_region region, size_t buffer_row_pitch, size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, const void *ptr, pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, pi_event *event) { @@ -3553,10 +3553,11 @@ pi_result cuda_piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, pi_result cuda_piEnqueueMemBufferCopyRect( pi_queue command_queue, pi_mem src_buffer, pi_mem dst_buffer, - const size_t *src_origin, const size_t *dst_origin, const size_t *region, - size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, - size_t dst_slice_pitch, pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event) { + pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, + pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch, + size_t dst_row_pitch, size_t dst_slice_pitch, + pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, + pi_event *event) { assert(src_buffer != nullptr); assert(dst_buffer != nullptr); diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 2c6c95d2f608..6c75b18161cd 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -293,11 +293,11 @@ enqueueMemCopyHelper(pi_command_type CommandType, pi_queue Queue, void *Dst, static pi_result enqueueMemCopyRectHelper( pi_command_type CommandType, pi_queue Queue, void *SrcBuffer, - void *DstBuffer, const size_t *SrcOrigin, const size_t *DstOrigin, - const size_t *Region, size_t SrcRowPitch, size_t SrcSlicePitch, - size_t DstRowPitch, size_t DstSlicePitch, pi_bool Blocking, - pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, - pi_event *Event); + void *DstBuffer, pi_buff_rect_offset SrcOrigin, + pi_buff_rect_offset DstOrigin, pi_buff_rect_region Region, + size_t SrcRowPitch, size_t SrcSlicePitch, size_t DstRowPitch, + size_t DstSlicePitch, pi_bool Blocking, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event); inline void zeParseError(ze_result_t ZeError, std::string &ErrorString) { switch (ZeError) { @@ -3095,7 +3095,7 @@ pi_result piEnqueueMemBufferRead(pi_queue Queue, pi_mem Src, pi_result piEnqueueMemBufferReadRect( pi_queue Queue, pi_mem Buffer, pi_bool BlockingRead, - const size_t *BufferOffset, const size_t *HostOffset, const size_t *Region, + pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, pi_buff_rect_region Region, size_t BufferRowPitch, size_t BufferSlicePitch, size_t HostRowPitch, size_t HostSlicePitch, void *Ptr, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { @@ -3167,11 +3167,11 @@ enqueueMemCopyHelper(pi_command_type CommandType, pi_queue Queue, void *Dst, // Shared by all memory read/write/copy rect PI interfaces. static pi_result enqueueMemCopyRectHelper( pi_command_type CommandType, pi_queue Queue, void *SrcBuffer, - void *DstBuffer, const size_t *SrcOrigin, const size_t *DstOrigin, - const size_t *Region, size_t SrcRowPitch, size_t DstRowPitch, - size_t SrcSlicePitch, size_t DstSlicePitch, pi_bool Blocking, - pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, - pi_event *Event) { + void *DstBuffer, pi_buff_rect_offset SrcOrigin, + pi_buff_rect_offset DstOrigin, pi_buff_rect_region Region, + size_t SrcRowPitch, size_t DstRowPitch, size_t SrcSlicePitch, + size_t DstSlicePitch, pi_bool Blocking, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { assert(Region); assert(SrcOrigin); @@ -3212,31 +3212,31 @@ static pi_result enqueueMemCopyRectHelper( } zePrint("\n"); - uint32_t SrcOriginX = pi_cast(SrcOrigin[0]); - uint32_t SrcOriginY = pi_cast(SrcOrigin[1]); - uint32_t SrcOriginZ = pi_cast(SrcOrigin[2]); + uint32_t SrcOriginX = pi_cast(SrcOrigin->x_bytes); + uint32_t SrcOriginY = pi_cast(SrcOrigin->y_scalar); + uint32_t SrcOriginZ = pi_cast(SrcOrigin->z_scalar); uint32_t SrcPitch = SrcRowPitch; if (SrcPitch == 0) - SrcPitch = pi_cast(Region[0]); + SrcPitch = pi_cast(Region->width_bytes); if (SrcSlicePitch == 0) - SrcSlicePitch = pi_cast(Region[1]) * SrcPitch; + SrcSlicePitch = pi_cast(Region->height_scalar) * SrcPitch; - uint32_t DstOriginX = pi_cast(DstOrigin[0]); - uint32_t DstOriginY = pi_cast(DstOrigin[1]); - uint32_t DstOriginZ = pi_cast(DstOrigin[2]); + uint32_t DstOriginX = pi_cast(DstOrigin->x_bytes); + uint32_t DstOriginY = pi_cast(DstOrigin->y_scalar); + uint32_t DstOriginZ = pi_cast(DstOrigin->z_scalar); uint32_t DstPitch = DstRowPitch; if (DstPitch == 0) - DstPitch = pi_cast(Region[0]); + DstPitch = pi_cast(Region->width_bytes); if (DstSlicePitch == 0) - DstSlicePitch = pi_cast(Region[1]) * DstPitch; + DstSlicePitch = pi_cast(Region->height_scalar) * DstPitch; - uint32_t Width = pi_cast(Region[0]); - uint32_t Height = pi_cast(Region[1]); - uint32_t Depth = pi_cast(Region[2]); + uint32_t Width = pi_cast(Region->width_bytes); + uint32_t Height = pi_cast(Region->height_scalar); + uint32_t Depth = pi_cast(Region->depth_scalar); const ze_copy_region_t ZeSrcRegion = {SrcOriginX, SrcOriginY, SrcOriginZ, Width, Height, Depth}; @@ -3282,7 +3282,7 @@ pi_result piEnqueueMemBufferWrite(pi_queue Queue, pi_mem Buffer, pi_result piEnqueueMemBufferWriteRect( pi_queue Queue, pi_mem Buffer, pi_bool BlockingWrite, - const size_t *BufferOffset, const size_t *HostOffset, const size_t *Region, + pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, pi_buff_rect_region Region, size_t BufferRowPitch, size_t BufferSlicePitch, size_t HostRowPitch, size_t HostSlicePitch, const void *Ptr, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { @@ -3313,13 +3313,12 @@ pi_result piEnqueueMemBufferCopy(pi_queue Queue, pi_mem SrcBuffer, NumEventsInWaitList, EventWaitList, Event); } -pi_result -piEnqueueMemBufferCopyRect(pi_queue Queue, pi_mem SrcBuffer, pi_mem DstBuffer, - const size_t *SrcOrigin, const size_t *DstOrigin, - const size_t *Region, size_t SrcRowPitch, - size_t SrcSlicePitch, size_t DstRowPitch, - size_t DstSlicePitch, pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, pi_event *Event) { +pi_result piEnqueueMemBufferCopyRect( + pi_queue Queue, pi_mem SrcBuffer, pi_mem DstBuffer, + pi_buff_rect_offset SrcOrigin, pi_buff_rect_offset DstOrigin, + pi_buff_rect_region Region, size_t SrcRowPitch, size_t SrcSlicePitch, + size_t DstRowPitch, size_t DstSlicePitch, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { assert(SrcBuffer); assert(DstBuffer); @@ -3555,8 +3554,9 @@ pi_result piMemImageGetInfo(pi_mem Image, pi_image_info ParamName, } // extern "C" -static ze_image_region_t getImageRegionHelper(pi_mem Mem, const size_t *Origin, - const size_t *Region) { +static ze_image_region_t getImageRegionHelper(pi_mem Mem, + pi_image_offset Origin, + pi_image_region Region) { assert(Mem && Origin); #ifndef NDEBUG @@ -3565,26 +3565,26 @@ static ze_image_region_t getImageRegionHelper(pi_mem Mem, const size_t *Origin, ze_image_desc_t ZeImageDesc = Image->ZeImageDesc; #endif // !NDEBUG - assert((ZeImageDesc.type == ZE_IMAGE_TYPE_1D && Origin[1] == 0 && - Origin[2] == 0) || - (ZeImageDesc.type == ZE_IMAGE_TYPE_1DARRAY && Origin[2] == 0) || - (ZeImageDesc.type == ZE_IMAGE_TYPE_2D && Origin[2] == 0) || + assert((ZeImageDesc.type == ZE_IMAGE_TYPE_1D && Origin->y == 0 && + Origin->z == 0) || + (ZeImageDesc.type == ZE_IMAGE_TYPE_1DARRAY && Origin->z == 0) || + (ZeImageDesc.type == ZE_IMAGE_TYPE_2D && Origin->z == 0) || (ZeImageDesc.type == ZE_IMAGE_TYPE_3D)); - uint32_t OriginX = pi_cast(Origin[0]); - uint32_t OriginY = pi_cast(Origin[1]); - uint32_t OriginZ = pi_cast(Origin[2]); + uint32_t OriginX = pi_cast(Origin->x); + uint32_t OriginY = pi_cast(Origin->y); + uint32_t OriginZ = pi_cast(Origin->z); - assert(Region[0] && Region[1] && Region[2]); - assert((ZeImageDesc.type == ZE_IMAGE_TYPE_1D && Region[1] == 1 && - Region[2] == 1) || - (ZeImageDesc.type == ZE_IMAGE_TYPE_1DARRAY && Region[2] == 1) || - (ZeImageDesc.type == ZE_IMAGE_TYPE_2D && Region[2] == 1) || + assert(Region->width && Region->height && Region->depth); + assert((ZeImageDesc.type == ZE_IMAGE_TYPE_1D && Region->height == 1 && + Region->depth == 1) || + (ZeImageDesc.type == ZE_IMAGE_TYPE_1DARRAY && Region->depth == 1) || + (ZeImageDesc.type == ZE_IMAGE_TYPE_2D && Region->depth == 1) || (ZeImageDesc.type == ZE_IMAGE_TYPE_3D)); - uint32_t Width = pi_cast(Region[0]); - uint32_t Height = pi_cast(Region[1]); - uint32_t Depth = pi_cast(Region[2]); + uint32_t Width = pi_cast(Region->width); + uint32_t Height = pi_cast(Region->height); + uint32_t Depth = pi_cast(Region->depth); const ze_image_region_t ZeRegion = {OriginX, OriginY, OriginZ, Width, Height, Depth}; @@ -3596,8 +3596,8 @@ static pi_result enqueueMemImageCommandHelper(pi_command_type CommandType, pi_queue Queue, const void *Src, // image or ptr void *Dst, // image or ptr - pi_bool IsBlocking, const size_t *SrcOrigin, - const size_t *DstOrigin, const size_t *Region, + pi_bool IsBlocking, pi_image_offset SrcOrigin, + pi_image_offset DstOrigin, pi_image_region Region, size_t RowPitch, size_t SlicePitch, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, pi_event *Event) { @@ -3707,8 +3707,8 @@ enqueueMemImageCommandHelper(pi_command_type CommandType, pi_queue Queue, extern "C" { pi_result piEnqueueMemImageRead(pi_queue Queue, pi_mem Image, - pi_bool BlockingRead, const size_t *Origin, - const size_t *Region, size_t RowPitch, + pi_bool BlockingRead, pi_image_offset Origin, + pi_image_region Region, size_t RowPitch, size_t SlicePitch, void *Ptr, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, @@ -3725,8 +3725,8 @@ pi_result piEnqueueMemImageRead(pi_queue Queue, pi_mem Image, } pi_result piEnqueueMemImageWrite(pi_queue Queue, pi_mem Image, - pi_bool BlockingWrite, const size_t *Origin, - const size_t *Region, size_t InputRowPitch, + pi_bool BlockingWrite, pi_image_offset Origin, + pi_image_region Region, size_t InputRowPitch, size_t InputSlicePitch, const void *Ptr, pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, @@ -3743,12 +3743,11 @@ pi_result piEnqueueMemImageWrite(pi_queue Queue, pi_mem Image, Event); } -pi_result piEnqueueMemImageCopy(pi_queue Queue, pi_mem SrcImage, - pi_mem DstImage, const size_t *SrcOrigin, - const size_t *DstOrigin, const size_t *Region, - pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, - pi_event *Event) { +pi_result +piEnqueueMemImageCopy(pi_queue Queue, pi_mem SrcImage, pi_mem DstImage, + pi_image_offset SrcOrigin, pi_image_offset DstOrigin, + pi_image_region Region, pi_uint32 NumEventsInWaitList, + const pi_event *EventWaitList, pi_event *Event) { return enqueueMemImageCommandHelper( PI_COMMAND_TYPE_IMAGE_COPY, Queue, SrcImage, DstImage, diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 22d8c00ec91b..cc7b7c01b14c 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -276,21 +276,28 @@ void copyH2D(SYCLMemObjI *SYCLMemObj, char *SrcMem, QueueImplPtr, size_t BufferSlicePitch = (3 == DimDst) ? DstSize[0] * DstSize[1] : 0; size_t HostRowPitch = (1 == DimSrc) ? 0 : SrcSize[0]; size_t HostSlicePitch = (3 == DimSrc) ? SrcSize[0] * SrcSize[1] : 0; + + pi_buff_rect_offset_struct BufferOffset{DstOffset[0], DstOffset[1], DstOffset[2]}; + pi_buff_rect_offset_struct HostOffset{SrcOffset[0], SrcOffset[1], SrcOffset[2]}; + pi_buff_rect_region_struct RectRegion{DstAccessRange[0], DstAccessRange[1], DstAccessRange[2]}; + Plugin.call( Queue, DstMem, - /*blocking_write=*/CL_FALSE, &DstOffset[0], &SrcOffset[0], - &DstAccessRange[0], BufferRowPitch, BufferSlicePitch, HostRowPitch, - HostSlicePitch, SrcMem, DepEvents.size(), DepEvents.data(), - &OutEvent); + /*blocking_write=*/CL_FALSE, &BufferOffset, &HostOffset, &RectRegion, + BufferRowPitch, BufferSlicePitch, HostRowPitch, HostSlicePitch, + SrcMem, DepEvents.size(), DepEvents.data(), &OutEvent); } } else { size_t InputRowPitch = (1 == DimDst) ? 0 : DstSize[0]; size_t InputSlicePitch = (3 == DimDst) ? DstSize[0] * DstSize[1] : 0; + + pi_image_offset_struct Origin{DstOffset[0],DstOffset[1],DstOffset[2]}; + pi_image_region_struct Region{DstAccessRange[0],DstAccessRange[1],DstAccessRange[2]}; + Plugin.call( Queue, DstMem, - /*blocking_write=*/CL_FALSE, &DstOffset[0], &DstAccessRange[0], - InputRowPitch, InputSlicePitch, SrcMem, DepEvents.size(), - DepEvents.data(), &OutEvent); + /*blocking_write=*/CL_FALSE, &Origin, &Region, InputRowPitch, + InputSlicePitch, SrcMem, DepEvents.size(), DepEvents.data(), &OutEvent); } } @@ -326,19 +333,27 @@ void copyD2H(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue, size_t HostRowPitch = (1 == DimDst) ? 0 : DstSize[0]; size_t HostSlicePitch = (3 == DimDst) ? DstSize[0] * DstSize[1] : 0; + + pi_buff_rect_offset_struct BufferOffset{SrcOffset[0], SrcOffset[1], SrcOffset[2]}; + pi_buff_rect_offset_struct HostOffset{DstOffset[0], DstOffset[1], DstOffset[2]}; + pi_buff_rect_region_struct RectRegion{SrcAccessRange[0], SrcAccessRange[1], SrcAccessRange[2]}; + Plugin.call( Queue, SrcMem, - /*blocking_read=*/CL_FALSE, &SrcOffset[0], &DstOffset[0], - &SrcAccessRange[0], BufferRowPitch, BufferSlicePitch, HostRowPitch, - HostSlicePitch, DstMem, DepEvents.size(), DepEvents.data(), - &OutEvent); + /*blocking_read=*/CL_FALSE, &BufferOffset, &HostOffset, &RectRegion, + BufferRowPitch, BufferSlicePitch, HostRowPitch, HostSlicePitch, + DstMem, DepEvents.size(), DepEvents.data(), &OutEvent); } } else { size_t RowPitch = (1 == DimSrc) ? 0 : SrcSize[0]; size_t SlicePitch = (3 == DimSrc) ? SrcSize[0] * SrcSize[1] : 0; + + pi_image_offset_struct Offset{SrcOffset[0], SrcOffset[1], SrcOffset[2]}; + pi_image_region_struct Region{SrcAccessRange[0], SrcAccessRange[1], SrcAccessRange[2]}; + Plugin.call( - Queue, SrcMem, CL_FALSE, &SrcOffset[0], &SrcAccessRange[0], RowPitch, - SlicePitch, DstMem, DepEvents.size(), DepEvents.data(), &OutEvent); + Queue, SrcMem, CL_FALSE, &Offset, &Region, RowPitch, SlicePitch, DstMem, + DepEvents.size(), DepEvents.data(), &OutEvent); } } @@ -373,14 +388,22 @@ void copyD2D(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue, size_t DstSlicePitch = (DimDst > 1) ? DstSize[0] * DstSize[1] : DstSize[0]; + pi_buff_rect_offset_struct SrcOrigin{SrcOffset[0], SrcOffset[1], SrcOffset[2]}; + pi_buff_rect_offset_struct DstOrigin{DstOffset[0], DstOffset[1], DstOffset[2]}; + pi_buff_rect_region_struct Region{SrcAccessRange[0], SrcAccessRange[1], SrcAccessRange[2]}; + Plugin.call( - Queue, SrcMem, DstMem, &SrcOffset[0], &DstOffset[0], - &SrcAccessRange[0], SrcRowPitch, SrcSlicePitch, DstRowPitch, - DstSlicePitch, DepEvents.size(), DepEvents.data(), &OutEvent); + Queue, SrcMem, DstMem, &SrcOrigin, &DstOrigin, &Region, SrcRowPitch, + SrcSlicePitch, DstRowPitch, DstSlicePitch, DepEvents.size(), + DepEvents.data(), &OutEvent); } } else { + pi_image_offset_struct SrcOrigin{SrcOffset[0], SrcOffset[1], SrcOffset[2]}; + pi_image_offset_struct DstOrigin{DstOffset[0], DstOffset[1], DstOffset[2]}; + pi_image_region_struct Region{SrcAccessRange[0], SrcAccessRange[1], SrcAccessRange[2]}; + Plugin.call( - Queue, SrcMem, DstMem, &SrcOffset[0], &DstOffset[0], &SrcAccessRange[0], + Queue, SrcMem, DstMem, &SrcOrigin, &DstOrigin, &Region, DepEvents.size(), DepEvents.data(), &OutEvent); } } diff --git a/sycl/test/plugins/enqueue-arg-order.cpp b/sycl/test/plugins/enqueue-arg-order.cpp new file mode 100644 index 000000000000..a9d162478ec2 --- /dev/null +++ b/sycl/test/plugins/enqueue-arg-order.cpp @@ -0,0 +1,261 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_PI_TRACE=2 %GPU_RUN_PLACEHOLDER %t.out | FileCheck %s +// RUN: env SYCL_PI_TRACE=2 %CPU_RUN_PLACEHOLDER %t.out | FileCheck %s +// XFAIL: * + +/* + Manual + clang++ -fsycl -o eao.bin enqueue-arg-order.cpp + SYCL_PI_TRACE=2 ./eao.bin + + clang++ --driver-mode=g++ -fsycl + -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -o eao.bin enqueue-arg-order.cpp + SYCL_PI_TRACE=2 SYCL_BE=PI_CUDA ./eao.bin + + llvm-lit --param SYCL_BE=PI_CUDA -v enqueue-arg-order.cpp +*/ + +#include +#include + +using namespace cl::sycl; + +constexpr long width = 16; +constexpr long height = 5; +constexpr long total = width * height; + +void remind() { + /* + https://www.khronos.org/registry/OpenCL/sdk/2.2/docs/man/html/clEnqueueReadBufferRect.html + + buffer_origin defines the (x, y, z) offset in the memory region associated + with buffer. For a 2D rectangle region, the z value given by + buffer_origin[2] should be 0. The offset in bytes is computed as + buffer_origin[2] × buffer_slice_pitch + buffer_origin[1] × buffer_row_pitch + + buffer_origin[0]. + + region defines the (width in bytes, height in rows, depth in slices) of the + 2D or 3D rectangle being read or written. For a 2D rectangle copy, the depth + value given by region[2] should be 1. The values in region cannot be 0. + + + buffer_row_pitch is the length of each row in bytes to be used for the + memory region associated with buffer. If buffer_row_pitch is 0, + buffer_row_pitch is computed as region[0]. + + buffer_slice_pitch is the length of each 2D slice in bytes to be used for + the memory region associated with buffer. If buffer_slice_pitch is 0, + buffer_slice_pitch is computed as region[1] × buffer_row_pitch. + */ + std::cout << "For BUFFERS" << std::endl; + std::cout << " Region SHOULD be : " << width * sizeof(float) << "/" + << height << "/" << 1 << std::endl; // 64/5/1 + std::cout << " RowPitch SHOULD be 0 or : " << width * sizeof(float) + << std::endl; // 0 or 64 + std::cout << "SlicePitch SHOULD be 0 or : " << width * sizeof(float) * height + << std::endl + << std::endl; // 0 or 320 + + // NOTE: presently we see 20/16/1 for Region and 20 for row pitch. both + // incorrect. + + /* + https://www.khronos.org/registry/OpenCL/sdk/2.2/docs/man/html/clEnqueueReadImage.html + + row_pitch in clEnqueueReadImage and input_row_pitch in clEnqueueWriteImage + is the length of each row in bytes. This value must be greater than or equal + to the element size in bytes × width. If row_pitch (or input_row_pitch) is + set to 0, the appropriate row pitch is calculated based on the size of each + element in bytes multiplied by width. + + slice_pitch in clEnqueueReadImage and input_slice_pitch in + clEnqueueWriteImage is the size in bytes of the 2D slice of the 3D region of + a 3D image or each image of a 1D or 2D image array being read or written + respectively. + */ + + std::cout << "For IMAGES" << std::endl; + std::cout << " Region SHOULD be : " << width << "/" << height << "/" + << 1 << std::endl; // 16/5/1 + std::cout << " row_pitch SHOULD be 0 or : " << width * sizeof(sycl::float4) + << std::endl; // 0 or 256 + std::cout << " slice_pitch SHOULD be 0 or : " + << width * sizeof(sycl::float4) * height << std::endl + << std::endl; // 0 or 1280 + + // NOTE: presently we see 5/16/1 for image Region and 80 for row pitch. both + // incorrect +} + +void testCopyD2HBuffer() { + // copyD2H + std::cout << "start copyD2H-Buffer" << std::endl; + std::vector data(total, 0); + { + buffer base(data.data(), range<2>(height, width)); + queue myQueue; + myQueue.submit([&](handler &cgh) { + auto acc = base.get_access(cgh); + cgh.parallel_for(base.get_range(), [=](id<2> index) { + float y_term = (float)(index[0]); + float x_term = (float)(index[1]); + acc[index] = x_term + (y_term / 10); + }); + }); + } // ~buffer + std::cout << "end copyD2H-Buffer" << std::endl; +} + +void testcopyTwiceBuffer() { + // copy between two queues triggers a piEnqueueMemBufferMap followed by + // copyH2D, followed by a copyD2H, followed by a piEnqueueMemUnmap this may + // change in the future. Here we only care that the 2D offset and region args + // are passed in the right order to copyH2D and copyD2H + + std::cout << "start copyTwice-buffer" << std::endl; + std::vector data(total, 0); + { + // initialize buffer with data + buffer base(data.data(), range<2>(height, width)); + + // first op + queue myQueue; + queue otherQueue; + myQueue.submit([&](handler &cgh) { + auto acc = base.get_access(cgh); + cgh.parallel_for( + base.get_range(), [=](id<2> index) { acc[index] = acc[index] * -1; }); + }); + myQueue.wait(); + + otherQueue.submit([&](handler &cgh) { + auto acc = base.get_access(cgh); + cgh.parallel_for( + base.get_range(), [=](id<2> index) { acc[index] = acc[index] * -1; }); + }); + + } // ~buffer + std::cout << "end copyTwice-buffer" << std::endl; +} + +void testCopyD2HImage() { + // copyD2H + std::cout << "start copyD2H-Image" << std::endl; + // image with write accessor to it in kernel + const sycl::image_channel_order ChanOrder = sycl::image_channel_order::rgba; + const sycl::image_channel_type ChanType = sycl::image_channel_type::fp32; + + constexpr auto SYCLRead = sycl::access::mode::read; + constexpr auto SYCLWrite = sycl::access::mode::write; + + const sycl::range<2> Img1Size(height, width); + const sycl::range<2> Img2Size(height, width); + + std::vector Img1HostData(Img1Size.size(), {1, 2, 3, 4}); + std::vector Img2HostData(Img2Size.size(), {0, 0, 0, 0}); + + { + sycl::image<2> Img1(Img1HostData.data(), ChanOrder, ChanType, Img1Size); + sycl::image<2> Img2(Img2HostData.data(), ChanOrder, ChanType, Img2Size); + queue Q; + Q.submit([&](sycl::handler &CGH) { + auto Img1Acc = Img1.get_access(CGH); + auto Img2Acc = Img2.get_access(CGH); + + CGH.parallel_for(Img1Size, [=](sycl::item<2> Item) { + sycl::float4 Data = Img1Acc.read(sycl::int2{Item[0], Item[1]}); + Img2Acc.write(sycl::int2{Item[0], Item[1]}, Data); + }); + }); + } // ~image + std::cout << "end copyD2H-Image" << std::endl; +} + +void testCopyTwiceImage() { + // copyD2H and copyH2D + std::cout << "start copyTwiceImage" << std::endl; + // image with write accessor to it in kernel + const sycl::image_channel_order ChanOrder = sycl::image_channel_order::rgba; + const sycl::image_channel_type ChanType = sycl::image_channel_type::fp32; + + constexpr auto SYCLRead = sycl::access::mode::read; + constexpr auto SYCLWrite = sycl::access::mode::write; + + const sycl::range<2> Img1Size(height, width); + const sycl::range<2> Img2Size(height, width); + + std::vector Img1HostData(Img1Size.size(), {1, 2, 3, 4}); + std::vector Img2HostData(Img2Size.size(), {0, 0, 0, 0}); + + { + sycl::image<2> Img1(Img1HostData.data(), ChanOrder, ChanType, Img1Size); + sycl::image<2> Img2(Img2HostData.data(), ChanOrder, ChanType, Img2Size); + queue Q; + queue otherQueue; + + // first op + Q.submit([&](sycl::handler &CGH) { + auto Img1Acc = Img1.get_access(CGH); + auto Img2Acc = Img2.get_access(CGH); + + CGH.parallel_for(Img1Size, [=](sycl::item<2> Item) { + sycl::float4 Data = Img1Acc.read(sycl::int2{Item[0], Item[1]}); + Img2Acc.write(sycl::int2{Item[0], Item[1]}, Data); + }); + }); + + // second op + otherQueue.submit([&](sycl::handler &CGH) { + auto Img1Acc = Img2.get_access(CGH); + auto Img2Acc = Img1.get_access(CGH); + + CGH.parallel_for(Img1Size, [=](sycl::item<2> Item) { + sycl::float4 Data = Img1Acc.read(sycl::int2{Item[0], Item[1]}); + Img2Acc.write(sycl::int2{Item[0], Item[1]}, Data); + }); + }); + } // ~image + std::cout << "end copyTwiceImage" << std::endl; +} + +int main() { + remind(); + testCopyD2HBuffer(); + testcopyTwiceBuffer(); + + testCopyD2HImage(); + testCopyTwiceImage(); +} + +//CHECK: start copyD2H-Buffer +//CHECK: ---> piEnqueueMemBufferReadRect( +//CHECK: pi_buff_rect_region width_bytes/height/depth : 64/5/1 +//CHECK: : 64 +//CHECK: end copyD2H-Buffer + +//CHECK: start copyTwice-buffer +//CHECK: ---> piEnqueueMemBufferWriteRect( +//CHECK: pi_buff_rect_region width_bytes/height/depth : 64/5/1 +//CHECK: : 64 +//CHECK: : 0 +//CHECK: : 64 + +//CHECK: ---> piEnqueueMemBufferReadRect( +//CHECK: pi_buff_rect_region width_bytes/height/depth : 64/5/1 +//CHECK: : 64 +//CHECK: end copyTwice-buffer + +//CHECK: start copyD2H-Image +//CHECK: ---> piEnqueueMemImageRead( +//CHECK: pi_image_region width/height/depth : 16/5/1 +//CHECK: : 256 +//CHECK: end copyD2H-Image + +//CHECK: start copyTwiceImage +//CHECK: ---> piEnqueueMemImageRead( +//CHECK: pi_image_region width/height/depth : 16/5/1 +//CHECK: : 256 +//CHECK: ---> piEnqueueMemImageWrite( +//CHECK: pi_image_region width/height/depth : 16/5/1 +//CHECK: : 256 +//CHECK: end copyTwiceImage \ No newline at end of file From e2cd4dd736ec1e4efaed21b4f46707ea45ce31e4 Mon Sep 17 00:00:00 2001 From: Chris Perkins Date: Fri, 21 Aug 2020 16:09:48 -0700 Subject: [PATCH 2/5] clang formacion Signed-off-by: Chris Perkins --- sycl/include/CL/sycl/detail/pi.h | 27 +++++++++------- sycl/include/CL/sycl/detail/pi.hpp | 26 ++++++++++------ sycl/plugins/cuda/pi_cuda.cpp | 16 +++++----- sycl/plugins/level_zero/pi_level_zero.cpp | 18 ++++++----- sycl/source/detail/memory_manager.cpp | 38 +++++++++++++++-------- 5 files changed, 75 insertions(+), 50 deletions(-) diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index f9035d5ffbc7..d3f7f75763f5 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -713,7 +713,8 @@ struct pi_buffer_region_struct { }; using pi_buffer_region = pi_buffer_region_struct *; -// pi_buff_rect_offset structure is 3D offset argument passed to buffer rect operations (piEnqueuReadBufferRect, etc). +// pi_buff_rect_offset structure is 3D offset argument passed to buffer rect +// operations (piEnqueuReadBufferRect, etc). struct pi_buff_rect_offset_struct { size_t x_bytes; size_t y_scalar; @@ -721,7 +722,8 @@ struct pi_buff_rect_offset_struct { }; using pi_buff_rect_offset = pi_buff_rect_offset_struct *; -// pi_buff_rect_region structure represents size of 3D region passed to buffer rect operations (piEnqueuReadBufferRect, etc). +// pi_buff_rect_region structure represents size of 3D region passed to buffer +// rect operations (piEnqueuReadBufferRect, etc). struct pi_buff_rect_region_struct { size_t width_bytes; size_t height_scalar; @@ -729,7 +731,8 @@ struct pi_buff_rect_region_struct { }; using pi_buff_rect_region = pi_buff_rect_region_struct *; -// pi_image_offset structure is 3D offset argument passed to image operations (piEnqueueMemImageRead, etc). +// pi_image_offset structure is 3D offset argument passed to image operations +// (piEnqueueMemImageRead, etc). struct pi_image_offset_struct { size_t x; size_t y; @@ -737,7 +740,8 @@ struct pi_image_offset_struct { }; using pi_image_offset = pi_image_offset_struct *; -// pi_image_region structure represents size of 3D region passed to image operations (piEnqueueMemImageRead, etc). +// pi_image_region structure represents size of 3D region passed to image +// operations (piEnqueueMemImageRead, etc). struct pi_image_region_struct { size_t width; size_t height; @@ -1321,10 +1325,11 @@ piEnqueueMemBufferCopy(pi_queue command_queue, pi_mem src_buffer, __SYCL_EXPORT pi_result piEnqueueMemBufferCopyRect( pi_queue command_queue, pi_mem src_buffer, pi_mem dst_buffer, - pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, pi_buff_rect_region region, - size_t src_row_pitch, size_t src_slice_pitch, size_t dst_row_pitch, - size_t dst_slice_pitch, pi_uint32 num_events_in_wait_list, - const pi_event *event_wait_list, pi_event *event); + pi_buff_rect_offset src_origin, pi_buff_rect_offset dst_origin, + pi_buff_rect_region region, size_t src_row_pitch, size_t src_slice_pitch, + size_t dst_row_pitch, size_t dst_slice_pitch, + pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, + pi_event *event); __SYCL_EXPORT pi_result piEnqueueMemBufferFill(pi_queue command_queue, pi_mem buffer, @@ -1347,9 +1352,9 @@ __SYCL_EXPORT pi_result piEnqueueMemImageWrite( __SYCL_EXPORT pi_result piEnqueueMemImageCopy( pi_queue command_queue, pi_mem src_image, pi_mem dst_image, - pi_image_offset src_origin, pi_image_offset dst_origin, pi_image_region region, - pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, - pi_event *event); + pi_image_offset src_origin, pi_image_offset dst_origin, + pi_image_region region, pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, pi_event *event); __SYCL_EXPORT pi_result piEnqueueMemImageFill(pi_queue command_queue, pi_mem image, diff --git a/sycl/include/CL/sycl/detail/pi.hpp b/sycl/include/CL/sycl/detail/pi.hpp index 25fdb26a0a55..85d868b7eb82 100644 --- a/sycl/include/CL/sycl/detail/pi.hpp +++ b/sycl/include/CL/sycl/detail/pi.hpp @@ -180,24 +180,30 @@ template <> inline void print<>(PiPlatform val) { std::cout << "pi_platform : " << val << std::endl; } -template <> inline void print<>(pi_buffer_region rgn){ - std::cout << "pi_buffer_region origin/size : " << rgn->origin << "/" << rgn->size << std::endl; +template <> inline void print<>(pi_buffer_region rgn) { + std::cout << "pi_buffer_region origin/size : " << rgn->origin << "/" + << rgn->size << std::endl; } -template <> inline void print<>(pi_buff_rect_region rgn){ - std::cout << "pi_buff_rect_region width_bytes/height/depth : " << rgn->width_bytes << "/" << rgn->height_scalar << "/" << rgn->depth_scalar << std::endl; +template <> inline void print<>(pi_buff_rect_region rgn) { + std::cout << "pi_buff_rect_region width_bytes/height/depth : " + << rgn->width_bytes << "/" << rgn->height_scalar << "/" + << rgn->depth_scalar << std::endl; } -template <> inline void print<>(pi_buff_rect_offset off){ - std::cout << "pi_buff_rect_offset x_bytes/y/z : " << off->x_bytes << "/" << off->y_scalar << "/" << off->z_scalar << std::endl; +template <> inline void print<>(pi_buff_rect_offset off) { + std::cout << "pi_buff_rect_offset x_bytes/y/z : " << off->x_bytes << "/" + << off->y_scalar << "/" << off->z_scalar << std::endl; } -template <> inline void print<>(pi_image_region rgn){ - std::cout << "pi_image_region width/height/depth : " << rgn->width << "/" << rgn->height << "/" << rgn->depth << std::endl; +template <> inline void print<>(pi_image_region rgn) { + std::cout << "pi_image_region width/height/depth : " << rgn->width << "/" + << rgn->height << "/" << rgn->depth << std::endl; } -template <> inline void print<>(pi_image_offset off){ - std::cout << "pi_image_offset x/y/z : " << off->x << "/" << off->y << "/" << off->z << std::endl; +template <> inline void print<>(pi_image_offset off) { + std::cout << "pi_image_offset x/y/z : " << off->x << "/" << off->y << "/" + << off->z << std::endl; } template <> inline void print<>(PiResult val) { diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index f34122c8cb89..2f53ae46e4b4 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -3405,10 +3405,10 @@ static pi_result commonEnqueueMemBufferCopyRect( pi_result cuda_piEnqueueMemBufferReadRect( pi_queue command_queue, pi_mem buffer, pi_bool blocking_read, pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, - pi_buff_rect_region region, size_t buffer_row_pitch, size_t buffer_slice_pitch, - size_t host_row_pitch, size_t host_slice_pitch, void *ptr, - pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, - pi_event *event) { + pi_buff_rect_region region, size_t buffer_row_pitch, + size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, + void *ptr, pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, pi_event *event) { assert(buffer != nullptr); assert(command_queue != nullptr); @@ -3456,10 +3456,10 @@ pi_result cuda_piEnqueueMemBufferReadRect( pi_result cuda_piEnqueueMemBufferWriteRect( pi_queue command_queue, pi_mem buffer, pi_bool blocking_write, pi_buff_rect_offset buffer_offset, pi_buff_rect_offset host_offset, - pi_buff_rect_region region, size_t buffer_row_pitch, size_t buffer_slice_pitch, - size_t host_row_pitch, size_t host_slice_pitch, const void *ptr, - pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list, - pi_event *event) { + pi_buff_rect_region region, size_t buffer_row_pitch, + size_t buffer_slice_pitch, size_t host_row_pitch, size_t host_slice_pitch, + const void *ptr, pi_uint32 num_events_in_wait_list, + const pi_event *event_wait_list, pi_event *event) { assert(buffer != nullptr); assert(command_queue != nullptr); diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp index 6c75b18161cd..4ef2945e6a4c 100644 --- a/sycl/plugins/level_zero/pi_level_zero.cpp +++ b/sycl/plugins/level_zero/pi_level_zero.cpp @@ -3095,10 +3095,11 @@ pi_result piEnqueueMemBufferRead(pi_queue Queue, pi_mem Src, pi_result piEnqueueMemBufferReadRect( pi_queue Queue, pi_mem Buffer, pi_bool BlockingRead, - pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, pi_buff_rect_region Region, - size_t BufferRowPitch, size_t BufferSlicePitch, size_t HostRowPitch, - size_t HostSlicePitch, void *Ptr, pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, pi_event *Event) { + pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, + pi_buff_rect_region Region, size_t BufferRowPitch, size_t BufferSlicePitch, + size_t HostRowPitch, size_t HostSlicePitch, void *Ptr, + pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, + pi_event *Event) { assert(Buffer); return enqueueMemCopyRectHelper( @@ -3282,10 +3283,11 @@ pi_result piEnqueueMemBufferWrite(pi_queue Queue, pi_mem Buffer, pi_result piEnqueueMemBufferWriteRect( pi_queue Queue, pi_mem Buffer, pi_bool BlockingWrite, - pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, pi_buff_rect_region Region, - size_t BufferRowPitch, size_t BufferSlicePitch, size_t HostRowPitch, - size_t HostSlicePitch, const void *Ptr, pi_uint32 NumEventsInWaitList, - const pi_event *EventWaitList, pi_event *Event) { + pi_buff_rect_offset BufferOffset, pi_buff_rect_offset HostOffset, + pi_buff_rect_region Region, size_t BufferRowPitch, size_t BufferSlicePitch, + size_t HostRowPitch, size_t HostSlicePitch, const void *Ptr, + pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList, + pi_event *Event) { assert(Buffer); return enqueueMemCopyRectHelper( diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index cc7b7c01b14c..0cc1cea62d50 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -277,9 +277,12 @@ void copyH2D(SYCLMemObjI *SYCLMemObj, char *SrcMem, QueueImplPtr, size_t HostRowPitch = (1 == DimSrc) ? 0 : SrcSize[0]; size_t HostSlicePitch = (3 == DimSrc) ? SrcSize[0] * SrcSize[1] : 0; - pi_buff_rect_offset_struct BufferOffset{DstOffset[0], DstOffset[1], DstOffset[2]}; - pi_buff_rect_offset_struct HostOffset{SrcOffset[0], SrcOffset[1], SrcOffset[2]}; - pi_buff_rect_region_struct RectRegion{DstAccessRange[0], DstAccessRange[1], DstAccessRange[2]}; + pi_buff_rect_offset_struct BufferOffset{DstOffset[0], DstOffset[1], + DstOffset[2]}; + pi_buff_rect_offset_struct HostOffset{SrcOffset[0], SrcOffset[1], + SrcOffset[2]}; + pi_buff_rect_region_struct RectRegion{ + DstAccessRange[0], DstAccessRange[1], DstAccessRange[2]}; Plugin.call( Queue, DstMem, @@ -291,8 +294,9 @@ void copyH2D(SYCLMemObjI *SYCLMemObj, char *SrcMem, QueueImplPtr, size_t InputRowPitch = (1 == DimDst) ? 0 : DstSize[0]; size_t InputSlicePitch = (3 == DimDst) ? DstSize[0] * DstSize[1] : 0; - pi_image_offset_struct Origin{DstOffset[0],DstOffset[1],DstOffset[2]}; - pi_image_region_struct Region{DstAccessRange[0],DstAccessRange[1],DstAccessRange[2]}; + pi_image_offset_struct Origin{DstOffset[0], DstOffset[1], DstOffset[2]}; + pi_image_region_struct Region{DstAccessRange[0], DstAccessRange[1], + DstAccessRange[2]}; Plugin.call( Queue, DstMem, @@ -334,9 +338,12 @@ void copyD2H(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue, size_t HostRowPitch = (1 == DimDst) ? 0 : DstSize[0]; size_t HostSlicePitch = (3 == DimDst) ? DstSize[0] * DstSize[1] : 0; - pi_buff_rect_offset_struct BufferOffset{SrcOffset[0], SrcOffset[1], SrcOffset[2]}; - pi_buff_rect_offset_struct HostOffset{DstOffset[0], DstOffset[1], DstOffset[2]}; - pi_buff_rect_region_struct RectRegion{SrcAccessRange[0], SrcAccessRange[1], SrcAccessRange[2]}; + pi_buff_rect_offset_struct BufferOffset{SrcOffset[0], SrcOffset[1], + SrcOffset[2]}; + pi_buff_rect_offset_struct HostOffset{DstOffset[0], DstOffset[1], + DstOffset[2]}; + pi_buff_rect_region_struct RectRegion{ + SrcAccessRange[0], SrcAccessRange[1], SrcAccessRange[2]}; Plugin.call( Queue, SrcMem, @@ -349,7 +356,8 @@ void copyD2H(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue, size_t SlicePitch = (3 == DimSrc) ? SrcSize[0] * SrcSize[1] : 0; pi_image_offset_struct Offset{SrcOffset[0], SrcOffset[1], SrcOffset[2]}; - pi_image_region_struct Region{SrcAccessRange[0], SrcAccessRange[1], SrcAccessRange[2]}; + pi_image_region_struct Region{SrcAccessRange[0], SrcAccessRange[1], + SrcAccessRange[2]}; Plugin.call( Queue, SrcMem, CL_FALSE, &Offset, &Region, RowPitch, SlicePitch, DstMem, @@ -388,9 +396,12 @@ void copyD2D(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue, size_t DstSlicePitch = (DimDst > 1) ? DstSize[0] * DstSize[1] : DstSize[0]; - pi_buff_rect_offset_struct SrcOrigin{SrcOffset[0], SrcOffset[1], SrcOffset[2]}; - pi_buff_rect_offset_struct DstOrigin{DstOffset[0], DstOffset[1], DstOffset[2]}; - pi_buff_rect_region_struct Region{SrcAccessRange[0], SrcAccessRange[1], SrcAccessRange[2]}; + pi_buff_rect_offset_struct SrcOrigin{SrcOffset[0], SrcOffset[1], + SrcOffset[2]}; + pi_buff_rect_offset_struct DstOrigin{DstOffset[0], DstOffset[1], + DstOffset[2]}; + pi_buff_rect_region_struct Region{SrcAccessRange[0], SrcAccessRange[1], + SrcAccessRange[2]}; Plugin.call( Queue, SrcMem, DstMem, &SrcOrigin, &DstOrigin, &Region, SrcRowPitch, @@ -400,7 +411,8 @@ void copyD2D(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue, } else { pi_image_offset_struct SrcOrigin{SrcOffset[0], SrcOffset[1], SrcOffset[2]}; pi_image_offset_struct DstOrigin{DstOffset[0], DstOffset[1], DstOffset[2]}; - pi_image_region_struct Region{SrcAccessRange[0], SrcAccessRange[1], SrcAccessRange[2]}; + pi_image_region_struct Region{SrcAccessRange[0], SrcAccessRange[1], + SrcAccessRange[2]}; Plugin.call( Queue, SrcMem, DstMem, &SrcOrigin, &DstOrigin, &Region, From 860bb852ee28d85dec381578d7a1be09191ebb2a Mon Sep 17 00:00:00 2001 From: Alexey Bader Date: Sun, 23 Aug 2020 20:33:24 +0300 Subject: [PATCH 3/5] Update sycl/include/CL/sycl/detail/pi.h Co-authored-by: smaslov-intel <48694368+smaslov-intel@users.noreply.github.com> --- sycl/include/CL/sycl/detail/pi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index d3f7f75763f5..e62d9824dfab 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -714,7 +714,7 @@ struct pi_buffer_region_struct { using pi_buffer_region = pi_buffer_region_struct *; // pi_buff_rect_offset structure is 3D offset argument passed to buffer rect -// operations (piEnqueuReadBufferRect, etc). +// operations (piEnqueueMemBufferCopyRect, etc). struct pi_buff_rect_offset_struct { size_t x_bytes; size_t y_scalar; From e4a45b5127ca71cb40ed4cda070e14d3c47cd74e Mon Sep 17 00:00:00 2001 From: Chris Perkins Date: Wed, 26 Aug 2020 11:35:00 -0700 Subject: [PATCH 4/5] removed test, will introduce with later bug-fix Signed-off-by: Chris Perkins --- sycl/test/plugins/enqueue-arg-order.cpp | 261 ------------------------ 1 file changed, 261 deletions(-) delete mode 100644 sycl/test/plugins/enqueue-arg-order.cpp diff --git a/sycl/test/plugins/enqueue-arg-order.cpp b/sycl/test/plugins/enqueue-arg-order.cpp deleted file mode 100644 index a9d162478ec2..000000000000 --- a/sycl/test/plugins/enqueue-arg-order.cpp +++ /dev/null @@ -1,261 +0,0 @@ -// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out -// RUN: env SYCL_PI_TRACE=2 %GPU_RUN_PLACEHOLDER %t.out | FileCheck %s -// RUN: env SYCL_PI_TRACE=2 %CPU_RUN_PLACEHOLDER %t.out | FileCheck %s -// XFAIL: * - -/* - Manual - clang++ -fsycl -o eao.bin enqueue-arg-order.cpp - SYCL_PI_TRACE=2 ./eao.bin - - clang++ --driver-mode=g++ -fsycl - -fsycl-targets=nvptx64-nvidia-cuda-sycldevice -o eao.bin enqueue-arg-order.cpp - SYCL_PI_TRACE=2 SYCL_BE=PI_CUDA ./eao.bin - - llvm-lit --param SYCL_BE=PI_CUDA -v enqueue-arg-order.cpp -*/ - -#include -#include - -using namespace cl::sycl; - -constexpr long width = 16; -constexpr long height = 5; -constexpr long total = width * height; - -void remind() { - /* - https://www.khronos.org/registry/OpenCL/sdk/2.2/docs/man/html/clEnqueueReadBufferRect.html - - buffer_origin defines the (x, y, z) offset in the memory region associated - with buffer. For a 2D rectangle region, the z value given by - buffer_origin[2] should be 0. The offset in bytes is computed as - buffer_origin[2] × buffer_slice_pitch + buffer_origin[1] × buffer_row_pitch - + buffer_origin[0]. - - region defines the (width in bytes, height in rows, depth in slices) of the - 2D or 3D rectangle being read or written. For a 2D rectangle copy, the depth - value given by region[2] should be 1. The values in region cannot be 0. - - - buffer_row_pitch is the length of each row in bytes to be used for the - memory region associated with buffer. If buffer_row_pitch is 0, - buffer_row_pitch is computed as region[0]. - - buffer_slice_pitch is the length of each 2D slice in bytes to be used for - the memory region associated with buffer. If buffer_slice_pitch is 0, - buffer_slice_pitch is computed as region[1] × buffer_row_pitch. - */ - std::cout << "For BUFFERS" << std::endl; - std::cout << " Region SHOULD be : " << width * sizeof(float) << "/" - << height << "/" << 1 << std::endl; // 64/5/1 - std::cout << " RowPitch SHOULD be 0 or : " << width * sizeof(float) - << std::endl; // 0 or 64 - std::cout << "SlicePitch SHOULD be 0 or : " << width * sizeof(float) * height - << std::endl - << std::endl; // 0 or 320 - - // NOTE: presently we see 20/16/1 for Region and 20 for row pitch. both - // incorrect. - - /* - https://www.khronos.org/registry/OpenCL/sdk/2.2/docs/man/html/clEnqueueReadImage.html - - row_pitch in clEnqueueReadImage and input_row_pitch in clEnqueueWriteImage - is the length of each row in bytes. This value must be greater than or equal - to the element size in bytes × width. If row_pitch (or input_row_pitch) is - set to 0, the appropriate row pitch is calculated based on the size of each - element in bytes multiplied by width. - - slice_pitch in clEnqueueReadImage and input_slice_pitch in - clEnqueueWriteImage is the size in bytes of the 2D slice of the 3D region of - a 3D image or each image of a 1D or 2D image array being read or written - respectively. - */ - - std::cout << "For IMAGES" << std::endl; - std::cout << " Region SHOULD be : " << width << "/" << height << "/" - << 1 << std::endl; // 16/5/1 - std::cout << " row_pitch SHOULD be 0 or : " << width * sizeof(sycl::float4) - << std::endl; // 0 or 256 - std::cout << " slice_pitch SHOULD be 0 or : " - << width * sizeof(sycl::float4) * height << std::endl - << std::endl; // 0 or 1280 - - // NOTE: presently we see 5/16/1 for image Region and 80 for row pitch. both - // incorrect -} - -void testCopyD2HBuffer() { - // copyD2H - std::cout << "start copyD2H-Buffer" << std::endl; - std::vector data(total, 0); - { - buffer base(data.data(), range<2>(height, width)); - queue myQueue; - myQueue.submit([&](handler &cgh) { - auto acc = base.get_access(cgh); - cgh.parallel_for(base.get_range(), [=](id<2> index) { - float y_term = (float)(index[0]); - float x_term = (float)(index[1]); - acc[index] = x_term + (y_term / 10); - }); - }); - } // ~buffer - std::cout << "end copyD2H-Buffer" << std::endl; -} - -void testcopyTwiceBuffer() { - // copy between two queues triggers a piEnqueueMemBufferMap followed by - // copyH2D, followed by a copyD2H, followed by a piEnqueueMemUnmap this may - // change in the future. Here we only care that the 2D offset and region args - // are passed in the right order to copyH2D and copyD2H - - std::cout << "start copyTwice-buffer" << std::endl; - std::vector data(total, 0); - { - // initialize buffer with data - buffer base(data.data(), range<2>(height, width)); - - // first op - queue myQueue; - queue otherQueue; - myQueue.submit([&](handler &cgh) { - auto acc = base.get_access(cgh); - cgh.parallel_for( - base.get_range(), [=](id<2> index) { acc[index] = acc[index] * -1; }); - }); - myQueue.wait(); - - otherQueue.submit([&](handler &cgh) { - auto acc = base.get_access(cgh); - cgh.parallel_for( - base.get_range(), [=](id<2> index) { acc[index] = acc[index] * -1; }); - }); - - } // ~buffer - std::cout << "end copyTwice-buffer" << std::endl; -} - -void testCopyD2HImage() { - // copyD2H - std::cout << "start copyD2H-Image" << std::endl; - // image with write accessor to it in kernel - const sycl::image_channel_order ChanOrder = sycl::image_channel_order::rgba; - const sycl::image_channel_type ChanType = sycl::image_channel_type::fp32; - - constexpr auto SYCLRead = sycl::access::mode::read; - constexpr auto SYCLWrite = sycl::access::mode::write; - - const sycl::range<2> Img1Size(height, width); - const sycl::range<2> Img2Size(height, width); - - std::vector Img1HostData(Img1Size.size(), {1, 2, 3, 4}); - std::vector Img2HostData(Img2Size.size(), {0, 0, 0, 0}); - - { - sycl::image<2> Img1(Img1HostData.data(), ChanOrder, ChanType, Img1Size); - sycl::image<2> Img2(Img2HostData.data(), ChanOrder, ChanType, Img2Size); - queue Q; - Q.submit([&](sycl::handler &CGH) { - auto Img1Acc = Img1.get_access(CGH); - auto Img2Acc = Img2.get_access(CGH); - - CGH.parallel_for(Img1Size, [=](sycl::item<2> Item) { - sycl::float4 Data = Img1Acc.read(sycl::int2{Item[0], Item[1]}); - Img2Acc.write(sycl::int2{Item[0], Item[1]}, Data); - }); - }); - } // ~image - std::cout << "end copyD2H-Image" << std::endl; -} - -void testCopyTwiceImage() { - // copyD2H and copyH2D - std::cout << "start copyTwiceImage" << std::endl; - // image with write accessor to it in kernel - const sycl::image_channel_order ChanOrder = sycl::image_channel_order::rgba; - const sycl::image_channel_type ChanType = sycl::image_channel_type::fp32; - - constexpr auto SYCLRead = sycl::access::mode::read; - constexpr auto SYCLWrite = sycl::access::mode::write; - - const sycl::range<2> Img1Size(height, width); - const sycl::range<2> Img2Size(height, width); - - std::vector Img1HostData(Img1Size.size(), {1, 2, 3, 4}); - std::vector Img2HostData(Img2Size.size(), {0, 0, 0, 0}); - - { - sycl::image<2> Img1(Img1HostData.data(), ChanOrder, ChanType, Img1Size); - sycl::image<2> Img2(Img2HostData.data(), ChanOrder, ChanType, Img2Size); - queue Q; - queue otherQueue; - - // first op - Q.submit([&](sycl::handler &CGH) { - auto Img1Acc = Img1.get_access(CGH); - auto Img2Acc = Img2.get_access(CGH); - - CGH.parallel_for(Img1Size, [=](sycl::item<2> Item) { - sycl::float4 Data = Img1Acc.read(sycl::int2{Item[0], Item[1]}); - Img2Acc.write(sycl::int2{Item[0], Item[1]}, Data); - }); - }); - - // second op - otherQueue.submit([&](sycl::handler &CGH) { - auto Img1Acc = Img2.get_access(CGH); - auto Img2Acc = Img1.get_access(CGH); - - CGH.parallel_for(Img1Size, [=](sycl::item<2> Item) { - sycl::float4 Data = Img1Acc.read(sycl::int2{Item[0], Item[1]}); - Img2Acc.write(sycl::int2{Item[0], Item[1]}, Data); - }); - }); - } // ~image - std::cout << "end copyTwiceImage" << std::endl; -} - -int main() { - remind(); - testCopyD2HBuffer(); - testcopyTwiceBuffer(); - - testCopyD2HImage(); - testCopyTwiceImage(); -} - -//CHECK: start copyD2H-Buffer -//CHECK: ---> piEnqueueMemBufferReadRect( -//CHECK: pi_buff_rect_region width_bytes/height/depth : 64/5/1 -//CHECK: : 64 -//CHECK: end copyD2H-Buffer - -//CHECK: start copyTwice-buffer -//CHECK: ---> piEnqueueMemBufferWriteRect( -//CHECK: pi_buff_rect_region width_bytes/height/depth : 64/5/1 -//CHECK: : 64 -//CHECK: : 0 -//CHECK: : 64 - -//CHECK: ---> piEnqueueMemBufferReadRect( -//CHECK: pi_buff_rect_region width_bytes/height/depth : 64/5/1 -//CHECK: : 64 -//CHECK: end copyTwice-buffer - -//CHECK: start copyD2H-Image -//CHECK: ---> piEnqueueMemImageRead( -//CHECK: pi_image_region width/height/depth : 16/5/1 -//CHECK: : 256 -//CHECK: end copyD2H-Image - -//CHECK: start copyTwiceImage -//CHECK: ---> piEnqueueMemImageRead( -//CHECK: pi_image_region width/height/depth : 16/5/1 -//CHECK: : 256 -//CHECK: ---> piEnqueueMemImageWrite( -//CHECK: pi_image_region width/height/depth : 16/5/1 -//CHECK: : 256 -//CHECK: end copyTwiceImage \ No newline at end of file From 7f314652da7d90d35e42bc0595ae5a1b6b4576df Mon Sep 17 00:00:00 2001 From: Chris Perkins Date: Thu, 27 Aug 2020 11:06:56 -0700 Subject: [PATCH 5/5] Update sycl/include/CL/sycl/detail/pi.h Signed-off-by: Chris Perkins chris.perkins@intel.com Co-authored-by: smaslov-intel <48694368+smaslov-intel@users.noreply.github.com> --- sycl/include/CL/sycl/detail/pi.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/CL/sycl/detail/pi.h b/sycl/include/CL/sycl/detail/pi.h index e62d9824dfab..2a0ae02101dc 100644 --- a/sycl/include/CL/sycl/detail/pi.h +++ b/sycl/include/CL/sycl/detail/pi.h @@ -723,7 +723,7 @@ struct pi_buff_rect_offset_struct { using pi_buff_rect_offset = pi_buff_rect_offset_struct *; // pi_buff_rect_region structure represents size of 3D region passed to buffer -// rect operations (piEnqueuReadBufferRect, etc). +// rect operations (piEnqueueMemBufferCopyRect, etc). struct pi_buff_rect_region_struct { size_t width_bytes; size_t height_scalar;