Skip to content

[SYCL] Implement event_profiling::command_submit for level-zero #7403

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions sycl/include/sycl/detail/pi.def
Original file line number Diff line number Diff line change
Expand Up @@ -140,5 +140,7 @@ _PI_API(piPluginGetLastError)

_PI_API(piTearDown)

_PI_API(piSetEventProperty)


#undef _PI_API
17 changes: 16 additions & 1 deletion sycl/include/sycl/detail/pi.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,12 @@
// 10.14 Add PI_EXT_INTEL_DEVICE_INFO_FREE_MEMORY as an extension for
// piDeviceGetInfo.
// 11.15 piEventCreate creates event in the signalled state now.
// 11.16 piSetEventProperty modifies properties of a pi_event.
// Currently it only supports marking a pi_event that is about to be
// constructed as user-visible.

#define _PI_H_VERSION_MAJOR 11
#define _PI_H_VERSION_MINOR 15
#define _PI_H_VERSION_MINOR 16

#define _PI_STRING_HELPER(a) #a
#define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b)
Expand Down Expand Up @@ -526,6 +529,8 @@ typedef enum {
PI_PROFILING_INFO_COMMAND_END = 0x1283
} _pi_profiling_info;

// Selects which property of a pi_event piSetEventProperty modifies.
// IS_USER_VISIBLE marks the event as user-visible.
typedef enum { IS_USER_VISIBLE = 0x1 } _pi_event_property;

// NOTE: this is made 64-bit to match the size of cl_mem_flags to
// make the translation to OpenCL transparent.
// TODO: populate
Expand Down Expand Up @@ -1791,6 +1796,16 @@ __SYCL_EXPORT pi_result piTearDown(void *PluginParameter);
/// runtime must handle it or end the application.
__SYCL_EXPORT pi_result piPluginGetLastError(char **message);

/// Modifies a property of a pi_event.
///
/// \param event pointer to the pi_event handle to modify
/// \param property enumerator selecting which event property to modify
/// \param propertySize size in bytes of the object \p propertyValue points to
/// \param propertyValue pointer to the value to assign to the event property
///        (NOTE(review): the Level Zero plugin reads it as a bool for
///        IS_USER_VISIBLE — confirm this is the intended contract for all
///        plugins)
__SYCL_EXPORT pi_result piSetEventProperty(pi_event *event,
_pi_event_property property,
size_t propertySize,
void *propertyValue);

struct _pi_plugin {
// PI version supported by host passed to the plugin. The Plugin
// checks and writes the appropriate Function Pointers in
Expand Down
6 changes: 6 additions & 0 deletions sycl/plugins/cuda/pi_cuda.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5380,6 +5380,11 @@ pi_result cuda_piTearDown(void *) {
return PI_SUCCESS;
}

/// CUDA plugin stub for piSetEventProperty.
///
/// Event properties are not implemented by this plugin. Builds with
/// assertions enabled stop at the assert; builds with assertions compiled out
/// report the failure through the return value instead of flowing off the end
/// of a non-void function (which is undefined behaviour).
///
/// \param event unused
/// \param property unused
/// \param propertySize unused
/// \param propertyValue unused
/// \return PI_ERROR_INVALID_OPERATION
pi_result cuda_piSetEventProperty(pi_event *event, _pi_event_property property,
                                  size_t propertySize, void *propertyValue) {
  (void)event;
  (void)property;
  (void)propertySize;
  (void)propertyValue;
  assert(0 && "Operation not supported");
  return PI_ERROR_INVALID_OPERATION;
}

const char SupportedVersion[] = _PI_CUDA_PLUGIN_VERSION_STRING;

pi_result piPluginInit(pi_plugin *PluginInit) {
Expand Down Expand Up @@ -5524,6 +5529,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
_PI_CL(piextKernelSetArgSampler, cuda_piextKernelSetArgSampler)
_PI_CL(piPluginGetLastError, cuda_piPluginGetLastError)
_PI_CL(piTearDown, cuda_piTearDown)
_PI_CL(piSetEventProperty, cuda_piSetEventProperty)

#undef _PI_CL

Expand Down
2 changes: 1 addition & 1 deletion sycl/plugins/cuda/pi_cuda.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

// This version should be incremented for any change made to this file or its
// corresponding .cpp file.
#define _PI_CUDA_PLUGIN_VERSION 1
#define _PI_CUDA_PLUGIN_VERSION 2

#define _PI_CUDA_PLUGIN_VERSION_STRING \
_PI_PLUGIN_VERSION_STRING(_PI_CUDA_PLUGIN_VERSION)
Expand Down
5 changes: 5 additions & 0 deletions sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1983,6 +1983,11 @@ pi_result piTearDown(void *) {
return PI_SUCCESS;
}

/// ESIMD emulator plugin stub for piSetEventProperty.
///
/// Event properties are not implemented by this plugin. Builds with
/// assertions enabled stop at the assert; builds with assertions compiled out
/// report the failure through the return value instead of flowing off the end
/// of a non-void function (which is undefined behaviour).
///
/// \param event unused
/// \param property unused
/// \param propertySize unused
/// \param propertyValue unused
/// \return PI_ERROR_INVALID_OPERATION
pi_result piSetEventProperty(pi_event *event, _pi_event_property property,
                             size_t propertySize, void *propertyValue) {
  (void)event;
  (void)property;
  (void)propertySize;
  (void)propertyValue;
  assert(0 && "Operation not supported");
  return PI_ERROR_INVALID_OPERATION;
}

const char SupportedVersion[] = _PI_ESIMD_PLUGIN_VERSION_STRING;

pi_result piPluginInit(pi_plugin *PluginInit) {
Expand Down
2 changes: 1 addition & 1 deletion sycl/plugins/esimd_emulator/pi_esimd_emulator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@

// This version should be incremented for any change made to this file or its
// corresponding .cpp file.
#define _PI_ESIMD_PLUGIN_VERSION 1
#define _PI_ESIMD_PLUGIN_VERSION 2

#define _PI_ESIMD_PLUGIN_VERSION_STRING \
_PI_PLUGIN_VERSION_STRING(_PI_ESIMD_PLUGIN_VERSION)
Expand Down
5 changes: 5 additions & 0 deletions sycl/plugins/hip/pi_hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5108,6 +5108,10 @@ pi_result hip_piTearDown(void *PluginParameter) {
return PI_SUCCESS;
}

/// HIP plugin stub for piSetEventProperty.
///
/// Event properties are not implemented by this plugin. Builds with
/// assertions enabled stop at the assert; builds with assertions compiled out
/// report the failure through the return value instead of flowing off the end
/// of a non-void function (which is undefined behaviour).
///
/// \param event unused
/// \param property unused
/// \param propertySize unused
/// \param propertyValue unused
/// \return PI_ERROR_INVALID_OPERATION
pi_result hip_piSetEventProperty(pi_event *event, _pi_event_property property,
                                 size_t propertySize, void *propertyValue) {
  (void)event;
  (void)property;
  (void)propertySize;
  (void)propertyValue;
  assert(0 && "Operation not supported");
  return PI_ERROR_INVALID_OPERATION;
}
const char SupportedVersion[] = _PI_HIP_PLUGIN_VERSION_STRING;

pi_result piPluginInit(pi_plugin *PluginInit) {
Expand Down Expand Up @@ -5246,6 +5250,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
_PI_CL(piextKernelSetArgSampler, hip_piextKernelSetArgSampler)
_PI_CL(piPluginGetLastError, hip_piPluginGetLastError)
_PI_CL(piTearDown, hip_piTearDown)
_PI_CL(piSetEventProperty, hip_piSetEventProperty)

#undef _PI_CL

Expand Down
2 changes: 1 addition & 1 deletion sycl/plugins/hip/pi_hip.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

// This version should be incremented for any change made to this file or its
// corresponding .cpp file.
#define _PI_HIP_PLUGIN_VERSION 1
#define _PI_HIP_PLUGIN_VERSION 2

#define _PI_HIP_PLUGIN_VERSION_STRING \
_PI_PLUGIN_VERSION_STRING(_PI_HIP_PLUGIN_VERSION)
Expand Down
96 changes: 93 additions & 3 deletions sycl/plugins/level_zero/pi_level_zero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,39 @@ static const bool DisableEventsCaching = [] {
return std::stoi(DisableEventsCachingFlag) != 0;
}();

// Stores the pi_event pointers that have been marked user-visible and have
// not yet been consumed by piIsEventUserVisible.
// TODO: Once ABI-breaking changes are allowed, pass a boolean to the
// piEnqueue* entry points (piEnqueueKernelLaunch, etc.) to indicate whether
// an event is user-visible, instead of keeping this side list.
static std::list<pi_event *> piUserVisibleEvents{};
// Mutex guarding every access to piUserVisibleEvents.
static pi_mutex piUserVisibleEventsMutex{};

/// Reports whether \p event has been marked user-visible, i.e. whether its
/// pointer value is currently stored in piUserVisibleEvents.
///
/// The query is consuming: when a match is found, that entry is removed from
/// the list before returning, so a second call for the same pointer yields
/// false unless it has been marked again in between.
///
/// \param event The event pointer to look up
/// \return true if a matching entry was found (and removed), false otherwise
bool piIsEventUserVisible(pi_event *event) {
  std::unique_lock lock{piUserVisibleEventsMutex};

  // Walk the list until the first entry equal to `event` or the end.
  auto Pos = piUserVisibleEvents.begin();
  const auto End = piUserVisibleEvents.end();
  while (Pos != End && *Pos != event)
    ++Pos;

  if (Pos == End)
    return false;

  piUserVisibleEvents.erase(Pos);
  return true;
}

/// Records \p event as user-visible by inserting its pointer at the front of
/// piUserVisibleEvents while holding piUserVisibleEventsMutex.
///
/// \param event The event pointer to mark as user-visible
void piMarkEventUserVisible(pi_event *event) {
  std::unique_lock Guard{piUserVisibleEventsMutex};
  piUserVisibleEvents.emplace_front(event);
}

// This class encapsulates actions taken along with a call to Level Zero API.
class ZeCall {
private:
Expand Down Expand Up @@ -5408,6 +5441,8 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
// reference count on the kernel, using the kernel saved in CommandData.
PI_CALL(piKernelRetain(Kernel));

PI_CALL(Queue->Device->deviceTime.getSubmitTime(Event));

// Add to list of kernels to be submitted
if (IndirectAccessTrackingEnabled)
Queue->KernelsToBeSubmitted.push_back(Kernel);
Expand Down Expand Up @@ -5760,9 +5795,11 @@ pi_result piEventGetProfilingInfo(pi_event Event, pi_profiling_info ParamName,
return ReturnValue(ContextEndTime);
}
case PI_PROFILING_INFO_COMMAND_QUEUED:
case PI_PROFILING_INFO_COMMAND_SUBMIT:
// TODO: Support these when Level Zero supported is added.
return ReturnValue(uint64_t{0});
case PI_PROFILING_INFO_COMMAND_SUBMIT: {
// There is no reliable way to handle a possible wrap-around here: the event
// may not have been signalled by the device yet, so its start time cannot be
// obtained for comparison.
return ReturnValue(Event->submitTime);
}
default:
zePrint("piEventGetProfilingInfo: not supported ParamName\n");
return PI_ERROR_INVALID_VALUE;
Expand Down Expand Up @@ -6314,6 +6351,8 @@ pi_result piEnqueueEventsWait(pi_queue Queue, pi_uint32 NumEventsInWaitList,

ZE_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent));

PI_CALL(Queue->Device->deviceTime.getSubmitTime(Event));

// Execute command list asynchronously as the event will be used
// to track down its completion.
return Queue->executeCommandList(CommandList);
Expand Down Expand Up @@ -6407,6 +6446,7 @@ pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue,
insertBarrierIntoCmdList(CmdList, TmpWaitList, *Event, IsInternal))
return Res;

PI_CALL(Queue->Device->deviceTime.getSubmitTime(Event));
if (auto Res = Queue->executeCommandList(CmdList, false, OkToBatch))
return Res;

Expand Down Expand Up @@ -6678,6 +6718,7 @@ enqueueMemCopyHelper(pi_command_type CommandType, pi_queue Queue, void *Dst,
ZE_CALL(zeCommandListAppendMemoryCopy,
(ZeCommandList, Dst, Src, Size, ZeEvent, 0, nullptr));

PI_CALL(Queue->Device->deviceTime.getSubmitTime(Event));
if (auto Res =
Queue->executeCommandList(CommandList, BlockingWrite, OkToBatch))
return Res;
Expand Down Expand Up @@ -6780,6 +6821,7 @@ static pi_result enqueueMemCopyRectHelper(
zePrint("calling zeCommandListAppendBarrier() with Event %#lx\n",
pi_cast<std::uintptr_t>(ZeEvent));

PI_CALL(Queue->Device->deviceTime.getSubmitTime(Event));
if (auto Res = Queue->executeCommandList(CommandList, Blocking, OkToBatch))
return Res;

Expand Down Expand Up @@ -6999,6 +7041,7 @@ enqueueMemFillHelper(pi_command_type CommandType, pi_queue Queue, void *Ptr,
pi_cast<pi_uint64>(ZeEvent));
printZeEventList(WaitList);

PI_CALL(Queue->Device->deviceTime.getSubmitTime(Event));
// Execute command list asynchronously, as the event will be used
// to track down its completion.
if (auto Res = Queue->executeCommandList(CommandList, false, OkToBatch))
Expand Down Expand Up @@ -7054,6 +7097,8 @@ pi_result piEnqueueMemBufferMap(pi_queue Queue, pi_mem Mem, pi_bool BlockingMap,
ze_event_handle_t ZeEvent = nullptr;

bool UseCopyEngine = false;

PI_CALL(Queue->Device->deviceTime.getSubmitTime(Event));
{
// Lock automatically releases when this goes out of scope.
std::scoped_lock<pi_shared_mutex> lock(Queue->Mutex);
Expand Down Expand Up @@ -7512,6 +7557,7 @@ static pi_result enqueueMemImageCommandHelper(
return PI_ERROR_INVALID_OPERATION;
}

PI_CALL(Queue->Device->deviceTime.getSubmitTime(Event));
if (auto Res = Queue->executeCommandList(CommandList, IsBlocking, OkToBatch))
return Res;

Expand Down Expand Up @@ -8417,6 +8463,8 @@ pi_result piextUSMEnqueuePrefetch(pi_queue Queue, const void *Ptr, size_t Size,
// so manually add command to signal our event.
ZE_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent));

PI_CALL(Queue->Device->deviceTime.getSubmitTime(Event));

if (auto Res = Queue->executeCommandList(CommandList, false))
return Res;

Expand Down Expand Up @@ -8484,6 +8532,8 @@ pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr,
// so manually add command to signal our event.
ZE_CALL(zeCommandListAppendSignalEvent, (ZeCommandList, ZeEvent));

PI_CALL(Queue->Device->deviceTime.getSubmitTime(Event));

Queue->executeCommandList(CommandList, false);

return PI_SUCCESS;
Expand Down Expand Up @@ -9004,4 +9054,44 @@ pi_result _pi_buffer::free() {
return PI_SUCCESS;
}

/// Reads the current device timestamp and converts it to a time value.
///
/// On the first call the timer resolution and the valid-bit mask are cached
/// from the device properties. The cached values are then used to mask the
/// raw device clock count and scale it by the timer resolution.
///
/// \param deviceTime receives the masked clock count multiplied by the timer
///        resolution
/// \return PI_SUCCESS on success; a failing Level Zero call is reported
///         through ZE_CALL.
inline pi_result piDeviceTime::get(uint64_t *deviceTime) {
  {
    // The guard must be a named object: an unnamed temporary would release
    // the mutex at the end of its own statement and protect nothing.
    std::unique_lock Lock{mutex};
    if (!initialized) {
      ZeTimerResolution = device->ZeDeviceProperties->timerResolution;
      const auto ValidBits =
          device->ZeDeviceProperties->kernelTimestampValidBits;
      // Shifting a 64-bit value by 64 or more is undefined, so treat that
      // case as "all bits valid".
      TimestampMaxCount = ValidBits >= 64 ? ~uint64_t{0}
                                          : ((uint64_t{1} << ValidBits) - 1);
      // Publish the flag only after the cached values have been written.
      initialized = true;
    }
  }

  uint64_t deviceClockCount = 0;
  uint64_t dummy = 0;
  ZE_CALL(zeDeviceGetGlobalTimestamps,
          (device->ZeDevice, &dummy, &deviceClockCount));
  *deviceTime = (deviceClockCount & TimestampMaxCount) * ZeTimerResolution;
  return PI_SUCCESS;
}

/// Stores the current device time in the submitTime field of \p *event when
/// the event has profiling enabled and has been marked user-visible.
///
/// A null \p event or a null \p *event is treated as "nothing to record".
///
/// NOTE(review): piIsEventUserVisible removes the mark it finds, and it is
/// only evaluated when profiling is enabled. A mark set on an event without
/// profiling therefore appears to stay in the list — confirm this is
/// intended.
///
/// \param event pointer to the event handle to update
/// \return PI_SUCCESS when nothing had to be recorded, otherwise the result
///         of get()
inline pi_result piDeviceTime::getSubmitTime(pi_event *event) {
  if (event == nullptr || *event == nullptr)
    return PI_SUCCESS;

  if (!(*event)->isProfilingEnabled() || !piIsEventUserVisible(event))
    return PI_SUCCESS;

  return get(&((*event)->submitTime));
}
/// Level Zero implementation of piSetEventProperty.
///
/// Only IS_USER_VISIBLE is supported. Its value is read as a bool: true adds
/// \p event to the user-visible list, false removes a pending entry for it.
///
/// \param event pointer to the event handle to modify; must not be null
/// \param property the property to modify
/// \param propertySize size in bytes of the object \p propertyValue points
///        to; must be at least sizeof(bool) for IS_USER_VISIBLE
/// \param propertyValue pointer to the new value; must not be null
/// \return PI_SUCCESS on success; PI_ERROR_INVALID_VALUE for an unsupported
///         property, a null pointer argument, or a too-small
///         \p propertySize.
pi_result piSetEventProperty(pi_event *event, _pi_event_property property,
                             size_t propertySize, void *propertyValue) {
  switch (property) {
  case IS_USER_VISIBLE: {
    // Validate before dereferencing: the value is read as a bool below.
    if (event == nullptr || propertyValue == nullptr ||
        propertySize < sizeof(bool))
      return PI_ERROR_INVALID_VALUE;

    const bool isUserVisible = *static_cast<const bool *>(propertyValue);
    if (isUserVisible) {
      piMarkEventUserVisible(event);
    } else {
      // Called for its side effect only: the lookup erases a matching entry,
      // which clears a previously set mark. The result is not needed.
      (void)piIsEventUserVisible(event);
    }
    break;
  }
  default: {
    return PI_ERROR_INVALID_VALUE;
  }
  }
  return PI_SUCCESS;
}
} // extern "C"
36 changes: 34 additions & 2 deletions sycl/plugins/level_zero/pi_level_zero.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

// This version should be incremented for any change made to this file or its
// corresponding .cpp file.
#define _PI_LEVEL_ZERO_PLUGIN_VERSION 1
#define _PI_LEVEL_ZERO_PLUGIN_VERSION 2

#define _PI_LEVEL_ZERO_PLUGIN_VERSION_STRING \
_PI_PLUGIN_VERSION_STRING(_PI_LEVEL_ZERO_PLUGIN_VERSION)
Expand Down Expand Up @@ -356,6 +356,33 @@ struct MemAllocRecord : _pi_object {
bool OwnZeMemHandle;
};

// Struct used to fetch device wall-clock time
struct piDeviceTime {
private:
  // Device to query
  pi_device device;
  // Number of nanoseconds per clock step, assuming
  // stype==ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES. Zero until get() caches the
  // real value, so it is never read as an indeterminate value.
  uint64_t ZeTimerResolution = 0;
  // Mask applied to the raw device clock count. Zero until get() caches the
  // real value.
  uint64_t TimestampMaxCount = 0;
  // Set by get() once the two cached values above have been filled in.
  bool initialized = false;
  // Used by get() around the one-time initialization of the cached values.
  pi_mutex mutex;

public:
  /// Retrieves the current wall-clock time from the device
  ///
  /// \param deviceTime Variable where the device time is stored
  inline pi_result get(uint64_t *deviceTime);

  /// Checks if the passed in event is user visible.
  /// If so, retrieves the current wall-clock time from the device
  /// and stores it in the submitTime field of the event.
  /// Used to calculate the submission time of a command list.
  ///
  /// \param event is the event to check for user visibility
  inline pi_result getSubmitTime(pi_event *event);

  /// \param dev the device whose clock this object queries
  piDeviceTime(pi_device dev) : device(dev) {}
};

// Define the types that are opaque in pi.h in a manner suitable for the
// Level Zero plugin

Expand Down Expand Up @@ -484,7 +511,7 @@ struct _pi_device : _pi_object {
pi_device ParentDevice = nullptr)
: ZeDevice{Device}, Platform{Plt}, RootDevice{ParentDevice},
ImmCommandListsPreferred{false}, ZeDeviceProperties{},
ZeDeviceComputeProperties{} {
ZeDeviceComputeProperties{}, deviceTime(this) {
// NOTE: one must additionally call initialize() to complete
// PI device creation.
}
Expand Down Expand Up @@ -579,6 +606,7 @@ struct _pi_device : _pi_object {
ZeCache<ZeStruct<ze_device_memory_access_properties_t>>
ZeDeviceMemoryAccessProperties;
ZeCache<ZeStruct<ze_device_cache_properties_t>> ZeDeviceCacheProperties;
piDeviceTime deviceTime;
};

// Structure describing the specific use of a command-list in a queue.
Expand Down Expand Up @@ -1350,6 +1378,10 @@ struct _pi_event : _pi_object {
(Queue->Properties & PI_QUEUE_PROFILING_ENABLE) != 0;
}

// Keeps track of the submission time of the command list associated with
// this event, if the event is user-visible
uint64_t submitTime = 0;

// Keeps the command-queue and command associated with the event.
// These are NULL for the user events.
pi_queue Queue = {nullptr};
Expand Down
Loading