Skip to content

Commit 4545e93

Browse files
committed
Fix USM mem blocking free corruption
If the event being waited on is completed and deleted during the traversal of command_queue->{commands|inorder_commands}, this would lead to corruption of the corresponding data structure. This change fixes this issue by deferring the deletion till the end of the current iteration.
1 parent 4c34cda commit 4545e93

File tree

3 files changed

+79
-36
lines changed

3 files changed

+79
-36
lines changed

include/acl_types.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -744,6 +744,10 @@ typedef struct _cl_event {
744744

745745
int is_on_device_op_queue; // bool whether this event is submitted to DevQ
746746
bool support_profiling; // bool whether this event can be profiled
747+
748+
// Flag to indicate the event is being removed during command/inorder_command
749+
// traversal and the removal should be deferred
750+
bool defer_removal = false;
747751
} _cl_event;
748752

749753
ACL_DECLARE_CL_OBJECT_ALLOC_FUNCTIONS(cl_event);
@@ -764,6 +768,9 @@ typedef struct _cl_command_queue {
764768
int num_commands; // Number of commands in the queue.
765769
int num_commands_submitted; // Number of events submitted to device_op_queue
766770

771+
// Flag to indicate whether commands/inorder_commands is being traversed
772+
bool waiting_for_events = false;
773+
767774
// Used only for in-order command queues. Owner of all events managed by this
768775
// queue
769776
std::deque<cl_event> inorder_commands;

src/acl_command_queue.cpp

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -764,7 +764,13 @@ int acl_update_ooo_queue(cl_command_queue command_queue) {
764764
command_queue->last_barrier = NULL;
765765
}
766766
acl_maybe_delete_event(event);
767-
command_queue->commands.erase(event);
767+
if (command_queue->waiting_for_events) {
768+
// We are in the middle of traversing command_queue->commands, defer
769+
// the removal till later to avoid corruption
770+
event->defer_removal = true;
771+
} else {
772+
command_queue->commands.erase(event);
773+
}
768774
event->not_popped = false;
769775
command_queue->num_commands--;
770776
acl_release(command_queue);
@@ -820,8 +826,14 @@ int acl_update_inorder_queue(cl_command_queue command_queue) {
820826
// popping it the second time.
821827

822828
// Deleted event. Remove it from our queue.
823-
command_queue->inorder_commands.erase(
824-
command_queue->inorder_commands.begin() + queue_idx);
829+
if (command_queue->waiting_for_events) {
830+
// We are in the middle of traversing command_queue->inorder_commands,
831+
// defer the removal till later to avoid corruption
832+
event->defer_removal = true;
833+
} else {
834+
command_queue->inorder_commands.erase(
835+
command_queue->inorder_commands.begin() + queue_idx);
836+
}
825837
command_queue->num_commands--;
826838
num_updates++;
827839
event->not_popped = false;

src/acl_usm.cpp

Lines changed: 57 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1224,48 +1224,72 @@ void l_cl_mem_blocking_free(cl_context context, void *ptr) {
12241224

12251225
for (int i = 0; i < num_command_queues; i++) {
12261226
cl_command_queue current_cq = context->command_queue[i];
1227-
// check events in commands
1228-
for (const auto &event : current_cq->commands) {
1229-
if (event->execution_status != CL_COMPLETE) {
1230-
// check if ptr is used by kernels when we submit to queue
1231-
if (event->ptr_hashtable.find(ptr) != event->ptr_hashtable.end()) {
1232-
clWaitForEvents(1, &event);
1233-
}
1234-
// check if ptr is used in queues
1235-
if ((event->cmd.type == CL_COMMAND_MEMCPY_INTEL) ||
1236-
(event->cmd.type == CL_COMMAND_MEMFILL_INTEL)) {
1237-
src_usm_alloc = acl_get_usm_alloc_from_ptr(
1238-
context, event->cmd.info.usm_xfer.src_ptr);
1239-
dst_usm_alloc = acl_get_usm_alloc_from_ptr(
1240-
context, event->cmd.info.usm_xfer.dst_ptr);
1241-
if ((src_usm_alloc && (src_usm_alloc->range.begin == ptr)) ||
1242-
(dst_usm_alloc && (dst_usm_alloc->range.begin == ptr))) {
1227+
// Set a flag to indicate the command set of this command queue is being
1228+
// traversed, and any event deletion should be deferred
1229+
current_cq->waiting_for_events = true;
1230+
1231+
if (current_cq->properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) {
1232+
// Current queue is ooo queue, check events in commands
1233+
for (auto it = current_cq->commands.begin();
1234+
it != current_cq->commands.end();) {
1235+
auto event = *it;
1236+
if (event->execution_status != CL_COMPLETE) {
1237+
// check if ptr is used by kernels when we submit to queue
1238+
if (event->ptr_hashtable.find(ptr) != event->ptr_hashtable.end()) {
12431239
clWaitForEvents(1, &event);
12441240
}
1241+
// check if ptr is used in queues
1242+
if ((event->cmd.type == CL_COMMAND_MEMCPY_INTEL) ||
1243+
(event->cmd.type == CL_COMMAND_MEMFILL_INTEL)) {
1244+
src_usm_alloc = acl_get_usm_alloc_from_ptr(
1245+
context, event->cmd.info.usm_xfer.src_ptr);
1246+
dst_usm_alloc = acl_get_usm_alloc_from_ptr(
1247+
context, event->cmd.info.usm_xfer.dst_ptr);
1248+
if ((src_usm_alloc && (src_usm_alloc->range.begin == ptr)) ||
1249+
(dst_usm_alloc && (dst_usm_alloc->range.begin == ptr))) {
1250+
clWaitForEvents(1, &event);
1251+
}
1252+
}
12451253
}
1246-
}
1247-
}
1248-
// check events in inorder commands
1249-
for (const auto &event : current_cq->inorder_commands) {
1250-
if (event->execution_status != CL_COMPLETE) {
1251-
// check if ptr is used by kernels when we submit to queue
1252-
if (event->ptr_hashtable.find(ptr) != event->ptr_hashtable.end()) {
1253-
clWaitForEvents(1, &event);
1254+
if (event->defer_removal) {
1255+
it = current_cq->commands.erase(it);
1256+
event->defer_removal = false; // Reset as this event might get reused
1257+
} else {
1258+
++it;
12541259
}
1255-
// check if ptr is used in queues
1256-
if ((event->cmd.type == CL_COMMAND_MEMCPY_INTEL) ||
1257-
(event->cmd.type == CL_COMMAND_MEMFILL_INTEL)) {
1258-
src_usm_alloc = acl_get_usm_alloc_from_ptr(
1259-
context, event->cmd.info.usm_xfer.src_ptr);
1260-
dst_usm_alloc = acl_get_usm_alloc_from_ptr(
1261-
context, event->cmd.info.usm_xfer.dst_ptr);
1262-
if ((src_usm_alloc && (src_usm_alloc->range.begin == ptr)) ||
1263-
(dst_usm_alloc && (dst_usm_alloc->range.begin == ptr))) {
1260+
}
1261+
} else {
1262+
// Current queue is inorder queue, check events in inorder commands
1263+
for (auto it = current_cq->inorder_commands.begin();
1264+
it != current_cq->inorder_commands.end();) {
1265+
auto event = *it;
1266+
if (event->execution_status != CL_COMPLETE) {
1267+
// check if ptr is used by kernels when we submit to queue
1268+
if (event->ptr_hashtable.find(ptr) != event->ptr_hashtable.end()) {
12641269
clWaitForEvents(1, &event);
12651270
}
1271+
// check if ptr is used in queues
1272+
if ((event->cmd.type == CL_COMMAND_MEMCPY_INTEL) ||
1273+
(event->cmd.type == CL_COMMAND_MEMFILL_INTEL)) {
1274+
src_usm_alloc = acl_get_usm_alloc_from_ptr(
1275+
context, event->cmd.info.usm_xfer.src_ptr);
1276+
dst_usm_alloc = acl_get_usm_alloc_from_ptr(
1277+
context, event->cmd.info.usm_xfer.dst_ptr);
1278+
if ((src_usm_alloc && (src_usm_alloc->range.begin == ptr)) ||
1279+
(dst_usm_alloc && (dst_usm_alloc->range.begin == ptr))) {
1280+
clWaitForEvents(1, &event);
1281+
}
1282+
}
1283+
}
1284+
if (event->defer_removal) {
1285+
it = current_cq->inorder_commands.erase(it);
1286+
event->defer_removal = false; // Reset as this event might get reused
1287+
} else {
1288+
++it;
12661289
}
12671290
}
12681291
}
1292+
current_cq->waiting_for_events = false;
12691293
}
12701294
}
12711295

0 commit comments

Comments
 (0)