Skip to content

Commit fabd1cc

Browse files
authored
[UR][CUDA][HIP] Fix command buffer update with shared kernel handles (#19479)
- Fix issue where updating multiple nodes with the same UR kernel handle would give incorrect results due to arg caching. - Add SYCL E2E test based on the example in the issue that reported this. Addresses issue reported in #19450
1 parent 8dea47b commit fabd1cc

File tree

3 files changed

+62
-8
lines changed

3 files changed

+62
-8
lines changed
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
// RUN: %{build} -o %t.out
2+
// RUN: %{run} %t.out
3+
4+
// REQUIRES: aspect-usm_shared_allocations
5+
6+
// Modified example from issue #19450 which identified an issue with updating
7+
// multiple kernel nodes which share the same kernel.
8+
9+
#include "../graph_common.hpp"
10+
11+
int main() {
12+
sycl::queue q;
13+
14+
static constexpr size_t R = 10;
15+
static constexpr size_t I = 5;
16+
int *output = sycl::malloc_shared<int>(I, q);
17+
std::fill(output, output + I, 0);
18+
19+
std::unique_ptr<sycl::ext::oneapi::experimental::command_graph<
20+
sycl::ext::oneapi::experimental::graph_state::executable>>
21+
graph;
22+
for (int r = 0; r < R; ++r) {
23+
24+
sycl::ext::oneapi::experimental::command_graph<
25+
sycl::ext::oneapi::experimental::graph_state::modifiable>
26+
modifiable_graph(q.get_context(), q.get_device());
27+
for (size_t i = 1; i < I; i++) {
28+
sycl::range global = {i, i, i};
29+
sycl::range local = {i, i, i};
30+
modifiable_graph.add([=](sycl::handler &h) {
31+
h.parallel_for<class test>(sycl::nd_range{global, local},
32+
[=](sycl::nd_item<3> it) noexcept {
33+
if (it.get_group().leader()) {
34+
output[i]++;
35+
}
36+
});
37+
});
38+
}
39+
40+
if (r == 0) {
41+
const auto instance = modifiable_graph.finalize(
42+
sycl::ext::oneapi::experimental::property::graph::updatable{});
43+
graph = std::make_unique<sycl::ext::oneapi::experimental::command_graph<
44+
sycl::ext::oneapi::experimental::graph_state::executable>>(
45+
std::move(instance));
46+
} else {
47+
graph->update(modifiable_graph);
48+
}
49+
q.ext_oneapi_graph(*graph).wait();
50+
}
51+
52+
q.wait();
53+
std::array<int, I> Ref{0, R, R, R, R};
54+
55+
for (int i = 0; i < I; ++i) {
56+
assert(output[i] == Ref[i]);
57+
}
58+
}

unified-runtime/source/adapters/cuda/command_buffer.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1347,14 +1347,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
13471347
UR_CHECK_ERROR(validateCommandDesc(hCommandBuffer, pUpdateKernelLaunch[i]));
13481348
}
13491349

1350-
// Store changes in config struct in command handle object
1350+
// Store changes in config struct in command handle object and propagate
1351+
// changes to CUDA graph
13511352
for (uint32_t i = 0; i < numKernelUpdates; i++) {
13521353
UR_CHECK_ERROR(updateCommand(pUpdateKernelLaunch[i]));
13531354
UR_CHECK_ERROR(updateKernelArguments(pUpdateKernelLaunch[i]));
1354-
}
13551355

1356-
// Propagate changes to CUDA driver API
1357-
for (uint32_t i = 0; i < numKernelUpdates; i++) {
13581356
const auto &UpdateCommandDesc = pUpdateKernelLaunch[i];
13591357

13601358
// If no work-size is provided make sure we pass nullptr to setKernelParams

unified-runtime/source/adapters/hip/command_buffer.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -984,14 +984,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
984984
UR_CHECK_ERROR(validateCommandDesc(hCommandBuffer, pUpdateKernelLaunch[i]));
985985
}
986986

987-
// Store changes in config struct in command handle object
987+
// Store changes in config struct in command handle object and propagate
988+
// changes to HIP Graph.
988989
for (uint32_t i = 0; i < numKernelUpdates; i++) {
989990
UR_CHECK_ERROR(updateCommand(pUpdateKernelLaunch[i]));
990991
UR_CHECK_ERROR(updateKernelArguments(pUpdateKernelLaunch[i]));
991-
}
992992

993-
// Propagate changes to HIP driver API
994-
for (uint32_t i = 0; i < numKernelUpdates; i++) {
995993
const auto &UpdateCommandDesc = pUpdateKernelLaunch[i];
996994

997995
// If no worksize is provided make sure we pass nullptr to setKernelParams

0 commit comments

Comments
 (0)