
Commit 493494d

Merge remote-tracking branch 'upstream/sycl' into ci/update_gpu_driver-linux-25.35.35096.9
2 parents: fd72eeb + 778ddeb

117 files changed: 1130 additions & 712 deletions


.github/workflows/sycl-linux-precommit.yml

Lines changed: 36 additions & 0 deletions
@@ -62,6 +62,42 @@ jobs:
       e2e_binaries_artifact: e2e_bin
       e2e_binaries_preview_artifact: e2e_bin_preview

+  # Build and run native cpu e2e tests separately as cannot currently
+  # build all the e2e tests
+  build_run_native_cpu_e2e_tests:
+    if: ${{ always() && !cancelled() && needs.build.outputs.build_conclusion == 'success' }}
+    runs-on: [Linux, build]
+    needs: [build]
+    container:
+      image: ghcr.io/intel/llvm/sycl_ubuntu2404_nightly:latest
+      options: -u 1001:1001
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          sparse-checkout: |
+            devops/
+
+      # download build artefact
+      - name: Download toolchain
+        uses: actions/download-artifact@v4
+        with:
+          name: sycl_linux_default
+      - name: Extract SYCL toolchain
+        shell: bash
+        run: |
+          mkdir toolchain
+          tar -xf llvm_sycl.tar.zst -C toolchain
+          rm llvm_sycl.tar.zst
+      - name: Build and run E2E tests
+        uses: ./devops/actions/run-tests/e2e
+        with:
+          ref: ${{ inputs.ref || github.sha }}
+          testing_mode: build-only
+          target_devices: native_cpu
+          sycl_compiler: $GITHUB_WORKSPACE/toolchain/bin/clang++
+          extra_lit_opts: --param sycl_build_targets="native_cpu"
+          extra_cmake_args: -DSYCL_TEST_E2E_TARGETS="native_cpu:cpu" -DSYCL_TEST_E2E_STANDALONE=ON
+
   # If a PR changes CUDA adapter, run the build on Ubuntu 22.04 as well.
   # Ubuntu 22.04 container has CUDA 12.1 installed while Ubuntu 24.0 image
   # has CUDA 12.6.1 installed.

devops/compat_ci_exclude.sycl-rel-6_3

Lines changed: 7 additions & 0 deletions
@@ -9,6 +9,13 @@ Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
 # See GSD-11097.
 Assert/assert_in_kernels.cpp

+# https://github.com/intel/llvm/pull/20159 prevents returning last event as an
+# optimization for submitting barrier to an empty IOQ. However, the test
+# actually checks whether last event is returned or not, so it needs to be
+# updated to match the new behavior. ext_oneapi_submit_barrier spec doesn't
+# require last event to be returned, so this is not an ABI break.
+InorderQueue/in_order_ext_oneapi_submit_barrier.cpp
+
 # Likely OK, but need author to provide justification, get approval/confirmation
 # from someone:
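For context, the exclusion above concerns which event ext_oneapi_submit_barrier returns when an in-order queue has nothing pending. A rough, hypothetical sketch of the kind of check involved (not the actual source of InorderQueue/in_order_ext_oneapi_submit_barrier.cpp) could look like this:

#include <sycl/sycl.hpp>

int main() {
  sycl::queue q{sycl::property::queue::in_order{}};

  // Submit some work, then let the queue drain so it is effectively empty.
  sycl::event prev = q.single_task([] {});
  q.wait();

  // Barrier submitted to the now-empty in-order queue. After the change
  // referenced above, the returned event is no longer guaranteed to compare
  // equal to the queue's last event, and the extension spec does not
  // require that it does.
  sycl::event barrier_ev = q.ext_oneapi_submit_barrier();

  // A check of this form is what the excluded test needs to update.
  return barrier_ev == prev ? 0 : 1;
}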

devops/scripts/benchmarks/benches/compute.py

Lines changed: 123 additions & 63 deletions
@@ -3,6 +3,7 @@
 # See LICENSE.TXT
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+from itertools import product
 import os
 import csv
 import io
@@ -24,6 +25,11 @@ class RUNTIMES(Enum):
     UR = "ur"


+class PROFILERS(Enum):
+    TIMER = "timer"
+    CPU_COUNTER = "cpuCounter"
+
+
 def runtime_to_name(runtime: RUNTIMES) -> str:
     return {
         RUNTIMES.SYCL_PREVIEW: "SYCL Preview",
@@ -171,69 +177,116 @@ def benchmarks(self) -> list[Benchmark]:

         # hand-picked value so that total execution time of the benchmark is
         # similar on all architectures
-        long_lernel_exec_time_ioq = [20]
+        long_kernel_exec_time_ioq = [20]
         # For BMG server, a new value 200 is used, but we have to create metadata
         # for both values to keep the dashboard consistent.
         # See SubmitKernel.enabled()
         long_kernel_exec_time_ooo = [20, 200]

-        # The Combo Profiler is available only for selected sycl benchmarks
-        profiler_types = ["timer", "cpuCounter"]
-
-        for runtime in list(RUNTIMES):
-            # Add SubmitKernel benchmarks using loops
-            for in_order_queue in [0, 1]:
-                for measure_completion in [0, 1]:
-                    for use_events in [0, 1]:
-                        long_kernel_exec_time = (
-                            long_lernel_exec_time_ioq
-                            if in_order_queue
-                            else long_kernel_exec_time_ooo
-                        )
-                        for kernel_exec_time in [1, *long_kernel_exec_time]:
-                            for profiler_type in profiler_types:
-                                benches.append(
-                                    SubmitKernel(
-                                        self,
-                                        runtime,
-                                        in_order_queue,
-                                        measure_completion,
-                                        use_events,
-                                        kernel_exec_time,
-                                        profiler_type,
-                                    )
-                                )
-
-            # Add SinKernelGraph benchmarks
-            for with_graphs in [0, 1]:
-                for num_kernels in [5, 100]:
+        submit_kernel_params = product(
+            list(RUNTIMES),
+            [0, 1],  # in_order_queue
+            [0, 1],  # measure_completion
+            [0, 1],  # use_events
+        )
+        for (
+            runtime,
+            in_order_queue,
+            measure_completion,
+            use_events,
+        ) in submit_kernel_params:
+            long_kernel_exec_time = (
+                long_kernel_exec_time_ioq
+                if in_order_queue
+                else long_kernel_exec_time_ooo
+            )
+            for kernel_exec_time in [1, *long_kernel_exec_time]:
+                benches.append(
+                    SubmitKernel(
+                        self,
+                        runtime,
+                        in_order_queue,
+                        measure_completion,
+                        use_events,
+                        kernel_exec_time,
+                    )
+                )
+                if runtime == RUNTIMES.SYCL:
+                    # Create CPU count variant
                     benches.append(
-                        GraphApiSinKernelGraph(self, runtime, with_graphs, num_kernels)
+                        SubmitKernel(
+                            self,
+                            runtime,
+                            in_order_queue,
+                            measure_completion,
+                            use_events,
+                            kernel_exec_time,
+                            profiler_type=PROFILERS.CPU_COUNTER,
+                        )
                     )

+        # Add SinKernelGraph benchmarks
+        sin_kernel_graph_params = product(
+            list(RUNTIMES),
+            [0, 1],  # with_graphs
+            [5, 100],  # num_kernels
+        )
+        for runtime, with_graphs, num_kernels in sin_kernel_graph_params:
+            benches.append(
+                GraphApiSinKernelGraph(self, runtime, with_graphs, num_kernels)
+            )
+
         # Add ULLS benchmarks
-        for profiler_type in profiler_types:
-            benches.append(UllsEmptyKernel(self, runtime, 1000, 256, profiler_type))
+        for runtime in list(RUNTIMES):
+            if runtime == RUNTIMES.SYCL:
+                benches.append(
+                    UllsEmptyKernel(
+                        self, runtime, 1000, 256, profiler_type=PROFILERS.CPU_COUNTER
+                    )
+                )
+            benches.append(UllsEmptyKernel(self, runtime, 1000, 256))
             benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1))

-        # Add GraphApiSubmitGraph benchmarks
-        for in_order_queue in [0, 1]:
-            for num_kernels in self.submit_graph_num_kernels:
-                for measure_completion_time in [0, 1]:
-                    for use_events in [0, 1]:
-                        for profiler_type in profiler_types:
-                            benches.append(
-                                GraphApiSubmitGraph(
-                                    self,
-                                    runtime,
-                                    in_order_queue,
-                                    num_kernels,
-                                    measure_completion_time,
-                                    profiler_type,
-                                    use_events,
-                                    useHostTasks=0,
-                                )
-                            )
+        # Add GraphApiSubmitGraph benchmarks
+        submit_graph_params = product(
+            list(RUNTIMES),
+            [0, 1],  # in_order_queue
+            self.submit_graph_num_kernels,
+            [0, 1],  # measure_completion_time
+            [0, 1],  # use_events
+        )
+        for (
+            runtime,
+            in_order_queue,
+            num_kernels,
+            measure_completion_time,
+            use_events,
+        ) in submit_graph_params:
+            benches.append(
+                GraphApiSubmitGraph(
+                    self,
+                    runtime,
+                    in_order_queue,
+                    num_kernels,
+                    measure_completion_time,
+                    use_events,
+                    useHostTasks=0,
+                )
+            )
+            if runtime == RUNTIMES.SYCL:
+                # Create CPU count variant
+                benches.append(
+                    GraphApiSubmitGraph(
+                        self,
+                        runtime,
+                        in_order_queue,
+                        num_kernels,
+                        measure_completion_time,
+                        use_events,
+                        useHostTasks=0,
+                        profiler_type=PROFILERS.CPU_COUNTER,
+                    )
+                )

         # Add other benchmarks
         benches += [
@@ -244,7 +297,7 @@ def benchmarks(self) -> list[Benchmark]:
             GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 0, "Llama"),
             GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 1, "Llama"),
         ]
-        for profiler_type in profiler_types:
+        for profiler_type in list(PROFILERS):
            benches.append(
                QueueInOrderMemcpy(self, 0, "Device", "Device", 1024, profiler_type)
            )
@@ -310,7 +363,12 @@ def parse_unit_type(compute_unit):

 class ComputeBenchmark(Benchmark):
     def __init__(
-        self, bench, name, test, runtime: RUNTIMES = None, profiler_type: str = ""
+        self,
+        bench,
+        name,
+        test,
+        runtime: RUNTIMES = None,
+        profiler_type: PROFILERS = PROFILERS.TIMER,
     ):
         super().__init__(bench.directory, bench)
         self.bench = bench
@@ -478,7 +536,7 @@ def __init__(
         MeasureCompletion=0,
         UseEvents=0,
         KernelExecTime=1,
-        profiler_type="",
+        profiler_type=PROFILERS.TIMER,
     ):
         self.ioq = ioq
         self.MeasureCompletion = MeasureCompletion
@@ -578,7 +636,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--UseEvents={self.UseEvents}",
         ]
         if self.runtime == RUNTIMES.SYCL:
-            bin_args.append(f"--profilerType={self.profiler_type}")
+            bin_args.append(f"--profilerType={self.profiler_type.value}")
         return bin_args

     def get_metadata(self) -> dict[str, BenchmarkMetadata]:
@@ -647,7 +705,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--dst={self.destination}",
             f"--size={self.size}",
             "--withCopyOffload=0",
-            f"--profilerType={self.profiler_type}",
+            f"--profilerType={self.profiler_type.value}",
         ]

@@ -693,7 +751,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--size={self.size}",
             "--count=100",
             "--withCopyOffload=0",
-            f"--profilerType={self.profiler_type}",
+            f"--profilerType={self.profiler_type.value}",
         ]

@@ -731,7 +789,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--sourcePlacement={self.source}",
             f"--destinationPlacement={self.destination}",
             f"--size={self.size}",
-            f"--profilerType={self.profiler_type}",
+            f"--profilerType={self.profiler_type.value}",
         ]

@@ -970,9 +1028,9 @@ def __init__(
         inOrderQueue,
         numKernels,
         measureCompletionTime,
-        profiler_type,
         useEvents,
         useHostTasks,
+        profiler_type=PROFILERS.TIMER,
     ):
         self.inOrderQueue = inOrderQueue
         self.numKernels = numKernels
@@ -1037,12 +1095,14 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--UseHostTasks={self.useHostTasks}",
         ]
         if self.runtime == RUNTIMES.SYCL:
-            bin_args.append(f"--profilerType={self.profiler_type}")
+            bin_args.append(f"--profilerType={self.profiler_type.value}")
         return bin_args


 class UllsEmptyKernel(ComputeBenchmark):
-    def __init__(self, bench, runtime: RUNTIMES, wgc, wgs, profiler_type):
+    def __init__(
+        self, bench, runtime: RUNTIMES, wgc, wgs, profiler_type=PROFILERS.TIMER
+    ):
         self.wgc = wgc
         self.wgs = wgs
         # iterations per bin_args: --iterations=10000
@@ -1084,7 +1144,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--wgc={self.wgc}",
         ]
         if self.runtime == RUNTIMES.SYCL:
-            bin_args.append(f"--profilerType={self.profiler_type}")
+            bin_args.append(f"--profilerType={self.profiler_type.value}")
         return bin_args


llvm/lib/SYCLPostLink/ESIMDPostSplitProcessing.cpp

Lines changed: 13 additions & 6 deletions
@@ -27,6 +27,17 @@
 #include <string>
 #include <vector>

+#ifdef NDEBUG
+#define DUMP_ENTRY_POINTS(...)
+#else
+constexpr int DebugESIMDPostSplit = 0;
+
+#define DUMP_ENTRY_POINTS(...)                                                 \
+  if (DebugESIMDPostSplit > 0) {                                               \
+    llvm::module_split::dumpEntryPoints(__VA_ARGS__);                          \
+  }
+#endif // NDEBUG
+
 using namespace llvm;
 using namespace llvm::module_split;

@@ -124,9 +135,7 @@ llvm::sycl::handleESIMD(ModuleDesc MDesc,
   SplitOccurred |= Result.size() > 1;

   for (ModuleDesc &MD : Result) {
-#ifdef LLVM_ENABLE_DUMP
-    dumpEntryPoints(MD.entries(), MD.Name.c_str(), 4);
-#endif // LLVM_ENABLE_DUMP
+    DUMP_ENTRY_POINTS(MD.entries(), MD.Name.c_str(), 4);
     if (Options.LowerESIMD && MD.isESIMD())
       Modified |= lowerESIMDConstructs(MD, Options);
   }
@@ -155,9 +164,7 @@ llvm::sycl::handleESIMD(ModuleDesc MDesc,
   Linked.rebuildEntryPoints(Names);
   Result.clear();
   Result.emplace_back(std::move(Linked));
-#ifdef LLVM_ENABLE_DUMP
-  dumpEntryPoints(Result.back().entries(), Result.back().Name.c_str(), 4);
-#endif // LLVM_ENABLE_DUMP
+  DUMP_ENTRY_POINTS(Result.back().entries(), Result.back().Name.c_str(), 4);
   Modified = true;

   return std::move(Result);
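The DUMP_ENTRY_POINTS macro above replaces the old #ifdef LLVM_ENABLE_DUMP guards with a call that still compiles in assertion-enabled builds but only executes when the local DebugESIMDPostSplit constant is raised, so the dump calls cannot silently bit-rot. A minimal, self-contained sketch of that pattern, using a hypothetical dumpThing helper in place of dumpEntryPoints:

#include <cstdio>

// Same shape as DUMP_ENTRY_POINTS above; dumpThing is a stand-in helper,
// not an LLVM API.
#ifdef NDEBUG
#define DUMP_THING(...)
#else
constexpr int DebugDumps = 0; // flip to 1 locally to enable the dumps

static void dumpThing(const char *Tag, int Value) {
  std::printf("%s = %d\n", Tag, Value);
}

#define DUMP_THING(...)                                                        \
  if (DebugDumps > 0) {                                                        \
    dumpThing(__VA_ARGS__);                                                    \
  }
#endif // NDEBUG

int main() {
  int Result = 42;
  // The call site always compiles in non-NDEBUG builds (so it keeps up with
  // signature changes) but only executes when DebugDumps > 0; NDEBUG builds
  // compile it away entirely.
  DUMP_THING("result", Result);
  return 0;
}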
