
Commit 493494d

Merge remote-tracking branch 'upstream/sycl' into ci/update_gpu_driver-linux-25.35.35096.9
2 parents: fd72eeb + 778ddeb

117 files changed: 1130 additions & 712 deletions


.github/workflows/sycl-linux-precommit.yml

Lines changed: 36 additions & 0 deletions
@@ -62,6 +62,42 @@ jobs:
       e2e_binaries_artifact: e2e_bin
       e2e_binaries_preview_artifact: e2e_bin_preview

+  # Build and run native cpu e2e tests separately as cannot currently
+  # build all the e2e tests
+  build_run_native_cpu_e2e_tests:
+    if: ${{ always() && !cancelled() && needs.build.outputs.build_conclusion == 'success' }}
+    runs-on: [Linux, build]
+    needs: [build]
+    container:
+      image: ghcr.io/intel/llvm/sycl_ubuntu2404_nightly:latest
+      options: -u 1001:1001
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          sparse-checkout: |
+            devops/
+
+      # download build artefact
+      - name: Download toolchain
+        uses: actions/download-artifact@v4
+        with:
+          name: sycl_linux_default
+      - name: Extract SYCL toolchain
+        shell: bash
+        run: |
+          mkdir toolchain
+          tar -xf llvm_sycl.tar.zst -C toolchain
+          rm llvm_sycl.tar.zst
+      - name: Build and run E2E tests
+        uses: ./devops/actions/run-tests/e2e
+        with:
+          ref: ${{ inputs.ref || github.sha }}
+          testing_mode: build-only
+          target_devices: native_cpu
+          sycl_compiler: $GITHUB_WORKSPACE/toolchain/bin/clang++
+          extra_lit_opts: --param sycl_build_targets="native_cpu"
+          extra_cmake_args: -DSYCL_TEST_E2E_TARGETS="native_cpu:cpu" -DSYCL_TEST_E2E_STANDALONE=ON
+
   # If a PR changes CUDA adapter, run the build on Ubuntu 22.04 as well.
   # Ubuntu 22.04 container has CUDA 12.1 installed while Ubuntu 24.0 image
   # has CUDA 12.6.1 installed.

devops/compat_ci_exclude.sycl-rel-6_3

Lines changed: 7 additions & 0 deletions
@@ -9,6 +9,13 @@ Matrix/joint_matrix_bf16_fill_k_cache_runtime_dim.cpp
 # See GSD-11097.
 Assert/assert_in_kernels.cpp

+# https://github.com/intel/llvm/pull/20159 prevents returning last event as an
+# optimization for submitting barrier to an empty IOQ. However, the test
+# actually checks whether last event is returned or not, so it needs to be
+# updated to match the new behavior. ext_oneapi_submit_barrier spec doesn't
+# require last event to be returned, so this is not an ABI break.
+InorderQueue/in_order_ext_oneapi_submit_barrier.cpp
+
 # Likely OK, but need author to provide justification, get approval/confirmation
 # from someone:
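For context, the exclusion above concerns which event ext_oneapi_submit_barrier returns when an in-order queue has nothing pending. A rough, hypothetical sketch of the kind of check involved (not the actual source of InorderQueue/in_order_ext_oneapi_submit_barrier.cpp) could look like this:

#include <sycl/sycl.hpp>

int main() {
  sycl::queue q{sycl::property::queue::in_order{}};

  // Submit some work, then let the queue drain so it is effectively empty.
  sycl::event prev = q.single_task([] {});
  q.wait();

  // Barrier submitted to the now-empty in-order queue. After the change
  // referenced above, the returned event is no longer guaranteed to compare
  // equal to the queue's last event, and the extension spec does not
  // require that it does.
  sycl::event barrier_ev = q.ext_oneapi_submit_barrier();

  // A check of this form is what the excluded test needs to update.
  return barrier_ev == prev ? 0 : 1;
}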

devops/scripts/benchmarks/benches/compute.py

Lines changed: 123 additions & 63 deletions
@@ -3,6 +3,7 @@
 # See LICENSE.TXT
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

+from itertools import product
 import os
 import csv
 import io
@@ -24,6 +25,11 @@ class RUNTIMES(Enum):
     UR = "ur"


+class PROFILERS(Enum):
+    TIMER = "timer"
+    CPU_COUNTER = "cpuCounter"
+
+
 def runtime_to_name(runtime: RUNTIMES) -> str:
     return {
         RUNTIMES.SYCL_PREVIEW: "SYCL Preview",
@@ -171,69 +177,116 @@ def benchmarks(self) -> list[Benchmark]:

         # hand-picked value so that total execution time of the benchmark is
         # similar on all architectures
-        long_lernel_exec_time_ioq = [20]
+        long_kernel_exec_time_ioq = [20]
         # For BMG server, a new value 200 is used, but we have to create metadata
         # for both values to keep the dashboard consistent.
         # See SubmitKernel.enabled()
         long_kernel_exec_time_ooo = [20, 200]

-        # The Combo Profiler is available only for selected sycl benchmarks
-        profiler_types = ["timer", "cpuCounter"]
-
-        for runtime in list(RUNTIMES):
-            # Add SubmitKernel benchmarks using loops
-            for in_order_queue in [0, 1]:
-                for measure_completion in [0, 1]:
-                    for use_events in [0, 1]:
-                        long_kernel_exec_time = (
-                            long_lernel_exec_time_ioq
-                            if in_order_queue
-                            else long_kernel_exec_time_ooo
-                        )
-                        for kernel_exec_time in [1, *long_kernel_exec_time]:
-                            for profiler_type in profiler_types:
-                                benches.append(
-                                    SubmitKernel(
-                                        self,
-                                        runtime,
-                                        in_order_queue,
-                                        measure_completion,
-                                        use_events,
-                                        kernel_exec_time,
-                                        profiler_type,
-                                    )
-                                )
-
-            # Add SinKernelGraph benchmarks
-            for with_graphs in [0, 1]:
-                for num_kernels in [5, 100]:
+        submit_kernel_params = product(
+            list(RUNTIMES),
+            [0, 1],  # in_order_queue
+            [0, 1],  # measure_completion
+            [0, 1],  # use_events
+        )
+        for (
+            runtime,
+            in_order_queue,
+            measure_completion,
+            use_events,
+        ) in submit_kernel_params:
+            long_kernel_exec_time = (
+                long_kernel_exec_time_ioq
+                if in_order_queue
+                else long_kernel_exec_time_ooo
+            )
+            for kernel_exec_time in [1, *long_kernel_exec_time]:
+                benches.append(
+                    SubmitKernel(
+                        self,
+                        runtime,
+                        in_order_queue,
+                        measure_completion,
+                        use_events,
+                        kernel_exec_time,
+                    )
+                )
+                if runtime == RUNTIMES.SYCL:
+                    # Create CPU count variant
                     benches.append(
-                        GraphApiSinKernelGraph(self, runtime, with_graphs, num_kernels)
+                        SubmitKernel(
+                            self,
+                            runtime,
+                            in_order_queue,
+                            measure_completion,
+                            use_events,
+                            kernel_exec_time,
+                            profiler_type=PROFILERS.CPU_COUNTER,
+                        )
                     )

+        # Add SinKernelGraph benchmarks
+        sin_kernel_graph_params = product(
+            list(RUNTIMES),
+            [0, 1],  # with_graphs
+            [5, 100],  # num_kernels
+        )
+        for runtime, with_graphs, num_kernels in sin_kernel_graph_params:
+            benches.append(
+                GraphApiSinKernelGraph(self, runtime, with_graphs, num_kernels)
+            )
+
         # Add ULLS benchmarks
-        for profiler_type in profiler_types:
-            benches.append(UllsEmptyKernel(self, runtime, 1000, 256, profiler_type))
+        for runtime in list(RUNTIMES):
+            if runtime == RUNTIMES.SYCL:
+                benches.append(
+                    UllsEmptyKernel(
+                        self, runtime, 1000, 256, profiler_type=PROFILERS.CPU_COUNTER
+                    )
+                )
+            benches.append(UllsEmptyKernel(self, runtime, 1000, 256))
             benches.append(UllsKernelSwitch(self, runtime, 8, 200, 0, 0, 1, 1))

-        # Add GraphApiSubmitGraph benchmarks
-        for in_order_queue in [0, 1]:
-            for num_kernels in self.submit_graph_num_kernels:
-                for measure_completion_time in [0, 1]:
-                    for use_events in [0, 1]:
-                        for profiler_type in profiler_types:
-                            benches.append(
-                                GraphApiSubmitGraph(
-                                    self,
-                                    runtime,
-                                    in_order_queue,
-                                    num_kernels,
-                                    measure_completion_time,
-                                    profiler_type,
-                                    use_events,
-                                    useHostTasks=0,
-                                )
-                            )
+        # Add GraphApiSubmitGraph benchmarks
+        submit_graph_params = product(
+            list(RUNTIMES),
+            [0, 1],  # in_order_queue
+            self.submit_graph_num_kernels,
+            [0, 1],  # measure_completion_time
+            [0, 1],  # use_events
+        )
+        for (
+            runtime,
+            in_order_queue,
+            num_kernels,
+            measure_completion_time,
+            use_events,
+        ) in submit_graph_params:
+            benches.append(
+                GraphApiSubmitGraph(
+                    self,
+                    runtime,
+                    in_order_queue,
+                    num_kernels,
+                    measure_completion_time,
+                    use_events,
+                    useHostTasks=0,
+                )
+            )
+            if runtime == RUNTIMES.SYCL:
+                # Create CPU count variant
+                benches.append(
+                    GraphApiSubmitGraph(
+                        self,
+                        runtime,
+                        in_order_queue,
+                        num_kernels,
+                        measure_completion_time,
+                        use_events,
+                        useHostTasks=0,
+                        profiler_type=PROFILERS.CPU_COUNTER,
+                    )
+                )

         # Add other benchmarks
         benches += [
@@ -244,7 +297,7 @@ def benchmarks(self) -> list[Benchmark]:
             GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 0, "Llama"),
             GraphApiFinalizeGraph(self, RUNTIMES.SYCL, 1, "Llama"),
         ]
-        for profiler_type in profiler_types:
+        for profiler_type in list(PROFILERS):
            benches.append(
                QueueInOrderMemcpy(self, 0, "Device", "Device", 1024, profiler_type)
            )
@@ -310,7 +363,12 @@ def parse_unit_type(compute_unit):

 class ComputeBenchmark(Benchmark):
     def __init__(
-        self, bench, name, test, runtime: RUNTIMES = None, profiler_type: str = ""
+        self,
+        bench,
+        name,
+        test,
+        runtime: RUNTIMES = None,
+        profiler_type: PROFILERS = PROFILERS.TIMER,
     ):
         super().__init__(bench.directory, bench)
         self.bench = bench
@@ -478,7 +536,7 @@ def __init__(
         MeasureCompletion=0,
         UseEvents=0,
         KernelExecTime=1,
-        profiler_type="",
+        profiler_type=PROFILERS.TIMER,
     ):
         self.ioq = ioq
         self.MeasureCompletion = MeasureCompletion
@@ -578,7 +636,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--UseEvents={self.UseEvents}",
         ]
         if self.runtime == RUNTIMES.SYCL:
-            bin_args.append(f"--profilerType={self.profiler_type}")
+            bin_args.append(f"--profilerType={self.profiler_type.value}")
         return bin_args

     def get_metadata(self) -> dict[str, BenchmarkMetadata]:
@@ -647,7 +705,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--dst={self.destination}",
             f"--size={self.size}",
             "--withCopyOffload=0",
-            f"--profilerType={self.profiler_type}",
+            f"--profilerType={self.profiler_type.value}",
         ]

@@ -693,7 +751,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--size={self.size}",
             "--count=100",
             "--withCopyOffload=0",
-            f"--profilerType={self.profiler_type}",
+            f"--profilerType={self.profiler_type.value}",
         ]

@@ -731,7 +789,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--sourcePlacement={self.source}",
             f"--destinationPlacement={self.destination}",
             f"--size={self.size}",
-            f"--profilerType={self.profiler_type}",
+            f"--profilerType={self.profiler_type.value}",
         ]

@@ -970,9 +1028,9 @@ def __init__(
         inOrderQueue,
         numKernels,
         measureCompletionTime,
-        profiler_type,
         useEvents,
         useHostTasks,
+        profiler_type=PROFILERS.TIMER,
     ):
         self.inOrderQueue = inOrderQueue
         self.numKernels = numKernels
@@ -1037,12 +1095,14 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--UseHostTasks={self.useHostTasks}",
         ]
         if self.runtime == RUNTIMES.SYCL:
-            bin_args.append(f"--profilerType={self.profiler_type}")
+            bin_args.append(f"--profilerType={self.profiler_type.value}")
         return bin_args


 class UllsEmptyKernel(ComputeBenchmark):
-    def __init__(self, bench, runtime: RUNTIMES, wgc, wgs, profiler_type):
+    def __init__(
+        self, bench, runtime: RUNTIMES, wgc, wgs, profiler_type=PROFILERS.TIMER
+    ):
         self.wgc = wgc
         self.wgs = wgs
         # iterations per bin_args: --iterations=10000
@@ -1084,7 +1144,7 @@ def bin_args(self, run_trace: TracingType = TracingType.NONE) -> list[str]:
             f"--wgc={self.wgc}",
         ]
         if self.runtime == RUNTIMES.SYCL:
-            bin_args.append(f"--profilerType={self.profiler_type}")
+            bin_args.append(f"--profilerType={self.profiler_type.value}")
         return bin_args


llvm/lib/SYCLPostLink/ESIMDPostSplitProcessing.cpp

Lines changed: 13 additions & 6 deletions
@@ -27,6 +27,17 @@
 #include <string>
 #include <vector>

+#ifdef NDEBUG
+#define DUMP_ENTRY_POINTS(...)
+#else
+constexpr int DebugESIMDPostSplit = 0;
+
+#define DUMP_ENTRY_POINTS(...)                                                 \
+  if (DebugESIMDPostSplit > 0) {                                               \
+    llvm::module_split::dumpEntryPoints(__VA_ARGS__);                          \
+  }
+#endif // NDEBUG
+
 using namespace llvm;
 using namespace llvm::module_split;

@@ -124,9 +135,7 @@ llvm::sycl::handleESIMD(ModuleDesc MDesc,
   SplitOccurred |= Result.size() > 1;

   for (ModuleDesc &MD : Result) {
-#ifdef LLVM_ENABLE_DUMP
-    dumpEntryPoints(MD.entries(), MD.Name.c_str(), 4);
-#endif // LLVM_ENABLE_DUMP
+    DUMP_ENTRY_POINTS(MD.entries(), MD.Name.c_str(), 4);
     if (Options.LowerESIMD && MD.isESIMD())
       Modified |= lowerESIMDConstructs(MD, Options);
   }
@@ -155,9 +164,7 @@ llvm::sycl::handleESIMD(ModuleDesc MDesc,
   Linked.rebuildEntryPoints(Names);
   Result.clear();
   Result.emplace_back(std::move(Linked));
-#ifdef LLVM_ENABLE_DUMP
-  dumpEntryPoints(Result.back().entries(), Result.back().Name.c_str(), 4);
-#endif // LLVM_ENABLE_DUMP
+  DUMP_ENTRY_POINTS(Result.back().entries(), Result.back().Name.c_str(), 4);
   Modified = true;

   return std::move(Result);
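The DUMP_ENTRY_POINTS macro above replaces the old #ifdef LLVM_ENABLE_DUMP guards with a call that still compiles in assertion-enabled builds but only executes when the local DebugESIMDPostSplit constant is raised, so the dump calls cannot silently bit-rot. A minimal, self-contained sketch of that pattern, using a hypothetical dumpThing helper in place of dumpEntryPoints:

#include <cstdio>

// Same shape as DUMP_ENTRY_POINTS above; dumpThing is a stand-in helper,
// not an LLVM API.
#ifdef NDEBUG
#define DUMP_THING(...)
#else
constexpr int DebugDumps = 0; // flip to 1 locally to enable the dumps

static void dumpThing(const char *Tag, int Value) {
  std::printf("%s = %d\n", Tag, Value);
}

#define DUMP_THING(...)                                                        \
  if (DebugDumps > 0) {                                                        \
    dumpThing(__VA_ARGS__);                                                    \
  }
#endif // NDEBUG

int main() {
  int Result = 42;
  // The call site always compiles in non-NDEBUG builds (so it keeps up with
  // signature changes) but only executes when DebugDumps > 0; NDEBUG builds
  // compile it away entirely.
  DUMP_THING("result", Result);
  return 0;
}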
