From a8563511ba2b0b8011536200a91d5cf19e015f7c Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Fri, 10 Jan 2025 15:23:41 +0000 Subject: [PATCH 01/15] Rename oneMKL Interface to oneMath --- docs/backend/SYCL.md | 41 +++++++++++-------------------- ggml/src/ggml-sycl/CMakeLists.txt | 20 ++++++++++----- 2 files changed, 29 insertions(+), 32 deletions(-) diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 5da439e94e092..53d95dc57bbd7 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -20,7 +20,7 @@ **oneAPI** is an open ecosystem and a standard-based specification, supporting multiple architectures including but not limited to intel CPUs, GPUs and FPGAs. The key components of the oneAPI ecosystem include: - **DPCPP** *(Data Parallel C++)*: The primary oneAPI SYCL implementation, which includes the icpx/icx Compilers. -- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. oneMKL and oneDNN)*. +- **oneAPI Libraries**: A set of highly optimized libraries targeting multiple domains *(e.g. Intel oneMKL, oneMath and oneDNN)*. - **oneAPI LevelZero**: A high performance low level interface for fine-grained control over intel iGPUs and dGPUs. - **Nvidia & AMD Plugins**: These are plugins extending oneAPI's DPCPP support to SYCL on Nvidia and AMD GPU targets. @@ -228,27 +228,27 @@ Upon a successful installation, SYCL is enabled for the available intel devices, **oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup. -**oneMKL for cuBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *cuBLAS* backend enabled is thus required to run it on Nvidia GPUs. +**oneMath for cuBlas**: The current Intel oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. [oneMath](https://github.com/uxlfoundation/oneMath) is used instead to dispatch to *cuBLAS* on Nvidia GPUs. ```sh -git clone https://github.com/oneapi-src/oneMKL -cd oneMKL -cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas -cmake --build buildWithCublas --config Release +git clone https://github.com/uxlfoundation/oneMath +cd oneMath +cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas -DCMAKE_INSTALL_PREFIX:PATH=install +cmake --build buildWithCublas --target install --config Release ``` - **Adding support to AMD GPUs** **oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit. -**oneMKL for rocBlas**: The current oneMKL releases *(shipped with the oneAPI base-toolkit)* doesn't contain the rocBLAS backend. 
A build from source of the upstream [oneMKL](https://github.com/oneapi-src/oneMKL) with the *rocBLAS* backend enabled is thus required to run it on AMD GPUs. +**oneMath for rocBlas**: The current Intel oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the rocBLAS backend. [oneMath](https://github.com/uxlfoundation/oneMath) is used instead to dispatch to *rocBLAS* on AMD GPUs. ```sh -git clone https://github.com/oneapi-src/oneMKL -cd oneMKL -# Find your HIPTARGET with rocminfo, under the key 'Name:' -cmake -B buildWithrocBLAS -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_ROCBLAS_BACKEND=ON -DHIPTARGETS=${HIPTARGET} -DTARGET_DOMAINS=blas -cmake --build buildWithrocBLAS --config Release +git clone https://github.com/uxlfoundation/oneMath +cd oneMath +# Find your HIP_TARGETS with rocminfo, under the key 'Name:' +cmake -B buildWithrocBLAS -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_ROCBLAS_BACKEND=ON -DHIP_TARGETS=${HIP_TARGETS} -DTARGET_DOMAINS=blas -DCMAKE_INSTALL_PREFIX:PATH=install +cmake --build buildWithrocBLAS --target install --config Release ``` 3. **Verify installation and environment** @@ -316,21 +316,15 @@ cmake --build build --config Release -j -v #### Nvidia GPU ```sh -# Export relevant ENV variables -export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LD_LIBRARY_PATH -export LIBRARY_PATH=/path/to/oneMKL/buildWithCublas/lib:$LIBRARY_PATH -export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithCublas/include:$CPLUS_INCLUDE_DIR -export CPLUS_INCLUDE_DIR=/path/to/oneMKL/include:$CPLUS_INCLUDE_DIR - # Build LLAMA with Nvidia BLAS acceleration through SYCL # Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture # Option 1: Use FP32 (recommended for better performance in most cases) -cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx +cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DoneMath_DIR=/path/to/oneMath/buildWithCublas/install/lib/cmake/oneMath # Option 2: Use FP16 -cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON +cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DoneMath_DIR=/path/to/oneMath/buildWithCublas/install/lib/cmake/oneMath -DGGML_SYCL_F16=ON # build all binary cmake --build build --config Release -j -v @@ -339,18 +333,13 @@ cmake --build build --config Release -j -v #### AMD GPU ```sh -# Export relevant ENV variables -export LD_LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LD_LIBRARY_PATH -export LIBRARY_PATH=/path/to/oneMKL/buildWithrocBLAS/lib:$LIBRARY_PATH -export CPLUS_INCLUDE_DIR=/path/to/oneMKL/buildWithrocBLAS/include:$CPLUS_INCLUDE_DIR - # Build LLAMA with rocBLAS acceleration through SYCL ## AMD # Use FP32, FP16 is not supported # Find your GGML_SYCL_DEVICE_ARCH with rocminfo, under the key 'Name:' GGML_SYCL_DEVICE_ARCH=gfx90a # Example architecture -cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx 
-DCMAKE_CXX_COMPILER=icpx +cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DoneMath_DIR=/path/to/oneMath/buildWithrocBLAS/install/lib/cmake/oneMath # build all binary cmake --build build --config Release -j -v diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index 3ad044432a27d..77037173eb856 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -69,18 +69,26 @@ else() if (GGML_SYCL_TARGET STREQUAL "INTEL") target_link_libraries(ggml-sycl PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") add_compile_definitions(GGML_SYCL_NVIDIA) - target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl_blas_cublas) + find_package(oneMath REQUIRED) + target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl ONEMATH::onemath_blas_cublas) + target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda") + # Disable warnings for using deprecated oneapi::mkl namespace in oneMath + # Using the deprecated API in oneMath is useful to have a similar API than Intel oneMKL + target_compile_options(ggml-sycl PRIVATE "-Wno-deprecated-declarations") elseif (GGML_SYCL_TARGET STREQUAL "AMD") if (NOT GGML_SYCL_DEVICE_ARCH) message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.") endif() - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=amdgcn-amd-amdhsa") - target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemkl) + find_package(oneMath REQUIRED) + target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemath) + target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa") + # Disable warnings for using deprecated oneapi::mkl namespace in oneMath + # Using the deprecated API in oneMath is useful to have a similar API than Intel oneMKL + target_compile_options(ggml-sycl PRIVATE "-Wno-deprecated-declarations") endif() if (GGML_SYCL_DEVICE_ARCH) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH}") - endif() + target_compile_options(ggml-sycl PRIVATE "-Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH}") + endif() endif() From dccc9f87951afe20a679382eb985345e9f9e743d Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Thu, 23 Jan 2025 14:56:52 +0000 Subject: [PATCH 02/15] Use oneMath for Intel vendor --- docs/backend/SYCL.md | 2 +- examples/sycl/README.md | 2 +- ggml/cmake/ggml-config.cmake.in | 2 +- ggml/src/ggml-sycl/CMakeLists.txt | 104 +++++++++++++++++++++--------- 4 files changed, 77 insertions(+), 33 deletions(-) diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 53d95dc57bbd7..99e7fdc0aefac 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -648,7 +648,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512 | Name | Value | Function | |--------------------|---------------------------------------|---------------------------------------------| | GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.
FP32 path - recommended for better performance than FP16 on quantized model|
-| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. |
+| GGML_SYCL_TARGET | INTEL *(default)* \| INTEL_CPU \| INTEL_GPU \| NVIDIA \| AMD | Set the SYCL target device type. |
 | GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
 | GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
 | CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
diff --git a/examples/sycl/README.md b/examples/sycl/README.md
index 8819d87f56ec2..ec82ddae36835 100644
--- a/examples/sycl/README.md
+++ b/examples/sycl/README.md
@@ -14,7 +14,7 @@ List all SYCL devices with ID, compute capability, max work group size, etc.

 1. Build the llama.cpp for SYCL for the specified target *(using GGML_SYCL_TARGET)*.

-2. Enable oneAPI running environment *(if GGML_SYCL_TARGET is set to INTEL -default-)*
+2. Enable oneAPI running environment *(if GGML_SYCL_TARGET is set to INTEL -default-, INTEL_CPU or INTEL_GPU)*

 ```
 source /opt/intel/oneapi/setvars.sh
diff --git a/ggml/cmake/ggml-config.cmake.in b/ggml/cmake/ggml-config.cmake.in
index 823eb797b7007..cbdfed567d1d8 100644
--- a/ggml/cmake/ggml-config.cmake.in
+++ b/ggml/cmake/ggml-config.cmake.in
@@ -78,7 +78,7 @@ if (NOT GGML_SHARED_LIB)

     if (GGML_SYCL)
         find_package(DNNL)
-        if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
+        if (${DNNL_FOUND} AND GGML_SYCL_TARGET MATCHES "INTEL.*")
             list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES DNNL::dnnl)
         endif()
         if (WIN32)
diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt
index 77037173eb856..4259132431398 100644
--- a/ggml/src/ggml-sycl/CMakeLists.txt
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@@ -1,6 +1,6 @@
 message(STATUS "GGML_SYCL_TARGET=${GGML_SYCL_TARGET}")

-if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$")
+if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL.*|NVIDIA|AMD)$")
     message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD")
 endif()
@@ -30,8 +30,6 @@ if (GGML_SYCL_F16)
     add_compile_definitions(GGML_SYCL_F16)
 endif()

-set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
-
 if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
     add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
 elseif (GGML_SYCL_TARGET STREQUAL "AMD")
@@ -51,44 +49,90 @@ target_sources(ggml-sycl PRIVATE ${GGML_HEADERS_SYCL} ${GGML_SOURCES_SYCL})
 find_package(DNNL)
 message("-- DNNL found:" ${DNNL_FOUND})

-if (GGML_SYCL_TARGET STREQUAL "INTEL")
+if (GGML_SYCL_TARGET MATCHES "INTEL.*")
     add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND})
 else()
     add_compile_definitions(GGML_SYCL_DNNL=0)
 endif()

-if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
+if (${DNNL_FOUND} AND GGML_SYCL_TARGET MATCHES "INTEL.*")
     target_link_libraries(ggml-sycl PRIVATE DNNL::dnnl)
 endif()

-if (WIN32)
-    find_package(IntelSYCL REQUIRED)
-    find_package(MKL REQUIRED)
-    target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
+find_package(IntelSYCL)
+if (IntelSYCL_FOUND)
+    # Use oneAPI CMake when possible
+    target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX)
 else()
-    if (GGML_SYCL_TARGET STREQUAL "INTEL")
-
target_link_libraries(ggml-sycl PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) - elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA") - add_compile_definitions(GGML_SYCL_NVIDIA) - find_package(oneMath REQUIRED) - target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl ONEMATH::onemath_blas_cublas) - target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda") - # Disable warnings for using deprecated oneapi::mkl namespace in oneMath - # Using the deprecated API in oneMath is useful to have a similar API than Intel oneMKL - target_compile_options(ggml-sycl PRIVATE "-Wno-deprecated-declarations") + # Fallback to the simplest way of enabling SYCL when using intel/llvm nightly for instance + target_compile_options(ggml-sycl PRIVATE "-fsycl") + target_link_options(ggml-sycl PRIVATE "-fsycl") +endif() + +target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing") + +find_package(oneMath QUIET) +if (NOT oneMath_FOUND) + message("-- oneMath not found: oneMath will be automatically downloaded") + # Use FetchContent to automatically pull and build oneMath + include(FetchContent) + set(BUILD_FUNCTIONAL_TESTS False) + set(BUILD_EXAMPLES False) + set(TARGET_DOMAINS blas) + if (GGML_SYCL_TARGET STREQUAL "NVIDIA") + set(ENABLE_MKLCPU_BACKEND False) + set(ENABLE_MKLGPU_BACKEND False) + set(ENABLE_CUBLAS_BACKEND True) elseif (GGML_SYCL_TARGET STREQUAL "AMD") - if (NOT GGML_SYCL_DEVICE_ARCH) - message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.") - endif() - find_package(oneMath REQUIRED) - target_link_libraries(ggml-sycl PRIVATE sycl pthread m dl onemath) - target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa") - # Disable warnings for using deprecated oneapi::mkl namespace in oneMath - # Using the deprecated API in oneMath is useful to have a similar API than Intel oneMKL - target_compile_options(ggml-sycl PRIVATE "-Wno-deprecated-declarations") + set(ENABLE_MKLCPU_BACKEND False) + set(ENABLE_MKLGPU_BACKEND False) + set(ENABLE_ROCBLAS_BACKEND True) endif() + FetchContent_Declare( + ONEMATH + GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git + GIT_TAG develop + ) + FetchContent_MakeAvailable(ONEMATH) + # Create alias to match with find_package targets name + function(onemath_alias target) + if (TARGET ${target}) + add_library(ONEMATH::${target} ALIAS ${target}) + endif() + endfunction() + onemath_alias(onemath_blas_mklcpu) + onemath_alias(onemath_blas_mklgpu) + onemath_alias(onemath_blas_cublas) + onemath_alias(onemath_blas_rocblas) +endif() - if (GGML_SYCL_DEVICE_ARCH) - target_compile_options(ggml-sycl PRIVATE "-Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH}") +# Below oneMath compile-time dispatching is used for better performance +if (GGML_SYCL_TARGET STREQUAL "INTEL_CPU") + target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_mklcpu) + target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_INTEL GGML_SYCL_INTEL_CPU) +elseif (GGML_SYCL_TARGET STREQUAL "INTEL_GPU") + target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_mklgpu) + target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_INTEL GGML_SYCL_INTEL_GPU) +elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA") + target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_cublas) + target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda") + target_link_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda") + target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_NVIDIA) +elseif 
(GGML_SYCL_TARGET STREQUAL "AMD")
+    if (NOT GGML_SYCL_DEVICE_ARCH)
+        message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.")
+    endif()
+    target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_rocblas)
+    target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
+    target_link_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa")
+    target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_AMD)
+else()
+    # Fallback to oneMath runtime dispatcher
+    target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath)
+    target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GENERIC)
+endif()
+
+if (GGML_SYCL_DEVICE_ARCH)
+    target_compile_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
+    target_link_options(ggml-sycl PRIVATE -Xsycl-target-backend --offload-arch=${GGML_SYCL_DEVICE_ARCH})
 endif()

From 3577bc00df4469921d4b61d67fa822999205f4c0 Mon Sep 17 00:00:00 2001
From: "romain.biessy"
Date: Wed, 29 Jan 2025 16:44:32 +0000
Subject: [PATCH 03/15] Rename occurrences of mkl to math

---
 ggml/src/ggml-sycl/CMakeLists.txt  |  1 +
 ggml/src/ggml-sycl/dpct/helper.hpp | 94 +++++++++++++++---------------
 ggml/src/ggml-sycl/ggml-sycl.cpp   | 21 +++----
 ggml/src/ggml-sycl/outprod.cpp     | 15 ++---
 4 files changed, 59 insertions(+), 72 deletions(-)

diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt
index 4259132431398..4e7eefc910ad9 100644
--- a/ggml/src/ggml-sycl/CMakeLists.txt
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@@ -100,6 +100,7 @@ if (NOT oneMath_FOUND)
             add_library(ONEMATH::${target} ALIAS ${target})
         endif()
     endfunction()
+    onemath_alias(onemath)
     onemath_alias(onemath_blas_mklcpu)
     onemath_alias(onemath_blas_mklgpu)
     onemath_alias(onemath_blas_cublas)
diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp
index c96395be61312..92882c56ca04c 100644
--- a/ggml/src/ggml-sycl/dpct/helper.hpp
+++ b/ggml/src/ggml-sycl/dpct/helper.hpp
@@ -16,7 +16,7 @@
 #include
 #include
 #include
-#include <oneapi/mkl.hpp>
+#include <oneapi/math.hpp>
 #include
 #include "ggml.h"
@@ -83,13 +83,36 @@ inline std::string get_device_backend_and_type(const sycl::device &device) {
 }

 template <typename Ts> struct matrix_info_t {
-    oneapi::mkl::transpose transpose_info[2];
+    oneapi::math::transpose transpose_info[2];
     Ts value_info[2];
     std::int64_t size_info[3];
     std::int64_t ld_info[3];
     std::int64_t groupsize_info;
 };

+inline auto get_onemath_backend(sycl::queue& queue)
+#ifdef GGML_SYCL_GENERIC
+    -> sycl::queue&
+#endif
+{
+// If the backend is known at compile-time, use oneMath backend_selector to use
+// compile-time dispatching and avoid the need to dlopen libraries. Otherwise
+// fallback to runtime dispatching.
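+// (Illustration: with GGML_SYCL_NVIDIA defined, the branch below returns
+// oneapi::math::backend_selector<oneapi::math::backend::cublas>{ queue }, so
+// every oneMath BLAS call made through it binds statically to the cuBLAS
+// backend. Only the GGML_SYCL_GENERIC case returns the plain queue, which is
+// why the trailing return type above is guarded by that macro.)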
+#if defined(GGML_SYCL_INTEL_CPU)
+    return oneapi::math::backend_selector<oneapi::math::backend::mklcpu>{ queue };
+#elif defined(GGML_SYCL_INTEL_GPU)
+    return oneapi::math::backend_selector<oneapi::math::backend::mklgpu>{ queue };
+#elif defined(GGML_SYCL_NVIDIA)
+    return oneapi::math::backend_selector<oneapi::math::backend::cublas>{ queue };
+#elif defined(GGML_SYCL_AMD)
+    return oneapi::math::backend_selector<oneapi::math::backend::rocblas>{ queue };
+#elif defined(GGML_SYCL_GENERIC)
+    return queue;
+#else
+    static_assert(false, "Unsupported backend");
+#endif
+}
+
 namespace dpct
 {
     typedef sycl::queue *queue_ptr;
@@ -1687,8 +1710,8 @@ namespace dpct
     namespace detail
     {
         template <typename Ta, typename Tb, typename Tc, typename Ts>
-        inline void gemm_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                              oneapi::mkl::transpose b_trans, int m, int n, int k,
+        inline void gemm_impl(sycl::queue &q, oneapi::math::transpose a_trans,
+                              oneapi::math::transpose b_trans, int m, int n, int k,
                               const void *alpha, const void *a, int lda, const void *b,
                               int ldb, const void *beta, void *c, int ldc)
         {
@@ -1697,14 +1720,8 @@
            auto data_a = get_memory<const Ta>(a);
            auto data_b = get_memory<const Tb>(b);
            auto data_c = get_memory<Tc>(c);
-#ifdef GGML_SYCL_NVIDIA
-            oneapi::mkl::blas::column_major::gemm(oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q },
-                                                  a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb,
-                                                  beta_value, data_c, ldc);
-#else
-            oneapi::mkl::blas::column_major::gemm(q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb,
+            oneapi::math::blas::column_major::gemm(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb,
                                                    beta_value, data_c, ldc);
-#endif
         }

         template <typename VecT, class BinaryOperation>
         class vectorized_binary
@@ -1735,7 +1752,7 @@
         };

         template <typename Ta, typename Tb, typename Tc, typename Ts>
-        inline void gemm_batch_impl(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans,
+        inline void gemm_batch_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans,
                                     int m, int n, int k, const void * alpha, const void ** a, int lda, const void ** b,
                                     int ldb, const void * beta, void ** c, int ldc, int batch_size,
                                     matrix_info_t<Ts> * matrix_info) {
@@ -1754,28 +1771,18 @@
            matrix_info->ld_info[2] = ldc;
            matrix_info->groupsize_info = batch_size;

-#ifdef GGML_SYCL_NVIDIA
-        sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
-            oneapi::mkl::backend_selector<oneapi::mkl::backend::cublas>{ q }, matrix_info->transpose_info,
-            matrix_info->transpose_info + 1, matrix_info->size_info, matrix_info->size_info + 1,
-            matrix_info->size_info + 2, reinterpret_cast<Ts *>(matrix_info->value_info),
-            reinterpret_cast<const Ta **>(a), matrix_info->ld_info, reinterpret_cast<const Tb **>(b),
-            matrix_info->ld_info + 1, reinterpret_cast<Ts *>(matrix_info->value_info + 1),
-            reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
-#else
-        sycl::event e = oneapi::mkl::blas::column_major::gemm_batch(
-            q, matrix_info->transpose_info, matrix_info->transpose_info + 1, matrix_info->size_info,
+        sycl::event e = oneapi::math::blas::column_major::gemm_batch(
+            get_onemath_backend(q), matrix_info->transpose_info, matrix_info->transpose_info + 1, matrix_info->size_info,
             matrix_info->size_info + 1, matrix_info->size_info + 2, reinterpret_cast<Ts *>(matrix_info->value_info),
             reinterpret_cast<const Ta **>(a), matrix_info->ld_info, reinterpret_cast<const Tb **>(b),
             matrix_info->ld_info + 1, reinterpret_cast<Ts *>(matrix_info->value_info + 1),
             reinterpret_cast<Tc **>(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info));
-#endif
         }

         template <typename Ta, typename Tb, typename Tc, typename Ts>
         inline void
-        gemm_batch_impl(sycl::queue &q, oneapi::mkl::transpose a_trans,
-                        oneapi::mkl::transpose b_trans, int m, int n,
+        gemm_batch_impl(sycl::queue &q, oneapi::math::transpose a_trans,
+                        oneapi::math::transpose b_trans, int m,
int n, int k, const void *alpha, const void *a, int lda, long long int stride_a, const void *b, int ldb, long long int stride_b, const void *beta, void *c, @@ -1786,16 +1793,9 @@ namespace dpct auto data_a = get_memory(a); auto data_b = get_memory(b); auto data_c = get_memory(c); -#ifdef GGML_SYCL_NVIDIA - oneapi::mkl::blas::column_major::gemm_batch( - oneapi::mkl::backend_selector{ q }, a_trans, b_trans, m, n, k, - alpha_value, data_a, lda, stride_a, data_b, ldb, stride_b, beta_value, data_c, ldc, stride_c, - batch_size); -#else - oneapi::mkl::blas::column_major::gemm_batch(q, a_trans, b_trans, m, n, k, alpha_value, data_a, lda, + oneapi::math::blas::column_major::gemm_batch(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value, data_a, lda, stride_a, data_b, ldb, stride_b, beta_value, data_c, ldc, stride_c, batch_size); -#endif } } // namespace detail @@ -2259,8 +2259,8 @@ namespace dpct sycl::range<3>(x, y, 1), direction); } - inline void gemm(sycl::queue &q, oneapi::mkl::transpose a_trans, - oneapi::mkl::transpose b_trans, int m, int n, int k, + inline void gemm(sycl::queue &q, oneapi::math::transpose a_trans, + oneapi::math::transpose b_trans, int m, int n, int k, const void *alpha, const void *a, library_data_t a_type, int lda, const void *b, library_data_t b_type, int ldb, const void *beta, void *c, library_data_t c_type, int ldc, @@ -2329,7 +2329,7 @@ namespace dpct library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_float, library_data_t::real_float): { - detail::gemm_impl(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); break; @@ -2369,8 +2369,8 @@ namespace dpct library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_float): { - detail::gemm_impl( + detail::gemm_impl( q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); break; } @@ -2412,7 +2412,7 @@ namespace dpct /// \param [in] ldc Leading dimension of C. /// \param [in] batch_size Specifies the number of matrix multiply operations to perform. /// \param [in] scaling_type Data type of the scaling factors. - inline void gemm_batch(sycl::queue & q, oneapi::mkl::transpose a_trans, oneapi::mkl::transpose b_trans, int m, + inline void gemm_batch(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m, int n, int k, const void * alpha, const void * a[], library_data_t a_type, int lda, const void * b[], library_data_t b_type, int ldb, const void * beta, void * c[], library_data_t c_type, int ldc, int batch_size, library_data_t scaling_type, @@ -2450,7 +2450,7 @@ namespace dpct library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_float): { - detail::gemm_batch_impl( + detail::gemm_batch_impl( q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info); break; } @@ -2458,7 +2458,7 @@ namespace dpct library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_float, library_data_t::real_float): { - detail::gemm_batch_impl( + detail::gemm_batch_impl( q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc, batch_size, matrix_info); break; } @@ -2534,8 +2534,8 @@ namespace dpct /// \param [in] stride_c Stride between the different C matrices. /// \param [in] batch_size Specifies the number of matrix multiply operations to perform. /// \param [in] scaling_type Data type of the scaling factors. 
- inline void gemm_batch(sycl::queue &q, oneapi::mkl::transpose a_trans, - oneapi::mkl::transpose b_trans, int m, int n, int k, + inline void gemm_batch(sycl::queue &q, oneapi::math::transpose a_trans, + oneapi::math::transpose b_trans, int m, int n, int k, const void *alpha, const void *a, library_data_t a_type, int lda, long long int stride_a, const void *b, library_data_t b_type, int ldb, long long int stride_b, @@ -2611,8 +2611,8 @@ namespace dpct library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_float): { - detail::gemm_batch_impl( + detail::gemm_batch_impl( q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); break; @@ -2621,7 +2621,7 @@ namespace dpct library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_float, library_data_t::real_float): { - detail::gemm_batch_impl(q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, batch_size); diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index d804e66061721..62b0061833b45 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -2442,8 +2442,8 @@ inline void ggml_sycl_op_mul_mat_sycl( const sycl::half alpha_f16 = 1.0f; const sycl::half beta_f16 = 0.0f; SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm( - *stream, oneapi::mkl::transpose::trans, - oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, + *stream, oneapi::math::transpose::trans, + oneapi::math::transpose::nontrans, row_diff, src1_ncols, ne10, &alpha_f16, src0_ptr, dpct::library_data_t::real_half, ne00, src1_ptr, dpct::library_data_t::real_half, ne10, &beta_f16, dst_f16.get(), dpct::library_data_t::real_half, ldc, @@ -2480,17 +2480,10 @@ inline void ggml_sycl_op_mul_mat_sycl( #if !GGML_SYCL_DNNL const float alpha = 1.0f; const float beta = 0.0f; -# ifdef GGML_SYCL_NVIDIA - SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm( - oneapi::mkl::backend_selector{ *stream }, oneapi::mkl::transpose::trans, - oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, - ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream), dst_dd_i, ldc))); -# else - SYCL_CHECK(CHECK_TRY_ERROR(oneapi::mkl::blas::column_major::gemm( - *stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, row_diff, src1_ncols, ne10, + SYCL_CHECK(CHECK_TRY_ERROR(oneapi::math::blas::column_major::gemm( + get_onemath_backend(*stream), oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, row_diff, src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream), dst_dd_i, ldc))); -# endif #else auto dnnl_stream = ctx.stream_dnnl(stream); DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i, DnnlGemmWrapper::to_dt(), @@ -3250,8 +3243,8 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) { // there is no broadcast and src0, src1 are contiguous across dims 2, 3 SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch( - *main_stream, oneapi::mkl::transpose::trans, - oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, + *main_stream, oneapi::math::transpose::trans, + oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha, (const char *)src0_as_f16, dpct::library_data_t::real_half, nb01 
/ nb00, nb02 / nb00, (const char *)src1_f16, dpct::library_data_t::real_half, @@ -3292,7 +3285,7 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, }); } SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch( - *main_stream, oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, + *main_stream, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha, (const void **) (ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00, (const void **) (ptrs_src.get() + 1 * ne23), dpct::library_data_t::real_half, nb11 / nb10, beta, (void **) (ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23, cu_compute_type, matrix_info.get()))); diff --git a/ggml/src/ggml-sycl/outprod.cpp b/ggml/src/ggml-sycl/outprod.cpp index 8e8347ff4f95e..74ee796c40c20 100644 --- a/ggml/src/ggml-sycl/outprod.cpp +++ b/ggml/src/ggml-sycl/outprod.cpp @@ -1,5 +1,4 @@ #include -#include #include "outprod.hpp" @@ -34,20 +33,14 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { // Handle transposition of src1 const bool src1_T = ggml_is_transposed(src1); - const oneapi::mkl::transpose src1_op = - src1_T ? oneapi::mkl::transpose::nontrans : oneapi::mkl::transpose::trans; + const oneapi::math::transpose src1_op = + src1_T ? oneapi::math::transpose::nontrans : oneapi::math::transpose::trans; const int64_t ldb = (src1_T ? nb10 : nb11) / sizeof(float); try { - // Perform matrix multiplication using oneMKL GEMM -#ifdef GGML_SYCL_NVIDIA - oneapi::mkl::blas::column_major::gemm(oneapi::mkl::backend_selector{ *stream }, - oneapi::mkl::transpose::nontrans, src1_op, ne0, ne1, ne01, alpha, src0_d, - ne00, src1_d, ldb, beta, dst_d, ne0); -#else - oneapi::mkl::blas::column_major::gemm(*stream, oneapi::mkl::transpose::nontrans, src1_op, ne0, ne1, ne01, alpha, + // Perform matrix multiplication using oneMath GEMM + oneapi::math::blas::column_major::gemm(get_onemath_backend(*stream), oneapi::math::transpose::nontrans, src1_op, ne0, ne1, ne01, alpha, src0_d, ne00, src1_d, ldb, beta, dst_d, ne0); -#endif } catch (sycl::exception const& exc) { std::cerr << exc.what() << std::endl; From 44825164f7f80ae28dba2a5d5c1a26e8aaebba3d Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Wed, 29 Jan 2025 16:44:39 +0000 Subject: [PATCH 04/15] clang-format --- ggml/src/ggml-sycl/dpct/helper.hpp | 104 ++++++++++++----------------- ggml/src/ggml-sycl/ggml-sycl.cpp | 18 ++--- ggml/src/ggml-sycl/outprod.cpp | 9 +-- 3 files changed, 54 insertions(+), 77 deletions(-) diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index 92882c56ca04c..fcb644559ed1e 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -1709,20 +1709,18 @@ namespace dpct namespace detail { - template - inline void gemm_impl(sycl::queue &q, oneapi::math::transpose a_trans, - oneapi::math::transpose b_trans, int m, int n, int k, - const void *alpha, const void *a, int lda, const void *b, - int ldb, const void *beta, void *c, int ldc) - { - Ts alpha_value = dpct::get_value(reinterpret_cast(alpha), q); - Ts beta_value = dpct::get_value(reinterpret_cast(beta), q); - auto data_a = get_memory(a); - auto data_b = get_memory(b); - auto data_c = get_memory(c); - oneapi::math::blas::column_major::gemm(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value, data_a, lda, data_b, ldb, - beta_value, data_c, ldc); - } + template + inline void gemm_impl(sycl::queue & q, oneapi::math::transpose a_trans, 
oneapi::math::transpose b_trans, int m, + int n, int k, const void * alpha, const void * a, int lda, const void * b, int ldb, + const void * beta, void * c, int ldc) { + Ts alpha_value = dpct::get_value(reinterpret_cast(alpha), q); + Ts beta_value = dpct::get_value(reinterpret_cast(beta), q); + auto data_a = get_memory(a); + auto data_b = get_memory(b); + auto data_c = get_memory(c); + oneapi::math::blas::column_major::gemm(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value, data_a, + lda, data_b, ldb, beta_value, data_c, ldc); + } template class vectorized_binary @@ -1772,30 +1770,27 @@ namespace dpct matrix_info->groupsize_info = batch_size; sycl::event e = oneapi::math::blas::column_major::gemm_batch( - get_onemath_backend(q), matrix_info->transpose_info, matrix_info->transpose_info + 1, matrix_info->size_info, - matrix_info->size_info + 1, matrix_info->size_info + 2, reinterpret_cast(matrix_info->value_info), - reinterpret_cast(a), matrix_info->ld_info, reinterpret_cast(b), - matrix_info->ld_info + 1, reinterpret_cast(matrix_info->value_info + 1), - reinterpret_cast(c), matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info)); + get_onemath_backend(q), matrix_info->transpose_info, matrix_info->transpose_info + 1, + matrix_info->size_info, matrix_info->size_info + 1, matrix_info->size_info + 2, + reinterpret_cast(matrix_info->value_info), reinterpret_cast(a), matrix_info->ld_info, + reinterpret_cast(b), matrix_info->ld_info + 1, + reinterpret_cast(matrix_info->value_info + 1), reinterpret_cast(c), + matrix_info->ld_info + 2, 1, &(matrix_info->groupsize_info)); } template - inline void - gemm_batch_impl(sycl::queue &q, oneapi::math::transpose a_trans, - oneapi::math::transpose b_trans, int m, int n, - int k, const void *alpha, const void *a, int lda, - long long int stride_a, const void *b, int ldb, - long long int stride_b, const void *beta, void *c, - int ldc, long long int stride_c, int batch_size) - { + inline void gemm_batch_impl(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, + int m, int n, int k, const void * alpha, const void * a, int lda, + long long int stride_a, const void * b, int ldb, long long int stride_b, + const void * beta, void * c, int ldc, long long int stride_c, int batch_size) { Ts alpha_value = dpct::get_value(reinterpret_cast(alpha), q); Ts beta_value = dpct::get_value(reinterpret_cast(beta), q); auto data_a = get_memory(a); auto data_b = get_memory(b); auto data_c = get_memory(c); - oneapi::math::blas::column_major::gemm_batch(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value, data_a, lda, - stride_a, data_b, ldb, stride_b, beta_value, data_c, ldc, - stride_c, batch_size); + oneapi::math::blas::column_major::gemm_batch(get_onemath_backend(q), a_trans, b_trans, m, n, k, alpha_value, + data_a, lda, stride_a, data_b, ldb, stride_b, beta_value, + data_c, ldc, stride_c, batch_size); } } // namespace detail @@ -2259,13 +2254,10 @@ namespace dpct sycl::range<3>(x, y, 1), direction); } - inline void gemm(sycl::queue &q, oneapi::math::transpose a_trans, - oneapi::math::transpose b_trans, int m, int n, int k, - const void *alpha, const void *a, library_data_t a_type, - int lda, const void *b, library_data_t b_type, int ldb, - const void *beta, void *c, library_data_t c_type, int ldc, - library_data_t scaling_type) - { + inline void gemm(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m, int n, + int k, const void * alpha, const void * a, library_data_t a_type, int lda, 
const void * b, + library_data_t b_type, int ldb, const void * beta, void * c, library_data_t c_type, int ldc, + library_data_t scaling_type) { if (scaling_type == library_data_t::real_float && c_type == library_data_t::complex_float) { @@ -2329,9 +2321,8 @@ namespace dpct library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_float, library_data_t::real_float): { - detail::gemm_impl(q, a_trans, b_trans, m, n, k, alpha, a, lda, b, - ldb, beta, c, ldc); + detail::gemm_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); break; } case detail::get_type_combination_id( @@ -2369,8 +2360,7 @@ namespace dpct library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_float): { - detail::gemm_impl( + detail::gemm_impl( q, a_trans, b_trans, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); break; } @@ -2390,7 +2380,7 @@ namespace dpct default: throw std::runtime_error("the combination of data type is unsupported"); } - } // gemm() + } // gemm() /// Computes a batch of matrix-matrix product with general matrices. /// \param [in] q The queue where the routine should be executed. @@ -2534,15 +2524,11 @@ namespace dpct /// \param [in] stride_c Stride between the different C matrices. /// \param [in] batch_size Specifies the number of matrix multiply operations to perform. /// \param [in] scaling_type Data type of the scaling factors. - inline void gemm_batch(sycl::queue &q, oneapi::math::transpose a_trans, - oneapi::math::transpose b_trans, int m, int n, int k, - const void *alpha, const void *a, library_data_t a_type, - int lda, long long int stride_a, const void *b, - library_data_t b_type, int ldb, long long int stride_b, - const void *beta, void *c, library_data_t c_type, - int ldc, long long int stride_c, int batch_size, - library_data_t scaling_type) - { + inline void gemm_batch(sycl::queue & q, oneapi::math::transpose a_trans, oneapi::math::transpose b_trans, int m, + int n, int k, const void * alpha, const void * a, library_data_t a_type, int lda, + long long int stride_a, const void * b, library_data_t b_type, int ldb, + long long int stride_b, const void * beta, void * c, library_data_t c_type, int ldc, + long long int stride_c, int batch_size, library_data_t scaling_type) { if (scaling_type == library_data_t::real_float && c_type == library_data_t::complex_float) { @@ -2611,20 +2597,18 @@ namespace dpct library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_float): { - detail::gemm_batch_impl( - q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, - beta, c, ldc, stride_c, batch_size); + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, + batch_size); break; } case detail::get_type_combination_id( library_data_t::real_bfloat16, library_data_t::real_bfloat16, library_data_t::real_float, library_data_t::real_float): { - detail::gemm_batch_impl(q, a_trans, b_trans, m, n, k, alpha, a, lda, - stride_a, b, ldb, stride_b, beta, c, ldc, - stride_c, batch_size); + detail::gemm_batch_impl( + q, a_trans, b_trans, m, n, k, alpha, a, lda, stride_a, b, ldb, stride_b, beta, c, ldc, stride_c, + batch_size); break; } #endif diff --git a/ggml/src/ggml-sycl/ggml-sycl.cpp b/ggml/src/ggml-sycl/ggml-sycl.cpp index 62b0061833b45..1c26430014d30 100644 --- a/ggml/src/ggml-sycl/ggml-sycl.cpp +++ b/ggml/src/ggml-sycl/ggml-sycl.cpp @@ -2481,9 +2481,9 @@ inline void 
ggml_sycl_op_mul_mat_sycl( const float alpha = 1.0f; const float beta = 0.0f; SYCL_CHECK(CHECK_TRY_ERROR(oneapi::math::blas::column_major::gemm( - get_onemath_backend(*stream), oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, row_diff, src1_ncols, ne10, - dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10, dpct::get_value(&beta, *stream), - dst_dd_i, ldc))); + get_onemath_backend(*stream), oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, row_diff, + src1_ncols, ne10, dpct::get_value(&alpha, *stream), src0_ddf_i, ne00, src1_ddf1_i, ne10, + dpct::get_value(&beta, *stream), dst_dd_i, ldc))); #else auto dnnl_stream = ctx.stream_dnnl(stream); DnnlGemmWrapper::row_gemm(dnnl_stream, false, true, src1_ncols, row_diff, ne10, src1_ddf1_i, DnnlGemmWrapper::to_dt(), @@ -3243,14 +3243,10 @@ static void ggml_sycl_mul_mat_batched_sycl(ggml_backend_sycl_context & ctx, if (r2 == 1 && r3 == 1 && ggml_is_contiguous_2(src0) && ggml_is_contiguous_2(src1)) { // there is no broadcast and src0, src1 are contiguous across dims 2, 3 SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch( - *main_stream, oneapi::math::transpose::trans, - oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha, - (const char *)src0_as_f16, dpct::library_data_t::real_half, - nb01 / nb00, nb02 / nb00, - (const char *)src1_f16, dpct::library_data_t::real_half, - nb11 / nb10, nb12 / nb10, beta, - (char *)dst_t, cu_data_type, ne01, nb2 / nb0, - ne12 * ne13, cu_compute_type))); + *main_stream, oneapi::math::transpose::trans, oneapi::math::transpose::nontrans, ne01, ne11, ne10, alpha, + (const char *) src0_as_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00, + (const char *) src1_f16, dpct::library_data_t::real_half, nb11 / nb10, nb12 / nb10, beta, (char *) dst_t, + cu_data_type, ne01, nb2 / nb0, ne12 * ne13, cu_compute_type))); } else { const int ne23 = ne12*ne13; diff --git a/ggml/src/ggml-sycl/outprod.cpp b/ggml/src/ggml-sycl/outprod.cpp index 74ee796c40c20..b60415784f32d 100644 --- a/ggml/src/ggml-sycl/outprod.cpp +++ b/ggml/src/ggml-sycl/outprod.cpp @@ -1,7 +1,5 @@ -#include #include "outprod.hpp" - void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { const ggml_tensor *src0 = dst->src[0]; const ggml_tensor *src1 = dst->src[1]; @@ -33,14 +31,13 @@ void ggml_sycl_op_out_prod(ggml_backend_sycl_context& ctx, ggml_tensor* dst) { // Handle transposition of src1 const bool src1_T = ggml_is_transposed(src1); - const oneapi::math::transpose src1_op = - src1_T ? oneapi::math::transpose::nontrans : oneapi::math::transpose::trans; + const oneapi::math::transpose src1_op = src1_T ? oneapi::math::transpose::nontrans : oneapi::math::transpose::trans; const int64_t ldb = (src1_T ? 
nb10 : nb11) / sizeof(float); try { // Perform matrix multiplication using oneMath GEMM - oneapi::math::blas::column_major::gemm(get_onemath_backend(*stream), oneapi::math::transpose::nontrans, src1_op, ne0, ne1, ne01, alpha, - src0_d, ne00, src1_d, ldb, beta, dst_d, ne0); + oneapi::math::blas::column_major::gemm(get_onemath_backend(*stream), oneapi::math::transpose::nontrans, src1_op, + ne0, ne1, ne01, alpha, src0_d, ne00, src1_d, ldb, beta, dst_d, ne0); } catch (sycl::exception const& exc) { std::cerr << exc.what() << std::endl; From 2c79721059811d9a5a6a08d0b119b62a0bd4a06f Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Thu, 30 Jan 2025 11:01:49 +0000 Subject: [PATCH 05/15] Silence verbose warnings --- ggml/src/ggml-sycl/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index 4e7eefc910ad9..12f7f83287bb3 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -97,6 +97,8 @@ if (NOT oneMath_FOUND) # Create alias to match with find_package targets name function(onemath_alias target) if (TARGET ${target}) + # Silence verbose warnings from external libraries + target_compile_options(${target} PRIVATE -Wno-uninitialized -Wno-unused-parameter -Wno-unused-variable -Wno-cast-qual) add_library(ONEMATH::${target} ALIAS ${target}) endif() endfunction() From 64b5a14bbdb56cd2969ba55800a9db114abd66df Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Thu, 6 Mar 2025 10:49:22 +0000 Subject: [PATCH 06/15] Set oneMath HIP_TARGETS --- ggml/src/ggml-sycl/CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index 12f7f83287bb3..0bd5a464655fd 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -87,6 +87,10 @@ if (NOT oneMath_FOUND) set(ENABLE_MKLCPU_BACKEND False) set(ENABLE_MKLGPU_BACKEND False) set(ENABLE_ROCBLAS_BACKEND True) + # Ensure setting a string variable here is not overriden by oneMath CACHE variables + cmake_policy(SET CMP0126 NEW) + # Setting the device architecture is only needed and useful for AMD devices in oneMath + set(HIP_TARGETS ${GGML_SYCL_DEVICE_ARCH} CACHE STRING "oneMath HIP target" FORCE) endif() FetchContent_Declare( ONEMATH From bc851c856867a9f80ed2f7393e096eb1aa459c92 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Thu, 6 Mar 2025 10:52:41 +0000 Subject: [PATCH 07/15] Fix silence warnings --- ggml/src/ggml-sycl/CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index 0bd5a464655fd..11d29e2aadf53 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -100,9 +100,11 @@ if (NOT oneMath_FOUND) FetchContent_MakeAvailable(ONEMATH) # Create alias to match with find_package targets name function(onemath_alias target) - if (TARGET ${target}) + if (TARGET ${target}_obj) # Silence verbose warnings from external libraries - target_compile_options(${target} PRIVATE -Wno-uninitialized -Wno-unused-parameter -Wno-unused-variable -Wno-cast-qual) + target_compile_options(${target}_obj PRIVATE -w) + endif() + if (TARGET ${target}) add_library(ONEMATH::${target} ALIAS ${target}) endif() endfunction() From 9002b4decf25973ecb4c28206471f362cddbcca0 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Thu, 6 Mar 2025 11:53:53 +0000 Subject: [PATCH 08/15] Remove step to build oneMath from build instructions --- 
docs/backend/SYCL.md | 28 +++++----------------------- 1 file changed, 5 insertions(+), 23 deletions(-) diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index 99e7fdc0aefac..ccc4d5775d5ce 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -227,30 +227,10 @@ Upon a successful installation, SYCL is enabled for the available intel devices, **oneAPI Plugin**: In order to enable SYCL support on Nvidia GPUs, please install the [Codeplay oneAPI Plugin for Nvidia GPUs](https://developer.codeplay.com/products/oneapi/nvidia/download). User should also make sure the plugin version matches the installed base toolkit one *(previous step)* for a seamless "oneAPI on Nvidia GPU" setup. - -**oneMath for cuBlas**: The current Intel oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the cuBLAS backend. [oneMath](https://github.com/uxlfoundation/oneMath) is used instead to dispatch to *cuBLAS* on Nvidia GPUs. - -```sh -git clone https://github.com/uxlfoundation/oneMath -cd oneMath -cmake -B buildWithCublas -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_CUBLAS_BACKEND=ON -DTARGET_DOMAINS=blas -DCMAKE_INSTALL_PREFIX:PATH=install -cmake --build buildWithCublas --target install --config Release -``` - - **Adding support to AMD GPUs** **oneAPI Plugin**: In order to enable SYCL support on AMD GPUs, please install the [Codeplay oneAPI Plugin for AMD GPUs](https://developer.codeplay.com/products/oneapi/amd/download). As with Nvidia GPUs, the user should also make sure the plugin version matches the installed base toolkit. -**oneMath for rocBlas**: The current Intel oneMKL releases *(shipped with the oneAPI base-toolkit)* do not contain the rocBLAS backend. [oneMath](https://github.com/uxlfoundation/oneMath) is used instead to dispatch to *rocBLAS* on AMD GPUs. - -```sh -git clone https://github.com/uxlfoundation/oneMath -cd oneMath -# Find your HIP_TARGETS with rocminfo, under the key 'Name:' -cmake -B buildWithrocBLAS -DCMAKE_CXX_COMPILER=icpx -DCMAKE_C_COMPILER=icx -DENABLE_MKLGPU_BACKEND=OFF -DENABLE_MKLCPU_BACKEND=OFF -DENABLE_ROCBLAS_BACKEND=ON -DHIP_TARGETS=${HIP_TARGETS} -DTARGET_DOMAINS=blas -DCMAKE_INSTALL_PREFIX:PATH=install -cmake --build buildWithrocBLAS --target install --config Release -``` - 3. **Verify installation and environment** In order to check the available SYCL devices on the machine, please use the `sycl-ls` command. @@ -291,6 +271,8 @@ For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]: ### II. Build llama.cpp +The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath). By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`. 
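+For example, to build against a pre-built oneMath for an Nvidia device *(illustrative command; the `/path/to/oneMath/install` prefix is a placeholder for your own install location)*:
+
+```sh
+cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath
+```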
+ #### Intel GPU ``` @@ -321,10 +303,10 @@ cmake --build build --config Release -j -v GGML_SYCL_DEVICE_ARCH=sm_80 # Example architecture # Option 1: Use FP32 (recommended for better performance in most cases) -cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DoneMath_DIR=/path/to/oneMath/buildWithCublas/install/lib/cmake/oneMath +cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx # Option 2: Use FP16 -cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DoneMath_DIR=/path/to/oneMath/buildWithCublas/install/lib/cmake/oneMath -DGGML_SYCL_F16=ON +cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON # build all binary cmake --build build --config Release -j -v @@ -339,7 +321,7 @@ cmake --build build --config Release -j -v # Use FP32, FP16 is not supported # Find your GGML_SYCL_DEVICE_ARCH with rocminfo, under the key 'Name:' GGML_SYCL_DEVICE_ARCH=gfx90a # Example architecture -cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DoneMath_DIR=/path/to/oneMath/buildWithrocBLAS/install/lib/cmake/oneMath +cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=AMD -DGGML_SYCL_DEVICE_ARCH=${GGML_SYCL_DEVICE_ARCH} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx # build all binary cmake --build build --config Release -j -v From 09dfe8981d7d75ec7e0f53b652f2138afbd039f3 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Thu, 13 Mar 2025 15:18:22 +0000 Subject: [PATCH 09/15] Use fixed oneMath version --- ggml/src/ggml-sycl/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index 11d29e2aadf53..640e550b22315 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -95,7 +95,7 @@ if (NOT oneMath_FOUND) FetchContent_Declare( ONEMATH GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git - GIT_TAG develop + GIT_TAG c255b1b4c41e2ee3059455c1f96a965d6a62568a ) FetchContent_MakeAvailable(ONEMATH) # Create alias to match with find_package targets name From 948f3c5f1e37bd9d548efed15dc775a358d7fc58 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Thu, 13 Mar 2025 15:23:49 +0000 Subject: [PATCH 10/15] Remove INTEL_CPU --- docs/backend/SYCL.md | 2 +- examples/sycl/README.md | 2 +- ggml/cmake/ggml-config.cmake.in | 2 +- ggml/src/ggml-sycl/CMakeLists.txt | 13 +++++-------- ggml/src/ggml-sycl/dpct/helper.hpp | 4 +--- 5 files changed, 9 insertions(+), 14 deletions(-) diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md index ccc4d5775d5ce..40462e6b6ca5b 100644 --- a/docs/backend/SYCL.md +++ b/docs/backend/SYCL.md @@ -630,7 +630,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512 | Name | Value | Function | |--------------------|---------------------------------------|---------------------------------------------| | GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.
FP32 path - recommended for better performance than FP16 on quantized model|
-| GGML_SYCL_TARGET | INTEL *(default)* \| INTEL_CPU \| INTEL_GPU \| NVIDIA \| AMD | Set the SYCL target device type. |
+| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. |
 | GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
 | GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
 | CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
diff --git a/examples/sycl/README.md b/examples/sycl/README.md
index ec82ddae36835..8819d87f56ec2 100644
--- a/examples/sycl/README.md
+++ b/examples/sycl/README.md
@@ -14,7 +14,7 @@ List all SYCL devices with ID, compute capability, max work group size, etc.

 1. Build the llama.cpp for SYCL for the specified target *(using GGML_SYCL_TARGET)*.

-2. Enable oneAPI running environment *(if GGML_SYCL_TARGET is set to INTEL -default-, INTEL_CPU or INTEL_GPU)*
+2. Enable oneAPI running environment *(if GGML_SYCL_TARGET is set to INTEL -default-)*

 ```
 source /opt/intel/oneapi/setvars.sh
diff --git a/ggml/cmake/ggml-config.cmake.in b/ggml/cmake/ggml-config.cmake.in
index cbdfed567d1d8..823eb797b7007 100644
--- a/ggml/cmake/ggml-config.cmake.in
+++ b/ggml/cmake/ggml-config.cmake.in
@@ -78,7 +78,7 @@ if (NOT GGML_SHARED_LIB)

     if (GGML_SYCL)
         find_package(DNNL)
-        if (${DNNL_FOUND} AND GGML_SYCL_TARGET MATCHES "INTEL.*")
+        if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
             list(APPEND GGML_SYCL_INTERFACE_LINK_LIBRARIES DNNL::dnnl)
         endif()
         if (WIN32)
diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt
index 640e550b22315..db424d2fcc807 100644
--- a/ggml/src/ggml-sycl/CMakeLists.txt
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@@ -1,6 +1,6 @@
 message(STATUS "GGML_SYCL_TARGET=${GGML_SYCL_TARGET}")

-if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL.*|NVIDIA|AMD)$")
+if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$")
     message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD")
 endif()
@@ -49,13 +49,13 @@ target_sources(ggml-sycl PRIVATE ${GGML_HEADERS_SYCL} ${GGML_SOURCES_SYCL})
 find_package(DNNL)
 message("-- DNNL found:" ${DNNL_FOUND})

-if (GGML_SYCL_TARGET MATCHES "INTEL.*")
+if (GGML_SYCL_TARGET STREQUAL "INTEL")
     add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND})
 else()
     add_compile_definitions(GGML_SYCL_DNNL=0)
 endif()

-if (${DNNL_FOUND} AND GGML_SYCL_TARGET MATCHES "INTEL.*")
+if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
     target_link_libraries(ggml-sycl PRIVATE DNNL::dnnl)
 endif()

@@ -116,12 +116,9 @@ if (NOT oneMath_FOUND)
 endif()

 # Below oneMath compile-time dispatching is used for better performance
-if (GGML_SYCL_TARGET STREQUAL "INTEL_CPU")
-    target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_mklcpu)
-    target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_INTEL GGML_SYCL_INTEL_CPU)
-elseif (GGML_SYCL_TARGET STREQUAL "INTEL_GPU")
+if (GGML_SYCL_TARGET STREQUAL "INTEL")
     target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_mklgpu)
-    target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_INTEL GGML_SYCL_INTEL_GPU)
+    target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_INTEL)
 elseif (GGML_SYCL_TARGET
STREQUAL "NVIDIA") target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_cublas) target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda") diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index fcb644559ed1e..daa698aff02e5 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -98,9 +98,7 @@ inline auto get_onemath_backend(sycl::queue& queue) // If the backend is known at compile-time, use oneMath backend_selector to use // compile-time dispatching and avoid the need to dlopen libraries. Otherwise // fallback to runtime dispatching. -#if defined(GGML_SYCL_INTEL_CPU) - return oneapi::math::backend_selector{ queue }; -#elif defined(GGML_SYCL_INTEL_GPU) +#if defined(GGML_SYCL_INTEL) return oneapi::math::backend_selector{ queue }; #elif defined(GGML_SYCL_NVIDIA) return oneapi::math::backend_selector{ queue }; From 1c8a949ac501739977dbfdb9877a0a277ef9c288 Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Thu, 13 Mar 2025 15:24:49 +0000 Subject: [PATCH 11/15] Fold CMake oneDNN conditions --- ggml/src/ggml-sycl/CMakeLists.txt | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index db424d2fcc807..ad81bfa9a6b25 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -49,16 +49,13 @@ target_sources(ggml-sycl PRIVATE ${GGML_HEADERS_SYCL} ${GGML_SOURCES_SYCL}) find_package(DNNL) message("-- DNNL found:" ${DNNL_FOUND}) -if (GGML_SYCL_TARGET STREQUAL "INTEL") +if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL") add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND}) + target_link_libraries(ggml-sycl PRIVATE DNNL::dnnl) else() add_compile_definitions(GGML_SYCL_DNNL=0) endif() -if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL") - target_link_libraries(ggml-sycl PRIVATE DNNL::dnnl) -endif() - find_package(IntelSYCL) if (IntelSYCL_FOUND) # Use oneAPI CMake when possible From 995aea3d8309f049894aaffbc98cd8e774477b3e Mon Sep 17 00:00:00 2001 From: "romain.biessy" Date: Fri, 28 Mar 2025 15:15:40 +0000 Subject: [PATCH 12/15] Use Intel oneMKL for Intel devices --- ggml/src/ggml-sycl/CMakeLists.txt | 128 +++++++++++++++-------------- ggml/src/ggml-sycl/dpct/helper.hpp | 19 +++-- 2 files changed, 80 insertions(+), 67 deletions(-) diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt index c531a5691a247..8761d3c04c844 100644 --- a/ggml/src/ggml-sycl/CMakeLists.txt +++ b/ggml/src/ggml-sycl/CMakeLists.txt @@ -72,71 +72,77 @@ if (GGML_SYCL_GRAPH) target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GRAPH) endif() -find_package(oneMath QUIET) -if (NOT oneMath_FOUND) - message("-- oneMath not found: oneMath will be automatically downloaded") - # Use FetchContent to automatically pull and build oneMath - include(FetchContent) - set(BUILD_FUNCTIONAL_TESTS False) - set(BUILD_EXAMPLES False) - set(TARGET_DOMAINS blas) +# Link against Intel oneMKL or oneMath +if (GGML_SYCL_TARGET STREQUAL "INTEL") + # Intel devices use Intel oneMKL directly instead of oneMath to avoid the limitation of linking Intel oneMKL statically + # See https://github.com/uxlfoundation/oneMath/issues/654 + find_package(MKL REQUIRED) + target_link_libraries(ggml-sycl PRIVATE MKL::MKL MKL::MKL_SYCL) + target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_USE_INTEL_ONEMKL) +else() + find_package(oneMath QUIET) + if (NOT oneMath_FOUND) + message("-- oneMath not found: oneMath will be 
automatically downloaded") + # Use FetchContent to automatically pull and build oneMath + include(FetchContent) + set(BUILD_FUNCTIONAL_TESTS False) + set(BUILD_EXAMPLES False) + set(TARGET_DOMAINS blas) + if (GGML_SYCL_TARGET STREQUAL "NVIDIA") + set(ENABLE_MKLCPU_BACKEND False) + set(ENABLE_MKLGPU_BACKEND False) + set(ENABLE_CUBLAS_BACKEND True) + elseif (GGML_SYCL_TARGET STREQUAL "AMD") + set(ENABLE_MKLCPU_BACKEND False) + set(ENABLE_MKLGPU_BACKEND False) + set(ENABLE_ROCBLAS_BACKEND True) + # Ensure setting a string variable here is not overriden by oneMath CACHE variables + cmake_policy(SET CMP0126 NEW) + # Setting the device architecture is only needed and useful for AMD devices in oneMath + set(HIP_TARGETS ${GGML_SYCL_DEVICE_ARCH} CACHE STRING "oneMath HIP target" FORCE) + endif() + FetchContent_Declare( + ONEMATH + GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git + GIT_TAG c255b1b4c41e2ee3059455c1f96a965d6a62568a + ) + FetchContent_MakeAvailable(ONEMATH) + # Create alias to match with find_package targets name + function(onemath_alias target) + if (TARGET ${target}_obj) + # Silence verbose warnings from external libraries + target_compile_options(${target}_obj PRIVATE -w) + endif() + if (TARGET ${target}) + add_library(ONEMATH::${target} ALIAS ${target}) + endif() + endfunction() + onemath_alias(onemath) + onemath_alias(onemath_blas_mklcpu) + onemath_alias(onemath_blas_mklgpu) + onemath_alias(onemath_blas_cublas) + onemath_alias(onemath_blas_rocblas) + endif() + + # Below oneMath compile-time dispatching is used for better performance if (GGML_SYCL_TARGET STREQUAL "NVIDIA") - set(ENABLE_MKLCPU_BACKEND False) - set(ENABLE_MKLGPU_BACKEND False) - set(ENABLE_CUBLAS_BACKEND True) + target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_cublas) + target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda") + target_link_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda") + target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_NVIDIA) elseif (GGML_SYCL_TARGET STREQUAL "AMD") - set(ENABLE_MKLCPU_BACKEND False) - set(ENABLE_MKLGPU_BACKEND False) - set(ENABLE_ROCBLAS_BACKEND True) - # Ensure setting a string variable here is not overriden by oneMath CACHE variables - cmake_policy(SET CMP0126 NEW) - # Setting the device architecture is only needed and useful for AMD devices in oneMath - set(HIP_TARGETS ${GGML_SYCL_DEVICE_ARCH} CACHE STRING "oneMath HIP target" FORCE) - endif() - FetchContent_Declare( - ONEMATH - GIT_REPOSITORY https://github.com/uxlfoundation/oneMath.git - GIT_TAG c255b1b4c41e2ee3059455c1f96a965d6a62568a - ) - FetchContent_MakeAvailable(ONEMATH) - # Create alias to match with find_package targets name - function(onemath_alias target) - if (TARGET ${target}_obj) - # Silence verbose warnings from external libraries - target_compile_options(${target}_obj PRIVATE -w) + if (NOT GGML_SYCL_DEVICE_ARCH) + message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.") endif() - if (TARGET ${target}) - add_library(ONEMATH::${target} ALIAS ${target}) - endif() - endfunction() - onemath_alias(onemath) - onemath_alias(onemath_blas_mklcpu) - onemath_alias(onemath_blas_mklgpu) - onemath_alias(onemath_blas_cublas) - onemath_alias(onemath_blas_rocblas) -endif() - -# Below oneMath compile-time dispatching is used for better performance -if (GGML_SYCL_TARGET STREQUAL "INTEL") - target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_mklgpu) - target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_INTEL) -elseif 
(GGML_SYCL_TARGET STREQUAL "NVIDIA") - target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_cublas) - target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda") - target_link_options(ggml-sycl PRIVATE "-fsycl-targets=nvptx64-nvidia-cuda") - target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_NVIDIA) -elseif (GGML_SYCL_TARGET STREQUAL "AMD") - if (NOT GGML_SYCL_DEVICE_ARCH) - message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_DEVICE_ARCH has not been set.") + target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_rocblas) + target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa") + target_link_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa") + target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_AMD) + else() + # Fallback to oneMath runtime dispatcher + target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath) + target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GENERIC) endif() - target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath_blas_rocblas) - target_compile_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa") - target_link_options(ggml-sycl PRIVATE "-fsycl-targets=amdgcn-amd-amdhsa") - target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_AMD) -else() - # Fallback to oneMath runtime dispatcher - target_link_libraries(ggml-sycl PRIVATE ONEMATH::onemath) - target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_GENERIC) endif() if (GGML_SYCL_DEVICE_ARCH) diff --git a/ggml/src/ggml-sycl/dpct/helper.hpp b/ggml/src/ggml-sycl/dpct/helper.hpp index daa698aff02e5..d538965b096bf 100644 --- a/ggml/src/ggml-sycl/dpct/helper.hpp +++ b/ggml/src/ggml-sycl/dpct/helper.hpp @@ -16,9 +16,18 @@ #include #include #include -#include #include +#ifdef GGML_SYCL_USE_INTEL_ONEMKL +#include +// Allow to use the same namespace for Intel oneMKL and oneMath +namespace oneapi { + namespace math = mkl; +} +#else +#include +#endif + #include "ggml.h" #if defined(__linux__) @@ -91,20 +100,18 @@ template struct matrix_info_t { }; inline auto get_onemath_backend(sycl::queue& queue) -#ifdef GGML_SYCL_GENERIC +#if defined(GGML_SYCL_GENERIC) || defined(GGML_SYCL_USE_INTEL_ONEMKL) -> sycl::queue& #endif { // If the backend is known at compile-time, use oneMath backend_selector to use // compile-time dispatching and avoid the need to dlopen libraries. Otherwise // fallback to runtime dispatching. 
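    // [Editor's illustrative note, not part of the patch] Assuming the oneMath
    // BLAS API, a call such as
    //     oneapi::math::blas::column_major::gemm(get_onemath_backend(q), ...);
    // binds at compile time to the cuBLAS/rocBLAS wrapper whenever a
    // backend_selector is returned below, and only goes through the dlopen-based
    // runtime dispatcher when the plain sycl::queue is returned.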
-#if defined(GGML_SYCL_INTEL)
-    return oneapi::math::backend_selector<oneapi::math::backend::mklgpu>{ queue };
-#elif defined(GGML_SYCL_NVIDIA)
+#if defined(GGML_SYCL_NVIDIA)
     return oneapi::math::backend_selector<oneapi::math::backend::cublas>{ queue };
 #elif defined(GGML_SYCL_AMD)
     return oneapi::math::backend_selector<oneapi::math::backend::rocblas>{ queue };
-#elif defined(GGML_SYCL_GENERIC)
+#elif defined(GGML_SYCL_GENERIC) || defined(GGML_SYCL_USE_INTEL_ONEMKL)
     return queue;
 #else
     static_assert(false, "Unsupported backend");
From 0b6f9a90978e79684de0fb5d9129e8f7e2bddcfb Mon Sep 17 00:00:00 2001
From: "romain.biessy" 
Date: Fri, 28 Mar 2025 16:12:38 +0000
Subject: [PATCH 13/15] Improve CMake message

---
 ggml/src/ggml-sycl/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt
index f9fa5aebc302d..6747fd88361f7 100644
--- a/ggml/src/ggml-sycl/CMakeLists.txt
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@@ -105,7 +105,7 @@ if (GGML_SYCL_TARGET STREQUAL "INTEL")
 else()
     find_package(oneMath QUIET)
     if (NOT oneMath_FOUND)
-        message("-- oneMath not found: oneMath will be automatically downloaded")
+        message(STATUS "oneMath not found: oneMath will be automatically downloaded")
         # Use FetchContent to automatically pull and build oneMath
         include(FetchContent)
         set(BUILD_FUNCTIONAL_TESTS False)
From 6af33c9c0d7201acf2c82f1659fc9dd3afe46d73 Mon Sep 17 00:00:00 2001
From: "romain.biessy" 
Date: Mon, 31 Mar 2025 15:24:11 +0100
Subject: [PATCH 14/15] Link against MKL::MKL_SYCL::BLAS only

---
 ggml/src/ggml-sycl/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-sycl/CMakeLists.txt b/ggml/src/ggml-sycl/CMakeLists.txt
index f9fa5aebc302d..6747fd88361f7 100644
--- a/ggml/src/ggml-sycl/CMakeLists.txt
+++ b/ggml/src/ggml-sycl/CMakeLists.txt
@@ -100,7 +100,7 @@ if (GGML_SYCL_TARGET STREQUAL "INTEL")
     # Intel devices use Intel oneMKL directly instead of oneMath to avoid the limitation of linking Intel oneMKL statically
     # See https://github.com/uxlfoundation/oneMath/issues/654
     find_package(MKL REQUIRED)
-    target_link_libraries(ggml-sycl PRIVATE MKL::MKL MKL::MKL_SYCL)
+    target_link_libraries(ggml-sycl PRIVATE MKL::MKL_SYCL::BLAS)
     target_compile_definitions(ggml-sycl PRIVATE GGML_SYCL_USE_INTEL_ONEMKL)
 else()
     find_package(oneMath QUIET)
From 06fe2ca7ed720521664edc504d2a0723a6a92039 Mon Sep 17 00:00:00 2001
From: "romain.biessy" 
Date: Mon, 31 Mar 2025 16:35:23 +0100
Subject: [PATCH 15/15] Move oneMath documentation to Nvidia and AMD sections

---
 docs/backend/SYCL.md | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/docs/backend/SYCL.md b/docs/backend/SYCL.md
index 0416684447a0d..19fe8a9d22126 100644
--- a/docs/backend/SYCL.md
+++ b/docs/backend/SYCL.md
@@ -280,8 +280,6 @@ For AMD GPUs we should expect at least one SYCL-HIP device [`hip:gpu`]:
 
 ### II. Build llama.cpp
 
-The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath). By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.
-
 #### Intel GPU
 
 ```
@@ -306,6 +304,9 @@ cmake --build build --config Release -j -v
 
 #### Nvidia GPU
 
+The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices.
+By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`.
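As an illustrative aside (not part of the original commit): the sketch below shows both ways of overriding the automatic build. The paths are placeholders, and `FETCHCONTENT_SOURCE_DIR_ONEMATH` is CMake's generic FetchContent override (derived from the `ONEMATH` content name used in `ggml/src/ggml-sycl/CMakeLists.txt`), not a flag defined by this backend.

```sh
# Reuse a prebuilt oneMath install instead of the automatic download
# (placeholder path; point it at the directory containing oneMathConfig.cmake).
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA \
    -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
    -DoneMath_DIR=$HOME/oneMath/install/lib/cmake/oneMath

# Or keep the automatic FetchContent build but feed it a local oneMath checkout.
cmake -B build -DGGML_SYCL=ON -DGGML_SYCL_TARGET=NVIDIA \
    -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx \
    -DFETCHCONTENT_SOURCE_DIR_ONEMATH=$HOME/src/oneMath
```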
+ ```sh # Build LLAMA with Nvidia BLAS acceleration through SYCL # Setting GGML_SYCL_DEVICE_ARCH is optional but can improve performance @@ -323,6 +324,9 @@ cmake --build build --config Release -j -v #### AMD GPU +The SYCL backend depends on [oneMath](https://github.com/uxlfoundation/oneMath) for Nvidia and AMD devices. +By default it is automatically built along with the project. A specific build can be provided by setting the CMake flag `-DoneMath_DIR=/path/to/oneMath/install/lib/cmake/oneMath`. + ```sh # Build LLAMA with rocBLAS acceleration through SYCL