Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit 5508775

Browse files
authored
[ESIMD] Added a test verifying dpas called for bfloat16 (#927)
* [ESIMD] Added a test verifying dpas called for bfloat16 Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent 52c107f commit 5508775

File tree

1 file changed

+135
-0
lines changed

1 file changed

+135
-0
lines changed

SYCL/ESIMD/dpas/dpas_test3.cpp

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
//==---------------- dpas_test3.cpp - DPC++ ESIMD on-device test ----------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// REQUIRES: gpu-intel-pvc
// UNSUPPORTED: cuda || hip
// RUN: %clangxx -fsycl -DESIMD_XE_HPC %s -DVER1 -o %t.out1
// RUN: %clangxx -fsycl -DESIMD_XE_HPC %s -DVER2 -o %t.out2
// RUN: %GPU_RUN_PLACEHOLDER %t.out1
// RUN: %GPU_RUN_PLACEHOLDER %t.out2

// The test verifies the low-level API for DPAS with 'bfloat16' types.
// The macros VER1 and VER2 are used to verify slightly different
// ways of initializing the input operands of DPAS. There were runtime
// errors previously depending on which variant of initialization was used.
#include <CL/sycl.hpp>
#include <sycl/ext/intel/esimd.hpp>

#include <cmath>
#include <cstdint>
#include <cstring>
#include <iostream>

using namespace cl::sycl;
using namespace sycl::ext::intel::esimd;
using namespace sycl::ext::intel::experimental::esimd;

using BF16 = uint16_t;
// Helper union for viewing a 32-bit float as two 16-bit halves.
// NOTE(review): reading a union member other than the last one written
// is type punning (formally UB in C++, though accepted by the compilers
// this test targets), and treating s[1] as the high half of the float
// assumes a little-endian host — both hold for the Intel targets here.
union BFloat16 {
  float f;
  unsigned short s[2];
};
// Converts an FP32 value to bfloat16 by truncation: bfloat16 shares
// FP32's sign and exponent layout, so the result is simply the high
// 16 bits of the float's bit pattern (no rounding is applied).
// Uses memcpy instead of union type punning — well-defined C++ and
// independent of host endianness, unlike reading BFloat16::s[1].
uint16_t FP32toBF16(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16);
}
// Converts a bfloat16 bit pattern to FP32: the 16 bits become the high
// half of the float's representation and the low mantissa bits are zero,
// so the conversion is exact.
// Uses memcpy instead of union type punning — well-defined C++ and
// independent of host endianness, unlike writing BFloat16::s[0]/s[1].
float BF16toFP32(uint16_t i) {
  uint32_t bits = static_cast<uint32_t>(i) << 16;
  float f;
  std::memcpy(&f, &bits, sizeof(f));
  return f;
}
// Repacks a K x N B-matrix tile into the VNNI layout consumed by DPAS:
// each pair of adjacent K-rows is interleaved element-wise, so source
// rows 2r and 2r+1 land in the even/odd columns of packed row r.
// T is the element type; K/N are the tile's rows/columns.
// NOTE(review): the select<> counts below are hard-coded 8/16, i.e.
// written specifically for K=16, N=16 — confirm before reusing with
// other tile shapes.
template <typename T, int K, int N>
simd<T, K * N> pack_bb(simd<T, K * N> &src) {
  // K=16 N=16
  simd<T, K * N> dst;
  // View the destination as (K/2) x (2N) and the source as K x N.
  auto dst2d = dst.template bit_cast_view<T, K / 2, N * 2>();
  auto src2d = src.template bit_cast_view<T, K, N>();
  // Even source rows (0,2,...) -> even columns of the packed rows.
  dst2d.template select<8, 1, 16, 2>(0, 0) =
      src2d.template select<8, 2, 16, 1>(0, 0);
  // Odd source rows (1,3,...) -> odd columns of the packed rows.
  dst2d.template select<8, 1, 16, 2>(0, 1) =
      src2d.template select<8, 2, 16, 1>(1, 0);
  return dst;
}
// Single-workitem DPAS kernel body computing BC = BC + BA * BB with:
//   BA: 8x16  (M x K) bfloat16 A-tile,
//   BB: 16x16 (K x N) bfloat16 B-tile,
//   BC: 8x16  (M x N) float accumulator (zeroed below, so BC = A*B).
// Under VER1 the operands are loaded from USM memory; under VER2 they
// are regenerated in-kernel with the same i -> bf16(i) pattern the host
// used, so both variants compute the same product.
// Side effects: writes the VNNI-packed B tile back to matB and the
// result tile to matC.
void dpas_ker(nd_item<1> &idx, BF16 *matA, BF16 *matB, float *matC) {
  // matC = matC + matA * matB
  // matC 8x16 MxN
  // matA 8x16 MxK
  // matB 16x16 KxN
  constexpr int MB = 8;
  constexpr int NB = 16; // KB = NB = 16 in pvc
  constexpr int KB = 16;
  // TN/TN1/TN2 are the size template arguments of the low-level dpas<>
  // call below — presumably the C/B/A operand vector lengths in dwords;
  // TODO(review): confirm against the ESIMD dpas API.
  constexpr int TN = 128;
  constexpr int TN1 = 128;
  constexpr int TN2 = 64;
  constexpr int REPEAT_COUNT = 8;
  constexpr int SYSTOLIC_DEPTH = 8;

  simd<BF16, MB * KB> BA;  // MB, KB
  simd<BF16, KB * NB> BB;  // KB, NB
  simd<float, MB * NB> BC; // MB, NB
#ifdef VER1
  // VER1: initialize operands with block copies from USM.
  BA.copy_from(matA);
  BB.copy_from(matB);
#else // VER2
  // VER2: initialize operands element-by-element inside the kernel.
  for (int i = 0; i < MB * KB; ++i)
    BA[i] = FP32toBF16(float(i));
  for (int i = 0; i < KB * NB; ++i)
    BB[i] = FP32toBF16(float(i));
#endif
  BC = 0.0f;
  // DPAS consumes B in VNNI layout; A stays row-major.
  simd<BF16, KB *NB> BBvnni = pack_bb<BF16, KB, NB>(BB);
  BC = dpas<argument_type::BF16, argument_type::BF16, SYSTOLIC_DEPTH,
            REPEAT_COUNT, float, uint, uint, TN, TN1, TN2>(
      BC, BBvnni.template bit_cast_view<uint>(),
      BA.template bit_cast_view<uint>());
  BBvnni.copy_to(matB);
  BC.copy_to(matC);
}
int main() {
96+
// A [8][16] * B[16][16]= C[8][16]
97+
queue q(gpu_selector{});
98+
nd_range<1> Range(range<1>{1}, range<1>{1});
99+
constexpr int MB = 8;
100+
constexpr int NB = 16; // KB = NB = 16 in pvc
101+
constexpr int KB = 16;
102+
BF16 *matA = malloc_shared<BF16>(MB * KB, q);
103+
BF16 *matB = malloc_shared<BF16>(KB * NB, q);
104+
float *matC = malloc_shared<float>(MB * NB, q);
105+
for (int i = 0; i < MB * KB; ++i)
106+
matA[i] = FP32toBF16(float(i));
107+
for (int i = 0; i < KB * NB; ++i)
108+
matB[i] = FP32toBF16(float(i));
109+
for (int i = 0; i < MB * NB; ++i)
110+
matC[i] = 0.0f;
111+
q.submit([&](handler &cgh) {
112+
cgh.parallel_for(Range, [=](nd_item<1> idx) SYCL_ESIMD_KERNEL {
113+
dpas_ker(idx, matA, matB, matC);
114+
});
115+
}).wait();
116+
117+
unsigned err_cnt = 0;
118+
for (unsigned i = 0; i < MB * NB && err_cnt < 10; ++i) {
119+
int m = i / NB;
120+
int n = i % NB;
121+
float res = 0.0f;
122+
for (int k = 0; k < KB; ++k)
123+
res += float((m * KB + k) * (k * NB + n));
124+
if (std::abs(res - matC[i]) > 0.0001) {
125+
std::cerr << "res vs ref: " << res << " : " << matC[i] << std::endl;
126+
err_cnt++;
127+
}
128+
}
129+
free(matA, q);
130+
free(matB, q);
131+
free(matC, q);
132+
133+
std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n");
134+
return err_cnt > 0 ? 1 : 0;
135+
}

0 commit comments

Comments
 (0)