@@ -1287,6 +1287,109 @@ def int_amdgcn_ds_bpermute :
12871287 GCCBuiltin<"__builtin_amdgcn_ds_bpermute">,
12881288 Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
12891289
1290+ //===----------------------------------------------------------------------===//
1291+ // Deep learning intrinsics.
1292+ //===----------------------------------------------------------------------===//
1293+
1294+ // f32 %r = llvm.amdgcn.fdot2(v2f16 %a, v2f16 %b, f32 %c)
1295+ // %r = %a[0] * %b[0] + %a[1] * %b[1] + %c (2-element dot product of %a and %b, plus accumulator %c)
1296+ def int_amdgcn_fdot2 :
1297+ GCCBuiltin<"__builtin_amdgcn_fdot2">, // builtin name associated with this intrinsic
1298+ Intrinsic<
1299+ [llvm_float_ty], // %r: f32 result
1300+ [
1301+ llvm_v2f16_ty, // %a: first operand, two f16 lanes
1302+ llvm_v2f16_ty, // %b: second operand, two f16 lanes, multiplied lane-wise with %a
1303+ llvm_float_ty // %c: f32 accumulator added to the sum of products
1304+ ],
1305+ [IntrNoMem, IntrSpeculatable] // attributes: does not access memory; may be executed speculatively
1306+ >;
1307+
1308+ // i32 %r = llvm.amdgcn.sdot2(v2i16 %a, v2i16 %b, i32 %c)
1309+ // %r = %a[0] * %b[0] + %a[1] * %b[1] + %c (2-element integer dot product of %a and %b, plus accumulator %c)
1310+ def int_amdgcn_sdot2 :
1311+ GCCBuiltin<"__builtin_amdgcn_sdot2">, // builtin name associated with this intrinsic
1312+ Intrinsic<
1313+ [llvm_i32_ty], // %r: i32 result
1314+ [
1315+ llvm_v2i16_ty, // %a: first operand, two 16-bit lanes (presumably treated as signed, given the 's' prefix -- TODO confirm)
1316+ llvm_v2i16_ty, // %b: second operand, two 16-bit lanes, multiplied lane-wise with %a
1317+ llvm_i32_ty // %c: i32 accumulator added to the sum of products
1318+ ],
1319+ [IntrNoMem, IntrSpeculatable] // attributes: does not access memory; may be executed speculatively
1320+ >;
1321+
1322+ // u32 %r = llvm.amdgcn.udot2(v2u16 %a, v2u16 %b, u32 %c) ("u" types are notation for unsigned use; the declared IR types below are v2i16/i32)
1323+ // %r = %a[0] * %b[0] + %a[1] * %b[1] + %c (2-element integer dot product of %a and %b, plus accumulator %c)
1324+ def int_amdgcn_udot2 :
1325+ GCCBuiltin<"__builtin_amdgcn_udot2">, // builtin name associated with this intrinsic
1326+ Intrinsic<
1327+ [llvm_i32_ty], // %r: 32-bit result (u32 in the signature comment)
1328+ [
1329+ llvm_v2i16_ty, // %a: first operand, two 16-bit lanes (v2u16 in the signature comment)
1330+ llvm_v2i16_ty, // %b: second operand, two 16-bit lanes, multiplied lane-wise with %a
1331+ llvm_i32_ty // %c: 32-bit accumulator added to the sum of products
1332+ ],
1333+ [IntrNoMem, IntrSpeculatable] // attributes: does not access memory; may be executed speculatively
1334+ >;
1335+
1336+ // i32 %r = llvm.amdgcn.sdot4(v4i8 (as i32) %a, v4i8 (as i32) %b, i32 %c)
1337+ // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
1338+ def int_amdgcn_sdot4 :
1339+ GCCBuiltin<"__builtin_amdgcn_sdot4">, // builtin name associated with this intrinsic
1340+ Intrinsic<
1341+ [llvm_i32_ty], // %r: i32 result
1342+ [
1343+ llvm_i32_ty, // %a: four 8-bit lanes carried in one i32 (presumably treated as signed, given the 's' prefix -- TODO confirm)
1344+ llvm_i32_ty, // %b: four 8-bit lanes carried in one i32, multiplied lane-wise with %a
1345+ llvm_i32_ty // %c: i32 accumulator added to the sum of products
1346+ ],
1347+ [IntrNoMem, IntrSpeculatable] // attributes: does not access memory; may be executed speculatively
1348+ >;
1349+
1350+ // u32 %r = llvm.amdgcn.udot4(v4u8 (as u32) %a, v4u8 (as u32) %b, u32 %c) ("u" types are notation for unsigned use; the declared IR types below are all i32)
1351+ // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] + %c
1352+ def int_amdgcn_udot4 :
1353+ GCCBuiltin<"__builtin_amdgcn_udot4">, // builtin name associated with this intrinsic
1354+ Intrinsic<
1355+ [llvm_i32_ty], // %r: 32-bit result (u32 in the signature comment)
1356+ [
1357+ llvm_i32_ty, // %a: four 8-bit lanes carried in one 32-bit value (v4u8 in the signature comment)
1358+ llvm_i32_ty, // %b: four 8-bit lanes carried in one 32-bit value, multiplied lane-wise with %a
1359+ llvm_i32_ty // %c: 32-bit accumulator added to the sum of products
1360+ ],
1361+ [IntrNoMem, IntrSpeculatable] // attributes: does not access memory; may be executed speculatively
1362+ >;
1363+
1364+ // i32 %r = llvm.amdgcn.sdot8(v8i4 (as i32) %a, v8i4 (as i32) %b, i32 %c)
1365+ // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
1366+ // %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
1367+ def int_amdgcn_sdot8 :
1368+ GCCBuiltin<"__builtin_amdgcn_sdot8">, // builtin name associated with this intrinsic
1369+ Intrinsic<
1370+ [llvm_i32_ty], // %r: i32 result
1371+ [
1372+ llvm_i32_ty, // %a: eight 4-bit lanes carried in one i32 (presumably treated as signed, given the 's' prefix -- TODO confirm)
1373+ llvm_i32_ty, // %b: eight 4-bit lanes carried in one i32, multiplied lane-wise with %a
1374+ llvm_i32_ty // %c: i32 accumulator added to the sum of products
1375+ ],
1376+ [IntrNoMem, IntrSpeculatable] // attributes: does not access memory; may be executed speculatively
1377+ >;
1378+
1379+ // u32 %r = llvm.amdgcn.udot8(v8u4 (as u32) %a, v8u4 (as u32) %b, u32 %c) ("u" types are notation for unsigned use; the declared IR types below are all i32)
1380+ // %r = %a[0] * %b[0] + %a[1] * %b[1] + %a[2] * %b[2] + %a[3] * %b[3] +
1381+ // %a[4] * %b[4] + %a[5] * %b[5] + %a[6] * %b[6] + %a[7] * %b[7] + %c
1382+ def int_amdgcn_udot8 :
1383+ GCCBuiltin<"__builtin_amdgcn_udot8">, // builtin name associated with this intrinsic
1384+ Intrinsic<
1385+ [llvm_i32_ty], // %r: 32-bit result (u32 in the signature comment)
1386+ [
1387+ llvm_i32_ty, // %a: eight 4-bit lanes carried in one 32-bit value (v8u4 in the signature comment)
1388+ llvm_i32_ty, // %b: eight 4-bit lanes carried in one 32-bit value, multiplied lane-wise with %a
1389+ llvm_i32_ty // %c: 32-bit accumulator added to the sum of products
1390+ ],
1391+ [IntrNoMem, IntrSpeculatable] // attributes: does not access memory; may be executed speculatively
1392+ >;
12901393
12911394//===----------------------------------------------------------------------===//
12921395// Special Intrinsics for backend internal use only. No frontend
0 commit comments