Skip to content

Commit 4b31ec9

Browse files
committed
[AArch64] Improve bfcvtn2 and remove aarch64_neon_bfcvt intrinsics
This started out as trying to combine bf16 fpround to BFCVTN2 instructions, but ended up removing the aarch64.neon.bfcvt intrinsics in favour of generating fpround instructions directly. This simplifies the patterns and can lead to other optimizations. The BFCVTN2 instruction is adjusted to make sure the types are more valid, and a bfcvtn2 is now generated in more places. The old intrinsics are auto-upgraded to fptrunc instructions too.
1 parent f6f4744 commit 4b31ec9

File tree

10 files changed

+190
-165
lines changed

10 files changed

+190
-165
lines changed

clang/include/clang/Basic/arm_neon.td

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -259,11 +259,6 @@ def OP_VCVT_F32_BF16_LO
259259
def OP_VCVT_F32_BF16_HI
260260
: Op<(call "vcvt_f32_bf16", (call "vget_high", $p0))>;
261261

262-
def OP_VCVT_BF16_F32_LO_A64
263-
: Op<(call "__a64_vcvtq_low_bf16", $p0)>;
264-
def OP_VCVT_BF16_F32_A64
265-
: Op<(call "vget_low", (call "__a64_vcvtq_low_bf16", $p0))>;
266-
267262
def OP_VCVT_BF16_F32_A32
268263
: Op<(call "__a32_vcvt_bf16", $p0)>;
269264

@@ -2061,10 +2056,9 @@ let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)", TargetGuard =
20612056
}
20622057

20632058
let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "bf16,neon" in {
2064-
def VCVT_LOW_BF16_F32_A64_INTERNAL : WInst<"__a64_vcvtq_low_bf16", "BQ", "Hf">;
2065-
def VCVT_LOW_BF16_F32_A64 : SOpInst<"vcvt_low_bf16", "BQ", "Qf", OP_VCVT_BF16_F32_LO_A64>;
2059+
def VCVT_LOW_BF16_F32_A64 : SInst<"vcvt_low_bf16", "BQ", "Qf">;
20662060
def VCVT_HIGH_BF16_F32_A64 : SInst<"vcvt_high_bf16", "BBQ", "Qf">;
2067-
def VCVT_BF16_F32 : SOpInst<"vcvt_bf16", "BQ", "f", OP_VCVT_BF16_F32_A64>;
2061+
def VCVT_BF16_F32 : SInst<"vcvt_bf16", "BQ", "f">;
20682062

20692063
def COPY_LANE_BF16 : IOpInst<"vcopy_lane", "..I.I", "b", OP_COPY_LN>;
20702064
def COPYQ_LANE_BF16 : IOpInst<"vcopy_lane", "..IqI", "Qb", OP_COPY_LN>;

clang/lib/CodeGen/CGBuiltin.cpp

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7277,7 +7277,6 @@ static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = {
72777277
};
72787278

72797279
static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
7280-
NEONMAP1(__a64_vcvtq_low_bf16_f32, aarch64_neon_bfcvtn, 0),
72817280
NEONMAP0(splat_lane_v),
72827281
NEONMAP0(splat_laneq_v),
72837282
NEONMAP0(splatq_lane_v),
@@ -7377,7 +7376,8 @@ static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = {
73777376
NEONMAP0(vcvtq_f16_s16),
73787377
NEONMAP0(vcvtq_f16_u16),
73797378
NEONMAP0(vcvtq_f32_v),
7380-
NEONMAP1(vcvtq_high_bf16_f32, aarch64_neon_bfcvtn2, 0),
7379+
NEONMAP0(vcvtq_high_bf16_f32),
7380+
NEONMAP0(vcvtq_low_bf16_f32),
73817381
NEONMAP1(vcvtq_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0),
73827382
NEONMAP1(vcvtq_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0),
73837383
NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0),
@@ -7586,7 +7586,7 @@ static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
75867586
NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType),
75877587
NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType),
75887588
NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType),
7589-
NEONMAP1(vcvth_bf16_f32, aarch64_neon_bfcvt, 0),
7589+
NEONMAP0(vcvth_bf16_f32),
75907590
NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
75917591
NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType),
75927592
NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType),
@@ -12040,6 +12040,12 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
1204012040
return ConstantInt::get(Builder.getInt32Ty(), 0);
1204112041
}
1204212042

12043+
if (BuiltinID == NEON::BI__builtin_neon_vcvth_bf16_f32)
12044+
return Builder.CreateFPTrunc(
12045+
Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)),
12046+
Builder.getFloatTy()),
12047+
Builder.getBFloatTy());
12048+
1204312049
// Handle MSVC intrinsics before argument evaluation to prevent double
1204412050
// evaluation.
1204512051
if (std::optional<MSVCIntrin> MsvcIntId =
@@ -12765,6 +12771,35 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID,
1276512771
return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)),
1276612772
"vgetq_lane");
1276712773
}
12774+
case NEON::BI__builtin_neon_vcvt_bf16_f32: {
12775+
llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
12776+
llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
12777+
return Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16);
12778+
}
12779+
case NEON::BI__builtin_neon_vcvtq_low_bf16_f32: {
12780+
SmallVector<int, 16> ConcatMask(8);
12781+
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
12782+
llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
12783+
llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
12784+
llvm::Value *Trunc =
12785+
Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16);
12786+
return Builder.CreateShuffleVector(
12787+
Trunc, ConstantAggregateZero::get(V4BF16), ConcatMask);
12788+
}
12789+
case NEON::BI__builtin_neon_vcvtq_high_bf16_f32: {
12790+
SmallVector<int, 16> ConcatMask(8);
12791+
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
12792+
SmallVector<int, 16> LoMask(4);
12793+
std::iota(LoMask.begin(), LoMask.end(), 0);
12794+
llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4);
12795+
llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4);
12796+
llvm::Type *V8BF16 = FixedVectorType::get(Builder.getBFloatTy(), 8);
12797+
llvm::Value *Inactive = Builder.CreateShuffleVector(
12798+
Builder.CreateBitCast(Ops[0], V8BF16), LoMask);
12799+
llvm::Value *Trunc =
12800+
Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[1], V4F32), V4BF16);
12801+
return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask);
12802+
}
1276812803

1276912804
case clang::AArch64::BI_InterlockedAdd:
1277012805
case clang::AArch64::BI_InterlockedAdd64: {

clang/test/CodeGen/arm-bf16-convert-intrinsics.c

Lines changed: 11 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -223,10 +223,8 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) {
223223
// CHECK-A64-LABEL: @test_vcvt_bf16_f32(
224224
// CHECK-A64-NEXT: entry:
225225
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
226-
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
227-
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8>
228-
// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
229-
// CHECK-A64-NEXT: ret <4 x bfloat> [[SHUFFLE_I]]
226+
// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
227+
// CHECK-A64-NEXT: ret <4 x bfloat> [[TMP1]]
230228
//
231229
// CHECK-A32-HARDFP-LABEL: @test_vcvt_bf16_f32(
232230
// CHECK-A32-HARDFP-NEXT: entry:
@@ -263,9 +261,9 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) {
263261
// CHECK-A64-LABEL: @test_vcvtq_low_bf16_f32(
264262
// CHECK-A64-NEXT: entry:
265263
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
266-
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]])
267-
// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8>
268-
// CHECK-A64-NEXT: ret <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]]
264+
// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
265+
// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
266+
// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP2]]
269267
//
270268
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_bf16_f32(
271269
// CHECK-A32-HARDFP-NEXT: entry:
@@ -323,9 +321,10 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) {
323321
// CHECK-A64-NEXT: entry:
324322
// CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <16 x i8>
325323
// CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8>
326-
// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F322_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat> [[INACTIVE]], <4 x float> [[A]])
327-
// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F323_I:%.*]] = bitcast <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]] to <16 x i8>
328-
// CHECK-A64-NEXT: ret <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]]
324+
// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <8 x bfloat> [[INACTIVE]], <8 x bfloat> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
325+
// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat>
326+
// CHECK-A64-NEXT: [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
327+
// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP4]]
329328
//
330329
// CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_bf16_f32(
331330
// CHECK-A32-HARDFP-NEXT: entry:
@@ -404,8 +403,8 @@ bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) {
404403

405404
// CHECK-A64-LABEL: @test_vcvth_bf16_f32(
406405
// CHECK-A64-NEXT: entry:
407-
// CHECK-A64-NEXT: [[VCVTH_BF16_F32_I:%.*]] = call bfloat @llvm.aarch64.neon.bfcvt(float [[A:%.*]])
408-
// CHECK-A64-NEXT: ret bfloat [[VCVTH_BF16_F32_I]]
406+
// CHECK-A64-NEXT: [[TMP0:%.*]] = fptrunc float [[A:%.*]] to bfloat
407+
// CHECK-A64-NEXT: ret bfloat [[TMP0]]
409408
//
410409
// CHECK-A32-HARDFP-LABEL: @test_vcvth_bf16_f32(
411410
// CHECK-A32-HARDFP-NEXT: entry:

llvm/include/llvm/IR/IntrinsicsAArch64.td

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -538,17 +538,6 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in {
538538
def int_aarch64_neon_bfmlalb : AdvSIMD_BF16FML_Intrinsic;
539539
def int_aarch64_neon_bfmlalt : AdvSIMD_BF16FML_Intrinsic;
540540

541-
542-
// v8.6-A Bfloat Intrinsics
543-
def int_aarch64_neon_bfcvt
544-
: DefaultAttrsIntrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>;
545-
def int_aarch64_neon_bfcvtn
546-
: DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v4f32_ty], [IntrNoMem]>;
547-
def int_aarch64_neon_bfcvtn2
548-
: DefaultAttrsIntrinsic<[llvm_v8bf16_ty],
549-
[llvm_v8bf16_ty, llvm_v4f32_ty],
550-
[IntrNoMem]>;
551-
552541
// v8.2-A FP16 Fused Multiply-Add Long
553542
def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic;
554543
def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic;

llvm/lib/IR/AutoUpgrade.cpp

Lines changed: 61 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
#include "llvm/Support/Regex.h"
4646
#include "llvm/TargetParser/Triple.h"
4747
#include <cstring>
48+
#include <numeric>
4849

4950
using namespace llvm;
5051

@@ -828,6 +829,13 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F,
828829
return true;
829830
}
830831
}
832+
833+
// Changed in 20.0: bfcvt/bfcvtn/bfcvtn2 have been replaced with fptrunc.
834+
if (Name.starts_with("bfcvt")) {
835+
NewFn = nullptr;
836+
return true;
837+
}
838+
831839
return false; // No other 'aarch64.neon.*'.
832840
}
833841
if (Name.consume_front("sve.")) {
@@ -4064,31 +4072,59 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F,
40644072

40654073
static Value *upgradeAArch64IntrinsicCall(StringRef Name, CallBase *CI,
40664074
Function *F, IRBuilder<> &Builder) {
4067-
Intrinsic::ID NewID =
4068-
StringSwitch<Intrinsic::ID>(Name)
4069-
.Case("sve.fcvt.bf16f32", Intrinsic::aarch64_sve_fcvt_bf16f32_v2)
4070-
.Case("sve.fcvtnt.bf16f32", Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2)
4071-
.Default(Intrinsic::not_intrinsic);
4072-
if (NewID == Intrinsic::not_intrinsic)
4073-
llvm_unreachable("Unhandled Intrinsic!");
4074-
4075-
SmallVector<Value *, 3> Args(CI->args());
4076-
4077-
// The original intrinsics incorrectly used a predicate based on the smallest
4078-
// element type rather than the largest.
4079-
Type *BadPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 8);
4080-
Type *GoodPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 4);
4081-
4082-
if (Args[1]->getType() != BadPredTy)
4083-
llvm_unreachable("Unexpected predicate type!");
4084-
4085-
Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool,
4086-
BadPredTy, Args[1]);
4087-
Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
4088-
GoodPredTy, Args[1]);
4089-
4090-
return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr,
4091-
CI->getName());
4075+
if (Name.starts_with("neon.bfcvt")) {
4076+
if (Name.starts_with("neon.bfcvtn2")) {
4077+
SmallVector<int, 32> LoMask(4);
4078+
std::iota(LoMask.begin(), LoMask.end(), 0);
4079+
SmallVector<int, 32> ConcatMask(8);
4080+
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
4081+
Value *Inactive = Builder.CreateShuffleVector(CI->getOperand(0), LoMask);
4082+
Value *Trunc =
4083+
Builder.CreateFPTrunc(CI->getOperand(1), Inactive->getType());
4084+
return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask);
4085+
} else if (Name.starts_with("neon.bfcvtn")) {
4086+
SmallVector<int, 32> ConcatMask(8);
4087+
std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
4088+
Type *V4BF16 =
4089+
FixedVectorType::get(Type::getBFloatTy(F->getContext()), 4);
4090+
Value *Trunc = Builder.CreateFPTrunc(CI->getOperand(0), V4BF16);
4091+
dbgs() << "Trunc: " << *Trunc << "\n";
4092+
return Builder.CreateShuffleVector(
4093+
Trunc, ConstantAggregateZero::get(V4BF16), ConcatMask);
4094+
} else {
4095+
return Builder.CreateFPTrunc(CI->getOperand(0),
4096+
Type::getBFloatTy(F->getContext()));
4097+
}
4098+
} else if (Name.starts_with("sve.fcvt")) {
4099+
Intrinsic::ID NewID =
4100+
StringSwitch<Intrinsic::ID>(Name)
4101+
.Case("sve.fcvt.bf16f32", Intrinsic::aarch64_sve_fcvt_bf16f32_v2)
4102+
.Case("sve.fcvtnt.bf16f32",
4103+
Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2)
4104+
.Default(Intrinsic::not_intrinsic);
4105+
if (NewID == Intrinsic::not_intrinsic)
4106+
llvm_unreachable("Unhandled Intrinsic!");
4107+
4108+
SmallVector<Value *, 3> Args(CI->args());
4109+
4110+
// The original intrinsics incorrectly used a predicate based on the
4111+
// smallest element type rather than the largest.
4112+
Type *BadPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 8);
4113+
Type *GoodPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 4);
4114+
4115+
if (Args[1]->getType() != BadPredTy)
4116+
llvm_unreachable("Unexpected predicate type!");
4117+
4118+
Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool,
4119+
BadPredTy, Args[1]);
4120+
Args[1] = Builder.CreateIntrinsic(
4121+
Intrinsic::aarch64_sve_convert_from_svbool, GoodPredTy, Args[1]);
4122+
4123+
return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr,
4124+
CI->getName());
4125+
}
4126+
4127+
llvm_unreachable("Unhandled Intrinsic!");
40924128
}
40934129

40944130
static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F,

llvm/lib/Target/AArch64/AArch64InstrFormats.td

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9045,22 +9045,19 @@ class SIMDThreeSameVectorBF16MatrixMul<string asm>
90459045

90469046
let mayRaiseFPException = 1, Uses = [FPCR] in
90479047
class SIMD_BFCVTN
9048-
: BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128,
9048+
: BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V64,
90499049
"bfcvtn", ".4h", ".4s",
9050-
[(set (v8bf16 V128:$Rd),
9051-
(int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>;
9050+
[(set (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn)))]>;
90529051

90539052
let mayRaiseFPException = 1, Uses = [FPCR] in
90549053
class SIMD_BFCVTN2
90559054
: BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128,
9056-
"bfcvtn2", ".8h", ".4s",
9057-
[(set (v8bf16 V128:$dst),
9058-
(int_aarch64_neon_bfcvtn2 (v8bf16 V128:$Rd), (v4f32 V128:$Rn)))]>;
9055+
"bfcvtn2", ".8h", ".4s", []>;
90599056

90609057
let mayRaiseFPException = 1, Uses = [FPCR] in
90619058
class BF16ToSinglePrecision<string asm>
90629059
: I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "",
9063-
[(set (bf16 FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 FPR32:$Rn)))]>,
9060+
[(set (bf16 FPR16:$Rd), (any_fpround (f32 FPR32:$Rn)))]>,
90649061
Sched<[WriteFCvt]> {
90659062
bits<5> Rd;
90669063
bits<5> Rn;

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1446,8 +1446,8 @@ def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>;
14461446
def BFCVTN : SIMD_BFCVTN;
14471447
def BFCVTN2 : SIMD_BFCVTN2;
14481448

1449-
def : Pat<(v4bf16 (any_fpround (v4f32 V128:$Rn))),
1450-
(EXTRACT_SUBREG (BFCVTN V128:$Rn), dsub)>;
1449+
def : Pat<(concat_vectors (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn))),
1450+
(BFCVTN2 (v8bf16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub)), V128:$Rn)>;
14511451

14521452
// Vector-scalar BFDOT:
14531453
// The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit
@@ -1469,8 +1469,6 @@ def : Pat<(v2f32 (int_aarch64_neon_bfdot
14691469

14701470
let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in {
14711471
def BFCVT : BF16ToSinglePrecision<"bfcvt">;
1472-
// Round FP32 to BF16.
1473-
def : Pat<(bf16 (any_fpround (f32 FPR32:$Rn))), (BFCVT $Rn)>;
14741472
}
14751473

14761474
// ARMv8.6A AArch64 matrix multiplication
@@ -10425,9 +10423,11 @@ multiclass PromoteUnaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst>
1042510423
let Predicates = [HasBF16] in
1042610424
def : Pat<(InOp (v8bf16 V128:$Rn)),
1042710425
(v8bf16 (BFCVTN2
10428-
(v8bf16 (BFCVTN
10429-
(v4f32 (OutInst
10430-
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))),
10426+
(INSERT_SUBREG (IMPLICIT_DEF),
10427+
(v4bf16 (BFCVTN
10428+
(v4f32 (OutInst
10429+
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))),
10430+
dsub),
1043110431
(v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn))))))>;
1043210432

1043310433
let Predicates = [HasNoBF16] in
@@ -10462,10 +10462,12 @@ multiclass PromoteBinaryv8f16Tov4f32<SDPatternOperator InOp, Instruction OutInst
1046210462
let Predicates = [HasBF16] in
1046310463
def : Pat<(InOp (v8bf16 V128:$Rn), (v8bf16 V128:$Rm)),
1046410464
(v8bf16 (BFCVTN2
10465-
(v8bf16 (BFCVTN
10466-
(v4f32 (OutInst
10467-
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
10468-
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))),
10465+
(INSERT_SUBREG (IMPLICIT_DEF),
10466+
(v4bf16 (BFCVTN
10467+
(v4f32 (OutInst
10468+
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))),
10469+
(v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rm, dsub)))))))),
10470+
dsub),
1046910471
(v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn)),
1047010472
(v4f32 (SHLLv8i16 V128:$Rm))))))>;
1047110473

llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64 -mattr=+neon -mattr=+bf16 | FileCheck %s
22

3+
; This test acts to test the old neon.bfcvt intrinsics, which are now
4+
; autoupgraded to fptrunc operations.
5+
36
declare bfloat @llvm.aarch64.neon.bfcvt(float)
47
declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float>)
58
declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat>, <4 x float>)

0 commit comments

Comments
 (0)