diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td index ef89fa4358dfe..ddc5391eb3fa2 100644 --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -259,11 +259,6 @@ def OP_VCVT_F32_BF16_LO def OP_VCVT_F32_BF16_HI : Op<(call "vcvt_f32_bf16", (call "vget_high", $p0))>; -def OP_VCVT_BF16_F32_LO_A64 - : Op<(call "__a64_vcvtq_low_bf16", $p0)>; -def OP_VCVT_BF16_F32_A64 - : Op<(call "vget_low", (call "__a64_vcvtq_low_bf16", $p0))>; - def OP_VCVT_BF16_F32_A32 : Op<(call "__a32_vcvt_bf16", $p0)>; @@ -2061,10 +2056,9 @@ let ArchGuard = "!defined(__aarch64__) && !defined(__arm64ec__)", TargetGuard = } let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "bf16,neon" in { - def VCVT_LOW_BF16_F32_A64_INTERNAL : WInst<"__a64_vcvtq_low_bf16", "BQ", "Hf">; - def VCVT_LOW_BF16_F32_A64 : SOpInst<"vcvt_low_bf16", "BQ", "Qf", OP_VCVT_BF16_F32_LO_A64>; + def VCVT_LOW_BF16_F32_A64 : SInst<"vcvt_low_bf16", "BQ", "Qf">; def VCVT_HIGH_BF16_F32_A64 : SInst<"vcvt_high_bf16", "BBQ", "Qf">; - def VCVT_BF16_F32 : SOpInst<"vcvt_bf16", "BQ", "f", OP_VCVT_BF16_F32_A64>; + def VCVT_BF16_F32 : SInst<"vcvt_bf16", "BQ", "f">; def COPY_LANE_BF16 : IOpInst<"vcopy_lane", "..I.I", "b", OP_COPY_LN>; def COPYQ_LANE_BF16 : IOpInst<"vcopy_lane", "..IqI", "Qb", OP_COPY_LN>; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 1b25d365932c3..063cd0cc09fc6 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -7307,7 +7307,6 @@ static const ARMVectorIntrinsicInfo ARMSIMDIntrinsicMap [] = { }; static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = { - NEONMAP1(__a64_vcvtq_low_bf16_f32, aarch64_neon_bfcvtn, 0), NEONMAP0(splat_lane_v), NEONMAP0(splat_laneq_v), NEONMAP0(splatq_lane_v), @@ -7407,7 +7406,8 @@ static const ARMVectorIntrinsicInfo AArch64SIMDIntrinsicMap[] = { NEONMAP0(vcvtq_f16_s16), NEONMAP0(vcvtq_f16_u16), NEONMAP0(vcvtq_f32_v), 
- NEONMAP1(vcvtq_high_bf16_f32, aarch64_neon_bfcvtn2, 0), + NEONMAP0(vcvtq_high_bf16_f32), + NEONMAP0(vcvtq_low_bf16_f32), NEONMAP1(vcvtq_n_f16_s16, aarch64_neon_vcvtfxs2fp, 0), NEONMAP1(vcvtq_n_f16_u16, aarch64_neon_vcvtfxu2fp, 0), NEONMAP2(vcvtq_n_f32_v, aarch64_neon_vcvtfxu2fp, aarch64_neon_vcvtfxs2fp, 0), @@ -7616,7 +7616,7 @@ static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = { NEONMAP1(vcvtd_n_u64_f64, aarch64_neon_vcvtfp2fxu, AddRetType | Add1ArgType), NEONMAP1(vcvtd_s64_f64, aarch64_neon_fcvtzs, AddRetType | Add1ArgType), NEONMAP1(vcvtd_u64_f64, aarch64_neon_fcvtzu, AddRetType | Add1ArgType), - NEONMAP1(vcvth_bf16_f32, aarch64_neon_bfcvt, 0), + NEONMAP0(vcvth_bf16_f32), NEONMAP1(vcvtmd_s64_f64, aarch64_neon_fcvtms, AddRetType | Add1ArgType), NEONMAP1(vcvtmd_u64_f64, aarch64_neon_fcvtmu, AddRetType | Add1ArgType), NEONMAP1(vcvtms_s32_f32, aarch64_neon_fcvtms, AddRetType | Add1ArgType), @@ -12083,6 +12083,12 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, return ConstantInt::get(Builder.getInt32Ty(), 0); } + if (BuiltinID == NEON::BI__builtin_neon_vcvth_bf16_f32) + return Builder.CreateFPTrunc( + Builder.CreateBitCast(EmitScalarExpr(E->getArg(0)), + Builder.getFloatTy()), + Builder.getBFloatTy()); + // Handle MSVC intrinsics before argument evaluation to prevent double // evaluation. 
if (std::optional<MSVCIntrin> MsvcIntId = @@ -12808,6 +12814,35 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), "vgetq_lane"); } + case NEON::BI__builtin_neon_vcvt_bf16_f32: { + llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4); + llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4); + return Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16); + } + case NEON::BI__builtin_neon_vcvtq_low_bf16_f32: { + SmallVector<int, 8> ConcatMask(8); + std::iota(ConcatMask.begin(), ConcatMask.end(), 0); + llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4); + llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4); + llvm::Value *Trunc = + Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[0], V4F32), V4BF16); + return Builder.CreateShuffleVector( + Trunc, ConstantAggregateZero::get(V4BF16), ConcatMask); + } + case NEON::BI__builtin_neon_vcvtq_high_bf16_f32: { + SmallVector<int, 8> ConcatMask(8); + std::iota(ConcatMask.begin(), ConcatMask.end(), 0); + SmallVector<int, 4> LoMask(4); + std::iota(LoMask.begin(), LoMask.end(), 0); + llvm::Type *V4F32 = FixedVectorType::get(Builder.getFloatTy(), 4); + llvm::Type *V4BF16 = FixedVectorType::get(Builder.getBFloatTy(), 4); + llvm::Type *V8BF16 = FixedVectorType::get(Builder.getBFloatTy(), 8); + llvm::Value *Inactive = Builder.CreateShuffleVector( + Builder.CreateBitCast(Ops[0], V8BF16), LoMask); + llvm::Value *Trunc = + Builder.CreateFPTrunc(Builder.CreateBitCast(Ops[1], V4F32), V4BF16); + return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask); + } case clang::AArch64::BI_InterlockedAdd: case clang::AArch64::BI_InterlockedAdd64: { diff --git a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c index 51aa5aa758f0c..93f54c70c340d 100644 --- a/clang/test/CodeGen/arm-bf16-convert-intrinsics.c +++ b/clang/test/CodeGen/arm-bf16-convert-intrinsics.c @@ -223,10 
+223,8 @@ float32x4_t test_vcvtq_high_f32_bf16(bfloat16x8_t a) { // CHECK-A64-LABEL: @test_vcvt_bf16_f32( // CHECK-A64-NEXT: entry: // CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]]) -// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8> -// CHECK-A64-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]], <4 x i32> -// CHECK-A64-NEXT: ret <4 x bfloat> [[SHUFFLE_I]] +// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat> +// CHECK-A64-NEXT: ret <4 x bfloat> [[TMP1]] // // CHECK-A32-HARDFP-LABEL: @test_vcvt_bf16_f32( // CHECK-A32-HARDFP-NEXT: entry: @@ -263,9 +261,9 @@ bfloat16x4_t test_vcvt_bf16_f32(float32x4_t a) { // CHECK-A64-LABEL: @test_vcvtq_low_bf16_f32( // CHECK-A64-NEXT: entry: // CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F321_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float> [[A]]) -// CHECK-A64-NEXT: [[__A64_VCVTQ_LOW_BF16_F322_I:%.*]] = bitcast <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] to <16 x i8> -// CHECK-A64-NEXT: ret <8 x bfloat> [[__A64_VCVTQ_LOW_BF16_F321_I]] +// CHECK-A64-NEXT: [[TMP1:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat> +// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <4 x bfloat> [[TMP1]], <4 x bfloat> zeroinitializer, <8 x i32> +// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP2]] // // CHECK-A32-HARDFP-LABEL: @test_vcvtq_low_bf16_f32( // CHECK-A32-HARDFP-NEXT: entry: @@ -323,9 +321,10 @@ bfloat16x8_t test_vcvtq_low_bf16_f32(float32x4_t a) { // CHECK-A64-NEXT: entry: // CHECK-A64-NEXT: [[TMP0:%.*]] = bitcast <8 x bfloat> [[INACTIVE:%.*]] to <16 x i8> // CHECK-A64-NEXT: [[TMP1:%.*]] = bitcast <4 x float> [[A:%.*]] to <16 x i8> -// 
CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F322_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat> [[INACTIVE]], <4 x float> [[A]]) -// CHECK-A64-NEXT: [[VCVTQ_HIGH_BF16_F323_I:%.*]] = bitcast <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]] to <16 x i8> -// CHECK-A64-NEXT: ret <8 x bfloat> [[VCVTQ_HIGH_BF16_F322_I]] +// CHECK-A64-NEXT: [[TMP2:%.*]] = shufflevector <8 x bfloat> [[INACTIVE]], <8 x bfloat> poison, <4 x i32> +// CHECK-A64-NEXT: [[TMP3:%.*]] = fptrunc <4 x float> [[A]] to <4 x bfloat> +// CHECK-A64-NEXT: [[TMP4:%.*]] = shufflevector <4 x bfloat> [[TMP2]], <4 x bfloat> [[TMP3]], <8 x i32> +// CHECK-A64-NEXT: ret <8 x bfloat> [[TMP4]] // // CHECK-A32-HARDFP-LABEL: @test_vcvtq_high_bf16_f32( // CHECK-A32-HARDFP-NEXT: entry: @@ -404,8 +403,8 @@ bfloat16x8_t test_vcvtq_high_bf16_f32(bfloat16x8_t inactive, float32x4_t a) { // CHECK-A64-LABEL: @test_vcvth_bf16_f32( // CHECK-A64-NEXT: entry: -// CHECK-A64-NEXT: [[VCVTH_BF16_F32_I:%.*]] = call bfloat @llvm.aarch64.neon.bfcvt(float [[A:%.*]]) -// CHECK-A64-NEXT: ret bfloat [[VCVTH_BF16_F32_I]] +// CHECK-A64-NEXT: [[TMP0:%.*]] = fptrunc float [[A:%.*]] to bfloat +// CHECK-A64-NEXT: ret bfloat [[TMP0]] // // CHECK-A32-HARDFP-LABEL: @test_vcvth_bf16_f32( // CHECK-A32-HARDFP-NEXT: entry: diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index cc7a81e15f660..b31a65d9bcc02 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -538,17 +538,6 @@ let TargetPrefix = "aarch64", IntrProperties = [IntrNoMem] in { def int_aarch64_neon_bfmlalb : AdvSIMD_BF16FML_Intrinsic; def int_aarch64_neon_bfmlalt : AdvSIMD_BF16FML_Intrinsic; - - // v8.6-A Bfloat Intrinsics - def int_aarch64_neon_bfcvt - : DefaultAttrsIntrinsic<[llvm_bfloat_ty], [llvm_float_ty], [IntrNoMem]>; - def int_aarch64_neon_bfcvtn - : DefaultAttrsIntrinsic<[llvm_v8bf16_ty], [llvm_v4f32_ty], [IntrNoMem]>; - def int_aarch64_neon_bfcvtn2 - : 
DefaultAttrsIntrinsic<[llvm_v8bf16_ty], - [llvm_v8bf16_ty, llvm_v4f32_ty], - [IntrNoMem]>; - // v8.2-A FP16 Fused Multiply-Add Long def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic; def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic; diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 06e62bf7f9f75..be67bed087b81 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -45,6 +45,7 @@ #include "llvm/Support/Regex.h" #include "llvm/TargetParser/Triple.h" #include <cstring> +#include <numeric> using namespace llvm; @@ -828,6 +829,13 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, return true; } } + + // Changed in 20.0: bfcvt/bfcvtn/bfcvtn2 have been replaced with fptrunc. + if (Name.starts_with("bfcvt")) { + NewFn = nullptr; + return true; + } + return false; // No other 'aarch64.neon.*'. } if (Name.consume_front("sve.")) { @@ -4064,31 +4072,59 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, static Value *upgradeAArch64IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IRBuilder<> &Builder) { - Intrinsic::ID NewID = - StringSwitch<Intrinsic::ID>(Name) - .Case("sve.fcvt.bf16f32", Intrinsic::aarch64_sve_fcvt_bf16f32_v2) - .Case("sve.fcvtnt.bf16f32", Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2) - .Default(Intrinsic::not_intrinsic); - if (NewID == Intrinsic::not_intrinsic) - llvm_unreachable("Unhandled Intrinsic!"); - - SmallVector<Value *> Args(CI->args()); - - // The original intrinsics incorrectly used a predicate based on the smallest - // element type rather than the largest. 
- Type *BadPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 8); - Type *GoodPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 4); - - if (Args[1]->getType() != BadPredTy) - llvm_unreachable("Unexpected predicate type!"); - - Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool, - BadPredTy, Args[1]); - Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, - GoodPredTy, Args[1]); - - return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr, - CI->getName()); + if (Name.starts_with("neon.bfcvt")) { + if (Name.starts_with("neon.bfcvtn2")) { + SmallVector<int, 4> LoMask(4); + std::iota(LoMask.begin(), LoMask.end(), 0); + SmallVector<int, 8> ConcatMask(8); + std::iota(ConcatMask.begin(), ConcatMask.end(), 0); + Value *Inactive = Builder.CreateShuffleVector(CI->getOperand(0), LoMask); + Value *Trunc = + Builder.CreateFPTrunc(CI->getOperand(1), Inactive->getType()); + return Builder.CreateShuffleVector(Inactive, Trunc, ConcatMask); + } else if (Name.starts_with("neon.bfcvtn")) { + SmallVector<int, 8> ConcatMask(8); + std::iota(ConcatMask.begin(), ConcatMask.end(), 0); + Type *V4BF16 = + FixedVectorType::get(Type::getBFloatTy(F->getContext()), 4); + Value *Trunc = Builder.CreateFPTrunc(CI->getOperand(0), V4BF16); + return Builder.CreateShuffleVector( + Trunc, ConstantAggregateZero::get(V4BF16), ConcatMask); + } else { + return Builder.CreateFPTrunc(CI->getOperand(0), + Type::getBFloatTy(F->getContext())); + } + } else if (Name.starts_with("sve.fcvt")) { + Intrinsic::ID NewID = + StringSwitch<Intrinsic::ID>(Name) + .Case("sve.fcvt.bf16f32", Intrinsic::aarch64_sve_fcvt_bf16f32_v2) + .Case("sve.fcvtnt.bf16f32", + Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2) + .Default(Intrinsic::not_intrinsic); + if (NewID == Intrinsic::not_intrinsic) + llvm_unreachable("Unhandled Intrinsic!"); + + SmallVector<Value *> Args(CI->args()); + + // The original intrinsics incorrectly used a predicate based on the + // smallest element type 
rather than the largest. + Type *BadPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 8); + Type *GoodPredTy = ScalableVectorType::get(Builder.getInt1Ty(), 4); + + if (Args[1]->getType() != BadPredTy) + llvm_unreachable("Unexpected predicate type!"); + + Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool, + BadPredTy, Args[1]); + Args[1] = Builder.CreateIntrinsic( + Intrinsic::aarch64_sve_convert_from_svbool, GoodPredTy, Args[1]); + + return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr, + CI->getName()); + } + + llvm_unreachable("Unhandled Intrinsic!"); } static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F, diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 1ff8b77f88e27..6a3a9492e031c 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -9053,22 +9053,19 @@ class SIMDThreeSameVectorBF16MatrixMul let mayRaiseFPException = 1, Uses = [FPCR] in class SIMD_BFCVTN - : BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128, + : BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V64, "bfcvtn", ".4h", ".4s", - [(set (v8bf16 V128:$Rd), - (int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>; + [(set (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn)))]>; let mayRaiseFPException = 1, Uses = [FPCR] in class SIMD_BFCVTN2 : BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128, - "bfcvtn2", ".8h", ".4s", - [(set (v8bf16 V128:$dst), - (int_aarch64_neon_bfcvtn2 (v8bf16 V128:$Rd), (v4f32 V128:$Rn)))]>; + "bfcvtn2", ".8h", ".4s", []>; let mayRaiseFPException = 1, Uses = [FPCR] in class BF16ToSinglePrecision : I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "", - [(set (bf16 FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 FPR32:$Rn)))]>, + [(set (bf16 FPR16:$Rd), (any_fpround (f32 FPR32:$Rn)))]>, Sched<[WriteFCvt]> { bits<5> Rd; bits<5> Rn; diff --git 
a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 9c7dc7784e939..8cc4d327dec2b 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1454,8 +1454,8 @@ def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>; def BFCVTN : SIMD_BFCVTN; def BFCVTN2 : SIMD_BFCVTN2; -def : Pat<(v4bf16 (any_fpround (v4f32 V128:$Rn))), - (EXTRACT_SUBREG (BFCVTN V128:$Rn), dsub)>; +def : Pat<(concat_vectors (v4bf16 V64:$Rd), (any_fpround (v4f32 V128:$Rn))), + (BFCVTN2 (v8bf16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub)), V128:$Rn)>; // Vector-scalar BFDOT: // The second source operand of the 64-bit variant of BF16DOTlane is a 128-bit @@ -1477,8 +1477,6 @@ def : Pat<(v2f32 (int_aarch64_neon_bfdot let Predicates = [HasNEONandIsStreamingSafe, HasBF16] in { def BFCVT : BF16ToSinglePrecision<"bfcvt">; -// Round FP32 to BF16. -def : Pat<(bf16 (any_fpround (f32 FPR32:$Rn))), (BFCVT $Rn)>; } // ARMv8.6A AArch64 matrix multiplication @@ -10410,9 +10408,11 @@ multiclass PromoteUnaryv8f16Tov4f32 let Predicates = [HasBF16] in def : Pat<(InOp (v8bf16 V128:$Rn)), (v8bf16 (BFCVTN2 - (v8bf16 (BFCVTN - (v4f32 (OutInst - (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))), + (INSERT_SUBREG (IMPLICIT_DEF), + (v4bf16 (BFCVTN + (v4f32 (OutInst + (v4f32 (SHLLv4i16 (v4i16 (EXTRACT_SUBREG V128:$Rn, dsub)))))))), + dsub), (v4f32 (OutInst (v4f32 (SHLLv8i16 V128:$Rn))))))>; let Predicates = [HasNoBF16] in @@ -10447,10 +10447,12 @@ multiclass PromoteBinaryv8f16Tov4f32; diff --git a/llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll b/llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll index 9d4e79d38d5d1..64bc95f2f3890 100644 --- a/llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll +++ b/llvm/test/CodeGen/AArch64/bf16-convert-intrinsics.ll @@ -1,5 +1,8 @@ ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64 -mattr=+neon -mattr=+bf16 | FileCheck %s +; This test acts to test 
the old neon.bfcvt intrinsics, which are now +; autoupgraded to fptrunc operations. + declare bfloat @llvm.aarch64.neon.bfcvt(float) declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn(<4 x float>) declare <8 x bfloat> @llvm.aarch64.neon.bfcvtn2(<8 x bfloat>, <4 x float>) diff --git a/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll index 9b6e19eba3f4e..1cd0294b0083e 100644 --- a/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll +++ b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll @@ -22,7 +22,6 @@ define <4 x bfloat> @add_h(<4 x bfloat> %a, <4 x bfloat> %b) { ; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: fadd v0.4s, v0.4s, v1.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret entry: @@ -62,7 +61,6 @@ define <4 x bfloat> @sub_h(<4 x bfloat> %a, <4 x bfloat> %b) { ; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: fsub v0.4s, v0.4s, v1.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret entry: @@ -91,7 +89,6 @@ define <4 x bfloat> @mul_h(<4 x bfloat> %a, <4 x bfloat> %b) { ; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: fmul v0.4s, v0.4s, v1.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret entry: @@ -120,7 +117,6 @@ define <4 x bfloat> @div_h(<4 x bfloat> %a, <4 x bfloat> %b) { ; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: fdiv v0.4s, v0.4s, v1.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret entry: @@ -168,7 +164,6 @@ define <4 x bfloat> @s_to_h(<4 x float> %a) { ; CHECK-BF16-LABEL: s_to_h: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret %1 = fptrunc <4 x float> %a to <4 x 
bfloat> ret <4 x bfloat> %1 @@ -196,7 +191,6 @@ define <4 x bfloat> @d_to_h(<4 x double> %a) { ; CHECK-BF16-NEXT: fcvtxn v0.2s, v0.2d ; CHECK-BF16-NEXT: fcvtxn2 v0.4s, v1.2d ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret %1 = fptrunc <4 x double> %a to <4 x bfloat> ret <4 x bfloat> %1 @@ -262,7 +256,6 @@ define <4 x bfloat> @sitofp_i8(<4 x i8> %a) #0 { ; CHECK-BF16-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret %1 = sitofp <4 x i8> %a to <4 x bfloat> ret <4 x bfloat> %1 @@ -286,7 +279,6 @@ define <4 x bfloat> @sitofp_i16(<4 x i16> %a) #0 { ; CHECK-BF16-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret %1 = sitofp <4 x i16> %a to <4 x bfloat> ret <4 x bfloat> %1 @@ -309,7 +301,6 @@ define <4 x bfloat> @sitofp_i32(<4 x i32> %a) #0 { ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret %1 = sitofp <4 x i32> %a to <4 x bfloat> ret <4 x bfloat> %1 @@ -342,7 +333,6 @@ define <4 x bfloat> @sitofp_i64(<4 x i64> %a) #0 { ; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d ; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret %1 = sitofp <4 x i64> %a to <4 x bfloat> ret <4 x bfloat> %1 @@ -368,7 +358,6 @@ define <4 x bfloat> @uitofp_i8(<4 x i8> %a) #0 { ; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret %1 = uitofp <4 x i8> %a to <4 x bfloat> ret <4 x bfloat> %1 
@@ -393,7 +382,6 @@ define <4 x bfloat> @uitofp_i16(<4 x i16> %a) #0 { ; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret %1 = uitofp <4 x i16> %a to <4 x bfloat> ret <4 x bfloat> %1 @@ -416,7 +404,6 @@ define <4 x bfloat> @uitofp_i32(<4 x i32> %a) #0 { ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret %1 = uitofp <4 x i32> %a to <4 x bfloat> ret <4 x bfloat> %1 @@ -449,7 +436,6 @@ define <4 x bfloat> @uitofp_i64(<4 x i64> %a) #0 { ; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d ; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret %1 = uitofp <4 x i64> %a to <4 x bfloat> ret <4 x bfloat> %1 diff --git a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll index a609e33be935e..2eaa58de92807 100644 --- a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll +++ b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll @@ -221,9 +221,8 @@ define <8 x bfloat> @s_to_h(<8 x float> %a) { ; ; CHECK-BF16-LABEL: s_to_h: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0] +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s ; CHECK-BF16-NEXT: ret %1 = fptrunc <8 x float> %a to <8 x bfloat> ret <8 x bfloat> %1 @@ -257,13 +256,12 @@ define <8 x bfloat> @d_to_h(<8 x double> %a) { ; ; CHECK-BF16-LABEL: d_to_h: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: fcvtxn v2.2s, v2.2d ; CHECK-BF16-NEXT: fcvtxn v0.2s, v0.2d -; CHECK-BF16-NEXT: fcvtxn2 v2.4s, v3.2d +; CHECK-BF16-NEXT: fcvtxn v2.2s, v2.2d ; CHECK-BF16-NEXT: fcvtxn2 v0.4s, v1.2d -; CHECK-BF16-NEXT: bfcvtn v1.4h, v2.4s +; CHECK-BF16-NEXT: 
fcvtxn2 v2.4s, v3.2d ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0] +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v2.4s ; CHECK-BF16-NEXT: ret %1 = fptrunc <8 x double> %a to <8 x bfloat> ret <8 x bfloat> %1 @@ -334,7 +332,6 @@ define <4 x bfloat> @sitofp_v4i8(<4 x i8> %a) #0 { ; CHECK-BF16-NEXT: sshll v0.4s, v0.4h, #0 ; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret %1 = sitofp <4 x i8> %a to <4 x bfloat> ret <4 x bfloat> %1 @@ -363,13 +360,12 @@ define <8 x bfloat> @sitofp_v8i8(<8 x i8> %a) #0 { ; CHECK-BF16-LABEL: sitofp_v8i8: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-BF16-NEXT: sshll2 v1.4s, v0.8h, #0 -; CHECK-BF16-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BF16-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-BF16-NEXT: sshll2 v2.4s, v0.8h, #0 ; CHECK-BF16-NEXT: scvtf v1.4s, v1.4s -; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s -; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0] +; CHECK-BF16-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16-NEXT: scvtf v1.4s, v2.4s +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s ; CHECK-BF16-NEXT: ret %1 = sitofp <8 x i8> %a to <8 x bfloat> ret <8 x bfloat> %1 @@ -412,20 +408,18 @@ define <16 x bfloat> @sitofp_v16i8(<16 x i8> %a) #0 { ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: sshll2 v1.8h, v0.16b, #0 ; CHECK-BF16-NEXT: sshll v0.8h, v0.8b, #0 -; CHECK-BF16-NEXT: sshll2 v2.4s, v1.8h, #0 -; CHECK-BF16-NEXT: sshll v1.4s, v1.4h, #0 -; CHECK-BF16-NEXT: sshll2 v3.4s, v0.8h, #0 -; CHECK-BF16-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BF16-NEXT: sshll v2.4s, v1.4h, #0 +; CHECK-BF16-NEXT: sshll v3.4s, v0.4h, #0 +; CHECK-BF16-NEXT: sshll2 v4.4s, v1.8h, #0 +; CHECK-BF16-NEXT: sshll2 v5.4s, v0.8h, #0 ; CHECK-BF16-NEXT: scvtf v2.4s, v2.4s -; CHECK-BF16-NEXT: scvtf v1.4s, v1.4s ; CHECK-BF16-NEXT: scvtf v3.4s, v3.4s -; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s -; 
CHECK-BF16-NEXT: bfcvtn v2.4h, v2.4s -; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s -; CHECK-BF16-NEXT: bfcvtn v3.4h, v3.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: mov v1.d[1], v2.d[0] -; CHECK-BF16-NEXT: mov v0.d[1], v3.d[0] +; CHECK-BF16-NEXT: bfcvtn v1.4h, v2.4s +; CHECK-BF16-NEXT: scvtf v2.4s, v4.4s +; CHECK-BF16-NEXT: bfcvtn v0.4h, v3.4s +; CHECK-BF16-NEXT: scvtf v3.4s, v5.4s +; CHECK-BF16-NEXT: bfcvtn2 v1.8h, v2.4s +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v3.4s ; CHECK-BF16-NEXT: ret %1 = sitofp <16 x i8> %a to <16 x bfloat> ret <16 x bfloat> %1 @@ -452,13 +446,12 @@ define <8 x bfloat> @sitofp_i16(<8 x i16> %a) #0 { ; ; CHECK-BF16-LABEL: sitofp_i16: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: sshll2 v1.4s, v0.8h, #0 -; CHECK-BF16-NEXT: sshll v0.4s, v0.4h, #0 +; CHECK-BF16-NEXT: sshll v1.4s, v0.4h, #0 +; CHECK-BF16-NEXT: sshll2 v2.4s, v0.8h, #0 ; CHECK-BF16-NEXT: scvtf v1.4s, v1.4s -; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s -; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0] +; CHECK-BF16-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16-NEXT: scvtf v1.4s, v2.4s +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s ; CHECK-BF16-NEXT: ret %1 = sitofp <8 x i16> %a to <8 x bfloat> ret <8 x bfloat> %1 @@ -483,11 +476,10 @@ define <8 x bfloat> @sitofp_i32(<8 x i32> %a) #0 { ; ; CHECK-BF16-LABEL: sitofp_i32: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: scvtf v1.4s, v1.4s ; CHECK-BF16-NEXT: scvtf v0.4s, v0.4s -; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s +; CHECK-BF16-NEXT: scvtf v1.4s, v1.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0] +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s ; CHECK-BF16-NEXT: ret %1 = sitofp <8 x i32> %a to <8 x bfloat> ret <8 x bfloat> %1 @@ -526,17 +518,16 @@ define <8 x bfloat> @sitofp_i64(<8 x i64> %a) #0 { ; ; CHECK-BF16-LABEL: sitofp_i64: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: scvtf v2.2d, v2.2d ; CHECK-BF16-NEXT: scvtf v0.2d, v0.2d -; CHECK-BF16-NEXT: scvtf 
v3.2d, v3.2d +; CHECK-BF16-NEXT: scvtf v2.2d, v2.2d ; CHECK-BF16-NEXT: scvtf v1.2d, v1.2d -; CHECK-BF16-NEXT: fcvtn v2.2s, v2.2d +; CHECK-BF16-NEXT: scvtf v3.2d, v3.2d ; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d -; CHECK-BF16-NEXT: fcvtn2 v2.4s, v3.2d +; CHECK-BF16-NEXT: fcvtn v2.2s, v2.2d ; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-BF16-NEXT: bfcvtn v1.4h, v2.4s +; CHECK-BF16-NEXT: fcvtn2 v2.4s, v3.2d ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0] +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v2.4s ; CHECK-BF16-NEXT: ret %1 = sitofp <8 x i64> %a to <8 x bfloat> ret <8 x bfloat> %1 @@ -562,7 +553,6 @@ define <4 x bfloat> @uitofp_v4i8(<4 x i8> %a) #0 { ; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0 ; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-BF16-NEXT: ret %1 = uitofp <4 x i8> %a to <4 x bfloat> ret <4 x bfloat> %1 @@ -591,13 +581,12 @@ define <8 x bfloat> @uitofp_v8i8(<8 x i8> %a) #0 { ; CHECK-BF16-LABEL: uitofp_v8i8: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-BF16-NEXT: ushll2 v1.4s, v0.8h, #0 -; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BF16-NEXT: ushll v1.4s, v0.4h, #0 +; CHECK-BF16-NEXT: ushll2 v2.4s, v0.8h, #0 ; CHECK-BF16-NEXT: ucvtf v1.4s, v1.4s -; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s -; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0] +; CHECK-BF16-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16-NEXT: ucvtf v1.4s, v2.4s +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s ; CHECK-BF16-NEXT: ret %1 = uitofp <8 x i8> %a to <8 x bfloat> ret <8 x bfloat> %1 @@ -640,20 +629,18 @@ define <16 x bfloat> @uitofp_v16i8(<16 x i8> %a) #0 { ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: ushll2 v1.8h, v0.16b, #0 ; CHECK-BF16-NEXT: ushll v0.8h, v0.8b, #0 -; CHECK-BF16-NEXT: ushll2 v2.4s, v1.8h, #0 -; CHECK-BF16-NEXT: ushll v1.4s, v1.4h, #0 -; CHECK-BF16-NEXT: ushll2 v3.4s, 
v0.8h, #0 -; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BF16-NEXT: ushll v2.4s, v1.4h, #0 +; CHECK-BF16-NEXT: ushll v3.4s, v0.4h, #0 +; CHECK-BF16-NEXT: ushll2 v4.4s, v1.8h, #0 +; CHECK-BF16-NEXT: ushll2 v5.4s, v0.8h, #0 ; CHECK-BF16-NEXT: ucvtf v2.4s, v2.4s -; CHECK-BF16-NEXT: ucvtf v1.4s, v1.4s ; CHECK-BF16-NEXT: ucvtf v3.4s, v3.4s -; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s -; CHECK-BF16-NEXT: bfcvtn v2.4h, v2.4s -; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s -; CHECK-BF16-NEXT: bfcvtn v3.4h, v3.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: mov v1.d[1], v2.d[0] -; CHECK-BF16-NEXT: mov v0.d[1], v3.d[0] +; CHECK-BF16-NEXT: bfcvtn v1.4h, v2.4s +; CHECK-BF16-NEXT: ucvtf v2.4s, v4.4s +; CHECK-BF16-NEXT: bfcvtn v0.4h, v3.4s +; CHECK-BF16-NEXT: ucvtf v3.4s, v5.4s +; CHECK-BF16-NEXT: bfcvtn2 v1.8h, v2.4s +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v3.4s ; CHECK-BF16-NEXT: ret %1 = uitofp <16 x i8> %a to <16 x bfloat> ret <16 x bfloat> %1 @@ -681,13 +668,12 @@ define <8 x bfloat> @uitofp_i16(<8 x i16> %a) #0 { ; ; CHECK-BF16-LABEL: uitofp_i16: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: ushll2 v1.4s, v0.8h, #0 -; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BF16-NEXT: ushll v1.4s, v0.4h, #0 +; CHECK-BF16-NEXT: ushll2 v2.4s, v0.8h, #0 ; CHECK-BF16-NEXT: ucvtf v1.4s, v1.4s -; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s -; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s -; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0] +; CHECK-BF16-NEXT: bfcvtn v0.4h, v1.4s +; CHECK-BF16-NEXT: ucvtf v1.4s, v2.4s +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s ; CHECK-BF16-NEXT: ret %1 = uitofp <8 x i16> %a to <8 x bfloat> ret <8 x bfloat> %1 @@ -713,11 +699,10 @@ define <8 x bfloat> @uitofp_i32(<8 x i32> %a) #0 { ; ; CHECK-BF16-LABEL: uitofp_i32: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: ucvtf v1.4s, v1.4s ; CHECK-BF16-NEXT: ucvtf v0.4s, v0.4s -; CHECK-BF16-NEXT: bfcvtn v1.4h, v1.4s +; CHECK-BF16-NEXT: ucvtf v1.4s, v1.4s ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; 
CHECK-BF16-NEXT: mov v0.d[1], v1.d[0] +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s ; CHECK-BF16-NEXT: ret %1 = uitofp <8 x i32> %a to <8 x bfloat> ret <8 x bfloat> %1 @@ -756,17 +741,16 @@ define <8 x bfloat> @uitofp_i64(<8 x i64> %a) #0 { ; ; CHECK-BF16-LABEL: uitofp_i64: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: ucvtf v2.2d, v2.2d ; CHECK-BF16-NEXT: ucvtf v0.2d, v0.2d -; CHECK-BF16-NEXT: ucvtf v3.2d, v3.2d +; CHECK-BF16-NEXT: ucvtf v2.2d, v2.2d ; CHECK-BF16-NEXT: ucvtf v1.2d, v1.2d -; CHECK-BF16-NEXT: fcvtn v2.2s, v2.2d +; CHECK-BF16-NEXT: ucvtf v3.2d, v3.2d ; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d -; CHECK-BF16-NEXT: fcvtn2 v2.4s, v3.2d +; CHECK-BF16-NEXT: fcvtn v2.2s, v2.2d ; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-BF16-NEXT: bfcvtn v1.4h, v2.4s +; CHECK-BF16-NEXT: fcvtn2 v2.4s, v3.2d ; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: mov v0.d[1], v1.d[0] +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v2.4s ; CHECK-BF16-NEXT: ret %1 = uitofp <8 x i64> %a to <8 x bfloat> ret <8 x bfloat> %1