diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 21622ea43724c..3e43299bb8110 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -296,8 +296,21 @@ void IRTranslator::addMachineCFGPred(CFGEdge Edge, MachineBasicBlock *NewPred) { MachinePreds[Edge].push_back(NewPred); } +static bool containsBF16Type(const User &U) { + // LLT cannot currently represent bfloat, so to avoid miscompiles callers bail + // out on any user whose result or any operand has a bfloat scalar type. + // FIXME: This can be removed once LLT supports bfloat. + return U.getType()->getScalarType()->isBFloatTy() || + any_of(U.operands(), [](Value *V) { + return V->getType()->getScalarType()->isBFloatTy(); + }); +} + bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { + if (containsBF16Type(U)) + return false; + // Get or create a virtual register for each value. // Unless the value is a Constant => loadimm cst? // or inline constant each time? 
@@ -317,6 +330,9 @@ bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U, bool IRTranslator::translateUnaryOp(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { + if (containsBF16Type(U)) + return false; + Register Op0 = getOrCreateVReg(*U.getOperand(0)); Register Res = getOrCreateVReg(U); uint32_t Flags = 0; @@ -334,6 +350,9 @@ bool IRTranslator::translateFNeg(const User &U, MachineIRBuilder &MIRBuilder) { bool IRTranslator::translateCompare(const User &U, MachineIRBuilder &MIRBuilder) { + if (containsBF16Type(U)) + return false; + auto *CI = cast<CmpInst>(&U); Register Op0 = getOrCreateVReg(*U.getOperand(0)); Register Op1 = getOrCreateVReg(*U.getOperand(1)); @@ -1553,8 +1572,7 @@ bool IRTranslator::translateBitCast(const User &U, bool IRTranslator::translateCast(unsigned Opcode, const User &U, MachineIRBuilder &MIRBuilder) { - if (U.getType()->getScalarType()->isBFloatTy() || - U.getOperand(0)->getType()->getScalarType()->isBFloatTy()) + if (containsBF16Type(U)) return false; uint32_t Flags = 0; @@ -2647,6 +2665,8 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, bool IRTranslator::translateInlineAsm(const CallBase &CB, MachineIRBuilder &MIRBuilder) { + if (containsBF16Type(CB)) + return false; const InlineAsmLowering *ALI = MF->getSubtarget().getInlineAsmLowering(); @@ -2736,6 +2756,9 @@ bool IRTranslator::translateCallBase(const CallBase &CB, } bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) { + if (containsBF16Type(U)) + return false; + const CallInst &CI = cast<CallInst>(U); auto TII = MF->getTarget().getIntrinsicInfo(); const Function *F = CI.getCalledFunction(); @@ -3371,6 +3394,9 @@ bool IRTranslator::translateAtomicCmpXchg(const User &U, bool IRTranslator::translateAtomicRMW(const User &U, MachineIRBuilder &MIRBuilder) { + if (containsBF16Type(U)) + return false; + const AtomicRMWInst &I = cast<AtomicRMWInst>(U); auto Flags = TLI->getAtomicMemOperandFlags(I, *DL); diff --git 
a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index d8d9f38da3eae..847a1aef39c56 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -1017,7 +1017,7 @@ bool TargetPassConfig::addCoreISelPasses() { if (Selector != SelectorType::GlobalISel || !isGlobalISelAbortEnabled()) DebugifyIsSafe = false; - // Add instruction selector passes. + // Add instruction selector passes for global isel if enabled. if (Selector == SelectorType::GlobalISel) { SaveAndRestore SavedAddingMachinePasses(AddingMachinePasses, true); if (addIRTranslator()) @@ -1043,15 +1043,14 @@ bool TargetPassConfig::addCoreISelPasses() { // Pass to reset the MachineFunction if the ISel failed. addPass(createResetMachineFunctionPass( reportDiagnosticWhenGlobalISelFallback(), isGlobalISelAbortEnabled())); + } - // Provide a fallback path when we do not want to abort on - // not-yet-supported input. - if (!isGlobalISelAbortEnabled() && addInstSelector()) + // Run the SDAG InstSelector, providing a fallback path when we do not want to + // abort on not-yet-supported input. + if (Selector != SelectorType::GlobalISel || !isGlobalISelAbortEnabled()) + if (addInstSelector()) return true; - } else if (addInstSelector()) - return true; - // Expand pseudo-instructions emitted by ISel. Don't run the verifier before // FinalizeISel. 
addPass(&FinalizeISelID); diff --git a/llvm/test/CodeGen/AArch64/bf16-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-instructions.ll index ecf64ecbbd3ff..2fc9c53112ab6 100644 --- a/llvm/test/CodeGen/AArch64/bf16-instructions.ll +++ b/llvm/test/CodeGen/AArch64/bf16-instructions.ll @@ -3,7 +3,34 @@ ; RUN: llc < %s -mtriple aarch64-unknown-unknown -mattr=+bf16 | FileCheck %s --check-prefixes=CHECK,CHECK-BF16,CHECK-SD ; RUN: llc < %s -mtriple aarch64-unknown-unknown -mattr=+bf16,+fullfp16 -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-BF16,CHECK-GI -; CHECK-GI: warning: Instruction selection used fallback path for test_fptosi_i32 +; CHECK-GI: warning: Instruction selection used fallback path for test_fadd +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fsub +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fmul +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fmadd +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fdiv +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_frem +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_call +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_call_flipped +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_tailcall_flipped +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_select_cc +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_select_cc_f32_f16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_une +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ueq +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ugt +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_uge +; CHECK-GI-NEXT: warning: Instruction 
selection used fallback path for test_fcmp_ult +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ule +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_uno +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_one +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_oeq +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ogt +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_oge +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_olt +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ole +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fcmp_ord +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fccmp +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_br_cc +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fptosi_i32 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fptosi_i64 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fptoui_i32 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fptoui_i64 @@ -17,9 +44,40 @@ ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fptrunc_double ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fpext_float ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fpext_double +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_sqrt +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_powi +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_sin +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_cos +; CHECK-GI-NEXT: warning: Instruction selection 
used fallback path for test_tan +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_acos +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_asin +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_atan +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_atan2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_cosh +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_sinh +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_tanh +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_pow +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_exp +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_exp2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_log +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_log10 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_log2 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fma +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fabs +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_minnum +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_maxnum +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_copysign ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_copysign_f32 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_copysign_f64 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_copysign_extended +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_floor +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_ceil +; CHECK-GI-NEXT: warning: Instruction selection 
used fallback path for test_trunc +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_rint +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_nearbyint +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_round +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_roundeven +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_fmuladd define bfloat @test_fadd(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-LABEL: test_fadd: @@ -39,20 +97,15 @@ define bfloat @test_fadd(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_fadd: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fadd s0, s0, s1 -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_fadd: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fadd h0, h0, h1 -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_fadd: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fadd s0, s0, s1 +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ret %r = fadd bfloat %a, %b ret bfloat %r } @@ -75,20 +128,15 @@ define bfloat @test_fsub(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_fsub: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fsub s0, s0, s1 -; CHECK-SD-NEXT: bfcvt h0, s0 -; 
CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_fsub: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fsub h0, h0, h1 -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_fsub: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fsub s0, s0, s1 +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ret %r = fsub bfloat %a, %b ret bfloat %r } @@ -111,20 +159,15 @@ define bfloat @test_fmul(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_fmul: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fmul s0, s0, s1 -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_fmul: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmul h0, h0, h1 -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_fmul: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fmul s0, s0, s1 +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ret %r = fmul bfloat %a, %b ret bfloat %r } @@ -157,25 +200,20 @@ define bfloat @test_fmadd(bfloat %a, bfloat %b, bfloat %c) #0 { ; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_fmadd: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: // kill: def $h2 killed $h2 def $d2 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fmul s0, s0, 
s1 -; CHECK-SD-NEXT: shll v1.4s, v2.4h, #16 -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fadd s0, s0, s1 -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_fmadd: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmadd h0, h0, h1, h2 -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_fmadd: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fmul s0, s0, s1 +; CHECK-BF16-NEXT: shll v1.4s, v2.4h, #16 +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fadd s0, s0, s1 +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ret %mul = fmul fast bfloat %a, %b %r = fadd fast bfloat %mul, %c ret bfloat %r @@ -199,20 +237,15 @@ define bfloat @test_fdiv(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_fdiv: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fdiv s0, s0, s1 -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_fdiv: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fdiv h0, h0, h1 -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_fdiv: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fdiv s0, s0, s1 +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ret %r = fdiv bfloat %a, %b ret bfloat %r } @@ -239,29 +272,19 @@ define bfloat @test_frem(bfloat %a, 
bfloat %b) #0 { ; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_frem: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-SD-NEXT: // kill: def $s1 killed $s1 killed $q1 -; CHECK-SD-NEXT: bl fmodf -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_frem: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: bl fmodf -; CHECK-GI-NEXT: bfcvt h0, s0 -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_frem: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-BF16-NEXT: // kill: def $s1 killed $s1 killed $q1 +; CHECK-BF16-NEXT: bl fmodf +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-BF16-NEXT: ret %r = frem bfloat %a, %b ret bfloat %r } @@ -385,14 +408,12 @@ define bfloat @test_select_cc(bfloat %a, bfloat %b, bfloat %c, bfloat %d) #0 { ; ; CHECK-GI-LABEL: test_select_cc: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-GI-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-GI-NEXT: fcmp h2, h3 -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: csel w8, w8, w9, ne -; CHECK-GI-NEXT: fmov s0, w8 -; CHECK-GI-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-GI-NEXT: // kill: def $h3 killed $h3 def $d3 +; CHECK-GI-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-GI-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-GI-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-GI-NEXT: fcmp s2, s3 +; CHECK-GI-NEXT: fcsel h0, h0, h1, ne ; CHECK-GI-NEXT: ret %cc = fcmp une bfloat %c, %d %r = select i1 %cc, bfloat %a, bfloat %b @@ -400,31 +421,15 @@ define bfloat @test_select_cc(bfloat %a, bfloat %b, bfloat %c, bfloat %d) #0 { } define float @test_select_cc_f32_f16(float %a, float %b, bfloat %c, bfloat %d) #0 { -; CHECK-CVT-LABEL: test_select_cc_f32_f16: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h3 killed $h3 def $d3 -; CHECK-CVT-NEXT: // kill: def $h2 killed $h2 def $d2 -; CHECK-CVT-NEXT: shll v3.4s, v3.4h, #16 -; CHECK-CVT-NEXT: shll v2.4s, v2.4h, #16 -; CHECK-CVT-NEXT: fcmp s2, s3 -; CHECK-CVT-NEXT: fcsel s0, s0, s1, ne -; CHECK-CVT-NEXT: ret -; -; CHECK-SD-LABEL: test_select_cc_f32_f16: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: 
def $h3 killed $h3 def $d3 -; CHECK-SD-NEXT: // kill: def $h2 killed $h2 def $d2 -; CHECK-SD-NEXT: shll v3.4s, v3.4h, #16 -; CHECK-SD-NEXT: shll v2.4s, v2.4h, #16 -; CHECK-SD-NEXT: fcmp s2, s3 -; CHECK-SD-NEXT: fcsel s0, s0, s1, ne -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_select_cc_f32_f16: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fcmp h2, h3 -; CHECK-GI-NEXT: fcsel s0, s0, s1, ne -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_select_cc_f32_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h3 killed $h3 def $d3 +; CHECK-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: fcmp s2, s3 +; CHECK-NEXT: fcsel s0, s0, s1, ne +; CHECK-NEXT: ret %cc = fcmp une bfloat %c, %d %r = select i1 %cc, float %a, float %b ret float %r @@ -466,429 +471,199 @@ define bfloat @test_select_cc_f16_f32(bfloat %a, bfloat %b, float %c, float %d) } define i1 @test_fcmp_une(bfloat %a, bfloat %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_une: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fcmp s0, s1 -; CHECK-CVT-NEXT: cset w0, ne -; CHECK-CVT-NEXT: ret -; -; CHECK-SD-LABEL: test_fcmp_une: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fcmp s0, s1 -; CHECK-SD-NEXT: cset w0, ne -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_fcmp_une: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fcmp h0, h1 -; CHECK-GI-NEXT: cset w0, ne -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_fcmp_une: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll 
v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, ne +; CHECK-NEXT: ret %r = fcmp une bfloat %a, %b ret i1 %r } define i1 @test_fcmp_ueq(bfloat %a, bfloat %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ueq: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fcmp s0, s1 -; CHECK-CVT-NEXT: cset w8, eq -; CHECK-CVT-NEXT: csinc w0, w8, wzr, vc -; CHECK-CVT-NEXT: ret -; -; CHECK-SD-LABEL: test_fcmp_ueq: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fcmp s0, s1 -; CHECK-SD-NEXT: cset w8, eq -; CHECK-SD-NEXT: csinc w0, w8, wzr, vc -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_fcmp_ueq: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fcmp h0, h1 -; CHECK-GI-NEXT: cset w8, eq -; CHECK-GI-NEXT: cset w9, vs -; CHECK-GI-NEXT: orr w0, w8, w9 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_fcmp_ueq: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w8, eq +; CHECK-NEXT: csinc w0, w8, wzr, vc +; CHECK-NEXT: ret %r = fcmp ueq bfloat %a, %b ret i1 %r } define i1 @test_fcmp_ugt(bfloat %a, bfloat %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ugt: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fcmp s0, s1 -; CHECK-CVT-NEXT: cset w0, hi -; CHECK-CVT-NEXT: ret -; -; CHECK-SD-LABEL: test_fcmp_ugt: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def 
$h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fcmp s0, s1 -; CHECK-SD-NEXT: cset w0, hi -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_fcmp_ugt: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fcmp h0, h1 -; CHECK-GI-NEXT: cset w0, hi -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_fcmp_ugt: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, hi +; CHECK-NEXT: ret %r = fcmp ugt bfloat %a, %b ret i1 %r } define i1 @test_fcmp_uge(bfloat %a, bfloat %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_uge: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fcmp s0, s1 -; CHECK-CVT-NEXT: cset w0, pl -; CHECK-CVT-NEXT: ret -; -; CHECK-SD-LABEL: test_fcmp_uge: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fcmp s0, s1 -; CHECK-SD-NEXT: cset w0, pl -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_fcmp_uge: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fcmp h0, h1 -; CHECK-GI-NEXT: cset w0, pl -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_fcmp_uge: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, pl +; CHECK-NEXT: ret %r = fcmp uge bfloat %a, %b ret i1 %r } define i1 @test_fcmp_ult(bfloat %a, bfloat %b) #0 { -; 
CHECK-CVT-LABEL: test_fcmp_ult: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fcmp s0, s1 -; CHECK-CVT-NEXT: cset w0, lt -; CHECK-CVT-NEXT: ret -; -; CHECK-SD-LABEL: test_fcmp_ult: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fcmp s0, s1 -; CHECK-SD-NEXT: cset w0, lt -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_fcmp_ult: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fcmp h0, h1 -; CHECK-GI-NEXT: cset w0, lt -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_fcmp_ult: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, lt +; CHECK-NEXT: ret %r = fcmp ult bfloat %a, %b ret i1 %r } define i1 @test_fcmp_ule(bfloat %a, bfloat %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ule: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fcmp s0, s1 -; CHECK-CVT-NEXT: cset w0, le -; CHECK-CVT-NEXT: ret -; -; CHECK-SD-LABEL: test_fcmp_ule: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fcmp s0, s1 -; CHECK-SD-NEXT: cset w0, le -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_fcmp_ule: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fcmp h0, h1 -; CHECK-GI-NEXT: cset w0, le -; 
CHECK-GI-NEXT: ret +; CHECK-LABEL: test_fcmp_ule: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, le +; CHECK-NEXT: ret %r = fcmp ule bfloat %a, %b ret i1 %r } define i1 @test_fcmp_uno(bfloat %a, bfloat %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_uno: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fcmp s0, s1 -; CHECK-CVT-NEXT: cset w0, vs -; CHECK-CVT-NEXT: ret -; -; CHECK-SD-LABEL: test_fcmp_uno: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fcmp s0, s1 -; CHECK-SD-NEXT: cset w0, vs -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_fcmp_uno: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fcmp h0, h1 -; CHECK-GI-NEXT: cset w0, vs -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_fcmp_uno: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, vs +; CHECK-NEXT: ret %r = fcmp uno bfloat %a, %b ret i1 %r } define i1 @test_fcmp_one(bfloat %a, bfloat %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_one: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fcmp s0, s1 -; CHECK-CVT-NEXT: cset w8, mi -; CHECK-CVT-NEXT: csinc w0, w8, wzr, le -; CHECK-CVT-NEXT: ret 
-; -; CHECK-SD-LABEL: test_fcmp_one: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fcmp s0, s1 -; CHECK-SD-NEXT: cset w8, mi -; CHECK-SD-NEXT: csinc w0, w8, wzr, le -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_fcmp_one: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fcmp h0, h1 -; CHECK-GI-NEXT: cset w8, mi -; CHECK-GI-NEXT: cset w9, gt -; CHECK-GI-NEXT: orr w0, w8, w9 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_fcmp_one: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w8, mi +; CHECK-NEXT: csinc w0, w8, wzr, le +; CHECK-NEXT: ret %r = fcmp one bfloat %a, %b ret i1 %r } define i1 @test_fcmp_oeq(bfloat %a, bfloat %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_oeq: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fcmp s0, s1 -; CHECK-CVT-NEXT: cset w0, eq -; CHECK-CVT-NEXT: ret -; -; CHECK-SD-LABEL: test_fcmp_oeq: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fcmp s0, s1 -; CHECK-SD-NEXT: cset w0, eq -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_fcmp_oeq: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fcmp h0, h1 -; CHECK-GI-NEXT: cset w0, eq -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_fcmp_oeq: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, 
v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, eq +; CHECK-NEXT: ret %r = fcmp oeq bfloat %a, %b ret i1 %r } define i1 @test_fcmp_ogt(bfloat %a, bfloat %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ogt: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fcmp s0, s1 -; CHECK-CVT-NEXT: cset w0, gt -; CHECK-CVT-NEXT: ret -; -; CHECK-SD-LABEL: test_fcmp_ogt: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fcmp s0, s1 -; CHECK-SD-NEXT: cset w0, gt -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_fcmp_ogt: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fcmp h0, h1 -; CHECK-GI-NEXT: cset w0, gt -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_fcmp_ogt: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, gt +; CHECK-NEXT: ret %r = fcmp ogt bfloat %a, %b ret i1 %r } define i1 @test_fcmp_oge(bfloat %a, bfloat %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_oge: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fcmp s0, s1 -; CHECK-CVT-NEXT: cset w0, ge -; CHECK-CVT-NEXT: ret -; -; CHECK-SD-LABEL: test_fcmp_oge: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: shll v0.4s, 
v0.4h, #16 -; CHECK-SD-NEXT: fcmp s0, s1 -; CHECK-SD-NEXT: cset w0, ge -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_fcmp_oge: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fcmp h0, h1 -; CHECK-GI-NEXT: cset w0, ge -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_fcmp_oge: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, ge +; CHECK-NEXT: ret %r = fcmp oge bfloat %a, %b ret i1 %r -} - -define i1 @test_fcmp_olt(bfloat %a, bfloat %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_olt: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fcmp s0, s1 -; CHECK-CVT-NEXT: cset w0, mi -; CHECK-CVT-NEXT: ret -; -; CHECK-SD-LABEL: test_fcmp_olt: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fcmp s0, s1 -; CHECK-SD-NEXT: cset w0, mi -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_fcmp_olt: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fcmp h0, h1 -; CHECK-GI-NEXT: cset w0, mi -; CHECK-GI-NEXT: ret - %r = fcmp olt bfloat %a, %b - ret i1 %r -} - -define i1 @test_fcmp_ole(bfloat %a, bfloat %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ole: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fcmp s0, s1 -; CHECK-CVT-NEXT: cset w0, ls -; CHECK-CVT-NEXT: ret -; -; CHECK-SD-LABEL: test_fcmp_ole: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; 
CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fcmp s0, s1 -; CHECK-SD-NEXT: cset w0, ls -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_fcmp_ole: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fcmp h0, h1 -; CHECK-GI-NEXT: cset w0, ls -; CHECK-GI-NEXT: ret +} + +define i1 @test_fcmp_olt(bfloat %a, bfloat %b) #0 { +; CHECK-LABEL: test_fcmp_olt: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, mi +; CHECK-NEXT: ret + %r = fcmp olt bfloat %a, %b + ret i1 %r +} + +define i1 @test_fcmp_ole(bfloat %a, bfloat %b) #0 { +; CHECK-LABEL: test_fcmp_ole: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, ls +; CHECK-NEXT: ret %r = fcmp ole bfloat %a, %b ret i1 %r } define i1 @test_fcmp_ord(bfloat %a, bfloat %b) #0 { -; CHECK-CVT-LABEL: test_fcmp_ord: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fcmp s0, s1 -; CHECK-CVT-NEXT: cset w0, vc -; CHECK-CVT-NEXT: ret -; -; CHECK-SD-LABEL: test_fcmp_ord: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fcmp s0, s1 -; CHECK-SD-NEXT: cset w0, vc -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_fcmp_ord: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fcmp h0, h1 -; CHECK-GI-NEXT: cset w0, 
vc -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_fcmp_ord: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: cset w0, vc +; CHECK-NEXT: ret %r = fcmp ord bfloat %a, %b ret i1 %r } @@ -924,15 +699,15 @@ define void @test_fccmp(bfloat %in, ptr %out) { ; ; CHECK-GI-LABEL: test_fccmp: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: movi v1.2s, #69, lsl #24 +; CHECK-GI-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-GI-NEXT: shll v2.4s, v0.4h, #16 +; CHECK-GI-NEXT: movi v3.2s, #72, lsl #24 +; CHECK-GI-NEXT: fcmp s2, s1 ; CHECK-GI-NEXT: fmov h1, #5.00000000 -; CHECK-GI-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-GI-NEXT: fmov h2, #8.00000000 -; CHECK-GI-NEXT: fmov w8, s0 -; CHECK-GI-NEXT: fcmp h0, h1 -; CHECK-GI-NEXT: fmov w9, s1 -; CHECK-GI-NEXT: fccmp h0, h2, #4, mi -; CHECK-GI-NEXT: csel w8, w8, w9, gt -; CHECK-GI-NEXT: strh w8, [x0] +; CHECK-GI-NEXT: fccmp s2, s3, #4, mi +; CHECK-GI-NEXT: fcsel h0, h0, h1, gt +; CHECK-GI-NEXT: str h0, [x0] ; CHECK-GI-NEXT: ret %cmp1 = fcmp ogt bfloat %in, 0xR4800 %cmp2 = fcmp olt bfloat %in, 0xR4500 @@ -943,34 +718,16 @@ define void @test_fccmp(bfloat %in, ptr %out) { } define void @test_br_cc(bfloat %a, bfloat %b, ptr %p1, ptr %p2) #0 { -; CHECK-CVT-LABEL: test_br_cc: -; CHECK-CVT: // %bb.0: // %common.ret -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-CVT-NEXT: fcmp s0, s1 -; CHECK-CVT-NEXT: csel x8, x0, x1, pl -; CHECK-CVT-NEXT: str wzr, [x8] -; CHECK-CVT-NEXT: ret -; -; CHECK-SD-LABEL: test_br_cc: -; CHECK-SD: // %bb.0: // %common.ret -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; 
CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fcmp s0, s1 -; CHECK-SD-NEXT: csel x8, x0, x1, pl -; CHECK-SD-NEXT: str wzr, [x8] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_br_cc: -; CHECK-GI: // %bb.0: // %common.ret -; CHECK-GI-NEXT: fcmp h0, h1 -; CHECK-GI-NEXT: csel x8, x0, x1, pl -; CHECK-GI-NEXT: str wzr, [x8] -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_br_cc: +; CHECK: // %bb.0: // %common.ret +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 +; CHECK-NEXT: csel x8, x0, x1, pl +; CHECK-NEXT: str wzr, [x8] +; CHECK-NEXT: ret %c = fcmp uge bfloat %a, %b br i1 %c, label %then, label %else then: @@ -1426,18 +1183,13 @@ define bfloat @test_sqrt(bfloat %a) #0 { ; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_sqrt: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fsqrt s0, s0 -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_sqrt: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fsqrt h0, h0 -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_sqrt: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fsqrt s0, s0 +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.sqrt.f16(bfloat %a) ret bfloat %r } @@ -1461,25 +1213,16 @@ define bfloat @test_powi(bfloat %a, i32 %b) #0 { ; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_powi: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-SD-NEXT: bl __powisf2 -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_powi: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: bl __powisf2 -; CHECK-GI-NEXT: bfcvt h0, s0 -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_powi: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-BF16-NEXT: bl __powisf2 +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.powi.f16.i32(bfloat %a, i32 %b) ret bfloat %r } @@ -1504,25 +1247,16 @@ define bfloat @test_sin(bfloat %a) #0 { ; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_sin: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-SD-NEXT: bl sinf -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_sin: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: bl sinf -; CHECK-GI-NEXT: bfcvt h0, s0 -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_sin: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-BF16-NEXT: bl sinf +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.sin.f16(bfloat %a) ret bfloat %r } @@ -1546,25 +1280,16 @@ define bfloat @test_cos(bfloat %a) #0 { ; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_cos: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-SD-NEXT: bl cosf -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_cos: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: bl cosf -; CHECK-GI-NEXT: bfcvt h0, s0 -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_cos: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-BF16-NEXT: bl cosf +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.cos.f16(bfloat %a) ret bfloat %r } @@ -1588,25 +1313,16 @@ define bfloat @test_tan(bfloat %a) #0 { ; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_tan: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-SD-NEXT: bl tanf -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_tan: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: bl tanf -; CHECK-GI-NEXT: bfcvt h0, s0 -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_tan: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-BF16-NEXT: bl tanf +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.tan.f16(bfloat %a) ret bfloat %r } @@ -1630,25 +1346,16 @@ define bfloat @test_acos(bfloat %a) #0 { ; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_acos: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-SD-NEXT: bl acosf -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_acos: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: bl acosf -; CHECK-GI-NEXT: bfcvt h0, s0 -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_acos: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-BF16-NEXT: bl acosf +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.acos.f16(bfloat %a) ret bfloat %r } @@ -1672,25 +1379,16 @@ define bfloat @test_asin(bfloat %a) #0 { ; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_asin: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-SD-NEXT: bl asinf -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_asin: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: bl asinf -; CHECK-GI-NEXT: bfcvt h0, s0 -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_asin: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-BF16-NEXT: bl asinf +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.asin.f16(bfloat %a) ret bfloat %r } @@ -1714,25 +1412,16 @@ define bfloat @test_atan(bfloat %a) #0 { ; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_atan: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-SD-NEXT: bl atanf -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_atan: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: bl atanf -; CHECK-GI-NEXT: bfcvt h0, s0 -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_atan: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-BF16-NEXT: bl atanf +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.atan.f16(bfloat %a) ret bfloat %r } @@ -1759,29 +1448,19 @@ define bfloat @test_atan2(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_atan2: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-SD-NEXT: // kill: def $s1 killed $s1 killed $q1 -; CHECK-SD-NEXT: bl atan2f -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_atan2: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: bl atan2f -; CHECK-GI-NEXT: bfcvt h0, s0 -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_atan2: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-BF16-NEXT: // kill: def $s1 killed $s1 killed $q1 +; CHECK-BF16-NEXT: bl atan2f +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.atan2.f16(bfloat %a, bfloat %b) ret bfloat %r } @@ -1805,25 +1484,16 @@ define bfloat @test_cosh(bfloat %a) #0 { ; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_cosh: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-SD-NEXT: bl coshf -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_cosh: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: bl coshf -; CHECK-GI-NEXT: bfcvt h0, s0 -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_cosh: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-BF16-NEXT: bl coshf +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.cosh.f16(bfloat %a) ret bfloat %r } @@ -1847,25 +1517,16 @@ define bfloat @test_sinh(bfloat %a) #0 { ; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_sinh: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-SD-NEXT: bl sinhf -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_sinh: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: bl sinhf -; CHECK-GI-NEXT: bfcvt h0, s0 -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_sinh: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-BF16-NEXT: bl sinhf +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.sinh.f16(bfloat %a) ret bfloat %r } @@ -1889,25 +1550,16 @@ define bfloat @test_tanh(bfloat %a) #0 { ; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_tanh: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-SD-NEXT: bl tanhf -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_tanh: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: bl tanhf -; CHECK-GI-NEXT: bfcvt h0, s0 -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_tanh: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-BF16-NEXT: bl tanhf +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.tanh.f16(bfloat %a) ret bfloat %r } @@ -1934,29 +1586,19 @@ define bfloat @test_pow(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_pow: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-SD-NEXT: // kill: def $s1 killed $s1 killed $q1 -; CHECK-SD-NEXT: bl powf -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_pow: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: fcvt s1, h1 -; CHECK-GI-NEXT: bl powf -; CHECK-GI-NEXT: bfcvt h0, s0 -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_pow: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-BF16-NEXT: // kill: def $s1 killed $s1 killed $q1 +; CHECK-BF16-NEXT: bl powf +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.pow.f16(bfloat %a, bfloat %b) ret bfloat %r } @@ -1973,32 +1615,23 @@ define bfloat @test_exp(bfloat %a) #0 { ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 ; CHECK-CVT-NEXT: add w8, w9, w8 -; CHECK-CVT-NEXT: add w8, w10, w8 -; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 -; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-CVT-NEXT: ret -; -; CHECK-SD-LABEL: test_exp: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-SD-NEXT: bl expf -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-SD-NEXT: ret +; CHECK-CVT-NEXT: add w8, w10, w8 +; CHECK-CVT-NEXT: lsr w8, w8, #16 +; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-CVT-NEXT: ret ; -; CHECK-GI-LABEL: test_exp: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: bl expf -; CHECK-GI-NEXT: bfcvt h0, s0 -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_exp: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-BF16-NEXT: bl expf +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.exp.f16(bfloat %a) ret bfloat %r } @@ -2022,25 +1655,16 @@ define bfloat @test_exp2(bfloat %a) #0 { ; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_exp2: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-SD-NEXT: bl exp2f -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_exp2: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: bl exp2f -; CHECK-GI-NEXT: bfcvt h0, s0 -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_exp2: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-BF16-NEXT: bl exp2f +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.exp2.f16(bfloat %a) ret bfloat %r } @@ -2064,25 +1688,16 @@ define bfloat @test_log(bfloat %a) #0 { ; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_log: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-SD-NEXT: bl logf -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_log: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: bl logf -; CHECK-GI-NEXT: bfcvt h0, s0 -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_log: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-BF16-NEXT: bl logf +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.log.f16(bfloat %a) ret bfloat %r } @@ -2106,25 +1721,16 @@ define bfloat @test_log10(bfloat %a) #0 { ; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_log10: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-SD-NEXT: bl log10f -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_log10: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: bl log10f -; CHECK-GI-NEXT: bfcvt h0, s0 -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_log10: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-BF16-NEXT: bl log10f +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.log10.f16(bfloat %a) ret bfloat %r } @@ -2148,25 +1754,16 @@ define bfloat @test_log2(bfloat %a) #0 { ; CHECK-CVT-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_log2: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-SD-NEXT: bl log2f -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_log2: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-GI-NEXT: fcvt s0, h0 -; CHECK-GI-NEXT: bl log2f -; CHECK-GI-NEXT: bfcvt h0, s0 -; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_log2: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-BF16-NEXT: bl log2f +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.log2.f16(bfloat %a) ret bfloat %r } @@ -2191,49 +1788,30 @@ define bfloat @test_fma(bfloat %a, bfloat %b, bfloat %c) #0 { ; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_fma: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h2 killed $h2 def $d2 -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v2.4s, v2.4h, #16 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fmadd s0, s0, s1, s2 -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_fma: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmadd h0, h0, h1, h2 -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_fma: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // 
kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fmadd s0, s0, s1, s2 +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.fma.f16(bfloat %a, bfloat %b, bfloat %c) ret bfloat %r } define bfloat @test_fabs(bfloat %a) #0 { -; CHECK-CVT-LABEL: test_fabs: -; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: and w8, w8, #0x7fff -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 -; CHECK-CVT-NEXT: ret -; -; CHECK-SD-LABEL: test_fabs: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-SD-NEXT: fmov w8, s0 -; CHECK-SD-NEXT: and w8, w8, #0x7fff -; CHECK-SD-NEXT: fmov s0, w8 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 killed $s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_fabs: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fabs h0, h0 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_fabs: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 +; CHECK-NEXT: fmov w8, s0 +; CHECK-NEXT: and w8, w8, #0x7fff +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 +; CHECK-NEXT: ret %r = call bfloat @llvm.fabs.f16(bfloat %a) ret bfloat %r } @@ -2256,20 +1834,15 @@ define bfloat @test_minnum(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_minnum: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fminnm s0, s0, s1 -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_minnum: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fminnm h0, h0, h1 -; CHECK-GI-NEXT: ret +; 
CHECK-BF16-LABEL: test_minnum: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fminnm s0, s0, s1 +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.minnum.f16(bfloat %a, bfloat %b) ret bfloat %r } @@ -2292,20 +1865,15 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_maxnum: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fmaxnm s0, s0, s1 -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_maxnum: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmaxnm h0, h0, h1 -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_maxnum: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fmaxnm s0, s0, s1 +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.maxnum.f16(bfloat %a, bfloat %b) ret bfloat %r } @@ -2338,11 +1906,11 @@ define bfloat @test_copysign(bfloat %a, bfloat %b) #0 { ; ; CHECK-GI-LABEL: test_copysign: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: mvni v2.4h, #128, lsl #8 -; CHECK-GI-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-GI-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-GI-NEXT: bif v0.8b, v1.8b, v2.8b -; CHECK-GI-NEXT: // kill: def $h0 killed $h0 killed $d0 +; CHECK-GI-NEXT: mvni v2.8h, #128, lsl #8 +; CHECK-GI-NEXT: // kill: def $h0 killed $h0 def $q0 +; CHECK-GI-NEXT: // kill: def $h1 killed $h1 def $q1 +; 
CHECK-GI-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-GI-NEXT: // kill: def $h0 killed $h0 killed $q0 ; CHECK-GI-NEXT: ret %r = call bfloat @llvm.copysign.f16(bfloat %a, bfloat %b) ret bfloat %r @@ -2488,18 +2056,13 @@ define bfloat @test_floor(bfloat %a) #0 { ; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_floor: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: frintm s0, s0 -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_floor: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: frintm h0, h0 -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_floor: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: frintm s0, s0 +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.floor.f16(bfloat %a) ret bfloat %r } @@ -2520,18 +2083,13 @@ define bfloat @test_ceil(bfloat %a) #0 { ; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_ceil: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: frintp s0, s0 -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_ceil: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: frintp h0, h0 -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_ceil: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: frintp s0, s0 +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.ceil.f16(bfloat %a) ret bfloat %r } @@ -2552,18 +2110,13 @@ define bfloat @test_trunc(bfloat %a) #0 { ; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_trunc: -; CHECK-SD: // %bb.0: -; 
CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: frintz s0, s0 -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_trunc: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: frintz h0, h0 -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_trunc: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: frintz s0, s0 +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.trunc.f16(bfloat %a) ret bfloat %r } @@ -2584,18 +2137,13 @@ define bfloat @test_rint(bfloat %a) #0 { ; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_rint: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: frintx s0, s0 -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_rint: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: frintx h0, h0 -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_rint: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: frintx s0, s0 +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.rint.f16(bfloat %a) ret bfloat %r } @@ -2616,18 +2164,13 @@ define bfloat @test_nearbyint(bfloat %a) #0 { ; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_nearbyint: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: frinti s0, s0 -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_nearbyint: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: frinti h0, h0 -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_nearbyint: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: // kill: def $h0 
killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: frinti s0, s0 +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.nearbyint.f16(bfloat %a) ret bfloat %r } @@ -2648,18 +2191,13 @@ define bfloat @test_round(bfloat %a) #0 { ; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_round: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: frinta s0, s0 -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_round: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: frinta h0, h0 -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_round: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: frinta s0, s0 +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.round.f16(bfloat %a) ret bfloat %r } @@ -2680,18 +2218,13 @@ define bfloat @test_roundeven(bfloat %a) #0 { ; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 ; CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_roundeven: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: frintn s0, s0 -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_roundeven: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: frintn h0, h0 -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_roundeven: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: frintn s0, s0 +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.roundeven.f16(bfloat %a) ret bfloat %r } @@ -2724,26 +2257,20 @@ define bfloat @test_fmuladd(bfloat %a, bfloat %b, bfloat %c) #0 { ; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 killed $s0 ; 
CHECK-CVT-NEXT: ret ; -; CHECK-SD-LABEL: test_fmuladd: -; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: // kill: def $h2 killed $h2 def $d2 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fmul s0, s0, s1 -; CHECK-SD-NEXT: shll v1.4s, v2.4h, #16 -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-SD-NEXT: fadd s0, s0, s1 -; CHECK-SD-NEXT: bfcvt h0, s0 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_fmuladd: -; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: fmul h0, h0, h1 -; CHECK-GI-NEXT: fadd h0, h0, h2 -; CHECK-GI-NEXT: ret +; CHECK-BF16-LABEL: test_fmuladd: +; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fmul s0, s0, s1 +; CHECK-BF16-NEXT: shll v1.4s, v2.4h, #16 +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fadd s0, s0, s1 +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.fmuladd.f16(bfloat %a, bfloat %b, bfloat %c) ret bfloat %r } diff --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll index 2187717c4148a..b4c38e9f2df3b 100644 --- a/llvm/test/CodeGen/AArch64/fptrunc.ll +++ b/llvm/test/CodeGen/AArch64/fptrunc.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 ; RUN: llc -mtriple=aarch64 -global-isel=0 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD ; RUN: llc -mtriple=aarch64 -global-isel=1 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI +; RUN: llc -mtriple=aarch64 -global-isel=1 -mattr=+fullfp16,+bf16 
-verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI define float @fptrunc_f64_f32(double %a) { ; CHECK-LABEL: fptrunc_f64_f32: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll index 7691f4c30de04..78f33a174980d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-call.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -; RUN: llc -global-isel -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope %s +; RUN: llc -global-isel -global-isel-abort=2 -stop-after=irtranslator -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck -enable-var-scope %s declare hidden void @external_void_func_void() #0 @@ -5594,48 +5594,14 @@ define void @test_call_external_void_func_f16_inreg(half inreg %arg) #0 { define void @test_call_external_void_func_bf16_inreg(bfloat inreg %arg) #0 { ; CHECK-LABEL: name: test_call_external_void_func_bf16_inreg - ; CHECK: bb.1 (%ir-block.0): - ; CHECK-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $vgpr31, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $sgpr16 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr31 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr15 - ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr14 - ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr13 - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr12 - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:sgpr_64 = COPY $sgpr10_sgpr11 - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sgpr_64 = COPY $sgpr8_sgpr9 - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:sgpr_64 = COPY $sgpr6_sgpr7 - ; CHECK-NEXT: [[COPY8:%[0-9]+]]:sgpr_64 = COPY $sgpr4_sgpr5 - ; CHECK-NEXT: 
[[COPY9:%[0-9]+]]:_(s32) = COPY $sgpr16 - ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY9]](s32) - ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc - ; CHECK-NEXT: [[GV:%[0-9]+]]:_(p0) = G_GLOBAL_VALUE @external_void_func_bf16_inreg - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:_(p4) = COPY [[COPY8]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:_(p4) = COPY [[COPY7]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:_(p4) = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:_(s64) = COPY [[COPY5]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY3]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY2]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]] - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) - ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[TRUNC]](s16) - ; CHECK-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[ANYEXT]](s32) - ; CHECK-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT]](s32) - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 - ; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY19]](<4 x s32>) - ; CHECK-NEXT: $sgpr4_sgpr5 = COPY [[COPY10]](p4) - ; CHECK-NEXT: $sgpr6_sgpr7 = COPY [[COPY11]](p4) - ; CHECK-NEXT: $sgpr8_sgpr9 = COPY [[COPY12]](p4) - ; CHECK-NEXT: $sgpr10_sgpr11 = COPY [[COPY13]](s64) - ; CHECK-NEXT: $sgpr12 = COPY [[COPY14]](s32) - ; CHECK-NEXT: $sgpr13 = COPY [[COPY15]](s32) - ; CHECK-NEXT: $sgpr14 = COPY [[COPY16]](s32) - ; CHECK-NEXT: $sgpr15 = COPY [[COPY17]](s32) - ; CHECK-NEXT: $vgpr31 = COPY [[COPY18]](s32) - ; CHECK-NEXT: $sgpr30_sgpr31 = noconvergent G_SI_CALL [[GV]](p0), @external_void_func_bf16_inreg, csr_amdgpu, implicit $sgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr4_sgpr5, implicit $sgpr6_sgpr7, implicit $sgpr8_sgpr9, implicit $sgpr10_sgpr11, implicit $sgpr12, implicit $sgpr13, implicit $sgpr14, implicit $sgpr15, implicit $vgpr31 - ; CHECK-NEXT: 
ADJCALLSTACKDOWN 0, 0, implicit-def $scc - ; CHECK-NEXT: SI_RETURN + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $sgpr16 + ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1 (%ir-block.0): call void @external_void_func_bf16_inreg(bfloat inreg %arg) ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll index 73b891e43de99..ee89b28a0d2bb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.set.inactive.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s define amdgpu_kernel void @set_inactive(ptr addrspace(1) %out, i32 %in) { ; GCN-LABEL: set_inactive: @@ -284,17 +284,15 @@ define amdgpu_kernel void @set_inactive_v2bf16(ptr addrspace(1) %out, <2 x bfloa ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dword s6, s[4:5], 0x2c ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[2:3], -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80 -; GCN-NEXT: s_mov_b64 exec, s[2:3] +; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[4:5] ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: v_mov_b32_e32 v1, v0 -; GCN-NEXT: s_mov_b32 s3, 0xf000 ; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %tmp.0 = call <2 x bfloat> @llvm.amdgcn.set.inactive.v2bf16(<2 x bfloat> %in, <2 x bfloat> ) #0 @@ 
-359,21 +357,23 @@ define amdgpu_kernel void @set_inactive_v4bf16(ptr addrspace(1) %out, <4 x bfloa ; GCN-LABEL: set_inactive_v4bf16: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80 -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, s2 -; GCN-NEXT: v_mov_b32_e32 v4, s3 -; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v3, s[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v2, v0, v4, s[4:5] -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s3 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_mov_b32_e32 v0, 0x3f803f80 +; GCN-NEXT: v_cndmask_b32_e64 v1, v0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_or_saveexec_b64 s[0:1], -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[0:1] +; GCN-NEXT: s_mov_b64 exec, s[0:1] +; GCN-NEXT: v_mov_b32_e32 v2, v0 ; GCN-NEXT: v_mov_b32_e32 v3, v1 -; GCN-NEXT: v_mov_b32_e32 v4, v2 -; GCN-NEXT: s_mov_b32 s3, 0xf000 -; GCN-NEXT: buffer_store_dwordx2 v[3:4], off, s[0:3], 0 +; GCN-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GCN-NEXT: s_endpgm %tmp.0 = call <4 x bfloat> @llvm.amdgcn.set.inactive.v4bf16(<4 x bfloat> %in, <4 x bfloat> ) #0 %tmp = call <4 x bfloat> @llvm.amdgcn.strict.wwm.v4bf16(<4 x bfloat> %tmp.0) diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll index 9912ce3604a49..7eaa52d89b9b6 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-contents-legalization.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=SDAG %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 < %s | FileCheck -check-prefix=GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -check-prefix=GISEL %s ; Note: if you're adding tests here, also add them to ; lower-buffer-fat-pointers-contents-legalization.ll to verify the IR produced by @@ -629,7 +629,6 @@ define <4 x bfloat> @load_v4bf16(ptr addrspace(8) inreg %buf) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: buffer_load_dwordx2 v[0:1], off, s[16:19], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GISEL-NEXT: s_setpc_b64 s[30:31] %p = addrspacecast ptr addrspace(8) %buf to ptr addrspace(7) %ret = load <4 x bfloat>, ptr addrspace(7) %p @@ -647,10 +646,6 @@ define void @store_v4bf16(<4 x bfloat> %data, ptr addrspace(8) inreg %buf) { ; GISEL-LABEL: store_v4bf16: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GISEL-NEXT: v_mov_b32_sdwa v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 ; GISEL-NEXT: buffer_store_dwordx2 v[0:1], off, s[16:19], 0 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll index b128be2186df2..935ae48654b64 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-SDAG %s 
-; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s define float @fmul_select_f32_test1(float %x, i32 %bool.arg1, i32 %bool.arg2) { ; GFX7-LABEL: fmul_select_f32_test1: @@ -2541,114 +2541,72 @@ define half @fmul_select_f16_test11_sel_log2val_pos7_neg14(half %x, i32 %bool.ar } define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-SDAG-LABEL: fmul_select_bf16_test1: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc -; 
GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmul_select_bf16_test1: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-SDAG-LABEL: fmul_select_bf16_test1: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x3f80 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x4000 -; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: fmul_select_bf16_test1: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff -; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3 -; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: fmul_select_bf16_test1: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmul_select_bf16_test1: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: fmul_select_bf16_test1: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 -; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: fmul_select_bf16_test1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, 0x3f80 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x4000 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; 
GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: fmul_select_bf16_test1: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_bf16_test1: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v3, 0x4000 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; 
GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: fmul_select_bf16_test1: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 -; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: fmul_select_bf16_test1: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, bfloat 2.000000e+00, bfloat 1.000000e+00 %ldexp = fmul bfloat %x, %y @@ -2656,114 +2614,72 @@ define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) } define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-SDAG-LABEL: fmul_select_bf16_test2: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmul_select_bf16_test2: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-SDAG-LABEL: fmul_select_bf16_test2: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x3f80 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x3f00 -; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: fmul_select_bf16_test2: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff -; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3 -; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; 
GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: fmul_select_bf16_test2: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x3f00 -; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmul_select_bf16_test2: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: fmul_select_bf16_test2: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 -; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: fmul_select_bf16_test2: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, 0x3f80 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x3f00 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: 
v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: fmul_select_bf16_test2: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x3f00 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_bf16_test2: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v3, 0x3f00 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; 
GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: fmul_select_bf16_test2: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 -; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: fmul_select_bf16_test2: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, 0x3f00 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v3, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, bfloat 5.000000e-01, bfloat 1.000000e+00 %ldexp = fmul bfloat %x, %y @@ -2771,158 +2687,111 @@ define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) } define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x 
i32> %bool.arg1, <2 x i32> %bool.arg2) { -; GFX7-SDAG-LABEL: fmul_select_v2bf16_test3: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc -; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmul_select_v2bf16_test3: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc -; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v3 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-SDAG-LABEL: fmul_select_v2bf16_test3: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x3f80 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v6, 0x4000 -; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-SDAG-NEXT: 
v_mul_f32_e32 v1, v3, v1 -; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-SDAG-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-SDAG-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-SDAG-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-SDAG-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-SDAG-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-SDAG-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-SDAG-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: fmul_select_v2bf16_test3: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff -; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3 -; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: fmul_select_v2bf16_test3: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v5, 0x4000 -; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo -; GFX10-SDAG-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-SDAG-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX10-SDAG-NEXT: 
v_mul_f32_e32 v0, v0, v2 -; GFX10-SDAG-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-SDAG-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX10-SDAG-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX10-SDAG-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX10-SDAG-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo -; GFX10-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmul_select_v2bf16_test3: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_cndmask_b32_e64 v2, 1.0, 2.0, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-NEXT: v_cndmask_b32_e64 v3, 1.0, 2.0, vcc +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: fmul_select_v2bf16_test3: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 -; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: fmul_select_v2bf16_test3: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v5, 0x3f80 +; GFX9-NEXT: v_mov_b32_e32 v6, 0x4000 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX9-NEXT: 
v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: fmul_select_v2bf16_test3: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, 0x4000 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-SDAG-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX11-SDAG-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX11-SDAG-NEXT: v_bfe_u32 v2, v1, 16, 1 
-; GFX11-SDAG-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX11-SDAG-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo -; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_v2bf16_test3: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v5, 0x4000 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo +; GFX10-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: fmul_select_v2bf16_test3: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 -; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: fmul_select_v2bf16_test3: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v5, 0x4000 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 +; 
GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2 %y = select <2 x i1> %bool, <2 x bfloat> , <2 x bfloat> %ldexp = fmul <2 x bfloat> %x, %y @@ -2930,267 +2799,185 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a } define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.arg1, <2 x i32> %bool.arg2) { -; GFX7-SDAG-LABEL: fmul_select_v2bf16_test4: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, 1.0, v1 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc -; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v1, v1, v3 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmul_select_v2bf16_test4: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc -; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v2 -; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v1, v1, v3 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-SDAG-LABEL: fmul_select_v2bf16_test4: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v5, 0x3f80 -; GFX9-SDAG-NEXT: 
v_mov_b32_e32 v6, 0x3f00 -; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc -; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-SDAG-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-SDAG-NEXT: v_bfe_u32 v3, v1, 16, 1 -; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX9-SDAG-NEXT: v_add3_u32 v3, v3, v1, s4 -; GFX9-SDAG-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX9-SDAG-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-SDAG-NEXT: v_add3_u32 v2, v2, v0, s4 -; GFX9-SDAG-NEXT: v_or_b32_e32 v3, 0x400000, v0 -; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc -; GFX9-SDAG-NEXT: s_mov_b32 s4, 0x7060302 -; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v1, s4 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: fmul_select_v2bf16_test4: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff -; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3 -; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: fmul_select_v2bf16_test4: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v5, 0x3f00 -; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 
0x3f80, v5, vcc_lo -; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo -; GFX10-SDAG-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX10-SDAG-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2 -; GFX10-SDAG-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX10-SDAG-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX10-SDAG-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX10-SDAG-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX10-SDAG-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo -; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo -; GFX10-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmul_select_v2bf16_test4: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; GFX7-NEXT: v_cndmask_b32_e64 v2, 1.0, 0.5, vcc +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; GFX7-NEXT: v_cndmask_b32_e64 v3, 1.0, 0.5, vcc +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v1, v1, v3 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: fmul_select_v2bf16_test4: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 -; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; 
GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: fmul_select_v2bf16_test4: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v5, 0x3f80 +; GFX9-NEXT: v_mov_b32_e32 v6, 0x3f00 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX9-NEXT: v_bfe_u32 v3, v1, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX9-NEXT: v_add3_u32 v3, v3, v1, s4 +; GFX9-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX9-NEXT: v_bfe_u32 v2, v0, 16, 1 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-NEXT: v_add3_u32 v2, v2, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX9-NEXT: s_mov_b32 s4, 0x7060302 +; GFX9-NEXT: v_perm_b32 v0, v0, v1, s4 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: fmul_select_v2bf16_test4: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v5, 0x3f00 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v3, 16, v0 -; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: 
v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1 -; GFX11-SDAG-NEXT: v_or_b32_e32 v5, 0x400000, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_mul_f32_e32 v1, v3, v1 -; GFX11-SDAG-NEXT: v_bfe_u32 v3, v0, 16, 1 -; GFX11-SDAG-NEXT: v_bfe_u32 v2, v1, 16, 1 -; GFX11-SDAG-NEXT: v_or_b32_e32 v4, 0x400000, v1 -; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_add3_u32 v3, v3, v0, 0x7fff -; GFX11-SDAG-NEXT: v_add3_u32 v2, v2, v1, 0x7fff -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo -; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_v2bf16_test4: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v5, 0x3f00 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX10-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo +; GFX10-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX10-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v2 +; GFX10-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX10-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX10-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; GFX10-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX10-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, 
v4, vcc_lo +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: fmul_select_v2bf16_test4: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 -; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: fmul_select_v2bf16_test4: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v5, 0x3f00 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v3 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3f80, v5, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v4 +; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x3f80, v5, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mul_f32 v0, v0, v2 :: v_dual_lshlrev_b32 v1, 16, v1 +; GFX11-NEXT: v_or_b32_e32 v5, 0x400000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_f32_e32 v1, v3, v1 +; GFX11-NEXT: v_bfe_u32 v3, v0, 16, 1 +; GFX11-NEXT: v_bfe_u32 v2, v1, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v4, 0x400000, v1 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_add3_u32 v3, v3, v0, 0x7fff +; GFX11-NEXT: v_add3_u32 v2, v2, v1, 0x7fff +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_perm_b32 v0, v0, v1, 0x7060302 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq <2 x i32> %bool.arg1, %bool.arg2 %y = select <2 x i1> %bool, <2 x bfloat> , <2 x bfloat> %ldexp = fmul <2 x bfloat> %x, %y ret <2 x bfloat> %ldexp } - -define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-SDAG-LABEL: fmul_select_bf16_test5: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x41000000 -; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, v3, 2.0, vcc -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmul_select_bf16_test5: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc -; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-SDAG-LABEL: fmul_select_bf16_test5: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x4100 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x4000 -; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; 
GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: fmul_select_bf16_test5: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc -; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: fmul_select_bf16_test5: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] + +define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) { +; GFX7-LABEL: fmul_select_bf16_test5: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, 0x41000000 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cndmask_b32_e64 v1, v3, 2.0, vcc +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] 
; -; GFX10-GISEL-LABEL: fmul_select_bf16_test5: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc_lo -; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: fmul_select_bf16_test5: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, 0x4100 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x4000 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: fmul_select_bf16_test5: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, 
vcc_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_bf16_test5: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v3, 0x4000 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: fmul_select_bf16_test5: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: fmul_select_bf16_test5: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, 0x4000 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x4100, v3, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; 
GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, bfloat 2.000000e+00, bfloat 8.000000e+00 %ldexp = fmul bfloat %x, %y @@ -3198,116 +2985,74 @@ define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) } define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-SDAG-LABEL: fmul_select_bf16_test6: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x40400000 -; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0xc1000000 -; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmul_select_bf16_test6: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, 0xc100 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0x4040 -; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-SDAG-LABEL: fmul_select_bf16_test6: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x4040 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffc100 -; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-SDAG-NEXT: 
v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: fmul_select_bf16_test6: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0xc100 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0x4040 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: fmul_select_bf16_test6: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0xffffc100 -; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmul_select_bf16_test6: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, 0x40400000 +; GFX7-NEXT: v_mov_b32_e32 v4, 0xc1000000 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, 
v4, vcc +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: fmul_select_bf16_test6: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0x4040 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xc100, vcc_lo -; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: fmul_select_bf16_test6: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, 0x4040 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffc100 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: fmul_select_bf16_test6: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0xffffc100 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; 
GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_bf16_test6: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v3, 0xffffc100 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: fmul_select_bf16_test6: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0x4040 -; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0xc100, vcc_lo -; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: fmul_select_bf16_test6: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, 0xffffc100 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x4040, v3, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, bfloat -8.000000e+00, bfloat 3.000000e+00 %ldexp = fmul bfloat %x, %y @@ -3315,115 +3060,73 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) } define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-SDAG-LABEL: fmul_select_bf16_test7: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x41000000 -; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, -4.0, v3, vcc -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmul_select_bf16_test7: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, 0x4100 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v4, 0xc080 -; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-SDAG-LABEL: fmul_select_bf16_test7: -; GFX9-SDAG: ; 
%bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xffffc080 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x4100 -; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: fmul_select_bf16_test7: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x4100 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v4, 0xc080 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc -; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: fmul_select_bf16_test7: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x4100 -; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmul_select_bf16_test7: +; 
GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, 0x41000000 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v1, -4.0, v3, vcc +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: fmul_select_bf16_test7: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0xc080 -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x4100, vcc_lo -; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: fmul_select_bf16_test7: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffffc080 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x4100 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: fmul_select_bf16_test7: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x4100 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_bf16_test7: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v3, 0x4100 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: fmul_select_bf16_test7: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0xc080 -; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0x4100, vcc_lo -; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: fmul_select_bf16_test7: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, 0x4100 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xffffc080, v3, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, bfloat 8.000000e+00, bfloat -4.000000e+00 %ldexp = fmul bfloat %x, %y @@ -3431,111 +3134,73 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) } define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-SDAG-LABEL: fmul_select_bf16_test8: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-SDAG-NEXT: v_lshlrev_b32_e32 v1, 31, v1 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmul_select_bf16_test8: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, 0x8000 -; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1 -; 
GFX7-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-SDAG-LABEL: fmul_select_bf16_test8: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, 15 -; GFX9-SDAG-NEXT: v_lshlrev_b16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: fmul_select_bf16_test8: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x8000 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc -; GFX9-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: fmul_select_bf16_test8: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX10-SDAG-NEXT: v_lshlrev_b16 v1, 15, v1 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, 
v1, v2, vcc_lo -; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-GISEL-LABEL: fmul_select_bf16_test8: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo -; GFX10-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmul_select_bf16_test8: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: fmul_select_bf16_test8: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshlrev_b16 v1, 15, v1 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; 
GFX9-LABEL: fmul_select_bf16_test8: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 15 +; GFX9-NEXT: v_lshlrev_b16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: fmul_select_bf16_test8: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 0x8000, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_bf16_test8: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-NEXT: v_lshlrev_b16 v1, 15, v1 +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-LABEL: fmul_select_bf16_test8: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; 
GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshlrev_b16 v1, 15, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, bfloat -0.000000e+00, bfloat 0.000000e+00 %ldexp = fmul bfloat %x, %y @@ -3543,121 +3208,74 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) } define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-SDAG-LABEL: fmul_select_bf16_test9: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0xc2000000 -; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0xc1800000 -; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmul_select_bf16_test9: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; 
GFX7-GISEL-NEXT: v_add_i32_e32 v1, vcc, 5, v1 -; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-SDAG-LABEL: fmul_select_bf16_test9: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xffffc200 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffc180 -; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: fmul_select_bf16_test9: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc -; GFX9-GISEL-NEXT: v_add_u32_e32 v1, 5, v1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, 0xffff8000 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x7fff -; GFX9-GISEL-NEXT: v_med3_i32 v1, v1, v2, v3 -; GFX9-GISEL-NEXT: v_ldexp_f16_e64 v0, -v0, v1 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: fmul_select_bf16_test9: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0xffffc180 -; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 
16, 1 -; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmul_select_bf16_test9: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, 0xc2000000 +; GFX7-NEXT: v_mov_b32_e32 v4, 0xc1800000 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: fmul_select_bf16_test9: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX10-GISEL-NEXT: v_add_nc_u32_e32 v1, 5, v1 -; GFX10-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 -; GFX10-GISEL-NEXT: v_ldexp_f16_e64 v0, -v0, v1 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: fmul_select_bf16_test9: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffffc200 +; GFX9-NEXT: v_mov_b32_e32 v4, 0xffffc180 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; 
GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: fmul_select_bf16_test9: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0xffffc180 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_bf16_test9: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v3, 0xffffc180 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: fmul_select_bf16_test9: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 
vcc_lo, v1, v2 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0x7fff -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v1, 5, v1 -; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v1, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_ldexp_f16_e64 v0, -v0, v1 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: fmul_select_bf16_test9: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, 0xffffc180 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xffffc200, v3, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, bfloat -1.600000e+01, bfloat -3.200000e+01 %ldexp = fmul bfloat %x, %y @@ -3665,111 +3283,74 @@ define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) } define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-SDAG-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 
-; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0xdb800000 -; GFX7-SDAG-NEXT: v_bfrev_b32_e32 v4, 7 -; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; GFX7-GISEL-NEXT: v_mov_b32_e32 v3, 0x41 -; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, 56, v3, vcc -; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-SDAG-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0xffffdb80 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0xffffe000 -; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_mov_b32_e32 v3, 0x41 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-GISEL-NEXT: 
v_cndmask_b32_e32 v1, 56, v3, vcc -; GFX9-GISEL-NEXT: v_ldexp_f16_e64 v0, -v0, v1 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0xffffe000 -; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_mov_b32_e32 v3, 0xdb800000 +; GFX7-NEXT: v_bfrev_b32_e32 v4, 7 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 56, 0x41, vcc_lo -; GFX10-GISEL-NEXT: v_ldexp_f16_e64 v0, -v0, v1 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, 0xffffdb80 +; 
GFX9-NEXT: v_mov_b32_e32 v4, 0xffffe000 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0xffffe000 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v3, 0xffffe000 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: 
v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 56, 0x41, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_ldexp_f16_e64 v0, -v0, v1 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, 0xffffe000 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xffffdb80, v3, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, bfloat 0xRE000, bfloat 0xRDB80 %ldexp = fmul bfloat %x, %y @@ 
-3777,111 +3358,74 @@ define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %b } define bfloat @fmul_select_bf16_test11_sel_log2val_neg22_pos25(bfloat %x, i32 %bool.arg1, i32 %bool.arg2) { -; GFX7-SDAG-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: -; GFX7-SDAG: ; %bb.0: -; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-SDAG-NEXT: v_bfrev_b32_e32 v3, 50 -; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0x34800000 -; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX7-GISEL-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: -; GFX7-GISEL: ; %bb.0: -; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-GISEL-NEXT: v_not_b32_e32 v3, 21 -; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX7-GISEL-NEXT: v_cndmask_b32_e32 v1, 25, v3, vcc -; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1 -; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-SDAG-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_mov_b32_e32 v3, 0x4c00 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v4, 0x3480 -; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX9-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX9-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX9-SDAG-NEXT: s_movk_i32 s4, 0x7fff -; GFX9-SDAG-NEXT: v_add3_u32 v1, v1, v0, s4 -; GFX9-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX9-SDAG-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX9-SDAG-NEXT: 
v_cndmask_b32_e32 v0, v1, v2, vcc -; GFX9-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_not_b32_e32 v3, 21 -; GFX9-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 -; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 25, v3, vcc -; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: -; GFX10-SDAG: ; %bb.0: -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, 0x3480 -; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo -; GFX10-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX10-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX10-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX10-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX10-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX10-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX10-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX10-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX7-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; GFX7-NEXT: v_bfrev_b32_e32 v3, 50 +; GFX7-NEXT: v_mov_b32_e32 v4, 0x34800000 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: -; GFX10-GISEL: ; %bb.0: -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 25, 0xffffffea, vcc_lo -; GFX10-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v3, 0x4c00 +; GFX9-NEXT: v_mov_b32_e32 v4, 0x3480 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX9-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX9-NEXT: s_movk_i32 s4, 0x7fff +; GFX9-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX9-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX9-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 0x3480 :: v_dual_lshlrev_b32 v0, 16, v0 -; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1 -; GFX11-SDAG-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX11-SDAG-NEXT: v_or_b32_e32 v2, 0x400000, v0 -; GFX11-SDAG-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_add3_u32 v1, v1, v0, 0x7fff -; GFX11-SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; 
GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v3, 0x3480 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX10-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX10-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX10-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX10-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX10-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX10-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 25, 0xffffffea, vcc_lo -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v0, v1 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_mov_b32 v3, 0x3480 :: v_dual_lshlrev_b32 v0, 16, v0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x4c00, v3, vcc_lo +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_mul_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_bfe_u32 v1, v0, 16, 1 +; GFX11-NEXT: v_or_b32_e32 v2, 0x400000, v0 +; GFX11-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_1) +; GFX11-NEXT: v_add3_u32 v1, v1, v0, 0x7fff +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-NEXT: s_setpc_b64 s[30:31] %bool = icmp eq i32 %bool.arg1, %bool.arg2 %y = select i1 %bool, bfloat 0xR3480, bfloat 0xR4C00 %ldexp = fmul bfloat %x, %y diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll index a0578756433ff..62f16fe2760ef 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -global-isel=0 | FileCheck %s -check-prefix=GFX12-SDAG -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -global-isel=1 | FileCheck %s -check-prefix=GFX12-GISEL +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -global-isel=1 -global-isel-abort=2 | FileCheck %s -check-prefix=GFX12-GISEL declare <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half>, <4 x i32>, i32, i32, i32, i32 immarg) declare <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat>, <4 x i32>, i32, i32, i32, i32 immarg) @@ -15,7 +15,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret_offset(<2 x half> %val, ; ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_noret_offset: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s4 offset:92 +; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 ; GFX12-GISEL-NEXT: s_endpgm %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0) ret void @@ -29,7 +29,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4 x i ; ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: 
buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen +; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen ; GFX12-GISEL-NEXT: s_endpgm %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -44,7 +44,7 @@ define amdgpu_ps <2 x half> @raw_buffer_atomic_add_v2f16_ret_offset(<2 x half> % ; ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_ret_offset: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_RETURN ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0) @@ -60,7 +60,7 @@ define amdgpu_ps <2 x half> @raw_buffer_atomic_add_v2f16_ret(<2 x half> %val, <4 ; ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_ret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -76,7 +76,7 @@ define amdgpu_ps float @struct_buffer_atomic_add_v2f16_ret(<2 x half> %val, <4 x ; ; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2f16_ret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN +; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: ; return to shader part epilog %orig = call <2 x half> 
@llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -92,7 +92,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4 ; ; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2f16_noret: ; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen +; GFX12-GISEL-NEXT: buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen ; GFX12-GISEL-NEXT: s_endpgm %orig = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx950.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx950.ll index d8ea0ddf77b7a..5d9944add13a3 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx950.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 | FileCheck %s -check-prefix=GFX950-SDAG -; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 | FileCheck %s -check-prefix=GFX950-GISEL +; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 | FileCheck %s -check-prefix=GFX950-GISEL declare <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat>, <4 x i32>, i32, i32, i32, i32 immarg) declare <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32, i32, i32) @@ -20,9 +20,9 @@ define amdgpu_ps float @struct_buffer_atomic_add_v2bf16_ret(<2 x bfloat> %val, < ; ; GFX950-GISEL-LABEL: struct_buffer_atomic_add_v2bf16_ret: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[4:5], s[0:3], s4 idxen offen sc0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v2 +; 
GFX950-GISEL-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[2:3], s[0:3], s4 idxen offen sc0 ; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], 0 ; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) ; GFX950-GISEL-NEXT: flat_store_dword v[2:3], v0 @@ -44,9 +44,9 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2bf16_noret(<2 x bfloat> %val, ; ; GFX950-GISEL-LABEL: struct_buffer_atomic_add_v2bf16_noret: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 -; GFX950-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[4:5], s[0:3], s4 idxen offen +; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, v1 +; GFX950-GISEL-NEXT: buffer_atomic_pk_add_bf16 v0, v[2:3], s[0:3], s4 idxen offen ; GFX950-GISEL-NEXT: s_endpgm %orig = call <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll index acd48a64dea1f..befe0d405307b 100644 --- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 
-stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10 %s @@ -594,35 +594,35 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_half(half inreg %a, half %b) { define amdgpu_cs_chain void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, bfloat %b) { ; GISEL-GFX11-LABEL: name: amdgpu_cs_chain_cc_bfloat - ; GISEL-GFX11: bb.1 (%ir-block.0): + ; GISEL-GFX11: bb.0 (%ir-block.0): ; GISEL-GFX11-NEXT: liveins: $sgpr0, $vgpr8 ; GISEL-GFX11-NEXT: {{ $}} - ; GISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; GISEL-GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, 
implicit $sgpr32 - ; GISEL-GFX11-NEXT: $vgpr0 = COPY [[COPY]] - ; GISEL-GFX11-NEXT: $vgpr1 = COPY [[COPY1]] - ; GISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @use, target-flags(amdgpu-gotprel32-hi) @use, implicit-def $scc - ; GISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) - ; GISEL-GFX11-NEXT: $sgpr30_sgpr31 = noconvergent SI_CALL [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1 - ; GISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32 + ; GISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GISEL-GFX11-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + ; GISEL-GFX11-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @use, target-flags(amdgpu-gotprel32-hi) @use, implicit-def dead $scc + ; GISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) + ; GISEL-GFX11-NEXT: $vgpr0 = COPY [[COPY1]] + ; GISEL-GFX11-NEXT: $vgpr1 = COPY [[COPY]] + ; GISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1 + ; GISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; GISEL-GFX11-NEXT: S_ENDPGM 0 ; ; GISEL-GFX10-LABEL: name: amdgpu_cs_chain_cc_bfloat - ; GISEL-GFX10: bb.1 (%ir-block.0): + ; GISEL-GFX10: bb.0 (%ir-block.0): ; GISEL-GFX10-NEXT: liveins: $sgpr0, $vgpr8 ; GISEL-GFX10-NEXT: {{ $}} - ; GISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GISEL-GFX10-NEXT: 
[[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; GISEL-GFX10-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32 - ; GISEL-GFX10-NEXT: $vgpr0 = COPY [[COPY]] - ; GISEL-GFX10-NEXT: $vgpr1 = COPY [[COPY1]] + ; GISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 + ; GISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; GISEL-GFX10-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 + ; GISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @use, target-flags(amdgpu-gotprel32-hi) @use, implicit-def dead $scc + ; GISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) ; GISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_128 = COPY $sgpr48_sgpr49_sgpr50_sgpr51 ; GISEL-GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]] - ; GISEL-GFX10-NEXT: [[SI_PC_ADD_REL_OFFSET:%[0-9]+]]:sreg_64 = SI_PC_ADD_REL_OFFSET target-flags(amdgpu-gotprel32-lo) @use, target-flags(amdgpu-gotprel32-hi) @use, implicit-def $scc - ; GISEL-GFX10-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (p0) from got, addrspace 4) - ; GISEL-GFX10-NEXT: $sgpr30_sgpr31 = noconvergent SI_CALL [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $scc, implicit-def $sgpr32, implicit $sgpr32 + ; GISEL-GFX10-NEXT: $vgpr0 = COPY [[COPY1]] + ; GISEL-GFX10-NEXT: $vgpr1 = COPY [[COPY]] + ; GISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1 + ; GISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit 
$sgpr32 ; GISEL-GFX10-NEXT: S_ENDPGM 0 ; ; DAGISEL-GFX11-LABEL: name: amdgpu_cs_chain_cc_bfloat diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll index 2e2a1094ba99a..ef91f36d60373 100644 --- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll +++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX11 %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=GISEL-GFX10 %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck 
-check-prefix=GISEL-GFX10 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF32 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX11-WF64 %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize32 -stop-after=finalize-isel -verify-machineinstrs < %s | FileCheck -check-prefix=DAGISEL-GFX10-WF32 %s @@ -873,32 +873,6 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_half(half inre } define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, bfloat %b) { - ; GISEL-GFX11-LABEL: name: amdgpu_cs_chain_cc_bfloat - ; GISEL-GFX11: bb.1 (%ir-block.0): - ; GISEL-GFX11-NEXT: liveins: $sgpr0, $vgpr8 - ; GISEL-GFX11-NEXT: {{ $}} - ; GISEL-GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; GISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GISEL-GFX11-NEXT: [[V_ADD_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_fake16_e64 0, [[COPY2]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; GISEL-GFX11-NEXT: FLAT_STORE_SHORT [[COPY3]], [[V_ADD_F16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) - ; GISEL-GFX11-NEXT: S_ENDPGM 0 - ; - ; GISEL-GFX10-LABEL: name: amdgpu_cs_chain_cc_bfloat - ; GISEL-GFX10: bb.1 (%ir-block.0): - ; GISEL-GFX10-NEXT: liveins: $sgpr0, $vgpr8 - ; GISEL-GFX10-NEXT: {{ $}} - ; GISEL-GFX10-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GISEL-GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8 - ; GISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF - ; GISEL-GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 
= COPY [[COPY]] - ; GISEL-GFX10-NEXT: [[V_ADD_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_ADD_F16_e64 0, [[COPY2]], 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; GISEL-GFX10-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; GISEL-GFX10-NEXT: FLAT_STORE_SHORT [[COPY3]], [[V_ADD_F16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) - ; GISEL-GFX10-NEXT: S_ENDPGM 0 - ; ; DAGISEL-GFX11-WF32-LABEL: name: amdgpu_cs_chain_cc_bfloat ; DAGISEL-GFX11-WF32: bb.0 (%ir-block.0): ; DAGISEL-GFX11-WF32-NEXT: liveins: $sgpr0, $vgpr8 @@ -996,9 +970,9 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg ; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8 ; GISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]] - ; GISEL-GFX11-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY2]], 0, [[COPY1]], 0, 0, implicit $exec + ; GISEL-GFX11-NEXT: [[V_ADD_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY2]], 0, [[COPY1]], 0, 0, implicit $exec ; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; GISEL-GFX11-NEXT: FLAT_STORE_SHORT [[COPY3]], [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) + ; GISEL-GFX11-NEXT: FLAT_STORE_SHORT [[COPY3]], [[V_ADD_NC_U16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) ; GISEL-GFX11-NEXT: S_ENDPGM 0 ; ; GISEL-GFX10-LABEL: name: amdgpu_cs_chain_preserve_cc_i16 @@ -1020,10 +994,10 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg ; DAGISEL-GFX11-WF32-NEXT: {{ $}} ; DAGISEL-GFX11-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 ; DAGISEL-GFX11-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec + ; 
DAGISEL-GFX11-WF32-NEXT: [[V_ADD_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) + ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) ; DAGISEL-GFX11-WF32-NEXT: S_ENDPGM 0 ; ; DAGISEL-GFX11-WF64-LABEL: name: amdgpu_cs_chain_preserve_cc_i16 @@ -1032,10 +1006,10 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg ; DAGISEL-GFX11-WF64-NEXT: {{ $}} ; DAGISEL-GFX11-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8 ; DAGISEL-GFX11-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec + ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF ; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]] - ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) + ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_fake16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`) ; DAGISEL-GFX11-WF64-NEXT: S_ENDPGM 0 ; ; DAGISEL-GFX10-WF32-LABEL: name: amdgpu_cs_chain_preserve_cc_i16 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll index 046a72b9307d0..a0ba97d3b639c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.gfx950.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX950-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX950-GISEL %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX950-GISEL %s declare <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.bf6.f32(<16 x float> %src0, <16 x float> %src1, float %scale) declare <6 x i32> @llvm.amdgcn.cvt.scalef32.2xpk16.fp6.f32(<16 x float> %src0, <16 x float> %src1, float %scale) @@ -983,85 +983,35 @@ define <32 x half> @test_cvt_scalef32_pk32_f16_fp6_sl(<6 x i32> inreg %src) { } define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_vv(<6 x i32> %src, float %scale) { -; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv: -; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0 -; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], v22 -; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv: -; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[16:31], v[0:5], v6 -; GFX950-GISEL-NEXT: 
v_lshrrev_b32_e32 v1, 16, v16 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v17 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v18 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v19 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v20 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v21 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v22 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v23 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, v16 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, v17 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v18 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v19 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v10, v21 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v12, v22 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v14, v23 -; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_cvt_scalef32_pk32_bf16_fp6_vv: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, v6 +; GCN-NEXT: v_mov_b32_e32 v21, v5 +; GCN-NEXT: v_mov_b32_e32 v20, v4 +; GCN-NEXT: v_mov_b32_e32 v19, v3 +; GCN-NEXT: v_mov_b32_e32 v18, v2 +; GCN-NEXT: v_mov_b32_e32 v17, v1 +; GCN-NEXT: v_mov_b32_e32 v16, v0 +; GCN-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], v22 +; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.fp6(<6 x i32> %src, float %scale) ret <32 x bfloat> %ret } define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_fp6_sl(<6 x i32> inreg %src) { -; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl: -; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17 -; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000 -; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], s0 -; 
GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl: -; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-GISEL-NEXT: s_mov_b32 s4, s16 -; GFX950-GISEL-NEXT: s_mov_b32 s5, s17 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 0x42c80000 -; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[16:31], v[0:5], v6 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v17 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v18 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v19 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v20 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v21 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v22 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v23 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, v16 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, v17 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v18 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v19 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v10, v21 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v12, v22 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v14, v23 -; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_cvt_scalef32_pk32_bf16_fp6_sl: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v16, s0 +; GCN-NEXT: v_mov_b32_e32 v17, s1 +; GCN-NEXT: v_mov_b32_e32 v18, s2 +; GCN-NEXT: v_mov_b32_e32 v19, s3 +; GCN-NEXT: v_mov_b32_e32 v20, s16 +; GCN-NEXT: v_mov_b32_e32 v21, s17 +; GCN-NEXT: s_mov_b32 s0, 0x42c80000 +; GCN-NEXT: v_cvt_scalef32_pk32_bf16_fp6 v[0:15], v[16:21], s0 +; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.fp6(<6 x i32> %src, float 100.0) ret <32 x bfloat> %ret } @@ -1126,85 +1076,35 @@ define <32 x half> 
@test_cvt_scalef32_pk32_f16_bf6_sl(<6 x i32> inreg %src) { } define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_vv(<6 x i32> %src, float %scale) { -; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv: -; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b32_e32 v22, v6 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, v5 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, v4 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, v3 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, v2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, v1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, v0 -; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], v22 -; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv: -; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[16:31], v[0:5], v6 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v17 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v18 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v19 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v20 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v21 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v22 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v23 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, v16 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, v17 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v18 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v19 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v10, v21 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v12, v22 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v14, v23 -; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_cvt_scalef32_pk32_bf16_bf6_vv: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v22, v6 +; GCN-NEXT: v_mov_b32_e32 v21, v5 +; GCN-NEXT: v_mov_b32_e32 v20, v4 +; GCN-NEXT: 
v_mov_b32_e32 v19, v3 +; GCN-NEXT: v_mov_b32_e32 v18, v2 +; GCN-NEXT: v_mov_b32_e32 v17, v1 +; GCN-NEXT: v_mov_b32_e32 v16, v0 +; GCN-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], v22 +; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.bf6(<6 x i32> %src, float %scale) ret <32 x bfloat> %ret } define <32 x bfloat> @test_cvt_scalef32_pk32_bf16_bf6_sl(<6 x i32> inreg %src) { -; GFX950-SDAG-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl: -; GFX950-SDAG: ; %bb.0: -; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-SDAG-NEXT: v_mov_b32_e32 v16, s0 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v17, s1 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v18, s2 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v19, s3 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v20, s16 -; GFX950-SDAG-NEXT: v_mov_b32_e32 v21, s17 -; GFX950-SDAG-NEXT: s_mov_b32 s0, 0x42c80000 -; GFX950-SDAG-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], s0 -; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX950-GISEL-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl: -; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX950-GISEL-NEXT: s_mov_b32 s4, s16 -; GFX950-GISEL-NEXT: s_mov_b32 s5, s17 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, 0x42c80000 -; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[16:31], v[0:5], v6 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v16 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v17 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v18 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v19 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v9, 16, v20 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v11, 16, v21 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v22 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v23 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, v16 -; GFX950-GISEL-NEXT: 
v_mov_b32_e32 v2, v17 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v18 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v19 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v20 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v10, v21 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v12, v22 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v14, v23 -; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_cvt_scalef32_pk32_bf16_bf6_sl: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v16, s0 +; GCN-NEXT: v_mov_b32_e32 v17, s1 +; GCN-NEXT: v_mov_b32_e32 v18, s2 +; GCN-NEXT: v_mov_b32_e32 v19, s3 +; GCN-NEXT: v_mov_b32_e32 v20, s16 +; GCN-NEXT: v_mov_b32_e32 v21, s17 +; GCN-NEXT: s_mov_b32 s0, 0x42c80000 +; GCN-NEXT: v_cvt_scalef32_pk32_bf16_bf6 v[0:15], v[16:21], s0 +; GCN-NEXT: s_setpc_b64 s[30:31] %ret = tail call <32 x bfloat> @llvm.amdgcn.cvt.scalef32.pk32.bf16.bf6(<6 x i32> %src, float 100.0) ret <32 x bfloat> %ret } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll index f9fd7e253b124..517c87193598d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-GISEL %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-GISEL %s declare <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> %src, float %scale) declare <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.f16(<32 x half> %src, float %scale) @@ -19,44 +19,11 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_vv(<32 x bfloat> 
%src, float ; ; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_bf16_vv: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, v17 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v25, v18 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v30, 16, v11 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v34, 16, v15 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v2, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v3, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v4, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v5, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v6, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v7, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v8, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v9, v28 
dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v10, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v11, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v12, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v13, v32 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v14, v33 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v15, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, v17 ; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[0:15], v16 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[24:25], v[18:21], off ; GFX950-GISEL-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[24:25], v[18:21], off ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> %src, float %scale) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 @@ -90,82 +57,26 @@ define amdgpu_ps void @test_scalef32_pk32_bf6_bf16_sl(<32 x bfloat> inreg %src, ; ; GFX950-GISEL-LABEL: test_scalef32_pk32_bf6_bf16_sl: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: s_lshr_b32 s16, s0, 16 -; GFX950-GISEL-NEXT: s_lshr_b32 s17, s1, 16 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s16, 16 -; GFX950-GISEL-NEXT: s_and_b32 s0, s0, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s18, s2, 16 -; GFX950-GISEL-NEXT: s_or_b32 s0, s16, s0 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s17, 16 -; GFX950-GISEL-NEXT: s_and_b32 s1, s1, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s19, s3, 16 -; GFX950-GISEL-NEXT: s_or_b32 s1, s16, s1 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s18, 16 -; GFX950-GISEL-NEXT: s_and_b32 s2, s2, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s20, s4, 16 -; 
GFX950-GISEL-NEXT: s_or_b32 s2, s16, s2 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s19, 16 -; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s21, s5, 16 -; GFX950-GISEL-NEXT: s_or_b32 s3, s16, s3 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s20, 16 -; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s22, s6, 16 -; GFX950-GISEL-NEXT: s_or_b32 s4, s16, s4 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s21, 16 -; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s23, s7, 16 -; GFX950-GISEL-NEXT: s_or_b32 s5, s16, s5 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s22, 16 -; GFX950-GISEL-NEXT: s_and_b32 s6, s6, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s24, s8, 16 -; GFX950-GISEL-NEXT: s_or_b32 s6, s16, s6 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s23, 16 -; GFX950-GISEL-NEXT: s_and_b32 s7, s7, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s25, s9, 16 -; GFX950-GISEL-NEXT: s_or_b32 s7, s16, s7 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s24, 16 -; GFX950-GISEL-NEXT: s_and_b32 s8, s8, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s26, s10, 16 -; GFX950-GISEL-NEXT: s_or_b32 s8, s16, s8 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s25, 16 -; GFX950-GISEL-NEXT: s_and_b32 s9, s9, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s27, s11, 16 -; GFX950-GISEL-NEXT: s_or_b32 s9, s16, s9 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s26, 16 -; GFX950-GISEL-NEXT: s_and_b32 s10, s10, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s28, s12, 16 -; GFX950-GISEL-NEXT: s_or_b32 s10, s16, s10 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s27, 16 -; GFX950-GISEL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s29, s13, 16 -; GFX950-GISEL-NEXT: s_or_b32 s11, s16, s11 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s28, 16 -; GFX950-GISEL-NEXT: s_and_b32 s12, s12, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s30, s14, 16 -; GFX950-GISEL-NEXT: s_or_b32 s12, s16, s12 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s29, 16 -; GFX950-GISEL-NEXT: s_and_b32 s13, s13, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s31, 
s15, 16 -; GFX950-GISEL-NEXT: s_or_b32 s13, s16, s13 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s30, 16 -; GFX950-GISEL-NEXT: s_and_b32 s14, s14, 0xffff -; GFX950-GISEL-NEXT: s_or_b32 s14, s16, s14 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s31, 16 -; GFX950-GISEL-NEXT: s_and_b32 s15, s15, 0xffff -; GFX950-GISEL-NEXT: s_or_b32 s15, s16, s15 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, 0x42c80000 -; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[2:17], v24 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, s2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, s3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, s5 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, s6 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, s7 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v12, s10 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v13, s11 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v14, s12 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v15, s13 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s14 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, s15 +; GFX950-GISEL-NEXT: s_mov_b32 s0, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_bf6_bf16 v[18:23], v[2:17], s0 ; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> 
@llvm.amdgcn.cvt.scalef32.pk32.bf6.bf16(<32 x bfloat> %src, float 100.0) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 @@ -252,44 +163,11 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_vv(<32 x bfloat> %src, float ; ; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_bf16_vv: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, v17 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v25, v18 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v5 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v23, 16, v6 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v26, 16, v7 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v27, 16, v8 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v28, 16, v9 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v29, 16, v10 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v30, 16, v11 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v31, 16, v12 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v32, 16, v13 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v33, 16, v14 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v34, 16, v15 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v2, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v3, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v4, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v5, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v6, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v7, 
v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v8, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v9, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v10, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v11, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v12, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v13, v32 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v14, v33 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v15, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: s_nop 0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, v17 ; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[0:15], v16 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[24:25], v[18:21], off ; GFX950-GISEL-NEXT: global_store_dwordx2 v[24:25], v[22:23], off offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[24:25], v[18:21], off ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16(<32 x bfloat> %src, float %scale) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 @@ -323,82 +201,26 @@ define amdgpu_ps void @test_scalef32_pk32_fp6_bf16_sl(<32 x bfloat> inreg %src, ; ; GFX950-GISEL-LABEL: test_scalef32_pk32_fp6_bf16_sl: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: s_lshr_b32 s16, s0, 16 -; GFX950-GISEL-NEXT: s_lshr_b32 s17, s1, 16 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s16, 16 -; GFX950-GISEL-NEXT: s_and_b32 s0, s0, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s18, s2, 16 -; GFX950-GISEL-NEXT: s_or_b32 s0, s16, s0 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s17, 16 -; GFX950-GISEL-NEXT: s_and_b32 s1, s1, 0xffff -; 
GFX950-GISEL-NEXT: s_lshr_b32 s19, s3, 16 -; GFX950-GISEL-NEXT: s_or_b32 s1, s16, s1 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s18, 16 -; GFX950-GISEL-NEXT: s_and_b32 s2, s2, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s20, s4, 16 -; GFX950-GISEL-NEXT: s_or_b32 s2, s16, s2 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s19, 16 -; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s21, s5, 16 -; GFX950-GISEL-NEXT: s_or_b32 s3, s16, s3 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s20, 16 -; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s22, s6, 16 -; GFX950-GISEL-NEXT: s_or_b32 s4, s16, s4 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s21, 16 -; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s23, s7, 16 -; GFX950-GISEL-NEXT: s_or_b32 s5, s16, s5 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s22, 16 -; GFX950-GISEL-NEXT: s_and_b32 s6, s6, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s24, s8, 16 -; GFX950-GISEL-NEXT: s_or_b32 s6, s16, s6 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s23, 16 -; GFX950-GISEL-NEXT: s_and_b32 s7, s7, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s25, s9, 16 -; GFX950-GISEL-NEXT: s_or_b32 s7, s16, s7 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s24, 16 -; GFX950-GISEL-NEXT: s_and_b32 s8, s8, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s26, s10, 16 -; GFX950-GISEL-NEXT: s_or_b32 s8, s16, s8 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s25, 16 -; GFX950-GISEL-NEXT: s_and_b32 s9, s9, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s27, s11, 16 -; GFX950-GISEL-NEXT: s_or_b32 s9, s16, s9 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s26, 16 -; GFX950-GISEL-NEXT: s_and_b32 s10, s10, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s28, s12, 16 -; GFX950-GISEL-NEXT: s_or_b32 s10, s16, s10 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s27, 16 -; GFX950-GISEL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s29, s13, 16 -; GFX950-GISEL-NEXT: s_or_b32 s11, s16, s11 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s28, 16 -; GFX950-GISEL-NEXT: s_and_b32 s12, s12, 
0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s30, s14, 16 -; GFX950-GISEL-NEXT: s_or_b32 s12, s16, s12 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s29, 16 -; GFX950-GISEL-NEXT: s_and_b32 s13, s13, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s31, s15, 16 -; GFX950-GISEL-NEXT: s_or_b32 s13, s16, s13 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s30, 16 -; GFX950-GISEL-NEXT: s_and_b32 s14, s14, 0xffff -; GFX950-GISEL-NEXT: s_or_b32 s14, s16, s14 -; GFX950-GISEL-NEXT: s_lshl_b32 s16, s31, 16 -; GFX950-GISEL-NEXT: s_and_b32 s15, s15, 0xffff -; GFX950-GISEL-NEXT: s_or_b32 s15, s16, s15 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[8:9], s[6:7] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] -; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, 0x42c80000 -; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[2:17], v24 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, s2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, s3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, s5 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, s6 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, s7 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v12, s10 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v13, s11 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v14, s12 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v15, s13 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s14 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, s15 +; GFX950-GISEL-NEXT: s_mov_b32 s0, 0x42c80000 +; GFX950-GISEL-NEXT: v_cvt_scalef32_pk32_fp6_bf16 v[18:23], v[2:17], s0 
; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.pk32.fp6.bf16(<32 x bfloat> %src, float 100.0) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.ll index e1bf9f0daa1ef..d3851b1a084d6 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX950 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX950 %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX950 %s declare i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.bf16(i32 %old, bfloat %src, i32 %seed, float %scale, i32 %dst_sel) declare i32 @llvm.amdgcn.cvt.scalef32.sr.bf8.f16(i32 %old, half %src, i32 %seed, float %scale, i32 %dst_sel) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll index 1107b46f8f6d3..7433f6611cd9b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx950.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck 
-check-prefix=GFX950 %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950 %s declare i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.f16(i32 %old, <2 x half> %src, i32 %seed, float %scale, i32 %dst_sel) declare i32 @llvm.amdgcn.cvt.scalef32.sr.pk.fp4.bf16(i32 %old, <2 x bfloat> %src, i32 %seed, float %scale, i32 %dst_sel) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll index 0d4598f316c41..18b20e101a938 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-GISEL %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX950-GISEL %s declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.bf16(<32 x bfloat> %src, i32 %sr, float %scale) declare <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.f16(<32 x half> %src, i32 %sr, float %scale) @@ -19,42 +19,9 @@ define amdgpu_ps void @test_scalef32_sr_pk32_bf6_bf16_vv(<32 x bfloat> %src, i32 ; ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_bf6_bf16_vv: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v25, 16, v5 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX950-GISEL-NEXT: 
v_lshrrev_b32_e32 v27, 16, v7 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v29, 16, v9 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v35, 16, v15 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v0, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v1, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v2, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v3, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v4, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v5, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v6, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v7, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v8, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v9, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v10, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v11, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v12, v32 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v13, v33 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v14, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE 
src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v15, v35 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_bf6_bf16 v[20:25], v[0:15], v16, v17 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[18:19], v[20:23], off ; GFX950-GISEL-NEXT: global_store_dwordx2 v[18:19], v[24:25], off offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[18:19], v[20:23], off ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.bf16(<32 x bfloat> %src, i32 %sr, float %scale) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 @@ -88,82 +55,26 @@ define amdgpu_ps void @test_scalef32_sr_pk32_bf6_bf16_sl(<32 x bfloat> inreg %sr ; ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_bf6_bf16_sl: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: s_lshr_b32 s17, s0, 16 -; GFX950-GISEL-NEXT: s_lshr_b32 s18, s1, 16 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s17, 16 -; GFX950-GISEL-NEXT: s_and_b32 s0, s0, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s19, s2, 16 -; GFX950-GISEL-NEXT: s_or_b32 s0, s17, s0 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s18, 16 -; GFX950-GISEL-NEXT: s_and_b32 s1, s1, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s20, s3, 16 -; GFX950-GISEL-NEXT: s_or_b32 s1, s17, s1 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s19, 16 -; GFX950-GISEL-NEXT: s_and_b32 s2, s2, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s21, s4, 16 -; GFX950-GISEL-NEXT: s_or_b32 s2, s17, s2 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s20, 16 -; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s22, s5, 16 -; GFX950-GISEL-NEXT: s_or_b32 s3, s17, s3 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s21, 16 -; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s23, s6, 16 -; GFX950-GISEL-NEXT: s_or_b32 s4, s17, s4 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s22, 16 -; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s24, s7, 16 -; GFX950-GISEL-NEXT: s_or_b32 s5, s17, s5 
-; GFX950-GISEL-NEXT: s_lshl_b32 s17, s23, 16 -; GFX950-GISEL-NEXT: s_and_b32 s6, s6, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s25, s8, 16 -; GFX950-GISEL-NEXT: s_or_b32 s6, s17, s6 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s24, 16 -; GFX950-GISEL-NEXT: s_and_b32 s7, s7, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s26, s9, 16 -; GFX950-GISEL-NEXT: s_or_b32 s7, s17, s7 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s25, 16 -; GFX950-GISEL-NEXT: s_and_b32 s8, s8, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s27, s10, 16 -; GFX950-GISEL-NEXT: s_or_b32 s8, s17, s8 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s26, 16 -; GFX950-GISEL-NEXT: s_and_b32 s9, s9, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s28, s11, 16 -; GFX950-GISEL-NEXT: s_or_b32 s9, s17, s9 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s27, 16 -; GFX950-GISEL-NEXT: s_and_b32 s10, s10, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s29, s12, 16 -; GFX950-GISEL-NEXT: s_or_b32 s10, s17, s10 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s28, 16 -; GFX950-GISEL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s30, s13, 16 -; GFX950-GISEL-NEXT: s_or_b32 s11, s17, s11 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s29, 16 -; GFX950-GISEL-NEXT: s_and_b32 s12, s12, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s31, s14, 16 -; GFX950-GISEL-NEXT: s_or_b32 s12, s17, s12 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s30, 16 -; GFX950-GISEL-NEXT: s_and_b32 s13, s13, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s33, s15, 16 -; GFX950-GISEL-NEXT: s_or_b32 s13, s17, s13 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s31, 16 -; GFX950-GISEL-NEXT: s_and_b32 s14, s14, 0xffff -; GFX950-GISEL-NEXT: s_or_b32 s14, s17, s14 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s33, 16 -; GFX950-GISEL-NEXT: s_and_b32 s15, s15, 0xffff -; GFX950-GISEL-NEXT: s_or_b32 s15, s17, s15 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9] -; GFX950-GISEL-NEXT: 
v_mov_b64_e32 v[8:9], s[6:7] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, s2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, s3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, s5 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, s6 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, s7 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v12, s10 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v13, s11 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v14, s12 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v15, s13 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s14 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, s15 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, 0x42c80000 ; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_bf6_bf16 v[18:23], v[2:17], s16, v24 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off ; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.bf6.bf16(<32 x bfloat> %src, i32 %sr, float 100.0) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 @@ -244,42 +155,9 @@ define amdgpu_ps void @test_scalef32_sr_pk32_fp6_bf16_vv(<32 x bfloat> %src, i32 ; ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_fp6_bf16_vv: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v21, 16, v1 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v2 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v23, 16, v3 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v24, 16, v4 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v25, 16, v5 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v26, 16, v6 -; GFX950-GISEL-NEXT: 
v_lshrrev_b32_e32 v27, 16, v7 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v28, 16, v8 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v29, 16, v9 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v30, 16, v10 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v31, 16, v11 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v32, 16, v12 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v33, 16, v13 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GFX950-GISEL-NEXT: v_lshrrev_b32_e32 v35, 16, v15 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v0, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v1, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v2, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v3, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v4, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v5, v25 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v6, v26 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v7, v27 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v8, v28 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v9, v29 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v10, v30 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v11, v31 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v12, v32 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v13, v33 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v14, v34 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE 
src0_sel:WORD_0 -; GFX950-GISEL-NEXT: v_mov_b32_sdwa v15, v35 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GFX950-GISEL-NEXT: s_nop 0 ; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_fp6_bf16 v[20:25], v[0:15], v16, v17 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[18:19], v[20:23], off ; GFX950-GISEL-NEXT: global_store_dwordx2 v[18:19], v[24:25], off offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[18:19], v[20:23], off ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.bf16(<32 x bfloat> %src, i32 %sr, float %scale) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 @@ -313,82 +191,26 @@ define amdgpu_ps void @test_scalef32_sr_pk32_fp6_bf16_sl(<32 x bfloat> inreg %sr ; ; GFX950-GISEL-LABEL: test_scalef32_sr_pk32_fp6_bf16_sl: ; GFX950-GISEL: ; %bb.0: -; GFX950-GISEL-NEXT: s_lshr_b32 s17, s0, 16 -; GFX950-GISEL-NEXT: s_lshr_b32 s18, s1, 16 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s17, 16 -; GFX950-GISEL-NEXT: s_and_b32 s0, s0, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s19, s2, 16 -; GFX950-GISEL-NEXT: s_or_b32 s0, s17, s0 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s18, 16 -; GFX950-GISEL-NEXT: s_and_b32 s1, s1, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s20, s3, 16 -; GFX950-GISEL-NEXT: s_or_b32 s1, s17, s1 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s19, 16 -; GFX950-GISEL-NEXT: s_and_b32 s2, s2, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s21, s4, 16 -; GFX950-GISEL-NEXT: s_or_b32 s2, s17, s2 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s20, 16 -; GFX950-GISEL-NEXT: s_and_b32 s3, s3, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s22, s5, 16 -; GFX950-GISEL-NEXT: s_or_b32 s3, s17, s3 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s21, 16 -; GFX950-GISEL-NEXT: s_and_b32 s4, s4, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s23, s6, 16 -; GFX950-GISEL-NEXT: s_or_b32 s4, s17, s4 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s22, 16 -; GFX950-GISEL-NEXT: s_and_b32 s5, s5, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s24, s7, 16 -; GFX950-GISEL-NEXT: s_or_b32 s5, s17, s5 
-; GFX950-GISEL-NEXT: s_lshl_b32 s17, s23, 16 -; GFX950-GISEL-NEXT: s_and_b32 s6, s6, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s25, s8, 16 -; GFX950-GISEL-NEXT: s_or_b32 s6, s17, s6 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s24, 16 -; GFX950-GISEL-NEXT: s_and_b32 s7, s7, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s26, s9, 16 -; GFX950-GISEL-NEXT: s_or_b32 s7, s17, s7 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s25, 16 -; GFX950-GISEL-NEXT: s_and_b32 s8, s8, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s27, s10, 16 -; GFX950-GISEL-NEXT: s_or_b32 s8, s17, s8 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s26, 16 -; GFX950-GISEL-NEXT: s_and_b32 s9, s9, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s28, s11, 16 -; GFX950-GISEL-NEXT: s_or_b32 s9, s17, s9 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s27, 16 -; GFX950-GISEL-NEXT: s_and_b32 s10, s10, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s29, s12, 16 -; GFX950-GISEL-NEXT: s_or_b32 s10, s17, s10 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s28, 16 -; GFX950-GISEL-NEXT: s_and_b32 s11, s11, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s30, s13, 16 -; GFX950-GISEL-NEXT: s_or_b32 s11, s17, s11 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s29, 16 -; GFX950-GISEL-NEXT: s_and_b32 s12, s12, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s31, s14, 16 -; GFX950-GISEL-NEXT: s_or_b32 s12, s17, s12 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s30, 16 -; GFX950-GISEL-NEXT: s_and_b32 s13, s13, 0xffff -; GFX950-GISEL-NEXT: s_lshr_b32 s33, s15, 16 -; GFX950-GISEL-NEXT: s_or_b32 s13, s17, s13 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s31, 16 -; GFX950-GISEL-NEXT: s_and_b32 s14, s14, 0xffff -; GFX950-GISEL-NEXT: s_or_b32 s14, s17, s14 -; GFX950-GISEL-NEXT: s_lshl_b32 s17, s33, 16 -; GFX950-GISEL-NEXT: s_and_b32 s15, s15, 0xffff -; GFX950-GISEL-NEXT: s_or_b32 s15, s17, s15 -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[16:17], s[14:15] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[14:15], s[12:13] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[12:13], s[10:11] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[10:11], s[8:9] -; GFX950-GISEL-NEXT: 
v_mov_b64_e32 v[8:9], s[6:7] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[6:7], s[4:5] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3] -; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1] +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, s0 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, s1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, s2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, s3 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, s4 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, s5 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, s6 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, s7 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v10, s8 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v11, s9 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v12, s10 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v13, s11 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v14, s12 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v15, s13 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v16, s14 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v17, s15 ; GFX950-GISEL-NEXT: v_mov_b32_e32 v24, 0x42c80000 ; GFX950-GISEL-NEXT: v_cvt_scalef32_sr_pk32_fp6_bf16 v[18:23], v[2:17], s16, v24 -; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off ; GFX950-GISEL-NEXT: global_store_dwordx2 v[0:1], v[22:23], off offset:16 +; GFX950-GISEL-NEXT: global_store_dwordx4 v[0:1], v[18:21], off ; GFX950-GISEL-NEXT: s_endpgm %cvt = tail call <6 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk32.fp6.bf16(<32 x bfloat> %src, i32 %sr, float 100.0) store <6 x i32> %cvt, ptr addrspace(1) %out, align 8 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sr.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sr.ll index 4a58d6346fc57..64a15bc102759 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sr.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sr.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -o - %s | FileCheck -check-prefix=GFX950 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -o - %s | FileCheck -check-prefix=GFX950 %s +; RUN: llc 
-global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -o - %s | FileCheck -check-prefix=GFX950 %s declare <2 x half> @llvm.amdgcn.cvt.sr.f16.f32(<2 x half>, float, i32, i1) declare <2 x bfloat> @llvm.amdgcn.cvt.sr.bf16.f32(<2 x bfloat>, float, i32, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll index f694d55f83b68..eb5bded6d2610 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.read.tr.gfx950.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-GISEL %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX950-GISEL %s declare <2 x i32> @llvm.amdgcn.ds.read.tr4.b64.v2i32.p3(ptr addrspace(3)) declare <2 x i32> @llvm.amdgcn.ds.read.tr8.b64.v2i32.p3(ptr addrspace(3)) @@ -146,11 +146,11 @@ define amdgpu_ps void @ds_read_b64_tr_b16_v4bf16(ptr addrspace(3) %addr, ptr add ; ; GFX950-GISEL-LABEL: ds_read_b64_tr_b16_v4bf16: ; GFX950-GISEL: ; %bb.0: ; %entry -; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v2 +; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, v1 ; GFX950-GISEL-NEXT: ds_read_b64_tr_b16 v[0:1], v0 offset:32 -; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2 ; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-GISEL-NEXT: global_store_dwordx2 v[4:5], v[0:1], off +; GFX950-GISEL-NEXT: global_store_dwordx2 v[2:3], v[0:1], off ; GFX950-GISEL-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr addrspace(3) %addr, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll index 42acf089e8648..159592cab6a34 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11 -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11 +; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11 ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX950 -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX950-ISEL +; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX950-ISEL declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 %clamp) @@ -40,17 +40,17 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp( ; GFX950-ISEL-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp: ; GFX950-ISEL: ; %bb.0: ; %entry ; GFX950-ISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX950-ISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-ISEL-NEXT: s_load_dword s0, s[12:13], 0x0 ; GFX950-ISEL-NEXT: s_load_dword s1, s[14:15], 0x0 ; GFX950-ISEL-NEXT: s_load_dword s2, s[10:11], 0x0 ; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-ISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX950-ISEL-NEXT: v_dot2_f32_bf16 v0, s2, v0, v1 clamp -; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX950-ISEL-NEXT: s_nop 1 -; GFX950-ISEL-NEXT: global_store_dword v1, v0, s[8:9] +; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX950-ISEL-NEXT: 
v_mov_b32_e32 v2, s1 +; GFX950-ISEL-NEXT: v_dot2_f32_bf16 v1, s2, v1, v2 clamp +; GFX950-ISEL-NEXT: s_nop 2 +; GFX950-ISEL-NEXT: global_store_dword v0, v1, s[8:9] ; GFX950-ISEL-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -100,17 +100,17 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp( ; GFX950-ISEL-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp: ; GFX950-ISEL: ; %bb.0: ; %entry ; GFX950-ISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 +; GFX950-ISEL-NEXT: v_mov_b32_e32 v0, 0 ; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX950-ISEL-NEXT: s_load_dword s0, s[12:13], 0x0 ; GFX950-ISEL-NEXT: s_load_dword s1, s[14:15], 0x0 ; GFX950-ISEL-NEXT: s_load_dword s2, s[10:11], 0x0 ; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX950-ISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX950-ISEL-NEXT: v_dot2c_f32_bf16_e32 v1, s2, v0 -; GFX950-ISEL-NEXT: v_mov_b32_e32 v0, 0 -; GFX950-ISEL-NEXT: s_nop 1 -; GFX950-ISEL-NEXT: global_store_dword v0, v1, s[8:9] +; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, s0 +; GFX950-ISEL-NEXT: v_mov_b32_e32 v2, s1 +; GFX950-ISEL-NEXT: v_dot2c_f32_bf16_e32 v2, s2, v1 +; GFX950-ISEL-NEXT: s_nop 2 +; GFX950-ISEL-NEXT: global_store_dword v0, v2, s[8:9] ; GFX950-ISEL-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll index 8427b4e7f6f35..537aab9a3e9c5 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=GFX12 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: 
llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32 < %s | FileCheck -check-prefix=GFX12 %s declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1)) declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1)) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll index be4fa79951daf..4db256de1ce1b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s declare i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1)) declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1)) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll index 12f9029392a43..7be0d9ca329aa 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -global-isel=0 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-SDAG %s -; RUN: llc -mtriple=amdgcn -global-isel=1 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-GISEL %s +; RUN: llc -mtriple=amdgcn -global-isel=1 
-global-isel-abort=2 -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12-GISEL %s define amdgpu_ps float @atomic_pk_add_f16_1d_v2(<8 x i32> inreg %rsrc, <2 x half> %data, i32 %s) { ; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v2: @@ -156,16 +156,6 @@ define amdgpu_ps float @atomic_pk_add_bf16_1d_v4(<8 x i32> inreg %rsrc, <4 x bfl ; ; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4: ; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX12-GISEL-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN ; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, 0 @@ -190,16 +180,6 @@ define amdgpu_ps float @atomic_pk_add_bf16_1d_v4_noret(<8 x i32> inreg %rsrc, <4 ; ; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4_noret: ; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX12-GISEL-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX12-GISEL-NEXT: 
image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 @@ -219,16 +199,6 @@ define amdgpu_ps float @atomic_pk_add_bf16_1d_v4_nt(<8 x i32> inreg %rsrc, <4 x ; ; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4_nt: ; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX12-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-GISEL-NEXT: v_or_b32_e32 v0, v3, v0 -; GFX12-GISEL-NEXT: v_or_b32_e32 v1, v4, v1 ; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN ; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll index 722c53a9dd607..d9ee276c3f076 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx950.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.f16(<8 x half>, <8 x half>, <4 x 
float>, i32 immarg, i32 immarg, i32 immarg) declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half>, <8 x half>, <16 x float>, i32 immarg, i32 immarg, i32 immarg) @@ -1856,198 +1856,92 @@ define amdgpu_kernel void @test_mfma_i32_32x32x32_i8__vgprcd_mac_flags(<4 x i32> declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat>, <8 x bfloat>, <4 x float>, i32 immarg, i32 immarg, i32 immarg) define <4 x float> @test_mfma_f32_16x16x32_bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_f32_16x16x32_bf16: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v11 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] -; SDAG-NEXT: s_nop 6 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_f32_16x16x32_bf16: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GISEL-NEXT: v_mov_b32_sdwa v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5 -; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GISEL-NEXT: 
v_accvgpr_write_b32 a0, v8 -; GISEL-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] -; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_f32_16x16x32_bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] +; GCN-NEXT: s_nop 6 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0) ret <4 x float> %result } define <4 x float> @test_mfma_f32_16x16x32_bf16__flags(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) { -; SDAG-LABEL: test_mfma_f32_16x16x32_bf16__flags: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v8 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v9 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v10 -; SDAG-NEXT: v_accvgpr_write_b32 
a3, v11 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 -; SDAG-NEXT: s_nop 6 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_mfma_f32_16x16x32_bf16__flags: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v1 -; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v2 -; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v3 -; GISEL-NEXT: v_mov_b32_sdwa v0, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v1, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v2, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v3, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; GISEL-NEXT: v_lshrrev_b32_e32 v13, 16, v5 -; GISEL-NEXT: v_lshrrev_b32_e32 v14, 16, v6 -; GISEL-NEXT: v_lshrrev_b32_e32 v15, 16, v7 -; GISEL-NEXT: v_accvgpr_write_b32 a0, v8 -; GISEL-NEXT: v_mov_b32_sdwa v4, v12 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v5, v13 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v6, v14 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v7, v15 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_accvgpr_write_b32 a1, v9 -; GISEL-NEXT: v_accvgpr_write_b32 a2, v10 -; GISEL-NEXT: v_accvgpr_write_b32 a3, v11 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 -; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: v_accvgpr_read_b32 v0, a0 -; GISEL-NEXT: v_accvgpr_read_b32 v1, a1 -; GISEL-NEXT: 
v_accvgpr_read_b32 v2, a2 -; GISEL-NEXT: v_accvgpr_read_b32 v3, a3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_mfma_f32_16x16x32_bf16__flags: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v8 +; GCN-NEXT: v_accvgpr_write_b32 a1, v9 +; GCN-NEXT: v_accvgpr_write_b32 a2, v10 +; GCN-NEXT: v_accvgpr_write_b32 a3, v11 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:1 abid:1 blgp:1 +; GCN-NEXT: s_nop 6 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 1, i32 1, i32 1) ret <4 x float> %result } define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd(ptr addrspace(1) %out, <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 { -; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; SDAG-NEXT: v_mov_b32_e32 v8, 0 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] -; SDAG-NEXT: s_nop 6 -; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] -; SDAG-NEXT: s_endpgm -; -; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 
-; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] -; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] -; GISEL-NEXT: s_endpgm +; GCN-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GCN-NEXT: v_accvgpr_write_b32 a0, s0 +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GCN-NEXT: v_accvgpr_write_b32 a1, s1 +; GCN-NEXT: v_accvgpr_write_b32 a2, s2 +; GCN-NEXT: v_accvgpr_write_b32 a3, s3 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] +; GCN-NEXT: s_nop 6 +; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; GCN-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 0, i32 0, i32 0) store <4 x float> %result, ptr addrspace(1) %out ret void } define amdgpu_kernel void @test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags(ptr addrspace(1) %out, <8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2) #0 { -; SDAG-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: -; SDAG: ; %bb.0: -; SDAG-NEXT: 
s_load_dwordx8 s[8:15], s[4:5], 0x34 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; SDAG-NEXT: v_mov_b32_e32 v8, 0 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; SDAG-NEXT: v_accvgpr_write_b32 a0, s0 -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_accvgpr_write_b32 a1, s1 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s2 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s3 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 -; SDAG-NEXT: s_nop 6 -; SDAG-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] -; SDAG-NEXT: s_endpgm -; -; GISEL-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 -; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_accvgpr_write_b32 a0, s0 -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_accvgpr_write_b32 a1, s1 -; GISEL-NEXT: v_accvgpr_write_b32 a2, s2 -; GISEL-NEXT: v_accvgpr_write_b32 a3, s3 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 -; GISEL-NEXT: global_store_dwordx4 v0, a[0:3], s[6:7] -; GISEL-NEXT: s_endpgm +; GCN-LABEL: test_mfma_f32_16x16x32_bf16_no_agpr__vgprcd__flags: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x54 +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GCN-NEXT: 
v_mov_b64_e32 v[2:3], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GCN-NEXT: v_accvgpr_write_b32 a0, s0 +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GCN-NEXT: v_accvgpr_write_b32 a1, s1 +; GCN-NEXT: v_accvgpr_write_b32 a2, s2 +; GCN-NEXT: v_accvgpr_write_b32 a3, s3 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_mfma_f32_16x16x32_bf16 a[0:3], v[0:3], v[4:7], a[0:3] cbsz:3 abid:2 blgp:1 +; GCN-NEXT: s_nop 6 +; GCN-NEXT: global_store_dwordx4 v8, a[0:3], s[6:7] +; GCN-NEXT: s_endpgm %result = call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf16(<8 x bfloat> %arg0, <8 x bfloat> %arg1, <4 x float> %arg2, i32 3, i32 2, i32 1) store <4 x float> %result, ptr addrspace(1) %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll index 53e37479f68e6..481e721e3c21d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll @@ -1,9 +1,9 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s -; RUN: llc -global-isel=1 -mtriple=amdgcn 
-mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s ; GFX10PLUS-LABEL: {{^}}dpp8_test: ; GFX10PLUS: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll index 71961a57bd080..5eb6d203098ee 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s -; RUN: llc -global-isel=1 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s ; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s -; RUN: llc -global-isel=1 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s ; RUN: llc -global-isel=0 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s -; RUN: llc -global-isel=1 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 
-verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -amdgpu-load-store-vectorizer=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ptr.buffer.atomic.fadd_rtn_errors.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ptr.buffer.atomic.fadd_rtn_errors.ll index f8caf84d5c51a..09cc55b53539b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ptr.buffer.atomic.fadd_rtn_errors.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ptr.buffer.atomic.fadd_rtn_errors.ll @@ -15,14 +15,12 @@ ; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/struct-ret-f32-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-F32-GISEL %s ; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/raw-ret-v2f16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2F16-GISEL %s ; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/struct-ret-v2f16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2F16-GISEL %s -; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s -; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s - -; FIXME: These should fail when bfloat support is handled correctly -; xUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s -; xUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a 
-filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s -; xUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s -; xUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s +; RUN: not --crash llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s +; RUN: not --crash llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s +; RUN: not --crash llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s +; RUN: not --crash llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s +; RUN: not --crash llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s +; RUN: not --crash llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s ; Make sure buffer fadd atomics with return values are not selected ; for gfx908 where they do not work. 
@@ -66,7 +64,7 @@ define <2 x half> @struct_ptr_buffer_atomic_fadd_v2f16_rtn(<2 x half> %val, ptr ;--- raw-ret-v2bf16-error.ll ; ERR-RAW-V2BF16-SDAG: LLVM ERROR: Cannot select: {{.+}}: v2bf16,ch = BUFFER_ATOMIC_FADD -; ERR-RAW-V2BF16-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(<2 x s16>) = G_AMDGPU_BUFFER_ATOMIC_FADD +; ERR-RAW-V2BF16-GISEL: LLVM ERROR: Cannot select: {{.+}}: v2bf16,ch = BUFFER_ATOMIC_FADD define <2 x bfloat> @raw_ptr_buffer_atomic_fadd_v2bf16_rtn(<2 x bfloat> %val, <4 x i32> inreg %rsrc, i32 inreg %soffset) { %ret = call <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32 0, i32 %soffset, i32 0) @@ -75,7 +73,7 @@ define <2 x bfloat> @raw_ptr_buffer_atomic_fadd_v2bf16_rtn(<2 x bfloat> %val, <4 ;--- struct-ret-v2bf16-error.ll ; ERR-STRUCT-V2BF16-SDAG: LLVM ERROR: Cannot select: {{.+}}: v2bf16,ch = BUFFER_ATOMIC_FADD -; ERR-STRUCT-V2BF16-GISEL: LLVM ERROR: cannot select: %{{[0-9]+}}:vgpr_32(<2 x s16>) = G_AMDGPU_BUFFER_ATOMIC_FADD +; ERR-STRUCT-V2BF16-GISEL: LLVM ERROR: Cannot select: {{.+}}: v2bf16,ch = BUFFER_ATOMIC_FADD define <2 x bfloat> @struct_ptr_buffer_atomic_fadd_v2bf16_rtn(<2 x bfloat> %val, ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) { %ret = call <2 x bfloat> @llvm.amdgcn.struct.ptr.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, ptr addrspace(8) %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll index 9a2f0aa5adb77..0605a158b974f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK-SDAG -enable-var-scope %s -; RUN: llc 
-mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck -check-prefix=CHECK-GISEL -enable-var-scope %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefix=CHECK-GISEL -enable-var-scope %s define void @test_readfirstlane_i1(ptr addrspace(1) %out, i1 %src) { ; CHECK-SDAG-LABEL: test_readfirstlane_i1: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll index a8560ff1aa2b0..edb6ebcee1325 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-SDAG -enable-var-scope %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck --check-prefix=CHECK-GISEL -enable-var-scope %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -global-isel -global-isel-abort=2 < %s | FileCheck --check-prefix=CHECK-GISEL -enable-var-scope %s declare i32 @llvm.amdgcn.readlane.i32(i32, i32) #0 declare i64 @llvm.amdgcn.readlane.i64(i64, i32) #0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll index 0a330e91f8206..66c02a9bd0c6a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.smfmac.gfx950.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,SDAG %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s +; RUN: llc 
-mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -global-isel-abort=2 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s declare i32 @llvm.amdgcn.workitem.id.x() @@ -628,57 +628,31 @@ define <16 x float> @test_smfmac_f32_32x32x32_f16__sgpr(<8 x half> inreg %arg0, declare <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat>, <16 x bfloat>, <4 x float>, i32, i32 immarg, i32 immarg) define amdgpu_kernel void @test_smfmac_f32_16x16x64_bf16__vgpr(ptr addrspace(1) %arg, <8 x bfloat> %a, <16 x bfloat> %b, i32 %idx) #0 { -; SDAG-LABEL: test_smfmac_f32_16x16x64_bf16__vgpr: -; SDAG: ; %bb.0: ; %bb -; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 -; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] -; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: v_mov_b64_e32 v[14:15], s[2:3] -; SDAG-NEXT: v_mov_b64_e32 v[12:13], s[0:1] -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; SDAG-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; SDAG-NEXT: v_mov_b32_e32 v17, s16 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_16x16x64_bf16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 -; SDAG-NEXT: s_nop 6 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] -; SDAG-NEXT: s_endpgm -; -; GISEL-LABEL: test_smfmac_f32_16x16x64_bf16__vgpr: -; GISEL: ; %bb.0: ; %bb -; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 -; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GISEL-NEXT: v_lshlrev_b32_e32 v0, 4, v0 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] -; GISEL-NEXT: s_load_dwordx8 s[8:15], 
s[4:5], 0x44 -; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64 -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[14:15] -; GISEL-NEXT: v_mov_b32_e32 v16, s16 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_f32_16x16x64_bf16 v[8:11], v[12:15], v[0:7], v16 cbsz:1 abid:2 -; GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GISEL-NEXT: s_nop 5 -; GISEL-NEXT: global_store_dwordx4 v0, v[8:11], s[6:7] -; GISEL-NEXT: s_endpgm +; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__vgpr: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_load_dwordx4 v[8:11], v0, s[6:7] +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; GCN-NEXT: s_load_dword s16, s[4:5], 0x64 +; GCN-NEXT: v_mov_b64_e32 v[14:15], s[2:3] +; GCN-NEXT: v_mov_b64_e32 v[12:13], s[0:1] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b64_e32 v[0:1], s[8:9] +; GCN-NEXT: v_mov_b64_e32 v[2:3], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[4:5], s[12:13] +; GCN-NEXT: v_mov_b64_e32 v[6:7], s[14:15] +; GCN-NEXT: v_mov_b32_e32 v17, s16 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 v[8:11], v[12:15], v[0:7], v17 cbsz:1 abid:2 +; GCN-NEXT: s_nop 6 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] +; GCN-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <4 x float>, ptr addrspace(1) %arg, i32 %id @@ -689,266 +663,94 @@ bb: } define <4 x float> @test_smfmac_f32_16x16x64_bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3) { -; 
SDAG-LABEL: test_smfmac_f32_16x16x64_bf16: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 -; SDAG-NEXT: s_nop 6 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x64_bf16: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; GISEL-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; GISEL-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; GISEL-NEXT: v_mov_b32_sdwa v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v2, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v3, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; GISEL-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; GISEL-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; GISEL-NEXT: v_lshrrev_b32_e32 v20, 16, v7 -; GISEL-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; GISEL-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; GISEL-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GISEL-NEXT: v_mov_b32_sdwa v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v5, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v7, v20 dst_sel:WORD_1 
dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v8, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v9, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v10, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v11, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 -; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x64_bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 +; GCN-NEXT: s_nop 6 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags0(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x64_bf16__flags0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], 
v[0:3], v[4:11], v16 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 6 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x64_bf16__flags0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; GISEL-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; GISEL-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; GISEL-NEXT: v_mov_b32_sdwa v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v2, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v3, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; GISEL-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; GISEL-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; GISEL-NEXT: v_lshrrev_b32_e32 v20, 16, v7 -; GISEL-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; GISEL-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; GISEL-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GISEL-NEXT: v_mov_b32_sdwa v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v5, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v7, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v8, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v9, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v10, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: 
v_mov_b32_sdwa v11, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:1 abid:3 -; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:1 abid:3 +; GCN-NEXT: s_nop 6 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x64_bf16__flags1(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x64_bf16__flags1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 6 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x64_bf16__flags1: -; 
GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_lshrrev_b32_e32 v17, 16, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; GISEL-NEXT: v_lshrrev_b32_e32 v19, 16, v2 -; GISEL-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; GISEL-NEXT: v_mov_b32_sdwa v0, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v1, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v2, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v3, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_lshrrev_b32_e32 v17, 16, v4 -; GISEL-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; GISEL-NEXT: v_lshrrev_b32_e32 v19, 16, v6 -; GISEL-NEXT: v_lshrrev_b32_e32 v20, 16, v7 -; GISEL-NEXT: v_lshrrev_b32_e32 v21, 16, v8 -; GISEL-NEXT: v_lshrrev_b32_e32 v22, 16, v9 -; GISEL-NEXT: v_lshrrev_b32_e32 v23, 16, v10 -; GISEL-NEXT: v_lshrrev_b32_e32 v24, 16, v11 -; GISEL-NEXT: v_mov_b32_sdwa v4, v17 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v5, v18 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v6, v19 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v7, v20 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v8, v21 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v9, v22 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v10, v23 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v11, v24 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_16x16x64_bf16 v[12:15], v[0:3], v[4:11], v16 cbsz:3 abid:1 -; GISEL-NEXT: s_nop 6 -; GISEL-NEXT: v_mov_b32_e32 v0, v12 -; GISEL-NEXT: v_mov_b32_e32 v1, v13 -; GISEL-NEXT: 
v_mov_b32_e32 v2, v14 -; GISEL-NEXT: v_mov_b32_e32 v3, v15 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[0:3], v[4:11], v16 cbsz:3 abid:1 +; GCN-NEXT: s_nop 6 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <4 x float> %result } define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0, <16 x bfloat> inreg %arg1, <4 x float> inreg %arg2, i32 inreg %arg3) { -; SDAG-LABEL: test_smfmac_f32_16x16x64_bf16__sgpr: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v8, s0 -; SDAG-NEXT: v_mov_b32_e32 v9, s1 -; SDAG-NEXT: v_mov_b32_e32 v10, s2 -; SDAG-NEXT: v_mov_b32_e32 v11, s3 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s20 -; SDAG-NEXT: v_mov_b32_e32 v5, s21 -; SDAG-NEXT: v_mov_b32_e32 v6, s22 -; SDAG-NEXT: v_mov_b32_e32 v7, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a0, s24 -; SDAG-NEXT: v_accvgpr_write_b32 a1, s25 -; SDAG-NEXT: v_accvgpr_write_b32 a2, s26 -; SDAG-NEXT: v_accvgpr_write_b32 a3, s27 -; SDAG-NEXT: v_mov_b32_e32 v12, s28 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[8:11], v[0:7], v12 -; SDAG-NEXT: s_nop 6 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; 
SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_16x16x64_bf16__sgpr: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_lshr_b32 s4, s0, 16 -; GISEL-NEXT: s_lshr_b32 s5, s1, 16 -; GISEL-NEXT: s_lshl_b32 s4, s4, 16 -; GISEL-NEXT: s_and_b32 s0, s0, 0xffff -; GISEL-NEXT: s_lshr_b32 s6, s2, 16 -; GISEL-NEXT: s_or_b32 s0, s4, s0 -; GISEL-NEXT: s_lshl_b32 s4, s5, 16 -; GISEL-NEXT: s_and_b32 s1, s1, 0xffff -; GISEL-NEXT: s_lshr_b32 s7, s3, 16 -; GISEL-NEXT: s_or_b32 s1, s4, s1 -; GISEL-NEXT: s_lshl_b32 s4, s6, 16 -; GISEL-NEXT: s_and_b32 s2, s2, 0xffff -; GISEL-NEXT: s_or_b32 s2, s4, s2 -; GISEL-NEXT: s_lshl_b32 s4, s7, 16 -; GISEL-NEXT: s_and_b32 s3, s3, 0xffff -; GISEL-NEXT: s_or_b32 s3, s4, s3 -; GISEL-NEXT: s_lshr_b32 s4, s16, 16 -; GISEL-NEXT: s_lshr_b32 s5, s17, 16 -; GISEL-NEXT: s_lshl_b32 s4, s4, 16 -; GISEL-NEXT: s_and_b32 s12, s16, 0xffff -; GISEL-NEXT: s_lshr_b32 s6, s18, 16 -; GISEL-NEXT: s_or_b32 s4, s4, s12 -; GISEL-NEXT: s_lshl_b32 s5, s5, 16 -; GISEL-NEXT: s_and_b32 s12, s17, 0xffff -; GISEL-NEXT: s_lshr_b32 s7, s19, 16 -; GISEL-NEXT: s_or_b32 s5, s5, s12 -; GISEL-NEXT: s_lshl_b32 s6, s6, 16 -; GISEL-NEXT: s_and_b32 s12, s18, 0xffff -; GISEL-NEXT: s_lshr_b32 s8, s20, 16 -; GISEL-NEXT: s_or_b32 s6, s6, s12 -; GISEL-NEXT: s_lshl_b32 s7, s7, 16 -; GISEL-NEXT: s_and_b32 s12, s19, 0xffff -; GISEL-NEXT: s_lshr_b32 s9, s21, 16 -; GISEL-NEXT: s_or_b32 s7, s7, s12 -; GISEL-NEXT: s_lshl_b32 s8, s8, 16 -; GISEL-NEXT: s_and_b32 s12, s20, 0xffff -; GISEL-NEXT: v_mov_b64_e32 v[14:15], s[2:3] -; GISEL-NEXT: s_lshr_b32 s10, s22, 16 -; GISEL-NEXT: s_or_b32 s8, s8, s12 -; GISEL-NEXT: s_lshl_b32 s9, s9, 16 -; GISEL-NEXT: s_and_b32 s12, s21, 0xffff -; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[0:1] -; GISEL-NEXT: s_lshr_b32 s11, s23, 16 -; GISEL-NEXT: s_or_b32 s9, s9, s12 -; GISEL-NEXT: s_lshl_b32 s10, s10, 16 -; GISEL-NEXT: s_and_b32 s12, s22, 
0xffff -; GISEL-NEXT: s_or_b32 s10, s10, s12 -; GISEL-NEXT: s_lshl_b32 s11, s11, 16 -; GISEL-NEXT: s_and_b32 s12, s23, 0xffff -; GISEL-NEXT: s_or_b32 s11, s11, s12 -; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[4:5] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[6:7] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[8:9] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[26:27] -; GISEL-NEXT: v_mov_b32_e32 v16, s28 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_16x16x64_bf16 v[0:3], v[12:15], v[4:11], v16 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_16x16x64_bf16__sgpr: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v8, s0 +; GCN-NEXT: v_mov_b32_e32 v9, s1 +; GCN-NEXT: v_mov_b32_e32 v10, s2 +; GCN-NEXT: v_mov_b32_e32 v11, s3 +; GCN-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NEXT: v_mov_b32_e32 v4, s20 +; GCN-NEXT: v_mov_b32_e32 v5, s21 +; GCN-NEXT: v_mov_b32_e32 v6, s22 +; GCN-NEXT: v_mov_b32_e32 v7, s23 +; GCN-NEXT: v_accvgpr_write_b32 a0, s24 +; GCN-NEXT: v_accvgpr_write_b32 a1, s25 +; GCN-NEXT: v_accvgpr_write_b32 a2, s26 +; GCN-NEXT: v_accvgpr_write_b32 a3, s27 +; GCN-NEXT: v_mov_b32_e32 v12, s28 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_smfmac_f32_16x16x64_bf16 a[0:3], v[8:11], v[0:7], v12 +; GCN-NEXT: s_nop 6 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <4 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <4 x float> %result } @@ -960,71 +762,38 @@ define <4 x float> @test_smfmac_f32_16x16x64_bf16__sgpr(<8 x bfloat> inreg %arg0 declare <16 x float> 
@llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat>, <16 x bfloat>, <16 x float>, i32, i32 immarg, i32 immarg) define amdgpu_kernel void @test_smfmac_f32_32x32x32_bf16__vgpr(ptr addrspace(1) %arg, <8 x bfloat> %a, <16 x bfloat> %b, i32 %idx) #0 { -; SDAG-LABEL: test_smfmac_f32_32x32x32_bf16__vgpr: -; SDAG: ; %bb.0: ; %bb -; SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 -; SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; SDAG-NEXT: v_lshlrev_b32_e32 v16, 6, v0 -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 -; SDAG-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 -; SDAG-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 -; SDAG-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; SDAG-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 -; SDAG-NEXT: s_load_dword s16, s[4:5], 0x64 -; SDAG-NEXT: v_mov_b64_e32 v[26:27], s[2:3] -; SDAG-NEXT: v_mov_b64_e32 v[24:25], s[0:1] -; SDAG-NEXT: s_waitcnt lgkmcnt(0) -; SDAG-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; SDAG-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; SDAG-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; SDAG-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; SDAG-NEXT: v_mov_b32_e32 v28, s16 -; SDAG-NEXT: s_waitcnt vmcnt(0) -; SDAG-NEXT: s_nop 0 -; SDAG-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 -; SDAG-NEXT: v_mov_b32_e32 v16, 0 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 -; SDAG-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 -; SDAG-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] -; SDAG-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16 -; SDAG-NEXT: s_endpgm -; -; GISEL-LABEL: test_smfmac_f32_32x32x32_bf16__vgpr: -; GISEL: ; %bb.0: ; %bb -; GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 -; GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 -; GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GISEL-NEXT: v_lshlrev_b32_e32 v16, 6, 
v0 -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] -; GISEL-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 -; GISEL-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 -; GISEL-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 -; GISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 -; GISEL-NEXT: s_load_dword s16, s[4:5], 0x64 -; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[0:1] -; GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[10:11] -; GISEL-NEXT: v_mov_b64_e32 v[16:17], s[8:9] -; GISEL-NEXT: v_mov_b32_e32 v28, s16 -; GISEL-NEXT: s_waitcnt vmcnt(0) -; GISEL-NEXT: s_nop 0 -; GISEL-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 -; GISEL-NEXT: v_mov_b32_e32 v16, 0 -; GISEL-NEXT: s_nop 7 -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] -; GISEL-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16 -; GISEL-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 -; GISEL-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 -; GISEL-NEXT: s_endpgm +; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__vgpr: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x34 +; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GCN-NEXT: v_lshlrev_b32_e32 v16, 6, v0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[6:7] offset:48 +; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[6:7] offset:32 +; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[6:7] offset:16 +; GCN-NEXT: global_load_dwordx4 v[0:3], v16, s[6:7] +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x44 +; GCN-NEXT: s_load_dword s16, s[4:5], 0x64 +; GCN-NEXT: v_mov_b64_e32 v[26:27], s[2:3] +; GCN-NEXT: v_mov_b64_e32 v[24:25], s[0:1] +; GCN-NEXT: s_waitcnt 
lgkmcnt(0) +; GCN-NEXT: v_mov_b64_e32 v[22:23], s[14:15] +; GCN-NEXT: v_mov_b64_e32 v[20:21], s[12:13] +; GCN-NEXT: v_mov_b64_e32 v[18:19], s[10:11] +; GCN-NEXT: v_mov_b64_e32 v[16:17], s[8:9] +; GCN-NEXT: v_mov_b32_e32 v28, s16 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[24:27], v[16:23], v28 cbsz:1 abid:2 +; GCN-NEXT: v_mov_b32_e32 v16, 0 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: global_store_dwordx4 v16, v[8:11], s[6:7] offset:32 +; GCN-NEXT: global_store_dwordx4 v16, v[12:15], s[6:7] offset:48 +; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[6:7] +; GCN-NEXT: global_store_dwordx4 v16, v[4:7], s[6:7] offset:16 +; GCN-NEXT: s_endpgm bb: %id = call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr <16 x float>, ptr addrspace(1) %arg, i32 %id @@ -1035,448 +804,209 @@ bb: } define <16 x float> @test_smfmac_f32_32x32x32_bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x32_bf16: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 
-; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x32_bf16: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v49 -; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v50 -; GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v51 -; GISEL-NEXT: v_mov_b32_sdwa v48, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v31 -; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v32 -; GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v33 -; GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v34 -; GISEL-NEXT: v_lshrrev_b32_e32 
v5, 16, v35 -; GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; GISEL-NEXT: v_mov_b32_sdwa v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v34, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v35, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v36, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v37, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[48:51], v[30:37], v28 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x32_bf16: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: 
v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags0(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x32_bf16__flags0: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, 
v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x32_bf16__flags0: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v49 -; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v50 -; GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v51 -; GISEL-NEXT: v_mov_b32_sdwa v48, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; 
GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v31 -; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v32 -; GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v33 -; GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v34 -; GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v35 -; GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; GISEL-NEXT: v_mov_b32_sdwa v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v34, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v35, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v36, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v37, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: v_smfmac_f32_32x32x32_bf16 v[0:15], v[48:51], v[30:37], v28 cbsz:1 abid:3 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags0: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: 
v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:1 abid:3 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 1, i32 immarg 3) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x32_bf16__flags1(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x32_bf16__flags1: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 
a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 -; SDAG-NEXT: s_nop 7 -; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x32_bf16__flags1: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: v_mov_b32_e32 v48, v0 -; GISEL-NEXT: v_mov_b32_e32 v49, v1 -; GISEL-NEXT: v_mov_b32_e32 v50, v2 -; GISEL-NEXT: v_mov_b32_e32 v51, v3 -; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v48 -; GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v49 -; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v50 -; GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v51 -; GISEL-NEXT: v_mov_b32_sdwa v48, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v49, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa 
v50, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v51, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_e32 v30, v4 -; GISEL-NEXT: v_mov_b32_e32 v31, v5 -; GISEL-NEXT: v_mov_b32_e32 v32, v6 -; GISEL-NEXT: v_mov_b32_e32 v33, v7 -; GISEL-NEXT: v_mov_b32_e32 v34, v8 -; GISEL-NEXT: v_mov_b32_e32 v35, v9 -; GISEL-NEXT: v_mov_b32_e32 v36, v10 -; GISEL-NEXT: v_mov_b32_e32 v37, v11 -; GISEL-NEXT: v_lshrrev_b32_e32 v0, 16, v30 -; GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v31 -; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v32 -; GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v33 -; GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v34 -; GISEL-NEXT: v_lshrrev_b32_e32 v5, 16, v35 -; GISEL-NEXT: v_lshrrev_b32_e32 v6, 16, v36 -; GISEL-NEXT: v_lshrrev_b32_e32 v7, 16, v37 -; GISEL-NEXT: v_mov_b32_sdwa v30, v0 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v31, v1 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v32, v2 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v33, v3 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v34, v4 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v35, v5 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v36, v6 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b32_sdwa v37, v7 dst_sel:WORD_1 dst_unused:UNUSED_PRESERVE src0_sel:WORD_0 -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[12:13] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[14:15] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[16:17] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[26:27] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: 
v_smfmac_f32_32x32x32_bf16 v[0:15], v[48:51], v[30:37], v28 cbsz:3 abid:1 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__flags1: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[0:3], v[4:11], v28 cbsz:3 abid:1 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 +; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 3, i32 immarg 1) ret <16 x float> %result } define <16 x float> @test_smfmac_f32_32x32x32_bf16__sgpr(<8 x bfloat> inreg %arg0, <16 x 
bfloat> inreg %arg1, <16 x float> inreg %arg2, i32 inreg %arg3) { -; SDAG-LABEL: test_smfmac_f32_32x32x32_bf16__sgpr: -; SDAG: ; %bb.0: -; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: v_mov_b32_e32 v28, s0 -; SDAG-NEXT: v_mov_b32_e32 v29, s1 -; SDAG-NEXT: v_mov_b32_e32 v30, s2 -; SDAG-NEXT: v_mov_b32_e32 v31, s3 -; SDAG-NEXT: v_mov_b32_e32 v12, s24 -; SDAG-NEXT: v_mov_b32_e32 v27, v9 -; SDAG-NEXT: v_mov_b32_e32 v26, v8 -; SDAG-NEXT: v_mov_b32_e32 v25, v7 -; SDAG-NEXT: v_mov_b32_e32 v24, v6 -; SDAG-NEXT: v_mov_b32_e32 v23, v5 -; SDAG-NEXT: v_mov_b32_e32 v22, v4 -; SDAG-NEXT: v_mov_b32_e32 v21, v3 -; SDAG-NEXT: v_mov_b32_e32 v20, v2 -; SDAG-NEXT: v_mov_b32_e32 v19, v1 -; SDAG-NEXT: v_mov_b32_e32 v18, v0 -; SDAG-NEXT: v_mov_b32_e32 v13, s25 -; SDAG-NEXT: v_mov_b32_e32 v14, s26 -; SDAG-NEXT: v_mov_b32_e32 v15, s27 -; SDAG-NEXT: v_mov_b32_e32 v16, s28 -; SDAG-NEXT: v_mov_b32_e32 v17, s29 -; SDAG-NEXT: v_accvgpr_write_b32 a0, v12 -; SDAG-NEXT: v_mov_b32_e32 v0, s16 -; SDAG-NEXT: v_mov_b32_e32 v1, s17 -; SDAG-NEXT: v_mov_b32_e32 v2, s18 -; SDAG-NEXT: v_mov_b32_e32 v3, s19 -; SDAG-NEXT: v_mov_b32_e32 v4, s20 -; SDAG-NEXT: v_mov_b32_e32 v5, s21 -; SDAG-NEXT: v_mov_b32_e32 v6, s22 -; SDAG-NEXT: v_mov_b32_e32 v7, s23 -; SDAG-NEXT: v_accvgpr_write_b32 a1, v13 -; SDAG-NEXT: v_accvgpr_write_b32 a2, v14 -; SDAG-NEXT: v_accvgpr_write_b32 a3, v15 -; SDAG-NEXT: v_accvgpr_write_b32 a4, v16 -; SDAG-NEXT: v_accvgpr_write_b32 a5, v17 -; SDAG-NEXT: v_accvgpr_write_b32 a6, v18 -; SDAG-NEXT: v_accvgpr_write_b32 a7, v19 -; SDAG-NEXT: v_accvgpr_write_b32 a8, v20 -; SDAG-NEXT: v_accvgpr_write_b32 a9, v21 -; SDAG-NEXT: v_accvgpr_write_b32 a10, v22 -; SDAG-NEXT: v_accvgpr_write_b32 a11, v23 -; SDAG-NEXT: v_accvgpr_write_b32 a12, v24 -; SDAG-NEXT: v_accvgpr_write_b32 a13, v25 -; SDAG-NEXT: v_accvgpr_write_b32 a14, v26 -; SDAG-NEXT: v_accvgpr_write_b32 a15, v27 -; SDAG-NEXT: s_nop 1 -; SDAG-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[28:31], v[0:7], v10 -; SDAG-NEXT: s_nop 7 
-; SDAG-NEXT: s_nop 2 -; SDAG-NEXT: v_accvgpr_read_b32 v0, a0 -; SDAG-NEXT: v_accvgpr_read_b32 v1, a1 -; SDAG-NEXT: v_accvgpr_read_b32 v2, a2 -; SDAG-NEXT: v_accvgpr_read_b32 v3, a3 -; SDAG-NEXT: v_accvgpr_read_b32 v4, a4 -; SDAG-NEXT: v_accvgpr_read_b32 v5, a5 -; SDAG-NEXT: v_accvgpr_read_b32 v6, a6 -; SDAG-NEXT: v_accvgpr_read_b32 v7, a7 -; SDAG-NEXT: v_accvgpr_read_b32 v8, a8 -; SDAG-NEXT: v_accvgpr_read_b32 v9, a9 -; SDAG-NEXT: v_accvgpr_read_b32 v10, a10 -; SDAG-NEXT: v_accvgpr_read_b32 v11, a11 -; SDAG-NEXT: v_accvgpr_read_b32 v12, a12 -; SDAG-NEXT: v_accvgpr_read_b32 v13, a13 -; SDAG-NEXT: v_accvgpr_read_b32 v14, a14 -; SDAG-NEXT: v_accvgpr_read_b32 v15, a15 -; SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GISEL-LABEL: test_smfmac_f32_32x32x32_bf16__sgpr: -; GISEL: ; %bb.0: -; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_lshr_b32 s4, s0, 16 -; GISEL-NEXT: s_lshr_b32 s5, s1, 16 -; GISEL-NEXT: s_lshl_b32 s4, s4, 16 -; GISEL-NEXT: s_and_b32 s0, s0, 0xffff -; GISEL-NEXT: s_lshr_b32 s6, s2, 16 -; GISEL-NEXT: s_or_b32 s8, s4, s0 -; GISEL-NEXT: s_lshl_b32 s0, s5, 16 -; GISEL-NEXT: s_and_b32 s1, s1, 0xffff -; GISEL-NEXT: s_lshr_b32 s7, s3, 16 -; GISEL-NEXT: s_or_b32 s9, s0, s1 -; GISEL-NEXT: s_lshl_b32 s0, s6, 16 -; GISEL-NEXT: s_and_b32 s1, s2, 0xffff -; GISEL-NEXT: s_or_b32 s10, s0, s1 -; GISEL-NEXT: s_lshl_b32 s0, s7, 16 -; GISEL-NEXT: s_and_b32 s1, s3, 0xffff -; GISEL-NEXT: s_or_b32 s11, s0, s1 -; GISEL-NEXT: s_lshr_b32 s0, s16, 16 -; GISEL-NEXT: s_lshr_b32 s1, s17, 16 -; GISEL-NEXT: s_lshl_b32 s0, s0, 16 -; GISEL-NEXT: s_and_b32 s12, s16, 0xffff -; GISEL-NEXT: s_lshr_b32 s2, s18, 16 -; GISEL-NEXT: s_or_b32 s0, s0, s12 -; GISEL-NEXT: s_lshl_b32 s1, s1, 16 -; GISEL-NEXT: s_and_b32 s12, s17, 0xffff -; GISEL-NEXT: s_lshr_b32 s3, s19, 16 -; GISEL-NEXT: s_or_b32 s1, s1, s12 -; GISEL-NEXT: s_lshl_b32 s2, s2, 16 -; GISEL-NEXT: s_and_b32 s12, s18, 0xffff -; GISEL-NEXT: s_lshr_b32 s4, s20, 16 -; GISEL-NEXT: s_or_b32 s2, s2, s12 -; GISEL-NEXT: s_lshl_b32 
s3, s3, 16 -; GISEL-NEXT: s_and_b32 s12, s19, 0xffff -; GISEL-NEXT: s_lshr_b32 s5, s21, 16 -; GISEL-NEXT: s_or_b32 s3, s3, s12 -; GISEL-NEXT: s_lshl_b32 s4, s4, 16 -; GISEL-NEXT: s_and_b32 s12, s20, 0xffff -; GISEL-NEXT: s_lshr_b32 s6, s22, 16 -; GISEL-NEXT: s_or_b32 s4, s4, s12 -; GISEL-NEXT: s_lshl_b32 s5, s5, 16 -; GISEL-NEXT: s_and_b32 s12, s21, 0xffff -; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11] -; GISEL-NEXT: s_lshr_b32 s7, s23, 16 -; GISEL-NEXT: s_or_b32 s5, s5, s12 -; GISEL-NEXT: s_lshl_b32 s6, s6, 16 -; GISEL-NEXT: s_and_b32 s12, s22, 0xffff -; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9] -; GISEL-NEXT: s_or_b32 s6, s6, s12 -; GISEL-NEXT: s_lshl_b32 s7, s7, 16 -; GISEL-NEXT: s_and_b32 s12, s23, 0xffff -; GISEL-NEXT: s_or_b32 s7, s7, s12 -; GISEL-NEXT: v_mov_b32_e32 v18, s24 -; GISEL-NEXT: v_mov_b32_e32 v19, s25 -; GISEL-NEXT: v_mov_b32_e32 v24, v0 -; GISEL-NEXT: v_mov_b32_e32 v25, v1 -; GISEL-NEXT: v_mov_b32_e32 v26, v2 -; GISEL-NEXT: v_mov_b32_e32 v27, v3 -; GISEL-NEXT: v_mov_b32_e32 v28, v4 -; GISEL-NEXT: v_mov_b32_e32 v29, v5 -; GISEL-NEXT: v_mov_b32_e32 v30, v6 -; GISEL-NEXT: v_mov_b32_e32 v31, v7 -; GISEL-NEXT: v_mov_b32_e32 v32, v8 -; GISEL-NEXT: v_mov_b32_e32 v33, v9 -; GISEL-NEXT: v_mov_b32_e32 v16, v10 -; GISEL-NEXT: v_mov_b32_e32 v20, s26 -; GISEL-NEXT: v_mov_b32_e32 v21, s27 -; GISEL-NEXT: v_mov_b32_e32 v22, s28 -; GISEL-NEXT: v_mov_b32_e32 v23, s29 -; GISEL-NEXT: v_mov_b64_e32 v[54:55], s[6:7] -; GISEL-NEXT: v_mov_b64_e32 v[0:1], v[18:19] -; GISEL-NEXT: v_mov_b64_e32 v[52:53], s[4:5] -; GISEL-NEXT: v_mov_b64_e32 v[50:51], s[2:3] -; GISEL-NEXT: v_mov_b64_e32 v[48:49], s[0:1] -; GISEL-NEXT: v_mov_b64_e32 v[2:3], v[20:21] -; GISEL-NEXT: v_mov_b64_e32 v[4:5], v[22:23] -; GISEL-NEXT: v_mov_b64_e32 v[6:7], v[24:25] -; GISEL-NEXT: v_mov_b64_e32 v[8:9], v[26:27] -; GISEL-NEXT: v_mov_b64_e32 v[10:11], v[28:29] -; GISEL-NEXT: v_mov_b64_e32 v[12:13], v[30:31] -; GISEL-NEXT: v_mov_b64_e32 v[14:15], v[32:33] -; GISEL-NEXT: s_nop 1 -; GISEL-NEXT: 
v_smfmac_f32_32x32x32_bf16 v[0:15], v[34:37], v[48:55], v16 -; GISEL-NEXT: s_setpc_b64 s[30:31] +; GCN-LABEL: test_smfmac_f32_32x32x32_bf16__sgpr: +; GCN: ; %bb.0: +; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v28, s0 +; GCN-NEXT: v_mov_b32_e32 v29, s1 +; GCN-NEXT: v_mov_b32_e32 v30, s2 +; GCN-NEXT: v_mov_b32_e32 v31, s3 +; GCN-NEXT: v_mov_b32_e32 v12, s24 +; GCN-NEXT: v_mov_b32_e32 v27, v9 +; GCN-NEXT: v_mov_b32_e32 v26, v8 +; GCN-NEXT: v_mov_b32_e32 v25, v7 +; GCN-NEXT: v_mov_b32_e32 v24, v6 +; GCN-NEXT: v_mov_b32_e32 v23, v5 +; GCN-NEXT: v_mov_b32_e32 v22, v4 +; GCN-NEXT: v_mov_b32_e32 v21, v3 +; GCN-NEXT: v_mov_b32_e32 v20, v2 +; GCN-NEXT: v_mov_b32_e32 v19, v1 +; GCN-NEXT: v_mov_b32_e32 v18, v0 +; GCN-NEXT: v_mov_b32_e32 v13, s25 +; GCN-NEXT: v_mov_b32_e32 v14, s26 +; GCN-NEXT: v_mov_b32_e32 v15, s27 +; GCN-NEXT: v_mov_b32_e32 v16, s28 +; GCN-NEXT: v_mov_b32_e32 v17, s29 +; GCN-NEXT: v_accvgpr_write_b32 a0, v12 +; GCN-NEXT: v_mov_b32_e32 v0, s16 +; GCN-NEXT: v_mov_b32_e32 v1, s17 +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NEXT: v_mov_b32_e32 v4, s20 +; GCN-NEXT: v_mov_b32_e32 v5, s21 +; GCN-NEXT: v_mov_b32_e32 v6, s22 +; GCN-NEXT: v_mov_b32_e32 v7, s23 +; GCN-NEXT: v_accvgpr_write_b32 a1, v13 +; GCN-NEXT: v_accvgpr_write_b32 a2, v14 +; GCN-NEXT: v_accvgpr_write_b32 a3, v15 +; GCN-NEXT: v_accvgpr_write_b32 a4, v16 +; GCN-NEXT: v_accvgpr_write_b32 a5, v17 +; GCN-NEXT: v_accvgpr_write_b32 a6, v18 +; GCN-NEXT: v_accvgpr_write_b32 a7, v19 +; GCN-NEXT: v_accvgpr_write_b32 a8, v20 +; GCN-NEXT: v_accvgpr_write_b32 a9, v21 +; GCN-NEXT: v_accvgpr_write_b32 a10, v22 +; GCN-NEXT: v_accvgpr_write_b32 a11, v23 +; GCN-NEXT: v_accvgpr_write_b32 a12, v24 +; GCN-NEXT: v_accvgpr_write_b32 a13, v25 +; GCN-NEXT: v_accvgpr_write_b32 a14, v26 +; GCN-NEXT: v_accvgpr_write_b32 a15, v27 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_smfmac_f32_32x32x32_bf16 a[0:15], v[28:31], v[0:7], v10 +; GCN-NEXT: s_nop 7 +; GCN-NEXT: s_nop 2 
+; GCN-NEXT: v_accvgpr_read_b32 v0, a0 +; GCN-NEXT: v_accvgpr_read_b32 v1, a1 +; GCN-NEXT: v_accvgpr_read_b32 v2, a2 +; GCN-NEXT: v_accvgpr_read_b32 v3, a3 +; GCN-NEXT: v_accvgpr_read_b32 v4, a4 +; GCN-NEXT: v_accvgpr_read_b32 v5, a5 +; GCN-NEXT: v_accvgpr_read_b32 v6, a6 +; GCN-NEXT: v_accvgpr_read_b32 v7, a7 +; GCN-NEXT: v_accvgpr_read_b32 v8, a8 +; GCN-NEXT: v_accvgpr_read_b32 v9, a9 +; GCN-NEXT: v_accvgpr_read_b32 v10, a10 +; GCN-NEXT: v_accvgpr_read_b32 v11, a11 +; GCN-NEXT: v_accvgpr_read_b32 v12, a12 +; GCN-NEXT: v_accvgpr_read_b32 v13, a13 +; GCN-NEXT: v_accvgpr_read_b32 v14, a14 +; GCN-NEXT: v_accvgpr_read_b32 v15, a15 +; GCN-NEXT: s_setpc_b64 s[30:31] %result = call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf16(<8 x bfloat> %arg0, <16 x bfloat> %arg1, <16 x float> %arg2, i32 %arg3, i32 immarg 0, i32 immarg 0) ret <16 x float> %result } @@ -4627,5 +4157,3 @@ define <16 x float> @test_smfmac_f32_32x32x64_fp8_fp8__sgpr(<4 x i32> inreg %arg } attributes #0 = { "amdgpu-flat-work-group-size"="1,256" } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll index 0ca96d5a1eb19..fa32ee108d382 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.gfx90a.ll @@ -1,8 +1,8 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX942 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX942 %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 < %s | FileCheck --check-prefixes=GCN,GFX942 %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX942 %s +; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn -mcpu=gfx9-4-generic --amdhsa-code-object-version=6 < %s | FileCheck --check-prefixes=GCN,GFX942 %s ; DPP control value 337 is valid for 64-bit DPP on gfx942 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll index cbc76a32a75e4..7342c366799e9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll @@ -3,9 +3,9 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX1010-SDAG %s ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX1100-SDAG %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 
-verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX802-GISEL %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX1010-GISEL %s -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 -global-isel < %s | FileCheck -check-prefixes=GFX1100-GISEL %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -verify-machineinstrs -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefixes=GFX802-GISEL %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -verify-machineinstrs -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefixes=GFX1010-GISEL %s +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -verify-machineinstrs -amdgpu-enable-vopd=0 -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefixes=GFX1100-GISEL %s declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #0 declare i64 @llvm.amdgcn.writelane.i64(i64, i32, i64) #0 @@ -2128,10 +2128,10 @@ define void @test_writelane_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1 ; GFX802-GISEL: ; %bb.0: ; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX802-GISEL-NEXT: flat_load_ushort v4, v[0:1] -; GFX802-GISEL-NEXT: v_readfirstlane_b32 s5, v3 +; GFX802-GISEL-NEXT: v_readfirstlane_b32 m0, v3 ; GFX802-GISEL-NEXT: v_readfirstlane_b32 s4, v2 -; GFX802-GISEL-NEXT: s_mov_b32 m0, s5 ; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX802-GISEL-NEXT: s_nop 1 ; GFX802-GISEL-NEXT: v_writelane_b32 v4, s4, m0 ; GFX802-GISEL-NEXT: flat_store_short v[0:1], v4 ; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index 5466d315c05a4..2969dd9156ccb 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -2346,6 +2346,20 @@ void GlobalISelEmitter::emitRunCustomAction(raw_ostream &OS) { << "}\n"; } +bool hasBFloatType(const TreePatternNode &Node) { + for 
(unsigned I = 0, E = Node.getNumTypes(); I < E; I++) { + auto Ty = Node.getType(I); + for (auto T : Ty) + if (T.second == MVT::bf16 || + (T.second.isVector() && T.second.getScalarType() == MVT::bf16)) + return true; + } + for (const TreePatternNode &C : Node.children()) + if (hasBFloatType(C)) + return true; + return false; +} + void GlobalISelEmitter::run(raw_ostream &OS) { if (!UseCoverageFile.empty()) { RuleCoverage = CodeGenCoverage(); @@ -2382,6 +2396,13 @@ void GlobalISelEmitter::run(raw_ostream &OS) { if (Pat.getGISelShouldIgnore()) continue; // skip without warning + + // Skip any patterns containing BF16 types, as GISel cannot currently tell + // the difference between fp16 and bf16. FIXME: This can be removed once + // BF16 is supported properly. + if (hasBFloatType(Pat.getSrcPattern())) + continue; + auto MatcherOrErr = runOnPattern(Pat); // The pattern analysis can fail, indicating an unsupported pattern.