diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 962693003349e..acd10af1709ea 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2919,6 +2919,20 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var" // the form: D = A * B + C. // A is sparse matrix, half the size of B, and is expanded using sparsity index. +class AMDGPUSWmmacIntrinsicIdxReuse : + Intrinsic< + [CD], // %D + [ + A, // %A + B, // %B + LLVMMatchType<0>, // %C + Index, // %Sparsity index for A + llvm_i1_ty, // matrix_a_reuse + llvm_i1_ty, // matrix_b_reuse + ], + [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>, ImmArg>] +>; + class AMDGPUSWmmacIntrinsicIdx : Intrinsic< [CD], // %D @@ -3602,6 +3616,161 @@ def int_amdgcn_fdiv_fast : DefaultAttrsIntrinsic< [IntrNoMem, IntrSpeculatable] >; +// WMMA intrinsics. +class AMDGPUWmmaIntrinsicModsAB : + Intrinsic< + [CD], // %D + [ + llvm_i1_ty, // %A_mod: 0 -- none, 1 -- neg + AB, // %A + llvm_i1_ty, // %B_mod: 0 -- none, 1 -- neg + LLVMMatchType<1>, // %B + LLVMMatchType<0>, // %C + llvm_i1_ty, // matrix_a_reuse + llvm_i1_ty, // matrix_b_reuse + ], + [IntrNoMem, IntrConvergent, ImmArg>, ImmArg>, ImmArg>, ImmArg>, + IntrWillReturn, IntrNoCallback, IntrNoFree] +>; + +class AMDGPUWmmaIntrinsicModsC : + Intrinsic< + [CD], // %D + [ + AB, // %A + LLVMMatchType<1>, // %B + llvm_i16_ty, // %C_mod: 0 - none, 1 - neg, 2 - abs, 3 - neg(abs) + LLVMMatchType<0>, // %C + llvm_i1_ty, // matrix_a_reuse + llvm_i1_ty, // matrix_b_reuse + ], + [IntrNoMem, IntrConvergent, ImmArg>, ImmArg>, ImmArg>, + IntrWillReturn, IntrNoCallback, IntrNoFree] +>; + +class AMDGPUWmmaIntrinsicF4ModsC : + Intrinsic< + [CD], // %D + [ + A, // %A + B, // %B + llvm_i16_ty, // %C_mod: 0 - none, 1 - neg, 2 - abs, 3 - neg(abs) + LLVMMatchType<0>, // %C + ], + [IntrNoMem, IntrConvergent, ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree] +>; + +class AMDGPUWmmaIntrinsicModsAll : + Intrinsic< + [CD], // %D + [ + llvm_i1_ty, // %A_mod: 0 -- none, 1 -- neg + AB, // %A + llvm_i1_ty, // %B_mod: 0 -- none, 1 -- neg + LLVMMatchType<1>, // %B + llvm_i16_ty, // %C_mod: 0 -- none, 1 -- neg, 2 -- abs, 3 -- neg(abs) + LLVMMatchType<0>, // %C + ], + [IntrNoMem, IntrConvergent, ImmArg>, ImmArg>, ImmArg>, IntrWillReturn, IntrNoCallback, IntrNoFree] +>; + +class AMDGPUWmmaIntrinsicModsAllReuse : + Intrinsic< + [CD], // %D + [ + llvm_i1_ty, // %A_mod: 0 -- none, 1 -- neg + AB, // %A + llvm_i1_ty, // %B_mod: 0 -- none, 1 -- neg + LLVMMatchType<1>, // %B + llvm_i16_ty, // %C_mod: 0 -- none, 1 -- neg, 2 -- abs, 3 -- neg(abs) + LLVMMatchType<0>, // %C + llvm_i1_ty, // matrix_a_reuse + llvm_i1_ty, // matrix_b_reuse + ], + [IntrNoMem, IntrConvergent, ImmArg>, ImmArg>, ImmArg>, ImmArg>, ImmArg>, + IntrWillReturn, IntrNoCallback, IntrNoFree] +>; + +// D and C are of different types. 
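+// For example, int_amdgcn_wmma_bf16f32_16x16x32_bf16 below returns a bf16 D
+// while taking an f32 C, so the destination type cannot be tied to C with
+// LLVMMatchType<0> the way the other classes in this section do.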
+class AMDGPUWmmaIntrinsicModsAllDiff : + Intrinsic< + [DstTy], // %D + [ + llvm_i1_ty, // %A_mod: 0 -- none, 1 -- neg + AB, // %A + llvm_i1_ty, // %B_mod: 0 -- none, 1 -- neg + LLVMMatchType<1>, // %B + llvm_i16_ty, // %C_mod: 0 -- none, 1 -- neg, 2 -- abs, 3 -- neg(abs) + C, // %C + llvm_i1_ty, // matrix_a_reuse + llvm_i1_ty, // matrix_b_reuse + ], + [IntrNoMem, IntrConvergent, ImmArg>, ImmArg>, ImmArg>, ImmArg>, ImmArg>, + IntrWillReturn, IntrNoCallback, IntrNoFree] +>; + +defset list AMDGPUWMMAIntrinsicsGFX1250 = { +def int_amdgcn_wmma_f64_16x16x4_f64 : AMDGPUWmmaIntrinsicModsAll; +def int_amdgcn_wmma_f32_16x16x4_f32 : AMDGPUWmmaIntrinsicModsAllReuse; +def int_amdgcn_wmma_f32_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsAllReuse; +def int_amdgcn_wmma_f32_16x16x32_f16 : AMDGPUWmmaIntrinsicModsAllReuse; +def int_amdgcn_wmma_f16_16x16x32_f16 : AMDGPUWmmaIntrinsicModsAllReuse; +def int_amdgcn_wmma_bf16_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsAllReuse; +def int_amdgcn_wmma_bf16f32_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsAllDiff; +def int_amdgcn_wmma_f32_16x16x64_fp8_fp8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f32_16x16x64_fp8_bf8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f32_16x16x64_bf8_fp8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f32_16x16x64_bf8_bf8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f16_16x16x64_fp8_fp8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f16_16x16x64_fp8_bf8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f16_16x16x64_bf8_fp8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f16_16x16x64_bf8_bf8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f16_16x16x128_fp8_fp8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f16_16x16x128_fp8_bf8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f16_16x16x128_bf8_fp8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f16_16x16x128_bf8_bf8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f32_16x16x128_fp8_fp8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f32_16x16x128_fp8_bf8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f32_16x16x128_bf8_fp8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_f32_16x16x128_bf8_bf8 : AMDGPUWmmaIntrinsicModsC; +def int_amdgcn_wmma_i32_16x16x64_iu8 : AMDGPUWmmaIntrinsicModsAB; +def int_amdgcn_wmma_f32_32x16x128_f4 : AMDGPUWmmaIntrinsicF4ModsC; +} + +class AMDGPUSWmmacIntrinsicABIdx : + Intrinsic< + [CD], // %D + [ + llvm_i1_ty, // %A_mod: 0 - none, 1 - neg + A, // %A + llvm_i1_ty, // %B_mod: 0 - none, 1 - neg + B, // %B + LLVMMatchType<0>, // %C + Index, // %Sparsity index for A + llvm_i1_ty, // matrix_a_reuse + llvm_i1_ty, // matrix_b_reuse + ], + [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>, ImmArg>, ImmArg>, ImmArg>] +>; + +defset list AMDGPUSWMMACIntrinsicsGFX1250 = { +def int_amdgcn_swmmac_f32_16x16x64_f16 : AMDGPUSWmmacIntrinsicABIdx; +def int_amdgcn_swmmac_f32_16x16x64_bf16 : AMDGPUSWmmacIntrinsicABIdx; +def int_amdgcn_swmmac_f16_16x16x64_f16 : AMDGPUSWmmacIntrinsicABIdx; +def int_amdgcn_swmmac_bf16_16x16x64_bf16 : AMDGPUSWmmacIntrinsicABIdx; +def int_amdgcn_swmmac_bf16f32_16x16x64_bf16 : AMDGPUSWmmacIntrinsicABIdx; +def int_amdgcn_swmmac_f32_16x16x128_fp8_fp8 : AMDGPUSWmmacIntrinsicIdxReuse; +def int_amdgcn_swmmac_f32_16x16x128_fp8_bf8 : AMDGPUSWmmacIntrinsicIdxReuse; +def int_amdgcn_swmmac_f32_16x16x128_bf8_fp8 : AMDGPUSWmmacIntrinsicIdxReuse; +def int_amdgcn_swmmac_f32_16x16x128_bf8_bf8 : AMDGPUSWmmacIntrinsicIdxReuse; +def int_amdgcn_swmmac_f16_16x16x128_fp8_fp8 : AMDGPUSWmmacIntrinsicIdxReuse; +def int_amdgcn_swmmac_f16_16x16x128_fp8_bf8 : 
AMDGPUSWmmacIntrinsicIdxReuse; +def int_amdgcn_swmmac_f16_16x16x128_bf8_fp8 : AMDGPUSWmmacIntrinsicIdxReuse; +def int_amdgcn_swmmac_f16_16x16x128_bf8_bf8 : AMDGPUSWmmacIntrinsicIdxReuse; +def int_amdgcn_swmmac_i32_16x16x128_iu8 : AMDGPUSWmmacIntrinsicABIdx; +} + + class AMDGPUTensorLoadStore: Intrinsic< [], diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 1b909568fc555..7b5d4077e85f3 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -55,6 +55,14 @@ def gi_vop3pmodsneg : GIComplexOperandMatcher<s32, "selectVOP3PModsNeg">, GIComplexPatternEquiv<VOP3PModsNeg>; +def gi_vop3pmodsnegs : + GIComplexOperandMatcher<s32, "selectVOP3PModsNegs">, + GIComplexPatternEquiv<VOP3PModsNegs>; + +def gi_dotiuvop3pmodsnegabs : + GIComplexOperandMatcher<s32, "selectVOP3PModsNegAbs">, + GIComplexPatternEquiv<VOP3PModsNegAbs>; + def gi_wmmaopselvop3pmods : GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">, GIComplexPatternEquiv<WMMAOpSelVOP3PMods>; @@ -83,6 +91,10 @@ def gi_swmmacindex16 : GIComplexOperandMatcher<s32, "selectSWMMACIndex16">, GIComplexPatternEquiv<SWMMACIndex16>; +def gi_swmmacindex32 : + GIComplexOperandMatcher<s64, "selectSWMMACIndex32">, + GIComplexPatternEquiv<SWMMACIndex32>; + def gi_vop3opselmods : GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">, GIComplexPatternEquiv<VOP3OpSelMods>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 620eac428c084..25672a52345cb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -3273,6 +3273,7 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsDOT(SDValue In, SDValue &Src, return SelectVOP3PMods(In, Src, SrcMods, true); } +// Select neg_lo from the i1 immediate operand. bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const { const ConstantSDNode *C = cast<ConstantSDNode>(In); // Literal i1 value set in intrinsic, represents SrcMods for the next operand. @@ -3288,6 +3289,47 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PModsNeg(SDValue In, SDValue &Src) const { return true; } +// Select both neg_lo and neg_hi from the i1 immediate operand. This is +// specifically for F16/BF16 operands in WMMA instructions, where neg_lo applies +// to the matrix's even k elements, and neg_hi applies to the matrix's odd k elements. +bool AMDGPUDAGToDAGISel::SelectVOP3PModsNegs(SDValue In, SDValue &Src) const { + const ConstantSDNode *C = cast<ConstantSDNode>(In); + // Literal i1 value set in intrinsic, represents SrcMods for the next operand. + // 1 negates the packed values, 0 leaves them unchanged. + assert(C->getAPIntValue().getBitWidth() == 1 && "expected i1 value"); + + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned SrcSign = C->getZExtValue(); + if (SrcSign == 1) + Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); + + Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +// Select neg, abs, or both neg and abs from the i16 immediate operands. +bool AMDGPUDAGToDAGISel::SelectVOP3PModsNegAbs(SDValue In, SDValue &Src) const { + const ConstantSDNode *C = cast<ConstantSDNode>(In); + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned SrcMod = C->getZExtValue(); + switch (SrcMod) { + default: // Any other value will be silently ignored (considered as 0).
+ break; + case 1: + Mods ^= SISrcMods::NEG; + break; + case 2: + Mods ^= SISrcMods::ABS; + break; + case 3: + Mods ^= (SISrcMods::NEG | SISrcMods::ABS); + break; + } + + Src = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const { const ConstantSDNode *C = cast(In); @@ -3639,6 +3681,41 @@ bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src, return true; } +bool AMDGPUDAGToDAGISel::SelectSWMMACIndex32(SDValue In, SDValue &Src, + SDValue &IndexKey) const { + unsigned Key = 0; + Src = In; + + SDValue InI32; + + if (In.getOpcode() == ISD::ANY_EXTEND || In.getOpcode() == ISD::ZERO_EXTEND) { + const SDValue &ExtendSrc = In.getOperand(0); + if (ExtendSrc.getValueSizeInBits() == 32) + InI32 = ExtendSrc; + } else if (In->getOpcode() == ISD::BITCAST) { + const SDValue &CastSrc = In.getOperand(0); + if (CastSrc.getOpcode() == ISD::BUILD_VECTOR && + CastSrc.getOperand(0).getValueSizeInBits() == 32) { + ConstantSDNode *Zero = dyn_cast(CastSrc.getOperand(1)); + if (Zero && Zero->getZExtValue() == 0) + InI32 = CastSrc.getOperand(0); + } + } + + if (InI32 && InI32.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { + const SDValue &ExtractVecEltSrc = InI32.getOperand(0); + ConstantSDNode *EltIdx = dyn_cast(InI32.getOperand(1)); + if (ExtractVecEltSrc.getValueSizeInBits() == 64 && EltIdx && + EltIdx->getZExtValue() == 1) { + Key = 1; + Src = ExtractVecEltSrc; + } + } + + IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32); + return true; +} + bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const { Src = In; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index f3b9364fdb92b..9967f46e085e4 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -222,6 +222,8 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool SelectVOP3PModsDOT(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3PModsNeg(SDValue In, SDValue &Src) const; + bool SelectVOP3PModsNegs(SDValue In, SDValue &Src) const; + bool SelectVOP3PModsNegAbs(SDValue In, SDValue &Src) const; bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const; bool SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src, @@ -233,6 +235,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool SelectSWMMACIndex8(SDValue In, SDValue &Src, SDValue &IndexKey) const; bool SelectSWMMACIndex16(SDValue In, SDValue &Src, SDValue &IndexKey) const; + bool SelectSWMMACIndex32(SDValue In, SDValue &Src, SDValue &IndexKey) const; bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index ea79c57080faa..1a63c48e3666c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3513,6 +3513,25 @@ static Register matchZeroExtendFromS32(MachineRegisterInfo &MRI, Register Reg) { return Register(); } +Register AMDGPUInstructionSelector::matchAnyExtendFromS32(Register Reg) const { + Register AnyExtSrc; + if (mi_match(Reg, *MRI, m_GAnyExt(m_Reg(AnyExtSrc)))) + return MRI->getType(AnyExtSrc) == LLT::scalar(32) ? 
AnyExtSrc : Register(); + + // Match legalized form %zext = G_MERGE_VALUES (s32 %x), (s32 G_IMPLICIT_DEF) + const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI); + if (Def->getOpcode() != AMDGPU::G_MERGE_VALUES) + return Register(); + + assert(Def->getNumOperands() == 3 && + MRI->getType(Def->getOperand(0).getReg()) == LLT::scalar(64)); + + if (mi_match(Def->getOperand(2).getReg(), *MRI, m_GImplicitDef())) + return Def->getOperand(1).getReg(); + + return Register(); +} + bool AMDGPUInstructionSelector::selectGlobalLoadLds(MachineInstr &MI) const{ if (!Subtarget->hasVMemToLDSLoad()) return false; @@ -4904,6 +4923,7 @@ AMDGPUInstructionSelector::selectVOP3PModsDOT(MachineOperand &Root) const { return selectVOP3PRetHelper(Root, true); } +// Select neg_lo from the i1 immediate operand. InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const { // Literal i1 value set in intrinsic, represents SrcMods for the next operand. @@ -4919,6 +4939,50 @@ AMDGPUInstructionSelector::selectVOP3PModsNeg(MachineOperand &Root) const { }}; } +// Select both neg_lo and neg_hi from the i1 immediate operand. This is +// specifically for F16/BF16 operands in WMMA instructions, where neg_lo applies +// to the matrix's even k elements, and neg_hi applies to the matrix's odd k elements. +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3PModsNegs(MachineOperand &Root) const { + // Literal i1 value set in intrinsic, represents SrcMods for the next operand. + // Value is in Imm operand as i1 sign extended to int64_t. + // 1 (stored as -1) negates the packed values, 0 leaves them unchanged. + assert((Root.isImm() && (Root.getImm() == -1 || Root.getImm() == 0)) && + "expected i1 value"); + unsigned Mods = SISrcMods::OP_SEL_1; + if (Root.getImm() == -1) + Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + +// Select neg, abs, or both neg and abs from the i16 immediate operands. +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3PModsNegAbs(MachineOperand &Root) const { + + assert(Root.isImm() && "Modifier for C must be an immediate"); + + unsigned Mods = SISrcMods::OP_SEL_1; + switch (Root.getImm()) { + default: // Any other value will be silently ignored (considered as 0).
+ break; + case 1: + Mods ^= SISrcMods::NEG; + break; + case 2: + Mods ^= SISrcMods::ABS; + break; + case 3: + Mods ^= (SISrcMods::NEG | SISrcMods::ABS); + break; + } + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods( MachineOperand &Root) const { @@ -5149,6 +5213,35 @@ AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const { }}; } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSWMMACIndex32(MachineOperand &Root) const { + Register Src = + getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg(); + unsigned Key = 0; + + Register S32 = matchZeroExtendFromS32(*MRI, Src); + if (!S32) + S32 = matchAnyExtendFromS32(Src); + + if (S32) { + const MachineInstr *Def = getDefIgnoringCopies(S32, *MRI); + if (Def->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) { + assert(Def->getNumOperands() == 3); + Register DstReg1 = Def->getOperand(1).getReg(); + if (mi_match(S32, *MRI, + m_any_of(m_SpecificReg(DstReg1), m_Copy(m_Reg(DstReg1))))) { + Src = Def->getOperand(2).getReg(); + Key = 1; + } + } + } + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key + }}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { Register Src; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 8e9e573147a86..2cb7904d27ccc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -201,6 +201,10 @@ class AMDGPUInstructionSelector final : public InstructionSelector { InstructionSelector::ComplexRendererFns selectVOP3PModsNeg(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3PModsNegs(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3PModsNegAbs(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectWMMAOpSelVOP3PMods(MachineOperand &Root) const; @@ -217,6 +221,8 @@ class AMDGPUInstructionSelector final : public InstructionSelector { selectSWMMACIndex8(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectSWMMACIndex16(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectSWMMACIndex32(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectVOP3OpSelMods(MachineOperand &Root) const; @@ -411,6 +417,9 @@ class AMDGPUInstructionSelector final : public InstructionSelector { // shift amount operand's `ShAmtBits` bits is unneeded. bool isUnneededShiftMask(const MachineInstr &MI, unsigned ShAmtBits) const; + /// Match an any extend from a 32-bit value to 64-bit. 
+ Register matchAnyExtendFromS32(Register Reg) const; + const SIInstrInfo &TII; const SIRegisterInfo &TRI; const AMDGPURegisterBankInfo &RBI; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index aa678df675fb6..eb0d8b9d5b958 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -7622,6 +7622,20 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::amdgcn_image_bvh_dual_intersect_ray: case Intrinsic::amdgcn_image_bvh8_intersect_ray: return legalizeBVHDualOrBVH8IntersectRayIntrinsic(MI, B); + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: { + Register Index = MI.getOperand(5).getReg(); + LLT S64 = LLT::scalar(64); + if (MRI.getType(Index) != S64) + MI.getOperand(5).setReg(B.buildAnyExt(S64, Index).getReg(0)); + return true; + } case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: @@ -7636,15 +7650,24 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0)); return true; } + case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: { Register Index = MI.getOperand(7).getReg(); - LLT S32 = LLT::scalar(32); - if (MRI.getType(Index) != S32) - MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0)); + LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8 + ? 
LLT::scalar(64) + : LLT::scalar(32); + if (MRI.getType(Index) != IdxTy) + MI.getOperand(7).setReg(B.buildAnyExt(IdxTy, Index).getReg(0)); return true; } + case Intrinsic::amdgcn_fmed3: { GISelChangeObserver &Observer = Helper.Observer; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 965053ffe8624..3cc4f3ace9686 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4689,6 +4689,45 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: + case Intrinsic::amdgcn_wmma_f64_16x16x4_f64: + case Intrinsic::amdgcn_wmma_f32_16x16x4_f32: + case Intrinsic::amdgcn_wmma_f32_16x16x32_bf16: + case Intrinsic::amdgcn_wmma_f32_16x16x32_f16: + case Intrinsic::amdgcn_wmma_f16_16x16x32_f16: + case Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16: + case Intrinsic::amdgcn_wmma_bf16f32_16x16x32_bf16: + case Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_bf8: + case Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_bf8: + case Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_fp8: + case Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_bf8: + case Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_fp8: + case Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_bf8: + case Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_wmma_i32_16x16x64_iu8: + case Intrinsic::amdgcn_wmma_f32_32x16x128_f4: + case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: return getDefaultMappingVOP(MI); case Intrinsic::amdgcn_log: case Intrinsic::amdgcn_exp2: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 1f6002a3c6a20..dfe0cbf18c476 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -341,6 +341,10 @@ foreach intr = AMDGPUWMMAIntrinsicsGFX11 in def : SourceOfDivergence; foreach intr = AMDGPUWMMAIntrinsicsGFX12 in def : SourceOfDivergence; +foreach intr = AMDGPUWMMAIntrinsicsGFX1250 in +def : SourceOfDivergence; +foreach intr = AMDGPUSWMMACIntrinsicsGFX1250 in +def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; diff --git 
a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 6cf2055c8e565..c648711f330ee 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9315,6 +9315,44 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), IndexKeyi32); } + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: { + if (Op.getOperand(4).getValueType() == MVT::i64) + return SDValue(); + + SDLoc SL(Op); + auto IndexKeyi64 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), + {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), + Op.getOperand(3), IndexKeyi64, Op.getOperand(5), + Op.getOperand(6)}); + } + case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16: + case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: { + EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8 + ? MVT::i64 + : MVT::i32; + if (Op.getOperand(6).getValueType() == IndexKeyTy) + return SDValue(); + + SDLoc SL(Op); + auto IndexKey = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), + {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), + Op.getOperand(3), Op.getOperand(4), Op.getOperand(5), + IndexKey, Op.getOperand(7), + Op.getOperand(8)}); // No clamp operand + } case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 391cc968e9d8b..9ea5c75606f9c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1637,6 +1637,8 @@ def VOP3PMods : ComplexPattern; def VOP3PModsDOT : ComplexPattern; def VOP3PModsNeg : ComplexPattern; +def VOP3PModsNegs : ComplexPattern;
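+// Hedged note (not from the original patch): the i16 C modifier consumed by
+// SelectVOP3PModsNegAbs encodes 0 = none, 1 = neg, 2 = abs, 3 = neg(abs); for
+// example, passing i16 3 as %C_mod selects both NEG and ABS (ABS shares its
+// encoding with NEG_HI) on the accumulator operand.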
+def VOP3PModsNegAbs : ComplexPattern; def WMMAOpSelVOP3PMods : ComplexPattern; def WMMAModsF32NegAbs : ComplexPattern; diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index 1679cee320067..ef8faffa5f557 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -66,6 +66,13 @@ def Write4PassDGEMM : SchedWrite; def Write8PassDGEMM : SchedWrite; def Write16PassDGEMM : SchedWrite; +// WMMA/SWMMA instructions +def WriteXDL2PassWMMA : SchedWrite; +def WriteXDL4PassWMMA : SchedWrite; +def Write4PassWMMA : SchedWrite; +def Write8PassWMMA : SchedWrite; +def Write16PassWMMA : SchedWrite; + // Scalar float instructions def WriteSFPU : SchedWrite; @@ -459,6 +466,15 @@ def : InstRW<[WriteCopy], (instrs COPY)>; multiclass GFX125xCommonWriteRes { +let ReleaseAtCycles = [8] in +def : HWWriteRes; +let ReleaseAtCycles = [16] in +def : HWWriteRes; + +def : HWWriteRes; +def : HWWriteRes; +def : HWWriteRes; + def : HWWriteRes; def : HWWriteRes; def : HWWriteRes; @@ -476,6 +492,11 @@ def : HWWriteRes; def : HWWriteRes; def : InstRW<[WriteCopy], (instrs COPY)>; + +def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(FP8|BF8|BF16|F16)_w32")>; +def : InstRW<[WriteXDL4PassWMMA], (instregex "^V_[S]*WMMA[C]*_.*_(IU8|IU4)_w32")>; +def : InstRW<[Write4PassWMMA], (instregex "^V_WMMA_F32_16X16X4_F32_w32")>; +def : InstRW<[WriteXDL2PassWMMA], (instregex "^V_WMMA.*_F32_32X16X128_F4")>; } // End GFX125xCommonWriteRes let SchedModel = GFX1250SpeedModel in { diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 41878a71ad3e6..e51e9574f8de0 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1457,26 +1457,26 @@ class VOP3PWMMA_Profile ArgTy, bit _IsSWMMAC, int _IndexType, bit IsAB_F16_IMod0 = !and(IsAB_F16, !not(HasIModOp)); bit IsAB_F32F64_IMod1 = !and(!or(IsAB_F64, IsAB_F32), HasIModOp); bit IsAB_F16BF16_IMod1 = !and(!or(IsAB_F16, IsAB_BF16), HasIModOp); - dag Src0InPat = !cond(IsAB_F32F64_IMod1 : (ins Src0VT:$src0), - IsAB_F16BF16_IMod1 : (ins Src0VT:$src0), + dag Src0InPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0), + IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs i32:$src0_modifiers), Src0VT:$src0), IsAB_F16_IMod0 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))), IsAB_BF16_IMod0 : (ins Src0VT:$src0), IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0), NoABMods : (ins Src0VT:$src0)); - dag Src0OutPat = !cond(IsAB_F32F64_IMod1 : (ins Src0VT:$src0), - IsAB_F16BF16_IMod1 : (ins Src0VT:$src0), + dag Src0OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0), + IsAB_F16BF16_IMod1 : (ins i32:$src0_modifiers, Src0VT:$src0), IsAB_F16_IMod0 : (ins i32:$src0_modifiers, Src0VT:$src0), IsAB_BF16_IMod0 : (ins (i32 8), Src0VT:$src0), IsIU : (ins i32:$src0_modifiers, Src0VT:$src0), NoABMods : (ins Src0VT:$src0)); - dag Src1InPat = !cond(IsAB_F32F64_IMod1 : (ins Src1VT:$src1), - IsAB_F16BF16_IMod1 : (ins Src1VT:$src1), + dag Src1InPat = !cond(IsAB_F32F64_IMod1 : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1), + IsAB_F16BF16_IMod1 : (ins (VOP3PModsNegs i32:$src1_modifiers), Src1VT:$src1), IsAB_F16_IMod0 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))), IsAB_BF16_IMod0 : (ins Src1VT:$src1), IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1), NoABMods : (ins Src1VT:$src1)); - dag Src1OutPat = !cond(IsAB_F32F64_IMod1 : 
(ins Src1VT:$src1), - IsAB_F16BF16_IMod1 : (ins Src1VT:$src1), + dag Src1OutPat = !cond(IsAB_F32F64_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1), + IsAB_F16BF16_IMod1 : (ins i32:$src1_modifiers, Src1VT:$src1), IsAB_F16_IMod0 : (ins i32:$src1_modifiers, Src1VT:$src1), IsAB_BF16_IMod0 : (ins (i32 8), Src1VT:$src1), IsIU : (ins i32:$src1_modifiers, Src1VT:$src1), @@ -1486,13 +1486,13 @@ class VOP3PWMMA_Profile ArgTy, bit _IsSWMMAC, int _IndexType, bit IsC_F16_IMod0 = !and(IsC_F16, !not(HasIModOp)); bit IsC_BF16_IMod0 = !and(IsC_BF16, !not(HasIModOp)); bit IsIUXF32 = !or(IsIU, IsXF32); - dag Src2InPatWmma = !cond(IsC_IMod1 : (ins Src2VT:$src2), + dag Src2InPatWmma = !cond(IsC_IMod1 : (ins (VOP3PModsNegAbs i32:$src2_modifiers), Src2VT:$src2), IsC_F32_IMod0 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))), IsC_F16_IMod0 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))), IsC_BF16_IMod0 : (ins Src2VT:$src2), IsIUXF32 : (ins Src2VT:$src2), IsSWMMAC : (ins)); - dag Src2OutPatWmma = !cond(IsC_IMod1 : (ins Src2VT:$src2), + dag Src2OutPatWmma = !cond(IsC_IMod1 : (ins i32:$src2_modifiers, Src2VT:$src2), IsC_F32_IMod0 : (ins i32:$src2_modifiers, Src2VT:$src2), IsC_F16_IMod0 : (ins i32:$src2_modifiers, Src2VT:$src2), IsC_BF16_IMod0 : (ins (i32 8), Src2VT:$src2), @@ -1508,20 +1508,22 @@ class VOP3PWMMA_Profile ArgTy, bit _IsSWMMAC, int _IndexType, !eq(IndexType, 8) : (ins i32:$src2, i32:$index_key_8bit), !eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit), !eq(IndexType, 32): (ins i64:$src2, i32:$index_key_32bit)); - dag Src2InlineInPat = (ins (Src2VT (WMMAVISrc Src2VT:$src2))); - dag Src2InlineOutPat = !con(!if(IsIU, (ins), (ins (i32 8))), (ins Src2VT:$src2)); + dag Src2InlineInPat = !con(!if(IsC_IMod1, (ins (VOP3PModsNegAbs i32:$src2_modifiers)), (ins)), (ins (Src2VT (WMMAVISrc Src2VT:$src2)))); + dag Src2InlineOutPat = !con(!if(IsIUXF32, (ins), !if(IsC_IMod1, (ins i32:$src2_modifiers), (ins (i32 8)))), (ins Src2VT:$src2)); + dag MatrixReuseInPat = !if(HasMatrixReuse, (ins timm:$matrix_a_reuse, timm:$matrix_b_reuse), (ins)); + dag MatrixReuseOutModPat = !if(HasMatrixReuse, (ins i1:$matrix_a_reuse, i1:$matrix_b_reuse), (ins)); - dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, ClampPat); - dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, ClampPat); + dag WmmaInPat = !con(Src0InPat, Src1InPat, Src2InPatWmma, MatrixReuseInPat, ClampPat); + dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, MatrixReuseOutModPat, ClampPat); - dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, ClampPat); - dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, ClampPat); + dag SwmmacInPat = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, MatrixReuseInPat, ClampPat); + dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, MatrixReuseOutModPat, ClampPat); // wmma pattern where src2 is inline imm uses _threeaddr pseudo, // can't use _twoaddr since it would violate src2 tied to vdst constraint. 
- dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, ClampPat); - dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, ClampPat); + dag WmmaInlineInPat = !con(Src0InPat, Src1InPat, Src2InlineInPat, MatrixReuseInPat, ClampPat); + dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, MatrixReuseOutModPat, ClampPat); } def WMMAInstInfoTable : GenericTable { @@ -1632,7 +1634,7 @@ def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1, def F64_F64X4_WMMA_w32 : VOP3PWMMA_Profile<[v8f64, v2f64, v2f64, v8f64], 0, 0, 0, 0, 1>; def F32_F32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2f32, v2f32, v8f32], 0, 0, 0, 0, 1, 1>; -def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 0, 1>; +def F32_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v16bf16, v8f32], 0, 0, 0, 0, 1, 1>; def F32_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v16f16, v8f32], 0, 0, 0, 0, 1, 1>; def F16_F16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v16f16, v8f16], 0, 0, 0, 0, 1, 1>; def BF16_BF16X32_WMMA_w32 : VOP3PWMMA_Profile<[v8bf16, v16bf16, v16bf16, v8bf16], 0, 0, 0, 0, 1, 1>; @@ -1772,7 +1774,7 @@ class SWMMACPat_w64; defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w32", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w32>; defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w32", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w32,1>; @@ -1799,7 +1801,7 @@ let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12Plus in { def : SWMMACPat; } -let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12Plus in { +let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12PlusNot12_50 in { defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w64", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w64>; defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w64", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w64>; defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w64", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w64,1>; @@ -1825,6 +1827,49 @@ let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12Plus in { def : SWMMACPat; } +let WaveSizePredicate = isWave32 in { +let SubtargetPredicate = isGFX125xOnly in { + defm : WMMAPat<"V_WMMA_F32_16X16X4_F32_w32", int_amdgcn_wmma_f32_16x16x4_f32, F32_F32_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X32_BF16_w32", int_amdgcn_wmma_f32_16x16x32_bf16, F32_BF16X32_WMMA_w32>; + defm : WMMAPat<"V_WMMA_BF16_16X16X32_BF16_w32", int_amdgcn_wmma_bf16_16x16x32_bf16, BF16_BF16X32_WMMA_w32>; + defm : WMMAPat<"V_WMMA_BF16F32_16X16X32_BF16_w32", int_amdgcn_wmma_bf16f32_16x16x32_bf16, BF16F32_BF16_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X64_FP8_FP8_w32", int_amdgcn_wmma_f32_16x16x64_fp8_fp8, F32_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X64_FP8_BF8_w32", int_amdgcn_wmma_f32_16x16x64_fp8_bf8, F32_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X64_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x64_bf8_fp8, F32_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X64_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x64_bf8_bf8, F32_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X64_FP8_FP8_w32", int_amdgcn_wmma_f16_16x16x64_fp8_fp8, F16_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X64_FP8_BF8_w32", int_amdgcn_wmma_f16_16x16x64_fp8_bf8, F16_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X64_BF8_FP8_w32", int_amdgcn_wmma_f16_16x16x64_bf8_fp8, F16_FP8BF8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X64_BF8_BF8_w32", int_amdgcn_wmma_f16_16x16x64_bf8_bf8, F16_FP8BF8X64_WMMA_w32>; + defm : 
WMMAPat<"V_WMMA_I32_16X16X64_IU8_w32", int_amdgcn_wmma_i32_16x16x64_iu8, I32_IU8X64_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X32_F16_w32", int_amdgcn_wmma_f32_16x16x32_f16, F32_F16X32_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X32_F16_w32", int_amdgcn_wmma_f16_16x16x32_f16, F16_F16X32_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X128_FP8_FP8_w32", int_amdgcn_wmma_f16_16x16x128_fp8_fp8, F16_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X128_FP8_BF8_w32", int_amdgcn_wmma_f16_16x16x128_fp8_bf8, F16_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X128_BF8_FP8_w32", int_amdgcn_wmma_f16_16x16x128_bf8_fp8, F16_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F16_16X16X128_BF8_BF8_w32", int_amdgcn_wmma_f16_16x16x128_bf8_bf8, F16_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X128_FP8_FP8_w32", int_amdgcn_wmma_f32_16x16x128_fp8_fp8, F32_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X128_FP8_BF8_w32", int_amdgcn_wmma_f32_16x16x128_fp8_bf8, F32_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_fp8, F32_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_bf8, F32_FP8BF8X128_WMMA_w32>; + defm : WMMAPat<"V_WMMA_F32_32X16X128_F4_w32", int_amdgcn_wmma_f32_32x16x128_f4, F32_32X16X128_F4_WMMA_w32>; + + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; + def : SWMMACPat; +} // End SubtargetPredicate = isGFX125xOnly +} // End WaveSizePredicate = isWave32 //===----------------------------------------------------------------------===// // Begin Real Encodings diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index bd7464577b7db..b5b24b75ceba2 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -197,6 +197,216 @@ bb: ret void } +; CHECK: DIVERGENT: %tmp0 = call <8 x double> @llvm.amdgcn.wmma.f64.16x16x4.f64.v8f64.v2f64(i1 false, <2 x double> %A, i1 false, <2 x double> %B, i16 0, <8 x double> %C) +define amdgpu_kernel void @wmma_f64_16x16x4_f64(<2 x double> %A, <2 x double> %B, <8 x double> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x double> @llvm.amdgcn.wmma.f64.16x16x4.f64.v8f64.v2f64(i1 0, <2 x double> %A, i1 0, <2 x double> %B, i16 0, <8 x double> %C) + store <8 x double> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 false, <2 x float> %A, i1 false, <2 x float> %B, i16 0, <8 x float> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 false, <16 x bfloat> %A, i1 false, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> %A, i1 false, <16 x half> %B, i16 0, <8 x float> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_f32_16x16x32_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 false, <16 x half> %A, i1 false, <16 x half> %B, i16 0, <8 x half> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_f16_16x16x32_f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> %C, i1 false, i1 false) + store <8 x half> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 false, <16 x bfloat> %A, i1 false, <16 x bfloat> %B, i16 0, <8 x bfloat> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_bf16_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> %C, i1 false, i1 false) + store <8 x bfloat> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16.v8f32(i1 false, <16 x bfloat> %A, i1 false, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_bf16f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x bfloat> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_f32_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_f32_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> 
%B, i16 0, <8 x float> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_f32_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_f32_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_f16_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 false) + store <8 x half> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_f16_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 false) + store <8 x half> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_f16_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 false) + store <8 x half> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_f16_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 false) + store <8 x half> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 false, <8 x i32> %A, i1 false, <8 x i32> %B, <8 x i32> %C, i1 false, i1 false) +define amdgpu_kernel void @wmma_i32_16x16x64_iu8(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { + %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> %C, i1 false, i1 false) + store <8 x i32> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1 false, <16 x half> %A, i1 false, <32 x half> %B, <8 x float>
%C, i16 %Index, i1 false, i1 false) +define amdgpu_ps void @swmmac_f32_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1 0, <16 x half> %A, i1 0, <32 x half> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) + store <8 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 false, <16 x bfloat> %A, i1 false, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) +define amdgpu_ps void @swmmac_f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) + store <8 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1 false, <16 x half> %A, i1 false, <32 x half> %B, <8 x half> %C, i16 %Index, i1 false, i1 false) +define amdgpu_ps void @swmmac_f16_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { + %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1 0, <16 x half> %A, i1 0, <32 x half> %B, <8 x half> %C, i16 %Index, i1 false, i1 false) + store <8 x half> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1 false, <16 x bfloat> %A, i1 false, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, i1 false, i1 false) +define amdgpu_ps void @swmmac_bf16_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, ptr addrspace(1) %out) { + %tmp0 = call <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, i1 false, i1 false) + store <8 x bfloat> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 false, <16 x bfloat> %A, i1 false, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) +define amdgpu_ps void @swmmac_bf16f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) + store <8 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8.v8f32.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) +define amdgpu_ps void @swmmac_f32_16x16x128_fp8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8.v8f32.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i16 %Index, i1 false, i1 false) + store <8 x float> %tmp0, ptr addrspace(1) %out + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8.v8f32.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i16 %Index, 
i1 false, i1 false)
+define amdgpu_ps void @swmmac_f32_16x16x128_fp8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+  %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8.v8f32.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i16 %Index, i1 false, i1 false)
+  store <8 x float> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8.v8f32.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i16 %Index, i1 false, i1 false)
+define amdgpu_ps void @swmmac_f32_16x16x128_bf8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+  %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8.v8f32.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i16 %Index, i1 false, i1 false)
+  store <8 x float> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8.v8f32.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i16 %Index, i1 false, i1 false)
+define amdgpu_ps void @swmmac_f32_16x16x128_bf8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+  %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8.v8f32.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i16 %Index, i1 false, i1 false)
+  store <8 x float> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 false)
+define amdgpu_ps void @swmmac_f16_16x16x128_fp8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
+  %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 false)
+  store <8 x half> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 false)
+define amdgpu_ps void @swmmac_f16_16x16x128_fp8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
+  %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 false)
+  store <8 x half> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 false)
+define amdgpu_ps void @swmmac_f16_16x16x128_bf8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
+  %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 false)
+  store <8 x half> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 false)
+define amdgpu_ps void @swmmac_f16_16x16x128_bf8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
+  %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 false)
+  store <8 x half> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
+; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i16(i1 false, <8 x i32> %A, i1 false, <16 x i32> %B, <8 x i32> %C, i16 %Index, i1 false, i1 false)
+define amdgpu_ps void @swmmac_i32_16x16x128_iu8(<8 x i32> %A, <16 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+  %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i16(i1 0, <8 x i32> %A, i1 0, <16 x i32> %B, <8 x i32> %C, i16 %Index, i1 false, i1 false)
+  store <8 x i32> %tmp0, ptr addrspace(1) %out
+  ret void
+}
+
 ; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1) %addr)
 define amdgpu_kernel void @global_load_tr_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) {
 bb:
@@ -618,6 +828,36 @@ declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8(<2 x i32>, <4 x i32
 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8(<2 x i32>, <4 x i32>, <8 x float>, i16)
 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8(<2 x i32>, <4 x i32>, <8 x float>, i16)
 declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x double> @llvm.amdgcn.wmma.f64.16x16x4.f64.v8f64.v2f64(i1, <2 x double>, i1, <2 x double>, i16, <8 x double>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1, <2 x float>, i1, <2 x float>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1)
+declare <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x bfloat>, i1, i1)
+declare <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x float>, i16, i1, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x half>, i16, i1, i1)
+declare <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x bfloat>, i16, i1, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8.v8f32.v8i32.v16i32.i16(<8 x i32>, <16 x i32>, <8 x float>, i16, i1, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8.v8f32.v8i32.v16i32.i16(<8 x i32>, <16 x i32>, <8 x float>, i16, i1, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8.v8f32.v8i32.v16i32.i16(<8 x i32>, <16 x i32>, <8 x float>, i16, i1, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8.v8f32.v8i32.v16i32.i16(<8 x i32>, <16 x i32>, <8 x float>, i16, i1, i1)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.i16(<8 x i32>, <16 x i32>, <8 x half>, i16, i1, i1)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i16(<8 x i32>, <16 x i32>, <8 x half>, i16, i1, i1)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.i16(<8 x i32>, <16 x i32>, <8 x half>, i16, i1, i1)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i16(<8 x i32>, <16 x i32>, <8 x half>, i16, i1, i1)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i16(i1 immarg, <8 x i32>, i1 immarg, <16 x i32>, <8 x i32>, i16 %Index, i1, i1)
 
 declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1))
 declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1))
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
new file mode 100644
index 0000000000000..2f5ff90c9274f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
@@ -0,0 +1,840 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s --check-prefix=GFX1250
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -global-isel -global-isel-abort=2 < %s | FileCheck %s --check-prefix=GISEL
+
+define amdgpu_ps void @test_wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x4_f32:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_b_reuse
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX1250-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x4_f32:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_b_reuse
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GISEL-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> %C, i1 false, i1 true)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x32_bf16:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 true)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> %C, i1 false, i1 true)
+  store <8 x bfloat> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 true)
+  store <8 x bfloat> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 true)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 true)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 true)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 true)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 true)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 true)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 true)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 true)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x64_iu8(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_i32_16x16x64_iu8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> %C, i1 false, i1 true)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x32_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x32_f16:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x32_f16:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> %C, i1 false, i1 true)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x32_f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f16_16x16x32_f16:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f16_16x16x32_f16:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
+; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> %C, i1 false, i1 true)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse
+; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse
+; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 true)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse
+; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse
+; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 true)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse
+; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse
+; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 true)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse
+; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse
+; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> %C, i1 false, i1 true)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
+; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off
+; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 true)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
+; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off
+; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 true)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
+; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off
+; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 true)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
+; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off
+; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> %C, i1 false, i1 true)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_32x16x128_f4:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39]
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
+; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
+; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_32x16x128_f4:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39]
+; GISEL-NEXT: s_clause 0x3
+; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16
+; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32
+; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C)
+  store <16 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f32_16x16x64_bf16:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse
+; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f32_16x16x64_bf16:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse
+; GISEL-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 true)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_bf16_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_bf16_16x16x64_bf16:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse
+; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
+; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_bf16_16x16x64_bf16:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse
+; GISEL-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
+; GISEL-NEXT: global_store_b128 v[30:31], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, i1 false, i1 true)
+  store <8 x bfloat> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_bf16f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_bf16f32_16x16x64_bf16:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse
+; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_bf16f32_16x16x64_bf16:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse
+; GISEL-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 true)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_fp8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_fp8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8.v8f32.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, i1 false, i1 true)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_bf8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_bf8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8.v8f32.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, i1 false, i1 true)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_fp8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_fp8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8.v8f32.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, i1 false, i1 true)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_bf8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_bf8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8.v8f32.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, i1 false, i1 true)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_fp8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
+; GFX1250-NEXT: v_mov_b32_e32 v29, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse
+; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_fp8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse
+; GISEL-NEXT: v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 true)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_bf8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
+; GFX1250-NEXT: v_mov_b32_e32 v29, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse
+; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_bf8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse
+; GISEL-NEXT: v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 true)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_fp8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
+; GFX1250-NEXT: v_mov_b32_e32 v29, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse
+; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_fp8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse
+; GISEL-NEXT: v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 true)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_bf8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
+; GFX1250-NEXT: v_mov_b32_e32 v29, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse
+; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_bf8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse
+; GISEL-NEXT: v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i16(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, i1 false, i1 true)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8(<8 x i32> %A, <16 x i32> %B, <8 x i32> %C, i64 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_i32_16x16x128_iu8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_i32_16x16x128_iu8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i64(i1 0, <8 x i32> %A, i1 0, <16 x i32> %B, <8 x i32> %C, i64 %Index, i1 false, i1 true)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f32_16x16x64_f16:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse
+; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f32_16x16x64_f16:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse
+; GISEL-NEXT: v_dual_mov_b32 v36, v33 :: v_dual_mov_b32 v37, v34
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1 0, <16 x half> %A, i1 0, <32 x half> %B, <8 x float> %C, i16 %Index, i1 false, i1 true)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f16_16x16x64_f16:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse
+; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
+; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f16_16x16x64_f16:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse
+; GISEL-NEXT: v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1 0, <16 x half> %A, i1 0, <32 x half> %B, <8 x half> %C, i16 %Index, i1 false, i1 true)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1, <2 x float>, i1, <2 x float>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1)
+declare <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x bfloat>, i1, i1)
+declare <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
+declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1)
+declare <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x bfloat>, i16, i1, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8.v8f32.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x float>, i64, i1, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8.v8f32.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x float>, i64, i1, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8.v8f32.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x float>, i64, i1, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8.v8f32.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x float>, i64, i1, i1)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x half>, i64, i1, i1)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x half>, i64, i1, i1)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x half>, i64, i1, i1)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x half>, i64, i1, i1)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i64(i1 immarg, <8 x i32>, i1 immarg, <16 x i32>, <8 x i32>, i64 %Index, i1, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x float>, i16, i1, i1)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x half>, i16, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
new file mode 100644
index 0000000000000..fe8358fcc7a9a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
@@ -0,0 +1,2238 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s --check-prefix=GFX1250
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -global-isel -global-isel-abort=2 < %s | FileCheck %s --check-prefix=GISEL
+
+define amdgpu_ps void @test_wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x4_f32:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], 1.0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX1250-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x4_f32:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], 1.0
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x4_f32_non_splat(<2 x float> %A, <2 x float> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_non_splat:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_dual_mov_b32 v6, 1.0 :: v_dual_mov_b32 v8, 2.0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v9, v6
+; GFX1250-NEXT: v_dual_mov_b32 v10, v6 :: v_dual_mov_b32 v11, v6
+; GFX1250-NEXT: v_dual_mov_b32 v12, v6 :: v_dual_mov_b32 v13, v6
+; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX1250-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x4_f32_non_splat:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: s_mov_b32 s0, 1.0
+; GISEL-NEXT: s_mov_b32 s2, 2.0
+; GISEL-NEXT: s_mov_b32 s6, s0
+; GISEL-NEXT: s_mov_b32 s7, s0
+; GISEL-NEXT: s_mov_b32 s1, s0
+; GISEL-NEXT: s_mov_b32 s3, s0
+; GISEL-NEXT: s_mov_b32 s4, s0
+; GISEL-NEXT: s_mov_b32 s5, s0
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[6:7]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[4:5]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], v[6:13]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x4_f32_non_inlineable(<2 x float> %A, <2 x float> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_non_inlineable:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_mov_b32_e32 v6, 0x40400000
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
+; GFX1250-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6
+; GFX1250-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6
+; GFX1250-NEXT: v_mov_b32_e32 v13, v6
+; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX1250-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x4_f32_non_inlineable:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: s_mov_b32 s0, 0x40400000
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT: s_mov_b32 s6, s0
+; GISEL-NEXT: s_mov_b32 s7, s0
+; GISEL-NEXT: s_mov_b32 s1, s0
+; GISEL-NEXT: s_mov_b32 s2, s0
+; GISEL-NEXT: s_mov_b32 s3, s0
+; GISEL-NEXT: s_mov_b32 s4, s0
+; GISEL-NEXT: s_mov_b32 s5, s0
+; GISEL-NEXT: v_mov_b64_e32 v[12:13], s[6:7]
+; GISEL-NEXT: v_mov_b64_e32 v[10:11], s[4:5]
+; GISEL-NEXT: v_mov_b64_e32 v[8:9], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[6:7], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], v[6:13]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GISEL-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], 1.0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
+; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x32_bf16:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], 1.0
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
+; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_non_splat(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_non_splat:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
+; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18
+; GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18
+; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], v[18:25]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
+; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_non_splat:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GISEL-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
+; GISEL-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18
+; GISEL-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18
+; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], v[18:25]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
+; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_non_inlineable(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_non_inlineable:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
+; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18
+; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18
+; GFX1250-NEXT: v_mov_b32_e32 v25, v18
+; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], v[18:25]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
+; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_non_inlineable:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_mov_b32_e32 v18, 0x40400000
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GISEL-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
+; GISEL-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18
+; GISEL-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18
+; GISEL-NEXT: v_mov_b32_e32 v25, v18
+; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], v[18:25]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
+; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], 1.0
+; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], 1.0
+; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>, i1 false, i1 false)
+  store <8 x bfloat> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_non_splat(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_non_splat:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_dual_mov_b32 v18, 0x3f803f80 :: v_dual_mov_b32 v19, 1.0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v20, v18 :: v_dual_mov_b32 v21, v18
+; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], v[18:21]
+; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_non_splat:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_dual_mov_b32 v18, 0x3f803f80 :: v_dual_mov_b32 v19, 1.0
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GISEL-NEXT: v_dual_mov_b32 v20, v18 :: v_dual_mov_b32 v21, v18
+; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], v[18:21]
+; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> <bfloat 1.0, bfloat 1.0, bfloat 0.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0, bfloat 1.0>, i1 false, i1 false)
+  store <8 x bfloat> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_non_inlineable(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_non_inlineable:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3fc03fc0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
+; GFX1250-NEXT: v_mov_b32_e32 v21, v18
+; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], v[18:21]
+; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_non_inlineable:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_mov_b32_e32 v18, 0x3fc03fc0
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GISEL-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
+; GISEL-NEXT: v_mov_b32_e32 v21, v18
+; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], v[18:21]
+; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> <bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5, bfloat 1.5>, i1 false, i1 false)
+  store <8 x bfloat> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[18:21], v[0:7], v[8:15], 1.0
+; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[18:21], v[0:7], v[8:15], 1.0
+; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
+  store <8 x bfloat> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_non_splat(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_non_splat:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
+; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18
+; GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18
+; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[18:25]
+; GFX1250-NEXT: global_store_b128 v[16:17], v[26:29], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_non_splat:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GISEL-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
+; GISEL-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18
+; GISEL-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18
+; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[18:25]
+; GISEL-NEXT: global_store_b128 v[16:17], v[26:29], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
+  store <8 x bfloat> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_non_inlinable(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_non_inlinable:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
+; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18
+; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18
+; GFX1250-NEXT: v_mov_b32_e32 v25, v18
+; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[18:25]
+; GFX1250-NEXT: global_store_b128 v[16:17], v[26:29], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_non_inlinable:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_mov_b32_e32 v18, 0x40400000
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GISEL-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
+; GISEL-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18
+; GISEL-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18
+; GISEL-NEXT: v_mov_b32_e32 v25, v18
+; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[18:25]
+; GISEL-NEXT: global_store_b128 v[16:17], v[26:29], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>, i1 false, i1 false)
+  store <8 x bfloat> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], 1.0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
+; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], 1.0
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8_non_splat:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
+; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18
+; GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18
+; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], v[18:25]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
+; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8_non_splat:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: s_mov_b32 s0, 1.0
+; GISEL-NEXT: s_mov_b32 s2, 2.0
+; GISEL-NEXT: s_mov_b32 s6, s0
+; GISEL-NEXT: s_mov_b32 s7, s0
+; GISEL-NEXT: s_mov_b32 s1, s0
+; GISEL-NEXT: s_mov_b32 s3, s0
+; GISEL-NEXT: s_mov_b32 s4, s0
+; GISEL-NEXT: s_mov_b32 s5, s0
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], v[18:25]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8_non_inlineable:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
+; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18
+; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18
+; GFX1250-NEXT: v_mov_b32_e32 v25, v18
+; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], v[18:25]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
+;
GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], v[18:25] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], 1.0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], 1.0 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 +; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18 +; GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18 +; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], v[18:25] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 1.0 +; GISEL-NEXT: s_mov_b32 s2, 2.0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: 
s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], v[18:25] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18 +; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18 +; GFX1250-NEXT: v_mov_b32_e32 v25, v18 +; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], v[18:25] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], v[18:25] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], 1.0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], 1.0 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr 
addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 +; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18 +; GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18 +; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], v[18:25] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 1.0 +; GISEL-NEXT: s_mov_b32 s2, 2.0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], v[18:25] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18 +; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18 +; GFX1250-NEXT: v_mov_b32_e32 v25, v18 +; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], v[18:25] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], v[18:25] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x 
float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], 1.0 +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], 1.0 +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 +; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18 +; GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18 +; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], v[18:25] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 1.0 +; GISEL-NEXT: s_mov_b32 s2, 2.0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], v[18:25] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18 +; 
GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18 +; GFX1250-NEXT: v_mov_b32_e32 v25, v18 +; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], v[18:25] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], v[18:25] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], 1.0 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], 1.0 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3c003c00 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_mov_b32_e32 v21, v18 +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 +; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> %A, 
<8 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x42004200 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_mov_b32_e32 v21, v18 +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x42004200 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], 1.0 neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], 1.0 neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 2, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3c003c00 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_mov_b32_e32 v21, v18 +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 +; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; 
GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 2, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x42004200 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_mov_b32_e32 v21, v18 +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x42004200 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 2, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], 1.0 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], 1.0 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3c003c00 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_mov_b32_e32 v21, v18 +; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 +; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], 
v[8:15], v[18:21] +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x42004200 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_mov_b32_e32 v21, v18 +; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x42004200 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], v[18:21] +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], 1.0 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], 1.0 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3c003c00 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_mov_b32_e32 v21, v18 +; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], v[18:21] +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 +; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; 
GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], v[18:21]
+; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> <half 1.0, half 1.0, half 2.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 false, i1 false)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8_non_inlineable:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_mov_b32_e32 v18, 0x42004200
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
+; GFX1250-NEXT: v_mov_b32_e32 v21, v18
+; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], v[18:21]
+; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8_non_inlineable:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: s_mov_b32 s0, 0x42004200
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GISEL-NEXT: s_mov_b32 s2, s0
+; GISEL-NEXT: s_mov_b32 s3, s0
+; GISEL-NEXT: s_mov_b32 s1, s0
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
+; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], v[18:21]
+; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 0, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 false, i1 false)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x64_iu8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], 1
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
+; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_i32_16x16x64_iu8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], 1
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 false, i1 false)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8_non_splat:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_dual_mov_b32 v18, 1 :: v_dual_mov_b32 v20, 2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
+; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18
+; GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18
+; GFX1250-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], v[18:25]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
+; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_i32_16x16x64_iu8_non_splat:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: s_mov_b32 s0, 1
+; GISEL-NEXT: s_mov_b32 s2, 2
+; GISEL-NEXT: s_mov_b32 s6, s0
+; GISEL-NEXT: s_mov_b32 s7, s0
+; GISEL-NEXT: s_mov_b32 s1, s0
+; GISEL-NEXT: s_mov_b32 s3, s0
+; GISEL-NEXT: s_mov_b32 s4, s0
+; GISEL-NEXT: s_mov_b32 s5, s0
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], v[18:25]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> <i32 1, i32 1, i32 2, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 false, i1 false)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8_non_inlineable:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_mov_b32_e32 v18, 0x80
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
+; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18
+; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18
+; GFX1250-NEXT: v_mov_b32_e32 v25, v18
+; GFX1250-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], v[18:25]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
+; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_i32_16x16x64_iu8_non_inlineable:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: s_movk_i32 s0, 0x80
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT: s_mov_b32 s6, s0
+; GISEL-NEXT: s_mov_b32 s7, s0
+; GISEL-NEXT: s_mov_b32 s1, s0
+; GISEL-NEXT: s_mov_b32 s2, s0
+; GISEL-NEXT: s_mov_b32 s3, s0
+; GISEL-NEXT: s_mov_b32 s4, s0
+; GISEL-NEXT: s_mov_b32 s5, s0
+; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7]
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5]
+; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], v[18:25]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 false, i1 false)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x32_f16(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x32_f16:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], 1.0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16
+; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x32_f16:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], 1.0
+; GISEL-NEXT: s_clause
0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x32_f16_non_splat(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18 +; GFX1250-NEXT: v_dual_mov_b32 v22, v18 :: v_dual_mov_b32 v23, v18 +; GFX1250-NEXT: v_dual_mov_b32 v24, v18 :: v_dual_mov_b32 v25, v18 +; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], v[18:25] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x32_f16_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 1.0 +; GISEL-NEXT: s_mov_b32 s2, 2.0 +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] +; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], v[18:25] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x32_f16_non_inlineable(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x40400000 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_dual_mov_b32 v21, v18 :: v_dual_mov_b32 v22, v18 +; GFX1250-NEXT: v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v24, v18 +; GFX1250-NEXT: v_mov_b32_e32 v25, v18 +; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], v[18:25] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x32_f16_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x40400000 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GISEL-NEXT: s_mov_b32 s6, s0 +; GISEL-NEXT: s_mov_b32 s7, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s4, s0 +; GISEL-NEXT: s_mov_b32 s5, s0 +; GISEL-NEXT: v_mov_b64_e32 v[24:25], s[6:7] 
+; GISEL-NEXT: v_mov_b64_e32 v[22:23], s[4:5] +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], v[18:25] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: global_store_b128 v[16:17], v[22:25], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> , i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x32_f16(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x32_f16: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], 1.0 +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x32_f16: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], 1.0 +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x32_f16_non_splat(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x3c003c00 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_mov_b32_e32 v21, v18 +; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], v[18:21] +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x32_f16_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 +; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], v[18:21] +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x32_f16_non_inlineable(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v18, 0x42004200 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18 +; GFX1250-NEXT: v_mov_b32_e32 v21, v18 +; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], v[18:21] +; GFX1250-NEXT: global_store_b128 v[16:17], v[18:21], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x32_f16_non_inlineable: +; GISEL: ; %bb.0: ; 
%bb +; GISEL-NEXT: s_mov_b32 s0, 0x42004200 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: v_mov_b64_e32 v[20:21], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[18:19], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], v[18:21] +; GISEL-NEXT: global_store_b128 v[16:17], v[18:21], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], 1.0 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], 1.0 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v34, 0x3c003c00 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, 0x3c004000 :: v_dual_mov_b32 v36, v34 +; GFX1250-NEXT: v_mov_b32_e32 v37, v34 +; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 +; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v34, 0x42004200 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 +; GFX1250-NEXT: v_mov_b32_e32 v37, v34 +; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: 
s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x42004200 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], 1.0 neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], 1.0 neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 2, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v34, 0x3c003c00 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, 0x3c004000 :: v_dual_mov_b32 v36, v34 +; GFX1250-NEXT: v_mov_b32_e32 v37, v34 +; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 +; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 2, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v34, 0x42004200 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34 +; GFX1250-NEXT: v_mov_b32_e32 v37, v34 +; 
GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8_non_inlineable: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x42004200 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1) +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_mov_b32 s1, s0 +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], v[34:37] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 2, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], 1.0 +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], 1.0 +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8_non_splat: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v34, 0x3c003c00 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX1250-NEXT: v_dual_mov_b32 v35, 0x3c004000 :: v_dual_mov_b32 v36, v34 +; GFX1250-NEXT: v_mov_b32_e32 v37, v34 +; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] +; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8_non_splat: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00 +; GISEL-NEXT: s_mov_b32 s1, 0x3c004000 +; GISEL-NEXT: s_mov_b32 s2, s0 +; GISEL-NEXT: s_mov_b32 s3, s0 +; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3] +; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1] +; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], v[34:37] +; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> , i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8_non_inlineable: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_mov_b32_e32 v34, 0x42004200 +; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | 
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
+; GFX1250-NEXT: v_mov_b32_e32 v37, v34
+; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], v[34:37]
+; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8_non_inlineable:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: s_mov_b32 s0, 0x42004200
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GISEL-NEXT: s_mov_b32 s2, s0
+; GISEL-NEXT: s_mov_b32 s3, s0
+; GISEL-NEXT: s_mov_b32 s1, s0
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
+; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], v[34:37]
+; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> splat (half 3.0), i1 false, i1 false)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], 1.0
+; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], 1.0
+; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> splat (half 1.0), i1 false, i1 false)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8_non_splat:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_mov_b32_e32 v34, 0x3c003c00
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v35, 0x3c004000 :: v_dual_mov_b32 v36, v34
+; GFX1250-NEXT: v_mov_b32_e32 v37, v34
+; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], v[34:37]
+; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8_non_splat:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: s_mov_b32 s0, 0x3c003c00
+; GISEL-NEXT: s_mov_b32 s1, 0x3c004000
+; GISEL-NEXT: s_mov_b32 s2, s0
+; GISEL-NEXT: s_mov_b32 s3, s0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
+; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], v[34:37]
+; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> <half 1.0, half 1.0, half 2.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 false, i1 false)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8_non_inlineable:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_mov_b32_e32 v34, 0x42004200
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
+; GFX1250-NEXT: v_mov_b32_e32 v37, v34
+; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], v[34:37]
+; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8_non_inlineable:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: s_mov_b32 s0, 0x42004200
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GISEL-NEXT: s_mov_b32 s2, s0
+; GISEL-NEXT: s_mov_b32 s3, s0
+; GISEL-NEXT: s_mov_b32 s1, s0
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
+; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], v[34:37]
+; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x half> splat (half 3.0), i1 false, i1 false)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], 1.0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], 1.0
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> splat (float 1.0), i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8_non_splat:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v36, 2.0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v37, v34
+; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34
+; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34
+; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], v[34:41]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8_non_splat:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: s_mov_b32 s0, 1.0
+; GISEL-NEXT: s_mov_b32 s2, 2.0
+; GISEL-NEXT: s_mov_b32 s6, s0
+; GISEL-NEXT: s_mov_b32 s7, s0
+; GISEL-NEXT: s_mov_b32 s1, s0
+; GISEL-NEXT: s_mov_b32 s3, s0
+; GISEL-NEXT: s_mov_b32 s4, s0
+; GISEL-NEXT: s_mov_b32 s5, s0
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], v[34:41]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8_non_inlineable:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
+; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
+; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
+; GFX1250-NEXT: v_mov_b32_e32 v41, v34
+; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], v[34:41]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8_non_inlineable:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: s_mov_b32 s0, 0x40400000
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT: s_mov_b32 s6, s0
+; GISEL-NEXT: s_mov_b32 s7, s0
+; GISEL-NEXT: s_mov_b32 s1, s0
+; GISEL-NEXT: s_mov_b32 s2, s0
+; GISEL-NEXT: s_mov_b32 s3, s0
+; GISEL-NEXT: s_mov_b32 s4, s0
+; GISEL-NEXT: s_mov_b32 s5, s0
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], v[34:41]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> splat (float 3.0), i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], 1.0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], 1.0
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> splat (float 1.0), i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8_non_splat:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v36, 2.0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v37, v34
+; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34
+; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34
+; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], v[34:41]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8_non_splat:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: s_mov_b32 s0, 1.0
+; GISEL-NEXT: s_mov_b32 s2, 2.0
+; GISEL-NEXT: s_mov_b32 s6, s0
+; GISEL-NEXT: s_mov_b32 s7, s0
+; GISEL-NEXT: s_mov_b32 s1, s0
+; GISEL-NEXT: s_mov_b32 s3, s0
+; GISEL-NEXT: s_mov_b32 s4, s0
+; GISEL-NEXT: s_mov_b32 s5, s0
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], v[34:41]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8_non_inlineable:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
+; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
+; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
+; GFX1250-NEXT: v_mov_b32_e32 v41, v34
+; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], v[34:41]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8_non_inlineable:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: s_mov_b32 s0, 0x40400000
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT: s_mov_b32 s6, s0
+; GISEL-NEXT: s_mov_b32 s7, s0
+; GISEL-NEXT: s_mov_b32 s1, s0
+; GISEL-NEXT: s_mov_b32 s2, s0
+; GISEL-NEXT: s_mov_b32 s3, s0
+; GISEL-NEXT: s_mov_b32 s4, s0
+; GISEL-NEXT: s_mov_b32 s5, s0
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], v[34:41]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> splat (float 3.0), i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], 1.0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], 1.0
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> splat (float 1.0), i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8_non_splat:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v36, 2.0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v37, v34
+; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34
+; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34
+; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], v[34:41]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8_non_splat:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: s_mov_b32 s0, 1.0
+; GISEL-NEXT: s_mov_b32 s2, 2.0
+; GISEL-NEXT: s_mov_b32 s6, s0
+; GISEL-NEXT: s_mov_b32 s7, s0
+; GISEL-NEXT: s_mov_b32 s1, s0
+; GISEL-NEXT: s_mov_b32 s3, s0
+; GISEL-NEXT: s_mov_b32 s4, s0
+; GISEL-NEXT: s_mov_b32 s5, s0
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], v[34:41]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8_non_inlineable:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
+; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
+; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
+; GFX1250-NEXT: v_mov_b32_e32 v41, v34
+; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], v[34:41]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8_non_inlineable:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: s_mov_b32 s0, 0x40400000
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT: s_mov_b32 s6, s0
+; GISEL-NEXT: s_mov_b32 s7, s0
+; GISEL-NEXT: s_mov_b32 s1, s0
+; GISEL-NEXT: s_mov_b32 s2, s0
+; GISEL-NEXT: s_mov_b32 s3, s0
+; GISEL-NEXT: s_mov_b32 s4, s0
+; GISEL-NEXT: s_mov_b32 s5, s0
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], v[34:41]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> splat (float 3.0), i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], 1.0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], 1.0
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> splat (float 1.0), i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8_non_splat:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v36, 2.0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v37, v34
+; GFX1250-NEXT: v_dual_mov_b32 v38, v34 :: v_dual_mov_b32 v39, v34
+; GFX1250-NEXT: v_dual_mov_b32 v40, v34 :: v_dual_mov_b32 v41, v34
+; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], v[34:41]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8_non_splat:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: s_mov_b32 s0, 1.0
+; GISEL-NEXT: s_mov_b32 s2, 2.0
+; GISEL-NEXT: s_mov_b32 s6, s0
+; GISEL-NEXT: s_mov_b32 s7, s0
+; GISEL-NEXT: s_mov_b32 s1, s0
+; GISEL-NEXT: s_mov_b32 s3, s0
+; GISEL-NEXT: s_mov_b32 s4, s0
+; GISEL-NEXT: s_mov_b32 s5, s0
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], v[34:41]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8_non_inlineable:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_mov_b32_e32 v34, 0x40400000
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
+; GFX1250-NEXT: v_dual_mov_b32 v37, v34 :: v_dual_mov_b32 v38, v34
+; GFX1250-NEXT: v_dual_mov_b32 v39, v34 :: v_dual_mov_b32 v40, v34
+; GFX1250-NEXT: v_mov_b32_e32 v41, v34
+; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], v[34:41]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GFX1250-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8_non_inlineable:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: s_mov_b32 s0, 0x40400000
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT: s_mov_b32 s6, s0
+; GISEL-NEXT: s_mov_b32 s7, s0
+; GISEL-NEXT: s_mov_b32 s1, s0
+; GISEL-NEXT: s_mov_b32 s2, s0
+; GISEL-NEXT: s_mov_b32 s3, s0
+; GISEL-NEXT: s_mov_b32 s4, s0
+; GISEL-NEXT: s_mov_b32 s5, s0
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[6:7]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[4:5]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], v[34:41]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[32:33], v[34:37], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[38:41], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 0, <8 x float> splat (float 3.0), i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_32x16x128_f4:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
+; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
+; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
+; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_32x16x128_f4:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0
+; GISEL-NEXT: s_clause 0x3
+; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off
+; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
+; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
+; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> splat (float 1.0))
+  store <16 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_32x16x128_f4_non_splat:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_dual_mov_b32 v26, 1.0 :: v_dual_mov_b32 v28, 2.0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v29, v26
+; GFX1250-NEXT: v_dual_mov_b32 v30, v26 :: v_dual_mov_b32 v31, v26
+; GFX1250-NEXT: v_dual_mov_b32 v32, v26 :: v_dual_mov_b32 v33, v26
+; GFX1250-NEXT: v_dual_mov_b32 v34, v26 :: v_dual_mov_b32 v35, v26
+; GFX1250-NEXT: v_dual_mov_b32 v36, v28 :: v_dual_mov_b32 v37, v26
+; GFX1250-NEXT: v_dual_mov_b32 v38, v26 :: v_dual_mov_b32 v39, v26
+; GFX1250-NEXT: v_dual_mov_b32 v40, v26 :: v_dual_mov_b32 v41, v26
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41]
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
+; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
+; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
+; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_32x16x128_f4_non_splat:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: s_mov_b32 s0, 1.0
+; GISEL-NEXT: s_mov_b32 s2, 2.0
+; GISEL-NEXT: s_mov_b32 s14, s0
+; GISEL-NEXT: s_mov_b32 s15, s0
+; GISEL-NEXT: s_mov_b32 s1, s0
+; GISEL-NEXT: s_mov_b32 s3, s0
+; GISEL-NEXT: s_mov_b32 s4, s0
+; GISEL-NEXT: s_mov_b32 s5, s0
+; GISEL-NEXT: s_mov_b32 s6, s0
+; GISEL-NEXT: s_mov_b32 s7, s0
+; GISEL-NEXT: s_mov_b32 s8, s0
+; GISEL-NEXT: s_mov_b32 s9, s0
+; GISEL-NEXT: s_mov_b32 s10, s2
+; GISEL-NEXT: s_mov_b32 s11, s0
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s0
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[4:5]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41]
+; GISEL-NEXT: s_clause 0x3
+; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off
+; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
+; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
+; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> <float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 2.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
+  store <16 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_32x16x128_f4_non_inlineable(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_32x16x128_f4_non_inlineable:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_mov_b32_e32 v26, 0x40400000
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26
+; GFX1250-NEXT: v_dual_mov_b32 v29, v26 :: v_dual_mov_b32 v30, v26
+; GFX1250-NEXT: v_dual_mov_b32 v31, v26 :: v_dual_mov_b32 v32, v26
+; GFX1250-NEXT: v_dual_mov_b32 v33, v26 :: v_dual_mov_b32 v34, v26
+; GFX1250-NEXT: v_dual_mov_b32 v35, v26 :: v_dual_mov_b32 v36, v26
+; GFX1250-NEXT: v_dual_mov_b32 v37, v26 :: v_dual_mov_b32 v38, v26
+; GFX1250-NEXT: v_dual_mov_b32 v39, v26 :: v_dual_mov_b32 v40, v26
+; GFX1250-NEXT: v_mov_b32_e32 v41, v26
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41]
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
+; GFX1250-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
+; GFX1250-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
+; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_32x16x128_f4_non_inlineable:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: s_mov_b32 s0, 0x40400000
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT: s_mov_b32 s14, s0
+; GISEL-NEXT: s_mov_b32 s15, s0
+; GISEL-NEXT: s_mov_b32 s1, s0
+; GISEL-NEXT: s_mov_b32 s2, s0
+; GISEL-NEXT: s_mov_b32 s3, s0
+; GISEL-NEXT: s_mov_b32 s4, s0
+; GISEL-NEXT: s_mov_b32 s5, s0
+; GISEL-NEXT: s_mov_b32 s6, s0
+; GISEL-NEXT: s_mov_b32 s7, s0
+; GISEL-NEXT: s_mov_b32 s8, s0
+; GISEL-NEXT: s_mov_b32 s9, s0
+; GISEL-NEXT: s_mov_b32 s10, s0
+; GISEL-NEXT: s_mov_b32 s11, s0
+; GISEL-NEXT: s_mov_b32 s12, s0
+; GISEL-NEXT: s_mov_b32 s13, s0
+; GISEL-NEXT: v_mov_b64_e32 v[40:41], s[14:15]
+; GISEL-NEXT: v_mov_b64_e32 v[38:39], s[12:13]
+; GISEL-NEXT: v_mov_b64_e32 v[36:37], s[10:11]
+; GISEL-NEXT: v_mov_b64_e32 v[34:35], s[8:9]
+; GISEL-NEXT: v_mov_b64_e32 v[32:33], s[6:7]
+; GISEL-NEXT: v_mov_b64_e32 v[30:31], s[4:5]
+; GISEL-NEXT: v_mov_b64_e32 v[28:29], s[2:3]
+; GISEL-NEXT: v_mov_b64_e32 v[26:27], s[0:1]
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], v[26:41]
+; GISEL-NEXT: s_clause 0x3
+; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off
+; GISEL-NEXT: global_store_b128 v[24:25], v[30:33], off offset:16
+; GISEL-NEXT: global_store_b128 v[24:25], v[34:37], off offset:32
+; GISEL-NEXT: global_store_b128 v[24:25], v[38:41], off offset:48
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> splat (float 3.0))
+  store <16 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
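+; Note (not part of the autogenerated checks; a summary of the declarations
+; below): the 32x16x128 f4 intrinsic has a different shape from the 16x16
+; variants above. It takes packed <16 x i32> A and <8 x i32> B operands, a
+; <16 x float> accumulator spread over sixteen VGPRs (v[26:41], stored as
+; four 128-bit chunks), and no trailing i1 flag operands.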
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1, <2 x float>, i1, <2 x float>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1)
+declare <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x bfloat>, i1, i1)
+declare <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
+declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll
new file mode 100644
index 0000000000000..9802144a29577
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll
@@ -0,0 +1,1993 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s --check-prefix=GFX1250
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -global-isel -global-isel-abort=2 < %s | FileCheck %s --check-prefix=GISEL
+
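+; Note (not part of the autogenerated checks; derived from the tests below):
+; the leading i1 operands select source negation, which lowers to neg_lo and,
+; for 16-bit sources, neg_hi on A or B. The i16 accumulator modifier lowers as
+; 1 -> neg_lo:[0,0,1] and 3 -> neg_lo:[0,0,1] neg_hi:[0,0,1]; the value 4 used
+; by the "ignoreC" tests leaves C unmodified in the emitted instruction.
+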
+define amdgpu_ps void @test_wmma_f32_16x16x4_f32_negA(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_negA:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX1250-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x4_f32_negA:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GISEL-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 1, <2 x float> %A, i1 0, <2 x float> %B, i16 0, <8 x float> %C, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x4_f32_negB(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_negB:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX1250-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x4_f32_negB:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GISEL-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 1, <2 x float> %B, i16 0, <8 x float> %C, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x4_f32_negC(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_negC:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX1250-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x4_f32_negC:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GISEL-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 1, <8 x float> %C, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x4_f32_neg_absC(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_neg_absC:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX1250-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x4_f32_neg_absC:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GISEL-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 3, <8 x float> %C, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x4_f32_ignoreC(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_ignoreC:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX1250-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x4_f32_ignoreC:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GISEL-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 0, <2 x float> %A, i1 0, <2 x float> %B, i16 4, <8 x float> %C, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_negA(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_negA:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_negA:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 1, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_negB(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_negB:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_negB:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 1, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_negC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_negC:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_negC:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 1, <8 x float> %C, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_neg_absC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_neg_absC:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_neg_absC:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 3, <8 x float> %C, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_ignoreC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_ignoreC:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_ignoreC:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 4, <8 x float> %C, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_negA(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_negA:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_negA:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 1, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x bfloat> %C, i1 false, i1 false)
+  store <8 x bfloat> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_negB(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_negB:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_negB:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 1, <16 x bfloat> %B, i16 0, <8 x bfloat> %C, i1 false, i1 false)
+  store <8 x bfloat> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_negC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_negC:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
+; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_negC:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
+; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 1, <8 x bfloat> %C, i1 false, i1 false)
+  store <8 x bfloat> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_neg_absC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_neg_absC:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_neg_absC:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 3, <8 x bfloat> %C, i1 false, i1 false)
+  store <8 x bfloat> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_ignoreC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_ignoreC:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
+; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_ignoreC:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
+; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 4, <8 x bfloat> %C, i1 false, i1 false)
+  store <8 x bfloat> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_negA(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_negA:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_negA:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 1, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 false)
+  store <8 x bfloat> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_negB(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_negB:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_negB:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 1, <16 x bfloat> %B, i16 0, <8 x float> %C, i1 false, i1 false)
+  store <8 x bfloat> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_negC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_negC:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_negC:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 1, <8 x float> %C, i1 false, i1 false)
+  store <8 x bfloat> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_neg_absC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_neg_absC:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_neg_absC:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 0, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 3, <8 x float> %C, i1 false, i1 false)
+  store <8 x bfloat> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_ignoreC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_ignoreC:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX1250-NEXT: global_store_b128 v[24:25], v[26:29], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_ignoreC:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GISEL-NEXT: global_store_b128 v[24:25], v[26:29], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1 1, <16 x bfloat> %A, i1 0, <16 x bfloat> %B, i16 4, <8 x float> %C, i1 false, i1 false)
+  store <8 x bfloat> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_negC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8_negC:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8_negC:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 1, <8 x float> %C, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8_neg_absC:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8_neg_absC:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 3, <8 x float> %C, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8_ignoreC:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8_ignoreC:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 4, <8 x float> %C, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_negC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8_negC:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8_negC:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 1, <8 x float> %C, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8_neg_absC:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8_neg_absC:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off
+; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 3, <8 x float> %C, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8_ignoreC:
+; GFX1250: ; %bb.0: ; %bb
v[0:7], v[8:15], v[16:23] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 4, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_negC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 1, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 3, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], 
v[8:15], v[16:23] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 4, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_negC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 1, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 3, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> %A, <8 x i32> %B, i16 4, <8 x float> %C, i1 false, i1 false) + store 
<8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_negC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 1, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 3, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 4, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_negC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 1, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@test_wmma_f16_16x16x64_fp8_bf8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 3, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 4, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_negC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 1, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 3, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_ignoreC(<8 x 
i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 4, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_negC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 1, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 3, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> %A, <8 x i32> %B, i16 4, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_signedA(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: 
test_wmma_i32_16x16x64_iu8_signedA: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_i32_16x16x64_iu8_signedA: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 1, <8 x i32> %A, i1 0, <8 x i32> %B, <8 x i32> %C, i1 false, i1 false) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_signedB(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8_signedB: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_i32_16x16x64_iu8_signedB: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 0, <8 x i32> %A, i1 1, <8 x i32> %B, <8 x i32> %C, i1 false, i1 false) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x32_f16_negA(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_negA: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x32_f16_negA: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 1, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x32_f16_negB(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_negB: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: 
test_wmma_f32_16x16x32_f16_negB: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 1, <16 x half> %B, i16 0, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x32_f16_negC(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x32_f16_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 1, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x32_f16_neg_absC(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x32_f16_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 3, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x32_f16_ignoreC(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GFX1250-NEXT: global_store_b128 v[24:25], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x32_f16_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[24:25], v[16:19], off +; GISEL-NEXT: global_store_b128 v[24:25], v[20:23], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x 
float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 4, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x32_f16_negA(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_negA: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x32_f16_negA: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 1, <16 x half> %A, i1 0, <16 x half> %B, i16 0, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x32_f16_negB(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_negB: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x32_f16_negB: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 1, <16 x half> %B, i16 0, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x32_f16_negC(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x32_f16_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 1, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x32_f16_neg_absC(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x32_f16_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> 
@llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 3, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x32_f16_ignoreC(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] +; GFX1250-NEXT: global_store_b128 v[20:21], v[16:19], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x32_f16_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] +; GISEL-NEXT: global_store_b128 v[20:21], v[16:19], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 0, <16 x half> %A, i1 0, <16 x half> %B, i16 4, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_negC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 1, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 3, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x 
i32> %A, <16 x i32> %B, i16 4, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_negC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 1, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 3, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 4, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_negC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 
1, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 3, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 4, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_negC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 1, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x 
i32> %B, i16 3, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] +; GFX1250-NEXT: global_store_b128 v[36:37], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] +; GISEL-NEXT: global_store_b128 v[36:37], v[32:35], off +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32> %A, <16 x i32> %B, i16 4, <8 x half> %C, i1 false, i1 false) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 1, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 3, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off 
offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 4, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 1, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 3, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: 
global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 4, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 1, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 3, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 4, <8 x float> %C, i1 false, i1 false) + store <8 x float> 
%res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 1, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 3, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] +; GFX1250-NEXT: s_clause 0x1 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] +; GISEL-NEXT: s_clause 0x1 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:16 +; GISEL-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32> %A, <16 x i32> %B, i16 4, <8 x float> %C, i1 false, i1 false) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_32x16x128_f4_negC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_32x16x128_f4_negC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], 
v[16:23], v[24:39] neg_lo:[0,0,1] +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_32x16x128_f4_negC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39] neg_lo:[0,0,1] +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32> %A, <8 x i32> %B, i16 1, <16 x float> %C) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_32x16x128_f4_neg_absC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_32x16x128_f4_neg_absC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_32x16x128_f4_neg_absC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32> %A, <8 x i32> %B, i16 3, <16 x float> %C) + store <16 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_32x16x128_f4_ignoreC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) { +; GFX1250-LABEL: test_wmma_f32_32x16x128_f4_ignoreC: +; GFX1250: ; %bb.0: ; %bb +; GFX1250-NEXT: v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39] +; GFX1250-NEXT: s_clause 0x3 +; GFX1250-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GFX1250-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GFX1250-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GFX1250-NEXT: global_store_b128 v[40:41], v[24:27], off +; GFX1250-NEXT: s_endpgm +; +; GISEL-LABEL: test_wmma_f32_32x16x128_f4_ignoreC: +; GISEL: ; %bb.0: ; %bb +; GISEL-NEXT: v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39] +; GISEL-NEXT: s_clause 0x3 +; GISEL-NEXT: global_store_b128 v[40:41], v[24:27], off +; GISEL-NEXT: global_store_b128 v[40:41], v[28:31], off offset:16 +; GISEL-NEXT: global_store_b128 v[40:41], v[32:35], off offset:32 +; GISEL-NEXT: global_store_b128 v[40:41], v[36:39], off offset:48 +; GISEL-NEXT: s_endpgm +bb: + %res = call <16 x float> 
+
+define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16_negA(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f32_16x16x64_bf16_negA:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f32_16x16x64_bf16_negA:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GISEL-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 1, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16_negB(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f32_16x16x64_bf16_negB:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f32_16x16x64_bf16_negB:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GISEL-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 1, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_bf16_16x16x64_bf16_negA(<16 x bfloat> %A, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_bf16_16x16x64_bf16_negA:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
+; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_bf16_16x16x64_bf16_negA:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GISEL-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
+; GISEL-NEXT: global_store_b128 v[30:31], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1 1, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, i1 false, i1 false)
+  store <8 x bfloat> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_bf16_16x16x64_bf16_negB(<16 x bfloat> %A, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_bf16_16x16x64_bf16_negB:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
+; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_bf16_16x16x64_bf16_negB:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GISEL-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
+; GISEL-NEXT: global_store_b128 v[30:31], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 1, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, i1 false, i1 false)
+  store <8 x bfloat> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_bf16f32_16x16x64_bf16_negA(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_bf16f32_16x16x64_bf16_negA:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_bf16f32_16x16x64_bf16_negA:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GISEL-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 1, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_bf16f32_16x16x64_bf16_negB(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_bf16f32_16x16x64_bf16_negB:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_bf16f32_16x16x64_bf16_negB:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GISEL-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 1, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_signedA(<8 x i32> %A, <16 x i32> %B, <8 x i32> %C, i64 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_i32_16x16x128_iu8_signedA:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[1,0,0]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_i32_16x16x128_iu8_signedA:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[1,0,0]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i64(i1 1, <8 x i32> %A, i1 0, <16 x i32> %B, <8 x i32> %C, i64 %Index, i1 false, i1 false)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_signedB(<8 x i32> %A, <16 x i32> %B, <8 x i32> %C, i64 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_i32_16x16x128_iu8_signedB:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,1,0]
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_i32_16x16x128_iu8_signedB:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,1,0]
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i64(i1 0, <8 x i32> %A, i1 1, <16 x i32> %B, <8 x i32> %C, i64 %Index, i1 false, i1 false)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x64_f16_negA(<16 x half> %A, <32 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f32_16x16x64_f16_negA:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f32_16x16x64_f16_negA:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GISEL-NEXT: v_dual_mov_b32 v36, v33 :: v_dual_mov_b32 v37, v34
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1 1, <16 x half> %A, i1 0, <32 x half> %B, <8 x float> %C, i16 %Index, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x64_f16_negB(<16 x half> %A, <32 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f32_16x16x64_f16_negB:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX1250-NEXT: v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f32_16x16x64_f16_negB:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GISEL-NEXT: v_dual_mov_b32 v36, v33 :: v_dual_mov_b32 v37, v34
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1 0, <16 x half> %A, i1 1, <32 x half> %B, <8 x float> %C, i16 %Index, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x64_f16_negA(<16 x half> %A, <32 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f16_16x16x64_f16_negA:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
+; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f16_16x16x64_f16_negA:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GISEL-NEXT: v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1 1, <16 x half> %A, i1 0, <32 x half> %B, <8 x half> %C, i16 %Index, i1 false, i1 false)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x64_f16_negB(<16 x half> %A, <32 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f16_16x16x64_f16_negB:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX1250-NEXT: v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
+; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f16_16x16x64_f16_negB:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GISEL-NEXT: v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1 0, <16 x half> %A, i1 1, <32 x half> %B, <8 x half> %C, i16 %Index, i1 false, i1 false)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1, <2 x float>, i1, <2 x float>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1)
+declare <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x bfloat>, i1, i1)
+declare <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16(i1, <16 x bfloat>, i1, <16 x bfloat>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32>, <8 x i32>, i16, <8 x float>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32>, <8 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 immarg, <8 x i32>, i1 immarg, <8 x i32>, <8 x i32>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x float>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1, <16 x half>, i1, <16 x half>, i16, <8 x half>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32>, <16 x i32>, i16, <8 x half>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32>, <16 x i32>, i16, <8 x float>, i1, i1)
+declare <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16i32.v8i32.v16f32(<16 x i32>, <8 x i32>, i16, <16 x float>)
+
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1)
+declare <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x bfloat>, i16, i1, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8.v8f32.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x float>, i64, i1, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8.v8f32.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x float>, i64, i1, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8.v8f32.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x float>, i64, i1, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8.v8f32.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x float>, i64, i1, i1)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x half>, i64, i1, i1)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x half>, i64, i1, i1)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x half>, i64, i1, i1)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i64(<8 x i32>, <16 x i32>, <8 x half>, i64, i1, i1)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i64(i1 immarg, <8 x i32>, i1 immarg, <16 x i32>, <8 x i32>, i64, i1, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x float>, i16, i1, i1)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x half>, i16, i1, i1)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.index.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.index.gfx1250.w32.ll
new file mode 100644
index 0000000000000..b8745e0ebf480
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.index.gfx1250.w32.ll
@@ -0,0 +1,690 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s --check-prefix=GFX1250
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -global-isel -global-isel-abort=2 < %s | FileCheck %s --check-prefix=GISEL
+
+define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f32_16x16x64_bf16:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: global_load_b32 v32, v[32:33], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f32_16x16x64_bf16:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: global_load_b32 v32, v[32:33], off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %IndexVecPacked = load i32, ptr addrspace(1) %IndexVecPtr, align 4
+  %IndexVec = bitcast i32 %IndexVecPacked to <2 x i16>
+  %Index = extractelement <2 x i16> %IndexVec, i32 1
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
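+
+; The tests in this file all take the sparsity index from the high half of a
+; packed value loaded from memory. Judging by the generated checks, the
+; high-half extractelement (and, in the 64-bit tests, the lshr-by-32) is
+; folded into the index_key:1 operand rather than being materialized with
+; separate shift instructions.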
+
+define amdgpu_ps void @test_swmmac_bf16_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_bf16_16x16x64_bf16:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: global_load_b32 v28, v[28:29], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 index_key:1
+; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_bf16_16x16x64_bf16:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: global_load_b32 v28, v[28:29], off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 index_key:1
+; GISEL-NEXT: global_store_b128 v[30:31], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %IndexVecPacked = load i32, ptr addrspace(1) %IndexVecPtr, align 4
+  %IndexVec = bitcast i32 %IndexVecPacked to <2 x i16>
+  %Index = extractelement <2 x i16> %IndexVec, i32 1
+  %res = call <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, i1 false, i1 false)
+  store <8 x bfloat> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_bf16f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_bf16f32_16x16x64_bf16:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: global_load_b32 v32, v[32:33], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_bf16f32_16x16x64_bf16:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: global_load_b32 v32, v[32:33], off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %IndexVecPacked = load i32, ptr addrspace(1) %IndexVecPtr, align 4
+  %IndexVec = bitcast i32 %IndexVecPacked to <2 x i16>
+  %Index = extractelement <2 x i16> %IndexVec, i32 1
+  %res = call <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1 0, <16 x bfloat> %A, i1 0, <32 x bfloat> %B, <8 x float> %C, i16 %Index, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_fp8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_fp8_i32_index:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+; GFX1250-NEXT: global_store_b64 v[34:35], v[32:33], off
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_fp8_i32_index:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+; GISEL-NEXT: global_store_b64 v[34:35], v[32:33], off
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
+  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
+  %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32>
+  %Index = extractelement <2 x i32> %IndexVec, i32 1
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8.v8f32.v8i32.v16i32.i32(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %Index, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_fp8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_fp8_i64_index:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+; GFX1250-NEXT: global_store_b64 v[34:35], v[32:33], off
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_fp8_i64_index:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+; GISEL-NEXT: global_store_b64 v[34:35], v[32:33], off
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
+  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
+  %Index = lshr i64 %IndexVecPacked, 32
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8.v8f32.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
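+
+; The remaining 16x16x128 variants repeat the pairing above for every format
+; combination: the index is passed once as the high i32 element of a <2 x i32>
+; and once as an i64 produced by lshr 32; both forms appear to select the same
+; register pair with index_key:1.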
+
+define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_bf8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_bf8_i32_index:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+; GFX1250-NEXT: global_store_b64 v[34:35], v[32:33], off
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_bf8_i32_index:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+; GISEL-NEXT: global_store_b64 v[34:35], v[32:33], off
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
+  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
+  %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32>
+  %Index = extractelement <2 x i32> %IndexVec, i32 1
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8.v8f32.v8i32.v16i32.i32(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %Index, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_bf8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_bf8_i64_index:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+; GFX1250-NEXT: global_store_b64 v[34:35], v[32:33], off
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_bf8_i64_index:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+; GISEL-NEXT: global_store_b64 v[34:35], v[32:33], off
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
+  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
+  %Index = lshr i64 %IndexVecPacked, 32
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8.v8f32.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_fp8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_fp8_i32_index:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+; GFX1250-NEXT: global_store_b64 v[34:35], v[32:33], off
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_fp8_i32_index:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+; GISEL-NEXT: global_store_b64 v[34:35], v[32:33], off
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
+  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
+  %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32>
+  %Index = extractelement <2 x i32> %IndexVec, i32 1
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8.v8f32.v8i32.v16i32.i32(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %Index, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_fp8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_fp8_i64_index:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+; GFX1250-NEXT: global_store_b64 v[34:35], v[32:33], off
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_fp8_i64_index:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+; GISEL-NEXT: global_store_b64 v[34:35], v[32:33], off
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
+  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
+  %Index = lshr i64 %IndexVecPacked, 32
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8.v8f32.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_bf8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_bf8_i32_index:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+; GFX1250-NEXT: global_store_b64 v[34:35], v[32:33], off
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_bf8_i32_index:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+; GISEL-NEXT: global_store_b64 v[34:35], v[32:33], off
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
+  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
+  %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32>
+  %Index = extractelement <2 x i32> %IndexVec, i32 1
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8.v8f32.v8i32.v16i32.i32(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %Index, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_bf8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_bf8_i64_index:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+; GFX1250-NEXT: global_store_b64 v[34:35], v[32:33], off
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_bf8_i64_index:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+; GISEL-NEXT: global_store_b64 v[34:35], v[32:33], off
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
+  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
+  %Index = lshr i64 %IndexVecPacked, 32
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8.v8f32.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_fp8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_fp8_i32_index:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: global_load_b64 v[28:29], v[28:29], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
+; GFX1250-NEXT: global_store_b64 v[30:31], v[28:29], off
+; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_fp8_i32_index:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: global_load_b64 v[28:29], v[28:29], off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
+; GISEL-NEXT: global_store_b64 v[30:31], v[28:29], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
+  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
+  %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32>
+  %Index = extractelement <2 x i32> %IndexVec, i32 1
+  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.i32(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i32 %Index, i1 false, i1 false)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_fp8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_fp8_i64_index:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: global_load_b64 v[28:29], v[28:29], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
+; GFX1250-NEXT: global_store_b64 v[30:31], v[28:29], off
+; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_fp8_i64_index:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: global_load_b64 v[28:29], v[28:29], off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
+; GISEL-NEXT: global_store_b64 v[30:31], v[28:29], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
+  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
+  %Index = lshr i64 %IndexVecPacked, 32
+  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i64 %Index, i1 false, i1 false)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_bf8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_bf8_i32_index:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: global_load_b64 v[28:29], v[28:29], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
+; GFX1250-NEXT: global_store_b64 v[30:31], v[28:29], off
+; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_bf8_i32_index:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: global_load_b64 v[28:29], v[28:29], off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
+; GISEL-NEXT: global_store_b64 v[30:31], v[28:29], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 4
+  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
+  %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32>
+  %Index = extractelement <2 x i32> %IndexVec, i32 1
+  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i32(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i32 %Index, i1 false, i1 false)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_bf8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_bf8_i64_index:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: global_load_b64 v[28:29], v[28:29], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
+; GFX1250-NEXT: global_store_b64 v[30:31], v[28:29], off
+; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_bf8_i64_index:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: global_load_b64 v[28:29], v[28:29], off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
+; GISEL-NEXT: global_store_b64 v[30:31], v[28:29], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 4
+  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
+  %Index = lshr i64 %IndexVecPacked, 32
+  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i64 %Index, i1 false, i1 false)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_fp8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_fp8_i32_index:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: global_load_b64 v[28:29], v[28:29], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
+; GFX1250-NEXT: global_store_b64 v[30:31], v[28:29], off
+; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_fp8_i32_index:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: global_load_b64 v[28:29], v[28:29], off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
+; GISEL-NEXT: global_store_b64 v[30:31], v[28:29], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
+  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
+  %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32>
+  %Index = extractelement <2 x i32> %IndexVec, i32 1
+  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.i32(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i32 %Index, i1 false, i1 false)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_fp8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_fp8_i64_index:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: global_load_b64 v[28:29], v[28:29], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
+; GFX1250-NEXT: global_store_b64 v[30:31], v[28:29], off
+; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_fp8_i64_index:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: global_load_b64 v[28:29], v[28:29], off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
+; GISEL-NEXT: global_store_b64 v[30:31], v[28:29], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
+  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
+  %Index = lshr i64 %IndexVecPacked, 32
+  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i64 %Index, i1 false, i1 false)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_bf8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_bf8_i32_index:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: global_load_b64 v[28:29], v[28:29], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
+; GFX1250-NEXT: global_store_b64 v[30:31], v[28:29], off
+; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_bf8_i32_index:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: global_load_b64 v[28:29], v[28:29], off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
+; GISEL-NEXT: global_store_b64 v[30:31], v[28:29], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
+  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
+  %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32>
+  %Index = extractelement <2 x i32> %IndexVec, i32 1
+  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i32(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i32 %Index, i1 false, i1 false)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_bf8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_bf8_i64_index:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: global_load_b64 v[28:29], v[28:29], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
+; GFX1250-NEXT: global_store_b64 v[30:31], v[28:29], off
+; GFX1250-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_bf8_i64_index:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: global_load_b64 v[28:29], v[28:29], off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
+; GISEL-NEXT: global_store_b64 v[30:31], v[28:29], off
+; GISEL-NEXT: global_store_b128 v[32:33], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
+  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
+  %Index = lshr i64 %IndexVecPacked, 32
+  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i64(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i64 %Index, i1 false, i1 false)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_i32_16x16x128_iu8_i32_index:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+; GFX1250-NEXT: global_store_b64 v[34:35], v[32:33], off
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_i32_16x16x128_iu8_i32_index:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+; GISEL-NEXT: global_store_b64 v[34:35], v[32:33], off
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
+  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
+  %IndexVec = bitcast i64 %IndexVecPacked to <2 x i32>
+  %Index = extractelement <2 x i32> %IndexVec, i32 1
+  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i32(i1 0, <8 x i32> %A, i1 0, <16 x i32> %B, <8 x i32> %C, i32 %Index, i1 false, i1 false)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_i32_16x16x128_iu8_i64_index:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: global_load_b64 v[32:33], v[32:33], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+; GFX1250-NEXT: global_store_b64 v[34:35], v[32:33], off
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[36:37], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_i32_16x16x128_iu8_i64_index:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: global_load_b64 v[32:33], v[32:33], off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
+; GISEL-NEXT: global_store_b64 v[34:35], v[32:33], off
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[36:37], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[36:37], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %IndexVecPacked = load i64, ptr addrspace(1) %IndexVecPtr, align 8
+  store i64 %IndexVecPacked, ptr addrspace(1) %IndexVecOutPtr
+  %Index = lshr i64 %IndexVecPacked, 32
+  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i64(i1 0, <8 x i32> %A, i1 0, <16 x i32> %B, <8 x i32> %C, i64 %Index, i1 false, i1 false)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f32_16x16x64_f16:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: global_load_b32 v32, v[32:33], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 index_key:1
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f32_16x16x64_f16:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: global_load_b32 v32, v[32:33], off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 index_key:1
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_store_b128 v[34:35], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[34:35], v[28:31], off offset:16
+; GISEL-NEXT: s_endpgm
+bb:
+  %IndexVecPacked = load i32, ptr addrspace(1) %IndexVecPtr, align 4
+  %IndexVec = bitcast i32 %IndexVecPacked to <2 x i16>
+  %Index = extractelement <2 x i16> %IndexVec, i32 1
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1 0, <16 x half> %A, i1 0, <32 x half> %B, <8 x float> %C, i16 %Index, i1 false, i1 false)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_swmmac_f16_16x16x64_f16:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: global_load_b32 v28, v[28:29], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 index_key:1
+; GFX1250-NEXT: global_store_b128 v[30:31], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_swmmac_f16_16x16x64_f16:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: global_load_b32 v28, v[28:29], off
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 index_key:1
+; GISEL-NEXT: global_store_b128 v[30:31], v[24:27], off
+; GISEL-NEXT: s_endpgm
+bb:
+  %IndexVecPacked = load i32, ptr addrspace(1) %IndexVecPtr, align 4
+  %IndexVec = bitcast i32 %IndexVecPacked to <2 x i16>
+  %Index = extractelement <2 x i16> %IndexVec, i32 1
+  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1 0, <16 x half> %A, i1 0, <32 x half> %B, <8 x half> %C, i16 %Index, i1 false, i1 false)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1)
+declare <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x bfloat>, i16, i1, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i16(i1, <16 x bfloat>, i1, <32 x bfloat>, <8 x float>, i16, i1, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8.v8f32.v8i32.v16i32.i32(<8 x i32>, <16 x i32>, <8 x float>, i32, i1, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8.v8f32.v8i32.v16i32.i32(<8 x i32>, <16 x i32>, <8 x float>, i32, i1, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8.v8f32.v8i32.v16i32.i32(<8 x i32>, <16 x i32>, <8 x float>, i32, i1, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8.v8f32.v8i32.v16i32.i32(<8 x i32>, <16 x i32>, <8 x float>, i32, i1, i1)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.i32(<8 x i32>, <16 x i32>, <8 x half>, i32, i1, i1)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i32(<8 x i32>, <16 x i32>, <8 x half>, i32, i1, i1)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.i32(<8 x i32>, <16 x i32>, <8 x half>, i32, i1, i1)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i32(<8 x i32>, <16 x i32>, <8 x half>, i32, i1, i1)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i32(i1 immarg, <8 x i32>, i1 immarg, <16 x i32>, <8 x i32>, i32, i1, i1)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x float>, i16, i1, i1)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i16(i1, <16 x half>, i1, <32 x half>, <8 x half>, i16, i1, i1)