diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 2e1bdf4692478..67d8715d3f1c2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -17,7 +17,6 @@ #include "AMDGPULegalizerInfo.h" #include "AMDGPURegisterBankInfo.h" #include "AMDGPUTargetMachine.h" -#include "GCNSubtarget.h" #include "R600Subtarget.h" #include "SIMachineFunctionInfo.h" #include "Utils/AMDGPUBaseInfo.h" @@ -36,308 +35,12 @@ using namespace llvm; #define DEBUG_TYPE "amdgpu-subtarget" -#define GET_SUBTARGETINFO_TARGET_DESC -#define GET_SUBTARGETINFO_CTOR -#define AMDGPUSubtarget GCNSubtarget -#include "AMDGPUGenSubtargetInfo.inc" -#undef AMDGPUSubtarget - -static cl::opt<bool> EnablePowerSched( - "amdgpu-enable-power-sched", - cl::desc("Enable scheduling to minimize mAI power bursts"), - cl::init(false)); - -static cl::opt<bool> EnableVGPRIndexMode( - "amdgpu-vgpr-index-mode", - cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), - cl::init(false)); - -static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen", - cl::desc("Enable the use of AA during codegen."), - cl::init(true)); - -static cl::opt<unsigned> NSAThreshold("amdgpu-nsa-threshold", - cl::desc("Number of addresses from which to enable MIMG NSA."), - cl::init(3), cl::Hidden); - -GCNSubtarget::~GCNSubtarget() = default; - -GCNSubtarget & -GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, - StringRef GPU, StringRef FS) { - // Determine default and user-specified characteristics - // - // We want to be able to turn these off, but making this a subtarget feature - // for SI has the unhelpful behavior that it unsets everything else if you - // disable it. - // - // Similarly we want enable-prt-strict-null to be on by default and not to - // unset everything else if it is disabled - - SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,"); - - // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by default - if (isAmdHsaOS()) - FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,"; - - FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS - - // Disable mutually exclusive bits. - if (FS.contains_insensitive("+wavefrontsize")) { - if (!FS.contains_insensitive("wavefrontsize16")) - FullFS += "-wavefrontsize16,"; - if (!FS.contains_insensitive("wavefrontsize32")) - FullFS += "-wavefrontsize32,"; - if (!FS.contains_insensitive("wavefrontsize64")) - FullFS += "-wavefrontsize64,"; - } - - FullFS += FS; - - ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS); - - // Implement the "generic" processors, which acts as the default when no - // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to - // the first amdgcn target that supports flat addressing. Other OSes defaults - // to the first amdgcn target. - if (Gen == AMDGPUSubtarget::INVALID) { - Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS - : AMDGPUSubtarget::SOUTHERN_ISLANDS; - } - - if (!hasFeature(AMDGPU::FeatureWavefrontSize32) && - !hasFeature(AMDGPU::FeatureWavefrontSize64)) { - // If there is no default wave size it must be a generation before gfx10, - // these have FeatureWavefrontSize64 in their definition already. For gfx10+ - // set wave32 as a default. - ToggleFeature(AMDGPU::FeatureWavefrontSize32); - } - - // We don't support FP64 for EG/NI atm.
- assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)); - - // Targets must either support 64-bit offsets for MUBUF instructions, and/or - // support flat operations, otherwise they cannot access a 64-bit global - // address space - assert(hasAddr64() || hasFlat()); - // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets - // that do not support ADDR64 variants of MUBUF instructions. Such targets - // cannot use a 64 bit offset with a MUBUF instruction to access the global - // address space - if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) { - ToggleFeature(AMDGPU::FeatureFlatForGlobal); - FlatForGlobal = true; - } - // Unless +-flat-for-global is specified, use MUBUF instructions for global - // address space access if flat operations are not available. - if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) { - ToggleFeature(AMDGPU::FeatureFlatForGlobal); - FlatForGlobal = false; - } - - // Set defaults if needed. - if (MaxPrivateElementSize == 0) - MaxPrivateElementSize = 4; - - if (LDSBankCount == 0) - LDSBankCount = 32; - - if (TT.getArch() == Triple::amdgcn) { - if (LocalMemorySize == 0) - LocalMemorySize = 32768; - - // Do something sensible for unspecified target. - if (!HasMovrel && !HasVGPRIndexMode) - HasMovrel = true; - } - - AddressableLocalMemorySize = LocalMemorySize; - - if (AMDGPU::isGFX10Plus(*this) && - !getFeatureBits().test(AMDGPU::FeatureCuMode)) - LocalMemorySize *= 2; - - // Don't crash on invalid devices. - if (WavefrontSizeLog2 == 0) - WavefrontSizeLog2 = 5; - - HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; - HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9; - - TargetID.setTargetIDFromFeaturesString(FS); - - LLVM_DEBUG(dbgs() << "xnack setting for subtarget: " - << TargetID.getXnackSetting() << '\n'); - LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: " - << TargetID.getSramEccSetting() << '\n'); - - return *this; -} - -void GCNSubtarget::checkSubtargetFeatures(const Function &F) const { - LLVMContext &Ctx = F.getContext(); - if (hasFeature(AMDGPU::FeatureWavefrontSize32) == - hasFeature(AMDGPU::FeatureWavefrontSize64)) { - Ctx.diagnose(DiagnosticInfoUnsupported( - F, "must specify exactly one of wavefrontsize32 and wavefrontsize64")); - } -} - AMDGPUSubtarget::AMDGPUSubtarget(Triple TT) : TargetTriple(std::move(TT)) {} bool AMDGPUSubtarget::useRealTrue16Insts() const { return hasTrue16BitInsts() && EnableRealTrue16Insts; } -GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, - const GCNTargetMachine &TM) - : // clang-format off - AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS), - AMDGPUSubtarget(TT), - TargetTriple(TT), - TargetID(*this), - InstrItins(getInstrItineraryForCPU(GPU)), - InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), - TLInfo(TM, *this), - FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) { - // clang-format on - MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this); - EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this); - CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering()); - InlineAsmLoweringInfo = - std::make_unique<InlineAsmLowering>(getTargetLowering()); - Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM); - RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this); - InstSelector = - std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM); -} - -unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const { - if (getGeneration() < GFX10) - return 1; - - switch (Opcode) { - case AMDGPU::V_LSHLREV_B64_e64: - case 
AMDGPU::V_LSHLREV_B64_gfx10: - case AMDGPU::V_LSHLREV_B64_e64_gfx11: - case AMDGPU::V_LSHLREV_B64_e32_gfx12: - case AMDGPU::V_LSHLREV_B64_e64_gfx12: - case AMDGPU::V_LSHL_B64_e64: - case AMDGPU::V_LSHRREV_B64_e64: - case AMDGPU::V_LSHRREV_B64_gfx10: - case AMDGPU::V_LSHRREV_B64_e64_gfx11: - case AMDGPU::V_LSHRREV_B64_e64_gfx12: - case AMDGPU::V_LSHR_B64_e64: - case AMDGPU::V_ASHRREV_I64_e64: - case AMDGPU::V_ASHRREV_I64_gfx10: - case AMDGPU::V_ASHRREV_I64_e64_gfx11: - case AMDGPU::V_ASHRREV_I64_e64_gfx12: - case AMDGPU::V_ASHR_I64_e64: - return 1; - } - - return 2; -} - -/// This list was mostly derived from experimentation. -bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const { - switch (Opcode) { - case AMDGPU::V_CVT_F16_F32_e32: - case AMDGPU::V_CVT_F16_F32_e64: - case AMDGPU::V_CVT_F16_U16_e32: - case AMDGPU::V_CVT_F16_U16_e64: - case AMDGPU::V_CVT_F16_I16_e32: - case AMDGPU::V_CVT_F16_I16_e64: - case AMDGPU::V_RCP_F16_e64: - case AMDGPU::V_RCP_F16_e32: - case AMDGPU::V_RSQ_F16_e64: - case AMDGPU::V_RSQ_F16_e32: - case AMDGPU::V_SQRT_F16_e64: - case AMDGPU::V_SQRT_F16_e32: - case AMDGPU::V_LOG_F16_e64: - case AMDGPU::V_LOG_F16_e32: - case AMDGPU::V_EXP_F16_e64: - case AMDGPU::V_EXP_F16_e32: - case AMDGPU::V_SIN_F16_e64: - case AMDGPU::V_SIN_F16_e32: - case AMDGPU::V_COS_F16_e64: - case AMDGPU::V_COS_F16_e32: - case AMDGPU::V_FLOOR_F16_e64: - case AMDGPU::V_FLOOR_F16_e32: - case AMDGPU::V_CEIL_F16_e64: - case AMDGPU::V_CEIL_F16_e32: - case AMDGPU::V_TRUNC_F16_e64: - case AMDGPU::V_TRUNC_F16_e32: - case AMDGPU::V_RNDNE_F16_e64: - case AMDGPU::V_RNDNE_F16_e32: - case AMDGPU::V_FRACT_F16_e64: - case AMDGPU::V_FRACT_F16_e32: - case AMDGPU::V_FREXP_MANT_F16_e64: - case AMDGPU::V_FREXP_MANT_F16_e32: - case AMDGPU::V_FREXP_EXP_I16_F16_e64: - case AMDGPU::V_FREXP_EXP_I16_F16_e32: - case AMDGPU::V_LDEXP_F16_e64: - case AMDGPU::V_LDEXP_F16_e32: - case AMDGPU::V_LSHLREV_B16_e64: - case AMDGPU::V_LSHLREV_B16_e32: - case AMDGPU::V_LSHRREV_B16_e64: - case AMDGPU::V_LSHRREV_B16_e32: - case AMDGPU::V_ASHRREV_I16_e64: - case AMDGPU::V_ASHRREV_I16_e32: - case AMDGPU::V_ADD_U16_e64: - case AMDGPU::V_ADD_U16_e32: - case AMDGPU::V_SUB_U16_e64: - case AMDGPU::V_SUB_U16_e32: - case AMDGPU::V_SUBREV_U16_e64: - case AMDGPU::V_SUBREV_U16_e32: - case AMDGPU::V_MUL_LO_U16_e64: - case AMDGPU::V_MUL_LO_U16_e32: - case AMDGPU::V_ADD_F16_e64: - case AMDGPU::V_ADD_F16_e32: - case AMDGPU::V_SUB_F16_e64: - case AMDGPU::V_SUB_F16_e32: - case AMDGPU::V_SUBREV_F16_e64: - case AMDGPU::V_SUBREV_F16_e32: - case AMDGPU::V_MUL_F16_e64: - case AMDGPU::V_MUL_F16_e32: - case AMDGPU::V_MAX_F16_e64: - case AMDGPU::V_MAX_F16_e32: - case AMDGPU::V_MIN_F16_e64: - case AMDGPU::V_MIN_F16_e32: - case AMDGPU::V_MAX_U16_e64: - case AMDGPU::V_MAX_U16_e32: - case AMDGPU::V_MIN_U16_e64: - case AMDGPU::V_MIN_U16_e32: - case AMDGPU::V_MAX_I16_e64: - case AMDGPU::V_MAX_I16_e32: - case AMDGPU::V_MIN_I16_e64: - case AMDGPU::V_MIN_I16_e32: - case AMDGPU::V_MAD_F16_e64: - case AMDGPU::V_MAD_U16_e64: - case AMDGPU::V_MAD_I16_e64: - case AMDGPU::V_FMA_F16_e64: - case AMDGPU::V_DIV_FIXUP_F16_e64: - // On gfx10, all 16-bit instructions preserve the high bits. - return getGeneration() <= AMDGPUSubtarget::GFX9; - case AMDGPU::V_MADAK_F16: - case AMDGPU::V_MADMK_F16: - case AMDGPU::V_MAC_F16_e64: - case AMDGPU::V_MAC_F16_e32: - case AMDGPU::V_FMAMK_F16: - case AMDGPU::V_FMAAK_F16: - case AMDGPU::V_FMAC_F16_e64: - case AMDGPU::V_FMAC_F16_e32: - // In gfx9, the preferred handling of the unused high 16-bits changed. 
Most - // instructions maintain the legacy behavior of 0ing. Some instructions - // changed to preserving the high bits. - return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS; - case AMDGPU::V_MAD_MIXLO_F16: - case AMDGPU::V_MAD_MIXHI_F16: - default: - return false; - } -} - // Returns the maximum per-workgroup LDS allocation size (in bytes) that still // allows the given function to achieve an occupancy of NWaves waves per // SIMD / EU, taking into account only the function's *maximum* workgroup size. @@ -650,391 +353,6 @@ AMDGPUDwarfFlavour AMDGPUSubtarget::getAMDGPUDwarfFlavour() const { : AMDGPUDwarfFlavour::Wave64; } -void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, - unsigned NumRegionInstrs) const { - // Track register pressure so the scheduler can try to decrease - // pressure once register usage is above the threshold defined by - // SIRegisterInfo::getRegPressureSetLimit() - Policy.ShouldTrackPressure = true; - - // Enabling both top down and bottom up scheduling seems to give us less - // register spills than just using one of these approaches on its own. - Policy.OnlyTopDown = false; - Policy.OnlyBottomUp = false; - - // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler. - if (!enableSIScheduler()) - Policy.ShouldTrackLaneMasks = true; -} - -void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const { - if (isWave32()) { - // Fix implicit $vcc operands after MIParser has verified that they match - // the instruction definitions. - for (auto &MBB : MF) { - for (auto &MI : MBB) - InstrInfo.fixImplicitOperands(MI); - } - } -} - -bool GCNSubtarget::hasMadF16() const { - return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1; -} - -bool GCNSubtarget::useVGPRIndexMode() const { - return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode()); -} - -bool GCNSubtarget::useAA() const { return UseAA; } - -unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { - return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(), - getGeneration()); -} - -unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const { - return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs); -} - -unsigned -GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const { - if (getGeneration() >= AMDGPUSubtarget::GFX10) - return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs. - - if (HasFlatScratch || HasArchitectedFlatScratch) { - if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) - return 6; // FLAT_SCRATCH, XNACK, VCC (in that order). - if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) - return 4; // FLAT_SCRATCH, VCC (in that order). - } - - if (isXNACKEnabled()) - return 4; // XNACK, VCC (in that order). - return 2; // VCC. -} - -unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { - const SIMachineFunctionInfo &MFI = *MF.getInfo(); - return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit()); -} - -unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const { - // In principle we do not need to reserve SGPR pair used for flat_scratch if - // we know flat instructions do not access the stack anywhere in the - // program. For now assume it's needed if we have flat instructions. 
- const bool KernelUsesFlatScratch = hasFlatAddressSpace(); - return getBaseReservedNumSGPRs(KernelUsesFlatScratch); -} - -unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize, - unsigned NumSGPRs, - unsigned NumVGPRs) const { - unsigned Occupancy = - std::min(getMaxWavesPerEU(), - getOccupancyWithLocalMemSize(LDSSize, F)); - if (NumSGPRs) - Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs)); - if (NumVGPRs) - Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs)); - return Occupancy; -} - -unsigned GCNSubtarget::getBaseMaxNumSGPRs( - const Function &F, std::pair<unsigned, unsigned> WavesPerEU, - unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const { - // Compute maximum number of SGPRs function can use using default/requested - // minimum number of waves per execution unit. - unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false); - unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true); - - // Check if maximum number of SGPRs was explicitly requested using - // "amdgpu-num-sgpr" attribute. - if (F.hasFnAttribute("amdgpu-num-sgpr")) { - unsigned Requested = - F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs); - - // Make sure requested value does not violate subtarget's specifications. - if (Requested && (Requested <= ReservedNumSGPRs)) - Requested = 0; - - // If more SGPRs are required to support the input user/system SGPRs, - // increase to accommodate them. - // - // FIXME: This really ends up using the requested number of SGPRs + number - // of reserved special registers in total. Theoretically you could re-use - // the last input registers for these special registers, but this would - // require a lot of complexity to deal with the weird aliasing. - unsigned InputNumSGPRs = PreloadedSGPRs; - if (Requested && Requested < InputNumSGPRs) - Requested = InputNumSGPRs; - - // Make sure requested value is compatible with values implied by - // default/requested minimum/maximum number of waves per execution unit.
- if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false)) - Requested = 0; - if (WavesPerEU.second && - Requested && Requested < getMinNumSGPRs(WavesPerEU.second)) - Requested = 0; - - if (Requested) - MaxNumSGPRs = Requested; - } - - if (hasSGPRInitBug()) - MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; - - return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs); -} - -unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { - const Function &F = MF.getFunction(); - const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); - return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(), - getReservedNumSGPRs(MF)); -} - -static unsigned getMaxNumPreloadedSGPRs() { - using USI = GCNUserSGPRUsageInfo; - // Max number of user SGPRs - const unsigned MaxUserSGPRs = - USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) + - USI::getNumUserSGPRForField(USI::DispatchPtrID) + - USI::getNumUserSGPRForField(USI::QueuePtrID) + - USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) + - USI::getNumUserSGPRForField(USI::DispatchIdID) + - USI::getNumUserSGPRForField(USI::FlatScratchInitID) + - USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID); - - // Max number of system SGPRs - const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX - 1 + // WorkGroupIDY - 1 + // WorkGroupIDZ - 1 + // WorkGroupInfo - 1; // private segment wave byte offset - - // Max number of synthetic SGPRs - const unsigned SyntheticSGPRs = 1; // LDSKernelId - - return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs; -} - -unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const { - return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(), - getReservedNumSGPRs(F)); -} - -unsigned GCNSubtarget::getBaseMaxNumVGPRs( - const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const { - // Compute maximum number of VGPRs function can use using default/requested - // minimum number of waves per execution unit. - unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first); - - // Check if maximum number of VGPRs was explicitly requested using - // "amdgpu-num-vgpr" attribute. - if (F.hasFnAttribute("amdgpu-num-vgpr")) { - unsigned Requested = - F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs); - - if (hasGFX90AInsts()) - Requested *= 2; - - // Make sure requested value is compatible with values implied by - // default/requested minimum/maximum number of waves per execution unit.
- if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first)) - Requested = 0; - if (WavesPerEU.second && - Requested && Requested < getMinNumVGPRs(WavesPerEU.second)) - Requested = 0; - - if (Requested) - MaxNumVGPRs = Requested; - } - - return MaxNumVGPRs; -} - -unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const { - return getBaseMaxNumVGPRs(F, getWavesPerEU(F)); -} - -unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { - const Function &F = MF.getFunction(); - const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); - return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU()); -} - -void GCNSubtarget::adjustSchedDependency( - SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep, - const TargetSchedModel *SchedModel) const { - if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || - !Def->isInstr() || !Use->isInstr()) - return; - - MachineInstr *DefI = Def->getInstr(); - MachineInstr *UseI = Use->getInstr(); - - if (DefI->isBundle()) { - const SIRegisterInfo *TRI = getRegisterInfo(); - auto Reg = Dep.getReg(); - MachineBasicBlock::const_instr_iterator I(DefI->getIterator()); - MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end()); - unsigned Lat = 0; - for (++I; I != E && I->isBundledWithPred(); ++I) { - if (I->modifiesRegister(Reg, TRI)) - Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I); - else if (Lat) - --Lat; - } - Dep.setLatency(Lat); - } else if (UseI->isBundle()) { - const SIRegisterInfo *TRI = getRegisterInfo(); - auto Reg = Dep.getReg(); - MachineBasicBlock::const_instr_iterator I(UseI->getIterator()); - MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end()); - unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI); - for (++I; I != E && I->isBundledWithPred() && Lat; ++I) { - if (I->readsRegister(Reg, TRI)) - break; - --Lat; - } - Dep.setLatency(Lat); - } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) { - // Work around the fact that SIInstrInfo::fixImplicitOperands modifies - // implicit operands which come from the MCInstrDesc, which can fool - // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit - // pseudo operands. - Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency( - DefI, DefOpIdx, UseI, UseOpIdx)); - } -} - -namespace { -struct FillMFMAShadowMutation : ScheduleDAGMutation { - const SIInstrInfo *TII; - - ScheduleDAGMI *DAG; - - FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {} - - bool isSALU(const SUnit *SU) const { - const MachineInstr *MI = SU->getInstr(); - return MI && TII->isSALU(*MI) && !MI->isTerminator(); - } - - bool isVALU(const SUnit *SU) const { - const MachineInstr *MI = SU->getInstr(); - return MI && TII->isVALU(*MI); - } - - // Link as many SALU instructions in chain as possible. Return the size - // of the chain. Links up to MaxChain instructions.
- unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain, - SmallPtrSetImpl<SUnit *> &Visited) const { - SmallVector<SUnit *, 8> Worklist({To}); - unsigned Linked = 0; - - while (!Worklist.empty() && MaxChain-- > 0) { - SUnit *SU = Worklist.pop_back_val(); - if (!Visited.insert(SU).second) - continue; - - LLVM_DEBUG(dbgs() << "Inserting edge from\n" ; DAG->dumpNode(*From); - dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n'); - - if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From)) - if (DAG->addEdge(SU, SDep(From, SDep::Artificial))) - ++Linked; - - for (SDep &SI : From->Succs) { - SUnit *SUv = SI.getSUnit(); - if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) && - DAG->canAddEdge(SUv, SU)) - DAG->addEdge(SUv, SDep(SU, SDep::Artificial)); - } - - for (SDep &SI : SU->Succs) { - SUnit *Succ = SI.getSUnit(); - if (Succ != SU && isSALU(Succ)) - Worklist.push_back(Succ); - } - } - - return Linked; - } - - void apply(ScheduleDAGInstrs *DAGInstrs) override { - const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>(); - if (!ST.hasMAIInsts()) - return; - DAG = static_cast<ScheduleDAGMI *>(DAGInstrs); - const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); - if (!TSchedModel || DAG->SUnits.empty()) - return; - - // Scan for MFMA long latency instructions and try to add a dependency - // of available SALU instructions to give them a chance to fill MFMA - // shadow. That is desirable to fill MFMA shadow with SALU instructions - // rather than VALU to prevent power consumption bursts and throttle. - auto LastSALU = DAG->SUnits.begin(); - auto E = DAG->SUnits.end(); - SmallPtrSet<SUnit *, 32> Visited; - for (SUnit &SU : DAG->SUnits) { - MachineInstr &MAI = *SU.getInstr(); - if (!TII->isMAI(MAI) || - MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 || - MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64) - continue; - - unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1; - - LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU); - dbgs() << "Need " << Lat - << " instructions to cover latency.\n"); - - // Find up to Lat independent scalar instructions as early as - // possible such that they can be scheduled after this MFMA. - for ( ; Lat && LastSALU != E; ++LastSALU) { - if (Visited.count(&*LastSALU)) - continue; - - if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) || - !DAG->canAddEdge(&*LastSALU, &SU)) - continue; - - Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited); - } - } - } -}; -} // namespace - -void GCNSubtarget::getPostRAMutations( - std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { - Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo)); -} - -std::unique_ptr<ScheduleDAGMutation> -GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const { - return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo) - : nullptr; -} - -unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const { - if (getGeneration() >= AMDGPUSubtarget::GFX12) - return 0; // Not MIMG encoding.
- - if (NSAThreshold.getNumOccurrences() > 0) - return std::max(NSAThreshold.getValue(), 2u); - - int Value = MF.getFunction().getFnAttributeAsParsedInteger( - "amdgpu-nsa-threshold", -1); - if (Value > 0) - return std::max(Value, 2); - - return 3; -} - const AMDGPUSubtarget &AMDGPUSubtarget::get(const MachineFunction &MF) { if (MF.getTarget().getTargetTriple().getArch() == Triple::amdgcn) return static_cast<const AMDGPUSubtarget &>(MF.getSubtarget<GCNSubtarget>()); @@ -1048,85 +366,6 @@ const AMDGPUSubtarget &AMDGPUSubtarget::get(const TargetMachine &TM, const Funct TM.getSubtarget<R600Subtarget>(F)); } -GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, - const GCNSubtarget &ST) - : ST(ST) { - const CallingConv::ID CC = F.getCallingConv(); - const bool IsKernel = - CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL; - // FIXME: Should have analysis or something rather than attribute to detect - // calls. - const bool HasCalls = F.hasFnAttribute("amdgpu-calls"); - // FIXME: This attribute is a hack, we just need an analysis on the function - // to look for allocas. - const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects"); - - if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0)) - KernargSegmentPtr = true; - - bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F); - if (IsAmdHsaOrMesa && !ST.enableFlatScratch()) - PrivateSegmentBuffer = true; - else if (ST.isMesaGfxShader(F)) - ImplicitBufferPtr = true; - - if (!AMDGPU::isGraphics(CC)) { - if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr")) - DispatchPtr = true; - - // FIXME: Can this always be disabled with < COv5? - if (!F.hasFnAttribute("amdgpu-no-queue-ptr")) - QueuePtr = true; - - if (!F.hasFnAttribute("amdgpu-no-dispatch-id")) - DispatchID = true; - } - - // TODO: This could be refined a lot. The attribute is a poor way of - // detecting calls or stack objects that may require it before argument - // lowering.
- if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) && - (IsAmdHsaOrMesa || ST.enableFlatScratch()) && - (HasCalls || HasStackObjects || ST.enableFlatScratch()) && - !ST.flatScratchIsArchitected()) { - FlatScratchInit = true; - } - - if (hasImplicitBufferPtr()) - NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID); - - if (hasPrivateSegmentBuffer()) - NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID); - - if (hasDispatchPtr()) - NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID); - - if (hasQueuePtr()) - NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID); - - if (hasKernargSegmentPtr()) - NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID); - - if (hasDispatchID()) - NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID); - - if (hasFlatScratchInit()) - NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID); - - if (hasPrivateSegmentSize()) - NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID); -} - -void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) { - assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST)); - NumKernargPreloadSGPRs += NumSGPRs; - NumUsedUserSGPRs += NumSGPRs; -} - -unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() { - return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs; -} - SmallVector<unsigned> AMDGPUSubtarget::getMaxNumWorkGroups(const Function &F) const { return AMDGPU::getIntegerVecAttribute(F, "amdgpu-max-num-workgroups", 3); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 85a59e0123023..18a8e917fbb71 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -119,6 +119,7 @@ add_llvm_target(AMDGPUCodeGen GCNRegPressure.cpp GCNRewritePartialRegUses.cpp GCNSchedStrategy.cpp + GCNSubtarget.cpp GCNVOPDUtils.cpp R600AsmPrinter.cpp R600ClauseMergePass.cpp diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp new file mode 100644 index 0000000000000..b3872a6374261 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -0,0 +1,797 @@ +//===-- GCNSubtarget.cpp - GCN Subtarget Information ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Implements the GCN specific subclass of TargetSubtarget.
+// +//===----------------------------------------------------------------------===// + +#include "GCNSubtarget.h" +#include "AMDGPUCallLowering.h" +#include "AMDGPUInstructionSelector.h" +#include "AMDGPULegalizerInfo.h" +#include "AMDGPURegisterBankInfo.h" +#include "AMDGPUTargetMachine.h" +#include "R600Subtarget.h" +#include "SIMachineFunctionInfo.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h" +#include "llvm/CodeGen/MachineScheduler.h" +#include "llvm/CodeGen/TargetFrameLowering.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/IntrinsicsAMDGPU.h" +#include "llvm/IR/IntrinsicsR600.h" +#include "llvm/IR/MDBuilder.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include <algorithm> + +using namespace llvm; + +#define DEBUG_TYPE "gcn-subtarget" + +#define GET_SUBTARGETINFO_TARGET_DESC +#define GET_SUBTARGETINFO_CTOR +#define AMDGPUSubtarget GCNSubtarget +#include "AMDGPUGenSubtargetInfo.inc" +#undef AMDGPUSubtarget + +static cl::opt<bool> + EnablePowerSched("amdgpu-enable-power-sched", + cl::desc("Enable scheduling to minimize mAI power bursts"), + cl::init(false)); + +static cl::opt<bool> EnableVGPRIndexMode( + "amdgpu-vgpr-index-mode", + cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), + cl::init(false)); + +static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen", + cl::desc("Enable the use of AA during codegen."), + cl::init(true)); + +static cl::opt<unsigned> + NSAThreshold("amdgpu-nsa-threshold", + cl::desc("Number of addresses from which to enable MIMG NSA."), + cl::init(3), cl::Hidden); + +GCNSubtarget::~GCNSubtarget() = default; + +GCNSubtarget &GCNSubtarget::initializeSubtargetDependencies(const Triple &TT, + StringRef GPU, + StringRef FS) { + // Determine default and user-specified characteristics + // + // We want to be able to turn these off, but making this a subtarget feature + // for SI has the unhelpful behavior that it unsets everything else if you + // disable it. + // + // Similarly we want enable-prt-strict-null to be on by default and not to + // unset everything else if it is disabled + + SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,"); + + // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by + // default + if (isAmdHsaOS()) + FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,"; + + FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS + + // Disable mutually exclusive bits. + if (FS.contains_insensitive("+wavefrontsize")) { + if (!FS.contains_insensitive("wavefrontsize16")) + FullFS += "-wavefrontsize16,"; + if (!FS.contains_insensitive("wavefrontsize32")) + FullFS += "-wavefrontsize32,"; + if (!FS.contains_insensitive("wavefrontsize64")) + FullFS += "-wavefrontsize64,"; + } + + FullFS += FS; + + ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS); + + // Implement the "generic" processors, which acts as the default when no + // generation features are enabled (e.g for -mcpu=''). HSA OS defaults to + // the first amdgcn target that supports flat addressing. Other OSes defaults + // to the first amdgcn target. + if (Gen == AMDGPUSubtarget::INVALID) { + Gen = TT.getOS() == Triple::AMDHSA ? AMDGPUSubtarget::SEA_ISLANDS + : AMDGPUSubtarget::SOUTHERN_ISLANDS; + } + + if (!hasFeature(AMDGPU::FeatureWavefrontSize32) && + !hasFeature(AMDGPU::FeatureWavefrontSize64)) { + // If there is no default wave size it must be a generation before gfx10, + // these have FeatureWavefrontSize64 in their definition already.
For gfx10+ + set wave32 as a default. + ToggleFeature(AMDGPU::FeatureWavefrontSize32); + } + + // We don't support FP64 for EG/NI atm. + assert(!hasFP64() || (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS)); + + // Targets must either support 64-bit offsets for MUBUF instructions, and/or + // support flat operations, otherwise they cannot access a 64-bit global + // address space + assert(hasAddr64() || hasFlat()); + // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets + // that do not support ADDR64 variants of MUBUF instructions. Such targets + // cannot use a 64 bit offset with a MUBUF instruction to access the global + // address space + if (!hasAddr64() && !FS.contains("flat-for-global") && !FlatForGlobal) { + ToggleFeature(AMDGPU::FeatureFlatForGlobal); + FlatForGlobal = true; + } + // Unless +-flat-for-global is specified, use MUBUF instructions for global + // address space access if flat operations are not available. + if (!hasFlat() && !FS.contains("flat-for-global") && FlatForGlobal) { + ToggleFeature(AMDGPU::FeatureFlatForGlobal); + FlatForGlobal = false; + } + + // Set defaults if needed. + if (MaxPrivateElementSize == 0) + MaxPrivateElementSize = 4; + + if (LDSBankCount == 0) + LDSBankCount = 32; + + if (TT.getArch() == Triple::amdgcn) { + if (LocalMemorySize == 0) + LocalMemorySize = 32768; + + // Do something sensible for unspecified target. + if (!HasMovrel && !HasVGPRIndexMode) + HasMovrel = true; + } + + AddressableLocalMemorySize = LocalMemorySize; + + if (AMDGPU::isGFX10Plus(*this) && + !getFeatureBits().test(AMDGPU::FeatureCuMode)) + LocalMemorySize *= 2; + + // Don't crash on invalid devices. + if (WavefrontSizeLog2 == 0) + WavefrontSizeLog2 = 5; + + HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS; + HasSMulHi = getGeneration() >= AMDGPUSubtarget::GFX9; + + TargetID.setTargetIDFromFeaturesString(FS); + + LLVM_DEBUG(dbgs() << "xnack setting for subtarget: " + << TargetID.getXnackSetting() << '\n'); + LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: " + << TargetID.getSramEccSetting() << '\n'); + + return *this; +} + +void GCNSubtarget::checkSubtargetFeatures(const Function &F) const { + LLVMContext &Ctx = F.getContext(); + if (hasFeature(AMDGPU::FeatureWavefrontSize32) == + hasFeature(AMDGPU::FeatureWavefrontSize64)) { + Ctx.diagnose(DiagnosticInfoUnsupported( + F, "must specify exactly one of wavefrontsize32 and wavefrontsize64")); + } +} + +GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, + const GCNTargetMachine &TM) + : // clang-format off + AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS), + AMDGPUSubtarget(TT), + TargetTriple(TT), + TargetID(*this), + InstrItins(getInstrItineraryForCPU(GPU)), + InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)), + TLInfo(TM, *this), + FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) { + // clang-format on + MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this); + EUsPerCU = AMDGPU::IsaInfo::getEUsPerCU(this); + CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering()); + InlineAsmLoweringInfo = + std::make_unique<InlineAsmLowering>(getTargetLowering()); + Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM); + RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this); + InstSelector = + std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo, TM); +} + +unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const { + if (getGeneration() < GFX10) + return 1; + + switch (Opcode) { + case AMDGPU::V_LSHLREV_B64_e64: + case AMDGPU::V_LSHLREV_B64_gfx10: + case 
AMDGPU::V_LSHLREV_B64_e64_gfx11: + case AMDGPU::V_LSHLREV_B64_e32_gfx12: + case AMDGPU::V_LSHLREV_B64_e64_gfx12: + case AMDGPU::V_LSHL_B64_e64: + case AMDGPU::V_LSHRREV_B64_e64: + case AMDGPU::V_LSHRREV_B64_gfx10: + case AMDGPU::V_LSHRREV_B64_e64_gfx11: + case AMDGPU::V_LSHRREV_B64_e64_gfx12: + case AMDGPU::V_LSHR_B64_e64: + case AMDGPU::V_ASHRREV_I64_e64: + case AMDGPU::V_ASHRREV_I64_gfx10: + case AMDGPU::V_ASHRREV_I64_e64_gfx11: + case AMDGPU::V_ASHRREV_I64_e64_gfx12: + case AMDGPU::V_ASHR_I64_e64: + return 1; + } + + return 2; +} + +/// This list was mostly derived from experimentation. +bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const { + switch (Opcode) { + case AMDGPU::V_CVT_F16_F32_e32: + case AMDGPU::V_CVT_F16_F32_e64: + case AMDGPU::V_CVT_F16_U16_e32: + case AMDGPU::V_CVT_F16_U16_e64: + case AMDGPU::V_CVT_F16_I16_e32: + case AMDGPU::V_CVT_F16_I16_e64: + case AMDGPU::V_RCP_F16_e64: + case AMDGPU::V_RCP_F16_e32: + case AMDGPU::V_RSQ_F16_e64: + case AMDGPU::V_RSQ_F16_e32: + case AMDGPU::V_SQRT_F16_e64: + case AMDGPU::V_SQRT_F16_e32: + case AMDGPU::V_LOG_F16_e64: + case AMDGPU::V_LOG_F16_e32: + case AMDGPU::V_EXP_F16_e64: + case AMDGPU::V_EXP_F16_e32: + case AMDGPU::V_SIN_F16_e64: + case AMDGPU::V_SIN_F16_e32: + case AMDGPU::V_COS_F16_e64: + case AMDGPU::V_COS_F16_e32: + case AMDGPU::V_FLOOR_F16_e64: + case AMDGPU::V_FLOOR_F16_e32: + case AMDGPU::V_CEIL_F16_e64: + case AMDGPU::V_CEIL_F16_e32: + case AMDGPU::V_TRUNC_F16_e64: + case AMDGPU::V_TRUNC_F16_e32: + case AMDGPU::V_RNDNE_F16_e64: + case AMDGPU::V_RNDNE_F16_e32: + case AMDGPU::V_FRACT_F16_e64: + case AMDGPU::V_FRACT_F16_e32: + case AMDGPU::V_FREXP_MANT_F16_e64: + case AMDGPU::V_FREXP_MANT_F16_e32: + case AMDGPU::V_FREXP_EXP_I16_F16_e64: + case AMDGPU::V_FREXP_EXP_I16_F16_e32: + case AMDGPU::V_LDEXP_F16_e64: + case AMDGPU::V_LDEXP_F16_e32: + case AMDGPU::V_LSHLREV_B16_e64: + case AMDGPU::V_LSHLREV_B16_e32: + case AMDGPU::V_LSHRREV_B16_e64: + case AMDGPU::V_LSHRREV_B16_e32: + case AMDGPU::V_ASHRREV_I16_e64: + case AMDGPU::V_ASHRREV_I16_e32: + case AMDGPU::V_ADD_U16_e64: + case AMDGPU::V_ADD_U16_e32: + case AMDGPU::V_SUB_U16_e64: + case AMDGPU::V_SUB_U16_e32: + case AMDGPU::V_SUBREV_U16_e64: + case AMDGPU::V_SUBREV_U16_e32: + case AMDGPU::V_MUL_LO_U16_e64: + case AMDGPU::V_MUL_LO_U16_e32: + case AMDGPU::V_ADD_F16_e64: + case AMDGPU::V_ADD_F16_e32: + case AMDGPU::V_SUB_F16_e64: + case AMDGPU::V_SUB_F16_e32: + case AMDGPU::V_SUBREV_F16_e64: + case AMDGPU::V_SUBREV_F16_e32: + case AMDGPU::V_MUL_F16_e64: + case AMDGPU::V_MUL_F16_e32: + case AMDGPU::V_MAX_F16_e64: + case AMDGPU::V_MAX_F16_e32: + case AMDGPU::V_MIN_F16_e64: + case AMDGPU::V_MIN_F16_e32: + case AMDGPU::V_MAX_U16_e64: + case AMDGPU::V_MAX_U16_e32: + case AMDGPU::V_MIN_U16_e64: + case AMDGPU::V_MIN_U16_e32: + case AMDGPU::V_MAX_I16_e64: + case AMDGPU::V_MAX_I16_e32: + case AMDGPU::V_MIN_I16_e64: + case AMDGPU::V_MIN_I16_e32: + case AMDGPU::V_MAD_F16_e64: + case AMDGPU::V_MAD_U16_e64: + case AMDGPU::V_MAD_I16_e64: + case AMDGPU::V_FMA_F16_e64: + case AMDGPU::V_DIV_FIXUP_F16_e64: + // On gfx10, all 16-bit instructions preserve the high bits. + return getGeneration() <= AMDGPUSubtarget::GFX9; + case AMDGPU::V_MADAK_F16: + case AMDGPU::V_MADMK_F16: + case AMDGPU::V_MAC_F16_e64: + case AMDGPU::V_MAC_F16_e32: + case AMDGPU::V_FMAMK_F16: + case AMDGPU::V_FMAAK_F16: + case AMDGPU::V_FMAC_F16_e64: + case AMDGPU::V_FMAC_F16_e32: + // In gfx9, the preferred handling of the unused high 16-bits changed. Most + // instructions maintain the legacy behavior of 0ing. 
Some instructions + changed to preserving the high bits. + return getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS; + case AMDGPU::V_MAD_MIXLO_F16: + case AMDGPU::V_MAD_MIXHI_F16: + default: + return false; + } +} + +void GCNSubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, + unsigned NumRegionInstrs) const { + // Track register pressure so the scheduler can try to decrease + // pressure once register usage is above the threshold defined by + // SIRegisterInfo::getRegPressureSetLimit() + Policy.ShouldTrackPressure = true; + + // Enabling both top down and bottom up scheduling seems to give us less + // register spills than just using one of these approaches on its own. + Policy.OnlyTopDown = false; + Policy.OnlyBottomUp = false; + + // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler. + if (!enableSIScheduler()) + Policy.ShouldTrackLaneMasks = true; +} + +void GCNSubtarget::mirFileLoaded(MachineFunction &MF) const { + if (isWave32()) { + // Fix implicit $vcc operands after MIParser has verified that they match + // the instruction definitions. + for (auto &MBB : MF) { + for (auto &MI : MBB) + InstrInfo.fixImplicitOperands(MI); + } + } +} + +bool GCNSubtarget::hasMadF16() const { + return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1; +} + +bool GCNSubtarget::useVGPRIndexMode() const { + return !hasMovrel() || (EnableVGPRIndexMode && hasVGPRIndexMode()); +} + +bool GCNSubtarget::useAA() const { return UseAA; } + +unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const { + return AMDGPU::IsaInfo::getOccupancyWithNumSGPRs(SGPRs, getMaxWavesPerEU(), + getGeneration()); +} + +unsigned GCNSubtarget::getOccupancyWithNumVGPRs(unsigned NumVGPRs) const { + return AMDGPU::IsaInfo::getNumWavesPerEUWithNumVGPRs(this, NumVGPRs); +} + +unsigned +GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const { + if (getGeneration() >= AMDGPUSubtarget::GFX10) + return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs. + + if (HasFlatScratch || HasArchitectedFlatScratch) { + if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) + return 6; // FLAT_SCRATCH, XNACK, VCC (in that order). + if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) + return 4; // FLAT_SCRATCH, VCC (in that order). + } + + if (isXNACKEnabled()) + return 4; // XNACK, VCC (in that order). + return 2; // VCC. +} + +unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const { + const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + return getBaseReservedNumSGPRs(MFI.getUserSGPRInfo().hasFlatScratchInit()); +} + +unsigned GCNSubtarget::getReservedNumSGPRs(const Function &F) const { + // In principle we do not need to reserve SGPR pair used for flat_scratch if + // we know flat instructions do not access the stack anywhere in the + // program. For now assume it's needed if we have flat instructions.
+ const bool KernelUsesFlatScratch = hasFlatAddressSpace(); + return getBaseReservedNumSGPRs(KernelUsesFlatScratch); +} + +unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize, + unsigned NumSGPRs, + unsigned NumVGPRs) const { + unsigned Occupancy = + std::min(getMaxWavesPerEU(), getOccupancyWithLocalMemSize(LDSSize, F)); + if (NumSGPRs) + Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs)); + if (NumVGPRs) + Occupancy = std::min(Occupancy, getOccupancyWithNumVGPRs(NumVGPRs)); + return Occupancy; +} + +unsigned GCNSubtarget::getBaseMaxNumSGPRs( + const Function &F, std::pair<unsigned, unsigned> WavesPerEU, + unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const { + // Compute maximum number of SGPRs function can use using default/requested + // minimum number of waves per execution unit. + unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false); + unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true); + + // Check if maximum number of SGPRs was explicitly requested using + // "amdgpu-num-sgpr" attribute. + if (F.hasFnAttribute("amdgpu-num-sgpr")) { + unsigned Requested = + F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs); + + // Make sure requested value does not violate subtarget's specifications. + if (Requested && (Requested <= ReservedNumSGPRs)) + Requested = 0; + + // If more SGPRs are required to support the input user/system SGPRs, + // increase to accommodate them. + // + // FIXME: This really ends up using the requested number of SGPRs + number + // of reserved special registers in total. Theoretically you could re-use + // the last input registers for these special registers, but this would + // require a lot of complexity to deal with the weird aliasing. + unsigned InputNumSGPRs = PreloadedSGPRs; + if (Requested && Requested < InputNumSGPRs) + Requested = InputNumSGPRs; + + // Make sure requested value is compatible with values implied by + // default/requested minimum/maximum number of waves per execution unit.
+ if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false)) + Requested = 0; + if (WavesPerEU.second && Requested && + Requested < getMinNumSGPRs(WavesPerEU.second)) + Requested = 0; + + if (Requested) + MaxNumSGPRs = Requested; + } + + if (hasSGPRInitBug()) + MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG; + + return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs); +} + +unsigned GCNSubtarget::getMaxNumSGPRs(const MachineFunction &MF) const { + const Function &F = MF.getFunction(); + const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + return getBaseMaxNumSGPRs(F, MFI.getWavesPerEU(), MFI.getNumPreloadedSGPRs(), + getReservedNumSGPRs(MF)); +} + +static unsigned getMaxNumPreloadedSGPRs() { + using USI = GCNUserSGPRUsageInfo; + // Max number of user SGPRs + const unsigned MaxUserSGPRs = + USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) + + USI::getNumUserSGPRForField(USI::DispatchPtrID) + + USI::getNumUserSGPRForField(USI::QueuePtrID) + + USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) + + USI::getNumUserSGPRForField(USI::DispatchIdID) + + USI::getNumUserSGPRForField(USI::FlatScratchInitID) + + USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID); + + // Max number of system SGPRs + const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX + 1 + // WorkGroupIDY + 1 + // WorkGroupIDZ + 1 + // WorkGroupInfo + 1; // private segment wave byte offset + + // Max number of synthetic SGPRs + const unsigned SyntheticSGPRs = 1; // LDSKernelId + + return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs; +} + +unsigned GCNSubtarget::getMaxNumSGPRs(const Function &F) const { + return getBaseMaxNumSGPRs(F, getWavesPerEU(F), getMaxNumPreloadedSGPRs(), + getReservedNumSGPRs(F)); +} + +unsigned GCNSubtarget::getBaseMaxNumVGPRs( + const Function &F, std::pair<unsigned, unsigned> WavesPerEU) const { + // Compute maximum number of VGPRs function can use using default/requested + // minimum number of waves per execution unit. + unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first); + + // Check if maximum number of VGPRs was explicitly requested using + // "amdgpu-num-vgpr" attribute. + if (F.hasFnAttribute("amdgpu-num-vgpr")) { + unsigned Requested = + F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs); + + if (hasGFX90AInsts()) + Requested *= 2; + + // Make sure requested value is compatible with values implied by + // default/requested minimum/maximum number of waves per execution unit.
+ if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first)) + Requested = 0; + if (WavesPerEU.second && Requested && + Requested < getMinNumVGPRs(WavesPerEU.second)) + Requested = 0; + + if (Requested) + MaxNumVGPRs = Requested; + } + + return MaxNumVGPRs; +} + +unsigned GCNSubtarget::getMaxNumVGPRs(const Function &F) const { + return getBaseMaxNumVGPRs(F, getWavesPerEU(F)); +} + +unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const { + const Function &F = MF.getFunction(); + const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>(); + return getBaseMaxNumVGPRs(F, MFI.getWavesPerEU()); +} + +void GCNSubtarget::adjustSchedDependency( + SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep, + const TargetSchedModel *SchedModel) const { + if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || !Def->isInstr() || + !Use->isInstr()) + return; + + MachineInstr *DefI = Def->getInstr(); + MachineInstr *UseI = Use->getInstr(); + + if (DefI->isBundle()) { + const SIRegisterInfo *TRI = getRegisterInfo(); + auto Reg = Dep.getReg(); + MachineBasicBlock::const_instr_iterator I(DefI->getIterator()); + MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end()); + unsigned Lat = 0; + for (++I; I != E && I->isBundledWithPred(); ++I) { + if (I->modifiesRegister(Reg, TRI)) + Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I); + else if (Lat) + --Lat; + } + Dep.setLatency(Lat); + } else if (UseI->isBundle()) { + const SIRegisterInfo *TRI = getRegisterInfo(); + auto Reg = Dep.getReg(); + MachineBasicBlock::const_instr_iterator I(UseI->getIterator()); + MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end()); + unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI); + for (++I; I != E && I->isBundledWithPred() && Lat; ++I) { + if (I->readsRegister(Reg, TRI)) + break; + --Lat; + } + Dep.setLatency(Lat); + } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) { + // Work around the fact that SIInstrInfo::fixImplicitOperands modifies + // implicit operands which come from the MCInstrDesc, which can fool + // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit + // pseudo operands. + Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency( + DefI, DefOpIdx, UseI, UseOpIdx)); + } +} + +namespace { +struct FillMFMAShadowMutation : ScheduleDAGMutation { + const SIInstrInfo *TII; + + ScheduleDAGMI *DAG; + + FillMFMAShadowMutation(const SIInstrInfo *tii) : TII(tii) {} + + bool isSALU(const SUnit *SU) const { + const MachineInstr *MI = SU->getInstr(); + return MI && TII->isSALU(*MI) && !MI->isTerminator(); + } + + bool isVALU(const SUnit *SU) const { + const MachineInstr *MI = SU->getInstr(); + return MI && TII->isVALU(*MI); + } + + // Link as many SALU instructions in chain as possible. Return the size + // of the chain. Links up to MaxChain instructions.
+ unsigned linkSALUChain(SUnit *From, SUnit *To, unsigned MaxChain, + SmallPtrSetImpl<SUnit *> &Visited) const { + SmallVector<SUnit *, 8> Worklist({To}); + unsigned Linked = 0; + + while (!Worklist.empty() && MaxChain-- > 0) { + SUnit *SU = Worklist.pop_back_val(); + if (!Visited.insert(SU).second) + continue; + + LLVM_DEBUG(dbgs() << "Inserting edge from\n"; DAG->dumpNode(*From); + dbgs() << "to\n"; DAG->dumpNode(*SU); dbgs() << '\n'); + + if (SU != From && From != &DAG->ExitSU && DAG->canAddEdge(SU, From)) + if (DAG->addEdge(SU, SDep(From, SDep::Artificial))) + ++Linked; + + for (SDep &SI : From->Succs) { + SUnit *SUv = SI.getSUnit(); + if (SUv != From && SU != &DAG->ExitSU && isVALU(SUv) && + DAG->canAddEdge(SUv, SU)) + DAG->addEdge(SUv, SDep(SU, SDep::Artificial)); + } + + for (SDep &SI : SU->Succs) { + SUnit *Succ = SI.getSUnit(); + if (Succ != SU && isSALU(Succ)) + Worklist.push_back(Succ); + } + } + + return Linked; + } + + void apply(ScheduleDAGInstrs *DAGInstrs) override { + const GCNSubtarget &ST = DAGInstrs->MF.getSubtarget<GCNSubtarget>(); + if (!ST.hasMAIInsts()) + return; + DAG = static_cast<ScheduleDAGMI *>(DAGInstrs); + const TargetSchedModel *TSchedModel = DAGInstrs->getSchedModel(); + if (!TSchedModel || DAG->SUnits.empty()) + return; + + // Scan for MFMA long latency instructions and try to add a dependency + // of available SALU instructions to give them a chance to fill MFMA + // shadow. That is desirable to fill MFMA shadow with SALU instructions + // rather than VALU to prevent power consumption bursts and throttle. + auto LastSALU = DAG->SUnits.begin(); + auto E = DAG->SUnits.end(); + SmallPtrSet<SUnit *, 32> Visited; + for (SUnit &SU : DAG->SUnits) { + MachineInstr &MAI = *SU.getInstr(); + if (!TII->isMAI(MAI) || + MAI.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64 || + MAI.getOpcode() == AMDGPU::V_ACCVGPR_READ_B32_e64) + continue; + + unsigned Lat = TSchedModel->computeInstrLatency(&MAI) - 1; + + LLVM_DEBUG(dbgs() << "Found MFMA: "; DAG->dumpNode(SU); + dbgs() << "Need " << Lat + << " instructions to cover latency.\n"); + + // Find up to Lat independent scalar instructions as early as + // possible such that they can be scheduled after this MFMA. + for (; Lat && LastSALU != E; ++LastSALU) { + if (Visited.count(&*LastSALU)) + continue; + + if (&SU == &DAG->ExitSU || &SU == &*LastSALU || !isSALU(&*LastSALU) || + !DAG->canAddEdge(&*LastSALU, &SU)) + continue; + + Lat -= linkSALUChain(&SU, &*LastSALU, Lat, Visited); + } + } + } +}; +} // namespace + +void GCNSubtarget::getPostRAMutations( + std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const { + Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo)); +} + +std::unique_ptr<ScheduleDAGMutation> +GCNSubtarget::createFillMFMAShadowMutation(const TargetInstrInfo *TII) const { + return EnablePowerSched ? std::make_unique<FillMFMAShadowMutation>(&InstrInfo) + : nullptr; +} + +unsigned GCNSubtarget::getNSAThreshold(const MachineFunction &MF) const { + if (getGeneration() >= AMDGPUSubtarget::GFX12) + return 0; // Not MIMG encoding. + + if (NSAThreshold.getNumOccurrences() > 0) + return std::max(NSAThreshold.getValue(), 2u); + + int Value = MF.getFunction().getFnAttributeAsParsedInteger( + "amdgpu-nsa-threshold", -1); + if (Value > 0) + return std::max(Value, 2); + + return 3; +} + +GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F, + const GCNSubtarget &ST) + : ST(ST) { + const CallingConv::ID CC = F.getCallingConv(); + const bool IsKernel = + CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL; + // FIXME: Should have analysis or something rather than attribute to detect + // calls.
+ const bool HasCalls = F.hasFnAttribute("amdgpu-calls"); + // FIXME: This attribute is a hack, we just need an analysis on the function + // to look for allocas. + const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects"); + + if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0)) + KernargSegmentPtr = true; + + bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F); + if (IsAmdHsaOrMesa && !ST.enableFlatScratch()) + PrivateSegmentBuffer = true; + else if (ST.isMesaGfxShader(F)) + ImplicitBufferPtr = true; + + if (!AMDGPU::isGraphics(CC)) { + if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr")) + DispatchPtr = true; + + // FIXME: Can this always be disabled with < COv5? + if (!F.hasFnAttribute("amdgpu-no-queue-ptr")) + QueuePtr = true; + + if (!F.hasFnAttribute("amdgpu-no-dispatch-id")) + DispatchID = true; + } + + // TODO: This could be refined a lot. The attribute is a poor way of + // detecting calls or stack objects that may require it before argument + // lowering. + if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) && + (IsAmdHsaOrMesa || ST.enableFlatScratch()) && + (HasCalls || HasStackObjects || ST.enableFlatScratch()) && + !ST.flatScratchIsArchitected()) { + FlatScratchInit = true; + } + + if (hasImplicitBufferPtr()) + NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID); + + if (hasPrivateSegmentBuffer()) + NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentBufferID); + + if (hasDispatchPtr()) + NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID); + + if (hasQueuePtr()) + NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID); + + if (hasKernargSegmentPtr()) + NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID); + + if (hasDispatchID()) + NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID); + + if (hasFlatScratchInit()) + NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID); + + if (hasPrivateSegmentSize()) + NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID); +} + +void GCNUserSGPRUsageInfo::allocKernargPreloadSGPRs(unsigned NumSGPRs) { + assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST)); + NumKernargPreloadSGPRs += NumSGPRs; + NumUsedUserSGPRs += NumSGPRs; +} + +unsigned GCNUserSGPRUsageInfo::getNumFreeUserSGPRs() { + return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs; +} diff --git a/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-any.ll b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-any.ll index 331518c0c9d33..a3fed314fed24 100644 --- a/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-any.ll +++ b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-any.ll @@ -1,6 +1,6 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s ; REQUIRES: asserts diff --git a/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-disabled.ll b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-disabled.ll index 
1e4e9f3e13fe2..65b289bcd29d9 100644
--- a/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-disabled.ll
+++ b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-disabled.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=WARN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=WARN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s
 ; REQUIRES: asserts
diff --git a/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-enabled.ll b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-enabled.ll
index 713b276ddedb3..bd665eb432f48 100644
--- a/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-enabled.ll
+++ b/llvm/test/CodeGen/AMDGPU/sramecc-subtarget-feature-enabled.ll
@@ -1,6 +1,6 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=WARN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=WARN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s
 ; REQUIRES: asserts
diff --git a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-any.ll b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-any.ll
index b7da3b77c9637..5aaf81d0e10e2 100644
--- a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-any.ll
+++ b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-any.ll
@@ -1,10 +1,10 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx902 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx902 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ANY %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=NOT-SUPPORTED %s
 ; REQUIRES: asserts
diff --git a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-disabled.ll b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-disabled.ll
index 23baeabc6a1bb..4ced763abc2ac 100644
--- a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-disabled.ll
+++ b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-disabled.ll
@@ -1,10 +1,10 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -debug-only=amdgpu-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=WARN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -debug-only=gcn-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=gcn-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=OFF %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=WARN %s
 ; REQUIRES: asserts
diff --git a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-enabled.ll b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-enabled.ll
index a52c842afb291..20354f6828f9c 100644
--- a/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-enabled.ll
+++ b/llvm/test/CodeGen/AMDGPU/xnack-subtarget-feature-enabled.ll
@@ -1,10 +1,10 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -debug-only=amdgpu-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=amdgpu-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -debug-only=amdgpu-subtarget -o - %s 2>&1 | FileCheck --check-prefix=WARN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -debug-only=gcn-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -debug-only=gcn-subtarget -o /dev/null %s 2>&1 | FileCheck --check-prefix=WARN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx801 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=ON %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -debug-only=gcn-subtarget -o - %s 2>&1 | FileCheck --check-prefix=WARN %s
 ; REQUIRES: asserts