diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
index e3e3e411c6843..593b3ab220389 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.cpp
@@ -97,6 +97,86 @@ MCSymbol *MCResourceInfo::getMaxSGPRSymbol(MCContext &OutContext) {
   return OutContext.getOrCreateSymbol("amdgpu.max_num_sgpr");
 }
 
+// Tries to flatten recursive call register resource gathering. Simple cycle
+// avoiding dfs to find the constants in the propagated symbols.
+// Assumes:
+// - RecSym has been confirmed to recurse (this means the callee symbols should
+//   all be populated, started at RecSym).
+// - Shape of the resource symbol's MCExpr (`max` args are order agnostic):
+//   RecSym.MCExpr := max(<constant>+, <symbol>*)
+const MCExpr *MCResourceInfo::flattenedCycleMax(MCSymbol *RecSym,
+                                                ResourceInfoKind RIK,
+                                                MCContext &OutContext) {
+  SmallPtrSet<const MCExpr *, 8> Seen;
+  SmallVector<const MCExpr *, 8> WorkList;
+  int64_t Maximum = 0;
+
+  const MCExpr *RecExpr = RecSym->getVariableValue();
+  WorkList.push_back(RecExpr);
+
+  while (!WorkList.empty()) {
+    const MCExpr *CurExpr = WorkList.pop_back_val();
+    switch (CurExpr->getKind()) {
+    default: {
+      // Assuming the recursion is of shape `max(<constant>, <symbol>)`
+      // where <symbol> will eventually recurse. If this condition holds,
+      // the recursion occurs within some other (possibly unresolvable) MCExpr,
+      // thus using the worst case value then.
+      if (!AMDGPUMCExpr::isSymbolUsedInExpression(RecSym, CurExpr)) {
+        LLVM_DEBUG(dbgs() << "MCResUse: " << RecSym->getName()
+                          << ": Recursion in unexpected sub-expression, using "
+                             "module maximum\n");
+        switch (RIK) {
+        default:
+          break;
+        case RIK_NumVGPR:
+          return MCSymbolRefExpr::create(getMaxVGPRSymbol(OutContext),
+                                         OutContext);
+          break;
+        case RIK_NumSGPR:
+          return MCSymbolRefExpr::create(getMaxSGPRSymbol(OutContext),
+                                         OutContext);
+          break;
+        case RIK_NumAGPR:
+          return MCSymbolRefExpr::create(getMaxAGPRSymbol(OutContext),
+                                         OutContext);
+          break;
+        }
+      }
+      break;
+    }
+    case MCExpr::ExprKind::Constant: {
+      int64_t Val = cast<MCConstantExpr>(CurExpr)->getValue();
+      Maximum = std::max(Maximum, Val);
+      break;
+    }
+    case MCExpr::ExprKind::SymbolRef: {
+      const MCSymbolRefExpr *SymExpr = cast<MCSymbolRefExpr>(CurExpr);
+      const MCSymbol &SymRef = SymExpr->getSymbol();
+      if (SymRef.isVariable()) {
+        const MCExpr *SymVal = SymRef.getVariableValue();
+        if (Seen.insert(SymVal).second)
+          WorkList.push_back(SymVal);
+      }
+      break;
+    }
+    case MCExpr::ExprKind::Target: {
+      const AMDGPUMCExpr *TargetExpr = cast<AMDGPUMCExpr>(CurExpr);
+      if (TargetExpr->getKind() == AMDGPUMCExpr::VariantKind::AGVK_Max) {
+        for (auto &Arg : TargetExpr->getArgs())
+          WorkList.push_back(Arg);
+      }
+      break;
+    }
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "MCResUse: " << RecSym->getName()
+                    << ": Using flattened max: " << Maximum << '\n');
+
+  return MCConstantExpr::create(Maximum, OutContext);
+}
+
 void MCResourceInfo::assignResourceInfoExpr(
     int64_t LocalValue, ResourceInfoKind RIK, AMDGPUMCExpr::VariantKind Kind,
     const MachineFunction &MF, const SmallVectorImpl<const Function *> &Callees,
@@ -133,25 +213,19 @@ void MCResourceInfo::assignResourceInfoExpr(
                         << CalleeValSym->getName() << " as callee\n");
       ArgExprs.push_back(MCSymbolRefExpr::create(CalleeValSym, OutContext));
     } else {
-      LLVM_DEBUG(
-          dbgs() << "MCResUse: " << Sym->getName()
-                 << ": Recursion found, falling back to module maximum\n");
-      // In case of recursion: make sure to use conservative register counts
-      // (i.e., specifically for VGPR/SGPR/AGPR).
+      LLVM_DEBUG(dbgs() << "MCResUse: " << Sym->getName()
+                        << ": Recursion found, attempt flattening of cycle "
+                           "for resource usage\n");
+      // In case of recursion for vgpr/sgpr/agpr resource usage: try to
+      // flatten and use the max of the call cycle. May still end up emitting
+      // module max if not fully resolvable.
       switch (RIK) {
       default:
         break;
       case RIK_NumVGPR:
-        ArgExprs.push_back(MCSymbolRefExpr::create(
-            getMaxVGPRSymbol(OutContext), OutContext));
-        break;
       case RIK_NumSGPR:
-        ArgExprs.push_back(MCSymbolRefExpr::create(
-            getMaxSGPRSymbol(OutContext), OutContext));
-        break;
       case RIK_NumAGPR:
-        ArgExprs.push_back(MCSymbolRefExpr::create(
-            getMaxAGPRSymbol(OutContext), OutContext));
+        ArgExprs.push_back(flattenedCycleMax(CalleeValSym, RIK, OutContext));
         break;
       }
     }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
index a670878948c31..fa98f82d11022 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCResourceInfo.h
@@ -58,6 +58,12 @@ class MCResourceInfo {
   // Assigns expression for Max S/V/A-GPRs to the referenced symbols.
   void assignMaxRegs(MCContext &OutContext);
 
+  // Take flattened max of cyclic function calls' knowns. For example, for
+  // a cycle A->B->C->D->A, take max(A, B, C, D) for A and have B, C, D use
+  // the propagated value from A.
+  const MCExpr *flattenedCycleMax(MCSymbol *RecSym, ResourceInfoKind RIK,
+                                  MCContext &OutContext);
+
 public:
   MCResourceInfo() = default;
   void addMaxVGPRCandidate(int32_t candidate) {
diff --git a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
index 0a6aa05c2d212..2a18d40e0bd8a 100644
--- a/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-resource-usage.ll
@@ -495,17 +495,17 @@ define amdgpu_kernel void @usage_direct_recursion(i32 %n) #0 {
 ; GCN: NumVgprs: max(43, multi_stage_recurse1.num_vgpr)
 ; GCN: ScratchSize: 16+max(multi_stage_recurse1.private_seg_size)
 ; GCN-LABEL: {{^}}multi_stage_recurse1:
-; GCN: .set multi_stage_recurse1.num_vgpr, max(48, amdgpu.max_num_vgpr)
-; GCN: .set multi_stage_recurse1.num_agpr, max(0, amdgpu.max_num_agpr)
-; GCN: .set multi_stage_recurse1.numbered_sgpr, max(34, amdgpu.max_num_sgpr)
+; GCN: .set multi_stage_recurse1.num_vgpr, max(48, 43)
+; GCN: .set multi_stage_recurse1.num_agpr, max(0, 0)
+; GCN: .set multi_stage_recurse1.numbered_sgpr, max(34, 34)
 ; GCN: .set multi_stage_recurse1.private_seg_size, 16
 ; GCN: .set multi_stage_recurse1.uses_vcc, 1
 ; GCN: .set multi_stage_recurse1.uses_flat_scratch, 0
 ; GCN: .set multi_stage_recurse1.has_dyn_sized_stack, 0
 ; GCN: .set multi_stage_recurse1.has_recursion, 1
 ; GCN: .set multi_stage_recurse1.has_indirect_call, 0
-; GCN: TotalNumSgprs: multi_stage_recurse1.numbered_sgpr+4
-; GCN: NumVgprs: max(48, amdgpu.max_num_vgpr)
+; GCN: TotalNumSgprs: 38
+; GCN: NumVgprs: 48
 ; GCN: ScratchSize: 16
 define void @multi_stage_recurse1(i32 %val) #2 {
   call void @multi_stage_recurse2(i32 %val)
@@ -528,8 +528,8 @@ define void @multi_stage_recurse2(i32 %val) #2 {
 ; GCN: .set usage_multi_stage_recurse.has_dyn_sized_stack, or(0, multi_stage_recurse1.has_dyn_sized_stack)
 ; GCN: .set usage_multi_stage_recurse.has_recursion, or(1, multi_stage_recurse1.has_recursion)
 ; GCN: .set usage_multi_stage_recurse.has_indirect_call, or(0, multi_stage_recurse1.has_indirect_call)
-; GCN: TotalNumSgprs: usage_multi_stage_recurse.numbered_sgpr+6
-; GCN: NumVgprs: usage_multi_stage_recurse.num_vgpr
+; GCN: TotalNumSgprs: 40
+; GCN: NumVgprs: 48
 ; GCN: ScratchSize: 16
 define amdgpu_kernel void @usage_multi_stage_recurse(i32 %n) #0 {
   call void @multi_stage_recurse1(i32 %n)
@@ -550,17 +550,17 @@ define amdgpu_kernel void @usage_multi_stage_recurse(i32 %n) #0 {
 ; GCN: NumVgprs: max(41, multi_stage_recurse_noattr1.num_vgpr)
 ; GCN: ScratchSize: 16+max(multi_stage_recurse_noattr1.private_seg_size)
 ; GCN-LABEL: {{^}}multi_stage_recurse_noattr1:
-; GCN: .set multi_stage_recurse_noattr1.num_vgpr, max(41, amdgpu.max_num_vgpr)
-; GCN: .set multi_stage_recurse_noattr1.num_agpr, max(0, amdgpu.max_num_agpr)
-; GCN: .set multi_stage_recurse_noattr1.numbered_sgpr, max(57, amdgpu.max_num_sgpr)
+; GCN: .set multi_stage_recurse_noattr1.num_vgpr, max(41, 41)
+; GCN: .set multi_stage_recurse_noattr1.num_agpr, max(0, 0)
+; GCN: .set multi_stage_recurse_noattr1.numbered_sgpr, max(57, 54)
 ; GCN: .set multi_stage_recurse_noattr1.private_seg_size, 16
 ; GCN: .set multi_stage_recurse_noattr1.uses_vcc, 1
 ; GCN: .set multi_stage_recurse_noattr1.uses_flat_scratch, 0
 ; GCN: .set multi_stage_recurse_noattr1.has_dyn_sized_stack, 0
 ; GCN: .set multi_stage_recurse_noattr1.has_recursion, 0
 ; GCN: .set multi_stage_recurse_noattr1.has_indirect_call, 0
-; GCN: TotalNumSgprs: multi_stage_recurse_noattr1.numbered_sgpr+4
-; GCN: NumVgprs: max(41, amdgpu.max_num_vgpr)
+; GCN: TotalNumSgprs: 61
+; GCN: NumVgprs: 41
 ; GCN: ScratchSize: 16
 define void @multi_stage_recurse_noattr1(i32 %val) #0 {
   call void @multi_stage_recurse_noattr2(i32 %val)
@@ -583,8 +583,8 @@ define void @multi_stage_recurse_noattr2(i32 %val) #0 {
 ; GCN: .set usage_multi_stage_recurse_noattrs.has_dyn_sized_stack, or(0, multi_stage_recurse_noattr1.has_dyn_sized_stack)
 ; GCN: .set usage_multi_stage_recurse_noattrs.has_recursion, or(0, multi_stage_recurse_noattr1.has_recursion)
 ; GCN: .set usage_multi_stage_recurse_noattrs.has_indirect_call, or(0, multi_stage_recurse_noattr1.has_indirect_call)
-; GCN: TotalNumSgprs: usage_multi_stage_recurse_noattrs.numbered_sgpr+6
-; GCN: NumVgprs: usage_multi_stage_recurse_noattrs.num_vgpr
+; GCN: TotalNumSgprs: 63
+; GCN: NumVgprs: 41
 ; GCN: ScratchSize: 16
 define amdgpu_kernel void @usage_multi_stage_recurse_noattrs(i32 %n) #0 {
   call void @multi_stage_recurse_noattr1(i32 %n)
@@ -601,8 +601,8 @@ define amdgpu_kernel void @usage_multi_stage_recurse_noattrs(i32 %n) #0 {
 ; GCN: .set multi_call_with_multi_stage_recurse.has_dyn_sized_stack, or(0, use_stack0.has_dyn_sized_stack, use_stack1.has_dyn_sized_stack, multi_stage_recurse1.has_dyn_sized_stack)
 ; GCN: .set multi_call_with_multi_stage_recurse.has_recursion, or(1, use_stack0.has_recursion, use_stack1.has_recursion, multi_stage_recurse1.has_recursion)
 ; GCN: .set multi_call_with_multi_stage_recurse.has_indirect_call, or(0, use_stack0.has_indirect_call, use_stack1.has_indirect_call, multi_stage_recurse1.has_indirect_call)
-; GCN: TotalNumSgprs: multi_call_with_multi_stage_recurse.numbered_sgpr+6
-; GCN: NumVgprs: multi_call_with_multi_stage_recurse.num_vgpr
+; GCN: TotalNumSgprs: 59
+; GCN: NumVgprs: 48
 ; GCN: ScratchSize: 2052
 define amdgpu_kernel void @multi_call_with_multi_stage_recurse(i32 %n) #0 {
   call void @use_stack0()
diff --git a/llvm/test/CodeGen/AMDGPU/recursive-resource-usage-mcexpr.ll b/llvm/test/CodeGen/AMDGPU/recursive-resource-usage-mcexpr.ll
index 3093349bff37c..a41a06592f62f 100644
--- a/llvm/test/CodeGen/AMDGPU/recursive-resource-usage-mcexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/recursive-resource-usage-mcexpr.ll
@@ -1,5 +1,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck %s
 
+; Recursion: foo -> bar -> baz -> qux -> foo
+
 ; CHECK-LABEL: {{^}}qux
 ; CHECK: .set qux.num_vgpr, max(71, foo.num_vgpr)
 ; CHECK: .set qux.num_agpr, max(0, foo.num_agpr)
@@ -34,9 +36,9 @@
 ; CHECK: .set bar.has_indirect_call, or(0, baz.has_indirect_call)
 
 ; CHECK-LABEL: {{^}}foo
-; CHECK: .set foo.num_vgpr, max(46, amdgpu.max_num_vgpr)
-; CHECK: .set foo.num_agpr, max(0, amdgpu.max_num_agpr)
-; CHECK: .set foo.numbered_sgpr, max(71, amdgpu.max_num_sgpr)
+; CHECK: .set foo.num_vgpr, max(46, 71)
+; CHECK: .set foo.num_agpr, max(0, 0)
+; CHECK: .set foo.numbered_sgpr, max(71, 61)
 ; CHECK: .set foo.private_seg_size, 16
 ; CHECK: .set foo.uses_vcc, 1
 ; CHECK: .set foo.uses_flat_scratch, 0
@@ -91,3 +93,77 @@
 define amdgpu_kernel void @usefoo() {
   ret void
 }
+; Recursion: A -> B -> C -> A && C -> D -> C
+
+; CHECK-LABEL: {{^}}D
+; CHECK: .set D.num_vgpr, max(71, C.num_vgpr)
+; CHECK: .set D.num_agpr, max(0, C.num_agpr)
+; CHECK: .set D.numbered_sgpr, max(71, C.numbered_sgpr)
+; CHECK: .set D.private_seg_size, 16+max(C.private_seg_size)
+; CHECK: .set D.uses_vcc, or(1, C.uses_vcc)
+; CHECK: .set D.uses_flat_scratch, or(0, C.uses_flat_scratch)
+; CHECK: .set D.has_dyn_sized_stack, or(0, C.has_dyn_sized_stack)
+; CHECK: .set D.has_recursion, or(1, C.has_recursion)
+; CHECK: .set D.has_indirect_call, or(0, C.has_indirect_call)
+
+; CHECK-LABEL: {{^}}C
+; CHECK: .set C.num_vgpr, max(42, A.num_vgpr, 71)
+; CHECK: .set C.num_agpr, max(0, A.num_agpr, 0)
+; CHECK: .set C.numbered_sgpr, max(71, A.numbered_sgpr, 71)
+; CHECK: .set C.private_seg_size, 16+max(A.private_seg_size)
+; CHECK: .set C.uses_vcc, or(1, A.uses_vcc)
+; CHECK: .set C.uses_flat_scratch, or(0, A.uses_flat_scratch)
+; CHECK: .set C.has_dyn_sized_stack, or(0, A.has_dyn_sized_stack)
+; CHECK: .set C.has_recursion, or(1, A.has_recursion)
+; CHECK: .set C.has_indirect_call, or(0, A.has_indirect_call)
+
+; CHECK-LABEL: {{^}}B
+; CHECK: .set B.num_vgpr, max(42, C.num_vgpr)
+; CHECK: .set B.num_agpr, max(0, C.num_agpr)
+; CHECK: .set B.numbered_sgpr, max(71, C.numbered_sgpr)
+; CHECK: .set B.private_seg_size, 16+max(C.private_seg_size)
+; CHECK: .set B.uses_vcc, or(1, C.uses_vcc)
+; CHECK: .set B.uses_flat_scratch, or(0, C.uses_flat_scratch)
+; CHECK: .set B.has_dyn_sized_stack, or(0, C.has_dyn_sized_stack)
+; CHECK: .set B.has_recursion, or(1, C.has_recursion)
+; CHECK: .set B.has_indirect_call, or(0, C.has_indirect_call)
+
+; CHECK-LABEL: {{^}}A
+; CHECK: .set A.num_vgpr, max(42, 71)
+; CHECK: .set A.num_agpr, max(0, 0)
+; CHECK: .set A.numbered_sgpr, max(71, 71)
+; CHECK: .set A.private_seg_size, 16
+; CHECK: .set A.uses_vcc, 1
+; CHECK: .set A.uses_flat_scratch, 0
+; CHECK: .set A.has_dyn_sized_stack, 0
+; CHECK: .set A.has_recursion, 1
+; CHECK: .set A.has_indirect_call, 0
+
+define void @A() {
+  call void @B()
+  call void asm sideeffect "", "~{v10}"()
+  call void asm sideeffect "", "~{s50}"()
+  ret void
+}
+
+define void @B() {
+  call void @C()
+  call void asm sideeffect "", "~{v20}"()
+  call void asm sideeffect "", "~{s30}"()
+  ret void
+}
+
+define void @C() {
+  call void @A()
+  call void @D()
+  call void asm sideeffect "", "~{v30}"()
+  call void asm sideeffect "", "~{s40}"()
+  ret void
+}
+
+define void @D() {
+  call void @C()
+  call void asm sideeffect "", "~{v70}"()
+  call void asm sideeffect "", "~{s70}"()
+  ret void
+}
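
---

Note on the approach (reviewer aid, not part of the patch): `flattenedCycleMax` only has to walk expressions of the assumed shape `max(<constant>..., <symbol>...)`, follow each symbol's defining expression at most once, and keep the largest constant it sees. The sketch below models that worklist walk with a toy expression type rather than LLVM's `MCExpr`/`MCContext` machinery (which needs a full `MCContext` to instantiate); the names `Expr`, `SymTab`, and the standalone `flattenedCycleMax` here are illustrative only.

```cpp
// flatten_cycle_max.cpp -- a minimal, self-contained model of the
// cycle-avoiding max flattening. Not the LLVM API.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

// A resource expression: a constant, a named symbol, or max(args...).
struct Expr {
  enum Kind { Constant, SymbolRef, Max } K;
  int64_t Value = 0;              // valid when K == Constant
  std::string Symbol;             // valid when K == SymbolRef
  std::vector<const Expr *> Args; // valid when K == Max
};

// Maps a resource symbol (e.g. "foo.num_vgpr") to its defining expression.
// Symbols absent from the table model not-yet-resolved back-references.
using SymTab = std::map<std::string, const Expr *>;

// Cycle-avoiding DFS: visit each reachable expression once and keep the
// largest constant, mirroring the worklist loop in the patch.
int64_t flattenedCycleMax(const Expr *Root, const SymTab &Symbols) {
  std::set<const Expr *> Seen;
  std::vector<const Expr *> WorkList{Root};
  int64_t Maximum = 0;
  while (!WorkList.empty()) {
    const Expr *Cur = WorkList.back();
    WorkList.pop_back();
    switch (Cur->K) {
    case Expr::Constant:
      Maximum = std::max(Maximum, Cur->Value);
      break;
    case Expr::SymbolRef: {
      auto It = Symbols.find(Cur->Symbol);
      if (It != Symbols.end() && Seen.insert(It->second).second)
        WorkList.push_back(It->second); // follow each definition only once
      break;
    }
    case Expr::Max:
      for (const Expr *Arg : Cur->Args)
        WorkList.push_back(Arg);
      break;
    }
  }
  return Maximum;
}

int main() {
  // Model multi_stage_recurse2.num_vgpr := max(43, multi_stage_recurse1
  // .num_vgpr), where recurse1's symbol is still unresolved because it is
  // the one currently being assigned.
  Expr C43{Expr::Constant, 43, "", {}};
  Expr R1{Expr::SymbolRef, 0, "multi_stage_recurse1.num_vgpr", {}};
  Expr Recurse2{Expr::Max, 0, "", {&C43, &R1}};
  SymTab Symbols{{"multi_stage_recurse2.num_vgpr", &Recurse2}};

  // The callee contribution flattens to 43, so recurse1 ends up as
  // `.set multi_stage_recurse1.num_vgpr, max(48, 43)` as in the test above.
  std::cout << flattenedCycleMax(&Recurse2, Symbols) << '\n'; // prints 43
  return 0;
}
```

This is also why the tests now check concrete values such as `max(48, 43)` and `TotalNumSgprs: 38` instead of the conservative `amdgpu.max_num_vgpr`/`amdgpu.max_num_sgpr` module maxima: the cycle's register usage is fully known once the propagated symbols are flattened, and the module max remains only as the fallback when the recursion hides inside an unexpected sub-expression.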