
[X86][BreakFalseDeps] Using reverse order for undef register selection #137569

Merged: 5 commits, Jun 11, 2025
Changes from 3 commits
5 changes: 4 additions & 1 deletion llvm/include/llvm/CodeGen/RegisterClassInfo.h
@@ -50,6 +50,8 @@ class RegisterClassInfo {
// entry is valid when its tag matches.
unsigned Tag = 0;

bool Reverse = false;

const MachineFunction *MF = nullptr;
const TargetRegisterInfo *TRI = nullptr;

@@ -88,7 +90,8 @@ class RegisterClassInfo {

/// runOnFunction - Prepare to answer questions about MF. This must be called
/// before any other methods are used.
LLVM_ABI void runOnMachineFunction(const MachineFunction &MF);
LLVM_ABI void runOnMachineFunction(const MachineFunction &MF,
bool Rev = false);

/// getNumAllocatableRegs - Returns the number of actually allocatable
/// registers in RC in the current function.
7 changes: 4 additions & 3 deletions llvm/include/llvm/CodeGen/TargetRegisterInfo.h
@@ -68,7 +68,7 @@ class TargetRegisterClass {
const bool CoveredBySubRegs;
const unsigned *SuperClasses;
const uint16_t SuperClassesSize;
ArrayRef<MCPhysReg> (*OrderFunc)(const MachineFunction&);
ArrayRef<MCPhysReg> (*OrderFunc)(const MachineFunction &, bool Rev);

/// Return the register class ID number.
unsigned getID() const { return MC->getID(); }
@@ -199,8 +199,9 @@
/// other criteria.
///
/// By default, this method returns all registers in the class.
ArrayRef<MCPhysReg> getRawAllocationOrder(const MachineFunction &MF) const {
return OrderFunc ? OrderFunc(MF) : getRegisters();
ArrayRef<MCPhysReg> getRawAllocationOrder(const MachineFunction &MF,
bool Rev = false) const {
return OrderFunc ? OrderFunc(MF, Rev) : getRegisters();
}

/// Returns the combination of all lane masks of register in this class.
2 changes: 1 addition & 1 deletion llvm/include/llvm/Target/Target.td
@@ -314,7 +314,7 @@ class RegisterClass<string namespace, list<ValueType> regTypes, int alignment,
// to use in a given machine function. The code will be inserted in a
// function like this:
//
// static inline unsigned f(const MachineFunction &MF) { ... }
// static inline unsigned f(const MachineFunction &MF, bool Rev) { ... }
//
// The function should return 0 to select the default order defined by
// MemberList, 1 to select the first AltOrders entry and so on.
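As a hedged illustration of the mechanism described in the comment above, here is a self-contained mock — not the actual TableGen output — of the selector shape the emitter generates for a class with one AltOrders entry. All names, types, and register numbers are invented stand-ins:

```cpp
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

using MCPhysReg = uint16_t;
struct MachineFunction {}; // stand-in for llvm::MachineFunction

// The body of this function comes verbatim from the class's AltOrderSelect
// field; returning Rev selects AltOrders[0] when the reverse order is wanted.
static inline unsigned FR32XAltOrderSelect(const MachineFunction &, bool Rev) {
  return Rev; // 0 = default MemberList order, 1 = first AltOrders entry
}

static const std::vector<MCPhysReg> &
FR32XGetRawAllocationOrder(const MachineFunction &MF, bool Rev) {
  // Orders abbreviated; the real tables hold the XMM0..XMM31 register enums.
  static const std::vector<MCPhysReg> DefaultOrder = {0, 1, 2, 3};
  static const std::vector<MCPhysReg> AltOrder1 = {16, 17, 18, 19};
  unsigned Select = FR32XAltOrderSelect(MF, Rev);
  assert(Select < 2 && "invalid alt order index");
  return Select ? AltOrder1 : DefaultOrder;
}

int main() {
  MachineFunction MF;
  // Prints 16: the alternative order is selected when Rev is set.
  std::printf("%u\n", FR32XGetRawAllocationOrder(MF, /*Rev=*/true).front());
}
```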
2 changes: 1 addition & 1 deletion llvm/lib/CodeGen/BreakFalseDeps.cpp
@@ -285,7 +285,7 @@ bool BreakFalseDeps::runOnMachineFunction(MachineFunction &mf) {
TRI = MF->getSubtarget().getRegisterInfo();
RDA = &getAnalysis<ReachingDefAnalysis>();

RegClassInfo.runOnMachineFunction(mf);
RegClassInfo.runOnMachineFunction(mf, /*Rev=*/true);

LLVM_DEBUG(dbgs() << "********** BREAK FALSE DEPENDENCIES **********\n");

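For context on why this pass wants the reversed order: BreakFalseDeps rewrites undef register reads (e.g. the dead upper half of a `cvtsi2sd` destination) to a register unlikely to have an in-flight write. Below is a minimal sketch of the selection idea under the simplifying assumption of a plain busy-set; the real pass instead scores candidates by their reaching-def clearance, and `pickUndefReg`/`InUse` are invented names:

```cpp
#include <cstdint>
#include <cstdio>
#include <optional>
#include <vector>

using MCPhysReg = uint16_t;

// Pick a replacement for an undef register read from the (already reversed)
// allocation order, skipping registers that are currently in use.
std::optional<MCPhysReg>
pickUndefReg(const std::vector<MCPhysReg> &ReversedOrder,
             const std::vector<bool> &InUse) {
  for (MCPhysReg R : ReversedOrder)
    if (!InUse[R]) // prefer registers the allocator assigns last,
      return R;    // e.g. XMM15 before XMM0 on AVX targets
  return std::nullopt;
}

int main() {
  // Reversed order 3..0; register 3 is busy, so register 2 is chosen.
  std::vector<MCPhysReg> Order = {3, 2, 1, 0};
  std::vector<bool> InUse = {false, false, false, true};
  std::printf("%u\n", *pickUndefReg(Order, InUse)); // prints 2
}
```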
13 changes: 10 additions & 3 deletions llvm/lib/CodeGen/RegisterClassInfo.cpp
@@ -39,14 +39,16 @@ StressRA("stress-regalloc", cl::Hidden, cl::init(0), cl::value_desc("N"),

RegisterClassInfo::RegisterClassInfo() = default;

void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {
void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf,
bool Rev) {
bool Update = false;
MF = &mf;

auto &STI = MF->getSubtarget();

// Allocate new array the first time we see a new target.
if (STI.getRegisterInfo() != TRI) {
if (STI.getRegisterInfo() != TRI || Reverse != Rev) {
Reverse = Rev;
TRI = STI.getRegisterInfo();
RegClass.reset(new RCInfo[TRI->getNumRegClasses()]);
Update = true;

Review thread on the TRI check above:

Contributor: This TRI check looks broken; it shouldn't be necessary.

phoebewang (Author): TRI is a constant value within the same Subtarget, but it can change when we compile functions with different target features, so we need to reset RegClass in those cases.

Contributor: The analysis shouldn't survive in those cases?

phoebewang (Author): My understanding is that RegClassInfo survives longer than the analysis. Other passes like MachineSink, RegAllocBase, MachineCombiner, etc. all use it, and the cached RegClass can be reused among them within the same Subtarget.
@@ -142,7 +144,12 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {

// FIXME: Once targets reserve registers instead of removing them from the
// allocation order, we can simply use begin/end here.
ArrayRef<MCPhysReg> RawOrder = RC->getRawAllocationOrder(*MF);
ArrayRef<MCPhysReg> RawOrder = RC->getRawAllocationOrder(*MF, Reverse);
std::vector<MCPhysReg> ReverseOrder;
if (Reverse) {
llvm::append_range(ReverseOrder, reverse(RawOrder));
RawOrder = ArrayRef<MCPhysReg>(ReverseOrder);
}

e-kud (Contributor), Jun 11, 2025: I'm probably missing something, but why can't we define the correct reverse order in AltOrders, so we don't need to reverse the list once again? We've already passed Rev=true to getRawAllocationOrder, and after that we reverse once more. Something like:

let AltOrders = [(add (sequence "XMM%u", 15, 0), (sequence "XMM%u", 31, 16))];

I think Rev might be too overloaded: 1) it selects the AltOrder, and 2) it reverses the selected AltOrder.

phoebewang (Author), Jun 11, 2025: AltOrders is used for customizing a given RegisterClass. We always reverse the list so that we don't need to provide AltOrders for every RegisterClass. The reason to customize FR32X/FR64X/VR256X here is that X/YMM16~31 have a longer encoding than X/YMM0~15. With the list reversed, we only need to overload those 3 classes.
Comment on lines +147 to +152:

Contributor: There is already a mechanism for providing alternative allocation orders defined in TableGen; you shouldn't need to do this.

phoebewang (Author): Yes, this is meant to imitate the alternative allocation order mechanism. Currently it's only controlled by target features; we want to control it through a pass argument too.

Contributor: What's wrong with it being a target feature? We could also expand the alternative allocation order controls. This hardcodes a single alternative choice and requires a runtime sort.

phoebewang (Author): The problem is not that some registers are faster; they are all the same. The intention here is to alter the order for one specific pass. It doesn't solve the problem if we just reverse the register order for all passes.

Contributor: Then change the selection mechanism for the table-generated order.

phoebewang (Author): I don't see how a target feature helps here. This is not a feature that should apply to all passes; we just want BreakFalseDeps to use the reverse order.

for (unsigned PhysReg : RawOrder) {
// Remove reserved registers from the allocation order.
if (Reserved.test(PhysReg))
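A note on the ReverseOrder copy above: ArrayRef is a non-owning view, so the reversed sequence has to be materialized into owned storage before being wrapped back into an ArrayRef. A standalone illustration of the same operation using std types (register numbers invented):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

using MCPhysReg = uint16_t; // matches LLVM's typedef

int main() {
  // Stand-in for a class's raw allocation order.
  std::vector<MCPhysReg> RawOrder = {0, 1, 2, 3};

  // Equivalent of llvm::append_range(ReverseOrder, reverse(RawOrder)):
  // materialize a reversed copy, since a non-owning view cannot present
  // its elements in reverse on its own.
  std::vector<MCPhysReg> ReverseOrder(RawOrder.rbegin(), RawOrder.rend());

  for (MCPhysReg R : ReverseOrder)
    std::printf("%u ", R); // prints: 3 2 1 0
  std::printf("\n");
}
```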
28 changes: 24 additions & 4 deletions llvm/lib/Target/X86/X86RegisterInfo.td
@@ -806,17 +806,37 @@ def VR512_0_15 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i
512, (sequence "ZMM%u", 0, 15)>;

// Scalar AVX-512 floating point registers.
def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>;
def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)> {
let AltOrders = [(add (sequence "XMM%u", 16, 31), (sequence "XMM%u", 0, 15))];
let AltOrderSelect = [{
return Rev;
}];
}

def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>;
def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)> {
let AltOrders = [(add (sequence "XMM%u", 16, 31), (sequence "XMM%u", 0, 15))];
let AltOrderSelect = [{
return Rev;
}];
}

def FR16X : RegisterClass<"X86", [f16], 16, (add FR32X)> {let Size = 32;}

// Extended VR128 and VR256 for AVX-512 instructions
def VR128X : RegisterClass<"X86", [v4f32, v2f64, v8f16, v8bf16, v16i8, v8i16, v4i32, v2i64, f128],
128, (add FR32X)>;
128, (add FR32X)> {
let AltOrders = [(add (sequence "XMM%u", 16, 31), (sequence "XMM%u", 0, 15))];
let AltOrderSelect = [{
return Rev;
}];
}
def VR256X : RegisterClass<"X86", [v8f32, v4f64, v16f16, v16bf16, v32i8, v16i16, v8i32, v4i64],
256, (sequence "YMM%u", 0, 31)>;
256, (sequence "YMM%u", 0, 31)> {
let AltOrders = [(add (sequence "YMM%u", 16, 31), (sequence "YMM%u", 0, 15))];
let AltOrderSelect = [{
return Rev;
}];
}

// Mask registers
def VK1 : RegisterClass<"X86", [v1i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;}
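Taken together with the reversal in RegisterClassInfo::compute(), the AltOrders entries above give BreakFalseDeps the preference XMM15 down to XMM0, then XMM31 down to XMM16, keeping the EVEX-only high registers last; this is why the tests below now pick %xmm15. A small sketch that computes the composed order, with plain strings standing in for the register enums:

```cpp
#include <cstdio>
#include <string>
#include <vector>

int main() {
  // First AltOrders entry from the .td change: XMM16..XMM31, then XMM0..XMM15.
  std::vector<std::string> AltOrder;
  for (int I = 16; I <= 31; ++I)
    AltOrder.push_back("XMM" + std::to_string(I));
  for (int I = 0; I <= 15; ++I)
    AltOrder.push_back("XMM" + std::to_string(I));

  // RegisterClassInfo then reverses the selected order when Reverse is set.
  std::vector<std::string> Final(AltOrder.rbegin(), AltOrder.rend());

  // First preference is XMM15 (short VEX encoding); the EVEX-only
  // XMM16..XMM31 come last.
  std::printf("%s ... %s\n", Final.front().c_str(), Final.back().c_str());
  // prints: XMM15 ... XMM16
}
```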
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/X86/avx-cvt.ll
@@ -108,7 +108,7 @@ define <2 x double> @fpext01(<2 x double> %a0, <4 x float> %a1) nounwind {
define double @funcA(ptr nocapture %e) nounwind uwtable readonly ssp {
; CHECK-LABEL: funcA:
; CHECK: # %bb.0:
; CHECK-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0
; CHECK-NEXT: vcvtsi2sdq (%rdi), %xmm15, %xmm0
; CHECK-NEXT: retq
%tmp1 = load i64, ptr %e, align 8
%conv = sitofp i64 %tmp1 to double
@@ -118,7 +118,7 @@ define double @funcA(ptr nocapture %e) nounwind uwtable readonly ssp {
define double @funcB(ptr nocapture %e) nounwind uwtable readonly ssp {
; CHECK-LABEL: funcB:
; CHECK: # %bb.0:
; CHECK-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0
; CHECK-NEXT: vcvtsi2sdl (%rdi), %xmm15, %xmm0
; CHECK-NEXT: retq
%tmp1 = load i32, ptr %e, align 4
%conv = sitofp i32 %tmp1 to double
@@ -128,7 +128,7 @@ define double @funcB(ptr nocapture %e) nounwind uwtable readonly ssp {
define float @funcC(ptr nocapture %e) nounwind uwtable readonly ssp {
; CHECK-LABEL: funcC:
; CHECK: # %bb.0:
; CHECK-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0
; CHECK-NEXT: vcvtsi2ssl (%rdi), %xmm15, %xmm0
; CHECK-NEXT: retq
%tmp1 = load i32, ptr %e, align 4
%conv = sitofp i32 %tmp1 to float
@@ -138,7 +138,7 @@ define float @funcC(ptr nocapture %e) nounwind uwtable readonly ssp {
define float @funcD(ptr nocapture %e) nounwind uwtable readonly ssp {
; CHECK-LABEL: funcD:
; CHECK: # %bb.0:
; CHECK-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0
; CHECK-NEXT: vcvtsi2ssq (%rdi), %xmm15, %xmm0
; CHECK-NEXT: retq
%tmp1 = load i64, ptr %e, align 8
%conv = sitofp i64 %tmp1 to float
@@ -183,7 +183,7 @@ declare float @llvm.floor.f32(float %p)
define float @floor_f32_load(ptr %aptr) optsize {
; CHECK-LABEL: floor_f32_load:
; CHECK: # %bb.0:
; CHECK-NEXT: vroundss $9, (%rdi), %xmm0, %xmm0
; CHECK-NEXT: vroundss $9, (%rdi), %xmm15, %xmm0
; CHECK-NEXT: retq
%a = load float, ptr %aptr
%res = call float @llvm.floor.f32(float %a)
@@ -193,7 +193,7 @@ define float @floor_f32_load(ptr %aptr) optsize {
define float @floor_f32_load_pgso(ptr %aptr) !prof !14 {
; CHECK-LABEL: floor_f32_load_pgso:
; CHECK: # %bb.0:
; CHECK-NEXT: vroundss $9, (%rdi), %xmm0, %xmm0
; CHECK-NEXT: vroundss $9, (%rdi), %xmm15, %xmm0
; CHECK-NEXT: retq
%a = load float, ptr %aptr
%res = call float @llvm.floor.f32(float %a)
@@ -203,7 +203,7 @@ define float @floor_f32_load_pgso(ptr %aptr) !prof !14 {
define double @nearbyint_f64_load(ptr %aptr) optsize {
; CHECK-LABEL: nearbyint_f64_load:
; CHECK: # %bb.0:
; CHECK-NEXT: vroundsd $12, (%rdi), %xmm0, %xmm0
; CHECK-NEXT: vroundsd $12, (%rdi), %xmm15, %xmm0
; CHECK-NEXT: retq
%a = load double, ptr %aptr
%res = call double @llvm.nearbyint.f64(double %a)
@@ -213,7 +213,7 @@ define double @nearbyint_f64_load(ptr %aptr) optsize {
define double @nearbyint_f64_load_pgso(ptr %aptr) !prof !14 {
; CHECK-LABEL: nearbyint_f64_load_pgso:
; CHECK: # %bb.0:
; CHECK-NEXT: vroundsd $12, (%rdi), %xmm0, %xmm0
; CHECK-NEXT: vroundsd $12, (%rdi), %xmm15, %xmm0
; CHECK-NEXT: retq
%a = load double, ptr %aptr
%res = call double @llvm.nearbyint.f64(double %a)