Skip to content

Commit c25fd0b

Browse files
committed
CodeGenPrepare: Remove threadlocal_address intrinsic when cheap to recompute.
The `threadlocal_address` intrinsic is currently ignored/removed for instruction selection by the `SelectionDAGBuilder` (see also https://reviews.llvm.org/D125291 ). However being an Instruction means `SelectionDAG` will assign a register to it and share the value across basic blocks. This sharing is suboptimal in the "LocalExec" TLS model on x86 where it is cheaper to just recompute the address. We saw a 0.5% regression in a codebase with a lot of TLS usage (HHVM). This introduces a new `cheapToRecomputeTLSAddress` target lowering callback and removes the `threadlocal_address` intrinsic in `CodeGenPrepare` to restore the efficient behavior from before the introduction of the `threadlocal_address` intrinsic. This fixes #87437
1 parent 1e058c9 commit c25fd0b

File tree

5 files changed

+34
-14
lines changed

5 files changed

+34
-14
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3393,6 +3393,13 @@ class TargetLoweringBase {
33933393
return nullptr;
33943394
}
33953395

3396+
/// Returns true if thread local storage (TLS) addresses are so cheap to
3397+
/// re-compute that it is not worth keeping them in a register between basic
3398+
/// blocks.
3399+
virtual bool cheapToRecomputeTLSAddress(const GlobalVariable &) const {
3400+
return false;
3401+
}
3402+
33963403
//===--------------------------------------------------------------------===//
33973404
// Runtime Library hooks
33983405
//

llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2528,6 +2528,18 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
25282528
return optimizeGatherScatterInst(II, II->getArgOperand(0));
25292529
case Intrinsic::masked_scatter:
25302530
return optimizeGatherScatterInst(II, II->getArgOperand(1));
2531+
case Intrinsic::threadlocal_address:
2532+
// SelectionDAGBuilder currently skips this intrinsic anyway; but removing
2533+
// it earlier means the addresses will not be kept in registers accross
2534+
// basic blocks but recomputed. This is preferable on architectures where
2535+
// TLS is part of normal addressing modes.
2536+
GlobalVariable &GV = cast<GlobalVariable>(*II->getArgOperand(0));
2537+
if (TLI->cheapToRecomputeTLSAddress(GV)) {
2538+
replaceAllUsesWith(II, &GV, FreshBBs, IsHugeFunc);
2539+
II->eraseFromParent();
2540+
return true;
2541+
}
2542+
break;
25312543
}
25322544

25332545
SmallVector<Value *, 2> PtrOps;

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18913,6 +18913,16 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
1891318913
llvm_unreachable("TLS not implemented for this target.");
1891418914
}
1891518915

18916+
bool X86TargetLowering::cheapToRecomputeTLSAddress(
18917+
const GlobalVariable &GV) const {
18918+
if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
18919+
const TargetMachine &TM = getTargetMachine();
18920+
TLSModel::Model Model = TM.getTLSModel(&GV);
18921+
return Model == TLSModel::LocalExec;
18922+
}
18923+
return false;
18924+
}
18925+
1891618926
/// Lower SRA_PARTS and friends, which return two i32 values
1891718927
/// and take a 2 x i32 value to shift plus a shift amount.
1891818928
/// TODO: Can this be moved to general expansion code?

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1661,6 +1661,8 @@ namespace llvm {
16611661
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
16621662
SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
16631663

1664+
bool cheapToRecomputeTLSAddress(const GlobalVariable &GV) const override;
1665+
16641666
/// Creates target global address or external symbol nodes for calls or
16651667
/// other uses.
16661668
SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,

llvm/test/CodeGen/X86/tls-multi-use.ll

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -11,31 +11,20 @@ declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)
1111
define i32 @tls_multi_use(i32 %arg) {
1212
; CHECK-LABEL: tls_multi_use:
1313
; CHECK: # %bb.0: # %entry
14-
; CHECK-NEXT: pushq %r14
15-
; CHECK-NEXT: .cfi_def_cfa_offset 16
1614
; CHECK-NEXT: pushq %rbx
17-
; CHECK-NEXT: .cfi_def_cfa_offset 24
18-
; CHECK-NEXT: pushq %rax
19-
; CHECK-NEXT: .cfi_def_cfa_offset 32
20-
; CHECK-NEXT: .cfi_offset %rbx, -24
21-
; CHECK-NEXT: .cfi_offset %r14, -16
15+
; CHECK-NEXT: .cfi_def_cfa_offset 16
16+
; CHECK-NEXT: .cfi_offset %rbx, -16
2217
; CHECK-NEXT: movl %fs:foo@TPOFF, %ebx
2318
; CHECK-NEXT: testl %edi, %edi
2419
; CHECK-NEXT: movl %ebx, %eax
2520
; CHECK-NEXT: jne .LBB0_2
2621
; CHECK-NEXT: # %bb.1: # %if.then
27-
; CHECK-NEXT: movq %fs:0, %rax
28-
; CHECK-NEXT: leaq foo@TPOFF(%rax), %r14
2922
; CHECK-NEXT: callq rand@PLT
30-
; CHECK-NEXT: movl (%r14), %eax
23+
; CHECK-NEXT: movl %fs:foo@TPOFF, %eax
3124
; CHECK-NEXT: .LBB0_2: # %if.end
3225
; CHECK-NEXT: addl %eax, %ebx
3326
; CHECK-NEXT: movl %ebx, %eax
34-
; CHECK-NEXT: addq $8, %rsp
35-
; CHECK-NEXT: .cfi_def_cfa_offset 24
3627
; CHECK-NEXT: popq %rbx
37-
; CHECK-NEXT: .cfi_def_cfa_offset 16
38-
; CHECK-NEXT: popq %r14
3928
; CHECK-NEXT: .cfi_def_cfa_offset 8
4029
; CHECK-NEXT: retq
4130
;

0 commit comments

Comments
 (0)