Skip to content

Commit fc482ad

Browse files
committed
CodeGenPrepare: Remove threadlocal_address intrinsic when cheap to recompute.
The `threadlocal_address` intrinsic is currently ignored/removed for instruction selection by the `SelectionDAGBuilder` (see also https://reviews.llvm.org/D125291 ). However being an Instruction means `SelectionDAG` will assign a register to it and share the value across basic blocks. This sharing is suboptimal in the "LocalExec" TLS model on x86 where it is cheaper to just recompute the address. We saw a 0.5% regression in a codebase with a lot of TLS usage (HHVM). This introduces a new `cheapToRecomputeTLSAddress` target lowering callback and removes the `threadlocal_address` intrinsic in `CodeGenPrepare` to restore the efficient behavior from before the introduction of the `threadlocal_address` intrinsic. This fixes #87437
1 parent 13a297f commit fc482ad

File tree

5 files changed

+37
-28
lines changed

5 files changed

+37
-28
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3393,6 +3393,13 @@ class TargetLoweringBase {
33933393
return nullptr;
33943394
}
33953395

3396+
/// Returns true if thread local storage (TLS) addresses are so cheap to
3397+
/// re-compute that it is not worth keeping them in a register between basic
3398+
/// blocks.
3399+
virtual bool cheapToRecomputeTLSAddress(const GlobalVariable &) const {
3400+
return false;
3401+
}
3402+
33963403
//===--------------------------------------------------------------------===//
33973404
// Runtime Library hooks
33983405
//

llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2528,6 +2528,18 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
25282528
return optimizeGatherScatterInst(II, II->getArgOperand(0));
25292529
case Intrinsic::masked_scatter:
25302530
return optimizeGatherScatterInst(II, II->getArgOperand(1));
2531+
case Intrinsic::threadlocal_address:
2532+
// SelectionDAGBuilder currently skips this intrinsic anyway; but removing
2533+
// it earlier means the addresses will not be kept in registers accross
2534+
// basic blocks but recomputed. This is preferable on architectures where
2535+
// TLS is part of normal addressing modes.
2536+
GlobalVariable &GV = cast<GlobalVariable>(*II->getArgOperand(0));
2537+
if (TLI->cheapToRecomputeTLSAddress(GV)) {
2538+
replaceAllUsesWith(II, &GV, FreshBBs, IsHugeFunc);
2539+
II->eraseFromParent();
2540+
return true;
2541+
}
2542+
break;
25312543
}
25322544

25332545
SmallVector<Value *, 2> PtrOps;

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18913,6 +18913,16 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
1891318913
llvm_unreachable("TLS not implemented for this target.");
1891418914
}
1891518915

18916+
bool X86TargetLowering::cheapToRecomputeTLSAddress(
18917+
const GlobalVariable &GV) const {
18918+
if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
18919+
const TargetMachine &TM = getTargetMachine();
18920+
TLSModel::Model Model = TM.getTLSModel(&GV);
18921+
return Model == TLSModel::LocalExec;
18922+
}
18923+
return false;
18924+
}
18925+
1891618926
/// Lower SRA_PARTS and friends, which return two i32 values
1891718927
/// and take a 2 x i32 value to shift plus a shift amount.
1891818928
/// TODO: Can this be moved to general expansion code?

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1661,6 +1661,8 @@ namespace llvm {
16611661
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
16621662
SDValue LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const;
16631663

1664+
bool cheapToRecomputeTLSAddress(const GlobalVariable &GV) const override;
1665+
16641666
/// Creates target global address or external symbol nodes for calls or
16651667
/// other uses.
16661668
SDValue LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,

llvm/test/CodeGen/X86/tls-multi-use.ll

Lines changed: 6 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -12,31 +12,20 @@ declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)
1212
define i32 @tls_multi_use(i32 %arg) {
1313
; CHECK-LABEL: tls_multi_use:
1414
; CHECK: # %bb.0: # %entry
15-
; CHECK-NEXT: pushq %r14
16-
; CHECK-NEXT: .cfi_def_cfa_offset 16
1715
; CHECK-NEXT: pushq %rbx
18-
; CHECK-NEXT: .cfi_def_cfa_offset 24
19-
; CHECK-NEXT: pushq %rax
20-
; CHECK-NEXT: .cfi_def_cfa_offset 32
21-
; CHECK-NEXT: .cfi_offset %rbx, -24
22-
; CHECK-NEXT: .cfi_offset %r14, -16
16+
; CHECK-NEXT: .cfi_def_cfa_offset 16
17+
; CHECK-NEXT: .cfi_offset %rbx, -16
2318
; CHECK-NEXT: movl %fs:foo@TPOFF, %ebx
2419
; CHECK-NEXT: testl %edi, %edi
2520
; CHECK-NEXT: movl %ebx, %eax
2621
; CHECK-NEXT: jne .LBB0_2
2722
; CHECK-NEXT: # %bb.1: # %if.then
28-
; CHECK-NEXT: movq %fs:0, %rax
29-
; CHECK-NEXT: leaq foo@TPOFF(%rax), %r14
3023
; CHECK-NEXT: callq rand@PLT
31-
; CHECK-NEXT: movl (%r14), %eax
24+
; CHECK-NEXT: movl %fs:foo@TPOFF, %eax
3225
; CHECK-NEXT: .LBB0_2: # %if.end
3326
; CHECK-NEXT: addl %eax, %ebx
3427
; CHECK-NEXT: movl %ebx, %eax
35-
; CHECK-NEXT: addq $8, %rsp
36-
; CHECK-NEXT: .cfi_def_cfa_offset 24
3728
; CHECK-NEXT: popq %rbx
38-
; CHECK-NEXT: .cfi_def_cfa_offset 16
39-
; CHECK-NEXT: popq %r14
4029
; CHECK-NEXT: .cfi_def_cfa_offset 8
4130
; CHECK-NEXT: retq
4231
;
@@ -93,31 +82,20 @@ if.end:
9382
define i32 @tls_multi_use_nonlocal(i32 %arg) {
9483
; CHECK-LABEL: tls_multi_use_nonlocal:
9584
; CHECK: # %bb.0: # %entry
96-
; CHECK-NEXT: pushq %r14
97-
; CHECK-NEXT: .cfi_def_cfa_offset 16
9885
; CHECK-NEXT: pushq %rbx
99-
; CHECK-NEXT: .cfi_def_cfa_offset 24
100-
; CHECK-NEXT: pushq %rax
101-
; CHECK-NEXT: .cfi_def_cfa_offset 32
102-
; CHECK-NEXT: .cfi_offset %rbx, -24
103-
; CHECK-NEXT: .cfi_offset %r14, -16
86+
; CHECK-NEXT: .cfi_def_cfa_offset 16
87+
; CHECK-NEXT: .cfi_offset %rbx, -16
10488
; CHECK-NEXT: movl %fs:foo@TPOFF, %ebx
10589
; CHECK-NEXT: testl %edi, %edi
10690
; CHECK-NEXT: movl %ebx, %eax
10791
; CHECK-NEXT: jne .LBB1_2
10892
; CHECK-NEXT: # %bb.1: # %if.then
109-
; CHECK-NEXT: movq %fs:0, %rax
110-
; CHECK-NEXT: leaq foo@TPOFF(%rax), %r14
11193
; CHECK-NEXT: callq rand@PLT
112-
; CHECK-NEXT: movl (%r14), %eax
94+
; CHECK-NEXT: movl %fs:foo@TPOFF, %eax
11395
; CHECK-NEXT: .LBB1_2: # %if.end
11496
; CHECK-NEXT: addl %eax, %ebx
11597
; CHECK-NEXT: movl %ebx, %eax
116-
; CHECK-NEXT: addq $8, %rsp
117-
; CHECK-NEXT: .cfi_def_cfa_offset 24
11898
; CHECK-NEXT: popq %rbx
119-
; CHECK-NEXT: .cfi_def_cfa_offset 16
120-
; CHECK-NEXT: popq %r14
12199
; CHECK-NEXT: .cfi_def_cfa_offset 8
122100
; CHECK-NEXT: retq
123101
;

0 commit comments

Comments
 (0)