Skip to content

Commit 864723f

Browse files
committed
CodeGenPrepare: Remove threadlocal_address intrinsic when cheap to recompute.
The `threadlocal_address` intrinsic is currently ignored/removed for instruction selection by the `SelectionDAGBuilder` (see also https://reviews.llvm.org/D125291 ). However, being an Instruction means `SelectionDAG` will assign a register to it and share the value across basic blocks. This sharing is suboptimal in the "LocalExec" TLS model on x86 where it is cheaper to just recompute the address. We saw a 0.5% regression in a codebase with a lot of TLS usage (HHVM). This introduces a new `addressingModeSupportsTLS` target lowering callback and removes the `threadlocal_address` intrinsic in `CodeGenPrepare` to restore the efficient behavior from before the introduction of the `threadlocal_address` intrinsic. This fixes #87437
1 parent 60d49fd commit 864723f

File tree

6 files changed

+135
-25
lines changed

6 files changed

+135
-25
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2798,6 +2798,12 @@ class TargetLoweringBase {
27982798
Type *Ty, unsigned AddrSpace,
27992799
Instruction *I = nullptr) const;
28002800

2801+
/// Returns true if the target's addressing mode can target thread local
2802+
/// storage (TLS).
2803+
virtual bool addressingModeSupportsTLS(const GlobalValue &) const {
2804+
return false;
2805+
}
2806+
28012807
/// Return the preferred common base offset.
28022808
virtual int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset,
28032809
int64_t MaxOffset) const {

llvm/lib/CodeGen/CodeGenPrepare.cpp

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5082,6 +5082,15 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
50825082
}
50835083
return true;
50845084
}
5085+
case Instruction::Call:
5086+
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(AddrInst)) {
5087+
if (II->getIntrinsicID() == Intrinsic::threadlocal_address) {
5088+
GlobalValue &GV = cast<GlobalValue>(*II->getArgOperand(0));
5089+
if (TLI.addressingModeSupportsTLS(GV))
5090+
return matchAddr(AddrInst->getOperand(0), Depth);
5091+
}
5092+
}
5093+
break;
50855094
}
50865095
return false;
50875096
}
@@ -5620,11 +5629,16 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
56205629
return Modified;
56215630
}
56225631

5623-
if (AddrMode.BaseGV) {
5632+
GlobalValue *BaseGV = AddrMode.BaseGV;
5633+
if (BaseGV != nullptr) {
56245634
if (ResultPtr)
56255635
return Modified;
56265636

5627-
ResultPtr = AddrMode.BaseGV;
5637+
if (BaseGV->isThreadLocal()) {
5638+
ResultPtr = Builder.CreateThreadLocalAddress(BaseGV);
5639+
} else {
5640+
ResultPtr = BaseGV;
5641+
}
56285642
}
56295643

56305644
// If the real base value actually came from an inttoptr, then the matcher

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18928,6 +18928,30 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
1892818928
llvm_unreachable("TLS not implemented for this target.");
1892918929
}
1893018930

18931+
bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const {
18932+
if (Subtarget.is64Bit() && Subtarget.isTargetELF()) {
18933+
const TargetMachine &TM = getTargetMachine();
18934+
TLSModel::Model Model = TM.getTLSModel(&GV);
18935+
switch (Model) {
18936+
case TLSModel::LocalExec:
18937+
case TLSModel::InitialExec:
18938+
// We can include the %fs segment register in addressing modes.
18939+
return true;
18940+
case TLSModel::LocalDynamic:
18941+
case TLSModel::GeneralDynamic:
18942+
// These models do not result in %fs relative addresses unless
18943+
// TLS descriptors are used.
18944+
//
18945+
// Even in the case of TLS descriptors we currently have no way to model
18946+
// the difference between %fs access and the computations needed for the
18947+
// offset, and returning `true` for TLS-desc currently duplicates both
18948+
// which is detrimental :-/
18949+
return false;
18950+
}
18951+
}
18952+
return false;
18953+
}
18954+
1893118955
/// Lower SRA_PARTS and friends, which return two i32 values
1893218956
/// and take a 2 x i32 value to shift plus a shift amount.
1893318957
/// TODO: Can this be moved to general expansion code?

llvm/lib/Target/X86/X86ISelLowering.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1323,6 +1323,8 @@ namespace llvm {
13231323
Type *Ty, unsigned AS,
13241324
Instruction *I = nullptr) const override;
13251325

1326+
bool addressingModeSupportsTLS(const GlobalValue &GV) const override;
1327+
13261328
/// Return true if the specified immediate is legal
13271329
/// icmp immediate, that is the target has icmp instructions which can
13281330
/// compare a register against the immediate without having to materialize

llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll

Lines changed: 4 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -13,65 +13,47 @@ declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)
1313
define i32 @func_local_tls(i32 %arg0, i32 %arg1) nounwind {
1414
; NOPIC-LABEL: func_local_tls:
1515
; NOPIC: # %bb.0: # %entry
16-
; NOPIC-NEXT: pushq %r14
1716
; NOPIC-NEXT: pushq %rbx
18-
; NOPIC-NEXT: pushq %rax
1917
; NOPIC-NEXT: movl %fs:foo_local@TPOFF, %ebx
2018
; NOPIC-NEXT: testl %edi, %edi
2119
; NOPIC-NEXT: movl %ebx, %eax
2220
; NOPIC-NEXT: jne .LBB0_2
2321
; NOPIC-NEXT: # %bb.1: # %if.then
24-
; NOPIC-NEXT: movq %fs:0, %rax
25-
; NOPIC-NEXT: leaq foo_local@TPOFF(%rax), %r14
2622
; NOPIC-NEXT: callq effect@PLT
27-
; NOPIC-NEXT: movl (%r14), %eax
23+
; NOPIC-NEXT: movl %fs:foo_local@TPOFF, %eax
2824
; NOPIC-NEXT: .LBB0_2: # %if.end
2925
; NOPIC-NEXT: addl %ebx, %eax
30-
; NOPIC-NEXT: addq $8, %rsp
3126
; NOPIC-NEXT: popq %rbx
32-
; NOPIC-NEXT: popq %r14
3327
; NOPIC-NEXT: retq
3428
;
3529
; PIC-LABEL: func_local_tls:
3630
; PIC: # %bb.0: # %entry
37-
; PIC-NEXT: pushq %r14
3831
; PIC-NEXT: pushq %rbx
39-
; PIC-NEXT: pushq %rax
4032
; PIC-NEXT: movl %fs:.Lfoo_local$local@TPOFF, %ebx
4133
; PIC-NEXT: testl %edi, %edi
4234
; PIC-NEXT: movl %ebx, %eax
4335
; PIC-NEXT: jne .LBB0_2
4436
; PIC-NEXT: # %bb.1: # %if.then
45-
; PIC-NEXT: movq %fs:0, %rax
46-
; PIC-NEXT: leaq .Lfoo_local$local@TPOFF(%rax), %r14
4737
; PIC-NEXT: callq effect@PLT
48-
; PIC-NEXT: movl (%r14), %eax
38+
; PIC-NEXT: movl %fs:.Lfoo_local$local@TPOFF, %eax
4939
; PIC-NEXT: .LBB0_2: # %if.end
5040
; PIC-NEXT: addl %ebx, %eax
51-
; PIC-NEXT: addq $8, %rsp
5241
; PIC-NEXT: popq %rbx
53-
; PIC-NEXT: popq %r14
5442
; PIC-NEXT: retq
5543
;
5644
; TLSDESC-LABEL: func_local_tls:
5745
; TLSDESC: # %bb.0: # %entry
58-
; TLSDESC-NEXT: pushq %r14
5946
; TLSDESC-NEXT: pushq %rbx
60-
; TLSDESC-NEXT: pushq %rax
6147
; TLSDESC-NEXT: movl %fs:.Lfoo_local$local@TPOFF, %ebx
6248
; TLSDESC-NEXT: testl %edi, %edi
6349
; TLSDESC-NEXT: movl %ebx, %eax
6450
; TLSDESC-NEXT: jne .LBB0_2
6551
; TLSDESC-NEXT: # %bb.1: # %if.then
66-
; TLSDESC-NEXT: movq %fs:0, %rax
67-
; TLSDESC-NEXT: leaq .Lfoo_local$local@TPOFF(%rax), %r14
6852
; TLSDESC-NEXT: callq effect@PLT
69-
; TLSDESC-NEXT: movl (%r14), %eax
53+
; TLSDESC-NEXT: movl %fs:.Lfoo_local$local@TPOFF, %eax
7054
; TLSDESC-NEXT: .LBB0_2: # %if.end
7155
; TLSDESC-NEXT: addl %ebx, %eax
72-
; TLSDESC-NEXT: addq $8, %rsp
7356
; TLSDESC-NEXT: popq %rbx
74-
; TLSDESC-NEXT: popq %r14
7557
; TLSDESC-NEXT: retq
7658
entry:
7759
%addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo_local)
@@ -106,9 +88,8 @@ define i32 @func_nonlocal_tls(i32 %arg0, i32 %arg1) nounwind {
10688
; NOPIC-NEXT: movl %ebx, %eax
10789
; NOPIC-NEXT: jne .LBB1_2
10890
; NOPIC-NEXT: # %bb.1: # %if.then
109-
; NOPIC-NEXT: addq %fs:0, %r14
11091
; NOPIC-NEXT: callq effect@PLT
111-
; NOPIC-NEXT: movl (%r14), %eax
92+
; NOPIC-NEXT: movl %fs:(%r14), %eax
11293
; NOPIC-NEXT: .LBB1_2: # %if.end
11394
; NOPIC-NEXT: addl %ebx, %eax
11495
; NOPIC-NEXT: addq $8, %rsp
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2+
; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' %s | FileCheck %s
3+
4+
target triple = "x86_64--linux-gnu"
5+
6+
@foo = dso_local thread_local(localexec) global i32 0, align 4
7+
8+
declare void @effect()
9+
declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull)
10+
11+
define i32 @func0(i32 %arg) {
12+
; CHECK-LABEL: define i32 @func0(
13+
; CHECK-SAME: i32 [[ARG:%.*]]) {
14+
; CHECK-NEXT: entry:
15+
; CHECK-NEXT: [[ADDR:%.*]] = tail call ptr @llvm.threadlocal.address.p0(ptr @foo)
16+
; CHECK-NEXT: [[LOAD0:%.*]] = load i32, ptr [[ADDR]], align 4
17+
; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[ARG]], 0
18+
; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
19+
; CHECK: if.then:
20+
; CHECK-NEXT: tail call void @effect()
21+
; CHECK-NEXT: [[TMP0:%.*]] = call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @foo)
22+
; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[TMP0]], align 4
23+
; CHECK-NEXT: br label [[IF_END]]
24+
; CHECK: if.end:
25+
; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[LOAD1]], [[IF_THEN]] ], [ [[LOAD0]], [[ENTRY:%.*]] ]
26+
; CHECK-NEXT: [[RET:%.*]] = add i32 [[PHI]], [[LOAD0]]
27+
; CHECK-NEXT: ret i32 [[RET]]
28+
;
29+
entry:
30+
%addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo)
31+
%load0 = load i32, ptr %addr, align 4
32+
%cond = icmp eq i32 %arg, 0
33+
br i1 %cond, label %if.then, label %if.end
34+
35+
if.then:
36+
tail call void @effect()
37+
%load1 = load i32, ptr %addr, align 4
38+
br label %if.end
39+
40+
if.end:
41+
%phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ]
42+
%ret = add i32 %phi, %load0
43+
ret i32 %ret
44+
}
45+
46+
define i32 @func1(i32 %arg0, i32 %arg1) {
47+
; CHECK-LABEL: define i32 @func1(
48+
; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) {
49+
; CHECK-NEXT: entry:
50+
; CHECK-NEXT: [[ADDR:%.*]] = tail call ptr @llvm.threadlocal.address.p0(ptr @foo)
51+
; CHECK-NEXT: [[LOAD0:%.*]] = load i32, ptr [[ADDR]], align 4
52+
; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[ARG0]], 0
53+
; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
54+
; CHECK: if.then:
55+
; CHECK-NEXT: tail call void @effect()
56+
; CHECK-NEXT: [[X:%.*]] = add i32 [[ARG1]], 42
57+
; CHECK-NEXT: [[ADDR1:%.*]] = getelementptr inbounds i32, ptr [[ADDR]], i32 [[X]]
58+
; CHECK-NEXT: [[TMP0:%.*]] = call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @foo)
59+
; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[TMP0]], align 4
60+
; CHECK-NEXT: br label [[IF_END]]
61+
; CHECK: if.end:
62+
; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[LOAD1]], [[IF_THEN]] ], [ [[LOAD0]], [[ENTRY:%.*]] ]
63+
; CHECK-NEXT: [[RET:%.*]] = add i32 [[PHI]], [[LOAD0]]
64+
; CHECK-NEXT: ret i32 [[RET]]
65+
;
66+
entry:
67+
%addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo)
68+
%load0 = load i32, ptr %addr, align 4
69+
%cond = icmp eq i32 %arg0, 0
70+
br i1 %cond, label %if.then, label %if.end
71+
72+
if.then:
73+
tail call void @effect()
74+
%x = add i32 %arg1, 42
75+
%addr1 = getelementptr inbounds i32, ptr %addr, i32 %x
76+
%load1 = load i32, ptr %addr, align 4
77+
br label %if.end
78+
79+
if.end:
80+
%phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ]
81+
%ret = add i32 %phi, %load0
82+
ret i32 %ret
83+
}

0 commit comments

Comments
 (0)