diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index a4dc097446186..05b22289d6e24 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -2798,6 +2798,12 @@ class TargetLoweringBase { Type *Ty, unsigned AddrSpace, Instruction *I = nullptr) const; + /// Returns true if the target's addressing mode can target thread local + /// storage (TLS). + virtual bool addressingModeSupportsTLS(const GlobalValue &) const { + return false; + } + /// Return the prefered common base offset. virtual int64_t getPreferredLargeGEPBaseOffset(int64_t MinOffset, int64_t MaxOffset) const { diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index e657872c38284..22a766f8d6252 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -5082,6 +5082,15 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode, } return true; } + case Instruction::Call: + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(AddrInst)) { + if (II->getIntrinsicID() == Intrinsic::threadlocal_address) { + GlobalValue &GV = cast<GlobalValue>(*II->getArgOperand(0)); + if (TLI.addressingModeSupportsTLS(GV)) + return matchAddr(AddrInst->getOperand(0), Depth); + } + } + break; } return false; } @@ -5620,11 +5629,16 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, return Modified; } - if (AddrMode.BaseGV) { + GlobalValue *BaseGV = AddrMode.BaseGV; + if (BaseGV != nullptr) { if (ResultPtr) return Modified; - ResultPtr = AddrMode.BaseGV; + if (BaseGV->isThreadLocal()) { + ResultPtr = Builder.CreateThreadLocalAddress(BaseGV); + } else { + ResultPtr = BaseGV; + } } // If the real base value actually came from an inttoptr, then the matcher @@ -5789,8 +5803,15 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr, } // Add in the BaseGV if present.
- if (AddrMode.BaseGV) { - Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr"); + GlobalValue *BaseGV = AddrMode.BaseGV; + if (BaseGV != nullptr) { + Value *BaseGVPtr; + if (BaseGV->isThreadLocal()) { + BaseGVPtr = Builder.CreateThreadLocalAddress(BaseGV); + } else { + BaseGVPtr = BaseGV; + } + Value *V = Builder.CreatePtrToInt(BaseGVPtr, IntPtrTy, "sunkaddr"); if (Result) Result = Builder.CreateAdd(Result, V, "sunkaddr"); else diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index f274da6f6f776..3358d7918f4b0 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -18928,6 +18928,30 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("TLS not implemented for this target."); } +bool X86TargetLowering::addressingModeSupportsTLS(const GlobalValue &GV) const { + if (Subtarget.is64Bit() && Subtarget.isTargetELF()) { + const TargetMachine &TM = getTargetMachine(); + TLSModel::Model Model = TM.getTLSModel(&GV); + switch (Model) { + case TLSModel::LocalExec: + case TLSModel::InitialExec: + // We can include the %fs segment register in addressing modes. + return true; + case TLSModel::LocalDynamic: + case TLSModel::GeneralDynamic: + // These models do not result in %fs relative addresses unless + // TLS descriptors are used. + // + // Even in the case of TLS descriptors we currently have no way to model + // the difference between %fs access and the computations needed for the + // offset and returning `true` for TLS-desc currently duplicates both + // which is detrimental :-/ + return false; + } + } + return false; +} + /// Lower SRA_PARTS and friends, which return two i32 values /// and take a 2 x i32 value to shift plus a shift amount. /// TODO: Can this be moved to general expansion code?
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 0a1e8ca442731..e348ba6e8ac08 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1323,6 +1323,8 @@ namespace llvm { Type *Ty, unsigned AS, Instruction *I = nullptr) const override; + bool addressingModeSupportsTLS(const GlobalValue &GV) const override; + /// Return true if the specified immediate is legal /// icmp immediate, that is the target has icmp instructions which can /// compare a register against the immediate without having to materialize diff --git a/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll new file mode 100644 index 0000000000000..0ca1da26fa89c --- /dev/null +++ b/llvm/test/CodeGen/X86/codegen-prepare-addrmode-tls.ll @@ -0,0 +1,193 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -o - %s | FileCheck %s --check-prefix=NOPIC +; RUN: llc -o - %s -relocation-model=pic | FileCheck %s --check-prefix=PIC +; RUN: llc -o - %s -relocation-model=pic -enable-tlsdesc | FileCheck %s --check-prefix=TLSDESC + +target triple = "x86_64--linux-gnu" + +declare void @effect() +declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) + +@foo_local = dso_local thread_local(localexec) global i32 0, align 4 + +define i32 @func_local_tls(i32 %arg0, i64 %arg1) nounwind { +; NOPIC-LABEL: func_local_tls: +; NOPIC: # %bb.0: # %entry +; NOPIC-NEXT: pushq %rbp +; NOPIC-NEXT: pushq %rbx +; NOPIC-NEXT: pushq %rax +; NOPIC-NEXT: movl %fs:foo_local@TPOFF, %ebp +; NOPIC-NEXT: testl %edi, %edi +; NOPIC-NEXT: movl %ebp, %eax +; NOPIC-NEXT: jne .LBB0_2 +; NOPIC-NEXT: # %bb.1: # %if.then +; NOPIC-NEXT: movq %rsi, %rbx +; NOPIC-NEXT: callq effect@PLT +; NOPIC-NEXT: movl %fs:foo_local@TPOFF+168(,%rbx,4), %eax +; NOPIC-NEXT: .LBB0_2: # %if.end +; NOPIC-NEXT: addl %ebp, %eax +; NOPIC-NEXT: addq $8, %rsp +; NOPIC-NEXT: 
popq %rbx +; NOPIC-NEXT: popq %rbp +; NOPIC-NEXT: retq +; +; PIC-LABEL: func_local_tls: +; PIC: # %bb.0: # %entry +; PIC-NEXT: pushq %rbp +; PIC-NEXT: pushq %r14 +; PIC-NEXT: pushq %rbx +; PIC-NEXT: movl %fs:.Lfoo_local$local@TPOFF, %ebp +; PIC-NEXT: testl %edi, %edi +; PIC-NEXT: movl %ebp, %eax +; PIC-NEXT: jne .LBB0_2 +; PIC-NEXT: # %bb.1: # %if.then +; PIC-NEXT: movq %rsi, %rbx +; PIC-NEXT: movq %fs:0, %rax +; PIC-NEXT: leaq .Lfoo_local$local@TPOFF(%rax), %r14 +; PIC-NEXT: callq effect@PLT +; PIC-NEXT: movl 168(%r14,%rbx,4), %eax +; PIC-NEXT: .LBB0_2: # %if.end +; PIC-NEXT: addl %ebp, %eax +; PIC-NEXT: popq %rbx +; PIC-NEXT: popq %r14 +; PIC-NEXT: popq %rbp +; PIC-NEXT: retq +; +; TLSDESC-LABEL: func_local_tls: +; TLSDESC: # %bb.0: # %entry +; TLSDESC-NEXT: pushq %rbp +; TLSDESC-NEXT: pushq %r14 +; TLSDESC-NEXT: pushq %rbx +; TLSDESC-NEXT: movl %fs:.Lfoo_local$local@TPOFF, %ebp +; TLSDESC-NEXT: testl %edi, %edi +; TLSDESC-NEXT: movl %ebp, %eax +; TLSDESC-NEXT: jne .LBB0_2 +; TLSDESC-NEXT: # %bb.1: # %if.then +; TLSDESC-NEXT: movq %rsi, %rbx +; TLSDESC-NEXT: movq %fs:0, %rax +; TLSDESC-NEXT: leaq .Lfoo_local$local@TPOFF(%rax), %r14 +; TLSDESC-NEXT: callq effect@PLT +; TLSDESC-NEXT: movl 168(%r14,%rbx,4), %eax +; TLSDESC-NEXT: .LBB0_2: # %if.end +; TLSDESC-NEXT: addl %ebp, %eax +; TLSDESC-NEXT: popq %rbx +; TLSDESC-NEXT: popq %r14 +; TLSDESC-NEXT: popq %rbp +; TLSDESC-NEXT: retq +entry: + %addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo_local) + %load0 = load i32, ptr %addr, align 4 + %cond = icmp eq i32 %arg0, 0 + br i1 %cond, label %if.then, label %if.end + +if.then: + tail call void @effect() + %x = add i64 %arg1, 42 + %addr1 = getelementptr inbounds i32, ptr %addr, i64 %x + %load1 = load i32, ptr %addr1, align 4 + br label %if.end + +if.end: + %phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ] + %ret = add i32 %phi, %load0 + ret i32 %ret +} + +@foo_nonlocal = thread_local global i32 0, align 4 + +define i32 @func_nonlocal_tls(i32 %arg0, i64 
%arg1) nounwind { +; NOPIC-LABEL: func_nonlocal_tls: +; NOPIC: # %bb.0: # %entry +; NOPIC-NEXT: pushq %rbp +; NOPIC-NEXT: pushq %r14 +; NOPIC-NEXT: pushq %rbx +; NOPIC-NEXT: movq foo_nonlocal@GOTTPOFF(%rip), %r14 +; NOPIC-NEXT: movl %fs:(%r14), %ebp +; NOPIC-NEXT: testl %edi, %edi +; NOPIC-NEXT: movl %ebp, %eax +; NOPIC-NEXT: jne .LBB1_2 +; NOPIC-NEXT: # %bb.1: # %if.then +; NOPIC-NEXT: movq %rsi, %rbx +; NOPIC-NEXT: callq effect@PLT +; NOPIC-NEXT: movl %fs:168(%r14,%rbx,4), %eax +; NOPIC-NEXT: .LBB1_2: # %if.end +; NOPIC-NEXT: addl %ebp, %eax +; NOPIC-NEXT: popq %rbx +; NOPIC-NEXT: popq %r14 +; NOPIC-NEXT: popq %rbp +; NOPIC-NEXT: retq +; +; PIC-LABEL: func_nonlocal_tls: +; PIC: # %bb.0: # %entry +; PIC-NEXT: pushq %rbp +; PIC-NEXT: pushq %r15 +; PIC-NEXT: pushq %r14 +; PIC-NEXT: pushq %rbx +; PIC-NEXT: pushq %rax +; PIC-NEXT: movq %rsi, %rbx +; PIC-NEXT: movl %edi, %ebp +; PIC-NEXT: data16 +; PIC-NEXT: leaq foo_nonlocal@TLSGD(%rip), %rdi +; PIC-NEXT: data16 +; PIC-NEXT: data16 +; PIC-NEXT: rex64 +; PIC-NEXT: callq __tls_get_addr@PLT +; PIC-NEXT: movq %rax, %r14 +; PIC-NEXT: movl (%rax), %r15d +; PIC-NEXT: testl %ebp, %ebp +; PIC-NEXT: movl %r15d, %eax +; PIC-NEXT: jne .LBB1_2 +; PIC-NEXT: # %bb.1: # %if.then +; PIC-NEXT: callq effect@PLT +; PIC-NEXT: movl 168(%r14,%rbx,4), %eax +; PIC-NEXT: .LBB1_2: # %if.end +; PIC-NEXT: addl %r15d, %eax +; PIC-NEXT: addq $8, %rsp +; PIC-NEXT: popq %rbx +; PIC-NEXT: popq %r14 +; PIC-NEXT: popq %r15 +; PIC-NEXT: popq %rbp +; PIC-NEXT: retq +; +; TLSDESC-LABEL: func_nonlocal_tls: +; TLSDESC: # %bb.0: # %entry +; TLSDESC-NEXT: pushq %rbp +; TLSDESC-NEXT: pushq %r14 +; TLSDESC-NEXT: pushq %rbx +; TLSDESC-NEXT: leaq foo_nonlocal@tlsdesc(%rip), %rax +; TLSDESC-NEXT: callq *foo_nonlocal@tlscall(%rax) +; TLSDESC-NEXT: movl %fs:(%rax), %ebp +; TLSDESC-NEXT: testl %edi, %edi +; TLSDESC-NEXT: movl %ebp, %ecx +; TLSDESC-NEXT: jne .LBB1_2 +; TLSDESC-NEXT: # %bb.1: # %if.then +; TLSDESC-NEXT: movq %rsi, %rbx +; TLSDESC-NEXT: addq %fs:0, %rax 
+; TLSDESC-NEXT: movq %rax, %r14 +; TLSDESC-NEXT: callq effect@PLT +; TLSDESC-NEXT: movl 168(%r14,%rbx,4), %ecx +; TLSDESC-NEXT: .LBB1_2: # %if.end +; TLSDESC-NEXT: addl %ebp, %ecx +; TLSDESC-NEXT: movl %ecx, %eax +; TLSDESC-NEXT: popq %rbx +; TLSDESC-NEXT: popq %r14 +; TLSDESC-NEXT: popq %rbp +; TLSDESC-NEXT: retq +entry: + %addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo_nonlocal) + %load0 = load i32, ptr %addr, align 4 + %cond = icmp eq i32 %arg0, 0 + br i1 %cond, label %if.then, label %if.end + +if.then: + tail call void @effect() + %x = add i64 %arg1, 42 + %addr1 = getelementptr inbounds i32, ptr %addr, i64 %x + %load1 = load i32, ptr %addr1, align 4 + br label %if.end + +if.end: + %phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ] + %ret = add i32 %phi, %load0 + ret i32 %ret +} diff --git a/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-tls.ll b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-tls.ll new file mode 100644 index 0000000000000..080c807cbad13 --- /dev/null +++ b/llvm/test/Transforms/CodeGenPrepare/X86/sink-addrmode-tls.ll @@ -0,0 +1,86 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes='require<profile-summary>,function(codegenprepare)' %s | FileCheck %s + +target triple = "x86_64--linux-gnu" + +@foo = dso_local thread_local(localexec) global i32 0, align 4 + +declare void @effect() +declare nonnull ptr @llvm.threadlocal.address.p0(ptr nonnull) + +define i32 @func0(i32 %arg) { +; CHECK-LABEL: define i32 @func0( +; CHECK-SAME: i32 [[ARG:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADDR:%.*]] = tail call ptr @llvm.threadlocal.address.p0(ptr @foo) +; CHECK-NEXT: [[LOAD0:%.*]] = load i32, ptr [[ADDR]], align 4 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[ARG]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: tail call void @effect() +; CHECK-NEXT: [[TMP0:%.*]] = call align 4 ptr
@llvm.threadlocal.address.p0(ptr align 4 @foo) +; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[TMP0]], align 4 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[LOAD1]], [[IF_THEN]] ], [ [[LOAD0]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RET:%.*]] = add i32 [[PHI]], [[LOAD0]] +; CHECK-NEXT: ret i32 [[RET]] +; +entry: + %addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo) + %load0 = load i32, ptr %addr, align 4 + %cond = icmp eq i32 %arg, 0 + br i1 %cond, label %if.then, label %if.end + +if.then: + tail call void @effect() + %load1 = load i32, ptr %addr, align 4 + br label %if.end + +if.end: + %phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ] + %ret = add i32 %phi, %load0 + ret i32 %ret +} + +define i32 @func1(i32 %arg0, i32 %arg1) { +; CHECK-LABEL: define i32 @func1( +; CHECK-SAME: i32 [[ARG0:%.*]], i32 [[ARG1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADDR:%.*]] = tail call ptr @llvm.threadlocal.address.p0(ptr @foo) +; CHECK-NEXT: [[LOAD0:%.*]] = load i32, ptr [[ADDR]], align 4 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[ARG0]], 0 +; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] +; CHECK: if.then: +; CHECK-NEXT: tail call void @effect() +; CHECK-NEXT: [[X:%.*]] = add i32 [[ARG1]], 42 +; CHECK-NEXT: [[X64:%.*]] = sext i32 [[X]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = call align 4 ptr @llvm.threadlocal.address.p0(ptr align 4 @foo) +; CHECK-NEXT: [[SUNKADDR:%.*]] = mul i64 [[X64]], 4 +; CHECK-NEXT: [[ADDR1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 [[SUNKADDR]] +; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr [[ADDR1]], align 4 +; CHECK-NEXT: br label [[IF_END]] +; CHECK: if.end: +; CHECK-NEXT: [[PHI:%.*]] = phi i32 [ [[LOAD1]], [[IF_THEN]] ], [ [[LOAD0]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RET:%.*]] = add i32 [[PHI]], [[LOAD0]] +; CHECK-NEXT: ret i32 [[RET]] +; +entry: + %addr = tail call ptr @llvm.threadlocal.address.p0(ptr @foo) + %load0 = load i32, ptr %addr, align 4 + %cond = icmp 
eq i32 %arg0, 0 + br i1 %cond, label %if.then, label %if.end + +if.then: + tail call void @effect() + %x = add i32 %arg1, 42 + %x64 = sext i32 %x to i64 + %addr1 = getelementptr inbounds i32, ptr %addr, i64 %x64 + %load1 = load i32, ptr %addr1, align 4 + br label %if.end + +if.end: + %phi = phi i32 [ %load1, %if.then ], [ %load0, %entry ] + %ret = add i32 %phi, %load0 + ret i32 %ret +}