diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp index 0309e225d9df4..b6dd174f9be80 100644 --- a/llvm/lib/CodeGen/CodeGenPrepare.cpp +++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp @@ -1839,7 +1839,8 @@ bool CodeGenPrepare::unfoldPowerOf2Test(CmpInst *Cmp) { /// lose; some adjustment may be wanted there. /// /// Return true if any changes are made. -static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) { +static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI, + const DataLayout &DL) { if (TLI.hasMultipleConditionRegisters(EVT::getEVT(Cmp->getType()))) return false; @@ -1847,6 +1848,18 @@ static bool sinkCmpExpression(CmpInst *Cmp, const TargetLowering &TLI) { if (TLI.useSoftFloat() && isa(Cmp)) return false; + bool UsedInPhiOrCurrentBlock = any_of(Cmp->users(), [Cmp](User *U) { + return isa(U) || + cast(U)->getParent() == Cmp->getParent(); + }); + + // Avoid sinking larger than legal integer comparisons unless it's ONLY used + // in another BB. + if (UsedInPhiOrCurrentBlock && Cmp->getOperand(0)->getType()->isIntegerTy() && + Cmp->getOperand(0)->getType()->getScalarSizeInBits() > + DL.getLargestLegalIntTypeSizeInBits()) + return false; + // Only insert a cmp in each block once. DenseMap InsertedCmps; @@ -2224,7 +2237,7 @@ bool CodeGenPrepare::optimizeURem(Instruction *Rem) { } bool CodeGenPrepare::optimizeCmp(CmpInst *Cmp, ModifyDT &ModifiedDT) { - if (sinkCmpExpression(Cmp, *TLI)) + if (sinkCmpExpression(Cmp, *TLI, *DL)) return true; if (combineToUAddWithOverflow(Cmp, ModifiedDT)) diff --git a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll index ba6769b2aa3e1..0306bb18c2aed 100644 --- a/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll +++ b/llvm/test/CodeGen/RISCV/overflow-intrinsics.ll @@ -232,7 +232,7 @@ define i64 @uaddo3_math_overflow_used(i64 %a, i64 %b, ptr %res) nounwind ssp { ret i64 %Q } -; TODO? 
CGP sinks the compare before we have a chance to form the overflow intrinsic. +; Ensure CGP doesn't sink the compare before we have a chance to form the overflow intrinsic. define i64 @uaddo4(i64 %a, i64 %b, i1 %c) nounwind ssp { ; RV32-LABEL: uaddo4: @@ -1076,41 +1076,37 @@ define i1 @usubo_ult_cmp_dominates_i64(i64 %x, i64 %y, ptr %p, i1 %cond) { ; RV32-NEXT: .cfi_offset s4, -24 ; RV32-NEXT: .cfi_offset s5, -28 ; RV32-NEXT: .cfi_offset s6, -32 -; RV32-NEXT: mv s5, a5 -; RV32-NEXT: mv s3, a1 +; RV32-NEXT: mv s1, a5 +; RV32-NEXT: mv s4, a1 ; RV32-NEXT: andi a1, a5, 1 -; RV32-NEXT: beqz a1, .LBB32_8 +; RV32-NEXT: beqz a1, .LBB32_6 ; RV32-NEXT: # %bb.1: # %t ; RV32-NEXT: mv s0, a4 -; RV32-NEXT: mv s2, a3 -; RV32-NEXT: mv s1, a2 -; RV32-NEXT: mv s4, a0 -; RV32-NEXT: beq s3, a3, .LBB32_3 +; RV32-NEXT: mv s3, a3 +; RV32-NEXT: mv s2, a2 +; RV32-NEXT: mv s5, a0 +; RV32-NEXT: beq s4, a3, .LBB32_3 ; RV32-NEXT: # %bb.2: # %t -; RV32-NEXT: sltu s6, s3, s2 +; RV32-NEXT: sltu s6, s4, s3 ; RV32-NEXT: j .LBB32_4 ; RV32-NEXT: .LBB32_3: -; RV32-NEXT: sltu s6, s4, s1 +; RV32-NEXT: sltu s6, s5, s2 ; RV32-NEXT: .LBB32_4: # %t ; RV32-NEXT: mv a0, s6 ; RV32-NEXT: call call -; RV32-NEXT: beqz s6, .LBB32_8 +; RV32-NEXT: beqz s6, .LBB32_6 ; RV32-NEXT: # %bb.5: # %end -; RV32-NEXT: sltu a1, s4, s1 -; RV32-NEXT: mv a0, a1 -; RV32-NEXT: beq s3, s2, .LBB32_7 -; RV32-NEXT: # %bb.6: # %end -; RV32-NEXT: sltu a0, s3, s2 -; RV32-NEXT: .LBB32_7: # %end -; RV32-NEXT: sub a2, s3, s2 -; RV32-NEXT: sub a3, s4, s1 -; RV32-NEXT: sub a2, a2, a1 -; RV32-NEXT: sw a3, 0(s0) -; RV32-NEXT: sw a2, 4(s0) -; RV32-NEXT: j .LBB32_9 -; RV32-NEXT: .LBB32_8: # %f -; RV32-NEXT: mv a0, s5 -; RV32-NEXT: .LBB32_9: # %f +; RV32-NEXT: sltu a0, s5, s2 +; RV32-NEXT: sub a1, s4, s3 +; RV32-NEXT: sub a2, s5, s2 +; RV32-NEXT: sub a1, a1, a0 +; RV32-NEXT: sw a2, 0(s0) +; RV32-NEXT: sw a1, 4(s0) +; RV32-NEXT: mv a0, s6 +; RV32-NEXT: j .LBB32_7 +; RV32-NEXT: .LBB32_6: # %f +; RV32-NEXT: mv a0, s1 +; RV32-NEXT: .LBB32_7: # %f ; 
RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s0, 24(sp) # 4-byte Folded Reload ; RV32-NEXT: lw s1, 20(sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/pr166534.ll b/llvm/test/CodeGen/X86/pr166534.ll index aef44cc3e40d0..162a0c93bfcf4 100644 --- a/llvm/test/CodeGen/X86/pr166534.ll +++ b/llvm/test/CodeGen/X86/pr166534.ll @@ -7,100 +7,64 @@ define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) { ; SSE2-LABEL: pr166534: ; SSE2: # %bb.0: # %entry -; SSE2-NEXT: movq (%rdi), %rax -; SSE2-NEXT: movq 8(%rdi), %r8 ; SSE2-NEXT: movdqu (%rdi), %xmm0 -; SSE2-NEXT: movq (%rsi), %r9 -; SSE2-NEXT: movq 8(%rsi), %rdi ; SSE2-NEXT: movdqu (%rsi), %xmm1 ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pmovmskb %xmm1, %esi -; SSE2-NEXT: xorl %r10d, %r10d +; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF -; SSE2-NEXT: sete %r10b -; SSE2-NEXT: orq %r10, (%rdx) +; SSE2-NEXT: sete %al +; SSE2-NEXT: orq %rax, (%rdx) ; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF ; SSE2-NEXT: jne .LBB0_2 ; SSE2-NEXT: # %bb.1: # %if.then -; SSE2-NEXT: xorq %r9, %rax -; SSE2-NEXT: xorq %rdi, %r8 -; SSE2-NEXT: xorl %edx, %edx -; SSE2-NEXT: orq %rax, %r8 -; SSE2-NEXT: sete %dl -; SSE2-NEXT: orq %rdx, (%rcx) +; SSE2-NEXT: orq %rax, (%rcx) ; SSE2-NEXT: .LBB0_2: # %if.end ; SSE2-NEXT: retq ; ; SSE4-LABEL: pr166534: ; SSE4: # %bb.0: # %entry -; SSE4-NEXT: movq (%rdi), %rax -; SSE4-NEXT: movq 8(%rdi), %r8 ; SSE4-NEXT: movdqu (%rdi), %xmm0 -; SSE4-NEXT: movq (%rsi), %r9 -; SSE4-NEXT: movq 8(%rsi), %rdi ; SSE4-NEXT: movdqu (%rsi), %xmm1 ; SSE4-NEXT: pxor %xmm0, %xmm1 -; SSE4-NEXT: xorl %esi, %esi +; SSE4-NEXT: xorl %eax, %eax ; SSE4-NEXT: ptest %xmm1, %xmm1 -; SSE4-NEXT: sete %sil -; SSE4-NEXT: orq %rsi, (%rdx) +; SSE4-NEXT: sete %al +; SSE4-NEXT: orq %rax, (%rdx) ; SSE4-NEXT: ptest %xmm1, %xmm1 ; SSE4-NEXT: jne .LBB0_2 ; SSE4-NEXT: # %bb.1: # %if.then -; SSE4-NEXT: xorq %r9, %rax -; SSE4-NEXT: xorq %rdi, %r8 -; SSE4-NEXT: xorl %edx, %edx -; SSE4-NEXT: orq 
%rax, %r8 -; SSE4-NEXT: sete %dl -; SSE4-NEXT: orq %rdx, (%rcx) +; SSE4-NEXT: orq %rax, (%rcx) ; SSE4-NEXT: .LBB0_2: # %if.end ; SSE4-NEXT: retq ; ; AVX2-LABEL: pr166534: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: movq (%rdi), %rax -; AVX2-NEXT: movq 8(%rdi), %r8 ; AVX2-NEXT: vmovdqu (%rdi), %xmm0 -; AVX2-NEXT: movq (%rsi), %rdi ; AVX2-NEXT: vpxor (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: movq 8(%rsi), %rsi -; AVX2-NEXT: xorl %r9d, %r9d +; AVX2-NEXT: xorl %eax, %eax ; AVX2-NEXT: vptest %xmm0, %xmm0 -; AVX2-NEXT: sete %r9b -; AVX2-NEXT: orq %r9, (%rdx) +; AVX2-NEXT: sete %al +; AVX2-NEXT: orq %rax, (%rdx) ; AVX2-NEXT: vptest %xmm0, %xmm0 ; AVX2-NEXT: jne .LBB0_2 ; AVX2-NEXT: # %bb.1: # %if.then -; AVX2-NEXT: xorq %rdi, %rax -; AVX2-NEXT: xorq %rsi, %r8 -; AVX2-NEXT: xorl %edx, %edx -; AVX2-NEXT: orq %rax, %r8 -; AVX2-NEXT: sete %dl -; AVX2-NEXT: orq %rdx, (%rcx) +; AVX2-NEXT: orq %rax, (%rcx) ; AVX2-NEXT: .LBB0_2: # %if.end ; AVX2-NEXT: retq ; ; AVX512-LABEL: pr166534: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: movq (%rdi), %rax -; AVX512-NEXT: movq 8(%rdi), %r8 ; AVX512-NEXT: vmovdqu (%rdi), %xmm0 -; AVX512-NEXT: movq (%rsi), %r9 -; AVX512-NEXT: movq 8(%rsi), %rdi ; AVX512-NEXT: vpxor (%rsi), %xmm0, %xmm0 -; AVX512-NEXT: xorl %esi, %esi +; AVX512-NEXT: xorl %eax, %eax ; AVX512-NEXT: vptest %xmm0, %xmm0 -; AVX512-NEXT: sete %sil -; AVX512-NEXT: orq %rsi, (%rdx) +; AVX512-NEXT: sete %al +; AVX512-NEXT: orq %rax, (%rdx) ; AVX512-NEXT: vptest %xmm0, %xmm0 ; AVX512-NEXT: jne .LBB0_2 ; AVX512-NEXT: # %bb.1: # %if.then -; AVX512-NEXT: xorq %r9, %rax -; AVX512-NEXT: xorq %rdi, %r8 -; AVX512-NEXT: xorl %edx, %edx -; AVX512-NEXT: orq %rax, %r8 -; AVX512-NEXT: sete %dl -; AVX512-NEXT: orq %rdx, (%rcx) +; AVX512-NEXT: orq %rax, (%rcx) ; AVX512-NEXT: .LBB0_2: # %if.end ; AVX512-NEXT: retq entry: