From 88cfecfd390136d107a3d777ef3ec1c7c8ce5edf Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 30 Oct 2025 08:23:10 +0000 Subject: [PATCH 1/2] [X86] combinePTESTCC - fold PTESTZ(X,SIGNMASK) -> VTESTPD/PSZ(X,X) on AVX targets If the PTEST is just using the ZF result and one of the operands is an i32/i64 sign mask, we can use the TESTPD/PS instructions instead and avoid the use of an extra constant. Fixes some codegen identified in #156233 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 20 +++++ llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll | 84 ++++--------------- 2 files changed, 38 insertions(+), 66 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 624cff24ddf03..60ee2291d47e5 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -48859,6 +48859,26 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, if (ISD::isBuildVectorAllOnes(Op1.getNode())) return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0); + // Attempt to convert PTESTZ(X,SIGNMASK) -> VTESTPD/PSZ(X,X) on AVX targets. + if (EFLAGS.getOpcode() == X86ISD::PTEST && Subtarget.hasAVX()) { + KnownBits KnownOp1 = DAG.computeKnownBits(Op1); + assert(KnownOp1.getBitWidth() == 64 && + "Illegal PTEST vector element width"); + if (KnownOp1.isConstant()) { + const APInt &Mask = KnownOp1.getConstant(); + if (Mask.isSignMask()) { + MVT FpVT = MVT::getVectorVT(MVT::f64, OpVT.getSizeInBits() / 64); + Op0 = DAG.getBitcast(FpVT, Op0); + return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Op0, Op0); + } + if (Mask.isSplat(32) && Mask.trunc(32).isSignMask()) { + MVT FpVT = MVT::getVectorVT(MVT::f32, OpVT.getSizeInBits() / 32); + Op0 = DAG.getBitcast(FpVT, Op0); + return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Op0, Op0); + } + } + } + // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y) // TODO: Add COND_NE handling? 
if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) { diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll index 9816fa7c83560..044327d94c0ef 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll @@ -875,28 +875,12 @@ define i1 @mask_v8i32(<8 x i32> %a0) { ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX1-LABEL: mask_v8i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: sete %al -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: mask_v8i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: sete %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: mask_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: mask_v8i32: +; AVX: # %bb.0: +; AVX-NEXT: vtestps %ymm0, %ymm0 +; AVX-NEXT: sete %al +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0) %2 = and i32 %1, 2147483648 %3 = icmp eq i32 %2, 0 @@ -965,28 +949,12 @@ define i1 @signtest_v8i32(<8 x i32> %a0) { ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX1-LABEL: signtest_v8i32: -; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: sete %al -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: signtest_v8i32: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: sete %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: 
signtest_v8i32: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372039002259456,9223372039002259456,9223372039002259456,9223372039002259456] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: signtest_v8i32: +; AVX: # %bb.0: +; AVX-NEXT: vtestps %ymm0, %ymm0 +; AVX-NEXT: sete %al +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %1 = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %a0) %2 = icmp sgt i32 %1, -1 ret i1 %2 @@ -1010,28 +978,12 @@ define i1 @signtest_v4i64(<4 x i64> %a0) { ; SSE41-NEXT: sete %al ; SSE41-NEXT: retq ; -; AVX1-LABEL: signtest_v4i64: -; AVX1: # %bb.0: -; AVX1-NEXT: vptest {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0 -; AVX1-NEXT: sete %al -; AVX1-NEXT: vzeroupper -; AVX1-NEXT: retq -; -; AVX2-LABEL: signtest_v4i64: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX2-NEXT: vptest %ymm1, %ymm0 -; AVX2-NEXT: sete %al -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq -; -; AVX512-LABEL: signtest_v4i64: -; AVX512: # %bb.0: -; AVX512-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808] -; AVX512-NEXT: vptest %ymm1, %ymm0 -; AVX512-NEXT: sete %al -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: signtest_v4i64: +; AVX: # %bb.0: +; AVX-NEXT: vtestpd %ymm0, %ymm0 +; AVX-NEXT: sete %al +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq %1 = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %a0) %2 = icmp sgt i64 %1, -1 ret i1 %2 From 547cfaebba394e9af93a6bbce043816c4bddd0be Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 30 Oct 2025 10:49:07 +0000 Subject: [PATCH 2/2] Freeze duplicated operand --- llvm/lib/Target/X86/X86ISelLowering.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 
60ee2291d47e5..6165462afd833 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -48868,12 +48868,12 @@ static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, const APInt &Mask = KnownOp1.getConstant(); if (Mask.isSignMask()) { MVT FpVT = MVT::getVectorVT(MVT::f64, OpVT.getSizeInBits() / 64); - Op0 = DAG.getBitcast(FpVT, Op0); + Op0 = DAG.getBitcast(FpVT, DAG.getFreeze(Op0)); return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Op0, Op0); } if (Mask.isSplat(32) && Mask.trunc(32).isSignMask()) { MVT FpVT = MVT::getVectorVT(MVT::f32, OpVT.getSizeInBits() / 32); - Op0 = DAG.getBitcast(FpVT, Op0); + Op0 = DAG.getBitcast(FpVT, DAG.getFreeze(Op0)); return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Op0, Op0); } }