From 0173be88666d3d074d72c0f69c91782d84b14c99 Mon Sep 17 00:00:00 2001
From: Kavin Gnanapandithan
Date: Thu, 18 Sep 2025 21:02:49 -0400
Subject: [PATCH 1/3] Added ISD::VECTOR_COMPRESS handling in computeKnownBits/ComputeNumSignBits with test coverage

---
 .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp |  34 +++
 llvm/test/CodeGen/AArch64/vector-compress.ll  | 113 ++++++++++
 llvm/test/CodeGen/X86/vector-compress.ll      | 209 ++++++++++++++++++
 3 files changed, 356 insertions(+)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index bcf25958d0982..ab0411eae9549 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -58,6 +58,7 @@
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/Compiler.h"
@@ -3480,6 +3481,26 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
       break;
     }
     break;
+  case ISD::VECTOR_COMPRESS: {
+    assert(!Op.getValueType().isScalableVector());
+
+    SDValue Vec = Op.getOperand(0);
+    SDValue PassThru = Op.getOperand(2);
+    // If PassThru is undefined, early out
+    if (PassThru.isUndef())
+      break;
+
+    Known.Zero.setAllBits();
+    Known.One.setAllBits();
+    Known2 = computeKnownBits(PassThru, Depth + 1);
+    Known = Known.intersectWith(Known2);
+    // If we don't know any bits, early out.
+    if (Known.isUnknown())
+      break;
+    Known2 = computeKnownBits(Vec, Depth + 1);
+    Known = Known.intersectWith(Known2);
+    break;
+  }
   case ISD::VECTOR_SHUFFLE: {
     assert(!Op.getValueType().isScalableVector());
     // Collect the known bits that are shared by every vector element referenced
@@ -4792,6 +4813,19 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
     }
     return Tmp;
 
+  case ISD::VECTOR_COMPRESS: {
+    SDValue Vec = Op.getOperand(0);
+    SDValue Mask = Op.getOperand(1);
+    SDValue PassThru = Op.getOperand(2);
+    // If PassThru is undefined, early out.
+    if (PassThru.isUndef())
+      return 1;
+    Tmp = ComputeNumSignBits(Vec, Depth + 1);
+    Tmp2 = ComputeNumSignBits(PassThru, Depth + 1);
+    Tmp = std::min(Tmp, Tmp2);
+    return Tmp;
+  }
+
   case ISD::VECTOR_SHUFFLE: {
     // Collect the minimum number of sign bits that are shared by every vector
     // element referenced by the shuffle.
diff --git a/llvm/test/CodeGen/AArch64/vector-compress.ll b/llvm/test/CodeGen/AArch64/vector-compress.ll index 67a0379d05244..9165493863729 100644 --- a/llvm/test/CodeGen/AArch64/vector-compress.ll +++ b/llvm/test/CodeGen/AArch64/vector-compress.ll @@ -471,3 +471,116 @@ define <3 x i3> @test_compress_narrow_illegal_element_type(<3 x i3> %vec, <3 x i %out = call <3 x i3> @llvm.experimental.vector.compress(<3 x i3> %vec, <3 x i1> %mask, <3 x i3> undef) ret <3 x i3> %out } + +define <4 x i32> @test_compress_knownbits_zext_v4i16_4i32(<4 x i16> %vec, <4 x i1> %mask, <4 x i32> %passthru) nounwind { +; CHECK-LABEL: test_compress_knownbits_zext_v4i16_4i32: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: ushll.4s v1, v1, #0 +; CHECK-NEXT: movi.4s v3, #1 +; CHECK-NEXT: mov x14, sp +; CHECK-NEXT: movi.4s v4, #3 +; CHECK-NEXT: ushll.4s v0, v0, #0 +; CHECK-NEXT: mov x13, sp +; CHECK-NEXT: mov x12, sp +; CHECK-NEXT: mov x15, sp +; CHECK-NEXT: shl.4s v1, v1, #31 +; CHECK-NEXT: and.16b v2, v2, v4 +; CHECK-NEXT: cmlt.4s v1, v1, #0 +; CHECK-NEXT: str q2, [sp] +; CHECK-NEXT: and.16b v3, v1, v3 +; CHECK-NEXT: mov.s w8, v1[1] +; CHECK-NEXT: mov.s w9, v1[2] +; CHECK-NEXT: mov.s w10, v1[3] +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: addv.4s s1, v3 +; CHECK-NEXT: and x16, x11, #0x1 +; CHECK-NEXT: and x8, x8, #0x1 +; CHECK-NEXT: bfi x14, x11, #2, #1 +; CHECK-NEXT: add x8, x16, x8 +; CHECK-NEXT: and x9, x9, #0x1 +; CHECK-NEXT: and x10, x10, #0x1 +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: add x9, x8, x9 +; CHECK-NEXT: mov w16, #3 ; =0x3 +; CHECK-NEXT: add x10, x9, x10 +; CHECK-NEXT: orr x8, x12, x8, lsl #2 +; CHECK-NEXT: bfi x15, x9, #2, #2 +; CHECK-NEXT: cmp x10, #3 +; CHECK-NEXT: bfi x13, x11, #2, #2 +; CHECK-NEXT: mov.s w11, v0[3] +; CHECK-NEXT: csel x9, x10, x16, lo +; CHECK-NEXT: ldr w13, [x13] +; CHECK-NEXT: str s0, [sp] +; CHECK-NEXT: st1.s { v0 }[1], [x14] +; CHECK-NEXT: st1.s { v0 }[2], [x8] +; CHECK-NEXT: orr x8, x12, x9, lsl #2 +; CHECK-NEXT: csel w9, w11, w13, hi +; CHECK-NEXT: st1.s { v0 }[3], [x15] +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret +entry: + %xvec = zext <4 x i16> %vec to <4 x i32> + %xpassthru = and <4 x i32> %passthru, splat (i32 3) + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %xvec, <4 x i1> %mask, <4 x i32> %xpassthru) + %res = and <4 x i32> %out, splat (i32 65535) + ret <4 x i32> %res +} + +define <4 x i32> @test_compress_numsignbits_sext_v4i16_4i32(<4 x i16> %vec, <4 x i1> %mask, <4 x i32> %passthru) nounwind { +; CHECK-LABEL: test_compress_numsignbits_sext_v4i16_4i32: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: ushll.4s v1, v1, #0 +; CHECK-NEXT: movi.4s v3, #1 +; CHECK-NEXT: mov x14, sp +; CHECK-NEXT: movi.4s v4, #3 +; CHECK-NEXT: sshll.4s v0, v0, #0 +; CHECK-NEXT: mov x13, sp +; CHECK-NEXT: mov x12, sp +; CHECK-NEXT: mov x15, sp +; CHECK-NEXT: shl.4s v1, v1, #31 +; CHECK-NEXT: and.16b v2, v2, v4 +; CHECK-NEXT: cmlt.4s v1, v1, #0 +; CHECK-NEXT: str q2, [sp] +; CHECK-NEXT: and.16b v3, v1, v3 +; CHECK-NEXT: mov.s w8, v1[1] +; CHECK-NEXT: mov.s w9, v1[2] +; CHECK-NEXT: mov.s w10, v1[3] +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: addv.4s s1, v3 +; CHECK-NEXT: and x16, x11, #0x1 +; CHECK-NEXT: and x8, x8, #0x1 +; CHECK-NEXT: bfi x14, x11, #2, #1 +; CHECK-NEXT: add x8, x16, x8 +; CHECK-NEXT: and x9, x9, #0x1 +; CHECK-NEXT: and x10, x10, #0x1 +; CHECK-NEXT: fmov w11, s1 +; CHECK-NEXT: add x9, x8, x9 +; CHECK-NEXT: mov w16, #3 ; =0x3 +; CHECK-NEXT: add x10, x9, x10 +; CHECK-NEXT: 
orr x8, x12, x8, lsl #2 +; CHECK-NEXT: bfi x15, x9, #2, #2 +; CHECK-NEXT: cmp x10, #3 +; CHECK-NEXT: bfi x13, x11, #2, #2 +; CHECK-NEXT: mov.s w11, v0[3] +; CHECK-NEXT: csel x9, x10, x16, lo +; CHECK-NEXT: ldr w13, [x13] +; CHECK-NEXT: str s0, [sp] +; CHECK-NEXT: st1.s { v0 }[1], [x14] +; CHECK-NEXT: st1.s { v0 }[2], [x8] +; CHECK-NEXT: orr x8, x12, x9, lsl #2 +; CHECK-NEXT: csel w9, w11, w13, hi +; CHECK-NEXT: st1.s { v0 }[3], [x15] +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: ldr q0, [sp], #16 +; CHECK-NEXT: ret +entry: + %xvec = sext <4 x i16> %vec to <4 x i32> + %xpassthru = and <4 x i32> %passthru, splat(i32 3) + %out = call <4 x i32> @llvm.experimental.vector.compress(<4 x i32> %xvec, <4 x i1> %mask, <4 x i32> %xpassthru) + %shl = shl <4 x i32> %out, splat(i32 16) + %res = ashr <4 x i32> %shl, splat(i32 16) + ret <4 x i32> %res +} diff --git a/llvm/test/CodeGen/X86/vector-compress.ll b/llvm/test/CodeGen/X86/vector-compress.ll index 1ab1a1a01e168..ac932d51017ae 100644 --- a/llvm/test/CodeGen/X86/vector-compress.ll +++ b/llvm/test/CodeGen/X86/vector-compress.ll @@ -4427,6 +4427,215 @@ define <64 x i32> @test_compress_large(<64 x i1> %mask, <64 x i32> %vec, <64 x i ret <64 x i32> %out } +define <8 x i64> @test_compress_knownbits_zext_v8i16_8i64(<8 x i16> %vec, <8 x i1> %mask, <8 x i64> %passthru) nounwind { +; AVX2-LABEL: test_compress_knownbits_zext_v8i16_8i64: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $96, %rsp +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm5 = [3,3,3,3] +; AVX2-NEXT: vandps %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vandps %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovaps %ymm2, (%rsp) +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vpextrq $1, %xmm2, %rcx +; AVX2-NEXT: vmovq %xmm2, %rax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: andl $7, %eax +; AVX2-NEXT: vpextrw $1, %xmm1, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vmovd %xmm1, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: vpextrw $2, %xmm1, %esi +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: addq %rcx, %rsi +; AVX2-NEXT: vpextrw $3, %xmm1, %edi +; AVX2-NEXT: andl $1, %edi +; AVX2-NEXT: addq %rsi, %rdi +; AVX2-NEXT: vpextrw $4, %xmm1, %r8d +; AVX2-NEXT: andl $1, %r8d +; AVX2-NEXT: addq %rdi, %r8 +; AVX2-NEXT: vpextrw $5, %xmm1, %r9d +; AVX2-NEXT: andl $1, %r9d +; AVX2-NEXT: addq %r8, %r9 +; AVX2-NEXT: vpextrw $6, %xmm1, %r10d +; AVX2-NEXT: andl $1, %r10d +; AVX2-NEXT: addq %r9, %r10 +; AVX2-NEXT: vpextrw $7, %xmm1, %r11d +; AVX2-NEXT: andl $1, %r11d +; AVX2-NEXT: addq %r10, %r11 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rbx +; AVX2-NEXT: cmpq $8, %r11 
+; AVX2-NEXT: cmovbq (%rsp,%rax,8), %rbx +; AVX2-NEXT: vmovq %xmm4, (%rsp) +; AVX2-NEXT: vpextrq $1, %xmm4, (%rsp,%rdx,8) +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%rsp,%rcx,8) +; AVX2-NEXT: vpextrq $1, %xmm2, (%rsp,%rsi,8) +; AVX2-NEXT: andl $7, %edi +; AVX2-NEXT: vmovq %xmm0, (%rsp,%rdi,8) +; AVX2-NEXT: andl $7, %r8d +; AVX2-NEXT: vpextrq $1, %xmm0, (%rsp,%r8,8) +; AVX2-NEXT: andl $7, %r9d +; AVX2-NEXT: vmovq %xmm1, (%rsp,%r9,8) +; AVX2-NEXT: andl $7, %r10d +; AVX2-NEXT: vpextrq $1, %xmm1, (%rsp,%r10,8) +; AVX2-NEXT: cmpq $7, %r11 +; AVX2-NEXT: movl $7, %eax +; AVX2-NEXT: cmovbq %r11, %rax +; AVX2-NEXT: movl %eax, %eax +; AVX2-NEXT: movq %rbx, (%rsp,%rax,8) +; AVX2-NEXT: vmovaps (%rsp), %ymm0 +; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX2-NEXT: leaq -8(%rbp), %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_compress_knownbits_zext_v8i16_8i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: test_compress_knownbits_zext_v8i16_8i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX512VL-NEXT: vpmovw2m %xmm1, %k1 +; AVX512VL-NEXT: vpmovzxwq {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512VL-NEXT: retq + %xvec = zext <8 x i16> %vec to <8 x i64> ; 0 -> 65535 + %xpassthru = and <8 x i64> %passthru, splat (i64 3) ; 0 -> 3 + %out = call <8 x i64> @llvm.experimental.vector.compress(<8 x i64> %xvec, <8 x i1> %mask, <8 x i64> %xpassthru) + %res = and <8 x i64> %out, splat (i64 65535) ; unnecessary - %out guaranteed to be 0 -> 65535 + ret <8 x i64> %res +} + +define <8 x i64> @test_compress_knownbits_sext_v8i16_8i64(<8 x i16> %vec, <8 x i1> %mask, <8 x i64> %passthru) nounwind { +; AVX2-LABEL: test_compress_knownbits_sext_v8i16_8i64: +; AVX2: # %bb.0: +; AVX2-NEXT: pushq %rbp +; AVX2-NEXT: movq %rsp, %rbp +; AVX2-NEXT: pushq %rbx +; AVX2-NEXT: andq $-32, %rsp +; AVX2-NEXT: subq $96, %rsp +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm4 +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 +; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm5 = [3,3,3,3] +; AVX2-NEXT: vandps %ymm5, %ymm2, %ymm2 +; AVX2-NEXT: vandps %ymm5, %ymm3, %ymm3 +; AVX2-NEXT: vmovaps %ymm3, {{[0-9]+}}(%rsp) +; AVX2-NEXT: vmovaps %ymm2, (%rsp) +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm3 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3] +; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero +; AVX2-NEXT: vpaddq %ymm2, %ymm3, %ymm2 +; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm3 +; AVX2-NEXT: vpaddq %xmm3, %xmm2, %xmm2 +; 
AVX2-NEXT: vpextrq $1, %xmm2, %rcx +; AVX2-NEXT: vmovq %xmm2, %rax +; AVX2-NEXT: addl %ecx, %eax +; AVX2-NEXT: andl $7, %eax +; AVX2-NEXT: vpextrw $1, %xmm1, %ecx +; AVX2-NEXT: andl $1, %ecx +; AVX2-NEXT: vmovd %xmm1, %edx +; AVX2-NEXT: andl $1, %edx +; AVX2-NEXT: addq %rdx, %rcx +; AVX2-NEXT: vpextrw $2, %xmm1, %esi +; AVX2-NEXT: andl $1, %esi +; AVX2-NEXT: addq %rcx, %rsi +; AVX2-NEXT: vpextrw $3, %xmm1, %edi +; AVX2-NEXT: andl $1, %edi +; AVX2-NEXT: addq %rsi, %rdi +; AVX2-NEXT: vpextrw $4, %xmm1, %r8d +; AVX2-NEXT: andl $1, %r8d +; AVX2-NEXT: addq %rdi, %r8 +; AVX2-NEXT: vpextrw $5, %xmm1, %r9d +; AVX2-NEXT: andl $1, %r9d +; AVX2-NEXT: addq %r8, %r9 +; AVX2-NEXT: vpextrw $6, %xmm1, %r10d +; AVX2-NEXT: andl $1, %r10d +; AVX2-NEXT: addq %r9, %r10 +; AVX2-NEXT: vpextrw $7, %xmm1, %r11d +; AVX2-NEXT: andl $1, %r11d +; AVX2-NEXT: addq %r10, %r11 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpextrq $1, %xmm1, %rbx +; AVX2-NEXT: cmpq $8, %r11 +; AVX2-NEXT: cmovbq (%rsp,%rax,8), %rbx +; AVX2-NEXT: vmovq %xmm4, (%rsp) +; AVX2-NEXT: vpextrq $1, %xmm4, (%rsp,%rdx,8) +; AVX2-NEXT: vextracti128 $1, %ymm4, %xmm2 +; AVX2-NEXT: vmovq %xmm2, (%rsp,%rcx,8) +; AVX2-NEXT: vpextrq $1, %xmm2, (%rsp,%rsi,8) +; AVX2-NEXT: andl $7, %edi +; AVX2-NEXT: vmovq %xmm0, (%rsp,%rdi,8) +; AVX2-NEXT: andl $7, %r8d +; AVX2-NEXT: vpextrq $1, %xmm0, (%rsp,%r8,8) +; AVX2-NEXT: andl $7, %r9d +; AVX2-NEXT: vmovq %xmm1, (%rsp,%r9,8) +; AVX2-NEXT: andl $7, %r10d +; AVX2-NEXT: vpextrq $1, %xmm1, (%rsp,%r10,8) +; AVX2-NEXT: cmpq $7, %r11 +; AVX2-NEXT: movl $7, %eax +; AVX2-NEXT: cmovbq %r11, %rax +; AVX2-NEXT: movl %eax, %eax +; AVX2-NEXT: movq %rbx, (%rsp,%rax,8) +; AVX2-NEXT: vmovaps (%rsp), %ymm0 +; AVX2-NEXT: vmovaps {{[0-9]+}}(%rsp), %ymm1 +; AVX2-NEXT: leaq -8(%rbp), %rsp +; AVX2-NEXT: popq %rbx +; AVX2-NEXT: popq %rbp +; AVX2-NEXT: retq +; +; AVX512F-LABEL: test_compress_knownbits_sext_v8i16_8i64: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 +; AVX512F-NEXT: vpsllq $63, %zmm1, %zmm1 +; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1 +; AVX512F-NEXT: vpmovsxwq %xmm0, %zmm1 +; AVX512F-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: test_compress_knownbits_sext_v8i16_8i64: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpsllw $15, %xmm1, %xmm1 +; AVX512VL-NEXT: vpmovw2m %xmm1, %k1 +; AVX512VL-NEXT: vpmovsxwq %xmm0, %zmm1 +; AVX512VL-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm2, %zmm0 +; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} +; AVX512VL-NEXT: retq + %xvec = sext <8 x i16> %vec to <8 x i64> ; sign extend vec + %xpassthru = and <8 x i64> %passthru, splat(i64 3) + %out = call <8 x i64> @llvm.experimental.vector.compress(<8 x i64> %xvec, <8 x i1> %mask, <8 x i64> %xpassthru) + %shl = shl <8 x i64> %out, splat(i64 48) + %res = ashr <8 x i64> %shl, splat(i64 48) + ret <8 x i64> %res +} + define <4 x i32> @test_compress_all_const() nounwind { ; AVX2-LABEL: test_compress_all_const: ; AVX2: # %bb.0: From 3ad285b58a9014410c8f5caf2145116c1a4c645d Mon Sep 17 00:00:00 2001 From: Kavin Gnanapandithan Date: Sat, 20 Sep 2025 18:35:44 -0400 Subject: [PATCH 2/3] Updated knownbit/numsignbit for VECTOR_COMPRESS & added tests for SVE/RVV --- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 17 +++------ .../CodeGen/AArch64/sve-vector-compress.ll | 36 +++++++++++++++++++ .../test/CodeGen/RISCV/rvv/vector-compress.ll | 33 +++++++++++++++++ 3 files changed, 73 insertions(+), 13 deletions(-) diff --git 
a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index ab0411eae9549..b5bbcafaab183 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -58,7 +58,6 @@
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Type.h"
-#include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/Compiler.h"
@@ -3482,18 +3481,12 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
     }
     break;
   case ISD::VECTOR_COMPRESS: {
-    assert(!Op.getValueType().isScalableVector());
-
     SDValue Vec = Op.getOperand(0);
     SDValue PassThru = Op.getOperand(2);
-    // If PassThru is undefined, early out
-    if (PassThru.isUndef())
-      break;
 
     Known.Zero.setAllBits();
     Known.One.setAllBits();
-    Known2 = computeKnownBits(PassThru, Depth + 1);
-    Known = Known.intersectWith(Known2);
+    Known = computeKnownBits(PassThru, DemandedElts, Depth + 1);
     // If we don't know any bits, early out.
     if (Known.isUnknown())
       break;
@@ -4815,13 +4808,11 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
 
   case ISD::VECTOR_COMPRESS: {
     SDValue Vec = Op.getOperand(0);
-    SDValue Mask = Op.getOperand(1);
     SDValue PassThru = Op.getOperand(2);
-    // If PassThru is undefined, early out.
-    if (PassThru.isUndef())
+    Tmp = ComputeNumSignBits(PassThru, DemandedElts, Depth + 1);
+    if (Tmp == 1)
       return 1;
-    Tmp = ComputeNumSignBits(Vec, Depth + 1);
-    Tmp2 = ComputeNumSignBits(PassThru, Depth + 1);
+    Tmp2 = ComputeNumSignBits(Vec, Depth + 1);
     Tmp = std::min(Tmp, Tmp2);
     return Tmp;
   }
diff --git a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
index 198e0a37c56fa..cc3a3734a9721 100644
--- a/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
+++ b/llvm/test/CodeGen/AArch64/sve-vector-compress.ll
@@ -100,6 +100,42 @@ define @test_compress_illegal_element_type( %
   ret %out
 }
 
+define <vscale x 4 x i32> @test_compress_knownbits_zext(<vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru) nounwind {
+; CHECK-LABEL: test_compress_knownbits_zext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: and z0.s, z0.s, #0xffff
+; CHECK-NEXT: cntp x8, p0, p0.s
+; CHECK-NEXT: and z1.s, z1.s, #0x3
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT: ret
+  %xvec = zext <vscale x 4 x i16> %vec to <vscale x 4 x i32>
+  %xpassthru = and <vscale x 4 x i32> %passthru, splat (i32 3)
+  %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %xvec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %xpassthru)
+  %res = and <vscale x 4 x i32> %out, splat (i32 65535)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @test_compress_numsignbits_sext(<vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru) nounwind {
+; CHECK-LABEL: test_compress_numsignbits_sext:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: and z1.s, z1.s, #0x3
+; CHECK-NEXT: cntp x8, p0, p0.s
+; CHECK-NEXT: sxth z0.s, p1/m, z0.s
+; CHECK-NEXT: compact z0.s, p0, z0.s
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s
+; CHECK-NEXT: ret
+  %xvec = sext <vscale x 4 x i16> %vec to <vscale x 4 x i32>
+  %xpassthru = and <vscale x 4 x i32> %passthru, splat (i32 3)
+  %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %xvec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %xpassthru)
+  %shl = shl <vscale x 4 x i32> %out, splat (i32 16)
+  %res = ashr <vscale x 4 x i32> %shl, splat (i32 16)
+  ret <vscale x 4 x i32> %res
+}
+
 define @test_compress_large( %vec, %mask) {
 ; CHECK-LABEL: test_compress_large:
 ; CHECK: // %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-compress.ll b/llvm/test/CodeGen/RISCV/rvv/vector-compress.ll
index e06382b19c41a..6a3bfae0fb10c 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vector-compress.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/vector-compress.ll
@@ -346,6 +346,39 @@ define @vector_compress_nxv4i32_passthru( %
   ret %ret
 }
 
+define <vscale x 4 x i32> @test_compress_nxv4i32_knownbits(<vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru) nounwind {
+; CHECK-LABEL: test_compress_nxv4i32_knownbits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT: vzext.vf2 v12, v8
+; CHECK-NEXT: vand.vi v8, v10, 3
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma
+; CHECK-NEXT: vcompress.vm v8, v12, v0
+; CHECK-NEXT: ret
+  %xvec = zext <vscale x 4 x i16> %vec to <vscale x 4 x i32>
+  %xpassthru = and <vscale x 4 x i32> %passthru, splat (i32 3)
+  %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %xvec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %xpassthru)
+  %res = and <vscale x 4 x i32> %out, splat (i32 65535)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i32> @test_compress_nxv4i32_numsignbits(<vscale x 4 x i16> %vec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %passthru) nounwind {
+; CHECK-LABEL: test_compress_nxv4i32_numsignbits:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli a0, zero, e32, m2, ta, ma
+; CHECK-NEXT: vsext.vf2 v12, v8
+; CHECK-NEXT: vand.vi v8, v10, 3
+; CHECK-NEXT: vsetvli zero, zero, e32, m2, tu, ma
+; CHECK-NEXT: vcompress.vm v8, v12, v0
+; CHECK-NEXT: ret
+  %xvec = sext <vscale x 4 x i16> %vec to <vscale x 4 x i32>
+  %xpassthru = and <vscale x 4 x i32> %passthru, splat (i32 3)
+  %out = call <vscale x 4 x i32> @llvm.experimental.vector.compress(<vscale x 4 x i32> %xvec, <vscale x 4 x i1> %mask, <vscale x 4 x i32> %xpassthru)
+  %shl = shl <vscale x 4 x i32> %out, splat (i32 16)
+  %res = ashr <vscale x 4 x i32> %shl, splat (i32 16)
+  ret <vscale x 4 x i32> %res
+}
+
 define @vector_compress_nxv8i32( %data, %mask) {
 ; CHECK-LABEL: vector_compress_nxv8i32:
 ; CHECK: # %bb.0:

From d322cb27a1aa44866a479676f5886e80a757cde7 Mon Sep 17 00:00:00 2001
From: Kavin Gnanapandithan
Date: Sun, 21 Sep 2025 19:25:21 -0400
Subject: [PATCH 3/3] Removed setAllBits()

---
 llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index b5bbcafaab183..57060595e4107 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -3483,9 +3483,6 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
   case ISD::VECTOR_COMPRESS: {
     SDValue Vec = Op.getOperand(0);
     SDValue PassThru = Op.getOperand(2);
-
-    Known.Zero.setAllBits();
-    Known.One.setAllBits();
     Known = computeKnownBits(PassThru, DemandedElts, Depth + 1);
     // If we don't know any bits, early out.
     if (Known.isUnknown())