Skip to content

[x86] Missing patterns to utilize "vtest" instructions fully. #156233

@ActuallyaDeviloper

Description

@ActuallyaDeviloper

Hello,

while investigating the ability to use generic SIMD code to generate well-optimized x86, I was at first pleasantly surprised that "vtest" can be successfully generated. I also found some glaring missing cases, though...

#include <immintrin.h>
#include <stdint.h>

void A();

using T = __attribute__((vector_size(32))) int32_t;

// Calls A() when at least one element of (a & ~b) is non-zero, i.e. when
// some bit is set in a whose corresponding bit in b is clear.
void okA(T a, T b) {
    if (__builtin_reduce_or(a & (~b)) != 0)
        A();
}
// Calls A() when at least one element of (a & b) is non-zero, i.e. when
// a and b share at least one set bit.
void okB(T a, T b) {
    if (__builtin_reduce_or(a & b) != 0)
        A();
}
// Calls A() unless every bit of every element of (a & b) is set: the AND
// reduction only equals -1 (all ones) when each element is all ones.
void okC(T a, T b) {
    if (__builtin_reduce_and(a & b) != -1)
        A();
}
// Calls A() when at least one element of (a & b) is negative, i.e. has its
// sign bit set. The element-wise "< 0" yields a mask vector whose OR
// reduction is non-zero if any element compared true.
void okD(T a, T b) {
    if (__builtin_reduce_or((a & b) < 0))
        A();
}
// Two chained tests that both end in A(): first whether any element of
// (a & ~b) is non-zero, otherwise whether any element of a is non-zero.
// Because a non-zero (a & ~b) implies a non-zero a, A() is effectively
// called exactly when a has any bit set.
void badA(T a, T b) {
    if (__builtin_reduce_or(a & (~b)) != 0) // Expect single "vtest" + "jae".
        A();
    else if (__builtin_reduce_or(a) != 0) // Expect single "vtest" + "jae".
        A();
}
// Two chained tests on the same operand pair that both end in A(): first
// whether any element of (a & ~b) is non-zero, otherwise whether any element
// of (a & b) is non-zero. Together the two masks cover every bit of a, so
// A() is effectively called exactly when a has any bit set.
void badB(T a, T b) {
    if (__builtin_reduce_or(a & (~b)) != 0) // Expect single "vtest" + "jae".
        A();
    else if (__builtin_reduce_or(a & b) != 0)  // Expect single "jne".
        A();
}
// Calls A() only when both conditions hold: some element of (a & ~b) is
// non-zero AND some element of (a & b) is non-zero. Both tests use the same
// (a, b) operand pair.
void badC(T a, T b) {
    if ((__builtin_reduce_or(a & (~b)) != 0) && (__builtin_reduce_or(a & b) != 0)) // Expect single "vtest" + "ja".
        A();
}
// Calls A() when at least one element of a has bit 31 (mask 0x80000000, the
// sign bit of an int32_t element) set. The parameter b is not used.
void badD(T a, T b) {
    if (__builtin_reduce_or(a & 0x80000000) != 0) // Expect single "vtestps" + "jae".
        A();
}
// Calls A() when at least one element of (a & b) has bit 31 (mask
// 0x80000000, the sign bit of an int32_t element) set.
void badE(T a, T b) {
    if (__builtin_reduce_or((a & b) & 0x80000000) != 0) // Expect single "vtestps" + "jae".
        A();
}
// Calls A() when at least one element of (~a & b) has bit 31 (mask
// 0x80000000) set, i.e. some element has its sign bit clear in a but set
// in b.
void badF(T a, T b) {
    if (__builtin_reduce_or(((~a) & b) & 0x80000000) != 0) // Expect single "vtestps" + "jne".
        A();
}
// More combinations are feasible. The same applies to "vtestpd".

Especially for the variants "badA" to "badC", the generated code is atrocious for no reason apparent to me.

okA(int vector[8], int vector[8]):
        vptest  ymm1, ymm0
        vzeroupper
        jae     A()@PLT
        ret

okB(int vector[8], int vector[8]):
        vptest  ymm1, ymm0
        vzeroupper
        jne     A()@PLT
        ret

okC(int vector[8], int vector[8]):
        vpand   ymm0, ymm1, ymm0
        vpcmpeqd        ymm1, ymm1, ymm1
        vptest  ymm0, ymm1
        vzeroupper
        jae     A()@PLT
        ret

okD(int vector[8], int vector[8]):
        vtestps ymm1, ymm0
        vzeroupper
        jne     A()@PLT
        ret

badA(int vector[8], int vector[8]):
        vpandn  ymm1, ymm1, ymm0
        vextracti128    xmm2, ymm1, 1
        vpor    xmm1, xmm1, xmm2
        vpshufd xmm2, xmm1, 238
        vpor    xmm1, xmm1, xmm2
        vpshufd xmm2, xmm1, 85
        vpor    xmm1, xmm1, xmm2
        vmovd   eax, xmm1
        vextracti128    xmm1, ymm0, 1
        vpor    ymm0, ymm0, ymm1
        vpshufd xmm1, xmm0, 238
        vpor    ymm2, ymm0, ymm1
        vpor    xmm0, xmm0, xmm1
        vpshufd xmm0, xmm0, 85
        vpor    ymm0, ymm2, ymm0
        vmovd   ecx, xmm0
        or      ecx, eax
        vzeroupper
        jne     A()@PLT
        ret

badB(int vector[8], int vector[8]):
        vpandn  ymm2, ymm1, ymm0
        vextracti128    xmm3, ymm2, 1
        vpor    xmm2, xmm2, xmm3
        vpshufd xmm3, xmm2, 238
        vpor    xmm2, xmm2, xmm3
        vpshufd xmm3, xmm2, 85
        vpor    xmm2, xmm2, xmm3
        vmovd   eax, xmm2
        vpand   ymm0, ymm1, ymm0
        vextracti128    xmm1, ymm0, 1
        vpor    ymm0, ymm0, ymm1
        vpshufd xmm1, xmm0, 238
        vpor    ymm2, ymm0, ymm1
        vpor    xmm0, xmm0, xmm1
        vpshufd xmm0, xmm0, 85
        vpor    ymm0, ymm2, ymm0
        vmovd   ecx, xmm0
        or      ecx, eax
        vzeroupper
        jne     A()@PLT
        ret

badC(int vector[8], int vector[8]):
        vptest  ymm1, ymm0
        jb      .LBB6_2
        vpand   ymm0, ymm1, ymm0
        vextracti128    xmm1, ymm0, 1
        vpor    xmm0, xmm0, xmm1
        vpshufd xmm1, xmm0, 238
        vpor    xmm0, xmm0, xmm1
        vpshufd xmm1, xmm0, 85
        vpor    xmm0, xmm0, xmm1
        vmovd   eax, xmm0
        test    eax, eax
        vzeroupper
        jne     A()@PLT
.LBB6_2:
        vzeroupper
        ret

.LCPI7_0:
        .quad   -9223372034707292160
badD(int vector[8], int vector[8]):
        vpbroadcastq    ymm1, qword ptr [rip + .LCPI7_0]
        vptest  ymm0, ymm1
        vzeroupper
        jne     A()@PLT
        ret

.LCPI8_0:
        .quad   -9223372034707292160
badE(int vector[8], int vector[8]):
        vpand   ymm0, ymm0, ymm1
        vpbroadcastq    ymm1, qword ptr [rip + .LCPI8_0]
        vptest  ymm0, ymm1
        vzeroupper
        jne     A()@PLT
        ret

.LCPI9_0:
        .quad   -9223372034707292160
badF(int vector[8], int vector[8]):
        vpbroadcastq    ymm2, qword ptr [rip + .LCPI9_0]
        vpandn  ymm0, ymm0, ymm1
        vptest  ymm0, ymm2
        vzeroupper
        jne     A()@PLT
        ret

Direct Link.

Metadata

Metadata

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions