-
Notifications
You must be signed in to change notification settings - Fork 15.2k
Open
Description
Hello,
while investigating the ability to use generic SIMD code to generate well-optimized x86, I was at first pleasantly surprised that "vtest" can be successfully generated. However, I also found some glaring missing cases...
#include <immintrin.h>
#include <stdint.h>
// Opaque callee, declared only; each test function conditionally calls it.
void A();
// 32-byte vector of int32_t, i.e. 8 lanes (shown as "int vector[8]" in the listing).
using T = __attribute__((vector_size(32))) int32_t;
// Calls A() when any bit of `a & ~b` is set in any lane.
// The listing below shows this lowering to a single "vptest" + "jae".
void okA(T a, T b) {
if (__builtin_reduce_or(a & (~b)) != 0)
A();
}
// Calls A() when any bit of `a & b` is set in any lane.
// The listing below shows this lowering to a single "vptest" + "jne".
void okB(T a, T b) {
if (__builtin_reduce_or(a & b) != 0)
A();
}
// Calls A() unless the AND-reduction of `a & b` over all lanes is -1 (all bits set).
// The listing below shows "vpand", an all-ones "vpcmpeqd", then "vptest" + "jae".
void okC(T a, T b) {
if (__builtin_reduce_and(a & b) != -1)
A();
}
// Calls A() when any lane of `a & b` is negative, i.e. has its sign bit set.
// The listing below shows this lowering to a single "vtestps" + "jne".
void okD(T a, T b) {
if (__builtin_reduce_or((a & b) < 0))
A();
}
// Calls A() when `a & ~b` has any bit set, otherwise when `a` has any bit set.
// The listing below shows two full OR-reduction trees merged with "or" + "jne".
// NOTE(review): both branches call A(), so the function is equivalent to
// `__builtin_reduce_or(a) != 0`; the "jae" expectation on the second test
// looks copy-pasted from the first -- confirm the intended condition code.
void badA(T a, T b) {
if (__builtin_reduce_or(a & (~b)) != 0) // Expect single "vtest" + "jae".
A();
else if (__builtin_reduce_or(a) != 0) // Expect single "vtest" + "jae".
A();
}
// Calls A() when `a & ~b` has any bit set, otherwise when `a & b` has any bit set.
// The second test is expected to reuse the flags of the first "vtest".
// The listing below shows two full OR-reduction trees merged with "or" + "jne".
// NOTE(review): both branches call A(), so the two tests together are
// equivalent to `__builtin_reduce_or(a) != 0`.
void badB(T a, T b) {
if (__builtin_reduce_or(a & (~b)) != 0) // Expect single "vtest" + "jae".
A();
else if (__builtin_reduce_or(a & b) != 0) // Expect single "jne".
A();
}
// Calls A() only when both `a & ~b` and `a & b` have some bit set.
// The listing below shows "vptest" + "jb" followed by a full OR-reduction
// tree for `a & b` instead of a single flag test.
void badC(T a, T b) {
if ((__builtin_reduce_or(a & (~b)) != 0) && (__builtin_reduce_or(a & b) != 0)) // Expect single "vtest" + "ja".
A();
}
// Calls A() when any lane of `a` has its sign bit (0x80000000) set; `b` is unused.
// The listing below shows a broadcast sign-mask constant plus "vptest" + "jne".
// NOTE(review): expectation changed from "jae" to "jne" -- a sign-bit AND test
// (here `a` against itself) reports "some bit set" through ZF, as in okD.
void badD(T a, T b) {
if (__builtin_reduce_or(a & 0x80000000) != 0) // Expect single "vtestps" + "jne".
A();
}
// Calls A() when any lane of `a & b` has its sign bit (0x80000000) set.
// The listing below shows "vpand", a broadcast sign-mask constant, then
// "vptest" + "jne".
// NOTE(review): expectation changed from "jae" to "jne" -- this is the same
// test as okD, whose listing uses "vtestps" + "jne".
void badE(T a, T b) {
if (__builtin_reduce_or((a & b) & 0x80000000) != 0) // Expect single "vtestps" + "jne".
A();
}
// Calls A() when any lane of `~a & b` has its sign bit (0x80000000) set.
// The listing below shows a broadcast sign-mask constant, "vpandn", then
// "vptest" + "jne".
// NOTE(review): expectation changed from "jne" to "jae" -- the and-not form
// reports through the carry flag, as in okA ("vptest" + "jae").
void badF(T a, T b) {
if (__builtin_reduce_or(((~a) & b) & 0x80000000) != 0) // Expect single "vtestps" + "jae".
A();
}
// More combinations are feasible, and similarly with "vtestpd". Especially for the variants "badA" to "badC", the generated code is atrocious for no reason apparent to me.
okA(int vector[8], int vector[8]):
vptest ymm1, ymm0
vzeroupper
jae A()@PLT
ret
okB(int vector[8], int vector[8]):
vptest ymm1, ymm0
vzeroupper
jne A()@PLT
ret
okC(int vector[8], int vector[8]):
vpand ymm0, ymm1, ymm0
vpcmpeqd ymm1, ymm1, ymm1
vptest ymm0, ymm1
vzeroupper
jae A()@PLT
ret
okD(int vector[8], int vector[8]):
vtestps ymm1, ymm0
vzeroupper
jne A()@PLT
ret
badA(int vector[8], int vector[8]):
vpandn ymm1, ymm1, ymm0
vextracti128 xmm2, ymm1, 1
vpor xmm1, xmm1, xmm2
vpshufd xmm2, xmm1, 238
vpor xmm1, xmm1, xmm2
vpshufd xmm2, xmm1, 85
vpor xmm1, xmm1, xmm2
vmovd eax, xmm1
vextracti128 xmm1, ymm0, 1
vpor ymm0, ymm0, ymm1
vpshufd xmm1, xmm0, 238
vpor ymm2, ymm0, ymm1
vpor xmm0, xmm0, xmm1
vpshufd xmm0, xmm0, 85
vpor ymm0, ymm2, ymm0
vmovd ecx, xmm0
or ecx, eax
vzeroupper
jne A()@PLT
ret
badB(int vector[8], int vector[8]):
vpandn ymm2, ymm1, ymm0
vextracti128 xmm3, ymm2, 1
vpor xmm2, xmm2, xmm3
vpshufd xmm3, xmm2, 238
vpor xmm2, xmm2, xmm3
vpshufd xmm3, xmm2, 85
vpor xmm2, xmm2, xmm3
vmovd eax, xmm2
vpand ymm0, ymm1, ymm0
vextracti128 xmm1, ymm0, 1
vpor ymm0, ymm0, ymm1
vpshufd xmm1, xmm0, 238
vpor ymm2, ymm0, ymm1
vpor xmm0, xmm0, xmm1
vpshufd xmm0, xmm0, 85
vpor ymm0, ymm2, ymm0
vmovd ecx, xmm0
or ecx, eax
vzeroupper
jne A()@PLT
ret
badC(int vector[8], int vector[8]):
vptest ymm1, ymm0
jb .LBB6_2
vpand ymm0, ymm1, ymm0
vextracti128 xmm1, ymm0, 1
vpor xmm0, xmm0, xmm1
vpshufd xmm1, xmm0, 238
vpor xmm0, xmm0, xmm1
vpshufd xmm1, xmm0, 85
vpor xmm0, xmm0, xmm1
vmovd eax, xmm0
test eax, eax
vzeroupper
jne A()@PLT
.LBB6_2:
vzeroupper
ret
.LCPI7_0:
.quad -9223372034707292160
badD(int vector[8], int vector[8]):
vpbroadcastq ymm1, qword ptr [rip + .LCPI7_0]
vptest ymm0, ymm1
vzeroupper
jne A()@PLT
ret
.LCPI8_0:
.quad -9223372034707292160
badE(int vector[8], int vector[8]):
vpand ymm0, ymm0, ymm1
vpbroadcastq ymm1, qword ptr [rip + .LCPI8_0]
vptest ymm0, ymm1
vzeroupper
jne A()@PLT
ret
.LCPI9_0:
.quad -9223372034707292160
badF(int vector[8], int vector[8]):
vpbroadcastq ymm2, qword ptr [rip + .LCPI9_0]
vpandn ymm0, ymm0, ymm1
vptest ymm0, ymm2
vzeroupper
jne A()@PLT
ret