-
Notifications
You must be signed in to change notification settings - Fork 15.2k
Closed
Description
#include <emmintrin.h>
__m128i b;
// Reduced testcase: returns 0 when the low 64 bits of the global vector b are
// zero; otherwise returns 32-bit element 2 of b.
int square() {
// Only the low 64 bits of b are tested here.
if (_mm_cvtsi128_si64(b) == 0)
return 0;
// Immediate 238 = 0b11101110 selects source elements [2,3,2,3], so element 2
// of b ends up in the low lane of c.
__m128i c = _mm_shuffle_epi32(b, 238);
// Return the low 32 bits of c, i.e. element 2 of b.
return _mm_cvtsi128_si32(c);
}
Compile with -O2, and with -O2 -msse4.1.
Expected result: either the same output both times, or an SSE4.1 instruction in the -msse4.1 output.
Actual:
# Branchless variant of square(): the vector load/shuffle always executes and
# the result is replaced with 0 afterwards if the low qword of b is zero.
# NOTE(review): presumably the plain -O2 output (first of the two listings);
# the report does not label the listings -- confirm against the godbolt link.
_Z6squarev: # @_Z6squarev
xor ecx, ecx # ecx = 0, the value selected by cmove below
cmp qword ptr [rip + b], 0 # test the low 64 bits of b against zero
# Full 16-byte read of b, performed regardless of the compare result.
pshufd xmm0, xmmword ptr [rip + b], 238 # xmm0 = mem[2,3,2,3]
movd eax, xmm0 # eax = low lane of xmm0 = element 2 of b
cmove eax, ecx # low qword of b was zero -> return 0 instead
ret
# Storage for the global vector b: 16 zero-initialized bytes.
b:
.zero 16
# Branching variant of square(): element 2 of b is read with a scalar 32-bit
# load, and only when the low qword of b is non-zero. No vector instruction
# appears in this listing.
# NOTE(review): presumably the -O2 -msse4.1 output (second of the two
# listings); the report does not label the listings -- confirm against the
# godbolt link.
_Z6squarev: # @_Z6squarev
xor eax, eax # default return value 0
cmp qword ptr [rip + b], 0 # test the low 64 bits of b against zero
je .LBB0_2 # low qword is zero -> return 0 without loading
mov eax, dword ptr [rip + b+8] # eax = 32 bits at byte offset 8 = element 2 of b
.LBB0_2:
ret
# Storage for the global vector b: 16 zero-initialized bytes.
b:
.zero 16
https://godbolt.org/z/4qrGvo617
(testcase reduced from an experiment to check how thoroughly I can disprove assumptions that _mm_load_si128 is atomic)