Skip to content

[VectorCombine][X86] Failure to replace @llvm.x86.sse41.pblendvb with select #66513

@RKSimon

Description

@RKSimon

https://godbolt.org/z/n8qYT9hvc

In many cases we can replace SSE pblendvb intrinsics with select nodes, by determining that each condition element is a sign-extended compare result (or a logical combination of such results).

But in some circumstances the logic fails to simplify and we end up stuck with the pblendvb intrinsics, which prevents further generic folds from occurring.

#include <x86intrin.h>
// Reproducer: builds two blend masks purely from 32-bit signed compare
// results combined with bitwise AND/XOR, then uses them as the selector
// operand of two chained _mm_blendv_epi8 calls. Because every mask lane is
// all-ones or all-zeros, each blend could in principle be expressed as a
// generic vector select instead of the pblendvb intrinsic.
auto tricky(__m128i a, __m128i b, __m128i c, __m128i src) {
	// Valid (> 0) weights
	// Each compare is a per-32-bit-lane signed "greater than zero" test.
	__m128i aValid = _mm_cmpgt_epi32( a, _mm_setzero_si128() );
	__m128i bValid = _mm_cmpgt_epi32( b, _mm_setzero_si128() );
	__m128i cValid = _mm_cmpgt_epi32( c, _mm_setzero_si128() );
	__m128i bothValid = _mm_and_si128( aValid, bValid );
	// NOTE(review): despite the name, this is bothValid XOR cValid, not an
	// AND of all three masks - presumably deliberate for the reproducer.
	__m128i allValid = _mm_xor_si128( bothValid, cValid );

	// Force a / b
	// Both selector masks are still bitwise combinations of compare results.
	__m128i forceA = _mm_and_si128( allValid, aValid );
	__m128i forceB = _mm_and_si128( allValid, bValid );

	// Determine output
	// Start from src masked by bothValid, then blend in a and b in turn;
	// the second blend takes the result of the first as its base operand.
	__m128i out = _mm_and_si128( src, bothValid );
	out = _mm_blendv_epi8( out, a, forceA );
	out = _mm_blendv_epi8( out, b, forceB );    
	return out;
}
; IR for the C++ reproducer `tricky`: all vector arguments arrive as
; <2 x i64> (the __m128i representation), so every compare and blend is
; wrapped in bitcasts to and from the lane type it actually operates on.
define <2 x i64> @tricky(<2 x i64> noundef %a, <2 x i64> noundef %b, <2 x i64> noundef %c, <2 x i64> noundef %src) {
entry:
  ; aValid: (a > 0) per i32 lane, sign-extended to a full-lane mask (%1).
  %0 = bitcast <2 x i64> %a to <4 x i32>
  %cmp.i = icmp sgt <4 x i32> %0, zeroinitializer
  %sext.i = sext <4 x i1> %cmp.i to <4 x i32>
  %1 = bitcast <4 x i32> %sext.i to <2 x i64>
  ; bValid: (b > 0) per i32 lane, sign-extended to a full-lane mask (%3).
  %2 = bitcast <2 x i64> %b to <4 x i32>
  %cmp.i21 = icmp sgt <4 x i32> %2, zeroinitializer
  %sext.i22 = sext <4 x i1> %cmp.i21 to <4 x i32>
  %3 = bitcast <4 x i32> %sext.i22 to <2 x i64>
  ; cValid: (c > 0) per i32 lane, sign-extended to a full-lane mask (%5).
  %4 = bitcast <2 x i64> %c to <4 x i32>
  %cmp.i23 = icmp sgt <4 x i32> %4, zeroinitializer
  %sext.i24 = sext <4 x i1> %cmp.i23 to <4 x i32>
  %5 = bitcast <4 x i32> %sext.i24 to <2 x i64>
  ; bothValid = bValid & aValid
  %and.i = and <2 x i64> %3, %1
  ; allValid = bothValid ^ cValid
  %xor.i = xor <2 x i64> %and.i, %5
  ; forceA = allValid & aValid
  %and.i25 = and <2 x i64> %xor.i, %1
  ; forceB = allValid & bValid
  %and.i26 = and <2 x i64> %xor.i, %3
  ; out = bothValid & src
  %and.i27 = and <2 x i64> %and.i, %src
  ; First blend: operands are (out, a, forceA), all reinterpreted as bytes.
  ; This is the intrinsic call the issue expects to become a select.
  %6 = bitcast <2 x i64> %and.i27 to <16 x i8>
  %7 = bitcast <2 x i64> %a to <16 x i8>
  %8 = bitcast <2 x i64> %and.i25 to <16 x i8>
  %9 = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %6, <16 x i8> %7, <16 x i8> %8)
  ; Second blend: operands are (previous result, b, forceB).
  %10 = bitcast <2 x i64> %b to <16 x i8>
  %11 = bitcast <2 x i64> %and.i26 to <16 x i8>
  %12 = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %9, <16 x i8> %10, <16 x i8> %11)
  %13 = bitcast <16 x i8> %12 to <2 x i64>
  ret <2 x i64> %13
}
; Target-specific byte blend intrinsic used by both calls above.
declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>)

I don't know if it's the endless bitcasts to/from <2 x i64> due to the __m128i type, or if something else is going on.

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions