-
Notifications
You must be signed in to change notification settings - Fork 15.2k
Closed
Labels
llvm:instcombine — Covers the InstCombine, InstSimplify and AggressiveInstCombine passes
Description
In my real code, I have some logic that compiles down to this (Godbolt link):
fn uzp2(x: @Vector(16, u8)) @Vector(16, u8) {
return @shuffle(u8, x, undefined, [_]i32{ 1, 3, 5, 7, 9, 11, 13, 15, 1, 3, 5, 7, 9, 11, 13, 15 });
}
fn neg(x: anytype) @TypeOf(x) {
return @as(@TypeOf(x), @splat(1)) +% ~x;
}
export fn foo(x: @Vector(16, u8)) u8 {
const a = @as([2]u64, @bitCast(uzp2(neg(x))))[0] *% 1;
return @as([8]u8, @bitCast(a))[7];
}
Compiled for the Apple M3, we get:
foo:
neg v0.16b, v0.16b
dup v0.16b, v0.b[15]
fmov x8, d0
lsr x0, x8, #56
ret
We can help the compiler like so:
export fn bar(x: @Vector(16, u8)) u8 {
const a = @as([2][8]u8, @bitCast(uzp2(neg(x))))[0];
return @as([8]u8, @bitCast(a))[7];
}
bar:
umov w8, v0.b[15]
neg w0, w8
ret
This same issue is present on x86-64 (znver4):
foo:
vpshufd xmm0, xmm0, 238
vpxor xmm1, xmm1, xmm1
vpsubb xmm0, xmm1, xmm0
vmovq rax, xmm0
shr rax, 56
ret
bar:
vpextrb eax, xmm0, 15
neg al
ret
Pre-optimized LLVM IR (Godbolt link) via `zig build-obj ./src/llvm_code.zig -O ReleaseFast -target aarch64-linux -mcpu apple_latest --verbose-llvm-ir -fstrip >llvm_code.ll 2>&1`
; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define dso_local i8 @foo(<16 x i8> %0) #0 {
1:
%2 = alloca [8 x i8], align 8
%3 = alloca [16 x i8], align 8
%4 = call fastcc <16 x i8> @llvm_code.neg__anon_1457(<16 x i8> %0)
%5 = call fastcc <16 x i8> @llvm_code.uzp2(<16 x i8> %4)
store <16 x i8> %5, ptr %3, align 8
%6 = getelementptr inbounds [2 x i64], ptr %3, i64 0, i64 0
%7 = load i64, ptr %6
store i64 %7, ptr %2, align 8
%8 = getelementptr inbounds [8 x i8], ptr %2, i64 0, i64 7
%9 = load i8, ptr %8
ret i8 %9
}
; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define internal fastcc <16 x i8> @llvm_code.neg__anon_1457(<16 x i8> %0) unnamed_addr #0 {
1:
%2 = xor <16 x i8> %0, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
%3 = add <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %2
ret <16 x i8> %3
}
; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define internal fastcc <16 x i8> @llvm_code.uzp2(<16 x i8> %0) unnamed_addr #0 {
1:
%2 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
ret <16 x i8> %2
}
; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define dso_local i8 @bar(<16 x i8> %0) #0 {
1:
%2 = alloca [16 x i8], align 1
%3 = call fastcc <16 x i8> @llvm_code.neg__anon_1457(<16 x i8> %0)
%4 = call fastcc <16 x i8> @llvm_code.uzp2(<16 x i8> %3)
store <16 x i8> %4, ptr %2, align 1
%5 = getelementptr inbounds [2 x [8 x i8]], ptr %2, i64 0, i64 0
%6 = getelementptr inbounds [8 x i8], ptr %5, i64 0, i64 7
%7 = load i8, ptr %6
ret i8 %7
}
Metadata
Metadata
Assignees
Labels
llvm:instcombine — Covers the InstCombine, InstSimplify and AggressiveInstCombine passes