
Bitcasting to u64 to multiply by 1 should be optimized out earlier #107404

@Validark

Description

In my real code, I have some logic that compiles down to this (Godbolt link):

fn uzp2(x: @Vector(16, u8)) @Vector(16, u8) {
    return @shuffle(u8, x, undefined, [_]i32{ 1, 3, 5, 7, 9, 11, 13, 15, 1, 3, 5, 7, 9, 11, 13, 15 });
}

fn neg(x: anytype) @TypeOf(x) {
    return @as(@TypeOf(x), @splat(1)) +% ~x;
}

export fn foo(x: @Vector(16, u8)) u8 {
    const a = @as([2]u64, @bitCast(uzp2(neg(x))))[0] *% 1;
    return @as([8]u8, @bitCast(a))[7];
}

Compiled for the Apple M3, we get:

foo:
        neg     v0.16b, v0.16b
        dup     v0.16b, v0.b[15]
        fmov    x8, d0
        lsr     x0, x8, #56
        ret

We can help the compiler like so:

export fn bar(x: @Vector(16, u8)) u8 {
    const a = @as([2][8]u8, @bitCast(uzp2(neg(x))))[0];
    return @as([8]u8, @bitCast(a))[7];
}

bar:
        umov    w8, v0.b[15]
        neg     w0, w8
        ret

This same issue is present on x86-64 (znver4):

foo:
        vpshufd xmm0, xmm0, 238
        vpxor   xmm1, xmm1, xmm1
        vpsubb  xmm0, xmm1, xmm0
        vmovq   rax, xmm0
        shr     rax, 56
        ret

bar:
        vpextrb eax, xmm0, 15
        neg     al
        ret

Pre-optimization LLVM IR (Godbolt link), generated via zig build-obj ./src/llvm_code.zig -O ReleaseFast -target aarch64-linux -mcpu apple_latest --verbose-llvm-ir -fstrip >llvm_code.ll 2>&1:

; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define dso_local i8 @foo(<16 x i8> %0) #0 {
1:
  %2 = alloca [8 x i8], align 8
  %3 = alloca [16 x i8], align 8
  %4 = call fastcc <16 x i8> @llvm_code.neg__anon_1457(<16 x i8> %0)
  %5 = call fastcc <16 x i8> @llvm_code.uzp2(<16 x i8> %4)
  store <16 x i8> %5, ptr %3, align 8
  %6 = getelementptr inbounds [2 x i64], ptr %3, i64 0, i64 0
  %7 = load i64, ptr %6
  store i64 %7, ptr %2, align 8
  %8 = getelementptr inbounds [8 x i8], ptr %2, i64 0, i64 7
  %9 = load i8, ptr %8
  ret i8 %9
}

; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define internal fastcc <16 x i8> @llvm_code.neg__anon_1457(<16 x i8> %0) unnamed_addr #0 {
1:
  %2 = xor <16 x i8> %0, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %3 = add <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %2
  ret <16 x i8> %3
}

; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define internal fastcc <16 x i8> @llvm_code.uzp2(<16 x i8> %0) unnamed_addr #0 {
1:
  %2 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <16 x i8> %2
}

; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define dso_local i8 @bar(<16 x i8> %0) #0 {
1:
  %2 = alloca [16 x i8], align 1
  %3 = call fastcc <16 x i8> @llvm_code.neg__anon_1457(<16 x i8> %0)
  %4 = call fastcc <16 x i8> @llvm_code.uzp2(<16 x i8> %3)
  store <16 x i8> %4, ptr %2, align 1
  %5 = getelementptr inbounds [2 x [8 x i8]], ptr %2, i64 0, i64 0
  %6 = getelementptr inbounds [8 x i8], ptr %5, i64 0, i64 7
  %7 = load i8, ptr %6
  ret i8 %7
}
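
For comparison, this is roughly the IR one would hope @foo reaches after the usual inlining/SROA/instcombine cleanups fold the alloca store/load round-trip into a direct lane extract: a single extractelement plus a scalar negation, matching bar's codegen. The name @foo.expected and the value numbering are hypothetical, not compiler output:

define dso_local i8 @foo.expected(<16 x i8> %0) {
1:
  %2 = extractelement <16 x i8> %0, i64 15
  %3 = sub i8 0, %2
  ret i8 %3
}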

Labels: llvm:instcombine (covers the InstCombine, InstSimplify and AggressiveInstCombine passes)
