
Bitcasting to u64 to multiply by 1 should be optimized out earlier #107404

@Validark

Description

In my real code, I have some logic that compiles down to this (Godbolt link):

fn uzp2(x: @Vector(16, u8)) @Vector(16, u8) {
    return @shuffle(u8, x, undefined, [_]i32{ 1, 3, 5, 7, 9, 11, 13, 15, 1, 3, 5, 7, 9, 11, 13, 15 });
}

fn neg(x: anytype) @TypeOf(x) {
    return @as(@TypeOf(x), @splat(1)) +% ~x;
}

export fn foo(x: @Vector(16, u8)) u8 {
    const a = @as([2]u64, @bitCast(uzp2(neg(x))))[0] *% 1;
    return @as([8]u8, @bitCast(a))[7];
}

Compiled for the Apple M3, we get:

foo:
        neg     v0.16b, v0.16b
        dup     v0.16b, v0.b[15]
        fmov    x8, d0
        lsr     x0, x8, #56
        ret

We can help the compiler like so:

export fn bar(x: @Vector(16, u8)) u8 {
    const a = @as([2][8]u8, @bitCast(uzp2(neg(x))))[0];
    return @as([8]u8, @bitCast(a))[7];
}

bar:
        umov    w8, v0.b[15]
        neg     w0, w8
        ret

This same issue is present on x86-64 (znver4):

foo:
        vpshufd xmm0, xmm0, 238
        vpxor   xmm1, xmm1, xmm1
        vpsubb  xmm0, xmm1, xmm0
        vmovq   rax, xmm0
        shr     rax, 56
        ret

bar:
        vpextrb eax, xmm0, 15
        neg     al
        ret

Pre-optimization LLVM IR (Godbolt link), generated via zig build-obj ./src/llvm_code.zig -O ReleaseFast -target aarch64-linux -mcpu apple_latest --verbose-llvm-ir -fstrip >llvm_code.ll 2>&1:

; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define dso_local i8 @foo(<16 x i8> %0) #0 {
1:
  %2 = alloca [8 x i8], align 8
  %3 = alloca [16 x i8], align 8
  %4 = call fastcc <16 x i8> @llvm_code.neg__anon_1457(<16 x i8> %0)
  %5 = call fastcc <16 x i8> @llvm_code.uzp2(<16 x i8> %4)
  store <16 x i8> %5, ptr %3, align 8
  %6 = getelementptr inbounds [2 x i64], ptr %3, i64 0, i64 0
  %7 = load i64, ptr %6
  store i64 %7, ptr %2, align 8
  %8 = getelementptr inbounds [8 x i8], ptr %2, i64 0, i64 7
  %9 = load i8, ptr %8
  ret i8 %9
}

; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define internal fastcc <16 x i8> @llvm_code.neg__anon_1457(<16 x i8> %0) unnamed_addr #0 {
1:
  %2 = xor <16 x i8> %0, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %3 = add <16 x i8> <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>, %2
  ret <16 x i8> %3
}

; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define internal fastcc <16 x i8> @llvm_code.uzp2(<16 x i8> %0) unnamed_addr #0 {
1:
  %2 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  ret <16 x i8> %2
}

; Function Attrs: nounwind uwtable nosanitize_coverage skipprofile
define dso_local i8 @bar(<16 x i8> %0) #0 {
1:
  %2 = alloca [16 x i8], align 1
  %3 = call fastcc <16 x i8> @llvm_code.neg__anon_1457(<16 x i8> %0)
  %4 = call fastcc <16 x i8> @llvm_code.uzp2(<16 x i8> %3)
  store <16 x i8> %4, ptr %2, align 1
  %5 = getelementptr inbounds [2 x [8 x i8]], ptr %2, i64 0, i64 0
  %6 = getelementptr inbounds [8 x i8], ptr %5, i64 0, i64 7
  %7 = load i8, ptr %6
  ret i8 %7
}
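
For comparison, this is roughly the IR one would hope @foo reaches after the usual inlining/SROA/instcombine cleanups fold the alloca store/load round-trip into a direct lane extract: a single extractelement plus a scalar negation, matching bar's codegen. The name @foo.expected and the value numbering are hypothetical, not compiler output:

define dso_local i8 @foo.expected(<16 x i8> %0) {
1:
  %2 = extractelement <16 x i8> %0, i64 15
  %3 = sub i8 0, %2
  ret i8 %3
}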

Labels: llvm:instcombine (covers the InstCombine, InstSimplify and AggressiveInstCombine passes)
