-
Notifications
You must be signed in to change notification settings - Fork 15.1k
Closed
Description
DAGCombiner::reduceLoadWidth handles cases where we are shifting + truncating wide loads, but only for constant shift amounts.
We should be able to do something similar for cases such as the one below, where we're extracting aligned i64 blocks from an i512 load:
define i64 @load512_extract64(ptr %word, i32 %idx) {
%ld = load i512, ptr %word, align 8
%rem = and i32 %idx, 511 ; idx in bounds
%rem2 = and i32 %rem, -64 ; idx aligned
%sh_prom = zext nneg i32 %rem2 to i512
%sub = lshr i512 %ld, %sh_prom
%res = trunc i512 %sub to i64
ret i64 %res
}
By the looks of the codegen, we don't manage this prior to legalisation, which spills the i512 to the stack; we then manage at least some cleanup, but we still end up loading from the stack copy:
load512_extract64: # @load512_extract64
pushq %rax
vmovups (%rdi), %ymm0
vmovups 32(%rdi), %ymm1
vxorps %xmm2, %xmm2, %xmm2
vmovups %ymm2, -32(%rsp)
vmovups %ymm2, -64(%rsp)
vmovups %ymm1, -96(%rsp)
vmovups %ymm0, -128(%rsp)
shrl $3, %esi
andl $56, %esi
movq -128(%rsp,%rsi), %rax
popq %rcx
vzeroupper
retq
Instead, it should be possible to do this:
load512_extract64: # @load512_extract64
shrl $3, %esi
andl $56, %esi
movq (%rdi,%rsi), %rax
retq