@@ -365,35 +365,34 @@ define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias
365365; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
366366; SI-NEXT: v_mov_b32_e32 v1, 0
367367; SI-NEXT: s_waitcnt lgkmcnt(0)
368- ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64
369- ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:1
370- ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64 offset:2
371- ; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:3
372- ; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:4
373- ; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:5
374- ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:6
368+ ; SI-NEXT: buffer_load_ubyte v2, v[0:1], s[0:3], 0 addr64 offset:5
369+ ; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 offset:6
370+ ; SI-NEXT: buffer_load_ubyte v4, v[0:1], s[0:3], 0 addr64
371+ ; SI-NEXT: buffer_load_ubyte v5, v[0:1], s[0:3], 0 addr64 offset:1
372+ ; SI-NEXT: buffer_load_ubyte v6, v[0:1], s[0:3], 0 addr64 offset:2
373+ ; SI-NEXT: buffer_load_ubyte v7, v[0:1], s[0:3], 0 addr64 offset:3
374+ ; SI-NEXT: buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64 offset:4
375375; SI-NEXT: s_mov_b32 s6, -1
376- ; SI-NEXT: s_waitcnt vmcnt(5)
377- ; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v3
378- ; SI-NEXT: v_or_b32_e32 v1, v1, v2
376+ ; SI-NEXT: s_waitcnt vmcnt(6)
377+ ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2
379378; SI-NEXT: s_waitcnt vmcnt(3)
380- ; SI-NEXT: v_lshlrev_b32_e32 v2 , 8, v5
381- ; SI-NEXT: v_or_b32_e32 v2, v2 , v4
379+ ; SI-NEXT: v_lshlrev_b32_e32 v1 , 8, v5
380+ ; SI-NEXT: v_or_b32_e32 v1, v1 , v4
382381; SI-NEXT: s_waitcnt vmcnt(1)
383- ; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v7
382+ ; SI-NEXT: v_lshlrev_b32_e32 v5, 8, v7
383+ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v7, v3
384+ ; SI-NEXT: v_or_b32_e32 v3, v5, v6
384385; SI-NEXT: s_waitcnt vmcnt(0)
385- ; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
386- ; SI-NEXT: v_or_b32_e32 v3, v3, v6
387- ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:24
388- ; SI-NEXT: s_waitcnt expcnt(0)
389- ; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2
390- ; SI-NEXT: v_or_b32_e32 v0, v0, v1
391- ; SI-NEXT: v_cvt_f32_ubyte1_e32 v5, v3
392- ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v3
386+ ; SI-NEXT: v_or_b32_e32 v0, v2, v0
387+ ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v3
388+ ; SI-NEXT: v_cvt_f32_ubyte1_e32 v5, v0
389+ ; SI-NEXT: v_cvt_f32_ubyte0_e32 v4, v0
390+ ; SI-NEXT: v_or_b32_e32 v0, v2, v1
393391; SI-NEXT: v_cvt_f32_ubyte3_e32 v3, v0
394392; SI-NEXT: v_cvt_f32_ubyte2_e32 v2, v0
395393; SI-NEXT: v_cvt_f32_ubyte1_e32 v1, v0
396394; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
395+ ; SI-NEXT: buffer_store_dword v7, off, s[4:7], 0 offset:24
397396; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 offset:16
398397; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
399398; SI-NEXT: s_endpgm
0 commit comments