@@ -692,148 +692,148 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
692692;
693693; GFX9-LABEL: sdivrem_v2i32:
694694; GFX9: ; %bb.0:
695- ; GFX9-NEXT: s_load_dwordx8 s[8:15 ], s[4:5], 0x0
695+ ; GFX9-NEXT: s_load_dwordx8 s[0:7 ], s[4:5], 0x0
696696; GFX9-NEXT: s_waitcnt lgkmcnt(0)
697- ; GFX9-NEXT: s_ashr_i32 s0, s14 , 31
698- ; GFX9-NEXT: s_add_i32 s1, s14, s0
699- ; GFX9-NEXT: s_xor_b32 s1, s1, s0
700- ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s1
701- ; GFX9-NEXT: s_ashr_i32 s2, s15 , 31
702- ; GFX9-NEXT: s_add_i32 s3, s15, s2
703- ; GFX9-NEXT: s_xor_b32 s3, s3, s2
697+ ; GFX9-NEXT: s_ashr_i32 s8, s6 , 31
698+ ; GFX9-NEXT: s_add_i32 s6, s6, s8
699+ ; GFX9-NEXT: s_xor_b32 s6, s6, s8
700+ ; GFX9-NEXT: v_cvt_f32_u32_e32 v0, s6
701+ ; GFX9-NEXT: s_ashr_i32 s9, s7 , 31
702+ ; GFX9-NEXT: s_add_i32 s7, s7, s9
703+ ; GFX9-NEXT: s_xor_b32 s7, s7, s9
704704; GFX9-NEXT: v_rcp_iflag_f32_e32 v0, v0
705- ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s3
706- ; GFX9-NEXT: s_sub_i32 s6 , 0, s1
707- ; GFX9-NEXT: s_ashr_i32 s4, s12 , 31
705+ ; GFX9-NEXT: v_cvt_f32_u32_e32 v1, s7
706+ ; GFX9-NEXT: s_sub_i32 s12 , 0, s6
707+ ; GFX9-NEXT: s_ashr_i32 s10, s4 , 31
708708; GFX9-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
709709; GFX9-NEXT: v_cvt_u32_f32_e32 v0, v0
710710; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v1
711- ; GFX9-NEXT: s_sub_i32 s7, 0, s3
712- ; GFX9-NEXT: s_ashr_i32 s5, s13, 31
713- ; GFX9-NEXT: v_mul_lo_u32 v2, s6 , v0
711+ ; GFX9-NEXT: s_add_i32 s4, s4, s10
712+ ; GFX9-NEXT: s_xor_b32 s4, s4, s10
713+ ; GFX9-NEXT: v_mul_lo_u32 v2, s12 , v0
714714; GFX9-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
715715; GFX9-NEXT: v_cvt_u32_f32_e32 v1, v1
716- ; GFX9-NEXT: s_add_i32 s6, s12, s4
716+ ; GFX9-NEXT: s_sub_i32 s12, 0, s7
717717; GFX9-NEXT: v_mul_hi_u32 v2, v0, v2
718- ; GFX9-NEXT: s_xor_b32 s6, s6, s4
719- ; GFX9-NEXT: v_mul_lo_u32 v3, s7 , v1
720- ; GFX9-NEXT: s_add_i32 s7, s13, s5
718+ ; GFX9-NEXT: s_ashr_i32 s11, s5, 31
719+ ; GFX9-NEXT: v_mul_lo_u32 v3, s12 , v1
720+ ; GFX9-NEXT: s_add_i32 s5, s5, s11
721721; GFX9-NEXT: v_add_u32_e32 v0, v0, v2
722- ; GFX9-NEXT: v_mul_hi_u32 v0, s6 , v0
722+ ; GFX9-NEXT: v_mul_hi_u32 v0, s4 , v0
723723; GFX9-NEXT: v_mul_hi_u32 v2, v1, v3
724- ; GFX9-NEXT: s_xor_b32 s7, s7, s5
725- ; GFX9-NEXT: s_xor_b32 s0, s4, s0
726- ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s1
724+ ; GFX9-NEXT: s_xor_b32 s5, s5, s11
725+ ; GFX9-NEXT: v_mul_lo_u32 v3, v0, s6
727726; GFX9-NEXT: v_add_u32_e32 v1, v1, v2
728727; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
729- ; GFX9-NEXT: v_mul_hi_u32 v1, s7 , v1
730- ; GFX9-NEXT: v_sub_u32_e32 v3, s6 , v3
731- ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1 , v3
728+ ; GFX9-NEXT: v_mul_hi_u32 v1, s5 , v1
729+ ; GFX9-NEXT: v_sub_u32_e32 v3, s4 , v3
730+ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6 , v3
732731; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
733- ; GFX9-NEXT: v_subrev_u32_e32 v2, s1 , v3
732+ ; GFX9-NEXT: v_subrev_u32_e32 v2, s6 , v3
734733; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
735734; GFX9-NEXT: v_add_u32_e32 v3, 1, v0
736- ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s1 , v2
735+ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s6 , v2
737736; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
738- ; GFX9-NEXT: v_subrev_u32_e32 v3, s1 , v2
737+ ; GFX9-NEXT: v_subrev_u32_e32 v3, s6 , v2
739738; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
740- ; GFX9-NEXT: v_mul_lo_u32 v3, v1, s3
739+ ; GFX9-NEXT: v_mul_lo_u32 v3, v1, s7
741740; GFX9-NEXT: v_add_u32_e32 v4, 1, v1
742- ; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0
743- ; GFX9-NEXT: v_subrev_u32_e32 v0, s0 , v0
744- ; GFX9-NEXT: v_sub_u32_e32 v3, s7 , v3
745- ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3 , v3
741+ ; GFX9-NEXT: s_xor_b32 s4, s10, s8
742+ ; GFX9-NEXT: v_xor_b32_e32 v0, s4 , v0
743+ ; GFX9-NEXT: v_sub_u32_e32 v3, s5 , v3
744+ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7 , v3
746745; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
747- ; GFX9-NEXT: v_subrev_u32_e32 v4, s3 , v3
746+ ; GFX9-NEXT: v_subrev_u32_e32 v4, s7 , v3
748747; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
749748; GFX9-NEXT: v_add_u32_e32 v4, 1, v1
750- ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s3, v3
749+ ; GFX9-NEXT: v_cmp_le_u32_e32 vcc, s7, v3
750+ ; GFX9-NEXT: v_subrev_u32_e32 v0, s4, v0
751751; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc
752- ; GFX9-NEXT: v_subrev_u32_e32 v4, s3 , v3
753- ; GFX9-NEXT: s_xor_b32 s0, s5, s2
752+ ; GFX9-NEXT: v_subrev_u32_e32 v4, s7 , v3
753+ ; GFX9-NEXT: s_xor_b32 s4, s11, s9
754754; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc
755- ; GFX9-NEXT: v_xor_b32_e32 v1, s0 , v1
756- ; GFX9-NEXT: v_xor_b32_e32 v2, s4 , v2
757- ; GFX9-NEXT: v_subrev_u32_e32 v1, s0 , v1
758- ; GFX9-NEXT: v_xor_b32_e32 v3, s5 , v3
755+ ; GFX9-NEXT: v_xor_b32_e32 v1, s4 , v1
756+ ; GFX9-NEXT: v_xor_b32_e32 v2, s10 , v2
757+ ; GFX9-NEXT: v_subrev_u32_e32 v1, s4 , v1
758+ ; GFX9-NEXT: v_xor_b32_e32 v3, s11 , v3
759759; GFX9-NEXT: v_mov_b32_e32 v4, 0
760- ; GFX9-NEXT: v_subrev_u32_e32 v2, s4 , v2
761- ; GFX9-NEXT: v_subrev_u32_e32 v3, s5 , v3
762- ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9 ]
763- ; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11 ]
760+ ; GFX9-NEXT: v_subrev_u32_e32 v2, s10 , v2
761+ ; GFX9-NEXT: v_subrev_u32_e32 v3, s11 , v3
762+ ; GFX9-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1 ]
763+ ; GFX9-NEXT: global_store_dwordx2 v4, v[2:3], s[2:3 ]
764764; GFX9-NEXT: s_endpgm
765765;
766766; GFX10-LABEL: sdivrem_v2i32:
767767; GFX10: ; %bb.0:
768- ; GFX10-NEXT: s_load_dwordx8 s[8:15 ], s[4:5], 0x0
768+ ; GFX10-NEXT: s_load_dwordx8 s[4:11 ], s[4:5], 0x0
769769; GFX10-NEXT: s_waitcnt lgkmcnt(0)
770- ; GFX10-NEXT: s_ashr_i32 s1, s14 , 31
771- ; GFX10-NEXT: s_ashr_i32 s2, s15 , 31
772- ; GFX10-NEXT: s_add_i32 s0, s14 , s1
773- ; GFX10-NEXT: s_add_i32 s3, s15 , s2
774- ; GFX10-NEXT: s_xor_b32 s4 , s0, s1
770+ ; GFX10-NEXT: s_ashr_i32 s1, s10 , 31
771+ ; GFX10-NEXT: s_ashr_i32 s2, s11 , 31
772+ ; GFX10-NEXT: s_add_i32 s0, s10 , s1
773+ ; GFX10-NEXT: s_add_i32 s3, s11 , s2
774+ ; GFX10-NEXT: s_xor_b32 s10 , s0, s1
775775; GFX10-NEXT: s_xor_b32 s3, s3, s2
776- ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s4
776+ ; GFX10-NEXT: v_cvt_f32_u32_e32 v0, s10
777777; GFX10-NEXT: v_cvt_f32_u32_e32 v1, s3
778- ; GFX10-NEXT: s_sub_i32 s0, 0, s4
779- ; GFX10-NEXT: s_sub_i32 s5 , 0, s3
780- ; GFX10-NEXT: s_ashr_i32 s6, s13 , 31
778+ ; GFX10-NEXT: s_sub_i32 s0, 0, s10
779+ ; GFX10-NEXT: s_sub_i32 s11 , 0, s3
780+ ; GFX10-NEXT: s_ashr_i32 s12, s9 , 31
781781; GFX10-NEXT: v_rcp_iflag_f32_e32 v0, v0
782782; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v1
783- ; GFX10-NEXT: s_add_i32 s7, s13, s6
784- ; GFX10-NEXT: s_xor_b32 s7, s7, s6
785783; GFX10-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
786784; GFX10-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
787785; GFX10-NEXT: v_cvt_u32_f32_e32 v0, v0
788786; GFX10-NEXT: v_cvt_u32_f32_e32 v1, v1
789787; GFX10-NEXT: v_mul_lo_u32 v2, s0, v0
790- ; GFX10-NEXT: v_mul_lo_u32 v3, s5, v1
791- ; GFX10-NEXT: s_ashr_i32 s5, s12, 31
792- ; GFX10-NEXT: s_add_i32 s0, s12, s5
793- ; GFX10-NEXT: s_xor_b32 s1, s5, s1
794- ; GFX10-NEXT: s_xor_b32 s0, s0, s5
788+ ; GFX10-NEXT: v_mul_lo_u32 v3, s11, v1
789+ ; GFX10-NEXT: s_ashr_i32 s11, s8, 31
790+ ; GFX10-NEXT: s_add_i32 s0, s8, s11
791+ ; GFX10-NEXT: s_add_i32 s8, s9, s12
792+ ; GFX10-NEXT: s_xor_b32 s0, s0, s11
793+ ; GFX10-NEXT: s_xor_b32 s8, s8, s12
795794; GFX10-NEXT: v_mul_hi_u32 v2, v0, v2
796795; GFX10-NEXT: v_mul_hi_u32 v3, v1, v3
796+ ; GFX10-NEXT: s_xor_b32 s1, s11, s1
797797; GFX10-NEXT: v_add_nc_u32_e32 v0, v0, v2
798798; GFX10-NEXT: v_add_nc_u32_e32 v1, v1, v3
799799; GFX10-NEXT: v_mul_hi_u32 v0, s0, v0
800- ; GFX10-NEXT: v_mul_hi_u32 v1, s7 , v1
801- ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s4
800+ ; GFX10-NEXT: v_mul_hi_u32 v1, s8 , v1
801+ ; GFX10-NEXT: v_mul_lo_u32 v2, v0, s10
802802; GFX10-NEXT: v_mul_lo_u32 v3, v1, s3
803803; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
804804; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
805805; GFX10-NEXT: v_sub_nc_u32_e32 v2, s0, v2
806- ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s7 , v3
807- ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s4 , v2
806+ ; GFX10-NEXT: v_sub_nc_u32_e32 v3, s8 , v3
807+ ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10 , v2
808808; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3
809- ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4 , v2
809+ ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10 , v2
810810; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3
811811; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
812812; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
813813; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
814814; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
815815; GFX10-NEXT: v_add_nc_u32_e32 v5, 1, v1
816816; GFX10-NEXT: v_add_nc_u32_e32 v4, 1, v0
817- ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s4 , v2
817+ ; GFX10-NEXT: v_cmp_le_u32_e64 s0, s10 , v2
818818; GFX10-NEXT: v_cmp_le_u32_e32 vcc_lo, s3, v3
819- ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s4 , v2
819+ ; GFX10-NEXT: v_subrev_nc_u32_e32 v6, s10 , v2
820820; GFX10-NEXT: v_subrev_nc_u32_e32 v7, s3, v3
821821; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0
822822; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo
823823; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0
824824; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc_lo
825- ; GFX10-NEXT: s_xor_b32 s0, s6 , s2
825+ ; GFX10-NEXT: s_xor_b32 s0, s12 , s2
826826; GFX10-NEXT: v_xor_b32_e32 v0, s1, v0
827827; GFX10-NEXT: v_xor_b32_e32 v1, s0, v1
828- ; GFX10-NEXT: v_xor_b32_e32 v2, s5 , v2
829- ; GFX10-NEXT: v_xor_b32_e32 v3, s6 , v3
828+ ; GFX10-NEXT: v_xor_b32_e32 v2, s11 , v2
829+ ; GFX10-NEXT: v_xor_b32_e32 v3, s12 , v3
830830; GFX10-NEXT: v_mov_b32_e32 v4, 0
831831; GFX10-NEXT: v_subrev_nc_u32_e32 v0, s1, v0
832832; GFX10-NEXT: v_subrev_nc_u32_e32 v1, s0, v1
833- ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s5 , v2
834- ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s6 , v3
835- ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[8:9 ]
836- ; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[10:11 ]
833+ ; GFX10-NEXT: v_subrev_nc_u32_e32 v2, s11 , v2
834+ ; GFX10-NEXT: v_subrev_nc_u32_e32 v3, s12 , v3
835+ ; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5 ]
836+ ; GFX10-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7 ]
837837; GFX10-NEXT: s_endpgm
838838 %div = sdiv <2 x i32 > %x , %y
839839 store <2 x i32 > %div , ptr addrspace (1 ) %out0
0 commit comments