@@ -606,7 +606,7 @@ define void @srem_v16i32(ptr %a, ptr %b) #0 {
606606;
607607; VBITS_GE_256-LABEL: srem_v16i32:
608608; VBITS_GE_256: // %bb.0:
609- ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
609+ ; VBITS_GE_256-NEXT: mov x8, #8
610610; VBITS_GE_256-NEXT: ptrue p0.s, vl8
611611; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
612612; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -680,13 +680,13 @@ define void @srem_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
680680define <1 x i64 > @srem_v1i64 (<1 x i64 > %op1 , <1 x i64 > %op2 ) vscale_range(1 ,0 ) #0 {
681681; CHECK-LABEL: srem_v1i64:
682682; CHECK: // %bb.0:
683- ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
684- ; CHECK-NEXT: ptrue p0.d, vl1
685683; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
684+ ; CHECK-NEXT: ptrue p0.d, vl1
685+ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
686686; CHECK-NEXT: movprfx z2, z0
687687; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
688- ; CHECK-NEXT: mls z0 .d, p0/m, z2 .d, z1 .d
689- ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
688+ ; CHECK-NEXT: mul z1 .d, p0/m, z1 .d, z2 .d
689+ ; CHECK-NEXT: sub d0, d0, d1
690690; CHECK-NEXT: ret
691691 %res = srem <1 x i64 > %op1 , %op2
692692 ret <1 x i64 > %res
@@ -697,13 +697,13 @@ define <1 x i64> @srem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #
697697define <2 x i64 > @srem_v2i64 (<2 x i64 > %op1 , <2 x i64 > %op2 ) vscale_range(1 ,0 ) #0 {
698698; CHECK-LABEL: srem_v2i64:
699699; CHECK: // %bb.0:
700- ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
701- ; CHECK-NEXT: ptrue p0.d, vl2
702700; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
701+ ; CHECK-NEXT: ptrue p0.d, vl2
702+ ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
703703; CHECK-NEXT: movprfx z2, z0
704704; CHECK-NEXT: sdiv z2.d, p0/m, z2.d, z1.d
705- ; CHECK-NEXT: mls z0 .d, p0/m, z2 .d, z1 .d
706- ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
705+ ; CHECK-NEXT: mul z1 .d, p0/m, z1 .d, z2 .d
706+ ; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d
707707; CHECK-NEXT: ret
708708 %res = srem <2 x i64 > %op1 , %op2
709709 ret <2 x i64 > %res
@@ -730,32 +730,34 @@ define void @srem_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
730730define void @srem_v8i64 (ptr %a , ptr %b ) #0 {
731731; VBITS_GE_128-LABEL: srem_v8i64:
732732; VBITS_GE_128: // %bb.0:
733- ; VBITS_GE_128-NEXT: ldp q0, q1 , [x0, #32 ]
733+ ; VBITS_GE_128-NEXT: ldp q4, q5 , [x1 ]
734734; VBITS_GE_128-NEXT: ptrue p0.d, vl2
735- ; VBITS_GE_128-NEXT: ldp q2, q3, [x1, #32]
735+ ; VBITS_GE_128-NEXT: ldp q7, q6, [x1, #32]
736+ ; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32]
737+ ; VBITS_GE_128-NEXT: ldp q2, q3, [x0]
738+ ; VBITS_GE_128-NEXT: movprfx z16, z3
739+ ; VBITS_GE_128-NEXT: sdiv z16.d, p0/m, z16.d, z5.d
740+ ; VBITS_GE_128-NEXT: movprfx z17, z2
741+ ; VBITS_GE_128-NEXT: sdiv z17.d, p0/m, z17.d, z4.d
742+ ; VBITS_GE_128-NEXT: mul z5.d, p0/m, z5.d, z16.d
736743; VBITS_GE_128-NEXT: movprfx z16, z1
737- ; VBITS_GE_128-NEXT: sdiv z16.d, p0/m, z16.d, z3.d
738- ; VBITS_GE_128-NEXT: mls z1.d, p0/m, z16.d, z3.d
739- ; VBITS_GE_128-NEXT: movprfx z3, z0
740- ; VBITS_GE_128-NEXT: sdiv z3.d, p0/m, z3.d, z2.d
741- ; VBITS_GE_128-NEXT: mls z0.d, p0/m, z3.d, z2.d
742- ; VBITS_GE_128-NEXT: ldp q4, q5, [x0]
743- ; VBITS_GE_128-NEXT: ldp q7, q6, [x1]
744- ; VBITS_GE_128-NEXT: movprfx z16, z5
745744; VBITS_GE_128-NEXT: sdiv z16.d, p0/m, z16.d, z6.d
746- ; VBITS_GE_128-NEXT: movprfx z2, z4
747- ; VBITS_GE_128-NEXT: sdiv z2.d, p0/m, z2.d, z7.d
745+ ; VBITS_GE_128-NEXT: mul z4.d, p0/m, z4.d, z17.d
746+ ; VBITS_GE_128-NEXT: movprfx z17, z0
747+ ; VBITS_GE_128-NEXT: sdiv z17.d, p0/m, z17.d, z7.d
748+ ; VBITS_GE_128-NEXT: mul z6.d, p0/m, z6.d, z16.d
749+ ; VBITS_GE_128-NEXT: mul z7.d, p0/m, z7.d, z17.d
750+ ; VBITS_GE_128-NEXT: sub v0.2d, v0.2d, v7.2d
751+ ; VBITS_GE_128-NEXT: sub v1.2d, v1.2d, v6.2d
752+ ; VBITS_GE_128-NEXT: sub v2.2d, v2.2d, v4.2d
748753; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
749- ; VBITS_GE_128-NEXT: movprfx z0, z4
750- ; VBITS_GE_128-NEXT: mls z0.d, p0/m, z2.d, z7.d
751- ; VBITS_GE_128-NEXT: movprfx z1, z5
752- ; VBITS_GE_128-NEXT: mls z1.d, p0/m, z16.d, z6.d
753- ; VBITS_GE_128-NEXT: stp q0, q1, [x0]
754+ ; VBITS_GE_128-NEXT: sub v0.2d, v3.2d, v5.2d
755+ ; VBITS_GE_128-NEXT: stp q2, q0, [x0]
754756; VBITS_GE_128-NEXT: ret
755757;
756758; VBITS_GE_256-LABEL: srem_v8i64:
757759; VBITS_GE_256: // %bb.0:
758- ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
760+ ; VBITS_GE_256-NEXT: mov x8, #4
759761; VBITS_GE_256-NEXT: ptrue p0.d, vl4
760762; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
761763; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
@@ -1424,7 +1426,7 @@ define void @urem_v16i32(ptr %a, ptr %b) #0 {
14241426;
14251427; VBITS_GE_256-LABEL: urem_v16i32:
14261428; VBITS_GE_256: // %bb.0:
1427- ; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
1429+ ; VBITS_GE_256-NEXT: mov x8, #8
14281430; VBITS_GE_256-NEXT: ptrue p0.s, vl8
14291431; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
14301432; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
@@ -1498,13 +1500,13 @@ define void @urem_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
14981500define <1 x i64 > @urem_v1i64 (<1 x i64 > %op1 , <1 x i64 > %op2 ) vscale_range(1 ,0 ) #0 {
14991501; CHECK-LABEL: urem_v1i64:
15001502; CHECK: // %bb.0:
1501- ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
1502- ; CHECK-NEXT: ptrue p0.d, vl1
15031503; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
1504+ ; CHECK-NEXT: ptrue p0.d, vl1
1505+ ; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
15041506; CHECK-NEXT: movprfx z2, z0
15051507; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d
1506- ; CHECK-NEXT: mls z0 .d, p0/m, z2 .d, z1 .d
1507- ; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
1508+ ; CHECK-NEXT: mul z1 .d, p0/m, z1 .d, z2 .d
1509+ ; CHECK-NEXT: sub d0, d0, d1
15081510; CHECK-NEXT: ret
15091511 %res = urem <1 x i64 > %op1 , %op2
15101512 ret <1 x i64 > %res
@@ -1515,13 +1517,13 @@ define <1 x i64> @urem_v1i64(<1 x i64> %op1, <1 x i64> %op2) vscale_range(1,0) #
15151517define <2 x i64 > @urem_v2i64 (<2 x i64 > %op1 , <2 x i64 > %op2 ) vscale_range(1 ,0 ) #0 {
15161518; CHECK-LABEL: urem_v2i64:
15171519; CHECK: // %bb.0:
1518- ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
1519- ; CHECK-NEXT: ptrue p0.d, vl2
15201520; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
1521+ ; CHECK-NEXT: ptrue p0.d, vl2
1522+ ; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
15211523; CHECK-NEXT: movprfx z2, z0
15221524; CHECK-NEXT: udiv z2.d, p0/m, z2.d, z1.d
1523- ; CHECK-NEXT: mls z0 .d, p0/m, z2 .d, z1 .d
1524- ; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
1525+ ; CHECK-NEXT: mul z1 .d, p0/m, z1 .d, z2 .d
1526+ ; CHECK-NEXT: sub v0.2d, v0.2d, v1.2d
15251527; CHECK-NEXT: ret
15261528 %res = urem <2 x i64 > %op1 , %op2
15271529 ret <2 x i64 > %res
@@ -1548,32 +1550,34 @@ define void @urem_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
15481550define void @urem_v8i64 (ptr %a , ptr %b ) #0 {
15491551; VBITS_GE_128-LABEL: urem_v8i64:
15501552; VBITS_GE_128: // %bb.0:
1551- ; VBITS_GE_128-NEXT: ldp q0, q1 , [x0, #32 ]
1553+ ; VBITS_GE_128-NEXT: ldp q4, q5 , [x1 ]
15521554; VBITS_GE_128-NEXT: ptrue p0.d, vl2
1553- ; VBITS_GE_128-NEXT: ldp q2, q3, [x1, #32]
1555+ ; VBITS_GE_128-NEXT: ldp q7, q6, [x1, #32]
1556+ ; VBITS_GE_128-NEXT: ldp q0, q1, [x0, #32]
1557+ ; VBITS_GE_128-NEXT: ldp q2, q3, [x0]
1558+ ; VBITS_GE_128-NEXT: movprfx z16, z3
1559+ ; VBITS_GE_128-NEXT: udiv z16.d, p0/m, z16.d, z5.d
1560+ ; VBITS_GE_128-NEXT: movprfx z17, z2
1561+ ; VBITS_GE_128-NEXT: udiv z17.d, p0/m, z17.d, z4.d
1562+ ; VBITS_GE_128-NEXT: mul z5.d, p0/m, z5.d, z16.d
15541563; VBITS_GE_128-NEXT: movprfx z16, z1
1555- ; VBITS_GE_128-NEXT: udiv z16.d, p0/m, z16.d, z3.d
1556- ; VBITS_GE_128-NEXT: mls z1.d, p0/m, z16.d, z3.d
1557- ; VBITS_GE_128-NEXT: movprfx z3, z0
1558- ; VBITS_GE_128-NEXT: udiv z3.d, p0/m, z3.d, z2.d
1559- ; VBITS_GE_128-NEXT: mls z0.d, p0/m, z3.d, z2.d
1560- ; VBITS_GE_128-NEXT: ldp q4, q5, [x0]
1561- ; VBITS_GE_128-NEXT: ldp q7, q6, [x1]
1562- ; VBITS_GE_128-NEXT: movprfx z16, z5
15631564; VBITS_GE_128-NEXT: udiv z16.d, p0/m, z16.d, z6.d
1564- ; VBITS_GE_128-NEXT: movprfx z2, z4
1565- ; VBITS_GE_128-NEXT: udiv z2.d, p0/m, z2.d, z7.d
1565+ ; VBITS_GE_128-NEXT: mul z4.d, p0/m, z4.d, z17.d
1566+ ; VBITS_GE_128-NEXT: movprfx z17, z0
1567+ ; VBITS_GE_128-NEXT: udiv z17.d, p0/m, z17.d, z7.d
1568+ ; VBITS_GE_128-NEXT: mul z6.d, p0/m, z6.d, z16.d
1569+ ; VBITS_GE_128-NEXT: mul z7.d, p0/m, z7.d, z17.d
1570+ ; VBITS_GE_128-NEXT: sub v0.2d, v0.2d, v7.2d
1571+ ; VBITS_GE_128-NEXT: sub v1.2d, v1.2d, v6.2d
1572+ ; VBITS_GE_128-NEXT: sub v2.2d, v2.2d, v4.2d
15661573; VBITS_GE_128-NEXT: stp q0, q1, [x0, #32]
1567- ; VBITS_GE_128-NEXT: movprfx z0, z4
1568- ; VBITS_GE_128-NEXT: mls z0.d, p0/m, z2.d, z7.d
1569- ; VBITS_GE_128-NEXT: movprfx z1, z5
1570- ; VBITS_GE_128-NEXT: mls z1.d, p0/m, z16.d, z6.d
1571- ; VBITS_GE_128-NEXT: stp q0, q1, [x0]
1574+ ; VBITS_GE_128-NEXT: sub v0.2d, v3.2d, v5.2d
1575+ ; VBITS_GE_128-NEXT: stp q2, q0, [x0]
15721576; VBITS_GE_128-NEXT: ret
15731577;
15741578; VBITS_GE_256-LABEL: urem_v8i64:
15751579; VBITS_GE_256: // %bb.0:
1576- ; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
1580+ ; VBITS_GE_256-NEXT: mov x8, #4
15771581; VBITS_GE_256-NEXT: ptrue p0.d, vl4
15781582; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
15791583; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
0 commit comments