@@ -437,21 +437,99 @@ define void @test_srem_pow2_v2i32(<2 x i32>* %x, <2 x i32>* %y) nounwind {
437437 ret void
438438}
439439
; test_udiv_v2i32: loads one <2 x i32> from %x and one from %y, divides them
; lane-wise with unsigned `udiv`, and stores the <2 x i32> quotient to %z.
; The 64-bit expectations zero-extend each lane (punpckldq against a
; pxor-zeroed register), issue one divq per lane, and keep the quotient from
; %rax. The 32-bit expectations route each lane through a __udivdi3 call and
; pick the result up from %eax.
; NOTE(review): the expectation lines look tool-generated (presumably by
; utils/update_llc_test_checks.py) -- confirm against the file header and
; regenerate them rather than editing by hand.
440+ define void @test_udiv_v2i32 (<2 x i32 >* %x , <2 x i32 >* %y , <2 x i32 >* %z ) nounwind {
441+ ; X64-LABEL: test_udiv_v2i32:
442+ ; X64: # %bb.0:
443+ ; X64-NEXT: movq %rdx, %rcx
444+ ; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
445+ ; X64-NEXT: pxor %xmm1, %xmm1
446+ ; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
447+ ; X64-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
448+ ; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
449+ ; X64-NEXT: movq %xmm0, %rax
450+ ; X64-NEXT: movq %xmm2, %rsi
451+ ; X64-NEXT: xorl %edx, %edx
452+ ; X64-NEXT: divq %rsi
453+ ; X64-NEXT: movq %rax, %xmm1
454+ ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
455+ ; X64-NEXT: movq %xmm0, %rax
456+ ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
457+ ; X64-NEXT: movq %xmm0, %rsi
458+ ; X64-NEXT: xorl %edx, %edx
459+ ; X64-NEXT: divq %rsi
460+ ; X64-NEXT: movq %rax, %xmm0
461+ ; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
462+ ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
463+ ; X64-NEXT: movq %xmm0, (%rcx)
464+ ; X64-NEXT: retq
465+ ;
466+ ; X86-LABEL: test_udiv_v2i32:
467+ ; X86: # %bb.0:
468+ ; X86-NEXT: pushl %esi
469+ ; X86-NEXT: subl $56, %esp
470+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
471+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
472+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
473+ ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
474+ ; X86-NEXT: pxor %xmm1, %xmm1
475+ ; X86-NEXT: movdqa %xmm0, %xmm2
476+ ; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
477+ ; X86-NEXT: movdqu %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
478+ ; X86-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
479+ ; X86-NEXT: movdqa %xmm2, %xmm3
480+ ; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
481+ ; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
482+ ; X86-NEXT: movd %xmm0, (%esp)
483+ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
484+ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
485+ ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
486+ ; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
487+ ; X86-NEXT: calll __udivdi3
488+ ; X86-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
489+ ; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
490+ ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
491+ ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
492+ ; X86-NEXT: movd %xmm0, (%esp)
493+ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
494+ ; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
495+ ; X86-NEXT: movd %eax, %xmm0
496+ ; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
497+ ; X86-NEXT: calll __udivdi3
498+ ; X86-NEXT: movd %eax, %xmm0
499+ ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
500+ ; X86-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
501+ ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
502+ ; X86-NEXT: movq %xmm0, (%esi)
503+ ; X86-NEXT: addl $56, %esp
504+ ; X86-NEXT: popl %esi
505+ ; X86-NEXT: retl
506+ %a = load <2 x i32 >, <2 x i32 >* %x
507+ %b = load <2 x i32 >, <2 x i32 >* %y
508+ %c = udiv <2 x i32 > %a , %b
509+ store <2 x i32 > %c , <2 x i32 >* %z
510+ ret void
511+ }
512+
440513define void @test_urem_v2i32 (<2 x i32 >* %x , <2 x i32 >* %y , <2 x i32 >* %z ) nounwind {
441514; X64-LABEL: test_urem_v2i32:
442515; X64: # %bb.0:
443516; X64-NEXT: movq %rdx, %rcx
444517; X64-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
445518; X64-NEXT: pxor %xmm1, %xmm1
446519; X64-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
520+ ; X64-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
521+ ; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
447522; X64-NEXT: movq %xmm0, %rax
523+ ; X64-NEXT: movq %xmm2, %rsi
448524; X64-NEXT: xorl %edx, %edx
449- ; X64-NEXT: divq %rax
525+ ; X64-NEXT: divq %rsi
450526; X64-NEXT: movq %rdx, %xmm1
451527; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
452528; X64-NEXT: movq %xmm0, %rax
529+ ; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
530+ ; X64-NEXT: movq %xmm0, %rsi
453531; X64-NEXT: xorl %edx, %edx
454- ; X64-NEXT: divq %rax
532+ ; X64-NEXT: divq %rsi
455533; X64-NEXT: movq %rdx, %xmm0
456534; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
457535; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
@@ -461,20 +539,30 @@ define void @test_urem_v2i32(<2 x i32>* %x, <2 x i32>* %y, <2 x i32>* %z) nounwi
461539; X86-LABEL: test_urem_v2i32:
462540; X86: # %bb.0:
463541; X86-NEXT: pushl %esi
464- ; X86-NEXT: subl $40 , %esp
542+ ; X86-NEXT: subl $56 , %esp
465543; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
466544; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
467- ; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
468- ; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
469- ; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
470- ; X86-NEXT: movss %xmm0, (%esp)
545+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
546+ ; X86-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
547+ ; X86-NEXT: pxor %xmm1, %xmm1
548+ ; X86-NEXT: movdqa %xmm0, %xmm2
549+ ; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
550+ ; X86-NEXT: movdqu %xmm2, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
551+ ; X86-NEXT: movq {{.*#+}} xmm2 = mem[0],zero
552+ ; X86-NEXT: movdqa %xmm2, %xmm3
553+ ; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
554+ ; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
555+ ; X86-NEXT: movd %xmm0, (%esp)
471556; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
472557; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
558+ ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
559+ ; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
473560; X86-NEXT: calll __umoddi3
474561; X86-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
475- ; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
476562; X86-NEXT: movss %xmm0, {{[0-9]+}}(%esp)
477- ; X86-NEXT: movss %xmm0, (%esp)
563+ ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
564+ ; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
565+ ; X86-NEXT: movd %xmm0, (%esp)
478566; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
479567; X86-NEXT: movl $0, {{[0-9]+}}(%esp)
480568; X86-NEXT: movd %eax, %xmm0
@@ -485,11 +573,11 @@ define void @test_urem_v2i32(<2 x i32>* %x, <2 x i32>* %y, <2 x i32>* %z) nounwi
485573; X86-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
486574; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
487575; X86-NEXT: movq %xmm0, (%esi)
488- ; X86-NEXT: addl $40 , %esp
576+ ; X86-NEXT: addl $56 , %esp
489577; X86-NEXT: popl %esi
490578; X86-NEXT: retl
491579 %a = load <2 x i32 >, <2 x i32 >* %x
492- %b = load <2 x i32 >, <2 x i32 >* %x
580+ %b = load <2 x i32 >, <2 x i32 >* %y
493581 %c = urem <2 x i32 > %a , %b
494582 store <2 x i32 > %c , <2 x i32 >* %z
495583 ret void
@@ -498,62 +586,72 @@ define void @test_urem_v2i32(<2 x i32>* %x, <2 x i32>* %y, <2 x i32>* %z) nounwi
; test_sdiv_v2i32: loads one <2 x i32> from %x and one from %y, divides them
; lane-wise with signed `sdiv`, and stores the <2 x i32> quotient to %z.
; The updated 64-bit expectations sign-extend all four lanes with movslq,
; issue one cqto/idivq pair per lane, and keep the quotient from %rax. The
; updated 32-bit expectations build each sign-extended operand with sarl $31
; and pass it to a __divdi3 call, picking the result up from %eax.
; NOTE(review): the expectation lines look tool-generated (presumably by
; utils/update_llc_test_checks.py) -- confirm against the file header and
; regenerate them rather than editing by hand.
498586define void @test_sdiv_v2i32 (<2 x i32 >* %x , <2 x i32 >* %y , <2 x i32 >* %z ) nounwind {
499587; X64-LABEL: test_sdiv_v2i32:
500588; X64: # %bb.0:
501- ; X64-NEXT: movq %rdx, %rcx
502- ; X64-NEXT: movslq (%rdi), %rsi
503- ; X64-NEXT: movslq 4(%rdi), %rdi
504- ; X64-NEXT: movq %rdi, %rax
589+ ; X64-NEXT: movq %rdx, %r8
590+ ; X64-NEXT: movslq (%rdi), %rcx
591+ ; X64-NEXT: movslq 4(%rdi), %rax
592+ ; X64-NEXT: movslq (%rsi), %rdi
593+ ; X64-NEXT: movslq 4(%rsi), %rsi
505594; X64-NEXT: cqto
506- ; X64-NEXT: idivq %rdi
595+ ; X64-NEXT: idivq %rsi
507596; X64-NEXT: movq %rax, %xmm0
508- ; X64-NEXT: movq %rsi , %rax
597+ ; X64-NEXT: movq %rcx , %rax
509598; X64-NEXT: cqto
510- ; X64-NEXT: idivq %rsi
599+ ; X64-NEXT: idivq %rdi
511600; X64-NEXT: movq %rax, %xmm1
512601; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
513602; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
514- ; X64-NEXT: movq %xmm0, (%rcx )
603+ ; X64-NEXT: movq %xmm0, (%r8 )
515604; X64-NEXT: retq
516605;
517606; X86-LABEL: test_sdiv_v2i32:
518607; X86: # %bb.0:
608+ ; X86-NEXT: pushl %ebp
519609; X86-NEXT: pushl %ebx
520610; X86-NEXT: pushl %edi
521611; X86-NEXT: pushl %esi
522- ; X86-NEXT: subl $16, %esp
523- ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
612+ ; X86-NEXT: subl $44, %esp
524613; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
525- ; X86-NEXT: movl (%eax), %edi
526- ; X86-NEXT: movl 4(%eax), %eax
527- ; X86-NEXT: movl %edi, %ebx
528- ; X86-NEXT: sarl $31, %ebx
529- ; X86-NEXT: movl %eax, %ecx
530- ; X86-NEXT: sarl $31, %ecx
531- ; X86-NEXT: pushl %ecx
614+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
615+ ; X86-NEXT: movl (%ecx), %edi
616+ ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
617+ ; X86-NEXT: movl 4(%ecx), %esi
618+ ; X86-NEXT: sarl $31, %edi
619+ ; X86-NEXT: movl %esi, %edx
620+ ; X86-NEXT: sarl $31, %edx
621+ ; X86-NEXT: movl (%eax), %ebx
622+ ; X86-NEXT: movl 4(%eax), %ecx
623+ ; X86-NEXT: movl %ebx, %ebp
624+ ; X86-NEXT: sarl $31, %ebp
625+ ; X86-NEXT: movl %ecx, %eax
626+ ; X86-NEXT: sarl $31, %eax
532627; X86-NEXT: pushl %eax
533628; X86-NEXT: pushl %ecx
534- ; X86-NEXT: pushl %eax
629+ ; X86-NEXT: pushl %edx
630+ ; X86-NEXT: pushl %esi
535631; X86-NEXT: calll __divdi3
536632; X86-NEXT: addl $16, %esp
537633; X86-NEXT: movd %eax, %xmm0
538- ; X86-NEXT: movdqu %xmm0, (%esp) # 16-byte Spill
539- ; X86-NEXT: pushl %ebx
540- ; X86-NEXT: pushl %edi
634+ ; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
635+ ; X86-NEXT: pushl %ebp
541636; X86-NEXT: pushl %ebx
542637; X86-NEXT: pushl %edi
638+ ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
543639; X86-NEXT: calll __divdi3
544640; X86-NEXT: addl $16, %esp
545641; X86-NEXT: movd %eax, %xmm0
546- ; X86-NEXT: movdqu (%esp ), %xmm1 # 16-byte Reload
642+ ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p ), %xmm1 # 16-byte Reload
547643; X86-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
548644; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
549- ; X86-NEXT: movq %xmm0, (%esi)
550- ; X86-NEXT: addl $16, %esp
645+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
646+ ; X86-NEXT: movq %xmm0, (%eax)
647+ ; X86-NEXT: addl $44, %esp
551648; X86-NEXT: popl %esi
552649; X86-NEXT: popl %edi
553650; X86-NEXT: popl %ebx
651+ ; X86-NEXT: popl %ebp
554652; X86-NEXT: retl
555653 %a = load <2 x i32 >, <2 x i32 >* %x
556- %b = load <2 x i32 >, <2 x i32 >* %x
654+ %b = load <2 x i32 >, <2 x i32 >* %y
557655 %c = sdiv <2 x i32 > %a , %b
558656 store <2 x i32 > %c , <2 x i32 >* %z
559657 ret void
@@ -562,62 +660,72 @@ define void @test_sdiv_v2i32(<2 x i32>* %x, <2 x i32>* %y, <2 x i32>* %z) nounwi
; test_srem_v2i32: loads one <2 x i32> from %x and one from %y, takes the
; lane-wise signed remainder with `srem`, and stores the <2 x i32> result
; to %z. The operation must be `srem` (not `sdiv`): with `sdiv` this function
; is an exact duplicate of test_sdiv_v2i32 and signed remainder is left
; untested.
; The 64-bit expectations sign-extend all four lanes with movslq, issue one
; cqto/idivq pair per lane, and keep the remainder from %rdx. The 32-bit
; expectations build each sign-extended operand with sarl $31 and pass it to
; a __moddi3 call, picking the result up from %eax.
; NOTE(review): the remainder-specific expectation lines (the %rdx moves and
; the __moddi3 calls) were adjusted by hand to follow the sdiv -> srem fix;
; rerun utils/update_llc_test_checks.py to confirm the exact register
; allocation before committing.
562660define void @test_srem_v2i32 (<2 x i32 >* %x , <2 x i32 >* %y , <2 x i32 >* %z ) nounwind {
563661; X64-LABEL: test_srem_v2i32:
564662; X64: # %bb.0:
565- ; X64-NEXT: movq %rdx, %rcx
566- ; X64-NEXT: movslq (%rdi), %rsi
567- ; X64-NEXT: movslq 4(%rdi), %rdi
568- ; X64-NEXT: movq %rdi, %rax
663+ ; X64-NEXT: movq %rdx, %r8
664+ ; X64-NEXT: movslq (%rdi), %rcx
665+ ; X64-NEXT: movslq 4(%rdi), %rax
666+ ; X64-NEXT: movslq (%rsi), %rdi
667+ ; X64-NEXT: movslq 4(%rsi), %rsi
569668; X64-NEXT: cqto
570- ; X64-NEXT: idivq %rdi
669+ ; X64-NEXT: idivq %rsi
571670; X64-NEXT: movq %rdx, %xmm0
572- ; X64-NEXT: movq %rsi , %rax
671+ ; X64-NEXT: movq %rcx , %rax
573672; X64-NEXT: cqto
574- ; X64-NEXT: idivq %rsi
674+ ; X64-NEXT: idivq %rdi
576675; X64-NEXT: movq %rdx, %xmm1
577676; X64-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
578677; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
579- ; X64-NEXT: movq %xmm0, (%rcx )
678+ ; X64-NEXT: movq %xmm0, (%r8 )
580679; X64-NEXT: retq
581680;
582681; X86-LABEL: test_srem_v2i32:
583682; X86: # %bb.0:
683+ ; X86-NEXT: pushl %ebp
584684; X86-NEXT: pushl %ebx
585685; X86-NEXT: pushl %edi
586686; X86-NEXT: pushl %esi
587- ; X86-NEXT: subl $16, %esp
588- ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
687+ ; X86-NEXT: subl $44, %esp
589688; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
590- ; X86-NEXT: movl (%eax), %edi
591- ; X86-NEXT: movl 4(%eax), %eax
592- ; X86-NEXT: movl %edi, %ebx
593- ; X86-NEXT: sarl $31, %ebx
594- ; X86-NEXT: movl %eax, %ecx
595- ; X86-NEXT: sarl $31, %ecx
596- ; X86-NEXT: pushl %ecx
689+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
690+ ; X86-NEXT: movl (%ecx), %edi
691+ ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
692+ ; X86-NEXT: movl 4(%ecx), %esi
693+ ; X86-NEXT: sarl $31, %edi
694+ ; X86-NEXT: movl %esi, %edx
695+ ; X86-NEXT: sarl $31, %edx
696+ ; X86-NEXT: movl (%eax), %ebx
697+ ; X86-NEXT: movl 4(%eax), %ecx
698+ ; X86-NEXT: movl %ebx, %ebp
699+ ; X86-NEXT: sarl $31, %ebp
700+ ; X86-NEXT: movl %ecx, %eax
701+ ; X86-NEXT: sarl $31, %eax
597702; X86-NEXT: pushl %eax
598703; X86-NEXT: pushl %ecx
599- ; X86-NEXT: pushl %eax
704+ ; X86-NEXT: pushl %edx
705+ ; X86-NEXT: pushl %esi
600706; X86-NEXT: calll __moddi3
601707; X86-NEXT: addl $16, %esp
602708; X86-NEXT: movd %eax, %xmm0
603- ; X86-NEXT: movdqu %xmm0, (%esp) # 16-byte Spill
604- ; X86-NEXT: pushl %ebx
605- ; X86-NEXT: pushl %edi
709+ ; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
710+ ; X86-NEXT: pushl %ebp
606711; X86-NEXT: pushl %ebx
607712; X86-NEXT: pushl %edi
713+ ; X86-NEXT: pushl {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
608714; X86-NEXT: calll __moddi3
609715; X86-NEXT: addl $16, %esp
610716; X86-NEXT: movd %eax, %xmm0
611- ; X86-NEXT: movdqu (%esp ), %xmm1 # 16-byte Reload
717+ ; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p ), %xmm1 # 16-byte Reload
612718; X86-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
613719; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
614- ; X86-NEXT: movq %xmm0, (%esi)
615- ; X86-NEXT: addl $16, %esp
720+ ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
721+ ; X86-NEXT: movq %xmm0, (%eax)
722+ ; X86-NEXT: addl $44, %esp
616723; X86-NEXT: popl %esi
617724; X86-NEXT: popl %edi
618725; X86-NEXT: popl %ebx
726+ ; X86-NEXT: popl %ebp
619727; X86-NEXT: retl
620728 %a = load <2 x i32 >, <2 x i32 >* %x
621- %b = load <2 x i32 >, <2 x i32 >* %x
729+ %b = load <2 x i32 >, <2 x i32 >* %y
622730 %c = srem <2 x i32 > %a , %b
623731 store <2 x i32 > %c , <2 x i32 >* %z
624732 ret void
0 commit comments