@@ -761,17 +761,12 @@ define <4 x i32> @vector_div_leading_zeros(<4 x i32> %x) {
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [613566757,613566757,613566757,613566757]
-; SSE2-NEXT: movdqa %xmm0, %xmm2
+; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE2-NEXT: pmuludq %xmm1, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; SSE2-NEXT: pmuludq %xmm1, %xmm2
-; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,3,2,3]
-; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
-; SSE2-NEXT: pmuludq %xmm1, %xmm3
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,3,2,3]
-; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT: psubd %xmm2, %xmm0
-; SSE2-NEXT: psrld $1, %xmm0
-; SSE2-NEXT: paddd %xmm2, %xmm0
-; SSE2-NEXT: psrld $2, %xmm0
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,3,2,3]
+; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT: retq
 ;
 ; SSE41-LABEL: vector_div_leading_zeros:
@@ -780,13 +775,9 @@ define <4 x i32> @vector_div_leading_zeros(<4 x i32> %x) {
 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
 ; SSE41-NEXT: pmuludq %xmm2, %xmm1
-; SSE41-NEXT: pmuludq %xmm0, %xmm2
-; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; SSE41-NEXT: psubd %xmm2, %xmm0
-; SSE41-NEXT: psrld $1, %xmm0
-; SSE41-NEXT: paddd %xmm2, %xmm0
-; SSE41-NEXT: psrld $2, %xmm0
+; SSE41-NEXT: pmuludq %xmm2, %xmm0
+; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
 ; SSE41-NEXT: retq
 ;
 ; AVX1-LABEL: vector_div_leading_zeros:
@@ -795,13 +786,9 @@ define <4 x i32> @vector_div_leading_zeros(<4 x i32> %x) {
 ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
 ; AVX1-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpsrld $2, %xmm0, %xmm0
+; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: vector_div_leading_zeros:
@@ -810,13 +797,9 @@ define <4 x i32> @vector_div_leading_zeros(<4 x i32> %x) {
 ; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
 ; AVX2-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX2-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $1, %xmm0, %xmm0
-; AVX2-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpsrld $2, %xmm0, %xmm0
+; AVX2-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; AVX2-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
 ; AVX2-NEXT: retq
 ;
 ; XOP-LABEL: vector_div_leading_zeros:
@@ -825,13 +808,9 @@ define <4 x i32> @vector_div_leading_zeros(<4 x i32> %x) {
 ; XOP-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
 ; XOP-NEXT: vbroadcastss {{.*#+}} xmm2 = [613566757,613566757,613566757,613566757]
 ; XOP-NEXT: vpmuludq %xmm2, %xmm1, %xmm1
-; XOP-NEXT: vpmuludq %xmm2, %xmm0, %xmm2
-; XOP-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; XOP-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
-; XOP-NEXT: vpsubd %xmm1, %xmm0, %xmm0
-; XOP-NEXT: vpsrld $1, %xmm0, %xmm0
-; XOP-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; XOP-NEXT: vpsrld $2, %xmm0, %xmm0
+; XOP-NEXT: vpmuludq %xmm2, %xmm0, %xmm0
+; XOP-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; XOP-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
 ; XOP-NEXT: retq
   %a = and <4 x i32> %x, <i32 255, i32 255, i32 255, i32 255>
   %b = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
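
The instructions removed in every hunk (psubd / psrld $1 / paddd / psrld $2 and their AVX forms) are the rounding fixup that magic-number unsigned division needs when the dividend may use all 32 bits. After the `and` with 255 the dividend has at least 24 leading zeros, so a single multiply-high by 613566757 (the magic constant for /7, equal to ceil(2^32 / 7)) is already exact and the fixup folds away. A minimal standalone C sketch checking that arithmetic over the masked input range (illustrative only, not part of the commit):

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    /* 613566757 == ceil(2^32 / 7), the magic multiplier in the checks above. */
    const uint64_t magic = 613566757u;

    /* With at least 24 leading zero bits (x <= 255 after the 'and'), the
       high 32 bits of x * magic equal x / 7 exactly, so no subtract/shift/
       add fixup is needed after the multiply-high. */
    for (uint32_t x = 0; x <= 255; ++x) {
        uint32_t hi = (uint32_t)((x * magic) >> 32);
        assert(hi == x / 7);
    }
    printf("multiply-high by 613566757 matches udiv by 7 for all masked inputs\n");
    return 0;
}
```

For full-range dividends the fixup sequence must stay, since the rounded-up magic constant only guarantees an exact quotient once the error term 3x/2^32 cannot push the product past the next multiple of 7; the known leading zeros are what make the short form legal here.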