@@ -3968,6 +3968,238 @@ class StubGenerator: public StubCodeGenerator {
39683968 return start;
39693969 }
39703970
3971+ // Arguments:
3972+ //
3973+ // Input:
3974+ // c_rarg0 - newArr address
3975+ // c_rarg1 - oldArr address
3976+ // c_rarg2 - newIdx
3977+ // c_rarg3 - shiftCount
3978+ // c_rarg4 - numIter
3979+ //
// Emits the bigIntegerRightShiftWorker stub and returns its entry address.
//
// As emitted below, for 32-bit elements and i running from numIter - 1 down
// to 0, the stub stores
//   newArr[newIdx + i] = (oldArr[i + 1] >> shiftCount)          (logical)
//                      | (oldArr[i]     << (32 - shiftCount))
// so elements 0..numIter of oldArr are read. Counts of 4 or more run a
// 4-lane SIMD loop followed by a 2-lane loop and a scalar step; smaller
// counts take a scalar-only path. A zero numIter returns immediately.
//
// Registers written besides the arguments: rscratch1, rscratch2, r10-r14
// and v0-v4.
//
// NOTE(review): presumably the backing stub for the BigInteger
// shiftRightImplWorker intrinsic -- confirm against the intrinsic wiring.
// NOTE(review): the scalar path uses lslvw/lsrvw (count taken modulo 32)
// while the vector path uses ushl, so the two only agree for shiftCount in
// 1..31 -- presumably guaranteed by the caller; confirm.
3980+ address generate_bigIntegerRightShift () {
3981+ __ align (CodeEntryAlignment);
3982+ StubCodeMark mark (this , " StubRoutines" , " bigIntegerRightShiftWorker" );
3983+ address start = __ pc ();
3984+
3985+ Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
3986+
3987+ Register newArr = c_rarg0;
3988+ Register oldArr = c_rarg1;
3989+ Register newIdx = c_rarg2;
3990+ Register shiftCount = c_rarg3;
3991+ Register numIter = c_rarg4;
// idx is an alias of numIter: the count register doubles as the running
// element index and is decremented in place by the loops below.
3992+ Register idx = numIter;
3993+
3994+ Register newArrCur = rscratch1;
3995+ Register shiftRevCount = rscratch2;
3996+ Register oldArrCur = r13;
3997+ Register oldArrNext = r14;
3998+
3999+ FloatRegister oldElem0 = v0;
4000+ FloatRegister oldElem1 = v1;
4001+ FloatRegister newElem = v2;
4002+ FloatRegister shiftVCount = v3;
4003+ FloatRegister shiftVRevCount = v4;
4004+
// Nothing to do for a zero element count.
4005+ __ cbz (idx, Exit);
4006+
// Rebase newArr to element newIdx (4-byte elements); all stores below are
// relative to this adjusted base.
4007+ __ add (newArr, newArr, newIdx, Assembler::LSL, 2 );
4008+
4009+ // left shift count
// shiftRevCount = 32 - shiftCount
4010+ __ movw (shiftRevCount, 32 );
4011+ __ subw (shiftRevCount, shiftRevCount, shiftCount);
4012+
4013+ // numIter too small to allow a 4-words SIMD loop, rolling back
4014+ __ cmp (numIter, (u1)4 );
4015+ __ br (Assembler::LT, ShiftThree);
4016+
// Broadcast both counts to all four 32-bit lanes. ushl shifts left by a
// signed per-lane amount, so the right-shift count is negated.
4017+ __ dup (shiftVCount, __ T4S, shiftCount);
4018+ __ dup (shiftVRevCount, __ T4S, shiftRevCount);
4019+ __ negr (shiftVCount, __ T4S, shiftVCount);
4020+
4021+ __ BIND (ShiftSIMDLoop);
4022+
4023+ // Calculate the load addresses
// The loop walks from the highest index downwards: step idx back by 4, then
// oldArrNext = &oldArr[idx], oldArrCur = &oldArr[idx + 1],
// newArrCur = &newArr[idx]. oldArr and newArr themselves stay unchanged.
4024+ __ sub (idx, idx, 4 );
4025+ __ add (oldArrNext, oldArr, idx, Assembler::LSL, 2 );
4026+ __ add (newArrCur, newArr, idx, Assembler::LSL, 2 );
4027+ __ add (oldArrCur, oldArrNext, 4 );
4028+
4029+ // Load 4 words and process
// oldElem0 = oldArr[idx+1 .. idx+4] shifted right, oldElem1 =
// oldArr[idx .. idx+3] shifted left by the complementary count; their OR
// is stored to newArr[idx .. idx+3].
4030+ __ ld1 (oldElem0, __ T4S, Address (oldArrCur));
4031+ __ ld1 (oldElem1, __ T4S, Address (oldArrNext));
4032+ __ ushl (oldElem0, __ T4S, oldElem0, shiftVCount);
4033+ __ ushl (oldElem1, __ T4S, oldElem1, shiftVRevCount);
4034+ __ orr (newElem, __ T16B, oldElem0, oldElem1);
4035+ __ st1 (newElem, __ T4S, Address (newArrCur));
4036+
// Keep looping while at least 4 elements remain; otherwise finish the
// remaining 0..3 elements in ShiftTwoLoop.
4037+ __ cmp (idx, (u1)4 );
4038+ __ br (Assembler::LT, ShiftTwoLoop);
4039+ __ b (ShiftSIMDLoop);
4040+
// Tail after the SIMD loop: done when idx is 0, a single leftover element
// goes to ShiftOne, otherwise two elements are handled with 2-lane vectors
// (reusing the shift-count vectors set up before the SIMD loop).
4041+ __ BIND (ShiftTwoLoop);
4042+ __ cbz (idx, Exit);
4043+ __ cmp (idx, (u1)1 );
4044+ __ br (Assembler::EQ, ShiftOne);
4045+
4046+ // Calculate the load addresses
4047+ __ sub (idx, idx, 2 );
4048+ __ add (oldArrNext, oldArr, idx, Assembler::LSL, 2 );
4049+ __ add (newArrCur, newArr, idx, Assembler::LSL, 2 );
4050+ __ add (oldArrCur, oldArrNext, 4 );
4051+
4052+ // Load 2 words and process
4053+ __ ld1 (oldElem0, __ T2S, Address (oldArrCur));
4054+ __ ld1 (oldElem1, __ T2S, Address (oldArrNext));
4055+ __ ushl (oldElem0, __ T2S, oldElem0, shiftVCount);
4056+ __ ushl (oldElem1, __ T2S, oldElem1, shiftVRevCount);
4057+ __ orr (newElem, __ T8B, oldElem0, oldElem1);
4058+ __ st1 (newElem, __ T2S, Address (newArrCur));
4059+ __ b (ShiftTwoLoop);
4060+
// Scalar path, entered directly when numIter is below 4 (and non-zero).
// Bit 1 clear means a single element (go to ShiftOne); bit 0 clear means
// two elements (go to ShiftTwo); otherwise element 2 is produced here and
// control falls through ShiftTwo and ShiftOne for elements 1 and 0.
// r10, r11 and r12 are used as temporaries.
4061+ __ BIND (ShiftThree);
4062+ __ tbz (idx, 1 , ShiftOne);
4063+ __ tbz (idx, 0 , ShiftTwo);
4064+ __ ldrw (r10, Address (oldArr, 12 ));
4065+ __ ldrw (r11, Address (oldArr, 8 ));
4066+ __ lsrvw (r10, r10, shiftCount);
4067+ __ lslvw (r11, r11, shiftRevCount);
4068+ __ orrw (r12, r10, r11);
4069+ __ strw (r12, Address (newArr, 8 ));
4070+
// newArr[1] from oldArr[2] and oldArr[1]; falls through to ShiftOne.
4071+ __ BIND (ShiftTwo);
4072+ __ ldrw (r10, Address (oldArr, 8 ));
4073+ __ ldrw (r11, Address (oldArr, 4 ));
4074+ __ lsrvw (r10, r10, shiftCount);
4075+ __ lslvw (r11, r11, shiftRevCount);
4076+ __ orrw (r12, r10, r11);
4077+ __ strw (r12, Address (newArr, 4 ));
4078+
// newArr[0] from oldArr[1] and oldArr[0]. Also the target for the single
// leftover element after the vector loops, which works because those loops
// never modify oldArr or the rebased newArr.
4079+ __ BIND (ShiftOne);
4080+ __ ldrw (r10, Address (oldArr, 4 ));
4081+ __ ldrw (r11, Address (oldArr));
4082+ __ lsrvw (r10, r10, shiftCount);
4083+ __ lslvw (r11, r11, shiftRevCount);
4084+ __ orrw (r12, r10, r11);
4085+ __ strw (r12, Address (newArr));
4086+
4087+ __ BIND (Exit);
4088+ __ ret (lr);
4089+
// Entry address of the code emitted above.
4090+ return start;
4091+ }
4092+
4093+ // Arguments:
4094+ //
4095+ // Input:
4096+ // c_rarg0 - newArr address
4097+ // c_rarg1 - oldArr address
4098+ // c_rarg2 - newIdx
4099+ // c_rarg3 - shiftCount
4100+ // c_rarg4 - numIter
4101+ //
// Emits the bigIntegerLeftShiftWorker stub and returns its entry address.
//
// As emitted below, for 32-bit elements and i running from 0 up to
// numIter - 1, the stub stores
//   newArr[newIdx + i] = (oldArr[i]     << shiftCount)
//                      | (oldArr[i + 1] >> (32 - shiftCount))   (logical)
// so elements 0..numIter of oldArr are read. Counts of 4 or more run a
// 4-lane SIMD loop followed by a 2-lane loop and a scalar step; smaller
// counts take a scalar-only path. A zero numIter returns immediately.
//
// oldArr, newArr and numIter are modified in place (post-increment
// addressing and count-down). Other registers written: rscratch1,
// rscratch2, r10-r12 and v0-v4.
//
// NOTE(review): presumably the backing stub for the BigInteger
// shiftLeftImplWorker intrinsic -- confirm against the intrinsic wiring.
// NOTE(review): the scalar path uses lslvw/lsrvw (count taken modulo 32)
// while the vector path uses ushl, so the two only agree for shiftCount in
// 1..31 -- presumably guaranteed by the caller; confirm.
4102+ address generate_bigIntegerLeftShift () {
4103+ __ align (CodeEntryAlignment);
4104+ StubCodeMark mark (this , " StubRoutines" , " bigIntegerLeftShiftWorker" );
4105+ address start = __ pc ();
4106+
4107+ Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit;
4108+
4109+ Register newArr = c_rarg0;
4110+ Register oldArr = c_rarg1;
4111+ Register newIdx = c_rarg2;
4112+ Register shiftCount = c_rarg3;
4113+ Register numIter = c_rarg4;
4114+
4115+ Register shiftRevCount = rscratch1;
4116+ Register oldArrNext = rscratch2;
4117+
4118+ FloatRegister oldElem0 = v0;
4119+ FloatRegister oldElem1 = v1;
4120+ FloatRegister newElem = v2;
4121+ FloatRegister shiftVCount = v3;
4122+ FloatRegister shiftVRevCount = v4;
4123+
// Nothing to do for a zero element count.
4124+ __ cbz (numIter, Exit);
4125+
// oldArrNext trails oldArr by one element (&oldArr[1]); newArr is advanced
// to element newIdx (4-byte elements).
4126+ __ add (oldArrNext, oldArr, 4 );
4127+ __ add (newArr, newArr, newIdx, Assembler::LSL, 2 );
4128+
4129+ // right shift count
// shiftRevCount = 32 - shiftCount
4130+ __ movw (shiftRevCount, 32 );
4131+ __ subw (shiftRevCount, shiftRevCount, shiftCount);
4132+
4133+ // numIter too small to allow a 4-words SIMD loop, rolling back
4134+ __ cmp (numIter, (u1)4 );
4135+ __ br (Assembler::LT, ShiftThree);
4136+
// Broadcast both counts to all four 32-bit lanes. ushl shifts left by a
// signed per-lane amount, so the right-shift count is negated.
4137+ __ dup (shiftVCount, __ T4S, shiftCount);
4138+ __ dup (shiftVRevCount, __ T4S, shiftRevCount);
4139+ __ negr (shiftVRevCount, __ T4S, shiftVRevCount);
4140+
4141+ __ BIND (ShiftSIMDLoop);
4142+
4143+ // load 4 words and process
// oldElem0 = four words at oldArr shifted left, oldElem1 = the four words
// one element further on shifted right by the complementary count; their OR
// goes to newArr. All three pointers advance by 16 bytes per iteration.
4144+ __ ld1 (oldElem0, __ T4S, __ post (oldArr, 16 ));
4145+ __ ld1 (oldElem1, __ T4S, __ post (oldArrNext, 16 ));
4146+ __ ushl (oldElem0, __ T4S, oldElem0, shiftVCount);
4147+ __ ushl (oldElem1, __ T4S, oldElem1, shiftVRevCount);
4148+ __ orr (newElem, __ T16B, oldElem0, oldElem1);
4149+ __ st1 (newElem, __ T4S, __ post (newArr, 16 ));
4150+ __ sub (numIter, numIter, 4 );
4151+
// Keep looping while at least 4 elements remain; otherwise finish the
// remaining 0..3 elements in ShiftTwoLoop.
4152+ __ cmp (numIter, (u1)4 );
4153+ __ br (Assembler::LT, ShiftTwoLoop);
4154+ __ b (ShiftSIMDLoop);
4155+
// Tail after the SIMD loop: done when numIter is 0, a single leftover
// element goes to ShiftOne, otherwise two elements are handled with 2-lane
// vectors (reusing the shift-count vectors set up before the SIMD loop).
4156+ __ BIND (ShiftTwoLoop);
4157+ __ cbz (numIter, Exit);
4158+ __ cmp (numIter, (u1)1 );
4159+ __ br (Assembler::EQ, ShiftOne);
4160+
4161+ // load 2 words and process
4162+ __ ld1 (oldElem0, __ T2S, __ post (oldArr, 8 ));
4163+ __ ld1 (oldElem1, __ T2S, __ post (oldArrNext, 8 ));
4164+ __ ushl (oldElem0, __ T2S, oldElem0, shiftVCount);
4165+ __ ushl (oldElem1, __ T2S, oldElem1, shiftVRevCount);
4166+ __ orr (newElem, __ T8B, oldElem0, oldElem1);
4167+ __ st1 (newElem, __ T2S, __ post (newArr, 8 ));
4168+ __ sub (numIter, numIter, 2 );
4169+ __ b (ShiftTwoLoop);
4170+
// Scalar path, entered directly when numIter is below 4 (and non-zero).
// The first element is always processed here. numIter is not decremented
// on this path, so the bit tests that follow look at the original count:
// bit 1 clear means it was 1 (done); bit 0 clear means it was 2 (one more
// element, ShiftOne); otherwise it was 3 and control falls through ShiftTwo
// and ShiftOne. r10, r11 and r12 are used as temporaries.
4171+ __ BIND (ShiftThree);
4172+ __ ldrw (r10, __ post (oldArr, 4 ));
4173+ __ ldrw (r11, __ post (oldArrNext, 4 ));
4174+ __ lslvw (r10, r10, shiftCount);
4175+ __ lsrvw (r11, r11, shiftRevCount);
4176+ __ orrw (r12, r10, r11);
4177+ __ strw (r12, __ post (newArr, 4 ));
4178+ __ tbz (numIter, 1 , Exit);
4179+ __ tbz (numIter, 0 , ShiftOne);
4180+
// Second-to-last element; pointers advance so ShiftOne sees the last one.
4181+ __ BIND (ShiftTwo);
4182+ __ ldrw (r10, __ post (oldArr, 4 ));
4183+ __ ldrw (r11, __ post (oldArrNext, 4 ));
4184+ __ lslvw (r10, r10, shiftCount);
4185+ __ lsrvw (r11, r11, shiftRevCount);
4186+ __ orrw (r12, r10, r11);
4187+ __ strw (r12, __ post (newArr, 4 ));
4188+
// Last element, at the current pointer positions (no post-increment
// needed). Also the target for the single leftover element after the
// vector loops.
4189+ __ BIND (ShiftOne);
4190+ __ ldrw (r10, Address (oldArr));
4191+ __ ldrw (r11, Address (oldArrNext));
4192+ __ lslvw (r10, r10, shiftCount);
4193+ __ lsrvw (r11, r11, shiftRevCount);
4194+ __ orrw (r12, r10, r11);
4195+ __ strw (r12, Address (newArr));
4196+
4197+ __ BIND (Exit);
4198+ __ ret (lr);
4199+
// Entry address of the code emitted above.
4200+ return start;
4201+ }
4202+
39714203 void ghash_multiply (FloatRegister result_lo, FloatRegister result_hi,
39724204 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
39734205 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
@@ -6224,6 +6456,11 @@ class StubGenerator: public StubCodeGenerator {
62246456 StubRoutines::_mulAdd = generate_mulAdd ();
62256457 }
62266458
6459+ if (UseSIMDForBigIntegerShiftIntrinsics) {
6460+ StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift ();
6461+ StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift ();
6462+ }
6463+
62276464 if (UseMontgomeryMultiplyIntrinsic) {
62286465 StubCodeMark mark (this , " StubRoutines" , " montgomeryMultiply" );
62296466 MontgomeryMultiplyGenerator g (_masm, /* squaring*/ false );
0 commit comments