From 89fd28b7b29c9057432907869f0db77ffa105571 Mon Sep 17 00:00:00 2001 From: Dong Bo Date: Thu, 22 Oct 2020 16:24:33 +0800 Subject: [PATCH 01/10] aarch64: intrinsify BigInteger.shiftRightImplWorker and BigInteger.shiftLeftImplWorker with NEON instructions --- src/hotspot/cpu/aarch64/globals_aarch64.hpp | 2 + .../cpu/aarch64/stubGenerator_aarch64.cpp | 187 ++++++++++++++++++ 2 files changed, 189 insertions(+) diff --git a/src/hotspot/cpu/aarch64/globals_aarch64.hpp b/src/hotspot/cpu/aarch64/globals_aarch64.hpp index 294b6b13495d3..9ad1360fa919c 100644 --- a/src/hotspot/cpu/aarch64/globals_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/globals_aarch64.hpp @@ -93,6 +93,8 @@ define_pd_global(intx, InlineSmallCode, 1000); "Use SIMD instructions in generated array equals code") \ product(bool, UseSimpleArrayEquals, false, \ "Use simpliest and shortest implementation for array equals") \ + product(bool, UseSIMDForBigIntegerShiftIntrinsics, true, \ + "Use SIMD instructions for left/right shift of BigInteger") \ product(bool, AvoidUnalignedAccesses, false, \ "Avoid generating unaligned memory accesses") \ product(bool, UseLSE, false, \ diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp index a230e4e5bba76..291087617de9a 100644 --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -3968,6 +3968,188 @@ class StubGenerator: public StubCodeGenerator { return start; } + // Arguments: + // + // Input: + // c_rarg0 - newArr address + // c_rarg1 - oldArr address + // c_rarg2 - newIdx + // c_rarg3 - shiftCount + // c_rarg4 - numIter + // + address generate_bigIntegerRightShift() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker"); + address start = __ pc(); + + Label ShiftSIMDLoop, ShiftTwoLoop, ShiftOne, Exit; + + Register newArr = c_rarg0; + Register oldArr = c_rarg1; + Register newIdx = c_rarg2; + Register
shiftCount = c_rarg3; + Register numIter = c_rarg4; + Register idx = numIter; + Register oldArrCur = r5; + Register oldArrNext = r6; + Register newArrCur = rscratch1; + Register shiftRevCount = rscratch2; + + FloatRegister oldElem0 = v0; + FloatRegister oldElem1 = v1; + FloatRegister newElem = v2; + FloatRegister shiftVCount = v3; + FloatRegister shiftVRevCount = v4; + + __ cbz(idx, Exit); + __ add(newArr, newArr, newIdx, Assembler::LSL, 2); + + __ negw(shiftCount, shiftCount); + __ addw(shiftRevCount, shiftCount, 32); + + __ dup(shiftVCount, __ T4S, shiftCount); + __ dup(shiftVRevCount, __ T4S, shiftRevCount); + + __ BIND(ShiftSIMDLoop); + __ cmp(idx, (u1)4); + __ br(Assembler::LT, ShiftTwoLoop); + + // Calculate the load addresses + __ sub(idx, idx, 4); + __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); + __ add(newArrCur, newArr, idx, Assembler::LSL, 2); + __ add(oldArrCur, oldArrNext, 4); + + // Load 4 words and process + __ ld1(oldElem0, __ T4S, Address(oldArrCur)); + __ ld1(oldElem1, __ T4S, Address(oldArrNext)); + __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); + __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); + __ orr(newElem, __ T16B, oldElem0, oldElem1); + __ st1(newElem, __ T4S, Address(newArrCur)); + + __ b(ShiftSIMDLoop); + + __ BIND(ShiftTwoLoop); + __ cbz(idx, Exit); + __ cmp(idx, (u1)1); + __ br(Assembler::EQ, ShiftOne); + + // Calculate the load addresses + __ sub(idx, idx, 2); + __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); + __ add(newArrCur, newArr, idx, Assembler::LSL, 2); + __ add(oldArrCur, oldArrNext, 4); + + // Load 2 words and process + __ ld1(oldElem0, __ T2S, Address(oldArrCur)); + __ ld1(oldElem1, __ T2S, Address(oldArrNext)); + __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); + __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); + __ orr(newElem, __ T8B, oldElem0, oldElem1); + __ st1(newElem, __ T2S, Address(newArrCur)); + + __ b(ShiftTwoLoop); + + __ BIND(ShiftOne); + __ negw(shiftCount, shiftCount); + __ ldrw(r5, 
Address(oldArr, 4)); + __ ldrw(r6, Address(oldArr)); + __ lsrvw(r5, r5, shiftCount); + __ lslvw(r6, r6, shiftRevCount); + __ orrw(r7, r5, r6); + __ strw(r7, Address(newArr)); + + __ BIND(Exit); + __ ret(lr); + + return start; + } + + // Arguments: + // + // Input: + // c_rarg0 - newArr address + // c_rarg1 - oldArr address + // c_rarg2 - newIdx + // c_rarg3 - shiftCount + // c_rarg4 - numIter + // + address generate_bigIntegerLeftShift() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker"); + address start = __ pc(); + + Label ShiftSIMDLoop, ShiftTwoLoop, ShiftOne, Exit; + + Register newArr = c_rarg0; + Register oldArr = c_rarg1; + Register newIdx = c_rarg2; + Register shiftCount = c_rarg3; + Register numIter = c_rarg4; + Register shiftRevCount = rscratch1; + Register oldArrNext = rscratch2; + + FloatRegister oldElem0 = v0; + FloatRegister oldElem1 = v1; + FloatRegister newElem = v2; + FloatRegister shiftVCount = v3; + FloatRegister shiftVRevCount = v4; + + __ cbz(numIter, Exit); + + __ add(oldArrNext, oldArr, 4); + __ addw(shiftRevCount, shiftCount, -32); + __ add(newArr, newArr, newIdx, Assembler::LSL, 2); + + __ dup(shiftVCount, __ T4S, shiftCount); + __ dup(shiftVRevCount, __ T4S, shiftRevCount); + + __ BIND(ShiftSIMDLoop); + __ cmp(numIter, (u1)4); + __ br(Assembler::LT, ShiftTwoLoop); + + // load 4 words and process + __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); + __ ld1(oldElem1, __ T4S, __ post(oldArrNext, 16)); + __ ushl(oldElem0, __ T4S, oldElem0, shiftVCount); + __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); + __ orr(newElem, __ T16B, oldElem0, oldElem1); + __ st1(newElem, __ T4S, __ post(newArr, 16)); + __ sub(numIter, numIter, 4); + __ b(ShiftSIMDLoop); + + __ BIND(ShiftTwoLoop); + __ cbz(numIter, Exit); + __ cmp(numIter, (u1)1); + __ br(Assembler::EQ, ShiftOne); + + // load 2 words and process + __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); + __ ld1(oldElem1, __ T2S, __ post(oldArrNext, 8)); 
+ __ ushl(oldElem0, __ T2S, oldElem0, shiftVCount); + __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); + __ orr(newElem, __ T8B, oldElem0, oldElem1); + __ st1(newElem, __ T2S, __ post(newArr, 8)); + __ sub(numIter, numIter, 2); + + __ b(ShiftTwoLoop); + + __ BIND(ShiftOne); + __ negw(shiftRevCount, shiftRevCount); + __ ldrw(r5, Address(oldArr)); + __ ldrw(r6, Address(oldArrNext)); + __ lslvw(r5, r5, shiftCount); + __ lsrvw(r6, r6, shiftRevCount); + __ orrw(r7, r5, r6); + __ strw(r7, Address(newArr)); + + __ BIND(Exit); + __ ret(lr); + + return start; + } + void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { @@ -6224,6 +6406,11 @@ class StubGenerator: public StubCodeGenerator { StubRoutines::_mulAdd = generate_mulAdd(); } + if (UseSIMDForBigIntegerShiftIntrinsics) { + StubRoutines::_bigIntegerRightShiftWorker = generate_bigIntegerRightShift(); + StubRoutines::_bigIntegerLeftShiftWorker = generate_bigIntegerLeftShift(); + } + if (UseMontgomeryMultiplyIntrinsic) { StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); From f9e42b072e49ac018270276c762967bc6812ec07 Mon Sep 17 00:00:00 2001 From: Dong Bo Date: Fri, 23 Oct 2020 17:50:59 +0800 Subject: [PATCH 02/10] roll back for short magLen --- .../cpu/aarch64/stubGenerator_aarch64.cpp | 55 ++++++++++++------- 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp index 291087617de9a..7e8b53a0c42c0 100644 --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -3982,7 +3982,7 @@ class StubGenerator: public StubCodeGenerator { StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker"); address start = __ pc(); - Label 
ShiftSIMDLoop, ShiftTwoLoop, ShiftOne, Exit; + Label ShiftSIMDLoop, ShiftTwoLoop, ShiftOneLoop, Exit; Register newArr = c_rarg0; Register oldArr = c_rarg1; @@ -4002,13 +4002,18 @@ class StubGenerator: public StubCodeGenerator { FloatRegister shiftVRevCount = v4; __ cbz(idx, Exit); + __ add(newArr, newArr, newIdx, Assembler::LSL, 2); - __ negw(shiftCount, shiftCount); - __ addw(shiftRevCount, shiftCount, 32); + __ movw(shiftRevCount, 32); + __ subw(shiftRevCount, shiftRevCount, shiftCount); + + __ cmp(numIter, (u1) 8); + __ br(Assembler::LT, ShiftOneLoop); __ dup(shiftVCount, __ T4S, shiftCount); __ dup(shiftVRevCount, __ T4S, shiftRevCount); + __ negr(shiftVCount, __ T4S, shiftVCount); __ BIND(ShiftSIMDLoop); __ cmp(idx, (u1)4); @@ -4027,13 +4032,12 @@ class StubGenerator: public StubCodeGenerator { __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); __ orr(newElem, __ T16B, oldElem0, oldElem1); __ st1(newElem, __ T4S, Address(newArrCur)); - __ b(ShiftSIMDLoop); __ BIND(ShiftTwoLoop); __ cbz(idx, Exit); __ cmp(idx, (u1)1); - __ br(Assembler::EQ, ShiftOne); + __ br(Assembler::EQ, ShiftOneLoop); // Calculate the load addresses __ sub(idx, idx, 2); @@ -4048,17 +4052,21 @@ class StubGenerator: public StubCodeGenerator { __ ushl(oldElem1, __ T2S, oldElem1, shiftVRevCount); __ orr(newElem, __ T8B, oldElem0, oldElem1); __ st1(newElem, __ T2S, Address(newArrCur)); - __ b(ShiftTwoLoop); - __ BIND(ShiftOne); - __ negw(shiftCount, shiftCount); - __ ldrw(r5, Address(oldArr, 4)); - __ ldrw(r6, Address(oldArr)); + __ BIND(ShiftOneLoop); + __ sub(idx, idx, 1); + __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); + __ add(newArrCur, newArr, idx, Assembler::LSL, 2); + __ add(oldArrCur, oldArrNext, 4); + __ ldrw(r5, Address(oldArrCur)); + __ ldrw(r6, Address(oldArrNext)); __ lsrvw(r5, r5, shiftCount); __ lslvw(r6, r6, shiftRevCount); __ orrw(r7, r5, r6); - __ strw(r7, Address(newArr)); + __ strw(r7, Address(newArrCur)); + __ cbz(idx, Exit); + __ b(ShiftOneLoop); __ BIND(Exit); __ 
ret(lr); @@ -4080,7 +4088,7 @@ class StubGenerator: public StubCodeGenerator { StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker"); address start = __ pc(); - Label ShiftSIMDLoop, ShiftTwoLoop, ShiftOne, Exit; + Label ShiftSIMDLoop, ShiftTwoLoop, ShiftOneLoop, Exit; Register newArr = c_rarg0; Register oldArr = c_rarg1; @@ -4099,11 +4107,17 @@ class StubGenerator: public StubCodeGenerator { __ cbz(numIter, Exit); __ add(oldArrNext, oldArr, 4); - __ addw(shiftRevCount, shiftCount, -32); __ add(newArr, newArr, newIdx, Assembler::LSL, 2); + __ movw(shiftRevCount, 32); + __ subw(shiftRevCount, shiftRevCount, shiftCount); + + __ cmp(numIter, (u1) 8); + __ br(Assembler::LT, ShiftOneLoop); + __ dup(shiftVCount, __ T4S, shiftCount); __ dup(shiftVRevCount, __ T4S, shiftRevCount); + __ negr(shiftVRevCount, __ T4S, shiftVRevCount); __ BIND(ShiftSIMDLoop); __ cmp(numIter, (u1)4); @@ -4122,7 +4136,7 @@ class StubGenerator: public StubCodeGenerator { __ BIND(ShiftTwoLoop); __ cbz(numIter, Exit); __ cmp(numIter, (u1)1); - __ br(Assembler::EQ, ShiftOne); + __ br(Assembler::EQ, ShiftOneLoop); // load 2 words and process __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); @@ -4132,17 +4146,18 @@ class StubGenerator: public StubCodeGenerator { __ orr(newElem, __ T8B, oldElem0, oldElem1); __ st1(newElem, __ T2S, __ post(newArr, 8)); __ sub(numIter, numIter, 2); - __ b(ShiftTwoLoop); - __ BIND(ShiftOne); - __ negw(shiftRevCount, shiftRevCount); - __ ldrw(r5, Address(oldArr)); - __ ldrw(r6, Address(oldArrNext)); + __ BIND(ShiftOneLoop); + __ ldrw(r5, __ post(oldArr, 4)); + __ ldrw(r6, __ post(oldArrNext, 4)); __ lslvw(r5, r5, shiftCount); __ lsrvw(r6, r6, shiftRevCount); __ orrw(r7, r5, r6); - __ strw(r7, Address(newArr)); + __ strw(r7, __ post(newArr, 4)); + __ sub(numIter, numIter, 1); + __ cbz(numIter, Exit); + __ b(ShiftOneLoop); __ BIND(Exit); __ ret(lr); From 86de731027628d1cba639a2b3b18ebb9c2860f8d Mon Sep 17 00:00:00 2001 From: Dong Bo Date: Fri, 23 Oct 2020 18:27:05 
+0800 Subject: [PATCH 03/10] unify code style --- src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp index 7e8b53a0c42c0..4b98f74bea31c 100644 --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -4008,12 +4008,12 @@ class StubGenerator: public StubCodeGenerator { __ movw(shiftRevCount, 32); __ subw(shiftRevCount, shiftRevCount, shiftCount); - __ cmp(numIter, (u1) 8); + __ cmp(numIter, (u1)8); __ br(Assembler::LT, ShiftOneLoop); __ dup(shiftVCount, __ T4S, shiftCount); __ dup(shiftVRevCount, __ T4S, shiftRevCount); - __ negr(shiftVCount, __ T4S, shiftVCount); + __ negr(shiftVCount, __ T4S, shiftVCount); __ BIND(ShiftSIMDLoop); __ cmp(idx, (u1)4); @@ -4112,7 +4112,7 @@ class StubGenerator: public StubCodeGenerator { __ movw(shiftRevCount, 32); __ subw(shiftRevCount, shiftRevCount, shiftCount); - __ cmp(numIter, (u1) 8); + __ cmp(numIter, (u1)8); __ br(Assembler::LT, ShiftOneLoop); __ dup(shiftVCount, __ T4S, shiftCount); From 7466be062dda6e5033c128517d1143b2ee2b9ccb Mon Sep 17 00:00:00 2001 From: Dong Bo Date: Fri, 23 Oct 2020 18:38:47 +0800 Subject: [PATCH 04/10] more comments --- src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp index 4b98f74bea31c..c7e4e66375497 100644 --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -4005,9 +4005,11 @@ class StubGenerator: public StubCodeGenerator { __ add(newArr, newArr, newIdx, Assembler::LSL, 2); + // left shift count __ movw(shiftRevCount, 32); __ subw(shiftRevCount, shiftRevCount, shiftCount); + // numIter too short, rolling back __ cmp(numIter, (u1)8); __ 
br(Assembler::LT, ShiftOneLoop); @@ -4109,14 +4111,16 @@ class StubGenerator: public StubCodeGenerator { __ add(oldArrNext, oldArr, 4); __ add(newArr, newArr, newIdx, Assembler::LSL, 2); + // right shift count __ movw(shiftRevCount, 32); __ subw(shiftRevCount, shiftRevCount, shiftCount); + // numIter too short, rolling back __ cmp(numIter, (u1)8); __ br(Assembler::LT, ShiftOneLoop); - __ dup(shiftVCount, __ T4S, shiftCount); - __ dup(shiftVRevCount, __ T4S, shiftRevCount); + __ dup(shiftVCount, __ T4S, shiftCount); + __ dup(shiftVRevCount, __ T4S, shiftRevCount); __ negr(shiftVRevCount, __ T4S, shiftVRevCount); __ BIND(ShiftSIMDLoop); From f8f0b2976fb2ec77f93bddb0d43f1e6f0ae91095 Mon Sep 17 00:00:00 2001 From: Dong Bo Date: Sat, 24 Oct 2020 14:37:16 +0800 Subject: [PATCH 05/10] self-review --- .../cpu/aarch64/stubGenerator_aarch64.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp index c7e4e66375497..41695a35ba625 100644 --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -4009,8 +4009,8 @@ class StubGenerator: public StubCodeGenerator { __ movw(shiftRevCount, 32); __ subw(shiftRevCount, shiftRevCount, shiftCount); - // numIter too short, rolling back - __ cmp(numIter, (u1)8); + // numIter too small to allow a 4-words SIMD loop, rolling back + __ cmp(numIter, (u1)4); __ br(Assembler::LT, ShiftOneLoop); __ dup(shiftVCount, __ T4S, shiftCount); @@ -4018,8 +4018,6 @@ class StubGenerator: public StubCodeGenerator { __ negr(shiftVCount, __ T4S, shiftVCount); __ BIND(ShiftSIMDLoop); - __ cmp(idx, (u1)4); - __ br(Assembler::LT, ShiftTwoLoop); // Calculate the load addresses __ sub(idx, idx, 4); @@ -4034,6 +4032,9 @@ class StubGenerator: public StubCodeGenerator { __ ushl(oldElem1, __ T4S, oldElem1, shiftVRevCount); __ orr(newElem, __ T16B, oldElem0, oldElem1); __ 
st1(newElem, __ T4S, Address(newArrCur)); + + __ cmp(idx, (u1)4); + __ br(Assembler::LT, ShiftTwoLoop); __ b(ShiftSIMDLoop); __ BIND(ShiftTwoLoop); @@ -4115,8 +4116,8 @@ class StubGenerator: public StubCodeGenerator { __ movw(shiftRevCount, 32); __ subw(shiftRevCount, shiftRevCount, shiftCount); - // numIter too short, rolling back - __ cmp(numIter, (u1)8); + // numIter too small to allow a 4-words SIMD loop, rolling back + __ cmp(numIter, (u1)4); __ br(Assembler::LT, ShiftOneLoop); __ dup(shiftVCount, __ T4S, shiftCount); @@ -4124,8 +4125,6 @@ class StubGenerator: public StubCodeGenerator { __ negr(shiftVRevCount, __ T4S, shiftVRevCount); __ BIND(ShiftSIMDLoop); - __ cmp(numIter, (u1)4); - __ br(Assembler::LT, ShiftTwoLoop); // load 4 words and process __ ld1(oldElem0, __ T4S, __ post(oldArr, 16)); @@ -4135,6 +4134,9 @@ class StubGenerator: public StubCodeGenerator { __ orr(newElem, __ T16B, oldElem0, oldElem1); __ st1(newElem, __ T4S, __ post(newArr, 16)); __ sub(numIter, numIter, 4); + + __ cmp(numIter, (u1)4); + __ br(Assembler::LT, ShiftTwoLoop); __ b(ShiftSIMDLoop); __ BIND(ShiftTwoLoop); From 3dcc68fa9c54b20621ed67e2e2b57789b86d42ba Mon Sep 17 00:00:00 2001 From: Dong Bo Date: Sat, 24 Oct 2020 18:19:48 +0800 Subject: [PATCH 06/10] modify register usage --- .../cpu/aarch64/stubGenerator_aarch64.cpp | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp index 41695a35ba625..52c83a9e9e342 100644 --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -4062,12 +4062,12 @@ class StubGenerator: public StubCodeGenerator { __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); __ add(newArrCur, newArr, idx, Assembler::LSL, 2); __ add(oldArrCur, oldArrNext, 4); - __ ldrw(r5, Address(oldArrCur)); - __ ldrw(r6, Address(oldArrNext)); - __ lsrvw(r5, r5, shiftCount); - __ lslvw(r6, 
r6, shiftRevCount); - __ orrw(r7, r5, r6); - __ strw(r7, Address(newArrCur)); + __ ldrw(r10, Address(oldArrCur)); + __ ldrw(r11, Address(oldArrNext)); + __ lsrvw(r10, r10, shiftCount); + __ lslvw(r11, r11, shiftRevCount); + __ orrw(r12, r10, r11); + __ strw(r12, Address(newArrCur)); __ cbz(idx, Exit); __ b(ShiftOneLoop); @@ -4155,12 +4155,12 @@ class StubGenerator: public StubCodeGenerator { __ b(ShiftTwoLoop); __ BIND(ShiftOneLoop); - __ ldrw(r5, __ post(oldArr, 4)); - __ ldrw(r6, __ post(oldArrNext, 4)); - __ lslvw(r5, r5, shiftCount); - __ lsrvw(r6, r6, shiftRevCount); - __ orrw(r7, r5, r6); - __ strw(r7, __ post(newArr, 4)); + __ ldrw(r10, __ post(oldArr, 4)); + __ ldrw(r11, __ post(oldArrNext, 4)); + __ lslvw(r10, r10, shiftCount); + __ lsrvw(r11, r11, shiftRevCount); + __ orrw(r12, r10, r11); + __ strw(r12, __ post(newArr, 4)); __ sub(numIter, numIter, 1); __ cbz(numIter, Exit); __ b(ShiftOneLoop); From a9a5c1123cb3d9863c55f56d285e3d71ddd187ff Mon Sep 17 00:00:00 2001 From: Dong Bo Date: Mon, 26 Oct 2020 14:54:58 +0800 Subject: [PATCH 07/10] fix register usage --- src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp index 52c83a9e9e342..c311f282d143c 100644 --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -3990,8 +3990,8 @@ class StubGenerator: public StubCodeGenerator { Register shiftCount = c_rarg3; Register numIter = c_rarg4; Register idx = numIter; - Register oldArrCur = r5; - Register oldArrNext = r6; + Register oldArrCur = r13; + Register oldArrNext = r14; Register newArrCur = rscratch1; Register shiftRevCount = rscratch2; From 24a6d91c8365df3635e747acbb48baf3ab68ca2f Mon Sep 17 00:00:00 2001 From: Dong Bo Date: Mon, 26 Oct 2020 15:13:45 +0800 Subject: [PATCH 08/10] self-review: code style --- 
src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp index c311f282d143c..10d6b55b1a0b0 100644 --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -3990,10 +3990,11 @@ class StubGenerator: public StubCodeGenerator { Register shiftCount = c_rarg3; Register numIter = c_rarg4; Register idx = numIter; - Register oldArrCur = r13; - Register oldArrNext = r14; + Register newArrCur = rscratch1; Register shiftRevCount = rscratch2; + Register oldArrCur = r13; + Register oldArrNext = r14; FloatRegister oldElem0 = v0; FloatRegister oldElem1 = v1; @@ -4098,6 +4099,7 @@ class StubGenerator: public StubCodeGenerator { Register newIdx = c_rarg2; Register shiftCount = c_rarg3; Register numIter = c_rarg4; + Register shiftRevCount = rscratch1; Register oldArrNext = rscratch2; From e8df2a98cca7fed66e3085583ab28ce8e6cb502b Mon Sep 17 00:00:00 2001 From: d00348425 Date: Mon, 26 Oct 2020 17:36:41 -0400 Subject: [PATCH 09/10] fix trailing whitespace --- src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp index 10d6b55b1a0b0..901f0d22910da 100644 --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -3981,7 +3981,7 @@ class StubGenerator: public StubCodeGenerator { __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker"); address start = __ pc(); - + Label ShiftSIMDLoop, ShiftTwoLoop, ShiftOneLoop, Exit; Register newArr = c_rarg0; @@ -4091,7 +4091,7 @@ class StubGenerator: public StubCodeGenerator { __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", 
"bigIntegerLeftShiftWorker"); address start = __ pc(); - + Label ShiftSIMDLoop, ShiftTwoLoop, ShiftOneLoop, Exit; Register newArr = c_rarg0; From 7a5d76f51e693d441dee30b3d109d1b67b525378 Mon Sep 17 00:00:00 2001 From: d00348425 Date: Tue, 27 Oct 2020 14:26:02 -0400 Subject: [PATCH 10/10] minor improvements for small BigIntegers --- .../cpu/aarch64/stubGenerator_aarch64.cpp | 67 +++++++++++++------ .../openjdk/bench/java/math/BigIntegers.java | 38 ++++++++++- 2 files changed, 83 insertions(+), 22 deletions(-) diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp index 901f0d22910da..b39e793e850b8 100644 --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -3982,7 +3982,7 @@ class StubGenerator: public StubCodeGenerator { StubCodeMark mark(this, "StubRoutines", "bigIntegerRightShiftWorker"); address start = __ pc(); - Label ShiftSIMDLoop, ShiftTwoLoop, ShiftOneLoop, Exit; + Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; Register newArr = c_rarg0; Register oldArr = c_rarg1; @@ -4012,7 +4012,7 @@ class StubGenerator: public StubCodeGenerator { // numIter too small to allow a 4-words SIMD loop, rolling back __ cmp(numIter, (u1)4); - __ br(Assembler::LT, ShiftOneLoop); + __ br(Assembler::LT, ShiftThree); __ dup(shiftVCount, __ T4S, shiftCount); __ dup(shiftVRevCount, __ T4S, shiftRevCount); @@ -4041,7 +4041,7 @@ class StubGenerator: public StubCodeGenerator { __ BIND(ShiftTwoLoop); __ cbz(idx, Exit); __ cmp(idx, (u1)1); - __ br(Assembler::EQ, ShiftOneLoop); + __ br(Assembler::EQ, ShiftOne); // Calculate the load addresses __ sub(idx, idx, 2); @@ -4058,19 +4058,31 @@ class StubGenerator: public StubCodeGenerator { __ st1(newElem, __ T2S, Address(newArrCur)); __ b(ShiftTwoLoop); - __ BIND(ShiftOneLoop); - __ sub(idx, idx, 1); - __ add(oldArrNext, oldArr, idx, Assembler::LSL, 2); - __ add(newArrCur, newArr, idx, Assembler::LSL, 
2); - __ add(oldArrCur, oldArrNext, 4); - __ ldrw(r10, Address(oldArrCur)); - __ ldrw(r11, Address(oldArrNext)); + __ BIND(ShiftThree); + __ tbz(idx, 1, ShiftOne); + __ tbz(idx, 0, ShiftTwo); + __ ldrw(r10, Address(oldArr, 12)); + __ ldrw(r11, Address(oldArr, 8)); __ lsrvw(r10, r10, shiftCount); __ lslvw(r11, r11, shiftRevCount); __ orrw(r12, r10, r11); - __ strw(r12, Address(newArrCur)); - __ cbz(idx, Exit); - __ b(ShiftOneLoop); + __ strw(r12, Address(newArr, 8)); + + __ BIND(ShiftTwo); + __ ldrw(r10, Address(oldArr, 8)); + __ ldrw(r11, Address(oldArr, 4)); + __ lsrvw(r10, r10, shiftCount); + __ lslvw(r11, r11, shiftRevCount); + __ orrw(r12, r10, r11); + __ strw(r12, Address(newArr, 4)); + + __ BIND(ShiftOne); + __ ldrw(r10, Address(oldArr, 4)); + __ ldrw(r11, Address(oldArr)); + __ lsrvw(r10, r10, shiftCount); + __ lslvw(r11, r11, shiftRevCount); + __ orrw(r12, r10, r11); + __ strw(r12, Address(newArr)); __ BIND(Exit); __ ret(lr); @@ -4092,7 +4104,7 @@ class StubGenerator: public StubCodeGenerator { StubCodeMark mark(this, "StubRoutines", "bigIntegerLeftShiftWorker"); address start = __ pc(); - Label ShiftSIMDLoop, ShiftTwoLoop, ShiftOneLoop, Exit; + Label ShiftSIMDLoop, ShiftTwoLoop, ShiftThree, ShiftTwo, ShiftOne, Exit; Register newArr = c_rarg0; Register oldArr = c_rarg1; @@ -4120,7 +4132,7 @@ class StubGenerator: public StubCodeGenerator { // numIter too small to allow a 4-words SIMD loop, rolling back __ cmp(numIter, (u1)4); - __ br(Assembler::LT, ShiftOneLoop); + __ br(Assembler::LT, ShiftThree); __ dup(shiftVCount, __ T4S, shiftCount); __ dup(shiftVRevCount, __ T4S, shiftRevCount); @@ -4144,7 +4156,7 @@ class StubGenerator: public StubCodeGenerator { __ BIND(ShiftTwoLoop); __ cbz(numIter, Exit); __ cmp(numIter, (u1)1); - __ br(Assembler::EQ, ShiftOneLoop); + __ br(Assembler::EQ, ShiftOne); // load 2 words and process __ ld1(oldElem0, __ T2S, __ post(oldArr, 8)); @@ -4156,16 +4168,31 @@ class StubGenerator: public StubCodeGenerator { __ sub(numIter, 
numIter, 2); __ b(ShiftTwoLoop); - __ BIND(ShiftOneLoop); + __ BIND(ShiftThree); __ ldrw(r10, __ post(oldArr, 4)); __ ldrw(r11, __ post(oldArrNext, 4)); __ lslvw(r10, r10, shiftCount); __ lsrvw(r11, r11, shiftRevCount); __ orrw(r12, r10, r11); __ strw(r12, __ post(newArr, 4)); - __ sub(numIter, numIter, 1); - __ cbz(numIter, Exit); - __ b(ShiftOneLoop); + __ tbz(numIter, 1, Exit); + __ tbz(numIter, 0, ShiftOne); + + __ BIND(ShiftTwo); + __ ldrw(r10, __ post(oldArr, 4)); + __ ldrw(r11, __ post(oldArrNext, 4)); + __ lslvw(r10, r10, shiftCount); + __ lsrvw(r11, r11, shiftRevCount); + __ orrw(r12, r10, r11); + __ strw(r12, __ post(newArr, 4)); + + __ BIND(ShiftOne); + __ ldrw(r10, Address(oldArr)); + __ ldrw(r11, Address(oldArrNext)); + __ lslvw(r10, r10, shiftCount); + __ lsrvw(r11, r11, shiftRevCount); + __ orrw(r12, r10, r11); + __ strw(r12, Address(newArr)); __ BIND(Exit); __ ret(lr); diff --git a/test/micro/org/openjdk/bench/java/math/BigIntegers.java b/test/micro/org/openjdk/bench/java/math/BigIntegers.java index 8b26b47cf1c76..d3a20a3ee5e26 100644 --- a/test/micro/org/openjdk/bench/java/math/BigIntegers.java +++ b/test/micro/org/openjdk/bench/java/math/BigIntegers.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2020, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -34,6 +34,7 @@ import org.openjdk.jmh.annotations.Scope; import org.openjdk.jmh.annotations.Setup; import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Param; import org.openjdk.jmh.infra.Blackhole; import java.math.BigInteger; @@ -45,11 +46,14 @@ @State(Scope.Thread) public class BigIntegers { - private BigInteger[] hugeArray, largeArray, smallArray, shiftArray; + private BigInteger[] hugeArray, largeArray, smallArray, shiftArray, smallShiftArray; public String[] dummyStringArray; public Object[] dummyArr; private static final int TESTSIZE = 1000; + @Param({"32", "64", "96", "128", "160", "192", "224", "256"}) + private int maxNumbits; + @Setup public void setup() { Random r = new Random(1123); @@ -72,6 +76,9 @@ public void setup() { * Each array entry is atmost 16k bits * in size */ + smallShiftArray = new BigInteger[TESTSIZE]; /* + * Small numbers, bits count in range [maxNumbits - 31, maxNumbits] + */ dummyStringArray = new String[TESTSIZE]; dummyArr = new Object[TESTSIZE]; @@ -84,6 +91,7 @@ public void setup() { largeArray[i] = new BigInteger("" + ((long) value + (long) Integer.MAX_VALUE)); smallArray[i] = new BigInteger("" + ((long) value / 1000)); shiftArray[i] = new BigInteger(numbits, r); + smallShiftArray[i] = new BigInteger(Math.max(maxNumbits - value % 32, 0), r); } } @@ -177,4 +185,30 @@ public void testRightShift(Blackhole bh) { } bh.consume(tmp); } + + /** Invokes the shiftLeft method of small BigInteger with different values. */ + @Benchmark + @OperationsPerInvocation(TESTSIZE) + public void testSmallLeftShift(Blackhole bh) { + Random rand = new Random(); + int shift = rand.nextInt(30) + 1; + BigInteger tmp = null; + for (BigInteger s : smallShiftArray) { + tmp = s.shiftLeft(shift); + bh.consume(tmp); + } + } + + /** Invokes the shiftRight method of small BigInteger with different values. 
*/ + @Benchmark + @OperationsPerInvocation(TESTSIZE) + public void testSmallRightShift(Blackhole bh) { + Random rand = new Random(); + int shift = rand.nextInt(30) + 1; + BigInteger tmp = null; + for (BigInteger s : smallShiftArray) { + tmp = s.shiftRight(shift); + bh.consume(tmp); + } + } }