Skip to content

Commit 1ce2a36

Browse files
author
Andrew Haley
committed
8179444: AArch64: Put zero_words on a diet
Reviewed-by: roland
1 parent 99e8874 commit 1ce2a36

File tree

6 files changed

+184
-132
lines changed

6 files changed

+184
-132
lines changed

hotspot/src/cpu/aarch64/vm/aarch64.ad

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14021,10 +14021,12 @@ instruct clearArray_reg_reg(iRegL_R11 cnt, iRegP_R10 base, Universe dummy, rFlag
1402114021
ins_pipe(pipe_class_memory);
1402214022
%}
1402314023

14024-
instruct clearArray_imm_reg(immL cnt, iRegP_R10 base, iRegL_R11 tmp, Universe dummy, rFlagsReg cr)
14024+
instruct clearArray_imm_reg(immL cnt, iRegP_R10 base, Universe dummy, rFlagsReg cr)
1402514025
%{
14026+
predicate((u_int64_t)n->in(2)->get_long()
14027+
< (u_int64_t)(BlockZeroingLowLimit >> LogBytesPerWord));
1402614028
match(Set dummy (ClearArray cnt base));
14027-
effect(USE_KILL base, TEMP tmp);
14029+
effect(USE_KILL base);
1402814030

1402914031
ins_cost(4 * INSN_COST);
1403014032
format %{ "ClearArray $cnt, $base" %}

hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.cpp

Lines changed: 95 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -698,6 +698,7 @@ void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, in
698698
// trampolines won't be emitted.
699699

700700
address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
701+
assert(JavaThread::current()->is_Compiler_thread(), "just checking");
701702
assert(entry.rspec().type() == relocInfo::runtime_call_type
702703
|| entry.rspec().type() == relocInfo::opt_virtual_call_type
703704
|| entry.rspec().type() == relocInfo::static_call_type
@@ -4944,34 +4945,67 @@ void MacroAssembler::arrays_equals(Register a1, Register a2,
49444945
}
49454946

49464947

4947-
// base: Address of a buffer to be zeroed, 8 bytes aligned.
4948-
// cnt: Count in HeapWords.
4949-
// is_large: True when 'cnt' is known to be >= BlockZeroingLowLimit.
4950-
void MacroAssembler::zero_words(Register base, Register cnt)
4948+
// The size of the blocks erased by the zero_blocks stub. We must
4949+
// handle anything smaller than this ourselves in zero_words().
4950+
const int MacroAssembler::zero_words_block_size = 8;
4951+
4952+
// zero_words() is used by C2 ClearArray patterns. It is as small as
4953+
// possible, handling small word counts locally and delegating
4954+
// anything larger to the zero_blocks stub. It is expanded many times
4955+
// in compiled code, so it is important to keep it short.
4956+
4957+
// ptr: Address of a buffer to be zeroed.
4958+
// cnt: Count in HeapWords.
4959+
//
4960+
// ptr, cnt, rscratch1, and rscratch2 are clobbered.
4961+
void MacroAssembler::zero_words(Register ptr, Register cnt)
49514962
{
4952-
if (UseBlockZeroing) {
4953-
block_zero(base, cnt);
4954-
} else {
4955-
fill_words(base, cnt, zr);
4963+
assert(is_power_of_2(zero_words_block_size), "adjust this");
4964+
assert(ptr == r10 && cnt == r11, "mismatch in register usage");
4965+
4966+
BLOCK_COMMENT("zero_words {");
4967+
cmp(cnt, zero_words_block_size);
4968+
Label around, done, done16;
4969+
br(LO, around);
4970+
{
4971+
RuntimeAddress zero_blocks = RuntimeAddress(StubRoutines::aarch64::zero_blocks());
4972+
assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
4973+
if (StubRoutines::aarch64::complete()) {
4974+
trampoline_call(zero_blocks);
4975+
} else {
4976+
bl(zero_blocks);
4977+
}
4978+
}
4979+
bind(around);
4980+
for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
4981+
Label l;
4982+
tbz(cnt, exact_log2(i), l);
4983+
for (int j = 0; j < i; j += 2) {
4984+
stp(zr, zr, post(ptr, 16));
4985+
}
4986+
bind(l);
4987+
}
4988+
{
4989+
Label l;
4990+
tbz(cnt, 0, l);
4991+
str(zr, Address(ptr));
4992+
bind(l);
49564993
}
4994+
BLOCK_COMMENT("} zero_words");
49574995
}
49584996

4959-
// r10 = base: Address of a buffer to be zeroed, 8 bytes aligned.
4997+
// base: Address of a buffer to be zeroed, 8 bytes aligned.
49604998
// cnt: Immediate count in HeapWords.
4961-
// r11 = tmp: For use as cnt if we need to call out
4962-
#define ShortArraySize (18 * BytesPerLong)
4999+
#define SmallArraySize (18 * BytesPerLong)
49635000
void MacroAssembler::zero_words(Register base, u_int64_t cnt)
49645001
{
4965-
Register tmp = r11;
5002+
BLOCK_COMMENT("zero_words {");
49665003
int i = cnt & 1; // store any odd word to start
49675004
if (i) str(zr, Address(base));
49685005

4969-
if (cnt <= ShortArraySize / BytesPerLong) {
5006+
if (cnt <= SmallArraySize / BytesPerLong) {
49705007
for (; i < (int)cnt; i += 2)
49715008
stp(zr, zr, Address(base, i * wordSize));
4972-
} else if (UseBlockZeroing && cnt >= (u_int64_t)(BlockZeroingLowLimit >> LogBytesPerWord)) {
4973-
mov(tmp, cnt);
4974-
block_zero(base, tmp, true);
49755009
} else {
49765010
const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
49775011
int remainder = cnt % (2 * unroll);
@@ -4992,6 +5026,51 @@ void MacroAssembler::zero_words(Register base, u_int64_t cnt)
49925026
stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
49935027
cbnz(cnt_reg, loop);
49945028
}
5029+
BLOCK_COMMENT("} zero_words");
5030+
}
5031+
5032+
// Zero blocks of memory by using DC ZVA.
5033+
//
5034+
// Aligns the base address first sufficiently for DC ZVA, then uses
5035+
// DC ZVA repeatedly for every full block. cnt is the size to be
5036+
// zeroed in HeapWords. Returns the count of words left to be zeroed
5037+
// in cnt.
5038+
//
5039+
// NOTE: This is intended to be used in the zero_blocks() stub. If
5040+
// you want to use it elsewhere, note that cnt must be >= 2*zva_length.
5041+
void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5042+
Register tmp = rscratch1;
5043+
Register tmp2 = rscratch2;
5044+
int zva_length = VM_Version::zva_length();
5045+
Label initial_table_end, loop_zva;
5046+
Label fini;
5047+
5048+
// Base must be 16-byte aligned. If not, just return and let the caller handle it.
5049+
tst(base, 0x0f);
5050+
br(Assembler::NE, fini);
5051+
// Align base with ZVA length.
5052+
neg(tmp, base);
5053+
andr(tmp, tmp, zva_length - 1);
5054+
5055+
// tmp: the number of bytes to be filled to align the base with ZVA length.
5056+
add(base, base, tmp);
5057+
sub(cnt, cnt, tmp, Assembler::ASR, 3);
5058+
adr(tmp2, initial_table_end);
5059+
sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5060+
br(tmp2);
5061+
5062+
for (int i = -zva_length + 16; i < 0; i += 16)
5063+
stp(zr, zr, Address(base, i));
5064+
bind(initial_table_end);
5065+
5066+
sub(cnt, cnt, zva_length >> 3);
5067+
bind(loop_zva);
5068+
dc(Assembler::ZVA, base);
5069+
subs(cnt, cnt, zva_length >> 3);
5070+
add(base, base, zva_length);
5071+
br(Assembler::GE, loop_zva);
5072+
add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5073+
bind(fini);
49955074
}
49965075

49975076
// base: Address of a buffer to be filled, 8 bytes aligned.
@@ -5052,69 +5131,6 @@ void MacroAssembler::fill_words(Register base, Register cnt, Register value)
50525131
bind(fini);
50535132
}
50545133

5055-
// Use DC ZVA to do fast zeroing.
5056-
// base: Address of a buffer to be zeroed, 8 bytes aligned.
5057-
// cnt: Count in HeapWords.
5058-
// is_large: True when 'cnt' is known to be >= BlockZeroingLowLimit.
5059-
void MacroAssembler::block_zero(Register base, Register cnt, bool is_large)
5060-
{
5061-
Label small;
5062-
Label store_pair, loop_store_pair, done;
5063-
Label base_aligned;
5064-
5065-
assert_different_registers(base, cnt, rscratch1);
5066-
guarantee(base == r10 && cnt == r11, "fix register usage");
5067-
5068-
Register tmp = rscratch1;
5069-
Register tmp2 = rscratch2;
5070-
int zva_length = VM_Version::zva_length();
5071-
5072-
// Ensure ZVA length can be divided by 16. This is required by
5073-
// the subsequent operations.
5074-
assert (zva_length % 16 == 0, "Unexpected ZVA Length");
5075-
5076-
if (!is_large) cbz(cnt, done);
5077-
tbz(base, 3, base_aligned);
5078-
str(zr, Address(post(base, 8)));
5079-
sub(cnt, cnt, 1);
5080-
bind(base_aligned);
5081-
5082-
// Ensure count >= zva_length * 2 so that it still deserves a zva after
5083-
// alignment.
5084-
if (!is_large || !(BlockZeroingLowLimit >= zva_length * 2)) {
5085-
int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
5086-
subs(tmp, cnt, low_limit >> 3);
5087-
br(Assembler::LT, small);
5088-
}
5089-
5090-
far_call(StubRoutines::aarch64::get_zero_longs());
5091-
5092-
bind(small);
5093-
5094-
const int unroll = 8; // Number of stp instructions we'll unroll
5095-
Label small_loop, small_table_end;
5096-
5097-
andr(tmp, cnt, (unroll-1) * 2);
5098-
sub(cnt, cnt, tmp);
5099-
add(base, base, tmp, Assembler::LSL, 3);
5100-
adr(tmp2, small_table_end);
5101-
sub(tmp2, tmp2, tmp, Assembler::LSL, 1);
5102-
br(tmp2);
5103-
5104-
bind(small_loop);
5105-
add(base, base, unroll * 16);
5106-
for (int i = -unroll; i < 0; i++)
5107-
stp(zr, zr, Address(base, i * 16));
5108-
bind(small_table_end);
5109-
subs(cnt, cnt, unroll * 2);
5110-
br(Assembler::GE, small_loop);
5111-
5112-
tbz(cnt, 0, done);
5113-
str(zr, Address(post(base, 8)));
5114-
5115-
bind(done);
5116-
}
5117-
51185134
// Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
51195135
// java/lang/StringUTF16.compress.
51205136
void MacroAssembler::encode_iso_array(Register src, Register dst,

hotspot/src/cpu/aarch64/vm/macroAssembler_aarch64.hpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1213,8 +1213,10 @@ class MacroAssembler: public Assembler {
12131213

12141214
void fill_words(Register base, Register cnt, Register value);
12151215
void zero_words(Register base, u_int64_t cnt);
1216-
void zero_words(Register base, Register cnt);
1217-
void block_zero(Register base, Register cnt, bool is_large = false);
1216+
void zero_words(Register ptr, Register cnt);
1217+
void zero_dcache_blocks(Register base, Register cnt);
1218+
1219+
static const int zero_words_block_size;
12181220

12191221
void byte_array_inflate(Register src, Register dst, Register len,
12201222
FloatRegister vtmp1, FloatRegister vtmp2,

hotspot/src/cpu/aarch64/vm/stubGenerator_aarch64.cpp

Lines changed: 66 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -719,48 +719,74 @@ class StubGenerator: public StubCodeGenerator {
719719
}
720720
}
721721

722-
address generate_zero_longs(Register base, Register cnt) {
723-
Register tmp = rscratch1;
724-
Register tmp2 = rscratch2;
725-
int zva_length = VM_Version::zva_length();
726-
Label initial_table_end, loop_zva;
727-
Label fini;
722+
// The inner part of zero_words(). This is the bulk operation,
723+
// zeroing words in blocks, possibly using DC ZVA to do it. The
724+
// caller is responsible for zeroing the last few words.
725+
//
726+
// Inputs:
727+
// r10: the HeapWord-aligned base address of an array to zero.
728+
// r11: the count in HeapWords, r11 > 0.
729+
//
730+
// Returns r10 and r11, adjusted for the caller to clear.
731+
// r10: the base address of the tail of words left to clear.
732+
// r11: the number of words in the tail.
733+
// r11 < MacroAssembler::zero_words_block_size.
734+
735+
address generate_zero_blocks() {
736+
Label store_pair, loop_store_pair, done;
737+
Label base_aligned;
738+
739+
Register base = r10, cnt = r11;
728740

729741
__ align(CodeEntryAlignment);
730-
StubCodeMark mark(this, "StubRoutines", "zero_longs");
742+
StubCodeMark mark(this, "StubRoutines", "zero_blocks");
731743
address start = __ pc();
732744

733-
// Base must be 16 byte aligned. If not just return and let caller handle it
734-
__ tst(base, 0x0f);
735-
__ br(Assembler::NE, fini);
736-
// Align base with ZVA length.
737-
__ neg(tmp, base);
738-
__ andr(tmp, tmp, zva_length - 1);
739-
740-
// tmp: the number of bytes to be filled to align the base with ZVA length.
741-
__ add(base, base, tmp);
742-
__ sub(cnt, cnt, tmp, Assembler::ASR, 3);
743-
__ adr(tmp2, initial_table_end);
744-
__ sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
745-
__ br(tmp2);
746-
747-
for (int i = -zva_length + 16; i < 0; i += 16)
748-
__ stp(zr, zr, Address(base, i));
749-
__ bind(initial_table_end);
750-
751-
__ sub(cnt, cnt, zva_length >> 3);
752-
__ bind(loop_zva);
753-
__ dc(Assembler::ZVA, base);
754-
__ subs(cnt, cnt, zva_length >> 3);
755-
__ add(base, base, zva_length);
756-
__ br(Assembler::GE, loop_zva);
757-
__ add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
758-
__ bind(fini);
745+
if (UseBlockZeroing) {
746+
int zva_length = VM_Version::zva_length();
747+
748+
// Ensure ZVA length can be divided by 16. This is required by
749+
// the subsequent operations.
750+
assert (zva_length % 16 == 0, "Unexpected ZVA Length");
751+
752+
__ tbz(base, 3, base_aligned);
753+
__ str(zr, Address(__ post(base, 8)));
754+
__ sub(cnt, cnt, 1);
755+
__ bind(base_aligned);
756+
757+
// Ensure count >= zva_length * 2 so that it still deserves a zva after
758+
// alignment.
759+
Label small;
760+
int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
761+
__ cmp(cnt, low_limit >> 3);
762+
__ br(Assembler::LT, small);
763+
__ zero_dcache_blocks(base, cnt);
764+
__ bind(small);
765+
}
766+
767+
{
768+
// Number of stp instructions we'll unroll
769+
const int unroll =
770+
MacroAssembler::zero_words_block_size / 2;
771+
// Clear the remaining blocks.
772+
Label loop;
773+
__ subs(cnt, cnt, unroll * 2);
774+
__ br(Assembler::LT, done);
775+
__ bind(loop);
776+
for (int i = 0; i < unroll; i++)
777+
__ stp(zr, zr, __ post(base, 16));
778+
__ subs(cnt, cnt, unroll * 2);
779+
__ br(Assembler::GE, loop);
780+
__ bind(done);
781+
__ add(cnt, cnt, unroll * 2);
782+
}
783+
759784
__ ret(lr);
760785

761786
return start;
762787
}
763788

789+
764790
typedef enum {
765791
copy_forwards = 1,
766792
copy_backwards = -1
@@ -2346,20 +2372,16 @@ class StubGenerator: public StubCodeGenerator {
23462372
__ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
23472373
if (UseBlockZeroing) {
23482374
Label non_block_zeroing, rest;
2349-
Register tmp = rscratch1;
2350-
// count >= BlockZeroingLowLimit && value == 0
2351-
__ subs(tmp, cnt_words, BlockZeroingLowLimit >> 3);
2352-
__ ccmp(value, 0 /* comparing value */, 0 /* NZCV */, Assembler::GE);
2353-
__ br(Assembler::NE, non_block_zeroing);
2375+
// If the fill value is zero we can use the fast zero_words().
2376+
__ cbnz(value, non_block_zeroing);
23542377
__ mov(bz_base, to);
2355-
__ block_zero(bz_base, cnt_words, true);
2356-
__ mov(to, bz_base);
2378+
__ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2379+
__ zero_words(bz_base, cnt_words);
23572380
__ b(rest);
23582381
__ bind(non_block_zeroing);
23592382
__ fill_words(to, cnt_words, value);
23602383
__ bind(rest);
2361-
}
2362-
else {
2384+
} else {
23632385
__ fill_words(to, cnt_words, value);
23642386
}
23652387

@@ -2420,7 +2442,7 @@ class StubGenerator: public StubCodeGenerator {
24202442
generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
24212443
generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
24222444

2423-
StubRoutines::aarch64::_zero_longs = generate_zero_longs(r10, r11);
2445+
StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
24242446

24252447
//*** jbyte
24262448
// Always need aligned and unaligned versions
@@ -4769,6 +4791,7 @@ class StubGenerator: public StubCodeGenerator {
47694791
&StubRoutines::_safefetchN_fault_pc,
47704792
&StubRoutines::_safefetchN_continuation_pc);
47714793
#endif
4794+
StubRoutines::aarch64::set_completed();
47724795
}
47734796

47744797
public:

0 commit comments

Comments
 (0)