
Commit 82967f4

steveatgh authored and mcimadamore committed

8310159: Bulk copy with Unsafe::arrayCopy is slower compared to memcpy

Co-authored-by: Maurizio Cimadamore <[email protected]>
Reviewed-by: thartmann, jbhateja, sviswanathan

1 parent f0a12c5

File tree: 5 files changed, +259 -0 lines


src/hotspot/cpu/x86/assembler_x86.cpp (21 additions, 0 deletions)

@@ -3417,6 +3417,27 @@ void Assembler::evmovdquq(XMMRegister dst, KRegister mask, Address src, bool mer
   emit_operand(dst, src, 0);
 }
 
+void Assembler::evmovntdquq(Address dst, XMMRegister src, int vector_len) {
+  // Unmasked instruction
+  evmovntdquq(dst, k0, src, /*merge*/ true, vector_len);
+}
+
+void Assembler::evmovntdquq(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  assert(src != xnoreg, "sanity");
+  InstructionMark im(this);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
+  attributes.set_embedded_opmask_register_specifier(mask);
+  if (merge) {
+    attributes.reset_is_clear_context();
+  }
+  attributes.set_is_evex_instruction();
+  vex_prefix(dst, 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8(0xE7);
+  emit_operand(src, dst, 0);
+}
+
 void Assembler::evmovdquq(Address dst, XMMRegister src, int vector_len) {
   // Unmasked instruction
   evmovdquq(dst, k0, src, /*merge*/ true, vector_len);
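
The new evmovntdquq assembler routine emits the EVEX-encoded VMOVNTDQ instruction (opcode 0xE7 with the 66 prefix), a non-temporal ZMM store: the 64-byte write goes to memory through write-combining buffers instead of allocating a line in the cache, which keeps a huge copy from evicting useful data. A minimal user-space sketch of the same idea, using AVX-512 intrinsics rather than HotSpot's assembler (function name is ours; assumes an AVX-512F CPU and compilation with -mavx512f):

#include <immintrin.h>

// Store one 64-byte block without polluting the cache, then fence.
// _mm512_stream_si512 compiles to vmovntdq, the instruction the new
// assembler routine encodes; the destination must be 64-byte aligned.
void nt_store64(void* dst, const void* src) {
  __m512i v = _mm512_loadu_si512(src);                 // ordinary cached load
  _mm512_stream_si512(static_cast<__m512i*>(dst), v);  // non-temporal store
  _mm_sfence();  // NT stores are weakly ordered; fence before later stores
}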

src/hotspot/cpu/x86/assembler_x86.hpp (3 additions, 0 deletions)

@@ -1615,6 +1615,9 @@ class Assembler : public AbstractAssembler {
   void evmovdqul(XMMRegister dst, KRegister mask, Address src, bool merge, int vector_len);
   void evmovdqul(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
 
+  void evmovntdquq(Address dst, KRegister mask, XMMRegister src, bool merge, int vector_len);
+  void evmovntdquq(Address dst, XMMRegister src, int vector_len);
+
   void evmovdquq(Address dst, XMMRegister src, int vector_len);
   void evmovdquq(XMMRegister dst, Address src, int vector_len);
   void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len);

src/hotspot/cpu/x86/stubGenerator_x86_64.hpp (12 additions, 0 deletions)

@@ -187,18 +187,30 @@ class StubGenerator: public StubCodeGenerator {
                                    Register index, Register temp,
                                    bool use64byteVector, Label& L_entry, Label& L_exit);
 
+  void arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
+                                        Register to, Register count, int shift,
+                                        Register index, Register temp, Label& L_exit);
+
   void arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
                                              Register to, Register start_index, Register end_index,
                                              Register count, int shift, Register temp,
                                              bool use64byteVector, Label& L_entry, Label& L_exit);
 
+  void arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
+                            Register temp3, Register temp4, Register count,
+                            XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
+                            XMMRegister xmm4, int shift);
+
   void copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
                   int shift = Address::times_1, int offset = 0);
 
   void copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
                   bool conjoint, int shift = Address::times_1, int offset = 0,
                   bool use64byteVector = false);
 
+  void copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1, XMMRegister xmm2,
+                    XMMRegister xmm3, XMMRegister xmm4, int shift, int offset = 0);
+
   void copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
                          KRegister mask, Register length, Register index,
                          Register temp, int shift = Address::times_1, int offset = 0,

src/hotspot/cpu/x86/stubGenerator_x86_64_arraycopy.cpp (153 additions, 0 deletions)

@@ -515,8 +515,10 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const
 
   int avx3threshold = VM_Version::avx3_threshold();
   bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
+  const int large_threshold = 2621440; // 2.5 MB
   Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
   Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
+  Label L_copy_large, L_finish;
   const Register from = rdi;  // source array address
   const Register to = rsi;    // destination array address
   const Register count = rdx; // elements count
@@ -577,6 +579,12 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const
   // PRE-MAIN-POST loop for aligned copy.
   __ BIND(L_entry);
 
+  if (MaxVectorSize == 64) {
+    __ movq(temp2, temp1);
+    __ shlq(temp2, shift);
+    __ cmpq(temp2, large_threshold);
+    __ jcc(Assembler::greaterEqual, L_copy_large);
+  }
   if (avx3threshold != 0) {
     __ cmpq(count, threshold[shift]);
     if (MaxVectorSize == 64) {
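
For context: temp1 carries the element count at this point, and shift encodes the element size (byte = 0, short = 1, int = 2, long = 3), so the three added instructions compute the copy length in bytes and branch to the non-temporal path once it reaches 2.5 MB. A hypothetical scalar rendering of the same dispatch test (names are ours, not HotSpot's):

#include <cstdint>

constexpr int64_t kLargeThreshold = 2621440;  // 2.5 MB, as in the stub

inline bool takes_large_path(int64_t element_count, int shift) {
  return (element_count << shift) >= kLargeThreshold;  // bytes = count << shift
}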
@@ -703,6 +711,7 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const
     __ BIND(L_exit);
   }
 
+  __ BIND(L_finish);
   address ucme_exit_pc = __ pc();
   // When called from generic_arraycopy r11 contains specific values
   // used during arraycopy epilogue, re-initializing r11.
@@ -717,9 +726,77 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const
   __ leave(); // required for proper stackwalking of RuntimeStub frame
   __ ret(0);
 
+  if (MaxVectorSize == 64) {
+    __ BIND(L_copy_large);
+    arraycopy_avx3_large(to, from, temp1, temp2, temp3, temp4, count, xmm1, xmm2, xmm3, xmm4, shift);
+    __ jmp(L_finish);
+  }
   return start;
 }
 
+void StubGenerator::arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
+                                         Register temp3, Register temp4, Register count,
+                                         XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
+                                         XMMRegister xmm4, int shift) {
+
+  // Type(shift)      byte(0), short(1), int(2), long(3)
+  int loop_size[]   = { 256,    128,      64,     32};
+  int threshold[]   = { 4096,   2048,     1024,   512};
+
+  Label L_main_loop_large;
+  Label L_tail_large;
+  Label L_exit_large;
+  Label L_entry_large;
+  Label L_main_pre_loop_large;
+  Label L_pre_main_post_large;
+
+  assert(MaxVectorSize == 64, "vector length != 64");
+  __ BIND(L_entry_large);
+
+  __ BIND(L_pre_main_post_large);
+  // Partial copy to make dst address 64 byte aligned.
+  __ movq(temp2, to);
+  __ andq(temp2, 63);
+  __ jcc(Assembler::equal, L_main_pre_loop_large);
+
+  __ negptr(temp2);
+  __ addq(temp2, 64);
+  if (shift) {
+    __ shrq(temp2, shift);
+  }
+  __ movq(temp3, temp2);
+  copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
+  __ movq(temp4, temp2);
+  __ movq(temp1, count);
+  __ subq(temp1, temp2);
+
+  __ cmpq(temp1, loop_size[shift]);
+  __ jcc(Assembler::less, L_tail_large);
+
+  __ BIND(L_main_pre_loop_large);
+  __ subq(temp1, loop_size[shift]);
+
+  // Main loop with aligned copy block size of 256 bytes at 64 byte copy granularity.
+  __ align32();
+  __ BIND(L_main_loop_large);
+  copy256_avx3(to, from, temp4, xmm1, xmm2, xmm3, xmm4, shift, 0);
+  __ addptr(temp4, loop_size[shift]);
+  __ subq(temp1, loop_size[shift]);
+  __ jcc(Assembler::greater, L_main_loop_large);
+  // fence needed because copy256_avx3 uses non-temporal stores
+  __ sfence();
+
+  __ addq(temp1, loop_size[shift]);
+  // Zero length check.
+  __ jcc(Assembler::lessEqual, L_exit_large);
+  __ BIND(L_tail_large);
+  // Tail handling using 64 byte [masked] vector copy operations.
+  __ cmpq(temp1, 0);
+  __ jcc(Assembler::lessEqual, L_exit_large);
+  arraycopy_avx3_special_cases_256(xmm1, k2, from, to, temp1, shift,
+                                   temp4, temp3, L_exit_large);
+  __ BIND(L_exit_large);
+}
 
 // Inputs:
 //   c_rarg0   - source array address
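
arraycopy_avx3_large follows a classic pre/main/post shape: a masked partial copy first brings the destination up to 64-byte alignment, the main loop then moves 256 bytes per iteration with non-temporal stores, an sfence orders those weakly-ordered stores, and masked 64-byte copies drain the tail. A compilable sketch of the same structure, specialized to byte arrays and counting in bytes rather than elements (helper names are ours; assumes -mavx512f -mavx512bw):

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

// Copy up to 64 bytes under a mask, as copy64_masked_avx does with k2.
static void masked_copy64(uint8_t* dst, const uint8_t* src, size_t n) {
  __mmask64 m = (n >= 64) ? ~0ULL : ((1ULL << n) - 1);
  _mm512_mask_storeu_epi8(dst, m, _mm512_maskz_loadu_epi8(m, src));
}

void large_copy(uint8_t* dst, const uint8_t* src, size_t count) {
  // Pre: masked partial copy to 64-byte align the destination.
  size_t head = (64 - (reinterpret_cast<uintptr_t>(dst) & 63)) & 63;
  if (head > count) head = count;
  masked_copy64(dst, src, head);
  dst += head; src += head; count -= head;

  // Main: 256 bytes per iteration, stored non-temporally (dst is now aligned).
  size_t i = 0;
  for (; i + 256 <= count; i += 256) {
    for (size_t off = 0; off < 256; off += 64) {
      __m512i v = _mm512_loadu_si512(src + i + off);
      _mm512_stream_si512(reinterpret_cast<__m512i*>(dst + i + off), v);
    }
  }
  _mm_sfence();  // order the NT stores before the regular tail stores below

  // Post: drain the remainder with masked 64-byte copies.
  for (; i < count; i += 64) {
    masked_copy64(dst + i, src + i, count - i);
  }
}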
@@ -965,6 +1042,55 @@ void StubGenerator::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask
   __ jmp(L_exit);
 }
 
+void StubGenerator::arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
+                                                     Register to, Register count, int shift, Register index,
+                                                     Register temp, Label& L_exit) {
+  Label L_entry_64, L_entry_128, L_entry_192, L_entry_256;
+
+  int size_mat[][4] = {
+  /* T_BYTE */ {64, 128, 192, 256},
+  /* T_SHORT*/ {32, 64 , 96 , 128},
+  /* T_INT  */ {16, 32 , 48 , 64},
+  /* T_LONG */ { 8, 16 , 24 , 32}
+  };
+
+  assert(MaxVectorSize == 64, "vector length != 64");
+  // Case A) Special case for length less than or equal to 64 bytes.
+  __ BIND(L_entry_64);
+  __ cmpq(count, size_mat[shift][0]);
+  __ jccb(Assembler::greater, L_entry_128);
+  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, true);
+  __ jmp(L_exit);
+
+  // Case B) Special case for length less than or equal to 128 bytes.
+  __ BIND(L_entry_128);
+  __ cmpq(count, size_mat[shift][1]);
+  __ jccb(Assembler::greater, L_entry_192);
+  copy64_avx(to, from, index, xmm, false, shift, 0, true);
+  __ subq(count, 64 >> shift);
+  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64, true);
+  __ jmp(L_exit);
+
+  // Case C) Special case for length less than or equal to 192 bytes.
+  __ BIND(L_entry_192);
+  __ cmpq(count, size_mat[shift][2]);
+  __ jcc(Assembler::greater, L_entry_256);
+  copy64_avx(to, from, index, xmm, false, shift, 0, true);
+  copy64_avx(to, from, index, xmm, false, shift, 64, true);
+  __ subq(count, 128 >> shift);
+  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128, true);
+  __ jmp(L_exit);
+
+  // Case D) Special case for length less than or equal to 256 bytes.
+  __ BIND(L_entry_256);
+  copy64_avx(to, from, index, xmm, false, shift, 0, true);
+  copy64_avx(to, from, index, xmm, false, shift, 64, true);
+  copy64_avx(to, from, index, xmm, false, shift, 128, true);
+  __ subq(count, 192 >> shift);
+  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 192, true);
+  __ jmp(L_exit);
+}
+
 void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
                                                           Register to, Register start_index, Register end_index,
                                                           Register count, int shift, Register temp,
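
The size_mat thresholds are simply 64, 128, 192, and 256 bytes expressed in elements of each type, so the tail routine never needs more than four 64-byte vector moves plus one masked move for the final partial vector. A hypothetical one-liner that generates the same table entries:

// size_mat[shift][j] == (64 * (j + 1)) >> shift, i.e. a 64..256-byte
// case boundary converted from bytes to elements of size (1 << shift).
inline int size_mat_entry(int shift, int j) {  // shift: 0..3, j: 0..3
  return (64 * (j + 1)) >> shift;
}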
@@ -1040,6 +1166,33 @@ void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegi
   __ jmp(L_exit);
 }
 
+void StubGenerator::copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1,
+                                 XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
+                                 int shift, int offset) {
+  if (MaxVectorSize == 64) {
+    Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
+    __ prefetcht0(Address(src, index, scale, offset + 0x200));
+    __ prefetcht0(Address(src, index, scale, offset + 0x240));
+    __ prefetcht0(Address(src, index, scale, offset + 0x280));
+    __ prefetcht0(Address(src, index, scale, offset + 0x2C0));
+
+    __ prefetcht0(Address(src, index, scale, offset + 0x400));
+    __ prefetcht0(Address(src, index, scale, offset + 0x440));
+    __ prefetcht0(Address(src, index, scale, offset + 0x480));
+    __ prefetcht0(Address(src, index, scale, offset + 0x4C0));
+
+    __ evmovdquq(xmm1, Address(src, index, scale, offset), Assembler::AVX_512bit);
+    __ evmovdquq(xmm2, Address(src, index, scale, offset + 0x40), Assembler::AVX_512bit);
+    __ evmovdquq(xmm3, Address(src, index, scale, offset + 0x80), Assembler::AVX_512bit);
+    __ evmovdquq(xmm4, Address(src, index, scale, offset + 0xC0), Assembler::AVX_512bit);
+
+    __ evmovntdquq(Address(dst, index, scale, offset), xmm1, Assembler::AVX_512bit);
+    __ evmovntdquq(Address(dst, index, scale, offset + 0x40), xmm2, Assembler::AVX_512bit);
+    __ evmovntdquq(Address(dst, index, scale, offset + 0x80), xmm3, Assembler::AVX_512bit);
+    __ evmovntdquq(Address(dst, index, scale, offset + 0xC0), xmm4, Assembler::AVX_512bit);
+  }
+}
+
 void StubGenerator::copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
                                       KRegister mask, Register length, Register index,
                                       Register temp, int shift, int offset,
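
copy256_avx3 prefetches the cache lines 0x200 and 0x400 bytes (512 B and 1 KB) ahead of the current read position, then performs four cached 64-byte loads followed by four non-temporal 64-byte stores. The same block sketched with intrinsics (our naming; assumes a 64-byte-aligned dst, readable src up to 0x4FF bytes ahead, and -mavx512f):

#include <immintrin.h>
#include <stdint.h>

// One 256-byte block: prefetch two windows ahead, load four ZMM registers,
// then stream them out without allocating destination lines in the cache.
inline void copy256_block(uint8_t* dst, const uint8_t* src) {
  for (int line = 0; line < 4; line++) {
    _mm_prefetch(reinterpret_cast<const char*>(src + 0x200 + 0x40 * line), _MM_HINT_T0);
    _mm_prefetch(reinterpret_cast<const char*>(src + 0x400 + 0x40 * line), _MM_HINT_T0);
  }
  __m512i a = _mm512_loadu_si512(src + 0x00);
  __m512i b = _mm512_loadu_si512(src + 0x40);
  __m512i c = _mm512_loadu_si512(src + 0x80);
  __m512i d = _mm512_loadu_si512(src + 0xC0);
  _mm512_stream_si512(reinterpret_cast<__m512i*>(dst + 0x00), a);
  _mm512_stream_si512(reinterpret_cast<__m512i*>(dst + 0x40), b);
  _mm512_stream_si512(reinterpret_cast<__m512i*>(dst + 0x80), c);
  _mm512_stream_si512(reinterpret_cast<__m512i*>(dst + 0xC0), d);
}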
test/micro/org/openjdk/bench/java/lang/ArrayCopyAlignedLarge.java (70 additions, 0 deletions)

@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.java.lang;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Benchmark measuring aligned System.arraycopy.
+ */
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@State(Scope.Thread)
+@Warmup(iterations = 10, time = 1)
+@Measurement(iterations = 5, time = 1)
+@Fork(value = 3)
+public class ArrayCopyAlignedLarge {
+
+    @Param({"100000", "1000000", "2000000", "5000000", "10000000"})
+    int length;
+
+    int fromPos, toPos;
+    byte[] fromByteArr, toByteArr;
+
+    @Setup
+    public void setup() {
+        // Both positions aligned
+        fromPos = 0;
+        toPos = 0;
+
+        fromByteArr = new byte[length];
+        toByteArr = new byte[length];
+    }
+
+    @Benchmark
+    public void testByte() {
+        System.arraycopy(fromByteArr, fromPos, toByteArr, toPos, length);
+    }
+}
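
The parameter sweep brackets the stub's 2.5 MB (2,621,440-byte) cutoff: the 2,000,000-byte point lands just below the non-temporal path and the 5,000,000- and 10,000,000-byte points land above it, so the benchmark exercises both copy strategies. As with other files under test/micro, it should be runnable through the JDK build's JMH harness, for example via make test with a micro:java.lang.ArrayCopyAlignedLarge selector, assuming the standard OpenJDK build and test setup.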
