@@ -515,8 +515,10 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const
 
   int avx3threshold = VM_Version::avx3_threshold();
   bool use64byteVector = (MaxVectorSize > 32) && (avx3threshold == 0);
+  const int large_threshold = 2621440; // 2.5 MB
   Label L_main_loop, L_main_loop_64bytes, L_tail, L_tail64, L_exit, L_entry;
   Label L_repmovs, L_main_pre_loop, L_main_pre_loop_64bytes, L_pre_main_post_64;
+  Label L_copy_large, L_finish;
   const Register from = rdi;  // source array address
   const Register to = rsi;    // destination array address
   const Register count = rdx; // elements count
@@ -577,6 +579,12 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const
   // PRE-MAIN-POST loop for aligned copy.
   __ BIND(L_entry);
 
+  if (MaxVectorSize == 64) {
+    __ movq(temp2, temp1);
+    __ shlq(temp2, shift);
+    __ cmpq(temp2, large_threshold);
+    __ jcc(Assembler::greaterEqual, L_copy_large);
+  }
   if (avx3threshold != 0) {
     __ cmpq(count, threshold[shift]);
     if (MaxVectorSize == 64) {
@@ -703,6 +711,7 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const
     __ BIND(L_exit);
   }
 
+  __ BIND(L_finish);
   address ucme_exit_pc = __ pc();
   // When called from generic_arraycopy r11 contains specific values
   // used during arraycopy epilogue, re-initializing r11.
@@ -717,9 +726,77 @@ address StubGenerator::generate_disjoint_copy_avx3_masked(address* entry, const
   __ leave(); // required for proper stackwalking of RuntimeStub frame
   __ ret(0);
 
+  if (MaxVectorSize == 64) {
+    __ BIND(L_copy_large);
+    arraycopy_avx3_large(to, from, temp1, temp2, temp3, temp4, count, xmm1, xmm2, xmm3, xmm4, shift);
+    __ jmp(L_finish);
+  }
   return start;
 }
 
+void StubGenerator::arraycopy_avx3_large(Register to, Register from, Register temp1, Register temp2,
+                                         Register temp3, Register temp4, Register count,
+                                         XMMRegister xmm1, XMMRegister xmm2, XMMRegister xmm3,
+                                         XMMRegister xmm4, int shift) {
+
+  // Type(shift)     byte(0), short(1), int(2), long(3)
+  int loop_size[] = {   256,      128,     64,      32 };
+  int threshold[] = {  4096,     2048,   1024,     512 };
+
+  Label L_main_loop_large;
+  Label L_tail_large;
+  Label L_exit_large;
+  Label L_entry_large;
+  Label L_main_pre_loop_large;
+  Label L_pre_main_post_large;
+
+  assert(MaxVectorSize == 64, "vector length != 64");
+  __ BIND(L_entry_large);
+
+  __ BIND(L_pre_main_post_large);
+  // Partial copy to make dst address 64 byte aligned.
+  __ movq(temp2, to);
+  __ andq(temp2, 63);
+  __ jcc(Assembler::equal, L_main_pre_loop_large);
+
+  __ negptr(temp2);
+  __ addq(temp2, 64);
+  if (shift) {
+    __ shrq(temp2, shift);
+  }
+  __ movq(temp3, temp2);
+  copy64_masked_avx(to, from, xmm1, k2, temp3, temp4, temp1, shift, 0, true);
+  __ movq(temp4, temp2);
+  __ movq(temp1, count);
+  __ subq(temp1, temp2);
+
+  __ cmpq(temp1, loop_size[shift]);
+  __ jcc(Assembler::less, L_tail_large);
+
+  __ BIND(L_main_pre_loop_large);
+  __ subq(temp1, loop_size[shift]);
+
+  // Main loop with aligned copy block size of 256 bytes at 64 byte copy granularity.
+  __ align32();
+  __ BIND(L_main_loop_large);
+  copy256_avx3(to, from, temp4, xmm1, xmm2, xmm3, xmm4, shift, 0);
+  __ addptr(temp4, loop_size[shift]);
+  __ subq(temp1, loop_size[shift]);
+  __ jcc(Assembler::greater, L_main_loop_large);
+  // fence needed because copy256_avx3 uses non-temporal stores
+  __ sfence();
+
+  __ addq(temp1, loop_size[shift]);
+  // Zero length check.
+  __ jcc(Assembler::lessEqual, L_exit_large);
+  __ BIND(L_tail_large);
+  // Tail handling using 64 byte [masked] vector copy operations.
+  __ cmpq(temp1, 0);
+  __ jcc(Assembler::lessEqual, L_exit_large);
+  arraycopy_avx3_special_cases_256(xmm1, k2, from, to, temp1, shift,
+                                   temp4, temp3, L_exit_large);
+  __ BIND(L_exit_large);
+}
 
 // Inputs:
 //   c_rarg0   - source array address
@@ -965,6 +1042,55 @@ void StubGenerator::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask
   __ jmp(L_exit);
 }
 
+void StubGenerator::arraycopy_avx3_special_cases_256(XMMRegister xmm, KRegister mask, Register from,
+                                                     Register to, Register count, int shift, Register index,
+                                                     Register temp, Label& L_exit) {
+  Label L_entry_64, L_entry_128, L_entry_192, L_entry_256;
+
+  int size_mat[][4] = {
+  /* T_BYTE */ { 64, 128, 192, 256 },
+  /* T_SHORT*/ { 32,  64,  96, 128 },
+  /* T_INT  */ { 16,  32,  48,  64 },
+  /* T_LONG */ {  8,  16,  24,  32 }
+  };
+
+  assert(MaxVectorSize == 64, "vector length != 64");
+  // Case A) Special case for length less than or equal to 64 bytes.
+  __ BIND(L_entry_64);
+  __ cmpq(count, size_mat[shift][0]);
+  __ jccb(Assembler::greater, L_entry_128);
+  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, true);
+  __ jmp(L_exit);
+
+  // Case B) Special case for length less than or equal to 128 bytes.
+  __ BIND(L_entry_128);
+  __ cmpq(count, size_mat[shift][1]);
+  __ jccb(Assembler::greater, L_entry_192);
+  copy64_avx(to, from, index, xmm, false, shift, 0, true);
+  __ subq(count, 64 >> shift);
+  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64, true);
+  __ jmp(L_exit);
+
+  // Case C) Special case for length less than or equal to 192 bytes.
+  __ BIND(L_entry_192);
+  __ cmpq(count, size_mat[shift][2]);
+  __ jcc(Assembler::greater, L_entry_256);
+  copy64_avx(to, from, index, xmm, false, shift, 0, true);
+  copy64_avx(to, from, index, xmm, false, shift, 64, true);
+  __ subq(count, 128 >> shift);
+  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128, true);
+  __ jmp(L_exit);
+
+  // Case D) Special case for length less than or equal to 256 bytes.
+  __ BIND(L_entry_256);
+  copy64_avx(to, from, index, xmm, false, shift, 0, true);
+  copy64_avx(to, from, index, xmm, false, shift, 64, true);
+  copy64_avx(to, from, index, xmm, false, shift, 128, true);
+  __ subq(count, 192 >> shift);
+  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 192, true);
+  __ jmp(L_exit);
+}
+
 void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
                                                           Register to, Register start_index, Register end_index,
                                                           Register count, int shift, Register temp,
@@ -1040,6 +1166,33 @@ void StubGenerator::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegi
   __ jmp(L_exit);
 }
 
+void StubGenerator::copy256_avx3(Register dst, Register src, Register index, XMMRegister xmm1,
+                                 XMMRegister xmm2, XMMRegister xmm3, XMMRegister xmm4,
+                                 int shift, int offset) {
+  if (MaxVectorSize == 64) {
+    Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
+    __ prefetcht0(Address(src, index, scale, offset + 0x200));
+    __ prefetcht0(Address(src, index, scale, offset + 0x240));
+    __ prefetcht0(Address(src, index, scale, offset + 0x280));
+    __ prefetcht0(Address(src, index, scale, offset + 0x2C0));
+
+    __ prefetcht0(Address(src, index, scale, offset + 0x400));
+    __ prefetcht0(Address(src, index, scale, offset + 0x440));
+    __ prefetcht0(Address(src, index, scale, offset + 0x480));
+    __ prefetcht0(Address(src, index, scale, offset + 0x4C0));
+
+    __ evmovdquq(xmm1, Address(src, index, scale, offset), Assembler::AVX_512bit);
+    __ evmovdquq(xmm2, Address(src, index, scale, offset + 0x40), Assembler::AVX_512bit);
+    __ evmovdquq(xmm3, Address(src, index, scale, offset + 0x80), Assembler::AVX_512bit);
+    __ evmovdquq(xmm4, Address(src, index, scale, offset + 0xC0), Assembler::AVX_512bit);
+
+    __ evmovntdquq(Address(dst, index, scale, offset), xmm1, Assembler::AVX_512bit);
+    __ evmovntdquq(Address(dst, index, scale, offset + 0x40), xmm2, Assembler::AVX_512bit);
+    __ evmovntdquq(Address(dst, index, scale, offset + 0x80), xmm3, Assembler::AVX_512bit);
+    __ evmovntdquq(Address(dst, index, scale, offset + 0xC0), xmm4, Assembler::AVX_512bit);
+  }
+}
+
 void StubGenerator::copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
                                       KRegister mask, Register length, Register index,
                                       Register temp, int shift, int offset,
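
As an aside (not part of the patch): the large-copy path above boils down to aligning the destination to 64 bytes with a masked partial copy, copying 256 bytes per main-loop iteration with prefetches and non-temporal stores, fencing, and finishing the tail with ordinary 64-byte [masked] copies. Below is a minimal C++ intrinsics sketch of that shape, assuming AVX-512F and <immintrin.h>; large_copy_bytes is a hypothetical name, not a HotSpot function.

// Illustrative sketch only -- a C++ intrinsics analogue of the stub's large-copy path.
// Compile with -mavx512f; not the actual stub, just the same overall structure.
#include <immintrin.h>
#include <cstddef>
#include <cstdint>
#include <cstring>

static void large_copy_bytes(char* dst, const char* src, std::size_t len) {
  // Partial copy so that dst becomes 64-byte aligned (the stub uses a masked 64-byte copy).
  std::size_t peel = (64 - (reinterpret_cast<std::uintptr_t>(dst) & 63)) & 63;
  if (peel > len) peel = len;
  std::memcpy(dst, src, peel);
  dst += peel; src += peel; len -= peel;

  // Main loop: 256 bytes per iteration, prefetch ahead, stores bypass the cache.
  while (len >= 256) {
    _mm_prefetch(src + 0x200, _MM_HINT_T0);
    _mm_prefetch(src + 0x240, _MM_HINT_T0);
    _mm_prefetch(src + 0x280, _MM_HINT_T0);
    _mm_prefetch(src + 0x2C0, _MM_HINT_T0);

    __m512i a = _mm512_loadu_si512(src);
    __m512i b = _mm512_loadu_si512(src + 0x40);
    __m512i c = _mm512_loadu_si512(src + 0x80);
    __m512i d = _mm512_loadu_si512(src + 0xC0);

    _mm512_stream_si512(reinterpret_cast<__m512i*>(dst),        a); // non-temporal; dst is 64-byte aligned
    _mm512_stream_si512(reinterpret_cast<__m512i*>(dst + 0x40), b);
    _mm512_stream_si512(reinterpret_cast<__m512i*>(dst + 0x80), c);
    _mm512_stream_si512(reinterpret_cast<__m512i*>(dst + 0xC0), d);

    src += 256; dst += 256; len -= 256;
  }
  _mm_sfence();               // order the non-temporal stores before anything that follows
  std::memcpy(dst, src, len); // tail; the stub uses 64-byte [masked] vector copies here
}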