Skip to content

Commit 857b0f9

Browse files
author
Xiaohong Gong
committed
8293409: [vectorapi] Intrinsify VectorSupport.indexVector
Reviewed-by: eliu, jbhateja
1 parent 3f3d63d commit 857b0f9

File tree

14 files changed

+382
-30
lines changed

14 files changed

+382
-30
lines changed

src/hotspot/cpu/aarch64/aarch64_vector.ad

Lines changed: 14 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -457,22 +457,31 @@ instruct storeV_masked(vReg src, vmemA mem, pRegGov pg) %{
457457

458458
// vector load const
459459

460-
instruct vloadconB(vReg dst, immI0 src) %{
461-
predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
460+
instruct vloadcon(vReg dst, immI0 src) %{
462461
match(Set dst (VectorLoadConst src));
463-
format %{ "vloadconB $dst, $src\t# load/generate iota indices" %}
462+
format %{ "vloadcon $dst, $src\t# load/generate iota indices" %}
464463
ins_encode %{
464+
BasicType bt = Matcher::vector_element_basic_type(this);
465465
if (UseSVE == 0) {
466466
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
467467
assert(length_in_bytes <= 16, "must be");
468-
__ lea(rscratch1, ExternalAddress(StubRoutines::aarch64::vector_iota_indices()));
468+
// The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 16.
469+
int offset = exact_log2(type2aelembytes(bt)) << 4;
470+
if (is_floating_point_type(bt)) {
471+
offset += 32;
472+
}
473+
__ lea(rscratch1, ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + offset));
469474
if (length_in_bytes == 16) {
470475
__ ldrq($dst$$FloatRegister, rscratch1);
471476
} else {
472477
__ ldrd($dst$$FloatRegister, rscratch1);
473478
}
474479
} else {
475-
__ sve_index($dst$$FloatRegister, __ B, 0, 1);
480+
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
481+
__ sve_index($dst$$FloatRegister, size, 0, 1);
482+
if (is_floating_point_type(bt)) {
483+
__ sve_scvtf($dst$$FloatRegister, size, ptrue, $dst$$FloatRegister, size);
484+
}
476485
}
477486
%}
478487
ins_pipe(pipe_slow);

src/hotspot/cpu/aarch64/aarch64_vector_ad.m4

Lines changed: 14 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -396,22 +396,31 @@ instruct storeV_masked(vReg src, vmemA mem, pRegGov pg) %{
396396

397397
// vector load const
398398

399-
instruct vloadconB(vReg dst, immI0 src) %{
400-
predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
399+
instruct vloadcon(vReg dst, immI0 src) %{
401400
match(Set dst (VectorLoadConst src));
402-
format %{ "vloadconB $dst, $src\t# load/generate iota indices" %}
401+
format %{ "vloadcon $dst, $src\t# load/generate iota indices" %}
403402
ins_encode %{
403+
BasicType bt = Matcher::vector_element_basic_type(this);
404404
if (UseSVE == 0) {
405405
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
406406
assert(length_in_bytes <= 16, "must be");
407-
__ lea(rscratch1, ExternalAddress(StubRoutines::aarch64::vector_iota_indices()));
407+
// The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 16.
408+
int offset = exact_log2(type2aelembytes(bt)) << 4;
409+
if (is_floating_point_type(bt)) {
410+
offset += 32;
411+
}
412+
__ lea(rscratch1, ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + offset));
408413
if (length_in_bytes == 16) {
409414
__ ldrq($dst$$FloatRegister, rscratch1);
410415
} else {
411416
__ ldrd($dst$$FloatRegister, rscratch1);
412417
}
413418
} else {
414-
__ sve_index($dst$$FloatRegister, __ B, 0, 1);
419+
Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt);
420+
__ sve_index($dst$$FloatRegister, size, 0, 1);
421+
if (is_floating_point_type(bt)) {
422+
__ sve_scvtf($dst$$FloatRegister, size, ptrue, $dst$$FloatRegister, size);
423+
}
415424
}
416425
%}
417426
ins_pipe(pipe_slow);

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

Lines changed: 19 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -630,8 +630,24 @@ class StubGenerator: public StubCodeGenerator {
630630
__ align(CodeEntryAlignment);
631631
StubCodeMark mark(this, "StubRoutines", stub_name);
632632
address start = __ pc();
633+
// B
633634
__ emit_data64(0x0706050403020100, relocInfo::none);
634635
__ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
636+
// H
637+
__ emit_data64(0x0003000200010000, relocInfo::none);
638+
__ emit_data64(0x0007000600050004, relocInfo::none);
639+
// S
640+
__ emit_data64(0x0000000100000000, relocInfo::none);
641+
__ emit_data64(0x0000000300000002, relocInfo::none);
642+
// D
643+
__ emit_data64(0x0000000000000000, relocInfo::none);
644+
__ emit_data64(0x0000000000000001, relocInfo::none);
645+
// S - FP
646+
__ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
647+
__ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
648+
// D - FP
649+
__ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
650+
__ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
635651
return start;
636652
}
637653

@@ -7846,7 +7862,9 @@ class StubGenerator: public StubCodeGenerator {
78467862
SharedRuntime::
78477863
throw_NullPointerException_at_call));
78487864

7849-
StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
7865+
if (UseSVE == 0) {
7866+
StubRoutines::aarch64::_vector_iota_indices = generate_iota_indices("iota_indices");
7867+
}
78507868

78517869
// arraycopy stubs used by compilers
78527870
generate_arraycopy_stubs();

src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1691,8 +1691,13 @@ void C2_MacroAssembler::load_constant_vector(BasicType bt, XMMRegister dst, Inte
16911691
}
16921692
}
16931693

1694-
void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes) {
1695-
ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
1694+
void C2_MacroAssembler::load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt) {
1695+
// The iota indices are ordered by type B/S/I/L/F/D, and the offset between two types is 64.
1696+
int offset = exact_log2(type2aelembytes(bt)) << 6;
1697+
if (is_floating_point_type(bt)) {
1698+
offset += 128;
1699+
}
1700+
ExternalAddress addr(StubRoutines::x86::vector_iota_indices() + offset);
16961701
if (vlen_in_bytes <= 4) {
16971702
movdl(dst, addr);
16981703
} else if (vlen_in_bytes == 8) {

src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -159,7 +159,7 @@
159159
void load_vector_mask(KRegister dst, XMMRegister src, XMMRegister xtmp, bool novlbwdq, int vlen_enc);
160160

161161
void load_constant_vector(BasicType bt, XMMRegister dst, InternalAddress src, int vlen);
162-
void load_iota_indices(XMMRegister dst, int vlen_in_bytes);
162+
void load_iota_indices(XMMRegister dst, int vlen_in_bytes, BasicType bt);
163163

164164
// Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
165165

src/hotspot/cpu/x86/stubGenerator_x86_64.cpp

Lines changed: 46 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -811,7 +811,7 @@ address StubGenerator::generate_iota_indices(const char *stub_name) {
811811
__ align(CodeEntryAlignment);
812812
StubCodeMark mark(this, "StubRoutines", stub_name);
813813
address start = __ pc();
814-
814+
// B
815815
__ emit_data64(0x0706050403020100, relocInfo::none);
816816
__ emit_data64(0x0F0E0D0C0B0A0908, relocInfo::none);
817817
__ emit_data64(0x1716151413121110, relocInfo::none);
@@ -820,7 +820,51 @@ address StubGenerator::generate_iota_indices(const char *stub_name) {
820820
__ emit_data64(0x2F2E2D2C2B2A2928, relocInfo::none);
821821
__ emit_data64(0x3736353433323130, relocInfo::none);
822822
__ emit_data64(0x3F3E3D3C3B3A3938, relocInfo::none);
823-
823+
// W
824+
__ emit_data64(0x0003000200010000, relocInfo::none);
825+
__ emit_data64(0x0007000600050004, relocInfo::none);
826+
__ emit_data64(0x000B000A00090008, relocInfo::none);
827+
__ emit_data64(0x000F000E000D000C, relocInfo::none);
828+
__ emit_data64(0x0013001200110010, relocInfo::none);
829+
__ emit_data64(0x0017001600150014, relocInfo::none);
830+
__ emit_data64(0x001B001A00190018, relocInfo::none);
831+
__ emit_data64(0x001F001E001D001C, relocInfo::none);
832+
// D
833+
__ emit_data64(0x0000000100000000, relocInfo::none);
834+
__ emit_data64(0x0000000300000002, relocInfo::none);
835+
__ emit_data64(0x0000000500000004, relocInfo::none);
836+
__ emit_data64(0x0000000700000006, relocInfo::none);
837+
__ emit_data64(0x0000000900000008, relocInfo::none);
838+
__ emit_data64(0x0000000B0000000A, relocInfo::none);
839+
__ emit_data64(0x0000000D0000000C, relocInfo::none);
840+
__ emit_data64(0x0000000F0000000E, relocInfo::none);
841+
// Q
842+
__ emit_data64(0x0000000000000000, relocInfo::none);
843+
__ emit_data64(0x0000000000000001, relocInfo::none);
844+
__ emit_data64(0x0000000000000002, relocInfo::none);
845+
__ emit_data64(0x0000000000000003, relocInfo::none);
846+
__ emit_data64(0x0000000000000004, relocInfo::none);
847+
__ emit_data64(0x0000000000000005, relocInfo::none);
848+
__ emit_data64(0x0000000000000006, relocInfo::none);
849+
__ emit_data64(0x0000000000000007, relocInfo::none);
850+
// D - FP
851+
__ emit_data64(0x3F80000000000000, relocInfo::none); // 0.0f, 1.0f
852+
__ emit_data64(0x4040000040000000, relocInfo::none); // 2.0f, 3.0f
853+
__ emit_data64(0x40A0000040800000, relocInfo::none); // 4.0f, 5.0f
854+
__ emit_data64(0x40E0000040C00000, relocInfo::none); // 6.0f, 7.0f
855+
__ emit_data64(0x4110000041000000, relocInfo::none); // 8.0f, 9.0f
856+
__ emit_data64(0x4130000041200000, relocInfo::none); // 10.0f, 11.0f
857+
__ emit_data64(0x4150000041400000, relocInfo::none); // 12.0f, 13.0f
858+
__ emit_data64(0x4170000041600000, relocInfo::none); // 14.0f, 15.0f
859+
// Q - FP
860+
__ emit_data64(0x0000000000000000, relocInfo::none); // 0.0d
861+
__ emit_data64(0x3FF0000000000000, relocInfo::none); // 1.0d
862+
__ emit_data64(0x4000000000000000, relocInfo::none); // 2.0d
863+
__ emit_data64(0x4008000000000000, relocInfo::none); // 3.0d
864+
__ emit_data64(0x4010000000000000, relocInfo::none); // 4.0d
865+
__ emit_data64(0x4014000000000000, relocInfo::none); // 5.0d
866+
__ emit_data64(0x4018000000000000, relocInfo::none); // 6.0d
867+
__ emit_data64(0x401c000000000000, relocInfo::none); // 7.0d
824868
return start;
825869
}
826870

src/hotspot/cpu/x86/x86.ad

Lines changed: 6 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -8400,12 +8400,12 @@ instruct vmaskcast_avx(vec dst, vec src) %{
84008400
//-------------------------------- Load Iota Indices ----------------------------------
84018401

84028402
instruct loadIotaIndices(vec dst, immI_0 src) %{
8403-
predicate(Matcher::vector_element_basic_type(n) == T_BYTE);
84048403
match(Set dst (VectorLoadConst src));
84058404
format %{ "vector_load_iota $dst CONSTANT_MEMORY\t! load iota indices" %}
84068405
ins_encode %{
84078406
int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
8408-
__ load_iota_indices($dst$$XMMRegister, vlen_in_bytes);
8407+
BasicType bt = Matcher::vector_element_basic_type(this);
8408+
__ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, bt);
84098409
%}
84108410
ins_pipe( pipe_slow );
84118411
%}
@@ -8417,14 +8417,11 @@ instruct VectorPopulateIndex(vec dst, rRegI src1, immI_1 src2, vec vtmp) %{
84178417
format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
84188418
ins_encode %{
84198419
assert($src2$$constant == 1, "required");
8420-
int vlen = Matcher::vector_length(this);
8420+
int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
84218421
int vlen_enc = vector_length_encoding(this);
84228422
BasicType elem_bt = Matcher::vector_element_basic_type(this);
84238423
__ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
8424-
__ load_iota_indices($dst$$XMMRegister, vlen);
8425-
if (elem_bt != T_BYTE) {
8426-
__ vconvert_b2x(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8427-
}
8424+
__ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
84288425
__ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
84298426
%}
84308427
ins_pipe( pipe_slow );
@@ -8436,14 +8433,11 @@ instruct VectorPopulateLIndex(vec dst, rRegL src1, immI_1 src2, vec vtmp) %{
84368433
format %{ "vector_populate_index $dst $src1 $src2\t! using $vtmp as TEMP" %}
84378434
ins_encode %{
84388435
assert($src2$$constant == 1, "required");
8439-
int vlen = Matcher::vector_length(this);
8436+
int vlen_in_bytes = Matcher::vector_length_in_bytes(this);
84408437
int vlen_enc = vector_length_encoding(this);
84418438
BasicType elem_bt = Matcher::vector_element_basic_type(this);
84428439
__ vpbroadcast(elem_bt, $vtmp$$XMMRegister, $src1$$Register, vlen_enc);
8443-
__ load_iota_indices($dst$$XMMRegister, vlen);
8444-
if (elem_bt != T_BYTE) {
8445-
__ vconvert_b2x(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
8446-
}
8440+
__ load_iota_indices($dst$$XMMRegister, vlen_in_bytes, elem_bt);
84478441
__ vpadd(elem_bt, $dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
84488442
%}
84498443
ins_pipe( pipe_slow );

src/hotspot/share/classfile/vmIntrinsics.hpp

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1178,6 +1178,18 @@ class methodHandle;
11781178
"Ljdk/internal/vm/vector/VectorSupport$CompressExpandOperation;)" \
11791179
"Ljdk/internal/vm/vector/VectorSupport$VectorPayload;") \
11801180
do_name(vector_compress_expand_op_name, "compressExpandOp") \
1181+
\
1182+
do_intrinsic(_IndexVector, jdk_internal_vm_vector_VectorSupport, index_vector_op_name, index_vector_op_sig, F_S) \
1183+
do_signature(index_vector_op_sig, "(Ljava/lang/Class;" \
1184+
"Ljava/lang/Class;" \
1185+
"I" \
1186+
"Ljdk/internal/vm/vector/VectorSupport$Vector;" \
1187+
"I" \
1188+
"Ljdk/internal/vm/vector/VectorSupport$VectorSpecies;" \
1189+
"Ljdk/internal/vm/vector/VectorSupport$IndexOperation;)" \
1190+
"Ljdk/internal/vm/vector/VectorSupport$Vector;") \
1191+
do_name(index_vector_op_name, "indexVector") \
1192+
\
11811193
/* (2) Bytecode intrinsics */ \
11821194
\
11831195
do_intrinsic(_park, jdk_internal_misc_Unsafe, park_name, park_signature, F_RN) \
@@ -1286,7 +1298,7 @@ enum class vmIntrinsicID : int {
12861298
__IGNORE_CLASS, __IGNORE_NAME, __IGNORE_SIGNATURE, __IGNORE_ALIAS)
12871299

12881300
ID_LIMIT,
1289-
LAST_COMPILER_INLINE = _VectorCompressExpand,
1301+
LAST_COMPILER_INLINE = _IndexVector,
12901302
FIRST_MH_SIG_POLY = _invokeGeneric,
12911303
FIRST_MH_STATIC = _linkToVirtual,
12921304
LAST_MH_SIG_POLY = _linkToNative,

src/hotspot/share/opto/c2compiler.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -776,6 +776,7 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method, bool is_virt
776776
case vmIntrinsics::_VectorInsert:
777777
case vmIntrinsics::_VectorExtract:
778778
case vmIntrinsics::_VectorMaskOp:
779+
case vmIntrinsics::_IndexVector:
779780
return EnableVectorSupport;
780781
case vmIntrinsics::_blackhole:
781782
break;

src/hotspot/share/opto/library_call.cpp

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -716,6 +716,8 @@ bool LibraryCallKit::try_to_inline(int predicate) {
716716
return inline_vector_extract();
717717
case vmIntrinsics::_VectorCompressExpand:
718718
return inline_vector_compress_expand();
719+
case vmIntrinsics::_IndexVector:
720+
return inline_index_vector();
719721

720722
case vmIntrinsics::_getObjectSize:
721723
return inline_getObjectSize();

0 commit comments

Comments
 (0)