From c2c1373926a0bacaa5081e362119b949c1355ba9 Mon Sep 17 00:00:00 2001 From: Faye Gao Date: Wed, 2 Mar 2022 01:56:54 +0000 Subject: [PATCH 1/6] 8283091: Support type conversion between different data sizes in SLP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After JDK-8275317, C2's SLP vectorizer supports type conversion between types of the same data size. We can also support conversions between different data sizes like: int <-> double float <-> long int <-> long float <-> double A typical test case: int[] a; double[] b; for (int i = start; i < limit; i++) { b[i] = (double) a[i]; } Our expected OptoAssembly code for one iteration is like below: add R12, R2, R11, LShiftL #2 vector_load V16,[R12, #16] vectorcast_i2d V16, V16 # convert I to D vector add R11, R1, R11, LShiftL #3 # ptr add R13, R11, #16 # ptr vector_store [R13], V16 To enable the vectorization, the patch solves the following problems in SLP. There are three main operations in the case above: LoadI, ConvI2D and StoreD. Assuming that the vector length is 128 bits, how many scalar nodes should be packed together into a vector? If we decide it separately for each operation node, as we did before the patch in SuperWord::combine_packs(), a 128-bit vector will support 4 LoadI or 2 ConvI2D or 2 StoreD nodes. However, if we put these packed nodes in a vector node sequence, like loading 4 elements to a vector, then typecasting 2 elements and lastly storing these 2 elements, the sequence becomes invalid. As a result, we should look through the whole def-use chain and then pick the minimum of these element counts (i.e. the vector length allowed by the longest data type in the chain), as the function SuperWord::max_vector_size_in_ud_chain() does in superword.cpp. In this case, we pack 2 LoadI, 2 ConvI2D and 2 StoreD nodes, and then generate a valid vector node sequence, like loading 2 elements, converting the 2 elements to another type and storing the 2 elements with the new type. 
After this, LoadI nodes don't make full use of the whole vector and only occupy part of it. So we adapt the code in SuperWord::get_vw_bytes_special() to this situation. In SLP, we calculate a kind of alignment as a position trace for each scalar node in the whole vector. In this case, the alignments for the 2 LoadI nodes are 0, 4 while the alignments for the 2 ConvI2D nodes are 0, 8. Here, 4 for LoadI and 8 for ConvI2D mean the same thing: both mark that the node is the second node in the whole vector, and the difference between 4 and 8 is only due to their own data sizes. In this situation, we should try to remove the impact caused by different data sizes in SLP. For example, in the stage of SuperWord::extend_packlist(), while determining whether it is possible to pack a pair of def nodes in the function SuperWord::follow_use_defs(), we remove the side effect of different data sizes by transforming the target alignment from the use node. The reasoning is that, assuming that the vector length is 512 bits, if the ConvI2D use nodes have alignments of 16-24 and their def nodes, LoadI, have alignments of 8-12, these two LoadI nodes should be packed as a pair as well. Similarly, when determining if the vectorization is profitable, type conversion between different data sizes takes a type of one size and produces a type of another size, hence the special checks on alignment and size should be applied, like what we do in SuperWord::is_vector_use. After solving these problems, we successfully implemented the vectorization of type conversion between different data sizes. 
Here is the test data on NEON: Before the patch: Benchmark (length) Mode Cnt Score Error Units VectorLoop.convertD2F 523 avgt 15 216.431 ± 0.131 ns/op VectorLoop.convertD2I 523 avgt 15 220.522 ± 0.311 ns/op VectorLoop.convertF2D 523 avgt 15 217.034 ± 0.292 ns/op VectorLoop.convertF2L 523 avgt 15 231.634 ± 1.881 ns/op VectorLoop.convertI2D 523 avgt 15 229.538 ± 0.095 ns/op VectorLoop.convertI2L 523 avgt 15 214.822 ± 0.131 ns/op VectorLoop.convertL2F 523 avgt 15 230.188 ± 0.217 ns/op VectorLoop.convertL2I 523 avgt 15 162.234 ± 0.235 ns/op After the patch: Benchmark (length) Mode Cnt Score Error Units VectorLoop.convertD2F 523 avgt 15 124.352 ± 1.079 ns/op VectorLoop.convertD2I 523 avgt 15 557.388 ± 8.166 ns/op VectorLoop.convertF2D 523 avgt 15 118.082 ± 4.026 ns/op VectorLoop.convertF2L 523 avgt 15 225.810 ± 11.180 ns/op VectorLoop.convertI2D 523 avgt 15 166.247 ± 0.120 ns/op VectorLoop.convertI2L 523 avgt 15 119.699 ± 2.925 ns/op VectorLoop.convertL2F 523 avgt 15 220.847 ± 0.053 ns/op VectorLoop.convertL2I 523 avgt 15 122.339 ± 2.738 ns/op perf data on X86: Before the patch: Benchmark (length) Mode Cnt Score Error Units VectorLoop.convertD2F 523 avgt 15 279.466 ± 0.069 ns/op VectorLoop.convertD2I 523 avgt 15 551.009 ± 7.459 ns/op VectorLoop.convertF2D 523 avgt 15 276.066 ± 0.117 ns/op VectorLoop.convertF2L 523 avgt 15 545.108 ± 5.697 ns/op VectorLoop.convertI2D 523 avgt 15 745.303 ± 0.185 ns/op VectorLoop.convertI2L 523 avgt 15 260.878 ± 0.044 ns/op VectorLoop.convertL2F 523 avgt 15 502.016 ± 0.172 ns/op VectorLoop.convertL2I 523 avgt 15 261.654 ± 3.326 ns/op After the patch: Benchmark (length) Mode Cnt Score Error Units VectorLoop.convertD2F 523 avgt 15 106.975 ± 0.045 ns/op VectorLoop.convertD2I 523 avgt 15 546.866 ± 9.287 ns/op VectorLoop.convertF2D 523 avgt 15 82.414 ± 0.340 ns/op VectorLoop.convertF2L 523 avgt 15 542.235 ± 2.785 ns/op VectorLoop.convertI2D 523 avgt 15 92.966 ± 1.400 ns/op VectorLoop.convertI2L 523 avgt 15 79.960 ± 0.528 ns/op 
VectorLoop.convertL2F 523 avgt 15 504.712 ± 4.794 ns/op VectorLoop.convertL2I 523 avgt 15 129.753 ± 0.094 ns/op perf data on AVX512: Before the patch: Benchmark (length) Mode Cnt Score Error Units VectorLoop.convertD2F 523 avgt 15 282.984 ± 4.022 ns/op VectorLoop.convertD2I 523 avgt 15 543.080 ± 3.873 ns/op VectorLoop.convertF2D 523 avgt 15 273.950 ± 0.131 ns/op VectorLoop.convertF2L 523 avgt 15 539.568 ± 2.747 ns/op VectorLoop.convertI2D 523 avgt 15 745.238 ± 0.069 ns/op VectorLoop.convertI2L 523 avgt 15 260.935 ± 0.169 ns/op VectorLoop.convertL2F 523 avgt 15 501.870 ± 0.359 ns/op VectorLoop.convertL2I 523 avgt 15 257.508 ± 0.174 ns/op After the patch: Benchmark (length) Mode Cnt Score Error Units VectorLoop.convertD2F 523 avgt 15 76.687 ± 0.530 ns/op VectorLoop.convertD2I 523 avgt 15 545.408 ± 4.657 ns/op VectorLoop.convertF2D 523 avgt 15 273.935 ± 0.099 ns/op VectorLoop.convertF2L 523 avgt 15 540.534 ± 3.032 ns/op VectorLoop.convertI2D 523 avgt 15 745.234 ± 0.053 ns/op VectorLoop.convertI2L 523 avgt 15 260.865 ± 0.104 ns/op VectorLoop.convertL2F 523 avgt 15 63.834 ± 4.777 ns/op VectorLoop.convertL2I 523 avgt 15 48.183 ± 0.990 ns/op Change-Id: I93e60fd956547dad9204ceec90220145c58a72ef --- src/hotspot/share/opto/superword.cpp | 112 +++++++++++++++-- src/hotspot/share/opto/superword.hpp | 6 +- src/hotspot/share/opto/vectornode.cpp | 41 +++++-- src/hotspot/share/opto/vectornode.hpp | 3 +- .../compiler/codegen/TestByteDoubleVect.java | 95 ++++++++++++++- .../compiler/codegen/TestByteFloatVect.java | 95 ++++++++++++++- .../compiler/codegen/TestByteLongVect.java | 87 +++++++++++++- .../compiler/codegen/TestFloatDoubleVect.java | 113 +++++++++++++++++- .../compiler/codegen/TestIntDoubleVect.java | 100 +++++++++++++++- .../compiler/codegen/TestIntLongVect.java | 88 +++++++++++++- .../compiler/codegen/TestLongFloatVect.java | 103 +++++++++++++++- .../compiler/codegen/TestShortDoubleVect.java | 96 ++++++++++++++- .../compiler/codegen/TestShortFloatVect.java | 95 
++++++++++++++- .../compiler/codegen/TestShortLongVect.java | 86 ++++++++++++- 14 files changed, 1071 insertions(+), 49 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 1bf4c7a328282..dd76776cb72b2 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -999,6 +999,12 @@ int SuperWord::get_vw_bytes_special(MemNode* s) { } } + // Check for special case where there is a type conversion between different data size. + int vectsize = max_vector_size_in_ud_chain(s); + if (vectsize < Matcher::max_vector_size(btype)) { + vw = MIN2(vectsize * type2aelembytes(btype), vw); + } + return vw; } @@ -1187,7 +1193,9 @@ bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) { BasicType bt2 = velt_basic_type(s2); if(!is_java_primitive(bt1) || !is_java_primitive(bt2)) return false; - if (Matcher::max_vector_size(bt1) < 2) { + BasicType longer_bt = longer_type_for_conversion(s1); + if (Matcher::max_vector_size(bt1) < 2 || + (longer_bt != T_ILLEGAL && Matcher::max_vector_size(longer_bt) < 2)) { return false; // No vectors for this type } @@ -1432,16 +1440,20 @@ bool SuperWord::follow_use_defs(Node_List* p) { if (s1->is_Load()) return false; - int align = alignment(s1); - NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_use_defs: s1 %d, align %d", s1->_idx, align);) + NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SuperWord::follow_use_defs: s1 %d, align %d", s1->_idx, alignment(s1));) bool changed = false; int start = s1->is_Store() ? MemNode::ValueIn : 1; int end = s1->is_Store() ? 
MemNode::ValueIn+1 : s1->req(); for (int j = start; j < end; j++) { + int align = alignment(s1); Node* t1 = s1->in(j); Node* t2 = s2->in(j); if (!in_bb(t1) || !in_bb(t2)) continue; + if (longer_type_for_conversion(s1) != T_ILLEGAL || + longer_type_for_conversion(t1) != T_ILLEGAL) { + align = align / data_size(s1) * data_size(t1); + } if (stmts_can_pack(t1, t2, align)) { if (est_savings(t1, t2) >= 0) { Node_List* pair = new Node_List(); @@ -1485,12 +1497,18 @@ bool SuperWord::follow_def_uses(Node_List* p) { if (t2->Opcode() == Op_AddI && t2 == _lp->as_CountedLoop()->incr()) continue; // don't mess with the iv if (!opnd_positions_match(s1, t1, s2, t2)) continue; - if (stmts_can_pack(t1, t2, align)) { + int adjusted_align = alignment(s1); + if (longer_type_for_conversion(s1) != T_ILLEGAL || + longer_type_for_conversion(t1) != T_ILLEGAL) { + adjusted_align = adjusted_align / data_size(s1) * data_size(t1); + } + if (stmts_can_pack(t1, t2, adjusted_align)) { int my_savings = est_savings(t1, t2); if (my_savings > savings) { savings = my_savings; u1 = t1; u2 = t2; + align = adjusted_align; } } } @@ -1683,8 +1701,7 @@ void SuperWord::combine_packs() { for (int i = 0; i < _packset.length(); i++) { Node_List* p1 = _packset.at(i); if (p1 != NULL) { - BasicType bt = velt_basic_type(p1->at(0)); - uint max_vlen = Matcher::max_vector_size(bt); // Max elements in vector + uint max_vlen = max_vector_size_in_ud_chain(p1->at(0)); // Max elements in vector assert(is_power_of_2(max_vlen), "sanity"); uint psize = p1->size(); if (!is_power_of_2(psize)) { @@ -2007,6 +2024,8 @@ bool SuperWord::implemented(Node_List* p) { } else { retValue = ReductionNode::implemented(opc, size, arith_type->basic_type()); } + } else if (VectorNode::is_convert_opcode(opc)) { + retValue = VectorCastNode::implemented(opc, size, velt_basic_type(p0->in(1)), velt_basic_type(p0)); } else { retValue = VectorNode::implemented(opc, size, velt_basic_type(p0)); } @@ -2558,12 +2577,11 @@ void SuperWord::output() { Node* 
in = vector_opd(p, 1); vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n)); vlen_in_bytes = vn->as_Vector()->length_in_bytes(); - } else if (opc == Op_ConvI2F || opc == Op_ConvL2D || - opc == Op_ConvF2I || opc == Op_ConvD2L) { + } else if (VectorNode::is_convert_opcode(opc)) { assert(n->req() == 2, "only one input expected"); BasicType bt = velt_basic_type(n); - int vopc = VectorNode::opcode(opc, bt); Node* in = vector_opd(p, 1); + int vopc = VectorCastNode::opcode(in->bottom_type()->is_vect()->element_basic_type()); vn = VectorCastNode::make(vopc, in, bt, vlen); vlen_in_bytes = vn->as_Vector()->length_in_bytes(); } else if (is_cmov_pack(p)) { @@ -2961,9 +2979,26 @@ bool SuperWord::is_vector_use(Node* use, int u_idx) { return true; } - if (u_pk->size() != d_pk->size()) return false; + + if (longer_type_for_conversion(use) != T_ILLEGAL) { + // type conversion takes a type of a kind of size and produces a type of + // another size - hence the special checks on alignment and size. 
+ for (uint i = 0; i < u_pk->size(); i++) { + Node* ui = u_pk->at(i); + Node* di = d_pk->at(i); + if (ui->in(u_idx) != di) { + return false; + } + if (alignment(ui) / type2aelembytes(velt_basic_type(ui)) != + alignment(di) / type2aelembytes(velt_basic_type(di))) { + return false; + } + } + return true; + } + for (uint i = 0; i < u_pk->size(); i++) { Node* ui = u_pk->at(i); Node* di = d_pk->at(i); @@ -3196,6 +3231,63 @@ void SuperWord::compute_max_depth() { } } +BasicType SuperWord::longer_type_for_conversion(Node* n) { + int opcode = n->Opcode(); + switch (opcode) { + case Op_ConvD2I: + case Op_ConvI2D: + case Op_ConvF2D: + case Op_ConvD2F: return T_DOUBLE; + case Op_ConvF2L: + case Op_ConvL2F: + case Op_ConvL2I: + case Op_ConvI2L: return T_LONG; + case Op_ConvI2F: { + BasicType src_t = velt_basic_type(n->in(1)); + if (src_t == T_BYTE || src_t == T_SHORT) { + return T_FLOAT; + } + return T_ILLEGAL; + } + case Op_ConvF2I: { + BasicType dst_t = velt_basic_type(n); + if (dst_t == T_BYTE || dst_t == T_SHORT) { + return T_FLOAT; + } + return T_ILLEGAL; + } + } + return T_ILLEGAL; +} + +int SuperWord::max_vector_size_in_ud_chain(Node* n) { + BasicType bt = velt_basic_type(n); + BasicType vt = bt; + + // find the longest type among def nodes. + uint start, end; + VectorNode::vector_operands(n, &start, &end); + for (uint i = start; i < end; ++i) { + Node* input = n->in(i); + if (!in_bb(input)) continue; + BasicType newt = longer_type_for_conversion(input); + vt = (newt == T_ILLEGAL) ? vt : newt; + } + + // find the longest type among use nodes. + for (uint i = 0; i < n->outcnt(); ++i) { + Node* output = n->raw_out(i); + if (!in_bb(output)) continue; + BasicType newt = longer_type_for_conversion(output); + vt = (newt == T_ILLEGAL) ? vt : newt; + } + + int max = Matcher::max_vector_size(vt); + // If now there is no vectors for the longest type, the nodes with the longest + // type in the def-use chain are not packed in SuperWord::stmts_can_pack. + return max < 2 ? 
Matcher::max_vector_size(bt) : max; +} + //-------------------------compute_vector_element_type----------------------- // Compute necessary vector element type for expressions // This propagates backwards a narrower integer type when the diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index ddddfc8366375..8c64ca2b4e0fe 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007, 2020, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2007, 2022, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -528,6 +528,10 @@ class SuperWord : public ResourceObj { void bb_insert_after(Node* n, int pos); // Compute max depth for expressions from beginning of block void compute_max_depth(); + // Return the longer type for type-conversion node and return illegal type for other nodes. + BasicType longer_type_for_conversion(Node* n); + // Find the longest type in def-use chain for packed nodes, and then compute the max vector size. + int max_vector_size_in_ud_chain(Node* n); // Compute necessary vector element type for expressions void compute_vector_element_type(); // Are s1 and s2 in a pack pair and ordered as s1,s2? 
diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index a5ec2173966b4..72c3e64671b25 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -226,15 +226,6 @@ int VectorNode::opcode(int sopc, BasicType bt) { return Op_StoreVector; case Op_MulAddS2I: return Op_MulAddVS2VI; - case Op_ConvI2F: - return Op_VectorCastI2X; - case Op_ConvL2D: - return Op_VectorCastL2X; - case Op_ConvF2I: - return Op_VectorCastF2X; - case Op_ConvD2L: - return Op_VectorCastD2X; - default: return 0; // Unimplemented } @@ -365,6 +356,26 @@ bool VectorNode::is_shift_opcode(int opc) { } } +bool VectorNode::is_convert_opcode(int opc) { + switch (opc) { + case Op_ConvI2F: + case Op_ConvL2D: + case Op_ConvF2I: + case Op_ConvD2L: + case Op_ConvI2D: + case Op_ConvL2F: + case Op_ConvL2I: + case Op_ConvI2L: + case Op_ConvF2L: + case Op_ConvD2F: + case Op_ConvF2D: + case Op_ConvD2I: + return true; + default: + return false; + } +} + bool VectorNode::is_shift(Node* n) { return is_shift_opcode(n->Opcode()); } @@ -1118,11 +1129,21 @@ int VectorCastNode::opcode(BasicType bt, bool is_signed) { case T_FLOAT: return Op_VectorCastF2X; case T_DOUBLE: return Op_VectorCastD2X; default: - assert(false, "unknown type: %s", type2name(bt)); + assert(bt == T_CHAR || bt == T_BOOLEAN, "unknown type: %s", type2name(bt)); return 0; } } +bool VectorCastNode::implemented(int opc, uint vlen, BasicType src_type, BasicType dst_type) { + if (is_java_primitive(dst_type) && + (vlen > 1) && is_power_of_2(vlen) && + Matcher::vector_size_supported(dst_type, vlen)) { + int vopc = VectorCastNode::opcode(src_type); + return vopc > 0 && Matcher::match_rule_supported_vector(vopc, vlen, dst_type); + } + return false; +} + Node* VectorCastNode::Identity(PhaseGVN* phase) { if (!in(1)->is_top()) { BasicType in_bt = in(1)->bottom_type()->is_vect()->element_basic_type(); diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp 
index 4853d719ce0ff..a20ab9363aa4c 100644 --- a/src/hotspot/share/opto/vectornode.hpp +++ b/src/hotspot/share/opto/vectornode.hpp @@ -80,6 +80,7 @@ class VectorNode : public TypeNode { static VectorNode* make_mask_node(int vopc, Node* n1, Node* n2, uint vlen, BasicType bt); static bool is_shift_opcode(int opc); + static bool is_convert_opcode(int opc); static bool is_vshift_cnt_opcode(int opc); @@ -1469,7 +1470,7 @@ class VectorCastNode : public VectorNode { static VectorCastNode* make(int vopc, Node* n1, BasicType bt, uint vlen); static int opcode(BasicType bt, bool is_signed = true); - static bool implemented(BasicType bt, uint vlen); + static bool implemented(int opc, uint vlen, BasicType src_type, BasicType dst_type); virtual Node* Identity(PhaseGVN* phase); }; diff --git a/test/hotspot/jtreg/compiler/codegen/TestByteDoubleVect.java b/test/hotspot/jtreg/compiler/codegen/TestByteDoubleVect.java index d91f3f2c3021f..c8e9f8fadd6ca 100644 --- a/test/hotspot/jtreg/compiler/codegen/TestByteDoubleVect.java +++ b/test/hotspot/jtreg/compiler/codegen/TestByteDoubleVect.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2022, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -23,9 +23,10 @@ /** * @test + * @key randomness * @bug 7119644 * @summary Increase superword's vector size up to 256 bits - * + * @library /test/lib * @run main/othervm/timeout=300 -Xbatch -XX:+IgnoreUnrecognizedVMOptions * -XX:-TieredCompilation -XX:-OptimizeFill * compiler.codegen.TestByteDoubleVect @@ -33,6 +34,9 @@ package compiler.codegen; +import java.util.Random; +import jdk.test.lib.Utils; + public class TestByteDoubleVect { private static final int ARRLEN = 997; private static final int ITERS = 11000; @@ -41,6 +45,32 @@ public class TestByteDoubleVect { private static final int ALIGN_OFF = 8; private static final int UNALIGN_OFF = 5; + private static final byte[] bspecial = { + 0, 0x8, 0xF, 0x3F, 0x7C, 0x7F, (byte)0x80, (byte)0x81, (byte)0x8F, + (byte)0xF3, (byte)0xF8, (byte)0xFF, (byte)0x38FF, (byte)0x3FFF, + (byte)0xFFFF, (byte)Integer.MAX_VALUE, (byte)Integer.MIN_VALUE + }; + + private static final double[] dspecial = { + 0.0, + -0.0, + Double.MAX_VALUE, + Double.MIN_VALUE, + -Double.MAX_VALUE, + -Double.MIN_VALUE, + Double.NaN, + Double.POSITIVE_INFINITY, + Double.NEGATIVE_INFINITY, + Integer.MAX_VALUE, + Integer.MIN_VALUE, + Long.MIN_VALUE, + Long.MAX_VALUE, + -Integer.MAX_VALUE, + -Integer.MIN_VALUE, + -Long.MIN_VALUE, + -Long.MAX_VALUE + }; + public static void main(String args[]) { System.out.println("Testing Byte + Double vectors"); int errn = test(); @@ -75,6 +105,8 @@ static int test() { test_vi_unaln(a1, b1, (byte)123, 103.); test_cp_unalndst(a1, a2, b1, b2); test_cp_unalnsrc(a1, a2, b1, b2); + test_conv_b2d(a1, b1); + test_conv_d2b(a1, b1); } // Initialize for (int i=0; i 0) @@ -448,6 +518,18 @@ static int test() { } end = System.currentTimeMillis(); System.out.println("test_cp_unalnsrc: " + (end - start)); + start = System.currentTimeMillis(); + for (int i = 0; i < ITERS; i++) { + test_conv_i2d(a1, b1); + } + end = System.currentTimeMillis(); + 
System.out.println("test_conv_i2d: " + (end - start)); + start = System.currentTimeMillis(); + for (int i = 0; i < ITERS; i++) { + test_conv_d2i(a1, b1); + } + end = System.currentTimeMillis(); + System.out.println("test_conv_d2i: " + (end - start)); return errn; } @@ -556,6 +638,16 @@ static void test_cp_unalnsrc(int[] a, int[] b, double[] c, double[] d) { c[i] = d[i+UNALIGN_OFF]; } } + static void test_conv_i2d(int[] a, double[] b) { + for (int i = 0; i < a.length; i+=1) { + b[i] = (double) a[i]; + } + } + static void test_conv_d2i(int[] a, double[] b) { + for (int i = 0; i < a.length; i+=1) { + a[i] = (int)b[i]; + } + } static int verify(String text, int i, int elem, int val) { if (elem != val) { @@ -565,7 +657,7 @@ static int verify(String text, int i, int elem, int val) { return 0; } static int verify(String text, int i, double elem, double val) { - if (elem != val) { + if (elem != val && !(Double.isNaN(elem) && Double.isNaN(val))) { System.err.println(text + "[" + i + "] = " + elem + " != " + val); return 1; } diff --git a/test/hotspot/jtreg/compiler/codegen/TestIntLongVect.java b/test/hotspot/jtreg/compiler/codegen/TestIntLongVect.java index 447586bdd49c6..5c37542caff10 100644 --- a/test/hotspot/jtreg/compiler/codegen/TestIntLongVect.java +++ b/test/hotspot/jtreg/compiler/codegen/TestIntLongVect.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2022, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -23,9 +23,10 @@ /** * @test + * @key randomness * @bug 7119644 * @summary Increase superword's vector size up to 256 bits - * + * @library /test/lib * @run main/othervm/timeout=300 -Xbatch -XX:+IgnoreUnrecognizedVMOptions * -XX:-TieredCompilation -XX:-OptimizeFill * compiler.codegen.TestIntLongVect @@ -33,6 +34,9 @@ package compiler.codegen; +import java.util.Random; +import jdk.test.lib.Utils; + public class TestIntLongVect { private static final int ARRLEN = 997; private static final int ITERS = 11000; @@ -41,6 +45,26 @@ public class TestIntLongVect { private static final int ALIGN_OFF = 8; private static final int UNALIGN_OFF = 5; + private static int[] ispecial = { + 0, + Integer.MAX_VALUE, + Integer.MIN_VALUE, + -Integer.MAX_VALUE, + -Integer.MIN_VALUE + }; + + private static long[] lspecial = { + 0, + Integer.MAX_VALUE, + Integer.MIN_VALUE, + -Integer.MAX_VALUE, + -Integer.MIN_VALUE, + Long.MAX_VALUE, + Long.MIN_VALUE, + -Long.MAX_VALUE, + -Long.MIN_VALUE + }; + public static void main(String args[]) { System.out.println("Testing Integer + Long vectors"); int errn = test(); @@ -75,6 +99,8 @@ static int test() { test_vi_unaln(a1, b1, (int)123, (long)103); test_cp_unalndst(a1, a2, b1, b2); test_cp_unalnsrc(a1, a2, b1, b2); + test_conv_i2l(a1, b1); + test_conv_l2i(a1, b1); } // Initialize for (int i=0; i 0) @@ -448,6 +516,18 @@ static int test() { } end = System.currentTimeMillis(); System.out.println("test_cp_unalnsrc: " + (end - start)); + start = System.currentTimeMillis(); + for (int i = 0; i < ITERS; i++) { + test_conv_s2d(a1, b1); + } + end = System.currentTimeMillis(); + System.out.println("test_conv_s2d: " + (end - start)); + start = System.currentTimeMillis(); + for (int i = 0; i < ITERS; i++) { + test_conv_d2s(a1, b1); + } + end = System.currentTimeMillis(); + System.out.println("test_conv_d2s: " + (end - start)); return errn; } @@ -556,6 +636,16 @@ static void 
test_cp_unalnsrc(short[] a, short[] b, double[] c, double[] d) { c[i] = d[i+UNALIGN_OFF]; } } + static void test_conv_s2d(short[] a, double[] b) { + for (int i = 0; i < a.length; i+=1) { + b[i] = (double) a[i]; + } + } + static void test_conv_d2s(short[] a, double[] b) { + for (int i = 0; i < a.length; i+=1) { + a[i] = (short) b[i]; + } + } static int verify(String text, int i, short elem, short val) { if (elem != val) { diff --git a/test/hotspot/jtreg/compiler/codegen/TestShortFloatVect.java b/test/hotspot/jtreg/compiler/codegen/TestShortFloatVect.java index 5d27cb2de90fe..11c7ada1d186a 100644 --- a/test/hotspot/jtreg/compiler/codegen/TestShortFloatVect.java +++ b/test/hotspot/jtreg/compiler/codegen/TestShortFloatVect.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2022, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -23,9 +23,10 @@ /** * @test + * @key randomness * @bug 7119644 * @summary Increase superword's vector size up to 256 bits - * + * @library /test/lib * @run main/othervm/timeout=300 -Xbatch -XX:+IgnoreUnrecognizedVMOptions * -XX:-TieredCompilation -XX:-OptimizeFill * compiler.codegen.TestShortFloatVect @@ -33,6 +34,9 @@ package compiler.codegen; +import java.util.Random; +import jdk.test.lib.Utils; + public class TestShortFloatVect { private static final int ARRLEN = 997; private static final int ITERS = 11000; @@ -41,6 +45,34 @@ public class TestShortFloatVect { private static final int ALIGN_OFF = 8; private static final int UNALIGN_OFF = 5; + private static final short[] sspecial = { + 0, 0x8, 0xF, 0x3F, 0x7C, 0x7F, 0x8F, 0xF3, 0xF8, 0xFF, 0x38FF, (short)0x8F8F, + (short)0x8FFF, 0x7FF3, 0x7FFF, (short)0xFF33, (short)0xFFF8, (short)0xFFFF, + (short)0xFFFFFF, (short)Integer.MAX_VALUE, (short)Integer.MIN_VALUE + }; + + private static 
final float[] fspecial = { + 1.0f, + -1.0f, + 0.0f, + -0.0f, + Float.MAX_VALUE, + Float.MIN_VALUE, + -Float.MAX_VALUE, + -Float.MIN_VALUE, + Float.NaN, + Float.POSITIVE_INFINITY, + Float.NEGATIVE_INFINITY, + Integer.MAX_VALUE, + Integer.MIN_VALUE, + -Integer.MAX_VALUE, + -Integer.MIN_VALUE, + Long.MAX_VALUE, + Long.MIN_VALUE, + -Long.MAX_VALUE, + -Long.MIN_VALUE + }; + public static void main(String args[]) { System.out.println("Testing Short + Float vectors"); int errn = test(); @@ -75,6 +107,8 @@ static int test() { test_vi_unaln(a1, b1, (short)123, 103.f); test_cp_unalndst(a1, a2, b1, b2); test_cp_unalnsrc(a1, a2, b1, b2); + test_conv_s2f(a1, b1); + test_conv_f2s(a1, b1); } // Initialize for (int i=0; i 0) @@ -448,6 +506,18 @@ static int test() { } end = System.currentTimeMillis(); System.out.println("test_cp_unalnsrc: " + (end - start)); + start = System.currentTimeMillis(); + for (int i = 0; i < ITERS; i++) { + test_conv_s2l(a1, b1); + } + end = System.currentTimeMillis(); + System.out.println("test_conv_s2l: " + (end - start)); + start = System.currentTimeMillis(); + for (int i = 0; i < ITERS; i++) { + test_conv_l2s(a1, b1); + } + end = System.currentTimeMillis(); + System.out.println("test_conv_l2s: " + (end - start)); return errn; } @@ -556,6 +626,16 @@ static void test_cp_unalnsrc(short[] a, short[] b, long[] c, long[] d) { c[i] = d[i+UNALIGN_OFF]; } } + static void test_conv_s2l(short[] a, long[] b) { + for (int i = 0; i < a.length; i+=1) { + b[i] = (long) a[i]; + } + } + static void test_conv_l2s(short[] a, long[] b) { + for (int i = 0; i < a.length; i+=1) { + a[i] = (short) b[i]; + } + } static int verify(String text, int i, short elem, short val) { if (elem != val) { From bf3fc418aaf601b619ae7de2aa0dc85d6397c21f Mon Sep 17 00:00:00 2001 From: Fei Gao Date: Tue, 15 Mar 2022 08:04:33 +0000 Subject: [PATCH 2/6] Add micro-benchmark cases Change-Id: I3c741255804ce410c8b6dcbdec974fa2c9051fd8 --- .../vm/compiler/TypeVectorOperations.java | 58 
++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/test/micro/org/openjdk/bench/vm/compiler/TypeVectorOperations.java b/test/micro/org/openjdk/bench/vm/compiler/TypeVectorOperations.java index 5e3799c8f3db6..df92ccaf00ea2 100644 --- a/test/micro/org/openjdk/bench/vm/compiler/TypeVectorOperations.java +++ b/test/micro/org/openjdk/bench/vm/compiler/TypeVectorOperations.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, 2021, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2019, 2022, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -223,6 +223,62 @@ public void convert_d2l() { } } + @Benchmark + public void convert_d2f() { + for (int i = 0; i < COUNT; i++) { + resF[i] = (float) doubles[i]; + } + } + + @Benchmark + public void convert_d2i() { + for (int i = 0; i < COUNT; i++) { + resI[i] = (int) doubles[i]; + } + } + + @Benchmark + public void convert_f2d() { + for (int i = 0; i < COUNT; i++) { + resD[i] = (double) floats[i]; + } + } + + @Benchmark + public void convert_f2l() { + for (int i = 0; i < COUNT; i++) { + resL[i] = (long) floats[i]; + } + } + + @Benchmark + public void convert_i2d() { + for (int i = 0; i < COUNT; i++) { + resD[i] = (double) ints[i]; + } + } + + @Benchmark + public void convert_i2l() { + for (int i = 0; i < COUNT; i++) { + resL[i] = (long) ints[i]; + } + } + + @Benchmark + public void convert_l2f() { + for (int i = 0; i < COUNT; i++) { + resF[i] = (float) longs[i]; + } + } + + @Benchmark + public void convert_l2i() { + for (int i = 0; i < COUNT; i++) { + resI[i] = (int) longs[i]; + } + } + @Fork(value = 1, jvmArgsPrepend = { "-XX:+UseSuperWord" }) From 74895bf12a4365b97b1c002a9528b7555e3044b9 Mon Sep 17 00:00:00 2001 From: Fei Gao Date: Thu, 2 Jun 2022 12:26:16 +0000 Subject: [PATCH 3/6] Implement an interface for auto-vectorization to consult supported 
match rules Change-Id: I8dcfae69a40717356757396faa06ae2d6015d701 --- src/hotspot/cpu/aarch64/aarch64.ad | 13 ++ src/hotspot/cpu/arm/arm.ad | 4 + src/hotspot/cpu/ppc/ppc.ad | 4 + src/hotspot/cpu/riscv/riscv.ad | 4 + src/hotspot/cpu/s390/s390.ad | 4 + src/hotspot/cpu/x86/x86.ad | 4 + src/hotspot/share/opto/matcher.hpp | 5 +- src/hotspot/share/opto/vectornode.cpp | 6 +- .../vm/compiler/TypeVectorOperations.java | 116 +++++++++++++++--- 9 files changed, 140 insertions(+), 20 deletions(-) diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad index 5d302b08be4f9..edbca62d83e3c 100644 --- a/src/hotspot/cpu/aarch64/aarch64.ad +++ b/src/hotspot/cpu/aarch64/aarch64.ad @@ -2432,6 +2432,19 @@ const bool Matcher::match_rule_supported(int opcode) { return ret_value; // Per default match rules are supported. } +const bool Matcher::match_rule_supported_vectorization(int opcode, int vlen, BasicType bt) { + if (UseSVE == 0) { + // ConvD2I and ConvL2F are not profitable to be vectorized on NEON, because no direct + // NEON instructions support them. But the match rule support for them is profitable for + // Vector API intrinsics. + if ((opcode == Op_VectorCastD2X && bt == T_INT) || + (opcode == Op_VectorCastL2X && bt == T_FLOAT)) { + return false; + } + } + return match_rule_supported_vector(opcode, vlen, bt); +} + // Identify extra cases that we might want to provide match rules for vector nodes and // other intrinsics guarded with vector length (vlen) and element type (bt). const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { diff --git a/src/hotspot/cpu/arm/arm.ad b/src/hotspot/cpu/arm/arm.ad index a801ff1217cd5..ab49e13f10e9c 100644 --- a/src/hotspot/cpu/arm/arm.ad +++ b/src/hotspot/cpu/arm/arm.ad @@ -981,6 +981,10 @@ const bool Matcher::match_rule_supported(int opcode) { return true; // Per default match rules are supported. 
} +const bool Matcher::match_rule_supported_vectorization(int opcode, int vlen, BasicType bt) { + return match_rule_supported_vector(opcode, vlen, bt); +} + const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { // TODO diff --git a/src/hotspot/cpu/ppc/ppc.ad b/src/hotspot/cpu/ppc/ppc.ad index 5ac1589ea2424..9f710eb8507a0 100644 --- a/src/hotspot/cpu/ppc/ppc.ad +++ b/src/hotspot/cpu/ppc/ppc.ad @@ -2165,6 +2165,10 @@ const bool Matcher::match_rule_supported(int opcode) { return true; // Per default match rules are supported. } +const bool Matcher::match_rule_supported_vectorization(int opcode, int vlen, BasicType bt) { + return match_rule_supported_vector(opcode, vlen, bt); +} + const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { if (!match_rule_supported(opcode) || !vector_size_supported(bt, vlen)) { return false; diff --git a/src/hotspot/cpu/riscv/riscv.ad b/src/hotspot/cpu/riscv/riscv.ad index 483181ff5cec4..d92035be05e4f 100644 --- a/src/hotspot/cpu/riscv/riscv.ad +++ b/src/hotspot/cpu/riscv/riscv.ad @@ -1815,6 +1815,10 @@ const bool Matcher::match_rule_supported(int opcode) { return true; // Per default match rules are supported. } +const bool Matcher::match_rule_supported_vectorization(int opcode, int vlen, BasicType bt) { + return match_rule_supported_vector(opcode, vlen, bt); +} + // Identify extra cases that we might want to provide match rules for vector nodes and // other intrinsics guarded with vector length (vlen) and element type (bt). const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { diff --git a/src/hotspot/cpu/s390/s390.ad b/src/hotspot/cpu/s390/s390.ad index 13e5edd294666..655058f09f925 100644 --- a/src/hotspot/cpu/s390/s390.ad +++ b/src/hotspot/cpu/s390/s390.ad @@ -1505,6 +1505,10 @@ const bool Matcher::match_rule_supported(int opcode) { return true; // Per default match rules are supported. 
} +const bool Matcher::match_rule_supported_vectorization(int opcode, int vlen, BasicType bt) { + return match_rule_supported_vector(opcode, vlen, bt); +} + const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { if (!match_rule_supported(opcode) || !vector_size_supported(bt, vlen)) { return false; diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index 07ed62f293d8f..d4a904c017ec6 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -1677,6 +1677,10 @@ static inline bool is_pop_count_instr_target(BasicType bt) { (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq()); } +const bool Matcher::match_rule_supported_vectorization(int opcode, int vlen, BasicType bt) { + return match_rule_supported_vector(opcode, vlen, bt); +} + // Identify extra cases that we might want to provide match rules for vector nodes and // other intrinsics guarded with vector length (vlen) and element type (bt). const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) { diff --git a/src/hotspot/share/opto/matcher.hpp b/src/hotspot/share/opto/matcher.hpp index c428193ff72b7..e0c420e7f014f 100644 --- a/src/hotspot/share/opto/matcher.hpp +++ b/src/hotspot/share/opto/matcher.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2021, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2022, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -325,6 +325,9 @@ class Matcher : public PhaseTransform { // should generate this one. static const bool match_rule_supported(int opcode); + // Identify extra cases that we might want to vectorize automatically. + static const bool match_rule_supported_vectorization(int opcode, int vlen, BasicType bt); + // identify extra cases that we might want to provide match rules for // e.g. 
Op_ vector nodes and other intrinsics while guarding with vlen static const bool match_rule_supported_vector(int opcode, int vlen, BasicType bt); diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index 10e57f9181031..042af99b098a4 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -302,7 +302,7 @@ bool VectorNode::implemented(int opc, uint vlen, BasicType bt) { if (VectorNode::is_vector_integral_negate(vopc)) { return is_vector_integral_negate_supported(vopc, vlen, bt, false); } - return vopc > 0 && Matcher::match_rule_supported_vector(vopc, vlen, bt); + return vopc > 0 && Matcher::match_rule_supported_vectorization(vopc, vlen, bt); } return false; } @@ -1232,7 +1232,7 @@ bool VectorCastNode::implemented(int opc, uint vlen, BasicType src_type, BasicTy (vlen > 1) && is_power_of_2(vlen) && Matcher::vector_size_supported(dst_type, vlen)) { int vopc = VectorCastNode::opcode(src_type); - return vopc > 0 && Matcher::match_rule_supported_vector(vopc, vlen, dst_type); + return vopc > 0 && Matcher::match_rule_supported_vectorization(vopc, vlen, dst_type); } return false; } @@ -1326,7 +1326,7 @@ bool ReductionNode::implemented(int opc, uint vlen, BasicType bt) { (vlen > 1) && is_power_of_2(vlen) && Matcher::vector_size_supported(bt, vlen)) { int vopc = ReductionNode::opcode(opc, bt); - return vopc != opc && Matcher::match_rule_supported_vector(vopc, vlen, bt); + return vopc != opc && Matcher::match_rule_supported_vectorization(vopc, vlen, bt); } return false; } diff --git a/test/micro/org/openjdk/bench/vm/compiler/TypeVectorOperations.java b/test/micro/org/openjdk/bench/vm/compiler/TypeVectorOperations.java index df92ccaf00ea2..5733b6092b53c 100644 --- a/test/micro/org/openjdk/bench/vm/compiler/TypeVectorOperations.java +++ b/test/micro/org/openjdk/bench/vm/compiler/TypeVectorOperations.java @@ -196,89 +196,173 @@ public void negD() { } @Benchmark - public void convert_i2f() { + public 
void convertB2D() { for (int i = 0; i < COUNT; i++) { - resF[i] = (float) ints[i]; + resD[i] = (double) bytesA[i]; } } @Benchmark - public void convert_f2i() { + public void convertB2F() { for (int i = 0; i < COUNT; i++) { - resI[i] = (int) floats[i]; + resF[i] = (float) bytesA[i]; } } @Benchmark - public void convert_l2d() { + public void convertB2L() { for (int i = 0; i < COUNT; i++) { - resD[i] = (double) longs[i]; + resL[i] = (long) bytesA[i]; } } @Benchmark - public void convert_d2l() { + public void convertD2B() { for (int i = 0; i < COUNT; i++) { - resL[i] = (long) doubles[i]; + resB[i] = (byte) doubles[i]; } } @Benchmark - public void convert_d2f() { + public void convertD2F() { for (int i = 0; i < COUNT; i++) { resF[i] = (float) doubles[i]; } } @Benchmark - public void convert_d2i() { + public void convertD2I() { for (int i = 0; i < COUNT; i++) { resI[i] = (int) doubles[i]; } } @Benchmark - public void convert_f2d() { + public void convertD2S() { + for (int i = 0; i < COUNT; i++) { + resS[i] = (short) doubles[i]; + } + } + + @Benchmark + public void convertD2L() { + for (int i = 0; i < COUNT; i++) { + resL[i] = (long) doubles[i]; + } + } + + @Benchmark + public void convertF2I() { + for (int i = 0; i < COUNT; i++) { + resI[i] = (int) floats[i]; + } + } + + @Benchmark + public void convertF2B() { + for (int i = 0; i < COUNT; i++) { + resB[i] = (byte) floats[i]; + } + } + + @Benchmark + public void convertF2D() { for (int i = 0; i < COUNT; i++) { resD[i] = (double) floats[i]; } } @Benchmark - public void convert_f2l() { + public void convertF2L() { for (int i = 0; i < COUNT; i++) { resL[i] = (long) floats[i]; } } @Benchmark - public void convert_i2d() { + public void convertF2S() { + for (int i = 0; i < COUNT; i++) { + resS[i] = (short) floats[i]; + } + } + + @Benchmark + public void convertI2F() { + for (int i = 0; i < COUNT; i++) { + resF[i] = (float) ints[i]; + } + } + + @Benchmark + public void convertI2D() { for (int i = 0; i < COUNT; i++) { resD[i] = 
(double) ints[i]; } } @Benchmark - public void convert_i2l() { + public void convertI2L() { for (int i = 0; i < COUNT; i++) { resL[i] = (long) ints[i]; } } @Benchmark - public void convert_l2f() { + public void convertL2D() { + for (int i = 0; i < COUNT; i++) { + resD[i] = (double) longs[i]; + } + } + + @Benchmark + public void convertL2B() { + for (int i = 0; i < COUNT; i++) { + resB[i] = (byte) longs[i]; + } + } + + @Benchmark + public void convertL2F() { for (int i = 0; i < COUNT; i++) { resF[i] = (float) longs[i]; } } @Benchmark - public void convert_l2i() { + public void convertL2I() { for (int i = 0; i < COUNT; i++) { resI[i] = (int) longs[i]; } } + @Benchmark + public void convertL2S() { + for (int i = 0; i < COUNT; i++) { + resS[i] = (short) longs[i]; + } + } + + @Benchmark + public void convertS2D() { + for (int i = 0; i < COUNT; i++) { + resD[i] = (double) shorts[i]; + } + } + + @Benchmark + public void convertS2F() { + for (int i = 0; i < COUNT; i++) { + resF[i] = (float) shorts[i]; + } + } + + @Benchmark + public void convertS2L() { + for (int i = 0; i < COUNT; i++) { + resL[i] = (long) shorts[i]; + } + } + @Fork(value = 1, jvmArgsPrepend = { "-XX:+UseSuperWord" }) From cf97e42836818163e7e78cb2e1dbe1bbbedbd984 Mon Sep 17 00:00:00 2001 From: Fei Gao Date: Mon, 6 Jun 2022 13:23:26 +0000 Subject: [PATCH 4/6] Add assertion line for opcode() and extract some common code into a function Change-Id: I7b5dbe60fec6979de454f347d074e6fc01126dfe --- src/hotspot/share/opto/matcher.hpp | 3 ++- src/hotspot/share/opto/superword.cpp | 32 ++++++++++++++++----------- src/hotspot/share/opto/superword.hpp | 3 ++- src/hotspot/share/opto/vectornode.cpp | 3 +++ 4 files changed, 26 insertions(+), 15 deletions(-) diff --git a/src/hotspot/share/opto/matcher.hpp b/src/hotspot/share/opto/matcher.hpp index e0c420e7f014f..fea1a1632d8ef 100644 --- a/src/hotspot/share/opto/matcher.hpp +++ b/src/hotspot/share/opto/matcher.hpp @@ -325,7 +325,8 @@ class Matcher : public PhaseTransform { // 
should generate this one. static const bool match_rule_supported(int opcode); - // Identify extra cases that we might want to vectorize automatically. + // Identify extra cases that we might want to vectorize automatically + // And exclude cases which are not profitable to auto-vectorize. static const bool match_rule_supported_vectorization(int opcode, int vlen, BasicType bt); // identify extra cases that we might want to provide match rules for diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index f78e9bcdfcaa1..751636ccd932c 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -364,8 +364,10 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) { break; } - // Map the maximal common vector - if (VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) { + // Map the maximal common vector except conversion nodes, because we can't get + // the precise basic type for conversion nodes in the stage of early analysis. + if (!VectorNode::is_convert_opcode(n->Opcode()) && + VectorNode::implemented(n->Opcode(), cur_max_vector, bt)) { if (cur_max_vector < max_vector && !flag_small_bt) { max_vector = cur_max_vector; } else if (cur_max_vector > max_vector && UseSubwordForMaxVector) { @@ -996,7 +998,7 @@ int SuperWord::get_vw_bytes_special(MemNode* s) { } // Check for special case where there is a type conversion between different data size. - int vectsize = max_vector_size_in_ud_chain(s); + int vectsize = max_vector_size_in_def_use_chain(s); if (vectsize < Matcher::max_vector_size(btype)) { vw = MIN2(vectsize * type2aelembytes(btype), vw); } @@ -1434,6 +1436,16 @@ void SuperWord::extend_packlist() { } } +//------------------------------adjust_alignment_for_type_conversion--------------------------------- +// Adjust the target alignment if conversion between different data size exists in def-use nodes. 
+int SuperWord::adjust_alignment_for_type_conversion(Node* s, Node* t, int align) { + if (longer_type_for_conversion(s) != T_ILLEGAL || + longer_type_for_conversion(t) != T_ILLEGAL) { + align = align / data_size(s) * data_size(t); + } + return align; +} + //------------------------------follow_use_defs--------------------------- // Extend the packset by visiting operand definitions of nodes in pack p bool SuperWord::follow_use_defs(Node_List* p) { @@ -1455,10 +1467,7 @@ bool SuperWord::follow_use_defs(Node_List* p) { Node* t2 = s2->in(j); if (!in_bb(t1) || !in_bb(t2)) continue; - if (longer_type_for_conversion(s1) != T_ILLEGAL || - longer_type_for_conversion(t1) != T_ILLEGAL) { - align = align / data_size(s1) * data_size(t1); - } + align = adjust_alignment_for_type_conversion(s1, t1, align); if (stmts_can_pack(t1, t2, align)) { if (est_savings(t1, t2) >= 0) { Node_List* pair = new Node_List(); @@ -1503,10 +1512,7 @@ bool SuperWord::follow_def_uses(Node_List* p) { if (!opnd_positions_match(s1, t1, s2, t2)) continue; int adjusted_align = alignment(s1); - if (longer_type_for_conversion(s1) != T_ILLEGAL || - longer_type_for_conversion(t1) != T_ILLEGAL) { - adjusted_align = adjusted_align / data_size(s1) * data_size(t1); - } + adjusted_align = adjust_alignment_for_type_conversion(s1, t1, adjusted_align); if (stmts_can_pack(t1, t2, adjusted_align)) { int my_savings = est_savings(t1, t2); if (my_savings > savings) { @@ -1706,7 +1712,7 @@ void SuperWord::combine_packs() { for (int i = 0; i < _packset.length(); i++) { Node_List* p1 = _packset.at(i); if (p1 != NULL) { - uint max_vlen = max_vector_size_in_ud_chain(p1->at(0)); // Max elements in vector + uint max_vlen = max_vector_size_in_def_use_chain(p1->at(0)); // Max elements in vector assert(is_power_of_2(max_vlen), "sanity"); uint psize = p1->size(); if (!is_power_of_2(psize)) { @@ -3423,7 +3429,7 @@ BasicType SuperWord::longer_type_for_conversion(Node* n) { return T_ILLEGAL; } -int 
SuperWord::max_vector_size_in_ud_chain(Node* n) { +int SuperWord::max_vector_size_in_def_use_chain(Node* n) { BasicType bt = velt_basic_type(n); BasicType vt = bt; diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index 9b05fdc98f0a1..07ccc1610824e 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -516,6 +516,7 @@ class SuperWord : public ResourceObj { int data_size(Node* s); // Extend packset by following use->def and def->use links from pack members. void extend_packlist(); + int adjust_alignment_for_type_conversion(Node* s, Node* t, int align); // Extend the packset by visiting operand definitions of nodes in pack p bool follow_use_defs(Node_List* p); // Extend the packset by visiting uses of nodes in pack p @@ -572,7 +573,7 @@ class SuperWord : public ResourceObj { // Return the longer type for type-conversion node and return illegal type for other nodes. BasicType longer_type_for_conversion(Node* n); // Find the longest type in def-use chain for packed nodes, and then compute the max vector size. - int max_vector_size_in_ud_chain(Node* n); + int max_vector_size_in_def_use_chain(Node* n); // Compute necessary vector element type for expressions void compute_vector_element_type(); // Are s1 and s2 in a pack pair and ordered as s1,s2? 
diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index 6060590f8bd57..67e42de07fc42 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -260,6 +260,9 @@ int VectorNode::opcode(int sopc, BasicType bt) { return Op_SignumVD; default: + assert(!VectorNode::is_convert_opcode(sopc), + "Convert node %s should be processed by VectorCastNode::opcode()", + NodeClassNames[sopc]); return 0; // Unimplemented } } From 0d731bb2b5215ca7a28077c8e2fa913db5d67720 Mon Sep 17 00:00:00 2001 From: Fei Gao Date: Thu, 9 Jun 2022 02:18:51 +0000 Subject: [PATCH 5/6] Update to the latest JDK and fix the function name Change-Id: Ie1907f86e2df7051aa2ddb7e5b05a371e887d1bc --- src/hotspot/cpu/aarch64/aarch64.ad | 2 +- src/hotspot/cpu/arm/arm.ad | 2 +- src/hotspot/cpu/ppc/ppc.ad | 2 +- src/hotspot/cpu/riscv/riscv.ad | 2 +- src/hotspot/cpu/s390/s390.ad | 2 +- src/hotspot/cpu/x86/x86.ad | 2 +- src/hotspot/share/opto/matcher.hpp | 2 +- src/hotspot/share/opto/superword.cpp | 10 +++++----- src/hotspot/share/opto/vectornode.cpp | 8 ++++---- 9 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad index edbca62d83e3c..36d9b5cbdf0bd 100644 --- a/src/hotspot/cpu/aarch64/aarch64.ad +++ b/src/hotspot/cpu/aarch64/aarch64.ad @@ -2432,7 +2432,7 @@ const bool Matcher::match_rule_supported(int opcode) { return ret_value; // Per default match rules are supported. } -const bool Matcher::match_rule_supported_vectorization(int opcode, int vlen, BasicType bt) { +const bool Matcher::match_rule_supported_superword(int opcode, int vlen, BasicType bt) { if (UseSVE == 0) { // ConvD2I and ConvL2F are not profitable to be vectorized on NEON, because no direct // NEON instructions support them. 
But the match rule support for them is profitable for diff --git a/src/hotspot/cpu/arm/arm.ad b/src/hotspot/cpu/arm/arm.ad index ab49e13f10e9c..310601c4dc758 100644 --- a/src/hotspot/cpu/arm/arm.ad +++ b/src/hotspot/cpu/arm/arm.ad @@ -981,7 +981,7 @@ const bool Matcher::match_rule_supported(int opcode) { return true; // Per default match rules are supported. } -const bool Matcher::match_rule_supported_vectorization(int opcode, int vlen, BasicType bt) { +const bool Matcher::match_rule_supported_superword(int opcode, int vlen, BasicType bt) { return match_rule_supported_vector(opcode, vlen, bt); } diff --git a/src/hotspot/cpu/ppc/ppc.ad b/src/hotspot/cpu/ppc/ppc.ad index 9f710eb8507a0..3217977aa68b8 100644 --- a/src/hotspot/cpu/ppc/ppc.ad +++ b/src/hotspot/cpu/ppc/ppc.ad @@ -2165,7 +2165,7 @@ const bool Matcher::match_rule_supported(int opcode) { return true; // Per default match rules are supported. } -const bool Matcher::match_rule_supported_vectorization(int opcode, int vlen, BasicType bt) { +const bool Matcher::match_rule_supported_superword(int opcode, int vlen, BasicType bt) { return match_rule_supported_vector(opcode, vlen, bt); } diff --git a/src/hotspot/cpu/riscv/riscv.ad b/src/hotspot/cpu/riscv/riscv.ad index d92035be05e4f..7f7f20bac8c1d 100644 --- a/src/hotspot/cpu/riscv/riscv.ad +++ b/src/hotspot/cpu/riscv/riscv.ad @@ -1815,7 +1815,7 @@ const bool Matcher::match_rule_supported(int opcode) { return true; // Per default match rules are supported. 
} -const bool Matcher::match_rule_supported_vectorization(int opcode, int vlen, BasicType bt) { +const bool Matcher::match_rule_supported_superword(int opcode, int vlen, BasicType bt) { return match_rule_supported_vector(opcode, vlen, bt); } diff --git a/src/hotspot/cpu/s390/s390.ad b/src/hotspot/cpu/s390/s390.ad index 655058f09f925..01533060ed159 100644 --- a/src/hotspot/cpu/s390/s390.ad +++ b/src/hotspot/cpu/s390/s390.ad @@ -1505,7 +1505,7 @@ const bool Matcher::match_rule_supported(int opcode) { return true; // Per default match rules are supported. } -const bool Matcher::match_rule_supported_vectorization(int opcode, int vlen, BasicType bt) { +const bool Matcher::match_rule_supported_superword(int opcode, int vlen, BasicType bt) { return match_rule_supported_vector(opcode, vlen, bt); } diff --git a/src/hotspot/cpu/x86/x86.ad b/src/hotspot/cpu/x86/x86.ad index 2e1b7a8eb770a..f45fe580e7363 100644 --- a/src/hotspot/cpu/x86/x86.ad +++ b/src/hotspot/cpu/x86/x86.ad @@ -1693,7 +1693,7 @@ static inline bool is_pop_count_instr_target(BasicType bt) { (is_non_subword_integral_type(bt) && VM_Version::supports_avx512_vpopcntdq()); } -const bool Matcher::match_rule_supported_vectorization(int opcode, int vlen, BasicType bt) { +const bool Matcher::match_rule_supported_superword(int opcode, int vlen, BasicType bt) { return match_rule_supported_vector(opcode, vlen, bt); } diff --git a/src/hotspot/share/opto/matcher.hpp b/src/hotspot/share/opto/matcher.hpp index fea1a1632d8ef..4576e470acfa7 100644 --- a/src/hotspot/share/opto/matcher.hpp +++ b/src/hotspot/share/opto/matcher.hpp @@ -327,7 +327,7 @@ class Matcher : public PhaseTransform { // Identify extra cases that we might want to vectorize automatically // And exclude cases which are not profitable to auto-vectorize. 
- static const bool match_rule_supported_vectorization(int opcode, int vlen, BasicType bt); + static const bool match_rule_supported_superword(int opcode, int vlen, BasicType bt); // identify extra cases that we might want to provide match rules for // e.g. Op_ vector nodes and other intrinsics while guarding with vlen diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 2cdd666e201a2..3803a0a4f0c36 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -1009,7 +1009,7 @@ int SuperWord::get_vw_bytes_special(MemNode* s) { // Check for special case where there is a type conversion between different data size. int vectsize = max_vector_size_in_def_use_chain(s); - if (vectsize < Matcher::max_vector_size(btype)) { + if (vectsize < max_vector_size(btype)) { vw = MIN2(vectsize * type2aelembytes(btype), vw); } @@ -1202,8 +1202,8 @@ bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) { if(!is_java_primitive(bt1) || !is_java_primitive(bt2)) return false; BasicType longer_bt = longer_type_for_conversion(s1); - if (Matcher::max_vector_size(bt1) < 2 || - (longer_bt != T_ILLEGAL && Matcher::max_vector_size(longer_bt) < 2)) { + if (max_vector_size(bt1) < 2 || + (longer_bt != T_ILLEGAL && max_vector_size(longer_bt) < 2)) { return false; // No vectors for this type } @@ -3461,10 +3461,10 @@ int SuperWord::max_vector_size_in_def_use_chain(Node* n) { vt = (newt == T_ILLEGAL) ? vt : newt; } - int max = Matcher::max_vector_size(vt); + int max = max_vector_size(vt); // If now there is no vectors for the longest type, the nodes with the longest // type in the def-use chain are not packed in SuperWord::stmts_can_pack. - return max < 2 ? Matcher::max_vector_size(bt) : max; + return max < 2 ? 
max_vector_size(bt) : max; } //-------------------------compute_vector_element_type----------------------- diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index 1dee384d84ada..d368700bac8f3 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -310,7 +310,7 @@ bool VectorNode::implemented(int opc, uint vlen, BasicType bt) { if (VectorNode::is_vector_integral_negate(vopc)) { return is_vector_integral_negate_supported(vopc, vlen, bt, false); } - return vopc > 0 && Matcher::match_rule_supported_vectorization(vopc, vlen, bt); + return vopc > 0 && Matcher::match_rule_supported_superword(vopc, vlen, bt); } return false; } @@ -1256,9 +1256,9 @@ int VectorCastNode::opcode(BasicType bt, bool is_signed) { bool VectorCastNode::implemented(int opc, uint vlen, BasicType src_type, BasicType dst_type) { if (is_java_primitive(dst_type) && (vlen > 1) && is_power_of_2(vlen) && - Matcher::vector_size_supported(dst_type, vlen)) { + VectorNode::vector_size_supported(dst_type, vlen)) { int vopc = VectorCastNode::opcode(src_type); - return vopc > 0 && Matcher::match_rule_supported_vectorization(vopc, vlen, dst_type); + return vopc > 0 && Matcher::match_rule_supported_superword(vopc, vlen, dst_type); } return false; } @@ -1352,7 +1352,7 @@ bool ReductionNode::implemented(int opc, uint vlen, BasicType bt) { (vlen > 1) && is_power_of_2(vlen) && VectorNode::vector_size_supported(bt, vlen)) { int vopc = ReductionNode::opcode(opc, bt); - return vopc != opc && Matcher::match_rule_supported_vectorization(vopc, vlen, bt); + return vopc != opc && Matcher::match_rule_supported_superword(vopc, vlen, bt); } return false; } From 49e6f56e0f174f32500eb74e35ca83c5c84d913e Mon Sep 17 00:00:00 2001 From: Fei Gao Date: Tue, 14 Jun 2022 01:37:38 +0000 Subject: [PATCH 6/6] Add an IR framework testcase Change-Id: Ifbcc8d233aa27dfe93acef548c7e42721d86376e --- .../irTests/TestVectorizeTypeConversion.java | 97 
+++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 test/hotspot/jtreg/compiler/c2/irTests/TestVectorizeTypeConversion.java diff --git a/test/hotspot/jtreg/compiler/c2/irTests/TestVectorizeTypeConversion.java b/test/hotspot/jtreg/compiler/c2/irTests/TestVectorizeTypeConversion.java new file mode 100644 index 0000000000000..cd7a90c197a22 --- /dev/null +++ b/test/hotspot/jtreg/compiler/c2/irTests/TestVectorizeTypeConversion.java @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2022, Arm Limited. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +package compiler.c2.irTests; + +import compiler.lib.ir_framework.*; +import java.util.Random; +import jdk.test.lib.Asserts; +import jdk.test.lib.Utils; + +/* + * @test + * @bug 8283091 + * @summary Auto-vectorization enhancement for type conversion between different data sizes. 
+ * @requires (os.simpleArch == "x64" & vm.cpu.features ~= ".*avx2.*") | os.arch=="aarch64" + * @library /test/lib / + * @run driver compiler.c2.irTests.TestVectorizeTypeConversion + */ + +public class TestVectorizeTypeConversion { + + final private static int SIZE = 3000; + + private static double[] doublea = new double[SIZE]; + private static double[] doubleb = new double[SIZE]; + private static long[] longa = new long[SIZE]; + private static long[] longb = new long[SIZE]; + private static int[] inta = new int[SIZE]; + private static int[] intb = new int[SIZE]; + private static float[] floata = new float[SIZE]; + private static float[] floatb = new float[SIZE]; + + public static void main(String[] args) { + TestFramework.run(); + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR, ">0", + IRNode.VECTOR_CAST_I2X, ">0", + IRNode.STORE_VECTOR, ">0"}) + private static void testConvI2D(double[] d, int[] a) { + for(int i = 0; i < d.length; i++) { + d[i] = (double) (a[i]); + } + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR, ">0", + IRNode.VECTOR_CAST_I2X, ">0", + IRNode.VECTOR_CAST_L2X, ">0", + IRNode.STORE_VECTOR, ">0"}) + private static void testConvI2L(int[] d1, int d2[], long[] a1, long[] a2) { + for(int i = 0; i < d1.length; i++) { + d1[i] = (int) (a1[i]); + a2[i] = (long) (d2[i]); + } + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR, ">0", + IRNode.VECTOR_CAST_D2X, ">0", + IRNode.VECTOR_CAST_F2X, ">0", + IRNode.STORE_VECTOR, ">0"}) + private static void testConvF2D(double[] d1, double[] d2, float[] a1, float[] a2) { + for(int i = 0; i < d1.length; i++) { + d1[i] = (double) (a1[i]); + a2[i] = (float) (d2[i]); + } + } + + @Run(test = {"testConvI2D", "testConvI2L", "testConvF2D"}) + private void test_runner() { + testConvI2D(doublea, inta); + testConvI2L(inta, intb, longa, longb); + testConvF2D(doublea, doubleb, floata, floatb); + } +}