8332119: Incorrect IllegalArgumentException for C2 compiled permute kernel

Jatin Bhateja · Jatin Bhateja · commit 4c09d9f82800 · 2024-06-05T18:36:49.000Z
Reviewed-by: sviswanathan, kvn
diff --git a/src/hotspot/share/opto/library_call.hpp b/src/hotspot/share/opto/library_call.hpp
@@ -350,6 +350,7 @@ class LibraryCallKit : public GraphKit {
   bool inline_vector_frombits_coerced();
   bool inline_vector_shuffle_to_vector();
   bool inline_vector_shuffle_iota();
+  Node* partially_wrap_indexes(Node* index_vec, int num_elem, BasicType type_bt);
   bool inline_vector_mask_operation();
   bool inline_vector_mem_operation(bool is_store);
   bool inline_vector_mem_masked_operation(bool is_store);
diff --git a/src/hotspot/share/opto/vectorIntrinsics.cpp b/src/hotspot/share/opto/vectorIntrinsics.cpp
@@ -511,6 +511,37 @@ bool LibraryCallKit::inline_vector_nary_operation(int n) {
   return true;
 }
 
+// Following routine generates IR corresponding to AbstractShuffle::partiallyWrapIndex method,
+// which partially wraps index by modulo VEC_LENGTH and generates a negative index value if original
+// index is out of valid index range [0, VEC_LENGTH)
+//
+//   wrapped_index = (VEC_LENGTH - 1) & index
+//   if (index u> VEC_LENGTH) {
+//     wrapped_index -= VEC_LENGTH;
+//
+// Note: Unsigned greater than comparison treat both <0 and >VEC_LENGTH indices as out-of-bound
+// indexes.
+Node* LibraryCallKit::partially_wrap_indexes(Node* index_vec, int num_elem, BasicType elem_bt) {
+  assert(elem_bt == T_BYTE, "Shuffles use byte array based backing storage.");
+  const TypeVect* vt  = TypeVect::make(elem_bt, num_elem);
+  const Type* type_bt = Type::get_const_basic_type(elem_bt);
+
+  Node* mod_mask = gvn().makecon(TypeInt::make(num_elem-1));
+  Node* bcast_mod_mask  = gvn().transform(VectorNode::scalar2vector(mod_mask, num_elem, type_bt));
+
+  BoolTest::mask pred = BoolTest::ugt;
+  ConINode* pred_node = (ConINode*)gvn().makecon(TypeInt::make(pred));
+  Node* lane_cnt  = gvn().makecon(TypeInt::make(num_elem));
+  Node* bcast_lane_cnt = gvn().transform(VectorNode::scalar2vector(lane_cnt, num_elem, type_bt));
+  const TypeVect* vmask_type = TypeVect::makemask(type_bt, num_elem);
+  Node*  mask = gvn().transform(new VectorMaskCmpNode(pred, bcast_lane_cnt, index_vec, pred_node, vmask_type));
+
+  // Make the indices greater than lane count as -ve values to match the java side implementation.
+  index_vec = gvn().transform(VectorNode::make(Op_AndV, index_vec, bcast_mod_mask, vt));
+  Node* biased_val = gvn().transform(VectorNode::make(Op_SubVB, index_vec, bcast_lane_cnt, vt));
+  return gvn().transform(new VectorBlendNode(biased_val, index_vec, mask));
+}
+
 // <Sh extends VectorShuffle<E>,  E>
 //  Sh ShuffleIota(Class<?> E, Class<?> shuffleClass, Vector.Species<E> s, int length,
 //                  int start, int step, int wrap, ShuffleIotaOperation<Sh, E> defaultImpl)
@@ -596,18 +627,9 @@ bool LibraryCallKit::inline_vector_shuffle_iota() {
 
   if (do_wrap)  {
     // Wrap the indices greater than lane count.
-     res = gvn().transform(VectorNode::make(Op_AndV, res, bcast_mod, vt));
-  } else {
-    ConINode* pred_node = (ConINode*)gvn().makecon(TypeInt::make(BoolTest::ugt));
-    Node * lane_cnt  = gvn().makecon(TypeInt::make(num_elem));
-    Node * bcast_lane_cnt = gvn().transform(VectorNode::scalar2vector(lane_cnt, num_elem, type_bt));
-    const TypeVect* vmask_type = TypeVect::makemask(elem_bt, num_elem);
-    Node* mask = gvn().transform(new VectorMaskCmpNode(BoolTest::ugt, bcast_lane_cnt, res, pred_node, vmask_type));
-
-    // Make the indices greater than lane count as -ve values to match the java side implementation.
     res = gvn().transform(VectorNode::make(Op_AndV, res, bcast_mod, vt));
-    Node * biased_val = gvn().transform(VectorNode::make(Op_SubVB, res, bcast_lane_cnt, vt));
-    res = gvn().transform(new VectorBlendNode(biased_val, res, mask));
+  } else {
+    res = partially_wrap_indexes(res, num_elem, elem_bt);
   }
 
   ciKlass* sbox_klass = shuffle_klass->const_oop()->as_instance()->java_lang_Class_klass();
@@ -2286,6 +2308,18 @@ bool LibraryCallKit::inline_vector_convert() {
     return false;
   }
 
+
+  if (is_vector_shuffle(vbox_klass_to) &&
+      (!arch_supports_vector(Op_SubVB, num_elem_to, elem_bt_to, VecMaskNotUsed)           ||
+       !arch_supports_vector(Op_VectorBlend, num_elem_to, elem_bt_to, VecMaskNotUsed)     ||
+       !arch_supports_vector(Op_VectorMaskCmp, num_elem_to, elem_bt_to, VecMaskNotUsed)   ||
+       !arch_supports_vector(Op_AndV, num_elem_to, elem_bt_to, VecMaskNotUsed)            ||
+       !arch_supports_vector(Op_Replicate, num_elem_to, elem_bt_to, VecMaskNotUsed))) {
+    log_if_needed("  ** not supported: arity=1 op=shuffle_index_wrap vlen2=%d etype2=%s",
+                    num_elem_to, type2name(elem_bt_to));
+    return false;
+  }
+
   // At this point, we know that both input and output vector registers are supported
   // by the architecture. Next check if the casted type is simply to same type - which means
   // that it is actually a resize and not a cast.
@@ -2383,6 +2417,10 @@ bool LibraryCallKit::inline_vector_convert() {
     op = gvn().transform(new VectorReinterpretNode(op, src_type, dst_type));
   }
 
+  if (is_vector_shuffle(vbox_klass_to)) {
+     op = partially_wrap_indexes(op, num_elem_to, elem_bt_to);
+  }
+
   const TypeInstPtr* vbox_type_to = TypeInstPtr::make_exact(TypePtr::NotNull, vbox_klass_to);
   Node* vbox = box_vector(op, vbox_type_to, elem_bt_to, num_elem_to);
   set_result(vbox);
diff --git a/test/hotspot/jtreg/compiler/vectorapi/TestTwoVectorPermute.java b/test/hotspot/jtreg/compiler/vectorapi/TestTwoVectorPermute.java
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+* @test
+* @bug 8332119
+* @summary Incorrect IllegalArgumentException for C2 compiled permute kernel
+* @modules jdk.incubator.vector
+* @library /test/lib /
+* @run main/othervm -XX:+UnlockDiagnosticVMOptions -Xbatch -XX:-TieredCompilation -XX:CompileOnly=TestTwoVectorPermute::micro compiler.vectorapi.TestTwoVectorPermute
+* @run main/othervm -XX:+UnlockDiagnosticVMOptions -Xbatch -XX:-TieredCompilation compiler.vectorapi.TestTwoVectorPermute
+* @run main/othervm -XX:+UnlockDiagnosticVMOptions -Xbatch -XX:TieredStopAtLevel=3 compiler.vectorapi.TestTwoVectorPermute
+*/
+package compiler.vectorapi;
+
+
+import jdk.incubator.vector.*;
+import java.util.Arrays;
+import java.util.Random;
+
+public class TestTwoVectorPermute {
+   public static final VectorSpecies<Float> FSP = FloatVector.SPECIES_256;
+
+   public static void validate(float[] res, float[] shuf, float[] src1, float[] src2) {
+       for (int i = 0; i < res.length; i++) {
+           float expected = Float.NaN;
+           int shuf_index = (int)shuf[i];
+           // Exceptional index.
+           if (shuf_index < 0 || shuf_index >= FSP.length()) {
+               int wrapped_index = (shuf_index & (FSP.length() - 1));
+               if (Integer.compareUnsigned(shuf_index, FSP.length()) > 0) {
+                   wrapped_index -= FSP.length();
+               }
+               wrapped_index = wrapped_index < 0 ? wrapped_index + FSP.length() : wrapped_index;
+               expected = src2[wrapped_index];
+           } else {
+               expected = src1[shuf_index];
+           }
+           if (res[i] != expected) {
+              throw new AssertionError("Result mismatch at " + i + " index, (actual = " + res[i] + ") != ( expected " +   expected + " )");
+           }
+       }
+   }
+
+   public static void micro(float[] res, float[] shuf, float[] src1, float[] src2) {
+       VectorShuffle<Float> vshuf = FloatVector.fromArray(FSP, shuf, 0).toShuffle();
+       VectorShuffle<Float> vshuf_wrapped = vshuf.wrapIndexes();
+       FloatVector.fromArray(FSP, src1, 0)
+         .rearrange(vshuf_wrapped)
+         .blend(FloatVector.fromArray(FSP, src2, 0)
+                           .rearrange(vshuf_wrapped),
+                           vshuf.laneIsValid().not())
+         .intoArray(res, 0);
+   }
+
+   public static void main(String [] args) {
+       float [] res  = new float[FSP.length()];
+       float [] shuf = new float[FSP.length()];
+       float [] src1 = new float[FSP.length()];
+       float [] src2 = new float[FSP.length()];
+
+       for (int i = 0; i < FSP.length(); i++) {
+           shuf[i] = i * 2;
+       }
+       for (int i = 0; i < FSP.length(); i++) {
+           src1[i] = i;
+           src2[i] = i + FSP.length();
+       }
+       for (int i = 0; i < 10000; i++) {
+           micro(res, shuf, src1, src2);
+       }
+       validate(res, shuf, src1, src2);
+       for (int i = 0; i < FSP.length(); i++) {
+           shuf[i] = -i * 2;
+       }
+       for (int i = 0; i < 10000; i++) {
+           micro(res, shuf, src1, src2);
+       }
+       validate(res, shuf, src1, src2);
+       System.out.println("PASSED");
+   }
+}