diff --git a/make/test/BuildMicrobenchmark.gmk b/make/test/BuildMicrobenchmark.gmk index ba502a5612870..5940a38c9dba2 100644 --- a/make/test/BuildMicrobenchmark.gmk +++ b/make/test/BuildMicrobenchmark.gmk @@ -101,6 +101,7 @@ $(eval $(call SetupJavaCompilation, BUILD_JDK_MICROBENCHMARK, \ --add-exports java.base/jdk.internal.event=ALL-UNNAMED \ --add-exports java.base/jdk.internal.foreign=ALL-UNNAMED \ --add-exports java.base/jdk.internal.misc=ALL-UNNAMED \ + --add-exports java.base/jdk.internal.util=ALL-UNNAMED \ --add-exports java.base/jdk.internal.org.objectweb.asm.tree=ALL-UNNAMED \ --add-exports java.base/jdk.internal.org.objectweb.asm=ALL-UNNAMED \ --add-exports java.base/jdk.internal.vm=ALL-UNNAMED \ diff --git a/src/hotspot/share/opto/addnode.cpp b/src/hotspot/share/opto/addnode.cpp index 928e51191d67b..f0e55d6ed48c9 100644 --- a/src/hotspot/share/opto/addnode.cpp +++ b/src/hotspot/share/opto/addnode.cpp @@ -724,9 +724,9 @@ Node* AddPNode::Ideal_base_and_offset(Node* ptr, PhaseValues* phase, //------------------------------unpack_offsets---------------------------------- // Collect the AddP offset values into the elements array, giving up // if there are more than length. -int AddPNode::unpack_offsets(Node* elements[], int length) { +int AddPNode::unpack_offsets(Node* elements[], int length) const { int count = 0; - Node* addr = this; + Node const* addr = this; Node* base = addr->in(AddPNode::Base); while (addr->is_AddP()) { if (addr->in(AddPNode::Base) != base) { diff --git a/src/hotspot/share/opto/addnode.hpp b/src/hotspot/share/opto/addnode.hpp index a6ef58b98ce9d..19043b5e40f5d 100644 --- a/src/hotspot/share/opto/addnode.hpp +++ b/src/hotspot/share/opto/addnode.hpp @@ -181,7 +181,7 @@ class AddPNode : public Node { // Collect the AddP offset values into the elements array, giving up // if there are more than length. - int unpack_offsets(Node* elements[], int length); + int unpack_offsets(Node* elements[], int length) const; // Do not match base-ptr edge virtual uint match_edge(uint idx) const; diff --git a/src/hotspot/share/opto/c2_globals.hpp b/src/hotspot/share/opto/c2_globals.hpp index 68d4f89b8f2b9..ac807e071aa77 100644 --- a/src/hotspot/share/opto/c2_globals.hpp +++ b/src/hotspot/share/opto/c2_globals.hpp @@ -354,6 +354,12 @@ notproduct(bool, TraceNewVectors, false, \ "Trace creation of Vector nodes") \ \ + product(bool, MergeStores, true, DIAGNOSTIC, \ + "Optimize stores by combining values into larger store") \ + \ + develop(bool, TraceMergeStores, false, \ + "Trace creation of merged stores") \ + \ product_pd(bool, OptoBundling, \ "Generate nops to fill i-cache lines") \ \ diff --git a/src/hotspot/share/opto/compile.cpp b/src/hotspot/share/opto/compile.cpp index 417f828013f92..c1652230f3102 100644 --- a/src/hotspot/share/opto/compile.cpp +++ b/src/hotspot/share/opto/compile.cpp @@ -930,6 +930,7 @@ Compile::Compile( ciEnv* ci_env, _directive(directive), _log(ci_env->log()), _first_failure_details(nullptr), + _for_post_loop_igvn(comp_arena(), 8, 0, nullptr), _congraph(nullptr), NOT_PRODUCT(_igv_printer(nullptr) COMMA) _unique(0), diff --git a/src/hotspot/share/opto/memnode.cpp b/src/hotspot/share/opto/memnode.cpp index e0a364d5056b3..de1c61e29f685 100644 --- a/src/hotspot/share/opto/memnode.cpp +++ b/src/hotspot/share/opto/memnode.cpp @@ -2685,6 +2685,683 @@ uint StoreNode::hash() const { return NO_HASH; } +// Class to parse array pointers, and determine if they are adjacent. 
We parse the form:
+//
+//   pointer = base
+//           + constant_offset
+//           + LShiftL( ConvI2L(int_offset + int_con), int_offset_shift)
+//           + sum(other_offsets)
+//
+//
+// Note: we accumulate all constant offsets into constant_offset, even the int constant behind
+//       the "LShiftL(ConvI2L(...))" pattern. We convert "ConvI2L(int_offset + int_con)" to
+//       "ConvI2L(int_offset) + int_con", which is only safe if we can assume that either all
+//       compared addresses have an overflow for "int_offset + int_con" or none.
+//       For loads and stores on arrays, we know that if one overflows and the other does not,
+//       then the two addresses lie almost max_int indices apart, but the maximal array size is
+//       only about half of that. Therefore, the RangeCheck on at least one of them must have
+//       failed.
+//
+//   constant_offset += LShiftL( ConvI2L(int_con), int_offset_shift)
+//
+//   pointer = base
+//           + constant_offset
+//           + LShiftL( ConvI2L(int_offset), int_offset_shift)
+//           + sum(other_offsets)
+//
+class ArrayPointer {
+private:
+  const bool  _is_valid;          // The parsing succeeded
+  const Node* _pointer;           // The final pointer to the position in the array
+  const Node* _base;              // Base address of the array
+  const jlong _constant_offset;   // Sum of collected constant offsets
+  const Node* _int_offset;        // (optional) Offset behind LShiftL and ConvI2L
+  const jint  _int_offset_shift;  // (optional) Shift value for int_offset
+  const GrowableArray<Node*>* _other_offsets; // List of other AddP offsets
+
+  ArrayPointer(const bool is_valid,
+               const Node* pointer,
+               const Node* base,
+               const jlong constant_offset,
+               const Node* int_offset,
+               const jint int_offset_shift,
+               const GrowableArray<Node*>* other_offsets) :
+    _is_valid(is_valid),
+    _pointer(pointer),
+    _base(base),
+    _constant_offset(constant_offset),
+    _int_offset(int_offset),
+    _int_offset_shift(int_offset_shift),
+    _other_offsets(other_offsets)
+  {
+    assert(_pointer != nullptr, "must always have pointer");
+    assert(is_valid == (_base != nullptr), "have base exactly if valid");
+    assert(is_valid == (_other_offsets != nullptr), "have other_offsets exactly if valid");
+  }
+
+  static ArrayPointer make_invalid(const Node* pointer) {
+    return ArrayPointer(false, pointer, nullptr, 0, nullptr, 0, nullptr);
+  }
+
+  static bool parse_int_offset(Node* offset, Node*& int_offset, jint& int_offset_shift) {
+    // offset = LShiftL( ConvI2L(int_offset), int_offset_shift)
+    if (offset->Opcode() == Op_LShiftL &&
+        offset->in(1)->Opcode() == Op_ConvI2L &&
+        offset->in(2)->Opcode() == Op_ConI) {
+      int_offset = offset->in(1)->in(1);           // LShiftL -> ConvI2L -> int_offset
+      int_offset_shift = offset->in(2)->get_int(); // LShiftL -> int_offset_shift
+      return true;
+    }
+
+    // offset = ConvI2L(int_offset) = LShiftL( ConvI2L(int_offset), 0)
+    if (offset->Opcode() == Op_ConvI2L) {
+      int_offset = offset->in(1);
+      int_offset_shift = 0;
+      return true;
+    }
+
+    // parse failed
+    return false;
+  }
+
+public:
+  // Parse the structure above the pointer
+  static ArrayPointer make(PhaseGVN* phase, const Node* pointer) {
+    assert(phase->type(pointer)->isa_aryptr() != nullptr, "must be array pointer");
+    if (!pointer->is_AddP()) { return ArrayPointer::make_invalid(pointer); }
+
+    const Node* base = pointer->in(AddPNode::Base);
+    if (base == nullptr) { return ArrayPointer::make_invalid(pointer); }
+
+    const int search_depth = 5;
+    Node* offsets[search_depth];
+    int count = pointer->as_AddP()->unpack_offsets(offsets, search_depth);
+
+    // We expect at least one offset (at minimum the constant)
+    if (count <= 0) { return ArrayPointer::make_invalid(pointer); }
+
+    // We extract the form:
+    //
+    //   pointer = base
+    //           + constant_offset
+    //           + LShiftL( ConvI2L(int_offset + int_con), int_offset_shift)
+    //           + sum(other_offsets)
+    //
+    jlong constant_offset = 0;
+    Node* int_offset = nullptr;
+    jint int_offset_shift = 0;
+    GrowableArray<Node*>* other_offsets = new GrowableArray<Node*>(count);
+
+    for (int i = 0; i < count; i++) {
+      Node* offset = offsets[i];
+      if (offset->Opcode() == Op_ConI) {
+        // Constant int offset
+        constant_offset += offset->get_int();
+      } else if (offset->Opcode() == Op_ConL) {
+        // Constant long offset
+        constant_offset += offset->get_long();
+      } else if (int_offset == nullptr && parse_int_offset(offset, int_offset, int_offset_shift)) {
+        // LShiftL( ConvI2L(int_offset), int_offset_shift)
+        int_offset = int_offset->uncast();
+        if (int_offset->Opcode() == Op_AddI && int_offset->in(2)->Opcode() == Op_ConI) {
+          // LShiftL( ConvI2L(int_offset + int_con), int_offset_shift)
+          constant_offset += ((jlong)int_offset->in(2)->get_int()) << int_offset_shift;
+          int_offset = int_offset->in(1);
+        }
+      } else {
+        // All others
+        other_offsets->append(offset);
+      }
+    }
+
+    return ArrayPointer(true, pointer, base, constant_offset, int_offset, int_offset_shift, other_offsets);
+  }
+
+  bool is_adjacent_to_and_before(const ArrayPointer& other, const jlong data_size) const {
+    if (!_is_valid || !other._is_valid) { return false; }
+
+    // Offset adjacent?
+    if (this->_constant_offset + data_size != other._constant_offset) { return false; }
+
+    // All other components identical?
+    if (this->_base != other._base ||
+        this->_int_offset != other._int_offset ||
+        this->_int_offset_shift != other._int_offset_shift ||
+        this->_other_offsets->length() != other._other_offsets->length()) {
+      return false;
+    }
+
+    for (int i = 0; i < this->_other_offsets->length(); i++) {
+      Node* o1 = this->_other_offsets->at(i);
+      Node* o2 = other._other_offsets->at(i);
+      if (o1 != o2) { return false; }
+    }
+
+    return true;
+  }
+
+#ifndef PRODUCT
+  void dump() {
+    if (!_is_valid) {
+      tty->print("ArrayPointer[%d %s, invalid]", _pointer->_idx, _pointer->Name());
+      return;
+    }
+    tty->print("ArrayPointer[%d %s, base[%d %s] + %lld",
+               _pointer->_idx, _pointer->Name(),
+               _base->_idx, _base->Name(),
+               (long long)_constant_offset);
+    if (_int_offset != nullptr) {
+      tty->print(" + I2L[%d %s] << %d",
+                 _int_offset->_idx, _int_offset->Name(), _int_offset_shift);
+    }
+    for (int i = 0; i < _other_offsets->length(); i++) {
+      Node* n = _other_offsets->at(i);
+      tty->print(" + [%d %s]", n->_idx, n->Name());
+    }
+    tty->print_cr("]");
+  }
+#endif
+};
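A quick standalone check of the overflow argument above (plain Java, not part of the patch): folding the constant out of the ConvI2L only changes the result when the int addition wraps, and then the two interpretations differ by exactly 2^32, i.e. the indices are farther apart than any Java array can be long:

```java
public class ConvI2LFold {
    public static void main(String[] args) {
        int i = Integer.MAX_VALUE - 1; // an int index near the overflow boundary
        int c = 4;                     // constant folded out of the ConvI2L

        long folded  = (long) i + c;   // ConvI2L(i) + c : no 32-bit wrap-around
        long wrapped = (long) (i + c); // ConvI2L(i + c) : wraps in 32-bit arithmetic

        // The two results differ by 2^32 exactly when (i + c) overflows int range.
        System.out.println(folded - wrapped); // prints 4294967296
        // Such indices are ~max_int apart, but Java arrays are shorter than
        // max_int elements, so a RangeCheck must already have failed for one
        // of the two compared accesses.
    }
}
```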
+
+// Link together multiple stores (B/S/C/I) into a longer one.
+//
+// Example: _store = StoreB[i+3]
+//
+//   RangeCheck[i+0]           RangeCheck[i+0]
+//   StoreB[i+0]
+//   RangeCheck[i+1]           RangeCheck[i+1]
+//   StoreB[i+1]        -->    pass:             fail:
+//   StoreB[i+2]               StoreI[i+0]       StoreB[i+0]
+//   StoreB[i+3]
+//
+// The 4 StoreB are merged into a single StoreI node. We have to be careful with RangeCheck[i+1]: before
+// the optimization, if this RangeCheck[i+1] fails, then we execute only StoreB[i+0], and then trap. After
+// the optimization, the new StoreI[i+0] is on the passing path of RangeCheck[i+1], and StoreB[i+0] on the
+// failing path.
+//
+// Note: For normal array stores, every store at first has a RangeCheck. But they can be removed with:
+//       - RCE (RangeCheck Elimination): the RangeChecks in the loop are hoisted out and before the loop,
+//         and possibly no RangeChecks remain between the stores.
+//       - RangeCheck smearing: the earlier RangeChecks are adjusted such that they cover later RangeChecks,
+//         and those later RangeChecks can be removed. Example:
+//
+//           RangeCheck[i+0]                         RangeCheck[i+0] <- before first store
+//           StoreB[i+0]                             StoreB[i+0]     <- first store
+//           RangeCheck[i+1] --> smeared -->         RangeCheck[i+3] <- only RC between first and last store
+//           StoreB[i+1]                             StoreB[i+1]     <- second store
+//           RangeCheck[i+2] --> removed
+//           StoreB[i+2]                             StoreB[i+2]
+//           RangeCheck[i+3] --> removed
+//           StoreB[i+3]                             StoreB[i+3]     <- last store
+//
+// Thus, it is a common pattern that between the first and last store in a chain
+// of adjacent stores there remains exactly one RangeCheck, located between the
+// first and the second store (e.g. RangeCheck[i+3]).
+//
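For orientation, this is the shape of Java code the new pass targets: adjacent byte stores of shifted slices of one value, which on a little-endian platform can collapse into a single StoreI. A minimal sketch (mirroring the test cases added later in this change):

```java
public class MergeablePattern {
    // Four adjacent byte stores of shifted slices of the same value:
    // candidates for merging into a single 4-byte store.
    static void putIntLE(byte[] a, int offset, int v) {
        a[offset + 0] = (byte) (v >>  0);
        a[offset + 1] = (byte) (v >>  8);
        a[offset + 2] = (byte) (v >> 16);
        a[offset + 3] = (byte) (v >> 24);
    }

    public static void main(String[] args) {
        byte[] a = new byte[8];
        putIntLE(a, 0, 0xdeadbeef);
        // Lowest byte lands at the lowest address: prints "ef be ad de".
        System.out.printf("%02x %02x %02x %02x%n",
                          a[0] & 0xff, a[1] & 0xff, a[2] & 0xff, a[3] & 0xff);
    }
}
```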
+class MergePrimitiveArrayStores : public StackObj {
+private:
+  PhaseGVN* _phase;
+  StoreNode* _store;
+
+public:
+  MergePrimitiveArrayStores(PhaseGVN* phase, StoreNode* store) : _phase(phase), _store(store) {}
+
+  StoreNode* run();
+
+private:
+  bool is_compatible_store(const StoreNode* other_store) const;
+  bool is_adjacent_pair(const StoreNode* use_store, const StoreNode* def_store) const;
+  bool is_adjacent_input_pair(const Node* n1, const Node* n2, const int memory_size) const;
+  static bool is_con_RShift(const Node* n, Node const*& base_out, jint& shift_out);
+  enum CFGStatus { SuccessNoRangeCheck, SuccessWithRangeCheck, Failure };
+  static CFGStatus cfg_status_for_pair(const StoreNode* use_store, const StoreNode* def_store);
+
+  class Status {
+  private:
+    StoreNode* _found_store;
+    bool       _found_range_check;
+
+    Status(StoreNode* found_store, bool found_range_check)
+      : _found_store(found_store), _found_range_check(found_range_check) {}
+
+  public:
+    StoreNode* found_store() const { return _found_store; }
+    bool found_range_check() const { return _found_range_check; }
+    static Status make_failure() { return Status(nullptr, false); }
+
+    static Status make(StoreNode* found_store, const CFGStatus cfg_status) {
+      if (cfg_status == CFGStatus::Failure) {
+        return Status::make_failure();
+      }
+      return Status(found_store, cfg_status == CFGStatus::SuccessWithRangeCheck);
+    }
+  };
+
+  Status find_adjacent_use_store(const StoreNode* def_store) const;
+  Status find_adjacent_def_store(const StoreNode* use_store) const;
+  Status find_use_store(const StoreNode* def_store) const;
+  Status find_def_store(const StoreNode* use_store) const;
+  Status find_use_store_unidirectional(const StoreNode* def_store) const;
+  Status find_def_store_unidirectional(const StoreNode* use_store) const;
+
+  void collect_merge_list(Node_List& merge_list) const;
+  Node* make_merged_input_value(const Node_List& merge_list);
+  StoreNode* make_merged_store(const Node_List& merge_list, Node* merged_input_value);
+
+  DEBUG_ONLY( void trace(const Node_List& merge_list, const Node* merged_input_value, const StoreNode* merged_store) const; )
+};
+
+StoreNode* MergePrimitiveArrayStores::run() {
+  // Check for B/S/C/I
+  int opc = _store->Opcode();
+  if (opc != Op_StoreB && opc != Op_StoreC && opc != Op_StoreI) {
+    return nullptr;
+  }
+
+  // Only merge stores on arrays, and the stores must have the same size as the elements.
+  const TypeAryPtr* aryptr_t = _store->adr_type()->isa_aryptr();
+  if (aryptr_t == nullptr ||
+      type2aelembytes(aryptr_t->elem()->array_element_basic_type()) != _store->memory_size()) {
+    return nullptr;
+  }
+
+  // The _store must be the "last" store in a chain. If we find a use we could merge with
+  // then that use or a store further down is the "last" store.
+ Status status_use = find_adjacent_use_store(_store); + if (status_use.found_store() != nullptr) { + return nullptr; + } + + // Check if we can merge with at least one def, so that we have at least 2 stores to merge. + Status status_def = find_adjacent_def_store(_store); + if (status_def.found_store() == nullptr) { + return nullptr; + } + + ResourceMark rm; + Node_List merge_list; + collect_merge_list(merge_list); + + Node* merged_input_value = make_merged_input_value(merge_list); + if (merged_input_value == nullptr) { return nullptr; } + + StoreNode* merged_store = make_merged_store(merge_list, merged_input_value); + + DEBUG_ONLY( if(TraceMergeStores) { trace(merge_list, merged_input_value, merged_store); } ) + + return merged_store; +} + +// Check compatibility between _store and other_store. +bool MergePrimitiveArrayStores::is_compatible_store(const StoreNode* other_store) const { + int opc = _store->Opcode(); + assert(opc == Op_StoreB || opc == Op_StoreC || opc == Op_StoreI, "precondition"); + assert(_store->adr_type()->isa_aryptr() != nullptr, "must be array store"); + + if (other_store == nullptr || + _store->Opcode() != other_store->Opcode() || + other_store->adr_type()->isa_aryptr() == nullptr) { + return false; + } + + // Check that the size of the stores, and the array elements are all the same. + const TypeAryPtr* aryptr_t1 = _store->adr_type()->is_aryptr(); + const TypeAryPtr* aryptr_t2 = other_store->adr_type()->is_aryptr(); + int size1 = type2aelembytes(aryptr_t1->elem()->array_element_basic_type()); + int size2 = type2aelembytes(aryptr_t2->elem()->array_element_basic_type()); + if (size1 != size2 || + size1 != _store->memory_size() || + _store->memory_size() != other_store->memory_size()) { + return false; + } + return true; +} + +bool MergePrimitiveArrayStores::is_adjacent_pair(const StoreNode* use_store, const StoreNode* def_store) const { + if (!is_adjacent_input_pair(def_store->in(MemNode::ValueIn), + use_store->in(MemNode::ValueIn), + def_store->memory_size())) { + return false; + } + + ResourceMark rm; + ArrayPointer array_pointer_use = ArrayPointer::make(_phase, use_store->in(MemNode::Address)); + ArrayPointer array_pointer_def = ArrayPointer::make(_phase, def_store->in(MemNode::Address)); + if (!array_pointer_def.is_adjacent_to_and_before(array_pointer_use, use_store->memory_size())) { + return false; + } + + return true; +} + +bool MergePrimitiveArrayStores::is_adjacent_input_pair(const Node* n1, const Node* n2, const int memory_size) const { + // Pattern: [n1 = ConI, n2 = ConI] + if (n1->Opcode() == Op_ConI) { + return n2->Opcode() == Op_ConI; + } + + // Pattern: [n1 = base >> shift, n2 = base >> (shift + memory_size)] + Node const* base_n2; + jint shift_n2; + if (!is_con_RShift(n2, base_n2, shift_n2)) { + return false; + } + if (n1->Opcode() == Op_ConvL2I) { + // look through + n1 = n1->in(1); + } + Node const* base_n1; + jint shift_n1; + if (n1 == base_n2) { + // n1 = base = base >> 0 + base_n1 = n1; + shift_n1 = 0; + } else if (!is_con_RShift(n1, base_n1, shift_n1)) { + return false; + } + int bits_per_store = memory_size * 8; + if (base_n1 != base_n2 || + shift_n1 + bits_per_store != shift_n2 || + shift_n1 % bits_per_store != 0) { + return false; + } + + // both load from same value with correct shift + return true; +} + +// Detect pattern: n = base_out >> shift_out +bool MergePrimitiveArrayStores::is_con_RShift(const Node* n, Node const*& base_out, jint& shift_out) { + assert(n != nullptr, "precondition"); + + int opc = n->Opcode(); + if (opc == Op_ConvL2I) { 
+ n = n->in(1); + opc = n->Opcode(); + } + + if ((opc == Op_RShiftI || + opc == Op_RShiftL || + opc == Op_URShiftI || + opc == Op_URShiftL) && + n->in(2)->is_ConI()) { + base_out = n->in(1); + shift_out = n->in(2)->get_int(); + assert(shift_out >= 0, "must be positive"); + return true; + } + return false; +} + +// Check if there is nothing between the two stores, except optionally a RangeCheck leading to an uncommon trap. +MergePrimitiveArrayStores::CFGStatus MergePrimitiveArrayStores::cfg_status_for_pair(const StoreNode* use_store, const StoreNode* def_store) { + assert(use_store->in(MemNode::Memory) == def_store, "use-def relationship"); + + Node* ctrl_use = use_store->in(MemNode::Control); + Node* ctrl_def = def_store->in(MemNode::Control); + if (ctrl_use == nullptr || ctrl_def == nullptr) { + return CFGStatus::Failure; + } + + if (ctrl_use == ctrl_def) { + // Same ctrl -> no RangeCheck in between. + // Check: use_store must be the only use of def_store. + if (def_store->outcnt() > 1) { + return CFGStatus::Failure; + } + return CFGStatus::SuccessNoRangeCheck; + } + + // Different ctrl -> could have RangeCheck in between. + // Check: 1. def_store only has these uses: use_store and MergeMem for uncommon trap, and + // 2. ctrl separated by RangeCheck. + if (def_store->outcnt() != 2) { + return CFGStatus::Failure; // Cannot have exactly these uses: use_store and MergeMem for uncommon trap. + } + int use_store_out_idx = def_store->raw_out(0) == use_store ? 0 : 1; + Node* merge_mem = def_store->raw_out(1 - use_store_out_idx)->isa_MergeMem(); + if (merge_mem == nullptr || + merge_mem->outcnt() != 1) { + return CFGStatus::Failure; // Does not have MergeMem for uncommon trap. + } + if (!ctrl_use->is_IfProj() || + !ctrl_use->in(0)->is_RangeCheck() || + ctrl_use->in(0)->outcnt() != 2) { + return CFGStatus::Failure; // Not RangeCheck. + } + ProjNode* other_proj = ctrl_use->as_IfProj()->other_if_proj(); + Node* trap = other_proj->is_uncommon_trap_proj(Deoptimization::Reason_range_check); + if (trap != merge_mem->unique_out() || + ctrl_use->in(0)->in(0) != ctrl_def) { + return CFGStatus::Failure; // Not RangeCheck with merge_mem leading to uncommon trap. 
+ } + + return CFGStatus::SuccessWithRangeCheck; +} + +MergePrimitiveArrayStores::Status MergePrimitiveArrayStores::find_adjacent_use_store(const StoreNode* def_store) const { + Status status_use = find_use_store(def_store); + StoreNode* use_store = status_use.found_store(); + if (use_store != nullptr && !is_adjacent_pair(use_store, def_store)) { + return Status::make_failure(); + } + return status_use; +} + +MergePrimitiveArrayStores::Status MergePrimitiveArrayStores::find_adjacent_def_store(const StoreNode* use_store) const { + Status status_def = find_def_store(use_store); + StoreNode* def_store = status_def.found_store(); + if (def_store != nullptr && !is_adjacent_pair(use_store, def_store)) { + return Status::make_failure(); + } + return status_def; +} + +MergePrimitiveArrayStores::Status MergePrimitiveArrayStores::find_use_store(const StoreNode* def_store) const { + Status status_use = find_use_store_unidirectional(def_store); + +#ifdef ASSERT + StoreNode* use_store = status_use.found_store(); + if (use_store != nullptr) { + Status status_def = find_def_store_unidirectional(use_store); + assert(status_def.found_store() == def_store && + status_def.found_range_check() == status_use.found_range_check(), + "find_use_store and find_def_store must be symmetric"); + } +#endif + + return status_use; +} + +MergePrimitiveArrayStores::Status MergePrimitiveArrayStores::find_def_store(const StoreNode* use_store) const { + Status status_def = find_def_store_unidirectional(use_store); + +#ifdef ASSERT + StoreNode* def_store = status_def.found_store(); + if (def_store != nullptr) { + Status status_use = find_use_store_unidirectional(def_store); + assert(status_use.found_store() == use_store && + status_use.found_range_check() == status_def.found_range_check(), + "find_use_store and find_def_store must be symmetric"); + } +#endif + + return status_def; +} + +MergePrimitiveArrayStores::Status MergePrimitiveArrayStores::find_use_store_unidirectional(const StoreNode* def_store) const { + assert(is_compatible_store(def_store), "precondition: must be compatible with _store"); + + for (DUIterator_Fast imax, i = def_store->fast_outs(imax); i < imax; i++) { + StoreNode* use_store = def_store->fast_out(i)->isa_Store(); + if (is_compatible_store(use_store)) { + return Status::make(use_store, cfg_status_for_pair(use_store, def_store)); + } + } + + return Status::make_failure(); +} + +MergePrimitiveArrayStores::Status MergePrimitiveArrayStores::find_def_store_unidirectional(const StoreNode* use_store) const { + assert(is_compatible_store(use_store), "precondition: must be compatible with _store"); + + StoreNode* def_store = use_store->in(MemNode::Memory)->isa_Store(); + if (!is_compatible_store(def_store)) { + return Status::make_failure(); + } + + return Status::make(def_store, cfg_status_for_pair(use_store, def_store)); +} + +void MergePrimitiveArrayStores::collect_merge_list(Node_List& merge_list) const { + // The merged store can be at most 8 bytes. + const uint merge_list_max_size = 8 / _store->memory_size(); + assert(merge_list_max_size >= 2 && + merge_list_max_size <= 8 && + is_power_of_2(merge_list_max_size), + "must be 2, 4 or 8"); + + // Traverse up the chain of adjacent def stores. + StoreNode* current = _store; + merge_list.push(current); + while (current != nullptr && merge_list.size() < merge_list_max_size) { + Status status = find_adjacent_def_store(current); + current = status.found_store(); + if (current != nullptr) { + merge_list.push(current); + + // We can have at most one RangeCheck. 
+      if (status.found_range_check()) {
+        break;
+      }
+    }
+  }
+
+  // Truncate the merge_list to a power of 2.
+  const uint pow2size = round_down_power_of_2(merge_list.size());
+  assert(pow2size >= 2, "must be merging at least 2 stores");
+  while (merge_list.size() > pow2size) { merge_list.pop(); }
+}
+
+// Merge the input values of the smaller stores to a single larger input value.
+Node* MergePrimitiveArrayStores::make_merged_input_value(const Node_List& merge_list) {
+  int new_memory_size = _store->memory_size() * merge_list.size();
+  Node* first = merge_list.at(merge_list.size()-1);
+  Node* merged_input_value = nullptr;
+  if (_store->in(MemNode::ValueIn)->Opcode() == Op_ConI) {
+    // Pattern: [ConI, ConI, ...] -> new constant
+    jlong con = 0;
+    jlong bits_per_store = _store->memory_size() * 8;
+    jlong mask = (((jlong)1) << bits_per_store) - 1;
+    for (uint i = 0; i < merge_list.size(); i++) {
+      jlong con_i = merge_list.at(i)->in(MemNode::ValueIn)->get_int();
+      con = con << bits_per_store;
+      con = con | (mask & con_i);
+    }
+    merged_input_value = _phase->longcon(con);
+  } else {
+    // Pattern: [base >> 24, base >> 16, base >> 8, base] -> base
+    //             |                                  |
+    //           _store                             first
+    //
+    merged_input_value = first->in(MemNode::ValueIn);
+    Node const* base_last;
+    jint shift_last;
+    bool is_true = is_con_RShift(_store->in(MemNode::ValueIn), base_last, shift_last);
+    assert(is_true, "must detect con RShift");
+    if (merged_input_value != base_last && merged_input_value->Opcode() == Op_ConvL2I) {
+      // look through
+      merged_input_value = merged_input_value->in(1);
+    }
+    if (merged_input_value != base_last) {
+      // merged_input_value is not the base
+      return nullptr;
+    }
+  }
+
+  if (_phase->type(merged_input_value)->isa_long() != nullptr && new_memory_size <= 4) {
+    // Example:
+    //
+    //   long base = ...;
+    //   a[0] = (byte)(base >> 0);
+    //   a[1] = (byte)(base >> 8);
+    //
+    merged_input_value = _phase->transform(new ConvL2INode(merged_input_value));
+  }
+
+  assert((_phase->type(merged_input_value)->isa_int()  != nullptr && new_memory_size <= 4) ||
+         (_phase->type(merged_input_value)->isa_long() != nullptr && new_memory_size == 8),
+         "merged_input_value is either int or long, and new_memory_size is small enough");
+
+  return merged_input_value;
+}
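The all-constant branch above packs the per-store constants into one wide little-endian constant; a standalone Java model of that folding loop (hypothetical names, for illustration only):

```java
public class PackConstants {
    // Model of make_merged_input_value for the all-constant case:
    // the merge list is visited from the last store down to the first, so the
    // first store's value ends up in the lowest bits (little-endian layout).
    static long pack(int[] values, int bytesPerStore) {
        long bitsPerStore = bytesPerStore * 8L;
        long mask = (1L << bitsPerStore) - 1;
        long con = 0;
        for (int v : values) {
            con = (con << bitsPerStore) | (v & mask);
        }
        return con;
    }

    public static void main(String[] args) {
        // merge_list order: last store first, as in collect_merge_list.
        int[] byteConstants = { 0xde, 0xad, 0xbe, 0xef }; // a[3], a[2], a[1], a[0]
        System.out.printf("%x%n", pack(byteConstants, 1)); // prints deadbeef
    }
}
```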
+
+// Schematically: before, the chain first_store (inputs first_ctrl, first_mem,
+// first_adr, v1), second_store (a2, v2), ..., last_store (= _store, an, vn) is
+// connected through the memory edges, with an optional RangeCheck (branching to
+// unc_trap) between first_store and second_store. After, a single merged_store
+// takes last_ctrl (after the optional RangeCheck), first_mem and first_adr, and
+// stores merged_input_value = [v1 v2 ... vn]; first_store is kept alive because
+// its memory state still feeds the uncommon trap path.
+//
+StoreNode* MergePrimitiveArrayStores::make_merged_store(const Node_List& merge_list, Node* merged_input_value) {
+  Node* first_store = merge_list.at(merge_list.size()-1);
+  Node* last_ctrl   = _store->in(MemNode::Control); // after (optional) RangeCheck
+  Node* first_mem   = first_store->in(MemNode::Memory);
+  Node* first_adr   = first_store->in(MemNode::Address);
+
+  const TypePtr* new_adr_type = _store->adr_type();
+
+  int new_memory_size = _store->memory_size() * merge_list.size();
+  BasicType bt = T_ILLEGAL;
+  switch (new_memory_size) {
+    case 2: bt = T_SHORT; break;
+    case 4: bt = T_INT;   break;
+    case 8: bt = T_LONG;  break;
+  }
+
+  StoreNode* merged_store = StoreNode::make(*_phase, last_ctrl, first_mem, first_adr,
+                                            new_adr_type, merged_input_value, bt, MemNode::unordered);
+
+  // Marking the store mismatched is sufficient to prevent reordering, since array stores
+  // are all on the same slice. Hence, we need no barriers.
+  merged_store->set_mismatched_access();
+
+  // Constants above may now also be packed -> put candidate on worklist
+  _phase->is_IterGVN()->_worklist.push(first_mem);
+
+  return merged_store;
+}
+
+#ifdef ASSERT
+void MergePrimitiveArrayStores::trace(const Node_List& merge_list, const Node* merged_input_value, const StoreNode* merged_store) const {
+  stringStream ss;
+  ss.print_cr("[TraceMergeStores]: Replace");
+  for (int i = (int)merge_list.size() - 1; i >= 0; i--) {
+    merge_list.at(i)->dump("\n", false, &ss);
+  }
+  ss.print_cr("[TraceMergeStores]: with");
+  merged_input_value->dump("\n", false, &ss);
+  merged_store->dump("\n", false, &ss);
+  tty->print("%s", ss.as_string());
+}
+#endif
+
 //------------------------------Ideal------------------------------------------
 // Change back-to-back Store(, p, x) -> Store(m, p, y) to Store(m, p, x).
// When a store immediately follows a relevant allocation/initialization, @@ -2770,6 +3447,18 @@ Node *StoreNode::Ideal(PhaseGVN *phase, bool can_reshape) { } } +#ifdef VM_LITTLE_ENDIAN + if (MergeStores && UseUnalignedAccesses) { + if (phase->C->post_loop_opts_phase()) { + MergePrimitiveArrayStores merge(phase, this); + Node* progress = merge.run(); + if (progress != nullptr) { return progress; } + } else { + phase->C->record_for_post_loop_opts_igvn(this); + } + } +#endif + return nullptr; // No further progress } diff --git a/src/hotspot/share/opto/phaseX.cpp b/src/hotspot/share/opto/phaseX.cpp index c791146f75769..1a1e4e04e16e1 100644 --- a/src/hotspot/share/opto/phaseX.cpp +++ b/src/hotspot/share/opto/phaseX.cpp @@ -2273,7 +2273,15 @@ void PhasePeephole::print_statistics() { //------------------------------set_req_X-------------------------------------- void Node::set_req_X( uint i, Node *n, PhaseIterGVN *igvn ) { assert( is_not_dead(n), "can not use dead node"); - assert( igvn->hash_find(this) != this, "Need to remove from hash before changing edges" ); +#ifdef ASSERT + if (igvn->hash_find(this) == this) { + tty->print_cr("Need to remove from hash before changing edges"); + this->dump(1); + tty->print_cr("Set at i = %d", i); + n->dump(); + assert(false, "Need to remove from hash before changing edges"); + } +#endif Node *old = in(i); set_req(i, n); diff --git a/test/hotspot/jtreg/compiler/c2/TestMergeStores.java b/test/hotspot/jtreg/compiler/c2/TestMergeStores.java new file mode 100644 index 0000000000000..0af46b56a56c0 --- /dev/null +++ b/test/hotspot/jtreg/compiler/c2/TestMergeStores.java @@ -0,0 +1,1247 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */
+
+package compiler.c2;
+
+import compiler.lib.ir_framework.*;
+import jdk.test.lib.Utils;
+import jdk.internal.misc.Unsafe;
+import java.lang.reflect.Array;
+import java.util.Map;
+import java.util.HashMap;
+import java.util.Random;
+
+/*
+ * @test
+ * @bug 8318446
+ * @summary Test merging of consecutive stores
+ * @modules java.base/jdk.internal.misc
+ * @library /test/lib /
+ * @run main compiler.c2.TestMergeStores aligned
+ */
+
+/*
+ * @test
+ * @bug 8318446
+ * @summary Test merging of consecutive stores
+ * @modules java.base/jdk.internal.misc
+ * @library /test/lib /
+ * @run main compiler.c2.TestMergeStores unaligned
+ */
+
+public class TestMergeStores {
+    static int RANGE = 1000;
+    private static final Unsafe UNSAFE = Unsafe.getUnsafe();
+    private static final Random RANDOM = Utils.getRandomInstance();
+
+    // Inputs
+    byte[] aB = new byte[RANGE];
+    byte[] bB = new byte[RANGE];
+    short[] aS = new short[RANGE];
+    short[] bS = new short[RANGE];
+    int[] aI = new int[RANGE];
+    int[] bI = new int[RANGE];
+    long[] aL = new long[RANGE];
+    long[] bL = new long[RANGE];
+
+    int offset1;
+    int offset2;
+    byte vB1;
+    byte vB2;
+    short vS1;
+    short vS2;
+    int vI1;
+    int vI2;
+    long vL1;
+    long vL2;
+
+    interface TestFunction {
+        Object[] run(boolean isWarmUp, int rnd);
+    }
+
+    Map<String, Map<String, TestFunction>> testGroups = new HashMap<String, Map<String, TestFunction>>();
+
+    public static void main(String[] args) {
+        TestFramework framework = new TestFramework(TestMergeStores.class);
+        framework.addFlags("--add-modules", "java.base", "--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED");
+
+        switch (args[0]) {
+            case "aligned"   -> { framework.addFlags("-XX:-UseUnalignedAccesses"); }
+            case "unaligned" -> { framework.addFlags("-XX:+UseUnalignedAccesses"); }
+            default -> { throw new RuntimeException("Test argument not recognized: " + args[0]); }
+        }
+        framework.start();
+    }
+
+    public TestMergeStores() {
+        testGroups.put("test1", new HashMap<String,TestFunction>());
+        testGroups.get("test1").put("test1R", (_,_) -> { return test1R(aB.clone()); });
+        testGroups.get("test1").put("test1a", (_,_) -> { return test1a(aB.clone()); });
+        testGroups.get("test1").put("test1b", (_,_) -> { return test1b(aB.clone()); });
+        testGroups.get("test1").put("test1c", (_,_) -> { return test1c(aB.clone()); });
+        testGroups.get("test1").put("test1d", (_,_) -> { return test1d(aB.clone()); });
+        testGroups.get("test1").put("test1e", (_,_) -> { return test1e(aB.clone()); });
+        testGroups.get("test1").put("test1f", (_,_) -> { return test1f(aB.clone()); });
+        testGroups.get("test1").put("test1g", (_,_) -> { return test1g(aB.clone()); });
+        testGroups.get("test1").put("test1h", (_,_) -> { return test1h(aB.clone()); });
+        testGroups.get("test1").put("test1i", (_,_) -> { return test1i(aB.clone()); });
+
+        testGroups.put("test2", new HashMap<String,TestFunction>());
+        testGroups.get("test2").put("test2R", (_,_) -> { return test2R(aB.clone(), offset1, vL1); });
+        testGroups.get("test2").put("test2a", (_,_) -> { return test2a(aB.clone(), offset1, vL1); });
+        testGroups.get("test2").put("test2b", (_,_) -> { return test2b(aB.clone(), offset1, vL1); });
+        testGroups.get("test2").put("test2c", (_,_) -> { return test2c(aB.clone(), offset1, vL1); });
+        testGroups.get("test2").put("test2d", (_,_) -> { return test2d(aB.clone(), offset1, vL1); });
+        testGroups.get("test2").put("test2e", (_,_) -> { return test2e(aB.clone(), offset1, vL1); });
+
+        testGroups.put("test3", new HashMap<String,TestFunction>());
+        testGroups.get("test3").put("test3R", (_,_) -> { return test3R(aB.clone(), offset1, vL1); });
+        testGroups.get("test3").put("test3a", (_,_) -> { return test3a(aB.clone(), offset1, vL1); });
+
+        testGroups.put("test4", new HashMap<String,TestFunction>());
+        testGroups.get("test4").put("test4R", (_,_) -> { return test4R(aB.clone(), offset1, vL1, vI1, vS1, vB1); });
+        testGroups.get("test4").put("test4a", (_,_) -> { return test4a(aB.clone(), offset1, vL1, vI1, vS1, vB1); });
+
+        testGroups.put("test5", new HashMap<String,TestFunction>());
+        testGroups.get("test5").put("test5R", (_,_) -> { return test5R(aB.clone(), offset1); });
+        testGroups.get("test5").put("test5a", (_,_) -> { return test5a(aB.clone(), offset1); });
+
+        testGroups.put("test6", new HashMap<String,TestFunction>());
+        testGroups.get("test6").put("test6R", (_,_) -> { return test6R(aB.clone(), bB.clone(), offset1, offset2); });
+        testGroups.get("test6").put("test6a", (_,_) -> { return test6a(aB.clone(), bB.clone(), offset1, offset2); });
+
+        testGroups.put("test7", new HashMap<String,TestFunction>());
+        testGroups.get("test7").put("test7R", (_,_) -> { return test7R(aB.clone(), offset1, vI1); });
+        testGroups.get("test7").put("test7a", (_,_) -> { return test7a(aB.clone(), offset1, vI1); });
+
+        testGroups.put("test100", new HashMap<String,TestFunction>());
+        testGroups.get("test100").put("test100R", (_,_) -> { return test100R(aS.clone(), offset1); });
+        testGroups.get("test100").put("test100a", (_,_) -> { return test100a(aS.clone(), offset1); });
+
+        testGroups.put("test101", new HashMap<String,TestFunction>());
+        testGroups.get("test101").put("test101R", (_,_) -> { return test101R(aS.clone(), offset1); });
+        testGroups.get("test101").put("test101a", (_,_) -> { return test101a(aS.clone(), offset1); });
+
+        testGroups.put("test102", new HashMap<String,TestFunction>());
+        testGroups.get("test102").put("test102R", (_,_) -> { return test102R(aS.clone(), offset1, vL1, vI1, vS1); });
+        testGroups.get("test102").put("test102a", (_,_) -> { return test102a(aS.clone(), offset1, vL1, vI1, vS1); });
+
+        testGroups.put("test200", new HashMap<String,TestFunction>());
+        testGroups.get("test200").put("test200R", (_,_) -> { return test200R(aI.clone(), offset1); });
+        testGroups.get("test200").put("test200a", (_,_) -> { return test200a(aI.clone(), offset1); });
+
+        testGroups.put("test201", new HashMap<String,TestFunction>());
+        testGroups.get("test201").put("test201R", (_,_) -> { return test201R(aI.clone(), offset1); });
+        testGroups.get("test201").put("test201a", (_,_) -> { return test201a(aI.clone(), offset1); });
+
+        testGroups.put("test202", new HashMap<String,TestFunction>());
+        testGroups.get("test202").put("test202R", (_,_) -> { return test202R(aI.clone(), offset1, vL1, vI1); });
+        testGroups.get("test202").put("test202a", (_,_) -> { return test202a(aI.clone(), offset1, vL1, vI1); });
+
+        testGroups.put("test300", new HashMap<String,TestFunction>());
+        testGroups.get("test300").put("test300R", (_,_) -> { return test300R(aI.clone()); });
+        testGroups.get("test300").put("test300a", (_,_) -> { return test300a(aI.clone()); });
+
+        testGroups.put("test400", new HashMap<String,TestFunction>());
+        testGroups.get("test400").put("test400R", (_,_) -> { return test400R(aI.clone()); });
+        testGroups.get("test400").put("test400a", (_,_) -> { return test400a(aI.clone()); });
+
+        testGroups.put("test500", new HashMap<String,TestFunction>());
+        testGroups.get("test500").put("test500R", (_,_) -> { return test500R(aB.clone(), offset1, vL1); });
+        testGroups.get("test500").put("test500a", (_,_) -> { return test500a(aB.clone(), offset1, vL1); });
+
+        testGroups.put("test501", new HashMap<String,TestFunction>());
+        testGroups.get("test501").put("test500R", (_,i) -> { return test500R(aB.clone(), RANGE - 20 + (i % 30), vL1); });
+        testGroups.get("test501").put("test501a", (_,i) -> { return test501a(aB.clone(), RANGE - 20 + (i % 30), vL1); });
+        //                                                                  +-------------------+
+        // Create offsets that are sometimes going to pass all RangeChecks, and sometimes one, and sometimes none.
+        // Consequence: all RangeChecks stay in the final compilation.
+
+        testGroups.put("test502", new HashMap<String,TestFunction>());
+        testGroups.get("test502").put("test500R", (w,i) -> { return test500R(aB.clone(), w ? offset1 : RANGE - 20 + (i % 30), vL1); });
+        testGroups.get("test502").put("test502a", (w,i) -> { return test502a(aB.clone(), w ? offset1 : RANGE - 20 + (i % 30), vL1); });
+        //                                                                  +-----+             +-------------------+
+        // First use something in range, and after warmup randomize going outside the range.
+        // Consequence: all RangeChecks stay in the final compilation.
+    }
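The (isWarmUp, rnd) parameters of TestFunction let a group vary its inputs between warmup and measurement, which is how test501/test502 above keep their RangeChecks alive; a minimal standalone sketch of the idea (hypothetical names, no IR framework):

```java
public class WarmupVariation {
    interface TestFunction {
        Object[] run(boolean isWarmUp, int rnd);
    }

    public static void main(String[] args) {
        // During warmup stay in bounds; afterwards, sometimes go out of
        // bounds so the RangeChecks cannot be removed based on profiling.
        TestFunction f = (isWarmUp, rnd) -> {
            int offset = isWarmUp ? 0 : 1000 - 20 + (rnd % 30);
            byte[] a = new byte[1000];
            try {
                a[offset] = 42;
            } catch (ArrayIndexOutOfBoundsException e) {
                // expected for some post-warmup offsets
            }
            return new Object[]{ a };
        };
        f.run(true, 0);   // warmup: always in range
        f.run(false, 25); // measurement: may be out of range
    }
}
```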
+
+    @Warmup(100)
+    @Run(test = {"test1a",
+                 "test1b",
+                 "test1c",
+                 "test1d",
+                 "test1e",
+                 "test1f",
+                 "test1g",
+                 "test1h",
+                 "test1i",
+                 "test2a",
+                 "test2b",
+                 "test2c",
+                 "test2d",
+                 "test2e",
+                 "test3a",
+                 "test4a",
+                 "test5a",
+                 "test6a",
+                 "test7a",
+                 "test100a",
+                 "test101a",
+                 "test102a",
+                 "test200a",
+                 "test201a",
+                 "test202a",
+                 "test300a",
+                 "test400a",
+                 "test500a",
+                 "test501a",
+                 "test502a"})
+    public void runTests(RunInfo info) {
+        // Repeat many times, so that we also have multiple iterations for post-warmup to potentially recompile
+        int iters = info.isWarmUp() ? 1_000 : 50_000;
+        for (int iter = 0; iter < iters; iter++) {
+            // Write random values to inputs
+            set_random(aB);
+            set_random(bB);
+            set_random(aS);
+            set_random(bS);
+            set_random(aI);
+            set_random(bI);
+            set_random(aL);
+            set_random(bL);
+
+            offset1 = Math.abs(RANDOM.nextInt()) % 100;
+            offset2 = Math.abs(RANDOM.nextInt()) % 100;
+            vB1 = (byte)RANDOM.nextInt();
+            vB2 = (byte)RANDOM.nextInt();
+            vS1 = (short)RANDOM.nextInt();
+            vS2 = (short)RANDOM.nextInt();
+            vI1 = RANDOM.nextInt();
+            vI2 = RANDOM.nextInt();
+            vL1 = RANDOM.nextLong();
+            vL2 = RANDOM.nextLong();
+
+            // Run all tests
+            for (Map.Entry<String, Map<String, TestFunction>> group_entry : testGroups.entrySet()) {
+                String group_name = group_entry.getKey();
+                Map<String, TestFunction> group = group_entry.getValue();
+                Object[] gold = null;
+                String gold_name = "NONE";
+                for (Map.Entry<String, TestFunction> entry : group.entrySet()) {
+                    String name = entry.getKey();
+                    TestFunction test = entry.getValue();
+                    Object[] result = test.run(info.isWarmUp(), iter);
+                    if (gold == null) {
+                        gold = result;
+                        gold_name = name;
+                    } else {
+                        verify("group " + group_name + ", gold " + gold_name + ", test " + name, gold, result);
+                    }
+                }
+            }
+        }
+    }
+
+    static void verify(String name, Object[] gold, Object[] result) {
+        if (gold.length != result.length) {
+            throw new RuntimeException("verify " + name + ": not the same number of outputs: gold.length = " +
+                                       gold.length + ", result.length = " + result.length);
+        }
+        for (int i = 0; i < gold.length; i++) {
+            Object g = gold[i];
+            Object r = result[i];
+            if (g.getClass() != r.getClass() || !g.getClass().isArray() || !r.getClass().isArray()) {
+                throw new RuntimeException("verify " + name + ": must both be array of same type:" +
+                                           " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
+                                           " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
+            }
+            if (g == r) {
+                throw new RuntimeException("verify " + name + ": should be two separate arrays (with identical content):" +
+                                           " gold[" + i + "] == result[" + i + "]");
+            }
+            if (Array.getLength(g) != Array.getLength(r)) {
+                throw new RuntimeException("verify " + name + ": arrays must have same length:" +
+                                           " gold[" + i + "].length = " + Array.getLength(g) +
+                                           " result[" + i + "].length = " + Array.getLength(r));
+            }
+            Class<?> c = g.getClass().getComponentType();
+            if (c == byte.class) {
+                verifyB(name,
i, (byte[])g, (byte[])r); + } else if (c == short.class) { + verifyS(name, i, (short[])g, (short[])r); + } else if (c == int.class) { + verifyI(name, i, (int[])g, (int[])r); + } else if (c == long.class) { + verifyL(name, i, (long[])g, (long[])r); + } else { + throw new RuntimeException("verify " + name + ": array type not supported for verify:" + + " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() + + " result[" + i + "].getClass() = " + r.getClass().getSimpleName()); + } + } + } + + static void verifyB(String name, int i, byte[] g, byte[] r) { + for (int j = 0; j < g.length; j++) { + if (g[j] != r[j]) { + throw new RuntimeException("verify " + name + ": arrays must have same content:" + + " gold[" + i + "][" + j + "] = " + g[j] + + " = " + String.format("%02X", g[j] & 0xFF) + + " result[" + i + "][" + j + "] = " + r[j] + + " = " + String.format("%02X", r[j] & 0xFF)); + } + } + } + + static void verifyS(String name, int i, short[] g, short[] r) { + for (int j = 0; j < g.length; j++) { + if (g[j] != r[j]) { + throw new RuntimeException("verify " + name + ": arrays must have same content:" + + " gold[" + i + "][" + j + "] = " + g[j] + + " result[" + i + "][" + j + "] = " + r[j]); + } + } + } + + static void verifyI(String name, int i, int[] g, int[] r) { + for (int j = 0; j < g.length; j++) { + if (g[j] != r[j]) { + throw new RuntimeException("verify " + name + ": arrays must have same content:" + + " gold[" + i + "][" + j + "] = " + g[j] + + " result[" + i + "][" + j + "] = " + r[j]); + } + } + } + + static void verifyL(String name, int i, long[] g, long[] r) { + for (int j = 0; j < g.length; j++) { + if (g[j] != r[j]) { + throw new RuntimeException("verify " + name + ": arrays must have same content:" + + " gold[" + i + "][" + j + "] = " + g[j] + + " result[" + i + "][" + j + "] = " + r[j]); + } + } + } + + static void set_random(byte[] a) { + for (int i = 0; i < a.length; i++) { + a[i] = (byte)RANDOM.nextInt(); + } + } + + static void set_random(short[] a) { + for (int i = 0; i < a.length; i++) { + a[i] = (short)RANDOM.nextInt(); + } + } + + static void set_random(int[] a) { + for (int i = 0; i < a.length; i++) { + a[i] = RANDOM.nextInt(); + } + } + + static void set_random(long[] a) { + for (int i = 0; i < a.length; i++) { + a[i] = RANDOM.nextLong(); + } + } + + // ------------------------------------------- + // ------- Little-Endian API ---------- + // ------------------------------------------- + // Note: I had to add @ForceInline because otherwise it would sometimes + // not inline nested method calls. 
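As an aside (not part of the patch): the same little-endian writes can be expressed with a byte-array view VarHandle, a standard JDK API that already JIT-compiles to a single wide store; the new pass aims to give hand-written byte-at-a-time helpers, like the ones below, the same code quality:

```java
import java.lang.invoke.MethodHandles;
import java.lang.invoke.VarHandle;
import java.nio.ByteOrder;

public class LEWriteSketch {
    // Little-endian int view over a byte[] (core JDK API).
    static final VarHandle INT_LE =
            MethodHandles.byteArrayViewVarHandle(int[].class, ByteOrder.LITTLE_ENDIAN);

    static void storeIntLE(byte[] bytes, int offset, int value) {
        INT_LE.set(bytes, offset, value); // one 4-byte store after JIT compilation
    }

    public static void main(String[] args) {
        byte[] a = new byte[8];
        storeIntLE(a, 0, 0xdeadbeef);
        System.out.printf("%02x %02x %02x %02x%n",
                          a[0] & 0xff, a[1] & 0xff, a[2] & 0xff, a[3] & 0xff);
    }
}
```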
+
+    // Store a short LE into an array using store bytes in an array
+    @ForceInline
+    static void storeShortLE(byte[] bytes, int offset, short value) {
+        storeBytes(bytes, offset, (byte)(value >> 0),
+                                  (byte)(value >> 8));
+    }
+
+    // Store an int LE into an array using store bytes in an array
+    @ForceInline
+    static void storeIntLE(byte[] bytes, int offset, int value) {
+        storeBytes(bytes, offset, (byte)(value >> 0 ),
+                                  (byte)(value >> 8 ),
+                                  (byte)(value >> 16),
+                                  (byte)(value >> 24));
+    }
+
+    // Store a long LE into an array using store bytes in an array
+    @ForceInline
+    static void storeLongLE(byte[] bytes, int offset, long value) {
+        storeBytes(bytes, offset, (byte)(value >> 0 ),
+                                  (byte)(value >> 8 ),
+                                  (byte)(value >> 16),
+                                  (byte)(value >> 24),
+                                  (byte)(value >> 32),
+                                  (byte)(value >> 40),
+                                  (byte)(value >> 48),
+                                  (byte)(value >> 56));
+    }
+
+    // Store 2 bytes into an array
+    @ForceInline
+    static void storeBytes(byte[] bytes, int offset, byte b0, byte b1) {
+        bytes[offset + 0] = b0;
+        bytes[offset + 1] = b1;
+    }
+
+    // Store 4 bytes into an array
+    @ForceInline
+    static void storeBytes(byte[] bytes, int offset, byte b0, byte b1, byte b2, byte b3) {
+        bytes[offset + 0] = b0;
+        bytes[offset + 1] = b1;
+        bytes[offset + 2] = b2;
+        bytes[offset + 3] = b3;
+    }
+
+    // Store 8 bytes into an array
+    @ForceInline
+    static void storeBytes(byte[] bytes, int offset, byte b0, byte b1, byte b2, byte b3,
+                                                     byte b4, byte b5, byte b6, byte b7) {
+        bytes[offset + 0] = b0;
+        bytes[offset + 1] = b1;
+        bytes[offset + 2] = b2;
+        bytes[offset + 3] = b3;
+        bytes[offset + 4] = b4;
+        bytes[offset + 5] = b5;
+        bytes[offset + 6] = b6;
+        bytes[offset + 7] = b7;
+    }
+
+    @DontCompile
+    static Object[] test1R(byte[] a) {
+        a[0] = (byte)0xbe;
+        a[1] = (byte)0xba;
+        a[2] = (byte)0xad;
+        a[3] = (byte)0xba;
+        a[4] = (byte)0xef;
+        a[5] = (byte)0xbe;
+        a[6] = (byte)0xad;
+        a[7] = (byte)0xde;
+        return new Object[]{ a };
+    }
+
+    @Test
+    @IR(counts = {IRNode.STORE_L_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "1"},
+        applyIf = {"UseUnalignedAccesses", "true"})
+    static Object[] test1a(byte[] a) {
+        a[0] = (byte)0xbe;
+        a[1] = (byte)0xba;
+        a[2] = (byte)0xad;
+        a[3] = (byte)0xba;
+        a[4] = (byte)0xef;
+        a[5] = (byte)0xbe;
+        a[6] = (byte)0xad;
+        a[7] = (byte)0xde;
+        return new Object[]{ a };
+    }
+
+    @Test
+    @IR(counts = {IRNode.STORE_L_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "1"},
+        applyIf = {"UseUnalignedAccesses", "true"})
+    static Object[] test1b(byte[] a) {
+        // Add custom null check, to ensure the unsafe access always recognizes its type as an array store
+        if (a == null) {return null;}
+        UNSAFE.putLongUnaligned(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET, 0xdeadbeefbaadbabeL);
+        return new Object[]{ a };
+    }
+
+    @Test
+    @IR(counts = {IRNode.STORE_L_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "1"},
+        applyIf = {"UseUnalignedAccesses", "true"})
+    static Object[] test1c(byte[] a) {
+        storeLongLE(a, 0, 0xdeadbeefbaadbabeL);
+        return new Object[]{ a };
+    }
+
+    @Test
+    @IR(counts = {IRNode.STORE_L_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "1"},
+        applyIf = {"UseUnalignedAccesses", "true"})
+    static Object[] test1d(byte[] a) {
+        storeIntLE(a, 0, 0xbaadbabe);
+        storeIntLE(a, 4, 0xdeadbeef);
+        return new Object[]{ a };
+    }
{"UseUnalignedAccesses", "true"}) + static Object[] test1e(byte[] a) { + storeShortLE(a, 0, (short)0xbabe); + storeShortLE(a, 2, (short)0xbaad); + storeShortLE(a, 4, (short)0xbeef); + storeShortLE(a, 6, (short)0xdead); + return new Object[]{ a }; + } + + @Test + @IR(counts = {IRNode.STORE_L_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "1"}, + applyIf = {"UseUnalignedAccesses", "true"}) + static Object[] test1f(byte[] a) { + UNSAFE.putByte(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 0, (byte)0xbe); + UNSAFE.putByte(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 1, (byte)0xba); + UNSAFE.putByte(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 2, (byte)0xad); + UNSAFE.putByte(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 3, (byte)0xba); + UNSAFE.putByte(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 4, (byte)0xef); + UNSAFE.putByte(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 5, (byte)0xbe); + UNSAFE.putByte(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 6, (byte)0xad); + UNSAFE.putByte(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 7, (byte)0xde); + return new Object[]{ a }; + } + + @Test + // Do not optimize these, just to be sure we do not mess with store ordering. + @IR(counts = {IRNode.STORE_L_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_B_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "8"}, + applyIf = {"UseUnalignedAccesses", "true"}) + static Object[] test1g(byte[] a) { + UNSAFE.putByteRelease(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 0, (byte)0xbe); + UNSAFE.putByteRelease(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 1, (byte)0xba); + UNSAFE.putByteRelease(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 2, (byte)0xad); + UNSAFE.putByteRelease(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 3, (byte)0xba); + UNSAFE.putByteRelease(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 4, (byte)0xef); + UNSAFE.putByteRelease(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 5, (byte)0xbe); + UNSAFE.putByteRelease(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 6, (byte)0xad); + UNSAFE.putByteRelease(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 7, (byte)0xde); + return new Object[]{ a }; + } + + @Test + // Do not optimize these, just to be sure we do not mess with store ordering. + @IR(counts = {IRNode.STORE_L_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_B_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "8"}, + applyIf = {"UseUnalignedAccesses", "true"}) + static Object[] test1h(byte[] a) { + UNSAFE.putByteVolatile(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 0, (byte)0xbe); + UNSAFE.putByteVolatile(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 1, (byte)0xba); + UNSAFE.putByteVolatile(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 2, (byte)0xad); + UNSAFE.putByteVolatile(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 3, (byte)0xba); + UNSAFE.putByteVolatile(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 4, (byte)0xef); + UNSAFE.putByteVolatile(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 5, (byte)0xbe); + UNSAFE.putByteVolatile(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 6, (byte)0xad); + UNSAFE.putByteVolatile(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 7, (byte)0xde); + return new Object[]{ a }; + } + + @Test + // Do not optimize these, just to be sure we do not mess with store ordering. 
+
+    @Test
+    // Do not optimize these, just to be sure we do not mess with store ordering.
+    @IR(counts = {IRNode.STORE_L_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "0",
+                  IRNode.STORE_B_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "8"},
+        applyIf = {"UseUnalignedAccesses", "true"})
+    static Object[] test1i(byte[] a) {
+        UNSAFE.putByteOpaque(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 0, (byte)0xbe);
+        UNSAFE.putByteOpaque(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 1, (byte)0xba);
+        UNSAFE.putByteOpaque(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 2, (byte)0xad);
+        UNSAFE.putByteOpaque(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 3, (byte)0xba);
+        UNSAFE.putByteOpaque(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 4, (byte)0xef);
+        UNSAFE.putByteOpaque(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 5, (byte)0xbe);
+        UNSAFE.putByteOpaque(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 6, (byte)0xad);
+        UNSAFE.putByteOpaque(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 7, (byte)0xde);
+        return new Object[]{ a };
+    }
+
+    @DontCompile
+    static Object[] test2R(byte[] a, int offset, long v) {
+        a[offset + 0] = (byte)(v >> 0);
+        a[offset + 1] = (byte)(v >> 8);
+        a[offset + 2] = (byte)(v >> 16);
+        a[offset + 3] = (byte)(v >> 24);
+        a[offset + 4] = (byte)(v >> 32);
+        a[offset + 5] = (byte)(v >> 40);
+        a[offset + 6] = (byte)(v >> 48);
+        a[offset + 7] = (byte)(v >> 56);
+        return new Object[]{ a };
+    }
+
+    @Test
+    @IR(counts = {IRNode.STORE_L_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "1"},
+        applyIf = {"UseUnalignedAccesses", "true"})
+    static Object[] test2a(byte[] a, int offset, long v) {
+        a[offset + 0] = (byte)(v >> 0);
+        a[offset + 1] = (byte)(v >> 8);
+        a[offset + 2] = (byte)(v >> 16);
+        a[offset + 3] = (byte)(v >> 24);
+        a[offset + 4] = (byte)(v >> 32);
+        a[offset + 5] = (byte)(v >> 40);
+        a[offset + 6] = (byte)(v >> 48);
+        a[offset + 7] = (byte)(v >> 56);
+        return new Object[]{ a };
+    }
+
+    @Test
+    @IR(counts = {IRNode.STORE_L_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "1"},
+        applyIf = {"UseUnalignedAccesses", "true"})
+    static Object[] test2b(byte[] a, int offset, long v) {
+        // Add custom null check, to ensure the unsafe access always recognizes its type as an array store
+        if (a == null) {return null;}
+        UNSAFE.putLongUnaligned(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + offset, v);
+        return new Object[]{ a };
+    }
+
+    @Test
+    @IR(counts = {IRNode.STORE_L_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "1"},
+        applyIf = {"UseUnalignedAccesses", "true"})
+    static Object[] test2c(byte[] a, int offset, long v) {
+        storeLongLE(a, offset, v);
+        return new Object[]{ a };
+    }
+
+    @Test
+    // No optimization, casting long -> int -> byte does not work
+    static Object[] test2d(byte[] a, int offset, long v) {
+        storeIntLE(a, offset + 0, (int)(v >> 0));
+        storeIntLE(a, offset + 4, (int)(v >> 32));
+        return new Object[]{ a };
+    }
+
+    @Test
+    // No optimization, casting long -> short -> byte does not work
+    static Object[] test2e(byte[] a, int offset, long v) {
+        storeShortLE(a, offset + 0, (short)(v >> 0));
+        storeShortLE(a, offset + 2, (short)(v >> 16));
+        storeShortLE(a, offset + 4, (short)(v >> 32));
+        storeShortLE(a, offset + 6, (short)(v >> 48));
+        return new Object[]{ a };
+    }
+
+    @DontCompile
+    static Object[] test3R(byte[] a, int offset, long v) {
+        a[offset + 0] = (byte)(v >> 0);
+        a[offset + 1] = (byte)(v >> 8);
+        a[offset + 2] = (byte)(v >> 16);
+        a[offset + 3] = (byte)(v >> 24);
+        a[offset + 4] = (byte)(v >> 0);
+        a[offset + 5] = (byte)(v >> 8);
+        a[offset + 6] = (byte)(v >> 16);
+        a[offset + 7] = (byte)(v >> 24);
+        return new Object[]{ a };
+    }
+
+    @Test
+    @IR(counts = {IRNode.STORE_I_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "2"},
+        applyIf = {"UseUnalignedAccesses", "true"})
+    static Object[] test3a(byte[] a, int offset, long v) {
+        a[offset + 0] = (byte)(v >> 0);
+        a[offset + 1] = (byte)(v >> 8);
+        a[offset + 2] = (byte)(v >> 16);
+        a[offset + 3] = (byte)(v >> 24);
+        a[offset + 4] = (byte)(v >> 0);
+        a[offset + 5] = (byte)(v >> 8);
+        a[offset + 6] = (byte)(v >> 16);
+        a[offset + 7] = (byte)(v >> 24);
+        return new Object[]{ a };
+    }
+
+    @DontCompile
+    static Object[] test4R(byte[] a, int offset, long v1, int v2, short v3, byte v4) {
+        a[offset + 0] = (byte)0x00;
+        a[offset + 1] = (byte)0xFF;
+        a[offset + 2] = v4;
+        a[offset + 3] = (byte)0x42;
+        a[offset + 4] = (byte)(v1 >> 0);
+        a[offset + 5] = (byte)(v1 >> 8);
+        a[offset + 6] = (byte)0xAB;
+        a[offset + 7] = (byte)0xCD;
+        a[offset + 8] = (byte)0xEF;
+        a[offset + 9] = (byte)0x01;
+        a[offset + 10] = (byte)(v2 >> 0);
+        a[offset + 11] = (byte)(v2 >> 8);
+        a[offset + 12] = (byte)(v2 >> 16);
+        a[offset + 13] = (byte)(v2 >> 24);
+        a[offset + 14] = (byte)(v3 >> 0);
+        a[offset + 15] = (byte)(v3 >> 8);
+        a[offset + 16] = (byte)0xEF;
+        return new Object[]{ a };
+    }
+
+    @Test
+    @IR(counts = {IRNode.STORE_B_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "4", // 3 (+ 1 for uncommon trap)
+                  IRNode.STORE_C_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "3",
+                  IRNode.STORE_I_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "2",
+                  IRNode.STORE_L_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "0"},
+        applyIf = {"UseUnalignedAccesses", "true"})
+    static Object[] test4a(byte[] a, int offset, long v1, int v2, short v3, byte v4) {
+        a[offset + 0] = (byte)0x00; // individual store expected to go into state of RC
+        a[offset + 1] = (byte)0xFF;
+        a[offset + 2] = v4;
+        a[offset + 3] = (byte)0x42;
+        a[offset + 4] = (byte)(v1 >> 0);
+        a[offset + 5] = (byte)(v1 >> 8);
+        a[offset + 6] = (byte)0xAB;
+        a[offset + 7] = (byte)0xCD;
+        a[offset + 8] = (byte)0xEF;
+        a[offset + 9] = (byte)0x01;
+        a[offset + 10] = (byte)(v2 >> 0);
+        a[offset + 11] = (byte)(v2 >> 8);
+        a[offset + 12] = (byte)(v2 >> 16);
+        a[offset + 13] = (byte)(v2 >> 24);
+        a[offset + 14] = (byte)(v3 >> 0);
+        a[offset + 15] = (byte)(v3 >> 8);
+        a[offset + 16] = (byte)0xEF;
+        return new Object[]{ a };
+    }
+
+    @DontCompile
+    static Object[] test5R(byte[] a, int offset) {
+        a[offset + 0] = (byte)0x01;
+        a[offset + 1] = (byte)0x02;
+        a[offset + 2] = (byte)0x03;
+        a[offset + 3] = (byte)0x04;
+        a[offset + 4] = (byte)0x11;
+        a[offset + 5] = (byte)0x22;
+        a[offset + 6] = (byte)0x33;
+        a[offset + 7] = (byte)0x44;
+        a[offset + 8] = (byte)0x55;
+        a[offset + 9] = (byte)0x66;
+        a[offset + 10] = (byte)0x77;
+        a[offset + 11] = (byte)0xAA;
+        a[offset + 12] = (byte)0xBB;
+        a[offset + 13] = (byte)0xCC;
+        a[offset + 14] = (byte)0xDD;
+        return new Object[]{ a };
+    }
+
+    @Test
+    @IR(counts = {IRNode.STORE_B_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "1",
+                  IRNode.STORE_C_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "1",
+                  IRNode.STORE_I_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "1",
+                  IRNode.STORE_L_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "1"},
{"UseUnalignedAccesses", "true"}) + static Object[] test5a(byte[] a, int offset) { + a[offset + 0] = (byte)0x01; + a[offset + 1] = (byte)0x02; + a[offset + 2] = (byte)0x03; + a[offset + 3] = (byte)0x04; + a[offset + 4] = (byte)0x11; + a[offset + 5] = (byte)0x22; + a[offset + 6] = (byte)0x33; + a[offset + 7] = (byte)0x44; + a[offset + 8] = (byte)0x55; + a[offset + 9] = (byte)0x66; + a[offset + 10] = (byte)0x77; + a[offset + 11] = (byte)0xAA; + a[offset + 12] = (byte)0xBB; + a[offset + 13] = (byte)0xCC; + a[offset + 14] = (byte)0xDD; + return new Object[]{ a }; + } + + @DontCompile + static Object[] test6R(byte[] a, byte[] b, int offset1, int offset2) { + a[offset1 + 1] = (byte)0x02; + a[offset1 + 3] = (byte)0x04; + b[offset1 + 4] = (byte)0x11; + a[offset1 + 5] = (byte)0x22; + a[offset2 + 6] = (byte)0x33; + a[offset1 + 7] = (byte)0x44; + b[offset1 + 8] = (byte)0x55; + b[offset1 + 10] = (byte)0x66; + return new Object[]{ a, b }; + } + + @Test + @IR(counts = {IRNode.STORE_B_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "8", + IRNode.STORE_C_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_I_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_L_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0"}) + static Object[] test6a(byte[] a, byte[] b, int offset1, int offset2) { + a[offset1 + 1] = (byte)0x02; + a[offset1 + 3] = (byte)0x04; + b[offset1 + 4] = (byte)0x11; + a[offset1 + 5] = (byte)0x22; + a[offset2 + 6] = (byte)0x33; + a[offset1 + 7] = (byte)0x44; + b[offset1 + 8] = (byte)0x55; + b[offset1 + 10] = (byte)0x66; + return new Object[]{ a, b }; + } + + @DontCompile + static Object[] test7R(byte[] a, int offset1, int v1) { + a[offset1 + 1] = (byte)(v1 >> 8); + a[offset1 + 2] = (byte)(v1 >> 16); + a[offset1 + 3] = (byte)(v1 >> 24); + return new Object[]{ a }; + } + + @Test + @IR(counts = {IRNode.STORE_B_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "3", + IRNode.STORE_C_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_I_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_L_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0"}) + static Object[] test7a(byte[] a, int offset1, int v1) { + a[offset1 + 1] = (byte)(v1 >> 8); + a[offset1 + 2] = (byte)(v1 >> 16); + a[offset1 + 3] = (byte)(v1 >> 24); + return new Object[]{ a }; + } + + @DontCompile + static Object[] test100R(short[] a, int offset) { + a[offset + 0] = (short)0x0100; + a[offset + 1] = (short)0x0200; + a[offset + 2] = (short)0x0311; + a[offset + 3] = (short)0x0400; + a[offset + 4] = (short)0x1100; + a[offset + 5] = (short)0x2233; + a[offset + 6] = (short)0x3300; + a[offset + 7] = (short)0x4400; + a[offset + 8] = (short)0x5599; + a[offset + 9] = (short)0x6600; + a[offset + 10] = (short)0x7700; + a[offset + 11] = (short)0xAACC; + a[offset + 12] = (short)0xBB00; + a[offset + 13] = (short)0xCC00; + a[offset + 14] = (short)0xDDFF; + return new Object[]{ a }; + } + + @Test + @IR(counts = {IRNode.STORE_B_OF_CLASS, "short\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_C_OF_CLASS, "short\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "1", + IRNode.STORE_I_OF_CLASS, "short\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "1", + 
IRNode.STORE_L_OF_CLASS, "short\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "3"}, + applyIf = {"UseUnalignedAccesses", "true"}) + static Object[] test100a(short[] a, int offset) { + a[offset + 0] = (short)0x0100; // stays unchanged -> both used for RC and Return path + a[offset + 1] = (short)0x0200; // I + a[offset + 2] = (short)0x0311; // I + a[offset + 3] = (short)0x0400; // L + a[offset + 4] = (short)0x1100; // L + a[offset + 5] = (short)0x2233; // L + a[offset + 6] = (short)0x3300; // L + a[offset + 7] = (short)0x4400; // L + a[offset + 8] = (short)0x5599; // L + a[offset + 9] = (short)0x6600; // L + a[offset + 10] = (short)0x7700; // L + a[offset + 11] = (short)0xAACC; // L + a[offset + 12] = (short)0xBB00; // L + a[offset + 13] = (short)0xCC00; // L + a[offset + 14] = (short)0xDDFF; // L + return new Object[]{ a }; + } + + @DontCompile + static Object[] test101R(short[] a, int offset) { + a[offset + 0] = (short)0x0100; + a[offset + 1] = (short)0x0200; + a[offset + 2] = (short)0x0311; + a[offset + 3] = (short)0x0400; + a[offset + 4] = (short)0x1100; + a[offset + 5] = (short)0x2233; + a[offset + 6] = (short)0x3300; + a[offset + 7] = (short)0x4400; + a[offset + 8] = (short)0x5599; + a[offset + 9] = (short)0x6600; + a[offset + 10] = (short)0x7700; + a[offset + 11] = (short)0xAACC; + a[offset + 12] = (short)0xBB00; + a[offset + 13] = (short)0xCC00; + return new Object[]{ a }; + } + + @Test + @IR(counts = {IRNode.STORE_B_OF_CLASS, "short\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_C_OF_CLASS, "short\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "1", // only for RC + IRNode.STORE_I_OF_CLASS, "short\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "1", + IRNode.STORE_L_OF_CLASS, "short\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "3"}, + applyIf = {"UseUnalignedAccesses", "true"}) + static Object[] test101a(short[] a, int offset) { + a[offset + 0] = (short)0x0100; // I plus kept unchanged for RC + a[offset + 1] = (short)0x0200; // I + a[offset + 2] = (short)0x0311; // L + a[offset + 3] = (short)0x0400; // L + a[offset + 4] = (short)0x1100; // L + a[offset + 5] = (short)0x2233; // L + a[offset + 6] = (short)0x3300; // L + a[offset + 7] = (short)0x4400; // L + a[offset + 8] = (short)0x5599; // L + a[offset + 9] = (short)0x6600; // L + a[offset + 10] = (short)0x7700; // L + a[offset + 11] = (short)0xAACC; // L + a[offset + 12] = (short)0xBB00; // L + a[offset + 13] = (short)0xCC00; // L + return new Object[]{ a }; + } + + @DontCompile + static Object[] test102R(short[] a, int offset, long v1, int v2, short v3) { + a[offset + 0] = (short)0x0000; + a[offset + 1] = (short)0xFFFF; + a[offset + 2] = v3; + a[offset + 3] = (short)0x4242; + a[offset + 4] = (short)(v1 >> 0); + a[offset + 5] = (short)(v1 >> 16); + a[offset + 6] = (short)0xAB11; + a[offset + 7] = (short)0xCD36; + a[offset + 8] = (short)0xEF89; + a[offset + 9] = (short)0x0156; + a[offset + 10] = (short)(v1 >> 0); + a[offset + 11] = (short)(v1 >> 16); + a[offset + 12] = (short)(v1 >> 32); + a[offset + 13] = (short)(v1 >> 48); + a[offset + 14] = (short)(v2 >> 0); + a[offset + 15] = (short)(v2 >> 16); + a[offset + 16] = (short)0xEFEF; + return new Object[]{ a }; + } + + @Test + @IR(counts = {IRNode.STORE_B_OF_CLASS, "short\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_C_OF_CLASS, "short\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "4", // 3 (+1 that goes into RC) 
+ IRNode.STORE_I_OF_CLASS, "short\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "3", + IRNode.STORE_L_OF_CLASS, "short\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "2"}, + applyIf = {"UseUnalignedAccesses", "true"}) + static Object[] test102a(short[] a, int offset, long v1, int v2, short v3) { + a[offset + 0] = (short)0x0000; // store goes into RC + a[offset + 1] = (short)0xFFFF; + a[offset + 2] = v3; + a[offset + 3] = (short)0x4242; + a[offset + 4] = (short)(v1 >> 0); + a[offset + 5] = (short)(v1 >> 16); + a[offset + 6] = (short)0xAB11; + a[offset + 7] = (short)0xCD36; + a[offset + 8] = (short)0xEF89; + a[offset + 9] = (short)0x0156; + a[offset + 10] = (short)(v1 >> 0); + a[offset + 11] = (short)(v1 >> 16); + a[offset + 12] = (short)(v1 >> 32); + a[offset + 13] = (short)(v1 >> 48); + a[offset + 14] = (short)(v2 >> 0); + a[offset + 15] = (short)(v2 >> 16); + a[offset + 16] = (short)0xEFEF; + return new Object[]{ a }; + } + + @DontCompile + static Object[] test200R(int[] a, int offset) { + a[offset + 0] = 0x01001236; + a[offset + 1] = 0x02001284; + a[offset + 2] = 0x03111235; + a[offset + 3] = 0x04001294; + a[offset + 4] = 0x11001234; + a[offset + 5] = 0x22331332; + a[offset + 6] = 0x33001234; + a[offset + 7] = 0x44001432; + a[offset + 8] = 0x55991234; + a[offset + 9] = 0x66001233; + a[offset + 10] = 0x77001434; + a[offset + 11] = 0xAACC1234; + a[offset + 12] = 0xBB001434; + a[offset + 13] = 0xCC001236; + a[offset + 14] = 0xDDFF1534; + return new Object[]{ a }; + } + + @Test + @IR(counts = {IRNode.STORE_B_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_C_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_I_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "1", + IRNode.STORE_L_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "7"}, + applyIf = {"UseUnalignedAccesses", "true"}) + static Object[] test200a(int[] a, int offset) { + a[offset + 0] = 0x01001236; // stays unchanged -> both used for RC and Return path + a[offset + 1] = 0x02001284; // L + a[offset + 2] = 0x03111235; // L + a[offset + 3] = 0x04001294; // L + a[offset + 4] = 0x11001234; // L + a[offset + 5] = 0x22331332; // L + a[offset + 6] = 0x33001234; // L + a[offset + 7] = 0x44001432; // L + a[offset + 8] = 0x55991234; // L + a[offset + 9] = 0x66001233; // L + a[offset + 10] = 0x77001434; // L + a[offset + 11] = 0xAACC1234; // L + a[offset + 12] = 0xBB001434; // L + a[offset + 13] = 0xCC001236; // L + a[offset + 14] = 0xDDFF1534; // L + return new Object[]{ a }; + } + + @DontCompile + static Object[] test201R(int[] a, int offset) { + a[offset + 0] = 0x01001236; + a[offset + 1] = 0x02001284; + a[offset + 2] = 0x03111235; + a[offset + 3] = 0x04001294; + a[offset + 4] = 0x11001234; + a[offset + 5] = 0x22331332; + a[offset + 6] = 0x33001234; + a[offset + 7] = 0x44001432; + a[offset + 8] = 0x55991234; + a[offset + 9] = 0x66001233; + a[offset + 10] = 0x77001434; + a[offset + 11] = 0xAACC1234; + a[offset + 12] = 0xBB001434; + a[offset + 13] = 0xCC001236; + return new Object[]{ a }; + } + + @Test + @IR(counts = {IRNode.STORE_B_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_C_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_I_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "1", // only for RC + 
IRNode.STORE_L_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "7"}, + applyIf = {"UseUnalignedAccesses", "true"}) + static Object[] test201a(int[] a, int offset) { + a[offset + 0] = 0x01001236; // L and also kept unchanged for RC + a[offset + 1] = 0x02001284; // L + a[offset + 2] = 0x03111235; // L + a[offset + 3] = 0x04001294; // L + a[offset + 4] = 0x11001234; // L + a[offset + 5] = 0x22331332; // L + a[offset + 6] = 0x33001234; // L + a[offset + 7] = 0x44001432; // L + a[offset + 8] = 0x55991234; // L + a[offset + 9] = 0x66001233; // L + a[offset + 10] = 0x77001434; // L + a[offset + 11] = 0xAACC1234; // L + a[offset + 12] = 0xBB001434; // L + a[offset + 13] = 0xCC001236; // L + return new Object[]{ a }; + } + + @DontCompile + static Object[] test202R(int[] a, int offset, long v1, int v2) { + a[offset + 0] = 0x00000000; + a[offset + 1] = 0xFFFFFFFF; + a[offset + 2] = v2; + a[offset + 3] = 0x42424242; + a[offset + 4] = (int)(v1 >> 0); + a[offset + 5] = (int)(v1 >> 32); + a[offset + 6] = 0xAB110129; + a[offset + 7] = 0xCD360183; + a[offset + 8] = 0xEF890173; + a[offset + 9] = 0x01560124; + a[offset + 10] = (int)(v1 >> 0); + a[offset + 11] = (int)(v1 >> 32); + a[offset + 12] = (int)(v1 >> 0); + a[offset + 13] = (int)(v1 >> 32); + a[offset + 14] = v2; + a[offset + 15] = v2; + a[offset + 16] = 0xEFEFEFEF; + return new Object[]{ a }; + } + + @Test + @IR(counts = {IRNode.STORE_B_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_C_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_I_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "6", // 5 (+1 that goes into RC) + IRNode.STORE_L_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "6"}, + applyIf = {"UseUnalignedAccesses", "true"}) + static Object[] test202a(int[] a, int offset, long v1, int v2) { + a[offset + 0] = 0x00000000; // merged with store below, but also kept unchanged for RC + a[offset + 1] = 0xFFFFFFFF; + a[offset + 2] = v2; + a[offset + 3] = 0x42424242; + a[offset + 4] = (int)(v1 >> 0); + a[offset + 5] = (int)(v1 >> 32); + a[offset + 6] = 0xAB110129; + a[offset + 7] = 0xCD360183; + a[offset + 8] = 0xEF890173; + a[offset + 9] = 0x01560124; + a[offset + 10] = (int)(v1 >> 0); + a[offset + 11] = (int)(v1 >> 32); + a[offset + 12] = (int)(v1 >> 0); + a[offset + 13] = (int)(v1 >> 32); + a[offset + 14] = v2; + a[offset + 15] = v2; + a[offset + 16] = 0xEFEFEFEF; + return new Object[]{ a }; + } + + @DontCompile + static Object[] test300R(int[] a) { + a[2] = 42; + a[3] = 42; + a[4] = 42; + a[5] = 42; + int x = a[3]; // dependent load + return new Object[]{ a, new int[]{ x } }; + } + + @Test + @IR(counts = {IRNode.STORE_L_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "2"}, + applyIf = {"UseUnalignedAccesses", "true"}) + static Object[] test300a(int[] a) { + a[2] = 42; + a[3] = 42; + a[4] = 42; + a[5] = 42; + int x = a[3]; // dependent load + return new Object[]{ a, new int[]{ x } }; + } + + @DontCompile + static Object[] test400R(int[] a) { + UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 0, (byte)0xbe); + UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 1, (byte)0xba); + UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 2, (byte)0xad); + UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 3, (byte)0xba); + UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 4, (byte)0xef); + UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET 
+ 5, (byte)0xbe);
+        UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 6, (byte)0xad);
+        UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 7, (byte)0xde);
+        return new Object[]{ a };
+    }
+
+    @Test
+    // We must be careful with mismatched accesses on arrays:
+    // An int-array can be about 4x max_int in byte size, and hence if we address individual bytes in it, the int offsets can overflow.
+    // We might consider addresses (x + 0) and (x + 1) as adjacent, even if x = max_int, and therefore the second
+    // address overflows and is not adjacent at all.
+    // Therefore, we should only consider stores that have the same size as the element type of the array.
+    @IR(counts = {IRNode.STORE_B_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "8", // no merging
+                  IRNode.STORE_C_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0",
+                  IRNode.STORE_I_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0",
+                  IRNode.STORE_L_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0"})
+    static Object[] test400a(int[] a) {
+        UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 0, (byte)0xbe);
+        UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 1, (byte)0xba);
+        UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 2, (byte)0xad);
+        UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 3, (byte)0xba);
+        UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 4, (byte)0xef);
+        UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 5, (byte)0xbe);
+        UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 6, (byte)0xad);
+        UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 7, (byte)0xde);
+        return new Object[]{ a };
+    }
+
+    @DontCompile
+    // The 500-series tests all have the same code, but are executed with different inputs:
+    // 500a: never violates a RangeCheck -> expect stores to always be merged
+    // 501a: randomly violates RangeChecks, also during warmup -> stores are never merged
+    // 502a: never violates a RangeCheck during warmup -> compiled once with merged stores,
+    //       but then violates RangeChecks after warmup -> recompiled without merged stores
+    static Object[] test500R(byte[] a, int offset, long v) {
+        int idx = 0;
+        try {
+            a[offset + 0] = (byte)(v >> 0);
+            idx = 1;
+            a[offset + 1] = (byte)(v >> 8);
+            idx = 2;
+            a[offset + 2] = (byte)(v >> 16);
+            idx = 3;
+            a[offset + 3] = (byte)(v >> 24);
+            idx = 4;
+            a[offset + 4] = (byte)(v >> 32);
+            idx = 5;
+            a[offset + 5] = (byte)(v >> 40);
+            idx = 6;
+            a[offset + 6] = (byte)(v >> 48);
+            idx = 7;
+            a[offset + 7] = (byte)(v >> 56);
+            idx = 8;
+        } catch (ArrayIndexOutOfBoundsException _) {}
+        return new Object[]{ a, new int[]{ idx } };
+    }
+
+    @Test
+    @IR(counts = {IRNode.STORE_B_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "1", // for RangeCheck trap
+                  IRNode.STORE_C_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0",
+                  IRNode.STORE_I_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0",
+                  IRNode.STORE_L_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "1"}, // expect merged
+        applyIf = {"UseUnalignedAccesses", "true"})
+    static Object[] test500a(byte[] a, int offset, long v) {
+        int idx = 0;
+        try {
+            a[offset + 0] = (byte)(v >> 0);
+            idx = 1;
+            a[offset + 1] = (byte)(v >> 8);
+            idx = 2;
+            a[offset + 2] = (byte)(v >> 16);
+            idx = 3;
+            a[offset + 3] = (byte)(v >> 24);
+            idx = 4;
+            a[offset + 4] = (byte)(v >> 32);
+            idx = 5;
+            a[offset + 5] = (byte)(v >> 40);
+            idx = 6;
+            a[offset + 6] = 
(byte)(v >> 48); + idx = 7; + a[offset + 7] = (byte)(v >> 56); + idx = 8; + } catch (ArrayIndexOutOfBoundsException _) {} + return new Object[]{ a, new int[]{ idx } }; + } + + @Test + @IR(counts = {IRNode.STORE_B_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "8", // No optimization because of too many RangeChecks + IRNode.STORE_C_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_I_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_L_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0"}) + static Object[] test501a(byte[] a, int offset, long v) { + int idx = 0; + try { + a[offset + 0] = (byte)(v >> 0); + idx = 1; + a[offset + 1] = (byte)(v >> 8); + idx = 2; + a[offset + 2] = (byte)(v >> 16); + idx = 3; + a[offset + 3] = (byte)(v >> 24); + idx = 4; + a[offset + 4] = (byte)(v >> 32); + idx = 5; + a[offset + 5] = (byte)(v >> 40); + idx = 6; + a[offset + 6] = (byte)(v >> 48); + idx = 7; + a[offset + 7] = (byte)(v >> 56); + idx = 8; + } catch (ArrayIndexOutOfBoundsException _) {} + return new Object[]{ a, new int[]{ idx } }; + } + + @Test + @IR(counts = {IRNode.STORE_B_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "8", // No optimization because of too many RangeChecks + IRNode.STORE_C_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_I_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_L_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0"}) + static Object[] test502a(byte[] a, int offset, long v) { + int idx = 0; + try { + a[offset + 0] = (byte)(v >> 0); + idx = 1; + a[offset + 1] = (byte)(v >> 8); + idx = 2; + a[offset + 2] = (byte)(v >> 16); + idx = 3; + a[offset + 3] = (byte)(v >> 24); + idx = 4; + a[offset + 4] = (byte)(v >> 32); + idx = 5; + a[offset + 5] = (byte)(v >> 40); + idx = 6; + a[offset + 6] = (byte)(v >> 48); + idx = 7; + a[offset + 7] = (byte)(v >> 56); + idx = 8; + } catch (ArrayIndexOutOfBoundsException _) {} + return new Object[]{ a, new int[]{ idx } }; + } +} diff --git a/test/micro/org/openjdk/bench/vm/compiler/MergeStores.java b/test/micro/org/openjdk/bench/vm/compiler/MergeStores.java new file mode 100644 index 0000000000000..e1f0d5eaedcdf --- /dev/null +++ b/test/micro/org/openjdk/bench/vm/compiler/MergeStores.java @@ -0,0 +1,696 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package org.openjdk.bench.vm.compiler;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+
+
+import jdk.internal.misc.Unsafe;
+import jdk.internal.util.ByteArrayLittleEndian;
+import java.util.concurrent.TimeUnit;
+
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Warmup(iterations = 3, time = 3)
+@Measurement(iterations = 3, time = 3)
+@Fork(value = 3, jvmArgsAppend = {
+    "--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED",
+    "--add-exports", "java.base/jdk.internal.util=ALL-UNNAMED"})
+@State(Scope.Benchmark)
+public class MergeStores {
+
+    public static final int RANGE = 100;
+
+    static Unsafe UNSAFE = Unsafe.getUnsafe();
+
+    @Param("1")
+    public static short vS;
+
+    @Param("1")
+    public static int vI;
+
+    @Param("1")
+    public static long vL;
+
+    public static int offset = 5;
+    public static byte[] aB = new byte[RANGE];
+    public static short[] aS = new short[RANGE];
+    public static int[] aI = new int[RANGE];
+
+    // -------------------------------------------
+    // ------- Little-Endian API ----------
+    // -------------------------------------------
+
+    // Store a short LE into an array by storing individual bytes
+    static void storeShortLE(byte[] bytes, int offset, short value) {
+        storeBytes(bytes, offset, (byte)(value >> 0),
+                                  (byte)(value >> 8));
+    }
+
+    // Store an int LE into an array by storing individual bytes
+    static void storeIntLE(byte[] bytes, int offset, int value) {
+        storeBytes(bytes, offset, (byte)(value >> 0 ),
+                                  (byte)(value >> 8 ),
+                                  (byte)(value >> 16),
+                                  (byte)(value >> 24));
+    }
+
+    // Store a long LE into an array by storing individual bytes
+    static void storeLongLE(byte[] bytes, int offset, long value) {
+        storeBytes(bytes, offset, (byte)(value >> 0 ),
+                                  (byte)(value >> 8 ),
+                                  (byte)(value >> 16),
+                                  (byte)(value >> 24),
+                                  (byte)(value >> 32),
+                                  (byte)(value >> 40),
+                                  (byte)(value >> 48),
+                                  (byte)(value >> 56));
+    }
+
+    // Store 2 bytes into an array
+    static void storeBytes(byte[] bytes, int offset, byte b0, byte b1) {
+        bytes[offset + 0] = b0;
+        bytes[offset + 1] = b1;
+    }
+
+    // Store 4 bytes into an array
+    static void storeBytes(byte[] bytes, int offset, byte b0, byte b1, byte b2, byte b3) {
+        bytes[offset + 0] = b0;
+        bytes[offset + 1] = b1;
+        bytes[offset + 2] = b2;
+        bytes[offset + 3] = b3;
+    }
+
+    // Store 8 bytes into an array
+    static void storeBytes(byte[] bytes, int offset, byte b0, byte b1, byte b2, byte b3,
+                           byte b4, byte b5, byte b6, byte b7) {
+        bytes[offset + 0] = b0;
+        bytes[offset + 1] = b1;
+        bytes[offset + 2] = b2;
+        bytes[offset + 3] = b3;
+        bytes[offset + 4] = b4;
+        bytes[offset + 5] = b5;
+        bytes[offset + 6] = b6;
+        bytes[offset + 7] = b7;
+    }
+
+    // -------------------------------- BENCHMARKS 
-------------------------------- + + @Benchmark + public void baseline() { + } + + @Benchmark + public byte[] baseline_allocate() { + byte[] aB = new byte[RANGE]; + return aB; + } + + @Benchmark + public byte[] store_B2_con_adr0_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[0] = (byte)0x01; + aB[1] = (byte)0x02; + return aB; + } + + @Benchmark + public byte[] store_B2_con_adr1_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[1] = (byte)0x01; + aB[2] = (byte)0x02; + return aB; + } + + @Benchmark + public byte[] store_B2_con_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)0x01; + aB[offset + 1] = (byte)0x02; + return aB; + } + + @Benchmark + public byte[] store_B2_con_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putShortUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, (short)0x0201); + return aB; + } + + @Benchmark + public byte[] store_B2_con_offs_allocate_bale() { + byte[] aB = new byte[RANGE]; + ByteArrayLittleEndian.setShort(aB, offset, (short)0x0201); + return aB; + } + + @Benchmark + public byte[] store_B2_con_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeShortLE(aB, offset, (short)0x0201); + return aB; + } + + @Benchmark + public byte[] store_B2_con_offs_nonalloc_direct() { + aB[offset + 0] = (byte)0x01; + aB[offset + 1] = (byte)0x02; + return aB; + } + + @Benchmark + public byte[] store_B2_con_offs_nonalloc_unsafe() { + UNSAFE.putShortUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, (short)0x0201); + return aB; + } + + @Benchmark + public byte[] store_B2_con_offs_nonalloc_bale() { + ByteArrayLittleEndian.setShort(aB, offset, (short)0x0201); + return aB; + } + + @Benchmark + public byte[] store_B2_con_offs_nonalloc_leapi() { + storeShortLE(aB, offset, (short)0x0201); + return aB; + } + + @Benchmark + public byte[] store_B2_S_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)(vS >> 0 ); + aB[offset + 1] = (byte)(vS >> 8 ); + return aB; + } + + @Benchmark + public byte[] store_B2_S_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putShortUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vS); + return aB; + } + + @Benchmark + public byte[] store_B2_S_offs_allocate_bale() { + byte[] aB = new byte[RANGE]; + ByteArrayLittleEndian.setShort(aB, offset, vS); + return aB; + } + + @Benchmark + public byte[] store_B2_S_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeShortLE(aB, offset, vS); + return aB; + } + + @Benchmark + public byte[] store_B2_S_offs_nonalloc_direct() { + aB[offset + 0] = (byte)(vS >> 0 ); + aB[offset + 1] = (byte)(vS >> 8 ); + return aB; + } + + @Benchmark + public byte[] store_B2_S_offs_nonalloc_unsafe() { + UNSAFE.putShortUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vS); + return aB; + } + + @Benchmark + public byte[] store_B2_S_offs_nonalloc_bale() { + ByteArrayLittleEndian.setShort(aB, offset, vS); + return aB; + } + + @Benchmark + public byte[] store_B2_S_offs_nonalloc_leapi() { + storeShortLE(aB, offset, vS); + return aB; + } + + @Benchmark + public byte[] store_B4_con_adr0_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[0] = (byte)0x01; + aB[1] = (byte)0x02; + aB[2] = (byte)0x03; + aB[3] = (byte)0x04; + return aB; + } + + @Benchmark + public byte[] store_B4_con_adr1_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[1] = (byte)0x01; + aB[2] = (byte)0x02; + aB[3] = (byte)0x03; + aB[4] = (byte)0x04; + return aB; + } + + @Benchmark + public byte[] store_B4_con_offs_allocate_direct() { + 
byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)0x01; + aB[offset + 1] = (byte)0x02; + aB[offset + 2] = (byte)0x03; + aB[offset + 3] = (byte)0x04; + return aB; + } + + @Benchmark + public byte[] store_B4_con_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, 0x04030201); + return aB; + } + + @Benchmark + public byte[] store_B4_con_offs_allocate_bale() { + byte[] aB = new byte[RANGE]; + ByteArrayLittleEndian.setInt(aB, offset, 0x04030201); + return aB; + } + + @Benchmark + public byte[] store_B4_con_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeIntLE(aB, offset, 0x04030201); + return aB; + } + + @Benchmark + public byte[] store_B4_con_offs_nonalloc_direct() { + aB[offset + 0] = (byte)0x01; + aB[offset + 1] = (byte)0x02; + aB[offset + 2] = (byte)0x03; + aB[offset + 3] = (byte)0x04; + return aB; + } + + @Benchmark + public byte[] store_B4_con_offs_nonalloc_unsafe() { + UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, 0x04030201); + return aB; + } + + @Benchmark + public byte[] store_B4_con_offs_nonalloc_bale() { + ByteArrayLittleEndian.setInt(aB, offset, 0x04030201); + return aB; + } + + @Benchmark + public byte[] store_B4_con_offs_nonalloc_leapi() { + storeIntLE(aB, offset, 0x04030201); + return aB; + } + + @Benchmark + public byte[] store_B4_I_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)(vI >> 0 ); + aB[offset + 1] = (byte)(vI >> 8 ); + aB[offset + 2] = (byte)(vI >> 16); + aB[offset + 3] = (byte)(vI >> 24); + return aB; + } + + @Benchmark + public byte[] store_B4_I_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vI); + return aB; + } + + @Benchmark + public byte[] store_B4_I_offs_allocate_bale() { + byte[] aB = new byte[RANGE]; + ByteArrayLittleEndian.setInt(aB, offset, vI); + return aB; + } + + @Benchmark + public byte[] store_B4_I_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeIntLE(aB, offset, vI); + return aB; + } + + @Benchmark + public byte[] store_B4_I_offs_nonalloc_direct() { + aB[offset + 0] = (byte)(vI >> 0 ); + aB[offset + 1] = (byte)(vI >> 8 ); + aB[offset + 2] = (byte)(vI >> 16); + aB[offset + 3] = (byte)(vI >> 24); + return aB; + } + + @Benchmark + public byte[] store_B4_I_offs_nonalloc_unsafe() { + UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vI); + return aB; + } + + @Benchmark + public byte[] store_B4_I_offs_nonalloc_bale() { + ByteArrayLittleEndian.setInt(aB, offset, vI); + return aB; + } + + @Benchmark + public byte[] store_B4_I_offs_nonalloc_leapi() { + storeIntLE(aB, offset, vI); + return aB; + } + + @Benchmark + public byte[] store_B8_con_adr0_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[0] = (byte)0x01; + aB[1] = (byte)0x02; + aB[2] = (byte)0x03; + aB[3] = (byte)0x04; + aB[4] = (byte)0x05; + aB[5] = (byte)0x06; + aB[6] = (byte)0x07; + aB[7] = (byte)0x08; + return aB; + } + + @Benchmark + public byte[] store_B8_con_adr1_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[1] = (byte)0x01; + aB[2] = (byte)0x02; + aB[3] = (byte)0x03; + aB[4] = (byte)0x04; + aB[5] = (byte)0x05; + aB[6] = (byte)0x06; + aB[7] = (byte)0x07; + aB[8] = (byte)0x08; + return aB; + } + + @Benchmark + public byte[] store_B8_con_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)0x01; + aB[offset + 1] = (byte)0x02; + aB[offset + 2] = (byte)0x03; + aB[offset + 3] = (byte)0x04; + aB[offset + 4] = 
(byte)0x05; + aB[offset + 5] = (byte)0x06; + aB[offset + 6] = (byte)0x07; + aB[offset + 7] = (byte)0x08; + return aB; + } + + @Benchmark + public byte[] store_B8_con_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putLongUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, 0x0807060504030201L); + return aB; + } + + @Benchmark + public byte[] store_B8_con_offs_allocate_bale() { + byte[] aB = new byte[RANGE]; + ByteArrayLittleEndian.setLong(aB, offset, 0x0807060504030201L); + return aB; + } + + @Benchmark + public byte[] store_B8_con_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeLongLE(aB, offset, 0x0807060504030201L); + return aB; + } + + @Benchmark + public byte[] store_B8_con_offs_nonalloc_direct() { + aB[offset + 0] = (byte)0x01; + aB[offset + 1] = (byte)0x02; + aB[offset + 2] = (byte)0x03; + aB[offset + 3] = (byte)0x04; + aB[offset + 4] = (byte)0x05; + aB[offset + 5] = (byte)0x06; + aB[offset + 6] = (byte)0x07; + aB[offset + 7] = (byte)0x08; + return aB; + } + + @Benchmark + public byte[] store_B8_con_offs_nonalloc_unsafe() { + UNSAFE.putLongUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, 0x0807060504030201L); + return aB; + } + + @Benchmark + public byte[] store_B8_con_offs_nonalloc_bale() { + ByteArrayLittleEndian.setLong(aB, offset, 0x0807060504030201L); + return aB; + } + + @Benchmark + public byte[] store_B8_con_offs_nonalloc_leapi() { + storeLongLE(aB, offset, 0x0807060504030201L); + return aB; + } + + @Benchmark + public byte[] store_B8_L_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)(vL >> 0 ); + aB[offset + 1] = (byte)(vL >> 8 ); + aB[offset + 2] = (byte)(vL >> 16); + aB[offset + 3] = (byte)(vL >> 24); + aB[offset + 4] = (byte)(vL >> 32); + aB[offset + 5] = (byte)(vL >> 40); + aB[offset + 6] = (byte)(vL >> 48); + aB[offset + 7] = (byte)(vL >> 56); + return aB; + } + + @Benchmark + public byte[] store_B8_L_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putLongUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vL); + return aB; + } + + @Benchmark + public byte[] store_B8_L_offs_allocate_bale() { + byte[] aB = new byte[RANGE]; + ByteArrayLittleEndian.setLong(aB, offset, vL); + return aB; + } + + @Benchmark + public byte[] store_B8_L_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeLongLE(aB, offset, vL); + return aB; + } + + @Benchmark + public byte[] store_B8_L_offs_nonalloc_direct() { + aB[offset + 0] = (byte)(vL >> 0 ); + aB[offset + 1] = (byte)(vL >> 8 ); + aB[offset + 2] = (byte)(vL >> 16); + aB[offset + 3] = (byte)(vL >> 24); + aB[offset + 4] = (byte)(vL >> 32); + aB[offset + 5] = (byte)(vL >> 40); + aB[offset + 6] = (byte)(vL >> 48); + aB[offset + 7] = (byte)(vL >> 56); + return aB; + } + + @Benchmark + public byte[] store_B8_L_offs_nonalloc_unsafe() { + UNSAFE.putLongUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vL); + return aB; + } + + @Benchmark + public byte[] store_B8_L_offs_nonalloc_bale() { + ByteArrayLittleEndian.setLong(aB, offset, vL); + return aB; + } + + @Benchmark + public byte[] store_B8_L_offs_nonalloc_leapi() { + storeLongLE(aB, offset, vL); + return aB; + } + + @Benchmark + public byte[] store_B8_I2_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)(vI >> 0 ); + aB[offset + 1] = (byte)(vI >> 8 ); + aB[offset + 2] = (byte)(vI >> 16); + aB[offset + 3] = (byte)(vI >> 24); + aB[offset + 4] = (byte)(vI >> 0 ); + aB[offset + 5] = (byte)(vI >> 8 ); + aB[offset + 6] = (byte)(vI >> 16); + aB[offset + 7] = (byte)(vI >> 
24);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_I2_offs_allocate_unsafe() {
+        byte[] aB = new byte[RANGE];
+        UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 0, vI);
+        UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 4, vI);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_I2_offs_allocate_bale() {
+        byte[] aB = new byte[RANGE];
+        ByteArrayLittleEndian.setInt(aB, offset + 0, vI);
+        ByteArrayLittleEndian.setInt(aB, offset + 4, vI);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_I2_offs_allocate_leapi() {
+        byte[] aB = new byte[RANGE];
+        storeIntLE(aB, offset + 0, vI);
+        storeIntLE(aB, offset + 4, vI);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_I2_offs_nonalloc_direct() {
+        aB[offset + 0] = (byte)(vI >> 0 );
+        aB[offset + 1] = (byte)(vI >> 8 );
+        aB[offset + 2] = (byte)(vI >> 16);
+        aB[offset + 3] = (byte)(vI >> 24);
+        aB[offset + 4] = (byte)(vI >> 0 );
+        aB[offset + 5] = (byte)(vI >> 8 );
+        aB[offset + 6] = (byte)(vI >> 16);
+        aB[offset + 7] = (byte)(vI >> 24);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_I2_offs_nonalloc_unsafe() {
+        UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 0, vI);
+        UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 4, vI);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_I2_offs_nonalloc_bale() {
+        ByteArrayLittleEndian.setInt(aB, offset + 0, vI);
+        ByteArrayLittleEndian.setInt(aB, offset + 4, vI);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_I2_offs_nonalloc_leapi() {
+        storeIntLE(aB, offset + 0, vI);
+        storeIntLE(aB, offset + 4, vI);
+        return aB;
+    }
+
+    @Benchmark
+    public short[] store_S2_con_offs_allocate_direct() {
+        short[] aS = new short[RANGE];
+        aS[offset + 0] = (short)0x0102;
+        aS[offset + 1] = (short)0x0304;
+        return aS;
+    }
+
+    @Benchmark
+    public short[] store_S2_con_offs_nonalloc_direct() {
+        aS[offset + 0] = (short)0x0102;
+        aS[offset + 1] = (short)0x0304;
+        return aS;
+    }
+
+    @Benchmark
+    public short[] store_S4_con_offs_allocate_direct() {
+        short[] aS = new short[RANGE];
+        aS[offset + 0] = (short)0x0102;
+        aS[offset + 1] = (short)0x0304;
+        aS[offset + 2] = (short)0x0506;
+        aS[offset + 3] = (short)0x0708;
+        return aS;
+    }
+
+    @Benchmark
+    public short[] store_S4_con_offs_nonalloc_direct() {
+        aS[offset + 0] = (short)0x0102;
+        aS[offset + 1] = (short)0x0304;
+        aS[offset + 2] = (short)0x0506;
+        aS[offset + 3] = (short)0x0708;
+        return aS;
+    }
+
+    @Benchmark
+    public int[] store_I2_con_offs_allocate_direct() {
+        int[] aI = new int[RANGE];
+        aI[offset + 0] = 0x01020304;
+        aI[offset + 1] = 0x05060708;
+        return aI;
+    }
+
+    @Benchmark
+    public int[] store_I2_con_offs_nonalloc_direct() {
+        aI[offset + 0] = 0x01020304;
+        aI[offset + 1] = 0x05060708;
+        return aI;
+    }
+
+    @Benchmark
+    public int[] store_I2_zero_offs_allocate_direct() {
+        int[] aI = new int[RANGE];
+        aI[offset + 0] = 0;
+        aI[offset + 1] = 0;
+        return aI;
+    }
+
+    @Benchmark
+    public int[] store_I2_zero_offs_nonalloc_direct() {
+        aI[offset + 0] = 0;
+        aI[offset + 1] = 0;
+        return aI;
+    }
+}
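Note on running the new microbenchmark: one plausible invocation from a configured JDK checkout uses the micro test selector described in doc/testing.md; the exact filter string below is an assumption derived from the benchmark's package and class name:

    make test TEST="micro:vm.compiler.MergeStores"

The --add-exports flags required for jdk.internal.misc and jdk.internal.util are already appended via jvmArgsAppend in the benchmark's @Fork annotation, so no additional JVM arguments should be needed when running through the JMH harness.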