diff --git a/make/test/BuildMicrobenchmark.gmk b/make/test/BuildMicrobenchmark.gmk index ba502a5612870..5940a38c9dba2 100644 --- a/make/test/BuildMicrobenchmark.gmk +++ b/make/test/BuildMicrobenchmark.gmk @@ -101,6 +101,7 @@ $(eval $(call SetupJavaCompilation, BUILD_JDK_MICROBENCHMARK, \ --add-exports java.base/jdk.internal.event=ALL-UNNAMED \ --add-exports java.base/jdk.internal.foreign=ALL-UNNAMED \ --add-exports java.base/jdk.internal.misc=ALL-UNNAMED \ + --add-exports java.base/jdk.internal.util=ALL-UNNAMED \ --add-exports java.base/jdk.internal.org.objectweb.asm.tree=ALL-UNNAMED \ --add-exports java.base/jdk.internal.org.objectweb.asm=ALL-UNNAMED \ --add-exports java.base/jdk.internal.vm=ALL-UNNAMED \ diff --git a/src/hotspot/share/opto/addnode.cpp b/src/hotspot/share/opto/addnode.cpp index 928e51191d67b..f0e55d6ed48c9 100644 --- a/src/hotspot/share/opto/addnode.cpp +++ b/src/hotspot/share/opto/addnode.cpp @@ -724,9 +724,9 @@ Node* AddPNode::Ideal_base_and_offset(Node* ptr, PhaseValues* phase, //------------------------------unpack_offsets---------------------------------- // Collect the AddP offset values into the elements array, giving up // if there are more than length. -int AddPNode::unpack_offsets(Node* elements[], int length) { +int AddPNode::unpack_offsets(Node* elements[], int length) const { int count = 0; - Node* addr = this; + Node const* addr = this; Node* base = addr->in(AddPNode::Base); while (addr->is_AddP()) { if (addr->in(AddPNode::Base) != base) { diff --git a/src/hotspot/share/opto/addnode.hpp b/src/hotspot/share/opto/addnode.hpp index a6ef58b98ce9d..19043b5e40f5d 100644 --- a/src/hotspot/share/opto/addnode.hpp +++ b/src/hotspot/share/opto/addnode.hpp @@ -181,7 +181,7 @@ class AddPNode : public Node { // Collect the AddP offset values into the elements array, giving up // if there are more than length. - int unpack_offsets(Node* elements[], int length); + int unpack_offsets(Node* elements[], int length) const; // Do not match base-ptr edge virtual uint match_edge(uint idx) const; diff --git a/src/hotspot/share/opto/c2_globals.hpp b/src/hotspot/share/opto/c2_globals.hpp index 68d4f89b8f2b9..ac807e071aa77 100644 --- a/src/hotspot/share/opto/c2_globals.hpp +++ b/src/hotspot/share/opto/c2_globals.hpp @@ -354,6 +354,12 @@ notproduct(bool, TraceNewVectors, false, \ "Trace creation of Vector nodes") \ \ + product(bool, MergeStores, true, DIAGNOSTIC, \ + "Optimize stores by combining values into larger store") \ + \ + develop(bool, TraceMergeStores, false, \ + "Trace creation of merged stores") \ + \ product_pd(bool, OptoBundling, \ "Generate nops to fill i-cache lines") \ \ diff --git a/src/hotspot/share/opto/compile.cpp b/src/hotspot/share/opto/compile.cpp index 417f828013f92..c1652230f3102 100644 --- a/src/hotspot/share/opto/compile.cpp +++ b/src/hotspot/share/opto/compile.cpp @@ -930,6 +930,7 @@ Compile::Compile( ciEnv* ci_env, _directive(directive), _log(ci_env->log()), _first_failure_details(nullptr), + _for_post_loop_igvn(comp_arena(), 8, 0, nullptr), _congraph(nullptr), NOT_PRODUCT(_igv_printer(nullptr) COMMA) _unique(0), diff --git a/src/hotspot/share/opto/memnode.cpp b/src/hotspot/share/opto/memnode.cpp index e0a364d5056b3..de1c61e29f685 100644 --- a/src/hotspot/share/opto/memnode.cpp +++ b/src/hotspot/share/opto/memnode.cpp @@ -2685,6 +2685,683 @@ uint StoreNode::hash() const { return NO_HASH; } +// Class to parse array pointers, and determine if they are adjacent. 
We parse the form:
+//
+//   pointer = base
+//           + constant_offset
+//           + LShiftL( ConvI2L(int_offset + int_con), int_offset_shift)
+//           + sum(other_offsets)
+//
+//
+// Note: we accumulate all constant offsets into constant_offset, even the int constant behind
+//       the "LShiftL(ConvI2L(...))" pattern. We convert "ConvI2L(int_offset + int_con)" to
+//       "ConvI2L(int_offset) + int_con", which is only safe if we can assume that either all
+//       compared addresses have an overflow for "int_offset + int_con" or none.
+//       For loads and stores on arrays, we know that if one overflows and the other does not,
+//       then the two addresses lie almost max_int indices apart, but the maximal array size is
+//       only about half of that. Therefore, the RangeCheck on at least one of them must have
+//       failed.
+//
+//   constant_offset += LShiftL( ConvI2L(int_con), int_offset_shift)
+//
+//   pointer = base
+//           + constant_offset
+//           + LShiftL( ConvI2L(int_offset), int_offset_shift)
+//           + sum(other_offsets)
+//
+class ArrayPointer {
+private:
+  const bool  _is_valid;          // The parsing succeeded
+  const Node* _pointer;           // The final pointer to the position in the array
+  const Node* _base;              // Base address of the array
+  const jlong _constant_offset;   // Sum of collected constant offsets
+  const Node* _int_offset;        // (optional) Offset behind LShiftL and ConvI2L
+  const jint  _int_offset_shift;  // (optional) Shift value for int_offset
+  const GrowableArray<Node*>* _other_offsets; // List of other AddP offsets
+
+  ArrayPointer(const bool is_valid,
+               const Node* pointer,
+               const Node* base,
+               const jlong constant_offset,
+               const Node* int_offset,
+               const jint int_offset_shift,
+               const GrowableArray<Node*>* other_offsets) :
+    _is_valid(is_valid),
+    _pointer(pointer),
+    _base(base),
+    _constant_offset(constant_offset),
+    _int_offset(int_offset),
+    _int_offset_shift(int_offset_shift),
+    _other_offsets(other_offsets)
+  {
+    assert(_pointer != nullptr, "must always have pointer");
+    assert(is_valid == (_base != nullptr), "have base exactly if valid");
+    assert(is_valid == (_other_offsets != nullptr), "have other_offsets exactly if valid");
+  }
+
+  static ArrayPointer make_invalid(const Node* pointer) {
+    return ArrayPointer(false, pointer, nullptr, 0, nullptr, 0, nullptr);
+  }
+
+  static bool parse_int_offset(Node* offset, Node*& int_offset, jint& int_offset_shift) {
+    // offset = LShiftL( ConvI2L(int_offset), int_offset_shift)
+    if (offset->Opcode() == Op_LShiftL &&
+        offset->in(1)->Opcode() == Op_ConvI2L &&
+        offset->in(2)->Opcode() == Op_ConI) {
+      int_offset = offset->in(1)->in(1);           // LShiftL -> ConvI2L -> int_offset
+      int_offset_shift = offset->in(2)->get_int(); // LShiftL -> int_offset_shift
+      return true;
+    }
+
+    // offset = ConvI2L(int_offset) = LShiftL( ConvI2L(int_offset), 0)
+    if (offset->Opcode() == Op_ConvI2L) {
+      int_offset = offset->in(1);
+      int_offset_shift = 0;
+      return true;
+    }
+
+    // parse failed
+    return false;
+  }
+
+public:
+  // Parse the structure above the pointer
+  static ArrayPointer make(PhaseGVN* phase, const Node* pointer) {
+    assert(phase->type(pointer)->isa_aryptr() != nullptr, "must be array pointer");
+    if (!pointer->is_AddP()) { return ArrayPointer::make_invalid(pointer); }
+
+    const Node* base = pointer->in(AddPNode::Base);
+    if (base == nullptr) { return ArrayPointer::make_invalid(pointer); }
+
+    const int search_depth = 5;
+    Node* offsets[search_depth];
+    int count = pointer->as_AddP()->unpack_offsets(offsets, search_depth);
+
+    // We expect at least one offset (at minimum the constant)
+    if (count <= 0) { return ArrayPointer::make_invalid(pointer); }
+
+    // We extract the form:
+    //
+    //   pointer = base
+    //           + constant_offset
+    //           + LShiftL( ConvI2L(int_offset + int_con), int_offset_shift)
+    //           + sum(other_offsets)
+    //
+    jlong constant_offset = 0;
+    Node* int_offset = nullptr;
+    jint int_offset_shift = 0;
+    GrowableArray<Node*>* other_offsets = new GrowableArray<Node*>(count);
+
+    for (int i = 0; i < count; i++) {
+      Node* offset = offsets[i];
+      if (offset->Opcode() == Op_ConI) {
+        // Constant int offset
+        constant_offset += offset->get_int();
+      } else if (offset->Opcode() == Op_ConL) {
+        // Constant long offset
+        constant_offset += offset->get_long();
+      } else if (int_offset == nullptr && parse_int_offset(offset, int_offset, int_offset_shift)) {
+        // LShiftL( ConvI2L(int_offset), int_offset_shift)
+        int_offset = int_offset->uncast();
+        if (int_offset->Opcode() == Op_AddI && int_offset->in(2)->Opcode() == Op_ConI) {
+          // LShiftL( ConvI2L(int_offset + int_con), int_offset_shift)
+          constant_offset += ((jlong)int_offset->in(2)->get_int()) << int_offset_shift;
+          int_offset = int_offset->in(1);
+        }
+      } else {
+        // All others
+        other_offsets->append(offset);
+      }
+    }
+
+    return ArrayPointer(true, pointer, base, constant_offset, int_offset, int_offset_shift, other_offsets);
+  }
+
+  bool is_adjacent_to_and_before(const ArrayPointer& other, const jlong data_size) const {
+    if (!_is_valid || !other._is_valid) { return false; }
+
+    // Offset adjacent?
+    if (this->_constant_offset + data_size != other._constant_offset) { return false; }
+
+    // All other components identical?
+    if (this->_base != other._base ||
+        this->_int_offset != other._int_offset ||
+        this->_int_offset_shift != other._int_offset_shift ||
+        this->_other_offsets->length() != other._other_offsets->length()) {
+      return false;
+    }
+
+    for (int i = 0; i < this->_other_offsets->length(); i++) {
+      Node* o1 = this->_other_offsets->at(i);
+      Node* o2 = other._other_offsets->at(i);
+      if (o1 != o2) { return false; }
+    }
+
+    return true;
+  }
+
+#ifndef PRODUCT
+  void dump() {
+    if (!_is_valid) {
+      tty->print("ArrayPointer[%d %s, invalid]", _pointer->_idx, _pointer->Name());
+      return;
+    }
+    tty->print("ArrayPointer[%d %s, base[%d %s] + %lld",
+               _pointer->_idx, _pointer->Name(),
+               _base->_idx, _base->Name(),
+               (long long)_constant_offset);
+    if (_int_offset != nullptr) {
+      tty->print(" + I2L[%d %s] << %d",
+                 _int_offset->_idx, _int_offset->Name(), _int_offset_shift);
+    }
+    for (int i = 0; i < _other_offsets->length(); i++) {
+      Node* n = _other_offsets->at(i);
+      tty->print(" + [%d %s]", n->_idx, n->Name());
+    }
+    tty->print_cr("]");
+  }
+#endif
+};
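A quick standalone check of the overflow argument above (plain Java, not part of the patch): folding the constant out of the ConvI2L only changes the result when the int addition wraps, and then the two interpretations differ by exactly 2^32, i.e. the indices are farther apart than any Java array can be long:

```java
public class ConvI2LFold {
    public static void main(String[] args) {
        int i = Integer.MAX_VALUE - 1; // an int index near the overflow boundary
        int c = 4;                     // constant folded out of the ConvI2L

        long folded  = (long) i + c;   // ConvI2L(i) + c : no 32-bit wrap-around
        long wrapped = (long) (i + c); // ConvI2L(i + c) : wraps in 32-bit arithmetic

        // The two results differ by 2^32 exactly when (i + c) overflows int range.
        System.out.println(folded - wrapped); // prints 4294967296
        // Such indices are ~max_int apart, but Java arrays are shorter than
        // max_int elements, so a RangeCheck must already have failed for one
        // of the two compared accesses.
    }
}
```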
+
+// Link together multiple stores (B/S/C/I) into a longer one.
+//
+// Example: _store = StoreB[i+3]
+//
+//   RangeCheck[i+0]           RangeCheck[i+0]
+//   StoreB[i+0]
+//   RangeCheck[i+1]           RangeCheck[i+1]
+//   StoreB[i+1]        -->    pass:             fail:
+//   StoreB[i+2]               StoreI[i+0]       StoreB[i+0]
+//   StoreB[i+3]
+//
+// The 4 StoreB are merged into a single StoreI node. We have to be careful with RangeCheck[i+1]: before
+// the optimization, if this RangeCheck[i+1] fails, then we execute only StoreB[i+0], and then trap. After
+// the optimization, the new StoreI[i+0] is on the passing path of RangeCheck[i+1], and StoreB[i+0] on the
+// failing path.
+//
+// Note: For normal array stores, every store at first has a RangeCheck. But they can be removed with:
+//       - RCE (RangeCheck Elimination): the RangeChecks in the loop are hoisted out and before the loop,
+//         and possibly no RangeChecks remain between the stores.
+//       - RangeCheck smearing: the earlier RangeChecks are adjusted such that they cover later RangeChecks,
+//         and those later RangeChecks can be removed. Example:
+//
+//           RangeCheck[i+0]                         RangeCheck[i+0] <- before first store
+//           StoreB[i+0]                             StoreB[i+0]     <- first store
+//           RangeCheck[i+1] --> smeared -->         RangeCheck[i+3] <- only RC between first and last store
+//           StoreB[i+1]                             StoreB[i+1]     <- second store
+//           RangeCheck[i+2] --> removed
+//           StoreB[i+2]                             StoreB[i+2]
+//           RangeCheck[i+3] --> removed
+//           StoreB[i+3]                             StoreB[i+3]     <- last store
+//
+// Thus, it is a common pattern that between the first and last store in a chain
+// of adjacent stores there remains exactly one RangeCheck, located between the
+// first and the second store (e.g. RangeCheck[i+3]).
+//
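For orientation, this is the shape of Java code the new pass targets: adjacent byte stores of shifted slices of one value, which on a little-endian platform can collapse into a single StoreI. A minimal sketch (mirroring the test cases added later in this change):

```java
public class MergeablePattern {
    // Four adjacent byte stores of shifted slices of the same value:
    // candidates for merging into a single 4-byte store.
    static void putIntLE(byte[] a, int offset, int v) {
        a[offset + 0] = (byte) (v >>  0);
        a[offset + 1] = (byte) (v >>  8);
        a[offset + 2] = (byte) (v >> 16);
        a[offset + 3] = (byte) (v >> 24);
    }

    public static void main(String[] args) {
        byte[] a = new byte[8];
        putIntLE(a, 0, 0xdeadbeef);
        // Lowest byte lands at the lowest address: prints "ef be ad de".
        System.out.printf("%02x %02x %02x %02x%n",
                          a[0] & 0xff, a[1] & 0xff, a[2] & 0xff, a[3] & 0xff);
    }
}
```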
+class MergePrimitiveArrayStores : public StackObj {
+private:
+  PhaseGVN* _phase;
+  StoreNode* _store;
+
+public:
+  MergePrimitiveArrayStores(PhaseGVN* phase, StoreNode* store) : _phase(phase), _store(store) {}
+
+  StoreNode* run();
+
+private:
+  bool is_compatible_store(const StoreNode* other_store) const;
+  bool is_adjacent_pair(const StoreNode* use_store, const StoreNode* def_store) const;
+  bool is_adjacent_input_pair(const Node* n1, const Node* n2, const int memory_size) const;
+  static bool is_con_RShift(const Node* n, Node const*& base_out, jint& shift_out);
+  enum CFGStatus { SuccessNoRangeCheck, SuccessWithRangeCheck, Failure };
+  static CFGStatus cfg_status_for_pair(const StoreNode* use_store, const StoreNode* def_store);
+
+  class Status {
+  private:
+    StoreNode* _found_store;
+    bool       _found_range_check;
+
+    Status(StoreNode* found_store, bool found_range_check)
+      : _found_store(found_store), _found_range_check(found_range_check) {}
+
+  public:
+    StoreNode* found_store() const { return _found_store; }
+    bool found_range_check() const { return _found_range_check; }
+    static Status make_failure() { return Status(nullptr, false); }
+
+    static Status make(StoreNode* found_store, const CFGStatus cfg_status) {
+      if (cfg_status == CFGStatus::Failure) {
+        return Status::make_failure();
+      }
+      return Status(found_store, cfg_status == CFGStatus::SuccessWithRangeCheck);
+    }
+  };
+
+  Status find_adjacent_use_store(const StoreNode* def_store) const;
+  Status find_adjacent_def_store(const StoreNode* use_store) const;
+  Status find_use_store(const StoreNode* def_store) const;
+  Status find_def_store(const StoreNode* use_store) const;
+  Status find_use_store_unidirectional(const StoreNode* def_store) const;
+  Status find_def_store_unidirectional(const StoreNode* use_store) const;
+
+  void collect_merge_list(Node_List& merge_list) const;
+  Node* make_merged_input_value(const Node_List& merge_list);
+  StoreNode* make_merged_store(const Node_List& merge_list, Node* merged_input_value);
+
+  DEBUG_ONLY( void trace(const Node_List& merge_list, const Node* merged_input_value, const StoreNode* merged_store) const; )
+};
+
+StoreNode* MergePrimitiveArrayStores::run() {
+  // Check for B/S/C/I
+  int opc = _store->Opcode();
+  if (opc != Op_StoreB && opc != Op_StoreC && opc != Op_StoreI) {
+    return nullptr;
+  }
+
+  // Only merge stores on arrays, and the stores must have the same size as the elements.
+  const TypeAryPtr* aryptr_t = _store->adr_type()->isa_aryptr();
+  if (aryptr_t == nullptr ||
+      type2aelembytes(aryptr_t->elem()->array_element_basic_type()) != _store->memory_size()) {
+    return nullptr;
+  }
+
+  // The _store must be the "last" store in a chain. If we find a use we could merge with
+  // then that use or a store further down is the "last" store.
+ Status status_use = find_adjacent_use_store(_store); + if (status_use.found_store() != nullptr) { + return nullptr; + } + + // Check if we can merge with at least one def, so that we have at least 2 stores to merge. + Status status_def = find_adjacent_def_store(_store); + if (status_def.found_store() == nullptr) { + return nullptr; + } + + ResourceMark rm; + Node_List merge_list; + collect_merge_list(merge_list); + + Node* merged_input_value = make_merged_input_value(merge_list); + if (merged_input_value == nullptr) { return nullptr; } + + StoreNode* merged_store = make_merged_store(merge_list, merged_input_value); + + DEBUG_ONLY( if(TraceMergeStores) { trace(merge_list, merged_input_value, merged_store); } ) + + return merged_store; +} + +// Check compatibility between _store and other_store. +bool MergePrimitiveArrayStores::is_compatible_store(const StoreNode* other_store) const { + int opc = _store->Opcode(); + assert(opc == Op_StoreB || opc == Op_StoreC || opc == Op_StoreI, "precondition"); + assert(_store->adr_type()->isa_aryptr() != nullptr, "must be array store"); + + if (other_store == nullptr || + _store->Opcode() != other_store->Opcode() || + other_store->adr_type()->isa_aryptr() == nullptr) { + return false; + } + + // Check that the size of the stores, and the array elements are all the same. + const TypeAryPtr* aryptr_t1 = _store->adr_type()->is_aryptr(); + const TypeAryPtr* aryptr_t2 = other_store->adr_type()->is_aryptr(); + int size1 = type2aelembytes(aryptr_t1->elem()->array_element_basic_type()); + int size2 = type2aelembytes(aryptr_t2->elem()->array_element_basic_type()); + if (size1 != size2 || + size1 != _store->memory_size() || + _store->memory_size() != other_store->memory_size()) { + return false; + } + return true; +} + +bool MergePrimitiveArrayStores::is_adjacent_pair(const StoreNode* use_store, const StoreNode* def_store) const { + if (!is_adjacent_input_pair(def_store->in(MemNode::ValueIn), + use_store->in(MemNode::ValueIn), + def_store->memory_size())) { + return false; + } + + ResourceMark rm; + ArrayPointer array_pointer_use = ArrayPointer::make(_phase, use_store->in(MemNode::Address)); + ArrayPointer array_pointer_def = ArrayPointer::make(_phase, def_store->in(MemNode::Address)); + if (!array_pointer_def.is_adjacent_to_and_before(array_pointer_use, use_store->memory_size())) { + return false; + } + + return true; +} + +bool MergePrimitiveArrayStores::is_adjacent_input_pair(const Node* n1, const Node* n2, const int memory_size) const { + // Pattern: [n1 = ConI, n2 = ConI] + if (n1->Opcode() == Op_ConI) { + return n2->Opcode() == Op_ConI; + } + + // Pattern: [n1 = base >> shift, n2 = base >> (shift + memory_size)] + Node const* base_n2; + jint shift_n2; + if (!is_con_RShift(n2, base_n2, shift_n2)) { + return false; + } + if (n1->Opcode() == Op_ConvL2I) { + // look through + n1 = n1->in(1); + } + Node const* base_n1; + jint shift_n1; + if (n1 == base_n2) { + // n1 = base = base >> 0 + base_n1 = n1; + shift_n1 = 0; + } else if (!is_con_RShift(n1, base_n1, shift_n1)) { + return false; + } + int bits_per_store = memory_size * 8; + if (base_n1 != base_n2 || + shift_n1 + bits_per_store != shift_n2 || + shift_n1 % bits_per_store != 0) { + return false; + } + + // both load from same value with correct shift + return true; +} + +// Detect pattern: n = base_out >> shift_out +bool MergePrimitiveArrayStores::is_con_RShift(const Node* n, Node const*& base_out, jint& shift_out) { + assert(n != nullptr, "precondition"); + + int opc = n->Opcode(); + if (opc == Op_ConvL2I) { 
+ n = n->in(1); + opc = n->Opcode(); + } + + if ((opc == Op_RShiftI || + opc == Op_RShiftL || + opc == Op_URShiftI || + opc == Op_URShiftL) && + n->in(2)->is_ConI()) { + base_out = n->in(1); + shift_out = n->in(2)->get_int(); + assert(shift_out >= 0, "must be positive"); + return true; + } + return false; +} + +// Check if there is nothing between the two stores, except optionally a RangeCheck leading to an uncommon trap. +MergePrimitiveArrayStores::CFGStatus MergePrimitiveArrayStores::cfg_status_for_pair(const StoreNode* use_store, const StoreNode* def_store) { + assert(use_store->in(MemNode::Memory) == def_store, "use-def relationship"); + + Node* ctrl_use = use_store->in(MemNode::Control); + Node* ctrl_def = def_store->in(MemNode::Control); + if (ctrl_use == nullptr || ctrl_def == nullptr) { + return CFGStatus::Failure; + } + + if (ctrl_use == ctrl_def) { + // Same ctrl -> no RangeCheck in between. + // Check: use_store must be the only use of def_store. + if (def_store->outcnt() > 1) { + return CFGStatus::Failure; + } + return CFGStatus::SuccessNoRangeCheck; + } + + // Different ctrl -> could have RangeCheck in between. + // Check: 1. def_store only has these uses: use_store and MergeMem for uncommon trap, and + // 2. ctrl separated by RangeCheck. + if (def_store->outcnt() != 2) { + return CFGStatus::Failure; // Cannot have exactly these uses: use_store and MergeMem for uncommon trap. + } + int use_store_out_idx = def_store->raw_out(0) == use_store ? 0 : 1; + Node* merge_mem = def_store->raw_out(1 - use_store_out_idx)->isa_MergeMem(); + if (merge_mem == nullptr || + merge_mem->outcnt() != 1) { + return CFGStatus::Failure; // Does not have MergeMem for uncommon trap. + } + if (!ctrl_use->is_IfProj() || + !ctrl_use->in(0)->is_RangeCheck() || + ctrl_use->in(0)->outcnt() != 2) { + return CFGStatus::Failure; // Not RangeCheck. + } + ProjNode* other_proj = ctrl_use->as_IfProj()->other_if_proj(); + Node* trap = other_proj->is_uncommon_trap_proj(Deoptimization::Reason_range_check); + if (trap != merge_mem->unique_out() || + ctrl_use->in(0)->in(0) != ctrl_def) { + return CFGStatus::Failure; // Not RangeCheck with merge_mem leading to uncommon trap. 
+ } + + return CFGStatus::SuccessWithRangeCheck; +} + +MergePrimitiveArrayStores::Status MergePrimitiveArrayStores::find_adjacent_use_store(const StoreNode* def_store) const { + Status status_use = find_use_store(def_store); + StoreNode* use_store = status_use.found_store(); + if (use_store != nullptr && !is_adjacent_pair(use_store, def_store)) { + return Status::make_failure(); + } + return status_use; +} + +MergePrimitiveArrayStores::Status MergePrimitiveArrayStores::find_adjacent_def_store(const StoreNode* use_store) const { + Status status_def = find_def_store(use_store); + StoreNode* def_store = status_def.found_store(); + if (def_store != nullptr && !is_adjacent_pair(use_store, def_store)) { + return Status::make_failure(); + } + return status_def; +} + +MergePrimitiveArrayStores::Status MergePrimitiveArrayStores::find_use_store(const StoreNode* def_store) const { + Status status_use = find_use_store_unidirectional(def_store); + +#ifdef ASSERT + StoreNode* use_store = status_use.found_store(); + if (use_store != nullptr) { + Status status_def = find_def_store_unidirectional(use_store); + assert(status_def.found_store() == def_store && + status_def.found_range_check() == status_use.found_range_check(), + "find_use_store and find_def_store must be symmetric"); + } +#endif + + return status_use; +} + +MergePrimitiveArrayStores::Status MergePrimitiveArrayStores::find_def_store(const StoreNode* use_store) const { + Status status_def = find_def_store_unidirectional(use_store); + +#ifdef ASSERT + StoreNode* def_store = status_def.found_store(); + if (def_store != nullptr) { + Status status_use = find_use_store_unidirectional(def_store); + assert(status_use.found_store() == use_store && + status_use.found_range_check() == status_def.found_range_check(), + "find_use_store and find_def_store must be symmetric"); + } +#endif + + return status_def; +} + +MergePrimitiveArrayStores::Status MergePrimitiveArrayStores::find_use_store_unidirectional(const StoreNode* def_store) const { + assert(is_compatible_store(def_store), "precondition: must be compatible with _store"); + + for (DUIterator_Fast imax, i = def_store->fast_outs(imax); i < imax; i++) { + StoreNode* use_store = def_store->fast_out(i)->isa_Store(); + if (is_compatible_store(use_store)) { + return Status::make(use_store, cfg_status_for_pair(use_store, def_store)); + } + } + + return Status::make_failure(); +} + +MergePrimitiveArrayStores::Status MergePrimitiveArrayStores::find_def_store_unidirectional(const StoreNode* use_store) const { + assert(is_compatible_store(use_store), "precondition: must be compatible with _store"); + + StoreNode* def_store = use_store->in(MemNode::Memory)->isa_Store(); + if (!is_compatible_store(def_store)) { + return Status::make_failure(); + } + + return Status::make(def_store, cfg_status_for_pair(use_store, def_store)); +} + +void MergePrimitiveArrayStores::collect_merge_list(Node_List& merge_list) const { + // The merged store can be at most 8 bytes. + const uint merge_list_max_size = 8 / _store->memory_size(); + assert(merge_list_max_size >= 2 && + merge_list_max_size <= 8 && + is_power_of_2(merge_list_max_size), + "must be 2, 4 or 8"); + + // Traverse up the chain of adjacent def stores. + StoreNode* current = _store; + merge_list.push(current); + while (current != nullptr && merge_list.size() < merge_list_max_size) { + Status status = find_adjacent_def_store(current); + current = status.found_store(); + if (current != nullptr) { + merge_list.push(current); + + // We can have at most one RangeCheck. 
+      if (status.found_range_check()) {
+        break;
+      }
+    }
+  }
+
+  // Truncate the merge_list to a power of 2.
+  const uint pow2size = round_down_power_of_2(merge_list.size());
+  assert(pow2size >= 2, "must be merging at least 2 stores");
+  while (merge_list.size() > pow2size) { merge_list.pop(); }
+}
+
+// Merge the input values of the smaller stores to a single larger input value.
+Node* MergePrimitiveArrayStores::make_merged_input_value(const Node_List& merge_list) {
+  int new_memory_size = _store->memory_size() * merge_list.size();
+  Node* first = merge_list.at(merge_list.size()-1);
+  Node* merged_input_value = nullptr;
+  if (_store->in(MemNode::ValueIn)->Opcode() == Op_ConI) {
+    // Pattern: [ConI, ConI, ...] -> new constant
+    jlong con = 0;
+    jlong bits_per_store = _store->memory_size() * 8;
+    jlong mask = (((jlong)1) << bits_per_store) - 1;
+    for (uint i = 0; i < merge_list.size(); i++) {
+      jlong con_i = merge_list.at(i)->in(MemNode::ValueIn)->get_int();
+      con = con << bits_per_store;
+      con = con | (mask & con_i);
+    }
+    merged_input_value = _phase->longcon(con);
+  } else {
+    // Pattern: [base >> 24, base >> 16, base >> 8, base] -> base
+    //             |                                  |
+    //           _store                             first
+    //
+    merged_input_value = first->in(MemNode::ValueIn);
+    Node const* base_last;
+    jint shift_last;
+    bool is_true = is_con_RShift(_store->in(MemNode::ValueIn), base_last, shift_last);
+    assert(is_true, "must detect con RShift");
+    if (merged_input_value != base_last && merged_input_value->Opcode() == Op_ConvL2I) {
+      // look through
+      merged_input_value = merged_input_value->in(1);
+    }
+    if (merged_input_value != base_last) {
+      // merged_input_value is not the base
+      return nullptr;
+    }
+  }
+
+  if (_phase->type(merged_input_value)->isa_long() != nullptr && new_memory_size <= 4) {
+    // Example:
+    //
+    //   long base = ...;
+    //   a[0] = (byte)(base >> 0);
+    //   a[1] = (byte)(base >> 8);
+    //
+    merged_input_value = _phase->transform(new ConvL2INode(merged_input_value));
+  }
+
+  assert((_phase->type(merged_input_value)->isa_int()  != nullptr && new_memory_size <= 4) ||
+         (_phase->type(merged_input_value)->isa_long() != nullptr && new_memory_size == 8),
+         "merged_input_value is either int or long, and new_memory_size is small enough");
+
+  return merged_input_value;
+}
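The all-constant branch above packs the per-store constants into one wide little-endian constant; a standalone Java model of that folding loop (hypothetical names, for illustration only):

```java
public class PackConstants {
    // Model of make_merged_input_value for the all-constant case:
    // the merge list is visited from the last store down to the first, so the
    // first store's value ends up in the lowest bits (little-endian layout).
    static long pack(int[] values, int bytesPerStore) {
        long bitsPerStore = bytesPerStore * 8L;
        long mask = (1L << bitsPerStore) - 1;
        long con = 0;
        for (int v : values) {
            con = (con << bitsPerStore) | (v & mask);
        }
        return con;
    }

    public static void main(String[] args) {
        // merge_list order: last store first, as in collect_merge_list.
        int[] byteConstants = { 0xde, 0xad, 0xbe, 0xef }; // a[3], a[2], a[1], a[0]
        System.out.printf("%x%n", pack(byteConstants, 1)); // prints deadbeef
    }
}
```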
+
+// Schematically: before, the chain first_store (inputs first_ctrl, first_mem,
+// first_adr, v1), second_store (a2, v2), ..., last_store (= _store, an, vn) is
+// connected through the memory edges, with an optional RangeCheck (branching to
+// unc_trap) between first_store and second_store. After, a single merged_store
+// takes last_ctrl (after the optional RangeCheck), first_mem and first_adr, and
+// stores merged_input_value = [v1 v2 ... vn]; first_store is kept alive because
+// its memory state still feeds the uncommon trap path.
+//
+StoreNode* MergePrimitiveArrayStores::make_merged_store(const Node_List& merge_list, Node* merged_input_value) {
+  Node* first_store = merge_list.at(merge_list.size()-1);
+  Node* last_ctrl   = _store->in(MemNode::Control); // after (optional) RangeCheck
+  Node* first_mem   = first_store->in(MemNode::Memory);
+  Node* first_adr   = first_store->in(MemNode::Address);
+
+  const TypePtr* new_adr_type = _store->adr_type();
+
+  int new_memory_size = _store->memory_size() * merge_list.size();
+  BasicType bt = T_ILLEGAL;
+  switch (new_memory_size) {
+    case 2: bt = T_SHORT; break;
+    case 4: bt = T_INT;   break;
+    case 8: bt = T_LONG;  break;
+  }
+
+  StoreNode* merged_store = StoreNode::make(*_phase, last_ctrl, first_mem, first_adr,
+                                            new_adr_type, merged_input_value, bt, MemNode::unordered);
+
+  // Marking the store mismatched is sufficient to prevent reordering, since array stores
+  // are all on the same slice. Hence, we need no barriers.
+  merged_store->set_mismatched_access();
+
+  // Constants above may now also be packed -> put candidate on worklist
+  _phase->is_IterGVN()->_worklist.push(first_mem);
+
+  return merged_store;
+}
+
+#ifdef ASSERT
+void MergePrimitiveArrayStores::trace(const Node_List& merge_list, const Node* merged_input_value, const StoreNode* merged_store) const {
+  stringStream ss;
+  ss.print_cr("[TraceMergeStores]: Replace");
+  for (int i = (int)merge_list.size() - 1; i >= 0; i--) {
+    merge_list.at(i)->dump("\n", false, &ss);
+  }
+  ss.print_cr("[TraceMergeStores]: with");
+  merged_input_value->dump("\n", false, &ss);
+  merged_store->dump("\n", false, &ss);
+  tty->print("%s", ss.as_string());
+}
+#endif
+
 //------------------------------Ideal------------------------------------------
 // Change back-to-back Store(, p, x) -> Store(m, p, y) to Store(m, p, x).
// When a store immediately follows a relevant allocation/initialization, @@ -2770,6 +3447,18 @@ Node *StoreNode::Ideal(PhaseGVN *phase, bool can_reshape) { } } +#ifdef VM_LITTLE_ENDIAN + if (MergeStores && UseUnalignedAccesses) { + if (phase->C->post_loop_opts_phase()) { + MergePrimitiveArrayStores merge(phase, this); + Node* progress = merge.run(); + if (progress != nullptr) { return progress; } + } else { + phase->C->record_for_post_loop_opts_igvn(this); + } + } +#endif + return nullptr; // No further progress } diff --git a/src/hotspot/share/opto/phaseX.cpp b/src/hotspot/share/opto/phaseX.cpp index c791146f75769..1a1e4e04e16e1 100644 --- a/src/hotspot/share/opto/phaseX.cpp +++ b/src/hotspot/share/opto/phaseX.cpp @@ -2273,7 +2273,15 @@ void PhasePeephole::print_statistics() { //------------------------------set_req_X-------------------------------------- void Node::set_req_X( uint i, Node *n, PhaseIterGVN *igvn ) { assert( is_not_dead(n), "can not use dead node"); - assert( igvn->hash_find(this) != this, "Need to remove from hash before changing edges" ); +#ifdef ASSERT + if (igvn->hash_find(this) == this) { + tty->print_cr("Need to remove from hash before changing edges"); + this->dump(1); + tty->print_cr("Set at i = %d", i); + n->dump(); + assert(false, "Need to remove from hash before changing edges"); + } +#endif Node *old = in(i); set_req(i, n); diff --git a/test/hotspot/jtreg/compiler/c2/TestMergeStores.java b/test/hotspot/jtreg/compiler/c2/TestMergeStores.java new file mode 100644 index 0000000000000..0af46b56a56c0 --- /dev/null +++ b/test/hotspot/jtreg/compiler/c2/TestMergeStores.java @@ -0,0 +1,1247 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */
+
+package compiler.c2;
+
+import compiler.lib.ir_framework.*;
+import jdk.test.lib.Utils;
+import jdk.internal.misc.Unsafe;
+import java.lang.reflect.Array;
+import java.util.Map;
+import java.util.HashMap;
+import java.util.Random;
+
+/*
+ * @test
+ * @bug 8318446
+ * @summary Test merging of consecutive stores
+ * @modules java.base/jdk.internal.misc
+ * @library /test/lib /
+ * @run main compiler.c2.TestMergeStores aligned
+ */
+
+/*
+ * @test
+ * @bug 8318446
+ * @summary Test merging of consecutive stores
+ * @modules java.base/jdk.internal.misc
+ * @library /test/lib /
+ * @run main compiler.c2.TestMergeStores unaligned
+ */
+
+public class TestMergeStores {
+    static int RANGE = 1000;
+    private static final Unsafe UNSAFE = Unsafe.getUnsafe();
+    private static final Random RANDOM = Utils.getRandomInstance();
+
+    // Inputs
+    byte[] aB = new byte[RANGE];
+    byte[] bB = new byte[RANGE];
+    short[] aS = new short[RANGE];
+    short[] bS = new short[RANGE];
+    int[] aI = new int[RANGE];
+    int[] bI = new int[RANGE];
+    long[] aL = new long[RANGE];
+    long[] bL = new long[RANGE];
+
+    int offset1;
+    int offset2;
+    byte vB1;
+    byte vB2;
+    short vS1;
+    short vS2;
+    int vI1;
+    int vI2;
+    long vL1;
+    long vL2;
+
+    interface TestFunction {
+        Object[] run(boolean isWarmUp, int rnd);
+    }
+
+    Map<String, Map<String, TestFunction>> testGroups = new HashMap<String, Map<String, TestFunction>>();
+
+    public static void main(String[] args) {
+        TestFramework framework = new TestFramework(TestMergeStores.class);
+        framework.addFlags("--add-modules", "java.base", "--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED");
+
+        switch (args[0]) {
+            case "aligned"   -> { framework.addFlags("-XX:-UseUnalignedAccesses"); }
+            case "unaligned" -> { framework.addFlags("-XX:+UseUnalignedAccesses"); }
+            default -> { throw new RuntimeException("Test argument not recognized: " + args[0]); }
+        }
+        framework.start();
+    }
+
+    public TestMergeStores() {
+        testGroups.put("test1", new HashMap<String,TestFunction>());
+        testGroups.get("test1").put("test1R", (_,_) -> { return test1R(aB.clone()); });
+        testGroups.get("test1").put("test1a", (_,_) -> { return test1a(aB.clone()); });
+        testGroups.get("test1").put("test1b", (_,_) -> { return test1b(aB.clone()); });
+        testGroups.get("test1").put("test1c", (_,_) -> { return test1c(aB.clone()); });
+        testGroups.get("test1").put("test1d", (_,_) -> { return test1d(aB.clone()); });
+        testGroups.get("test1").put("test1e", (_,_) -> { return test1e(aB.clone()); });
+        testGroups.get("test1").put("test1f", (_,_) -> { return test1f(aB.clone()); });
+        testGroups.get("test1").put("test1g", (_,_) -> { return test1g(aB.clone()); });
+        testGroups.get("test1").put("test1h", (_,_) -> { return test1h(aB.clone()); });
+        testGroups.get("test1").put("test1i", (_,_) -> { return test1i(aB.clone()); });
+
+        testGroups.put("test2", new HashMap<String,TestFunction>());
+        testGroups.get("test2").put("test2R", (_,_) -> { return test2R(aB.clone(), offset1, vL1); });
+        testGroups.get("test2").put("test2a", (_,_) -> { return test2a(aB.clone(), offset1, vL1); });
+        testGroups.get("test2").put("test2b", (_,_) -> { return test2b(aB.clone(), offset1, vL1); });
+        testGroups.get("test2").put("test2c", (_,_) -> { return test2c(aB.clone(), offset1, vL1); });
+        testGroups.get("test2").put("test2d", (_,_) -> { return test2d(aB.clone(), offset1, vL1); });
+        testGroups.get("test2").put("test2e", (_,_) -> { return test2e(aB.clone(), offset1, vL1); });
+
+        testGroups.put("test3", new HashMap<String,TestFunction>());
+        testGroups.get("test3").put("test3R", (_,_) -> { return test3R(aB.clone(), offset1, vL1); });
+        testGroups.get("test3").put("test3a", (_,_) -> { return test3a(aB.clone(), offset1, vL1); });
+
+        testGroups.put("test4", new HashMap<String,TestFunction>());
+        testGroups.get("test4").put("test4R", (_,_) -> { return test4R(aB.clone(), offset1, vL1, vI1, vS1, vB1); });
+        testGroups.get("test4").put("test4a", (_,_) -> { return test4a(aB.clone(), offset1, vL1, vI1, vS1, vB1); });
+
+        testGroups.put("test5", new HashMap<String,TestFunction>());
+        testGroups.get("test5").put("test5R", (_,_) -> { return test5R(aB.clone(), offset1); });
+        testGroups.get("test5").put("test5a", (_,_) -> { return test5a(aB.clone(), offset1); });
+
+        testGroups.put("test6", new HashMap<String,TestFunction>());
+        testGroups.get("test6").put("test6R", (_,_) -> { return test6R(aB.clone(), bB.clone(), offset1, offset2); });
+        testGroups.get("test6").put("test6a", (_,_) -> { return test6a(aB.clone(), bB.clone(), offset1, offset2); });
+
+        testGroups.put("test7", new HashMap<String,TestFunction>());
+        testGroups.get("test7").put("test7R", (_,_) -> { return test7R(aB.clone(), offset1, vI1); });
+        testGroups.get("test7").put("test7a", (_,_) -> { return test7a(aB.clone(), offset1, vI1); });
+
+        testGroups.put("test100", new HashMap<String,TestFunction>());
+        testGroups.get("test100").put("test100R", (_,_) -> { return test100R(aS.clone(), offset1); });
+        testGroups.get("test100").put("test100a", (_,_) -> { return test100a(aS.clone(), offset1); });
+
+        testGroups.put("test101", new HashMap<String,TestFunction>());
+        testGroups.get("test101").put("test101R", (_,_) -> { return test101R(aS.clone(), offset1); });
+        testGroups.get("test101").put("test101a", (_,_) -> { return test101a(aS.clone(), offset1); });
+
+        testGroups.put("test102", new HashMap<String,TestFunction>());
+        testGroups.get("test102").put("test102R", (_,_) -> { return test102R(aS.clone(), offset1, vL1, vI1, vS1); });
+        testGroups.get("test102").put("test102a", (_,_) -> { return test102a(aS.clone(), offset1, vL1, vI1, vS1); });
+
+        testGroups.put("test200", new HashMap<String,TestFunction>());
+        testGroups.get("test200").put("test200R", (_,_) -> { return test200R(aI.clone(), offset1); });
+        testGroups.get("test200").put("test200a", (_,_) -> { return test200a(aI.clone(), offset1); });
+
+        testGroups.put("test201", new HashMap<String,TestFunction>());
+        testGroups.get("test201").put("test201R", (_,_) -> { return test201R(aI.clone(), offset1); });
+        testGroups.get("test201").put("test201a", (_,_) -> { return test201a(aI.clone(), offset1); });
+
+        testGroups.put("test202", new HashMap<String,TestFunction>());
+        testGroups.get("test202").put("test202R", (_,_) -> { return test202R(aI.clone(), offset1, vL1, vI1); });
+        testGroups.get("test202").put("test202a", (_,_) -> { return test202a(aI.clone(), offset1, vL1, vI1); });
+
+        testGroups.put("test300", new HashMap<String,TestFunction>());
+        testGroups.get("test300").put("test300R", (_,_) -> { return test300R(aI.clone()); });
+        testGroups.get("test300").put("test300a", (_,_) -> { return test300a(aI.clone()); });
+
+        testGroups.put("test400", new HashMap<String,TestFunction>());
+        testGroups.get("test400").put("test400R", (_,_) -> { return test400R(aI.clone()); });
+        testGroups.get("test400").put("test400a", (_,_) -> { return test400a(aI.clone()); });
+
+        testGroups.put("test500", new HashMap<String,TestFunction>());
+        testGroups.get("test500").put("test500R", (_,_) -> { return test500R(aB.clone(), offset1, vL1); });
+        testGroups.get("test500").put("test500a", (_,_) -> { return test500a(aB.clone(), offset1, vL1); });
+
+        testGroups.put("test501", new HashMap<String,TestFunction>());
+        testGroups.get("test501").put("test500R", (_,i) -> { return test500R(aB.clone(), RANGE - 20 + (i % 30), vL1); });
+        testGroups.get("test501").put("test501a", (_,i) -> { return test501a(aB.clone(), RANGE - 20 + (i % 30), vL1); });
+        //                                                                  +-------------------+
+        // Create offsets that are sometimes going to pass all RangeChecks, and sometimes one, and sometimes none.
+        // Consequence: all RangeChecks stay in the final compilation.
+
+        testGroups.put("test502", new HashMap<String,TestFunction>());
+        testGroups.get("test502").put("test500R", (w,i) -> { return test500R(aB.clone(), w ? offset1 : RANGE - 20 + (i % 30), vL1); });
+        testGroups.get("test502").put("test502a", (w,i) -> { return test502a(aB.clone(), w ? offset1 : RANGE - 20 + (i % 30), vL1); });
+        //                                                                  +-----+             +-------------------+
+        // First use something in range, and after warmup randomize going outside the range.
+        // Consequence: all RangeChecks stay in the final compilation.
+    }
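The (isWarmUp, rnd) parameters of TestFunction let a group vary its inputs between warmup and measurement, which is how test501/test502 above keep their RangeChecks alive; a minimal standalone sketch of the idea (hypothetical names, no IR framework):

```java
public class WarmupVariation {
    interface TestFunction {
        Object[] run(boolean isWarmUp, int rnd);
    }

    public static void main(String[] args) {
        // During warmup stay in bounds; afterwards, sometimes go out of
        // bounds so the RangeChecks cannot be removed based on profiling.
        TestFunction f = (isWarmUp, rnd) -> {
            int offset = isWarmUp ? 0 : 1000 - 20 + (rnd % 30);
            byte[] a = new byte[1000];
            try {
                a[offset] = 42;
            } catch (ArrayIndexOutOfBoundsException e) {
                // expected for some post-warmup offsets
            }
            return new Object[]{ a };
        };
        f.run(true, 0);   // warmup: always in range
        f.run(false, 25); // measurement: may be out of range
    }
}
```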
+
+    @Warmup(100)
+    @Run(test = {"test1a",
+                 "test1b",
+                 "test1c",
+                 "test1d",
+                 "test1e",
+                 "test1f",
+                 "test1g",
+                 "test1h",
+                 "test1i",
+                 "test2a",
+                 "test2b",
+                 "test2c",
+                 "test2d",
+                 "test2e",
+                 "test3a",
+                 "test4a",
+                 "test5a",
+                 "test6a",
+                 "test7a",
+                 "test100a",
+                 "test101a",
+                 "test102a",
+                 "test200a",
+                 "test201a",
+                 "test202a",
+                 "test300a",
+                 "test400a",
+                 "test500a",
+                 "test501a",
+                 "test502a"})
+    public void runTests(RunInfo info) {
+        // Repeat many times, so that we also have multiple iterations for post-warmup to potentially recompile
+        int iters = info.isWarmUp() ? 1_000 : 50_000;
+        for (int iter = 0; iter < iters; iter++) {
+            // Write random values to inputs
+            set_random(aB);
+            set_random(bB);
+            set_random(aS);
+            set_random(bS);
+            set_random(aI);
+            set_random(bI);
+            set_random(aL);
+            set_random(bL);
+
+            offset1 = Math.abs(RANDOM.nextInt()) % 100;
+            offset2 = Math.abs(RANDOM.nextInt()) % 100;
+            vB1 = (byte)RANDOM.nextInt();
+            vB2 = (byte)RANDOM.nextInt();
+            vS1 = (short)RANDOM.nextInt();
+            vS2 = (short)RANDOM.nextInt();
+            vI1 = RANDOM.nextInt();
+            vI2 = RANDOM.nextInt();
+            vL1 = RANDOM.nextLong();
+            vL2 = RANDOM.nextLong();
+
+            // Run all tests
+            for (Map.Entry<String, Map<String, TestFunction>> group_entry : testGroups.entrySet()) {
+                String group_name = group_entry.getKey();
+                Map<String, TestFunction> group = group_entry.getValue();
+                Object[] gold = null;
+                String gold_name = "NONE";
+                for (Map.Entry<String, TestFunction> entry : group.entrySet()) {
+                    String name = entry.getKey();
+                    TestFunction test = entry.getValue();
+                    Object[] result = test.run(info.isWarmUp(), iter);
+                    if (gold == null) {
+                        gold = result;
+                        gold_name = name;
+                    } else {
+                        verify("group " + group_name + ", gold " + gold_name + ", test " + name, gold, result);
+                    }
+                }
+            }
+        }
+    }
+
+    static void verify(String name, Object[] gold, Object[] result) {
+        if (gold.length != result.length) {
+            throw new RuntimeException("verify " + name + ": not the same number of outputs: gold.length = " +
+                                       gold.length + ", result.length = " + result.length);
+        }
+        for (int i = 0; i < gold.length; i++) {
+            Object g = gold[i];
+            Object r = result[i];
+            if (g.getClass() != r.getClass() || !g.getClass().isArray() || !r.getClass().isArray()) {
+                throw new RuntimeException("verify " + name + ": must both be array of same type:" +
+                                           " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() +
+                                           " result[" + i + "].getClass() = " + r.getClass().getSimpleName());
+            }
+            if (g == r) {
+                throw new RuntimeException("verify " + name + ": should be two separate arrays (with identical content):" +
+                                           " gold[" + i + "] == result[" + i + "]");
+            }
+            if (Array.getLength(g) != Array.getLength(r)) {
+                throw new RuntimeException("verify " + name + ": arrays must have same length:" +
+                                           " gold[" + i + "].length = " + Array.getLength(g) +
+                                           " result[" + i + "].length = " + Array.getLength(r));
+            }
+            Class<?> c = g.getClass().getComponentType();
+            if (c == byte.class) {
+                verifyB(name,
i, (byte[])g, (byte[])r); + } else if (c == short.class) { + verifyS(name, i, (short[])g, (short[])r); + } else if (c == int.class) { + verifyI(name, i, (int[])g, (int[])r); + } else if (c == long.class) { + verifyL(name, i, (long[])g, (long[])r); + } else { + throw new RuntimeException("verify " + name + ": array type not supported for verify:" + + " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() + + " result[" + i + "].getClass() = " + r.getClass().getSimpleName()); + } + } + } + + static void verifyB(String name, int i, byte[] g, byte[] r) { + for (int j = 0; j < g.length; j++) { + if (g[j] != r[j]) { + throw new RuntimeException("verify " + name + ": arrays must have same content:" + + " gold[" + i + "][" + j + "] = " + g[j] + + " = " + String.format("%02X", g[j] & 0xFF) + + " result[" + i + "][" + j + "] = " + r[j] + + " = " + String.format("%02X", r[j] & 0xFF)); + } + } + } + + static void verifyS(String name, int i, short[] g, short[] r) { + for (int j = 0; j < g.length; j++) { + if (g[j] != r[j]) { + throw new RuntimeException("verify " + name + ": arrays must have same content:" + + " gold[" + i + "][" + j + "] = " + g[j] + + " result[" + i + "][" + j + "] = " + r[j]); + } + } + } + + static void verifyI(String name, int i, int[] g, int[] r) { + for (int j = 0; j < g.length; j++) { + if (g[j] != r[j]) { + throw new RuntimeException("verify " + name + ": arrays must have same content:" + + " gold[" + i + "][" + j + "] = " + g[j] + + " result[" + i + "][" + j + "] = " + r[j]); + } + } + } + + static void verifyL(String name, int i, long[] g, long[] r) { + for (int j = 0; j < g.length; j++) { + if (g[j] != r[j]) { + throw new RuntimeException("verify " + name + ": arrays must have same content:" + + " gold[" + i + "][" + j + "] = " + g[j] + + " result[" + i + "][" + j + "] = " + r[j]); + } + } + } + + static void set_random(byte[] a) { + for (int i = 0; i < a.length; i++) { + a[i] = (byte)RANDOM.nextInt(); + } + } + + static void set_random(short[] a) { + for (int i = 0; i < a.length; i++) { + a[i] = (short)RANDOM.nextInt(); + } + } + + static void set_random(int[] a) { + for (int i = 0; i < a.length; i++) { + a[i] = RANDOM.nextInt(); + } + } + + static void set_random(long[] a) { + for (int i = 0; i < a.length; i++) { + a[i] = RANDOM.nextLong(); + } + } + + // ------------------------------------------- + // ------- Little-Endian API ---------- + // ------------------------------------------- + // Note: I had to add @ForceInline because otherwise it would sometimes + // not inline nested method calls. 
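As an aside (not part of the patch): the same little-endian writes can be expressed with a byte-array view VarHandle, a standard JDK API that already JIT-compiles to a single wide store; the new pass aims to give hand-written byte-at-a-time helpers, like the ones below, the same code quality:

```java
import java.lang.invoke.MethodHandles;
import java.lang.invoke.VarHandle;
import java.nio.ByteOrder;

public class LEWriteSketch {
    // Little-endian int view over a byte[] (core JDK API).
    static final VarHandle INT_LE =
            MethodHandles.byteArrayViewVarHandle(int[].class, ByteOrder.LITTLE_ENDIAN);

    static void storeIntLE(byte[] bytes, int offset, int value) {
        INT_LE.set(bytes, offset, value); // one 4-byte store after JIT compilation
    }

    public static void main(String[] args) {
        byte[] a = new byte[8];
        storeIntLE(a, 0, 0xdeadbeef);
        System.out.printf("%02x %02x %02x %02x%n",
                          a[0] & 0xff, a[1] & 0xff, a[2] & 0xff, a[3] & 0xff);
    }
}
```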
+
+    // Store a short LE into an array using store bytes in an array
+    @ForceInline
+    static void storeShortLE(byte[] bytes, int offset, short value) {
+        storeBytes(bytes, offset, (byte)(value >> 0),
+                                  (byte)(value >> 8));
+    }
+
+    // Store an int LE into an array using store bytes in an array
+    @ForceInline
+    static void storeIntLE(byte[] bytes, int offset, int value) {
+        storeBytes(bytes, offset, (byte)(value >> 0 ),
+                                  (byte)(value >> 8 ),
+                                  (byte)(value >> 16),
+                                  (byte)(value >> 24));
+    }
+
+    // Store a long LE into an array using store bytes in an array
+    @ForceInline
+    static void storeLongLE(byte[] bytes, int offset, long value) {
+        storeBytes(bytes, offset, (byte)(value >> 0 ),
+                                  (byte)(value >> 8 ),
+                                  (byte)(value >> 16),
+                                  (byte)(value >> 24),
+                                  (byte)(value >> 32),
+                                  (byte)(value >> 40),
+                                  (byte)(value >> 48),
+                                  (byte)(value >> 56));
+    }
+
+    // Store 2 bytes into an array
+    @ForceInline
+    static void storeBytes(byte[] bytes, int offset, byte b0, byte b1) {
+        bytes[offset + 0] = b0;
+        bytes[offset + 1] = b1;
+    }
+
+    // Store 4 bytes into an array
+    @ForceInline
+    static void storeBytes(byte[] bytes, int offset, byte b0, byte b1, byte b2, byte b3) {
+        bytes[offset + 0] = b0;
+        bytes[offset + 1] = b1;
+        bytes[offset + 2] = b2;
+        bytes[offset + 3] = b3;
+    }
+
+    // Store 8 bytes into an array
+    @ForceInline
+    static void storeBytes(byte[] bytes, int offset, byte b0, byte b1, byte b2, byte b3,
+                                                     byte b4, byte b5, byte b6, byte b7) {
+        bytes[offset + 0] = b0;
+        bytes[offset + 1] = b1;
+        bytes[offset + 2] = b2;
+        bytes[offset + 3] = b3;
+        bytes[offset + 4] = b4;
+        bytes[offset + 5] = b5;
+        bytes[offset + 6] = b6;
+        bytes[offset + 7] = b7;
+    }
+
+    @DontCompile
+    static Object[] test1R(byte[] a) {
+        a[0] = (byte)0xbe;
+        a[1] = (byte)0xba;
+        a[2] = (byte)0xad;
+        a[3] = (byte)0xba;
+        a[4] = (byte)0xef;
+        a[5] = (byte)0xbe;
+        a[6] = (byte)0xad;
+        a[7] = (byte)0xde;
+        return new Object[]{ a };
+    }
+
+    @Test
+    @IR(counts = {IRNode.STORE_L_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "1"},
+        applyIf = {"UseUnalignedAccesses", "true"})
+    static Object[] test1a(byte[] a) {
+        a[0] = (byte)0xbe;
+        a[1] = (byte)0xba;
+        a[2] = (byte)0xad;
+        a[3] = (byte)0xba;
+        a[4] = (byte)0xef;
+        a[5] = (byte)0xbe;
+        a[6] = (byte)0xad;
+        a[7] = (byte)0xde;
+        return new Object[]{ a };
+    }
+
+    @Test
+    @IR(counts = {IRNode.STORE_L_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "1"},
+        applyIf = {"UseUnalignedAccesses", "true"})
+    static Object[] test1b(byte[] a) {
+        // Add custom null check, to ensure the unsafe access always recognizes its type as an array store
+        if (a == null) {return null;}
+        UNSAFE.putLongUnaligned(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET, 0xdeadbeefbaadbabeL);
+        return new Object[]{ a };
+    }
+
+    @Test
+    @IR(counts = {IRNode.STORE_L_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "1"},
+        applyIf = {"UseUnalignedAccesses", "true"})
+    static Object[] test1c(byte[] a) {
+        storeLongLE(a, 0, 0xdeadbeefbaadbabeL);
+        return new Object[]{ a };
+    }
+
+    @Test
+    @IR(counts = {IRNode.STORE_L_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "1"},
+        applyIf = {"UseUnalignedAccesses", "true"})
+    static Object[] test1d(byte[] a) {
+        storeIntLE(a, 0, 0xbaadbabe);
+        storeIntLE(a, 4, 0xdeadbeef);
+        return new Object[]{ a };
+    }
{"UseUnalignedAccesses", "true"}) + static Object[] test1e(byte[] a) { + storeShortLE(a, 0, (short)0xbabe); + storeShortLE(a, 2, (short)0xbaad); + storeShortLE(a, 4, (short)0xbeef); + storeShortLE(a, 6, (short)0xdead); + return new Object[]{ a }; + } + + @Test + @IR(counts = {IRNode.STORE_L_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "1"}, + applyIf = {"UseUnalignedAccesses", "true"}) + static Object[] test1f(byte[] a) { + UNSAFE.putByte(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 0, (byte)0xbe); + UNSAFE.putByte(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 1, (byte)0xba); + UNSAFE.putByte(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 2, (byte)0xad); + UNSAFE.putByte(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 3, (byte)0xba); + UNSAFE.putByte(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 4, (byte)0xef); + UNSAFE.putByte(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 5, (byte)0xbe); + UNSAFE.putByte(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 6, (byte)0xad); + UNSAFE.putByte(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 7, (byte)0xde); + return new Object[]{ a }; + } + + @Test + // Do not optimize these, just to be sure we do not mess with store ordering. + @IR(counts = {IRNode.STORE_L_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_B_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "8"}, + applyIf = {"UseUnalignedAccesses", "true"}) + static Object[] test1g(byte[] a) { + UNSAFE.putByteRelease(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 0, (byte)0xbe); + UNSAFE.putByteRelease(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 1, (byte)0xba); + UNSAFE.putByteRelease(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 2, (byte)0xad); + UNSAFE.putByteRelease(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 3, (byte)0xba); + UNSAFE.putByteRelease(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 4, (byte)0xef); + UNSAFE.putByteRelease(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 5, (byte)0xbe); + UNSAFE.putByteRelease(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 6, (byte)0xad); + UNSAFE.putByteRelease(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 7, (byte)0xde); + return new Object[]{ a }; + } + + @Test + // Do not optimize these, just to be sure we do not mess with store ordering. + @IR(counts = {IRNode.STORE_L_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_B_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "8"}, + applyIf = {"UseUnalignedAccesses", "true"}) + static Object[] test1h(byte[] a) { + UNSAFE.putByteVolatile(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 0, (byte)0xbe); + UNSAFE.putByteVolatile(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 1, (byte)0xba); + UNSAFE.putByteVolatile(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 2, (byte)0xad); + UNSAFE.putByteVolatile(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 3, (byte)0xba); + UNSAFE.putByteVolatile(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 4, (byte)0xef); + UNSAFE.putByteVolatile(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 5, (byte)0xbe); + UNSAFE.putByteVolatile(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 6, (byte)0xad); + UNSAFE.putByteVolatile(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 7, (byte)0xde); + return new Object[]{ a }; + } + + @Test + // Do not optimize these, just to be sure we do not mess with store ordering. 
+
+    @Test
+    // Do not optimize these, just to be sure we do not mess with store ordering.
+    @IR(counts = {IRNode.STORE_L_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "0",
+                  IRNode.STORE_B_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "8"},
+        applyIf = {"UseUnalignedAccesses", "true"})
+    static Object[] test1i(byte[] a) {
+        UNSAFE.putByteOpaque(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 0, (byte)0xbe);
+        UNSAFE.putByteOpaque(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 1, (byte)0xba);
+        UNSAFE.putByteOpaque(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 2, (byte)0xad);
+        UNSAFE.putByteOpaque(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 3, (byte)0xba);
+        UNSAFE.putByteOpaque(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 4, (byte)0xef);
+        UNSAFE.putByteOpaque(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 5, (byte)0xbe);
+        UNSAFE.putByteOpaque(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 6, (byte)0xad);
+        UNSAFE.putByteOpaque(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + 7, (byte)0xde);
+        return new Object[]{ a };
+    }
+
+    @DontCompile
+    static Object[] test2R(byte[] a, int offset, long v) {
+        a[offset + 0] = (byte)(v >> 0);
+        a[offset + 1] = (byte)(v >> 8);
+        a[offset + 2] = (byte)(v >> 16);
+        a[offset + 3] = (byte)(v >> 24);
+        a[offset + 4] = (byte)(v >> 32);
+        a[offset + 5] = (byte)(v >> 40);
+        a[offset + 6] = (byte)(v >> 48);
+        a[offset + 7] = (byte)(v >> 56);
+        return new Object[]{ a };
+    }
+
+    @Test
+    @IR(counts = {IRNode.STORE_L_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "1"},
+        applyIf = {"UseUnalignedAccesses", "true"})
+    static Object[] test2a(byte[] a, int offset, long v) {
+        a[offset + 0] = (byte)(v >> 0);
+        a[offset + 1] = (byte)(v >> 8);
+        a[offset + 2] = (byte)(v >> 16);
+        a[offset + 3] = (byte)(v >> 24);
+        a[offset + 4] = (byte)(v >> 32);
+        a[offset + 5] = (byte)(v >> 40);
+        a[offset + 6] = (byte)(v >> 48);
+        a[offset + 7] = (byte)(v >> 56);
+        return new Object[]{ a };
+    }
+
+    @Test
+    @IR(counts = {IRNode.STORE_L_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "1"},
+        applyIf = {"UseUnalignedAccesses", "true"})
+    static Object[] test2b(byte[] a, int offset, long v) {
+        // Add custom null check, to ensure the unsafe access always recognizes its type as an array store
+        if (a == null) {return null;}
+        UNSAFE.putLongUnaligned(a, UNSAFE.ARRAY_BYTE_BASE_OFFSET + offset, v);
+        return new Object[]{ a };
+    }
+
+    @Test
+    @IR(counts = {IRNode.STORE_L_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "1"},
+        applyIf = {"UseUnalignedAccesses", "true"})
+    static Object[] test2c(byte[] a, int offset, long v) {
+        storeLongLE(a, offset, v);
+        return new Object[]{ a };
+    }
+
+    @Test
+    // No optimization, casting long -> int -> byte does not work
+    static Object[] test2d(byte[] a, int offset, long v) {
+        storeIntLE(a, offset + 0, (int)(v >> 0));
+        storeIntLE(a, offset + 4, (int)(v >> 32));
+        return new Object[]{ a };
+    }
+
+    @Test
+    // No optimization, casting long -> short -> byte does not work
+    static Object[] test2e(byte[] a, int offset, long v) {
+        storeShortLE(a, offset + 0, (short)(v >> 0));
+        storeShortLE(a, offset + 2, (short)(v >> 16));
+        storeShortLE(a, offset + 4, (short)(v >> 32));
+        storeShortLE(a, offset + 6, (short)(v >> 48));
+        return new Object[]{ a };
+    }
+
+    @DontCompile
+    static Object[] test3R(byte[] a, int offset, long v) {
+        a[offset + 0] = (byte)(v >> 0);
+        a[offset + 1] = (byte)(v >> 8);
+        a[offset + 2] = (byte)(v >> 16);
+        a[offset + 3] = (byte)(v >> 24);
+        a[offset + 4] = (byte)(v >> 0);
+        a[offset + 5] = (byte)(v >> 8);
+        a[offset + 6] = (byte)(v >> 16);
+        a[offset + 7] = (byte)(v >> 24);
+        return new Object[]{ a };
+    }
+
+    @Test
+    @IR(counts = {IRNode.STORE_I_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "2"},
+        applyIf = {"UseUnalignedAccesses", "true"})
+    static Object[] test3a(byte[] a, int offset, long v) {
+        a[offset + 0] = (byte)(v >> 0);
+        a[offset + 1] = (byte)(v >> 8);
+        a[offset + 2] = (byte)(v >> 16);
+        a[offset + 3] = (byte)(v >> 24);
+        a[offset + 4] = (byte)(v >> 0);
+        a[offset + 5] = (byte)(v >> 8);
+        a[offset + 6] = (byte)(v >> 16);
+        a[offset + 7] = (byte)(v >> 24);
+        return new Object[]{ a };
+    }
+
+    @DontCompile
+    static Object[] test4R(byte[] a, int offset, long v1, int v2, short v3, byte v4) {
+        a[offset + 0] = (byte)0x00;
+        a[offset + 1] = (byte)0xFF;
+        a[offset + 2] = v4;
+        a[offset + 3] = (byte)0x42;
+        a[offset + 4] = (byte)(v1 >> 0);
+        a[offset + 5] = (byte)(v1 >> 8);
+        a[offset + 6] = (byte)0xAB;
+        a[offset + 7] = (byte)0xCD;
+        a[offset + 8] = (byte)0xEF;
+        a[offset + 9] = (byte)0x01;
+        a[offset + 10] = (byte)(v2 >> 0);
+        a[offset + 11] = (byte)(v2 >> 8);
+        a[offset + 12] = (byte)(v2 >> 16);
+        a[offset + 13] = (byte)(v2 >> 24);
+        a[offset + 14] = (byte)(v3 >> 0);
+        a[offset + 15] = (byte)(v3 >> 8);
+        a[offset + 16] = (byte)0xEF;
+        return new Object[]{ a };
+    }
+
+    @Test
+    @IR(counts = {IRNode.STORE_B_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "4", // 3 (+ 1 for uncommon trap)
+                  IRNode.STORE_C_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "3",
+                  IRNode.STORE_I_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "2",
+                  IRNode.STORE_L_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "0"},
+        applyIf = {"UseUnalignedAccesses", "true"})
+    static Object[] test4a(byte[] a, int offset, long v1, int v2, short v3, byte v4) {
+        a[offset + 0] = (byte)0x00; // individual store expected to go into state of RC
+        a[offset + 1] = (byte)0xFF;
+        a[offset + 2] = v4;
+        a[offset + 3] = (byte)0x42;
+        a[offset + 4] = (byte)(v1 >> 0);
+        a[offset + 5] = (byte)(v1 >> 8);
+        a[offset + 6] = (byte)0xAB;
+        a[offset + 7] = (byte)0xCD;
+        a[offset + 8] = (byte)0xEF;
+        a[offset + 9] = (byte)0x01;
+        a[offset + 10] = (byte)(v2 >> 0);
+        a[offset + 11] = (byte)(v2 >> 8);
+        a[offset + 12] = (byte)(v2 >> 16);
+        a[offset + 13] = (byte)(v2 >> 24);
+        a[offset + 14] = (byte)(v3 >> 0);
+        a[offset + 15] = (byte)(v3 >> 8);
+        a[offset + 16] = (byte)0xEF;
+        return new Object[]{ a };
+    }
+
+    @DontCompile
+    static Object[] test5R(byte[] a, int offset) {
+        a[offset + 0] = (byte)0x01;
+        a[offset + 1] = (byte)0x02;
+        a[offset + 2] = (byte)0x03;
+        a[offset + 3] = (byte)0x04;
+        a[offset + 4] = (byte)0x11;
+        a[offset + 5] = (byte)0x22;
+        a[offset + 6] = (byte)0x33;
+        a[offset + 7] = (byte)0x44;
+        a[offset + 8] = (byte)0x55;
+        a[offset + 9] = (byte)0x66;
+        a[offset + 10] = (byte)0x77;
+        a[offset + 11] = (byte)0xAA;
+        a[offset + 12] = (byte)0xBB;
+        a[offset + 13] = (byte)0xCC;
+        a[offset + 14] = (byte)0xDD;
+        return new Object[]{ a };
+    }
+
+    @Test
+    @IR(counts = {IRNode.STORE_B_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "1",
+                  IRNode.STORE_C_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "1",
+                  IRNode.STORE_I_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "1",
+                  IRNode.STORE_L_OF_CLASS, "byte\\[int:>=0] \\(java/lang/Cloneable,java/io/Serializable\\)", "1"},
{"UseUnalignedAccesses", "true"}) + static Object[] test5a(byte[] a, int offset) { + a[offset + 0] = (byte)0x01; + a[offset + 1] = (byte)0x02; + a[offset + 2] = (byte)0x03; + a[offset + 3] = (byte)0x04; + a[offset + 4] = (byte)0x11; + a[offset + 5] = (byte)0x22; + a[offset + 6] = (byte)0x33; + a[offset + 7] = (byte)0x44; + a[offset + 8] = (byte)0x55; + a[offset + 9] = (byte)0x66; + a[offset + 10] = (byte)0x77; + a[offset + 11] = (byte)0xAA; + a[offset + 12] = (byte)0xBB; + a[offset + 13] = (byte)0xCC; + a[offset + 14] = (byte)0xDD; + return new Object[]{ a }; + } + + @DontCompile + static Object[] test6R(byte[] a, byte[] b, int offset1, int offset2) { + a[offset1 + 1] = (byte)0x02; + a[offset1 + 3] = (byte)0x04; + b[offset1 + 4] = (byte)0x11; + a[offset1 + 5] = (byte)0x22; + a[offset2 + 6] = (byte)0x33; + a[offset1 + 7] = (byte)0x44; + b[offset1 + 8] = (byte)0x55; + b[offset1 + 10] = (byte)0x66; + return new Object[]{ a, b }; + } + + @Test + @IR(counts = {IRNode.STORE_B_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "8", + IRNode.STORE_C_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_I_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_L_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0"}) + static Object[] test6a(byte[] a, byte[] b, int offset1, int offset2) { + a[offset1 + 1] = (byte)0x02; + a[offset1 + 3] = (byte)0x04; + b[offset1 + 4] = (byte)0x11; + a[offset1 + 5] = (byte)0x22; + a[offset2 + 6] = (byte)0x33; + a[offset1 + 7] = (byte)0x44; + b[offset1 + 8] = (byte)0x55; + b[offset1 + 10] = (byte)0x66; + return new Object[]{ a, b }; + } + + @DontCompile + static Object[] test7R(byte[] a, int offset1, int v1) { + a[offset1 + 1] = (byte)(v1 >> 8); + a[offset1 + 2] = (byte)(v1 >> 16); + a[offset1 + 3] = (byte)(v1 >> 24); + return new Object[]{ a }; + } + + @Test + @IR(counts = {IRNode.STORE_B_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "3", + IRNode.STORE_C_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_I_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_L_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0"}) + static Object[] test7a(byte[] a, int offset1, int v1) { + a[offset1 + 1] = (byte)(v1 >> 8); + a[offset1 + 2] = (byte)(v1 >> 16); + a[offset1 + 3] = (byte)(v1 >> 24); + return new Object[]{ a }; + } + + @DontCompile + static Object[] test100R(short[] a, int offset) { + a[offset + 0] = (short)0x0100; + a[offset + 1] = (short)0x0200; + a[offset + 2] = (short)0x0311; + a[offset + 3] = (short)0x0400; + a[offset + 4] = (short)0x1100; + a[offset + 5] = (short)0x2233; + a[offset + 6] = (short)0x3300; + a[offset + 7] = (short)0x4400; + a[offset + 8] = (short)0x5599; + a[offset + 9] = (short)0x6600; + a[offset + 10] = (short)0x7700; + a[offset + 11] = (short)0xAACC; + a[offset + 12] = (short)0xBB00; + a[offset + 13] = (short)0xCC00; + a[offset + 14] = (short)0xDDFF; + return new Object[]{ a }; + } + + @Test + @IR(counts = {IRNode.STORE_B_OF_CLASS, "short\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_C_OF_CLASS, "short\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "1", + IRNode.STORE_I_OF_CLASS, "short\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "1", + 
IRNode.STORE_L_OF_CLASS, "short\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "3"}, + applyIf = {"UseUnalignedAccesses", "true"}) + static Object[] test100a(short[] a, int offset) { + a[offset + 0] = (short)0x0100; // stays unchanged -> both used for RC and Return path + a[offset + 1] = (short)0x0200; // I + a[offset + 2] = (short)0x0311; // I + a[offset + 3] = (short)0x0400; // L + a[offset + 4] = (short)0x1100; // L + a[offset + 5] = (short)0x2233; // L + a[offset + 6] = (short)0x3300; // L + a[offset + 7] = (short)0x4400; // L + a[offset + 8] = (short)0x5599; // L + a[offset + 9] = (short)0x6600; // L + a[offset + 10] = (short)0x7700; // L + a[offset + 11] = (short)0xAACC; // L + a[offset + 12] = (short)0xBB00; // L + a[offset + 13] = (short)0xCC00; // L + a[offset + 14] = (short)0xDDFF; // L + return new Object[]{ a }; + } + + @DontCompile + static Object[] test101R(short[] a, int offset) { + a[offset + 0] = (short)0x0100; + a[offset + 1] = (short)0x0200; + a[offset + 2] = (short)0x0311; + a[offset + 3] = (short)0x0400; + a[offset + 4] = (short)0x1100; + a[offset + 5] = (short)0x2233; + a[offset + 6] = (short)0x3300; + a[offset + 7] = (short)0x4400; + a[offset + 8] = (short)0x5599; + a[offset + 9] = (short)0x6600; + a[offset + 10] = (short)0x7700; + a[offset + 11] = (short)0xAACC; + a[offset + 12] = (short)0xBB00; + a[offset + 13] = (short)0xCC00; + return new Object[]{ a }; + } + + @Test + @IR(counts = {IRNode.STORE_B_OF_CLASS, "short\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_C_OF_CLASS, "short\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "1", // only for RC + IRNode.STORE_I_OF_CLASS, "short\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "1", + IRNode.STORE_L_OF_CLASS, "short\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "3"}, + applyIf = {"UseUnalignedAccesses", "true"}) + static Object[] test101a(short[] a, int offset) { + a[offset + 0] = (short)0x0100; // I plus kept unchanged for RC + a[offset + 1] = (short)0x0200; // I + a[offset + 2] = (short)0x0311; // L + a[offset + 3] = (short)0x0400; // L + a[offset + 4] = (short)0x1100; // L + a[offset + 5] = (short)0x2233; // L + a[offset + 6] = (short)0x3300; // L + a[offset + 7] = (short)0x4400; // L + a[offset + 8] = (short)0x5599; // L + a[offset + 9] = (short)0x6600; // L + a[offset + 10] = (short)0x7700; // L + a[offset + 11] = (short)0xAACC; // L + a[offset + 12] = (short)0xBB00; // L + a[offset + 13] = (short)0xCC00; // L + return new Object[]{ a }; + } + + @DontCompile + static Object[] test102R(short[] a, int offset, long v1, int v2, short v3) { + a[offset + 0] = (short)0x0000; + a[offset + 1] = (short)0xFFFF; + a[offset + 2] = v3; + a[offset + 3] = (short)0x4242; + a[offset + 4] = (short)(v1 >> 0); + a[offset + 5] = (short)(v1 >> 16); + a[offset + 6] = (short)0xAB11; + a[offset + 7] = (short)0xCD36; + a[offset + 8] = (short)0xEF89; + a[offset + 9] = (short)0x0156; + a[offset + 10] = (short)(v1 >> 0); + a[offset + 11] = (short)(v1 >> 16); + a[offset + 12] = (short)(v1 >> 32); + a[offset + 13] = (short)(v1 >> 48); + a[offset + 14] = (short)(v2 >> 0); + a[offset + 15] = (short)(v2 >> 16); + a[offset + 16] = (short)0xEFEF; + return new Object[]{ a }; + } + + @Test + @IR(counts = {IRNode.STORE_B_OF_CLASS, "short\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_C_OF_CLASS, "short\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "4", // 3 (+1 that goes into RC) 
+ IRNode.STORE_I_OF_CLASS, "short\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "3", + IRNode.STORE_L_OF_CLASS, "short\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "2"}, + applyIf = {"UseUnalignedAccesses", "true"}) + static Object[] test102a(short[] a, int offset, long v1, int v2, short v3) { + a[offset + 0] = (short)0x0000; // store goes into RC + a[offset + 1] = (short)0xFFFF; + a[offset + 2] = v3; + a[offset + 3] = (short)0x4242; + a[offset + 4] = (short)(v1 >> 0); + a[offset + 5] = (short)(v1 >> 16); + a[offset + 6] = (short)0xAB11; + a[offset + 7] = (short)0xCD36; + a[offset + 8] = (short)0xEF89; + a[offset + 9] = (short)0x0156; + a[offset + 10] = (short)(v1 >> 0); + a[offset + 11] = (short)(v1 >> 16); + a[offset + 12] = (short)(v1 >> 32); + a[offset + 13] = (short)(v1 >> 48); + a[offset + 14] = (short)(v2 >> 0); + a[offset + 15] = (short)(v2 >> 16); + a[offset + 16] = (short)0xEFEF; + return new Object[]{ a }; + } + + @DontCompile + static Object[] test200R(int[] a, int offset) { + a[offset + 0] = 0x01001236; + a[offset + 1] = 0x02001284; + a[offset + 2] = 0x03111235; + a[offset + 3] = 0x04001294; + a[offset + 4] = 0x11001234; + a[offset + 5] = 0x22331332; + a[offset + 6] = 0x33001234; + a[offset + 7] = 0x44001432; + a[offset + 8] = 0x55991234; + a[offset + 9] = 0x66001233; + a[offset + 10] = 0x77001434; + a[offset + 11] = 0xAACC1234; + a[offset + 12] = 0xBB001434; + a[offset + 13] = 0xCC001236; + a[offset + 14] = 0xDDFF1534; + return new Object[]{ a }; + } + + @Test + @IR(counts = {IRNode.STORE_B_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_C_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_I_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "1", + IRNode.STORE_L_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "7"}, + applyIf = {"UseUnalignedAccesses", "true"}) + static Object[] test200a(int[] a, int offset) { + a[offset + 0] = 0x01001236; // stays unchanged -> both used for RC and Return path + a[offset + 1] = 0x02001284; // L + a[offset + 2] = 0x03111235; // L + a[offset + 3] = 0x04001294; // L + a[offset + 4] = 0x11001234; // L + a[offset + 5] = 0x22331332; // L + a[offset + 6] = 0x33001234; // L + a[offset + 7] = 0x44001432; // L + a[offset + 8] = 0x55991234; // L + a[offset + 9] = 0x66001233; // L + a[offset + 10] = 0x77001434; // L + a[offset + 11] = 0xAACC1234; // L + a[offset + 12] = 0xBB001434; // L + a[offset + 13] = 0xCC001236; // L + a[offset + 14] = 0xDDFF1534; // L + return new Object[]{ a }; + } + + @DontCompile + static Object[] test201R(int[] a, int offset) { + a[offset + 0] = 0x01001236; + a[offset + 1] = 0x02001284; + a[offset + 2] = 0x03111235; + a[offset + 3] = 0x04001294; + a[offset + 4] = 0x11001234; + a[offset + 5] = 0x22331332; + a[offset + 6] = 0x33001234; + a[offset + 7] = 0x44001432; + a[offset + 8] = 0x55991234; + a[offset + 9] = 0x66001233; + a[offset + 10] = 0x77001434; + a[offset + 11] = 0xAACC1234; + a[offset + 12] = 0xBB001434; + a[offset + 13] = 0xCC001236; + return new Object[]{ a }; + } + + @Test + @IR(counts = {IRNode.STORE_B_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_C_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_I_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "1", // only for RC + 
IRNode.STORE_L_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "7"}, + applyIf = {"UseUnalignedAccesses", "true"}) + static Object[] test201a(int[] a, int offset) { + a[offset + 0] = 0x01001236; // L and also kept unchanged for RC + a[offset + 1] = 0x02001284; // L + a[offset + 2] = 0x03111235; // L + a[offset + 3] = 0x04001294; // L + a[offset + 4] = 0x11001234; // L + a[offset + 5] = 0x22331332; // L + a[offset + 6] = 0x33001234; // L + a[offset + 7] = 0x44001432; // L + a[offset + 8] = 0x55991234; // L + a[offset + 9] = 0x66001233; // L + a[offset + 10] = 0x77001434; // L + a[offset + 11] = 0xAACC1234; // L + a[offset + 12] = 0xBB001434; // L + a[offset + 13] = 0xCC001236; // L + return new Object[]{ a }; + } + + @DontCompile + static Object[] test202R(int[] a, int offset, long v1, int v2) { + a[offset + 0] = 0x00000000; + a[offset + 1] = 0xFFFFFFFF; + a[offset + 2] = v2; + a[offset + 3] = 0x42424242; + a[offset + 4] = (int)(v1 >> 0); + a[offset + 5] = (int)(v1 >> 32); + a[offset + 6] = 0xAB110129; + a[offset + 7] = 0xCD360183; + a[offset + 8] = 0xEF890173; + a[offset + 9] = 0x01560124; + a[offset + 10] = (int)(v1 >> 0); + a[offset + 11] = (int)(v1 >> 32); + a[offset + 12] = (int)(v1 >> 0); + a[offset + 13] = (int)(v1 >> 32); + a[offset + 14] = v2; + a[offset + 15] = v2; + a[offset + 16] = 0xEFEFEFEF; + return new Object[]{ a }; + } + + @Test + @IR(counts = {IRNode.STORE_B_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_C_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_I_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "6", // 5 (+1 that goes into RC) + IRNode.STORE_L_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "6"}, + applyIf = {"UseUnalignedAccesses", "true"}) + static Object[] test202a(int[] a, int offset, long v1, int v2) { + a[offset + 0] = 0x00000000; // merged with store below, but also kept unchanged for RC + a[offset + 1] = 0xFFFFFFFF; + a[offset + 2] = v2; + a[offset + 3] = 0x42424242; + a[offset + 4] = (int)(v1 >> 0); + a[offset + 5] = (int)(v1 >> 32); + a[offset + 6] = 0xAB110129; + a[offset + 7] = 0xCD360183; + a[offset + 8] = 0xEF890173; + a[offset + 9] = 0x01560124; + a[offset + 10] = (int)(v1 >> 0); + a[offset + 11] = (int)(v1 >> 32); + a[offset + 12] = (int)(v1 >> 0); + a[offset + 13] = (int)(v1 >> 32); + a[offset + 14] = v2; + a[offset + 15] = v2; + a[offset + 16] = 0xEFEFEFEF; + return new Object[]{ a }; + } + + @DontCompile + static Object[] test300R(int[] a) { + a[2] = 42; + a[3] = 42; + a[4] = 42; + a[5] = 42; + int x = a[3]; // dependent load + return new Object[]{ a, new int[]{ x } }; + } + + @Test + @IR(counts = {IRNode.STORE_L_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "2"}, + applyIf = {"UseUnalignedAccesses", "true"}) + static Object[] test300a(int[] a) { + a[2] = 42; + a[3] = 42; + a[4] = 42; + a[5] = 42; + int x = a[3]; // dependent load + return new Object[]{ a, new int[]{ x } }; + } + + @DontCompile + static Object[] test400R(int[] a) { + UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 0, (byte)0xbe); + UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 1, (byte)0xba); + UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 2, (byte)0xad); + UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 3, (byte)0xba); + UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 4, (byte)0xef); + UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET 
+ 5, (byte)0xbe);
+        UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 6, (byte)0xad);
+        UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 7, (byte)0xde);
+        return new Object[]{ a };
+    }
+
+    @Test
+    // We must be careful with mismatched accesses on arrays:
+    // An int-array can be about 4x max_int in byte size, and hence if we address individual bytes in it, the int offsets can overflow.
+    // We might consider addresses (x + 0) and (x + 1) as adjacent, even if x = max_int, and therefore the second
+    // address overflows and is not adjacent at all.
+    // Therefore, we should only consider stores that have the same size as the element type of the array.
+    @IR(counts = {IRNode.STORE_B_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "8", // no merging
+                  IRNode.STORE_C_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0",
+                  IRNode.STORE_I_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0",
+                  IRNode.STORE_L_OF_CLASS, "int\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0"})
+    static Object[] test400a(int[] a) {
+        UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 0, (byte)0xbe);
+        UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 1, (byte)0xba);
+        UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 2, (byte)0xad);
+        UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 3, (byte)0xba);
+        UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 4, (byte)0xef);
+        UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 5, (byte)0xbe);
+        UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 6, (byte)0xad);
+        UNSAFE.putByte(a, UNSAFE.ARRAY_INT_BASE_OFFSET + 7, (byte)0xde);
+        return new Object[]{ a };
+    }
+
+    @DontCompile
+    // The 500-series tests all have the same code, but are executed with different inputs:
+    // 500a: never violates a RangeCheck -> expect stores to always be merged
+    // 501a: randomly violates RangeChecks, also during warmup -> stores are never merged
+    // 502a: never violates a RangeCheck during warmup -> compiled once with merged stores,
+    //       but then violates RangeChecks after warmup -> recompiled without merged stores
+    static Object[] test500R(byte[] a, int offset, long v) {
+        int idx = 0;
+        try {
+            a[offset + 0] = (byte)(v >> 0);
+            idx = 1;
+            a[offset + 1] = (byte)(v >> 8);
+            idx = 2;
+            a[offset + 2] = (byte)(v >> 16);
+            idx = 3;
+            a[offset + 3] = (byte)(v >> 24);
+            idx = 4;
+            a[offset + 4] = (byte)(v >> 32);
+            idx = 5;
+            a[offset + 5] = (byte)(v >> 40);
+            idx = 6;
+            a[offset + 6] = (byte)(v >> 48);
+            idx = 7;
+            a[offset + 7] = (byte)(v >> 56);
+            idx = 8;
+        } catch (ArrayIndexOutOfBoundsException _) {}
+        return new Object[]{ a, new int[]{ idx } };
+    }
+
+    @Test
+    @IR(counts = {IRNode.STORE_B_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "1", // for RangeCheck trap
+                  IRNode.STORE_C_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0",
+                  IRNode.STORE_I_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0",
+                  IRNode.STORE_L_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "1"}, // expect merged
+        applyIf = {"UseUnalignedAccesses", "true"})
+    static Object[] test500a(byte[] a, int offset, long v) {
+        int idx = 0;
+        try {
+            a[offset + 0] = (byte)(v >> 0);
+            idx = 1;
+            a[offset + 1] = (byte)(v >> 8);
+            idx = 2;
+            a[offset + 2] = (byte)(v >> 16);
+            idx = 3;
+            a[offset + 3] = (byte)(v >> 24);
+            idx = 4;
+            a[offset + 4] = (byte)(v >> 32);
+            idx = 5;
+            a[offset + 5] = (byte)(v >> 40);
+            idx = 6;
+            a[offset + 6] = 
(byte)(v >> 48); + idx = 7; + a[offset + 7] = (byte)(v >> 56); + idx = 8; + } catch (ArrayIndexOutOfBoundsException _) {} + return new Object[]{ a, new int[]{ idx } }; + } + + @Test + @IR(counts = {IRNode.STORE_B_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "8", // No optimization because of too many RangeChecks + IRNode.STORE_C_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_I_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_L_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0"}) + static Object[] test501a(byte[] a, int offset, long v) { + int idx = 0; + try { + a[offset + 0] = (byte)(v >> 0); + idx = 1; + a[offset + 1] = (byte)(v >> 8); + idx = 2; + a[offset + 2] = (byte)(v >> 16); + idx = 3; + a[offset + 3] = (byte)(v >> 24); + idx = 4; + a[offset + 4] = (byte)(v >> 32); + idx = 5; + a[offset + 5] = (byte)(v >> 40); + idx = 6; + a[offset + 6] = (byte)(v >> 48); + idx = 7; + a[offset + 7] = (byte)(v >> 56); + idx = 8; + } catch (ArrayIndexOutOfBoundsException _) {} + return new Object[]{ a, new int[]{ idx } }; + } + + @Test + @IR(counts = {IRNode.STORE_B_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "8", // No optimization because of too many RangeChecks + IRNode.STORE_C_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_I_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0", + IRNode.STORE_L_OF_CLASS, "byte\\\\[int:>=0] \\\\(java/lang/Cloneable,java/io/Serializable\\\\)", "0"}) + static Object[] test502a(byte[] a, int offset, long v) { + int idx = 0; + try { + a[offset + 0] = (byte)(v >> 0); + idx = 1; + a[offset + 1] = (byte)(v >> 8); + idx = 2; + a[offset + 2] = (byte)(v >> 16); + idx = 3; + a[offset + 3] = (byte)(v >> 24); + idx = 4; + a[offset + 4] = (byte)(v >> 32); + idx = 5; + a[offset + 5] = (byte)(v >> 40); + idx = 6; + a[offset + 6] = (byte)(v >> 48); + idx = 7; + a[offset + 7] = (byte)(v >> 56); + idx = 8; + } catch (ArrayIndexOutOfBoundsException _) {} + return new Object[]{ a, new int[]{ idx } }; + } +} diff --git a/test/micro/org/openjdk/bench/vm/compiler/MergeStores.java b/test/micro/org/openjdk/bench/vm/compiler/MergeStores.java new file mode 100644 index 0000000000000..e1f0d5eaedcdf --- /dev/null +++ b/test/micro/org/openjdk/bench/vm/compiler/MergeStores.java @@ -0,0 +1,696 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package org.openjdk.bench.vm.compiler;
+
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+
+
+import jdk.internal.misc.Unsafe;
+import jdk.internal.util.ByteArrayLittleEndian;
+import java.util.concurrent.TimeUnit;
+
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Warmup(iterations = 3, time = 3)
+@Measurement(iterations = 3, time = 3)
+@Fork(value = 3, jvmArgsAppend = {
+    "--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED",
+    "--add-exports", "java.base/jdk.internal.util=ALL-UNNAMED"})
+@State(Scope.Benchmark)
+public class MergeStores {
+
+    public static final int RANGE = 100;
+
+    static Unsafe UNSAFE = Unsafe.getUnsafe();
+
+    @Param("1")
+    public static short vS;
+
+    @Param("1")
+    public static int vI;
+
+    @Param("1")
+    public static long vL;
+
+    public static int offset = 5;
+    public static byte[] aB = new byte[RANGE];
+    public static short[] aS = new short[RANGE];
+    public static int[] aI = new int[RANGE];
+
+    // -------------------------------------------
+    // ------- Little-Endian API ----------
+    // -------------------------------------------
+
+    // Store a short LE into an array by storing individual bytes
+    static void storeShortLE(byte[] bytes, int offset, short value) {
+        storeBytes(bytes, offset, (byte)(value >> 0),
+                                  (byte)(value >> 8));
+    }
+
+    // Store an int LE into an array by storing individual bytes
+    static void storeIntLE(byte[] bytes, int offset, int value) {
+        storeBytes(bytes, offset, (byte)(value >> 0 ),
+                                  (byte)(value >> 8 ),
+                                  (byte)(value >> 16),
+                                  (byte)(value >> 24));
+    }
+
+    // Store a long LE into an array by storing individual bytes
+    static void storeLongLE(byte[] bytes, int offset, long value) {
+        storeBytes(bytes, offset, (byte)(value >> 0 ),
+                                  (byte)(value >> 8 ),
+                                  (byte)(value >> 16),
+                                  (byte)(value >> 24),
+                                  (byte)(value >> 32),
+                                  (byte)(value >> 40),
+                                  (byte)(value >> 48),
+                                  (byte)(value >> 56));
+    }
+
+    // Store 2 bytes into an array
+    static void storeBytes(byte[] bytes, int offset, byte b0, byte b1) {
+        bytes[offset + 0] = b0;
+        bytes[offset + 1] = b1;
+    }
+
+    // Store 4 bytes into an array
+    static void storeBytes(byte[] bytes, int offset, byte b0, byte b1, byte b2, byte b3) {
+        bytes[offset + 0] = b0;
+        bytes[offset + 1] = b1;
+        bytes[offset + 2] = b2;
+        bytes[offset + 3] = b3;
+    }
+
+    // Store 8 bytes into an array
+    static void storeBytes(byte[] bytes, int offset, byte b0, byte b1, byte b2, byte b3,
+                           byte b4, byte b5, byte b6, byte b7) {
+        bytes[offset + 0] = b0;
+        bytes[offset + 1] = b1;
+        bytes[offset + 2] = b2;
+        bytes[offset + 3] = b3;
+        bytes[offset + 4] = b4;
+        bytes[offset + 5] = b5;
+        bytes[offset + 6] = b6;
+        bytes[offset + 7] = b7;
+    }
+
+    // -------------------------------- BENCHMARKS 
-------------------------------- + + @Benchmark + public void baseline() { + } + + @Benchmark + public byte[] baseline_allocate() { + byte[] aB = new byte[RANGE]; + return aB; + } + + @Benchmark + public byte[] store_B2_con_adr0_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[0] = (byte)0x01; + aB[1] = (byte)0x02; + return aB; + } + + @Benchmark + public byte[] store_B2_con_adr1_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[1] = (byte)0x01; + aB[2] = (byte)0x02; + return aB; + } + + @Benchmark + public byte[] store_B2_con_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)0x01; + aB[offset + 1] = (byte)0x02; + return aB; + } + + @Benchmark + public byte[] store_B2_con_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putShortUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, (short)0x0201); + return aB; + } + + @Benchmark + public byte[] store_B2_con_offs_allocate_bale() { + byte[] aB = new byte[RANGE]; + ByteArrayLittleEndian.setShort(aB, offset, (short)0x0201); + return aB; + } + + @Benchmark + public byte[] store_B2_con_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeShortLE(aB, offset, (short)0x0201); + return aB; + } + + @Benchmark + public byte[] store_B2_con_offs_nonalloc_direct() { + aB[offset + 0] = (byte)0x01; + aB[offset + 1] = (byte)0x02; + return aB; + } + + @Benchmark + public byte[] store_B2_con_offs_nonalloc_unsafe() { + UNSAFE.putShortUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, (short)0x0201); + return aB; + } + + @Benchmark + public byte[] store_B2_con_offs_nonalloc_bale() { + ByteArrayLittleEndian.setShort(aB, offset, (short)0x0201); + return aB; + } + + @Benchmark + public byte[] store_B2_con_offs_nonalloc_leapi() { + storeShortLE(aB, offset, (short)0x0201); + return aB; + } + + @Benchmark + public byte[] store_B2_S_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)(vS >> 0 ); + aB[offset + 1] = (byte)(vS >> 8 ); + return aB; + } + + @Benchmark + public byte[] store_B2_S_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putShortUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vS); + return aB; + } + + @Benchmark + public byte[] store_B2_S_offs_allocate_bale() { + byte[] aB = new byte[RANGE]; + ByteArrayLittleEndian.setShort(aB, offset, vS); + return aB; + } + + @Benchmark + public byte[] store_B2_S_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeShortLE(aB, offset, vS); + return aB; + } + + @Benchmark + public byte[] store_B2_S_offs_nonalloc_direct() { + aB[offset + 0] = (byte)(vS >> 0 ); + aB[offset + 1] = (byte)(vS >> 8 ); + return aB; + } + + @Benchmark + public byte[] store_B2_S_offs_nonalloc_unsafe() { + UNSAFE.putShortUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vS); + return aB; + } + + @Benchmark + public byte[] store_B2_S_offs_nonalloc_bale() { + ByteArrayLittleEndian.setShort(aB, offset, vS); + return aB; + } + + @Benchmark + public byte[] store_B2_S_offs_nonalloc_leapi() { + storeShortLE(aB, offset, vS); + return aB; + } + + @Benchmark + public byte[] store_B4_con_adr0_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[0] = (byte)0x01; + aB[1] = (byte)0x02; + aB[2] = (byte)0x03; + aB[3] = (byte)0x04; + return aB; + } + + @Benchmark + public byte[] store_B4_con_adr1_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[1] = (byte)0x01; + aB[2] = (byte)0x02; + aB[3] = (byte)0x03; + aB[4] = (byte)0x04; + return aB; + } + + @Benchmark + public byte[] store_B4_con_offs_allocate_direct() { + 
byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)0x01; + aB[offset + 1] = (byte)0x02; + aB[offset + 2] = (byte)0x03; + aB[offset + 3] = (byte)0x04; + return aB; + } + + @Benchmark + public byte[] store_B4_con_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, 0x04030201); + return aB; + } + + @Benchmark + public byte[] store_B4_con_offs_allocate_bale() { + byte[] aB = new byte[RANGE]; + ByteArrayLittleEndian.setInt(aB, offset, 0x04030201); + return aB; + } + + @Benchmark + public byte[] store_B4_con_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeIntLE(aB, offset, 0x04030201); + return aB; + } + + @Benchmark + public byte[] store_B4_con_offs_nonalloc_direct() { + aB[offset + 0] = (byte)0x01; + aB[offset + 1] = (byte)0x02; + aB[offset + 2] = (byte)0x03; + aB[offset + 3] = (byte)0x04; + return aB; + } + + @Benchmark + public byte[] store_B4_con_offs_nonalloc_unsafe() { + UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, 0x04030201); + return aB; + } + + @Benchmark + public byte[] store_B4_con_offs_nonalloc_bale() { + ByteArrayLittleEndian.setInt(aB, offset, 0x04030201); + return aB; + } + + @Benchmark + public byte[] store_B4_con_offs_nonalloc_leapi() { + storeIntLE(aB, offset, 0x04030201); + return aB; + } + + @Benchmark + public byte[] store_B4_I_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)(vI >> 0 ); + aB[offset + 1] = (byte)(vI >> 8 ); + aB[offset + 2] = (byte)(vI >> 16); + aB[offset + 3] = (byte)(vI >> 24); + return aB; + } + + @Benchmark + public byte[] store_B4_I_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vI); + return aB; + } + + @Benchmark + public byte[] store_B4_I_offs_allocate_bale() { + byte[] aB = new byte[RANGE]; + ByteArrayLittleEndian.setInt(aB, offset, vI); + return aB; + } + + @Benchmark + public byte[] store_B4_I_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeIntLE(aB, offset, vI); + return aB; + } + + @Benchmark + public byte[] store_B4_I_offs_nonalloc_direct() { + aB[offset + 0] = (byte)(vI >> 0 ); + aB[offset + 1] = (byte)(vI >> 8 ); + aB[offset + 2] = (byte)(vI >> 16); + aB[offset + 3] = (byte)(vI >> 24); + return aB; + } + + @Benchmark + public byte[] store_B4_I_offs_nonalloc_unsafe() { + UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vI); + return aB; + } + + @Benchmark + public byte[] store_B4_I_offs_nonalloc_bale() { + ByteArrayLittleEndian.setInt(aB, offset, vI); + return aB; + } + + @Benchmark + public byte[] store_B4_I_offs_nonalloc_leapi() { + storeIntLE(aB, offset, vI); + return aB; + } + + @Benchmark + public byte[] store_B8_con_adr0_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[0] = (byte)0x01; + aB[1] = (byte)0x02; + aB[2] = (byte)0x03; + aB[3] = (byte)0x04; + aB[4] = (byte)0x05; + aB[5] = (byte)0x06; + aB[6] = (byte)0x07; + aB[7] = (byte)0x08; + return aB; + } + + @Benchmark + public byte[] store_B8_con_adr1_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[1] = (byte)0x01; + aB[2] = (byte)0x02; + aB[3] = (byte)0x03; + aB[4] = (byte)0x04; + aB[5] = (byte)0x05; + aB[6] = (byte)0x06; + aB[7] = (byte)0x07; + aB[8] = (byte)0x08; + return aB; + } + + @Benchmark + public byte[] store_B8_con_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)0x01; + aB[offset + 1] = (byte)0x02; + aB[offset + 2] = (byte)0x03; + aB[offset + 3] = (byte)0x04; + aB[offset + 4] = 
(byte)0x05; + aB[offset + 5] = (byte)0x06; + aB[offset + 6] = (byte)0x07; + aB[offset + 7] = (byte)0x08; + return aB; + } + + @Benchmark + public byte[] store_B8_con_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putLongUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, 0x0807060504030201L); + return aB; + } + + @Benchmark + public byte[] store_B8_con_offs_allocate_bale() { + byte[] aB = new byte[RANGE]; + ByteArrayLittleEndian.setLong(aB, offset, 0x0807060504030201L); + return aB; + } + + @Benchmark + public byte[] store_B8_con_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeLongLE(aB, offset, 0x0807060504030201L); + return aB; + } + + @Benchmark + public byte[] store_B8_con_offs_nonalloc_direct() { + aB[offset + 0] = (byte)0x01; + aB[offset + 1] = (byte)0x02; + aB[offset + 2] = (byte)0x03; + aB[offset + 3] = (byte)0x04; + aB[offset + 4] = (byte)0x05; + aB[offset + 5] = (byte)0x06; + aB[offset + 6] = (byte)0x07; + aB[offset + 7] = (byte)0x08; + return aB; + } + + @Benchmark + public byte[] store_B8_con_offs_nonalloc_unsafe() { + UNSAFE.putLongUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, 0x0807060504030201L); + return aB; + } + + @Benchmark + public byte[] store_B8_con_offs_nonalloc_bale() { + ByteArrayLittleEndian.setLong(aB, offset, 0x0807060504030201L); + return aB; + } + + @Benchmark + public byte[] store_B8_con_offs_nonalloc_leapi() { + storeLongLE(aB, offset, 0x0807060504030201L); + return aB; + } + + @Benchmark + public byte[] store_B8_L_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)(vL >> 0 ); + aB[offset + 1] = (byte)(vL >> 8 ); + aB[offset + 2] = (byte)(vL >> 16); + aB[offset + 3] = (byte)(vL >> 24); + aB[offset + 4] = (byte)(vL >> 32); + aB[offset + 5] = (byte)(vL >> 40); + aB[offset + 6] = (byte)(vL >> 48); + aB[offset + 7] = (byte)(vL >> 56); + return aB; + } + + @Benchmark + public byte[] store_B8_L_offs_allocate_unsafe() { + byte[] aB = new byte[RANGE]; + UNSAFE.putLongUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vL); + return aB; + } + + @Benchmark + public byte[] store_B8_L_offs_allocate_bale() { + byte[] aB = new byte[RANGE]; + ByteArrayLittleEndian.setLong(aB, offset, vL); + return aB; + } + + @Benchmark + public byte[] store_B8_L_offs_allocate_leapi() { + byte[] aB = new byte[RANGE]; + storeLongLE(aB, offset, vL); + return aB; + } + + @Benchmark + public byte[] store_B8_L_offs_nonalloc_direct() { + aB[offset + 0] = (byte)(vL >> 0 ); + aB[offset + 1] = (byte)(vL >> 8 ); + aB[offset + 2] = (byte)(vL >> 16); + aB[offset + 3] = (byte)(vL >> 24); + aB[offset + 4] = (byte)(vL >> 32); + aB[offset + 5] = (byte)(vL >> 40); + aB[offset + 6] = (byte)(vL >> 48); + aB[offset + 7] = (byte)(vL >> 56); + return aB; + } + + @Benchmark + public byte[] store_B8_L_offs_nonalloc_unsafe() { + UNSAFE.putLongUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset, vL); + return aB; + } + + @Benchmark + public byte[] store_B8_L_offs_nonalloc_bale() { + ByteArrayLittleEndian.setLong(aB, offset, vL); + return aB; + } + + @Benchmark + public byte[] store_B8_L_offs_nonalloc_leapi() { + storeLongLE(aB, offset, vL); + return aB; + } + + @Benchmark + public byte[] store_B8_I2_offs_allocate_direct() { + byte[] aB = new byte[RANGE]; + aB[offset + 0] = (byte)(vI >> 0 ); + aB[offset + 1] = (byte)(vI >> 8 ); + aB[offset + 2] = (byte)(vI >> 16); + aB[offset + 3] = (byte)(vI >> 24); + aB[offset + 4] = (byte)(vI >> 0 ); + aB[offset + 5] = (byte)(vI >> 8 ); + aB[offset + 6] = (byte)(vI >> 16); + aB[offset + 7] = (byte)(vI >> 
24);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_I2_offs_allocate_unsafe() {
+        byte[] aB = new byte[RANGE];
+        UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 0, vI);
+        UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 4, vI);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_I2_offs_allocate_bale() {
+        byte[] aB = new byte[RANGE];
+        ByteArrayLittleEndian.setInt(aB, offset + 0, vI);
+        ByteArrayLittleEndian.setInt(aB, offset + 4, vI);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_I2_offs_allocate_leapi() {
+        byte[] aB = new byte[RANGE];
+        storeIntLE(aB, offset + 0, vI);
+        storeIntLE(aB, offset + 4, vI);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_I2_offs_nonalloc_direct() {
+        aB[offset + 0] = (byte)(vI >> 0 );
+        aB[offset + 1] = (byte)(vI >> 8 );
+        aB[offset + 2] = (byte)(vI >> 16);
+        aB[offset + 3] = (byte)(vI >> 24);
+        aB[offset + 4] = (byte)(vI >> 0 );
+        aB[offset + 5] = (byte)(vI >> 8 );
+        aB[offset + 6] = (byte)(vI >> 16);
+        aB[offset + 7] = (byte)(vI >> 24);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_I2_offs_nonalloc_unsafe() {
+        UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 0, vI);
+        UNSAFE.putIntUnaligned(aB, Unsafe.ARRAY_BYTE_BASE_OFFSET + offset + 4, vI);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_I2_offs_nonalloc_bale() {
+        ByteArrayLittleEndian.setInt(aB, offset + 0, vI);
+        ByteArrayLittleEndian.setInt(aB, offset + 4, vI);
+        return aB;
+    }
+
+    @Benchmark
+    public byte[] store_B8_I2_offs_nonalloc_leapi() {
+        storeIntLE(aB, offset + 0, vI);
+        storeIntLE(aB, offset + 4, vI);
+        return aB;
+    }
+
+    @Benchmark
+    public short[] store_S2_con_offs_allocate_direct() {
+        short[] aS = new short[RANGE];
+        aS[offset + 0] = (short)0x0102;
+        aS[offset + 1] = (short)0x0304;
+        return aS;
+    }
+
+    @Benchmark
+    public short[] store_S2_con_offs_nonalloc_direct() {
+        aS[offset + 0] = (short)0x0102;
+        aS[offset + 1] = (short)0x0304;
+        return aS;
+    }
+
+    @Benchmark
+    public short[] store_S4_con_offs_allocate_direct() {
+        short[] aS = new short[RANGE];
+        aS[offset + 0] = (short)0x0102;
+        aS[offset + 1] = (short)0x0304;
+        aS[offset + 2] = (short)0x0506;
+        aS[offset + 3] = (short)0x0708;
+        return aS;
+    }
+
+    @Benchmark
+    public short[] store_S4_con_offs_nonalloc_direct() {
+        aS[offset + 0] = (short)0x0102;
+        aS[offset + 1] = (short)0x0304;
+        aS[offset + 2] = (short)0x0506;
+        aS[offset + 3] = (short)0x0708;
+        return aS;
+    }
+
+    @Benchmark
+    public int[] store_I2_con_offs_allocate_direct() {
+        int[] aI = new int[RANGE];
+        aI[offset + 0] = 0x01020304;
+        aI[offset + 1] = 0x05060708;
+        return aI;
+    }
+
+    @Benchmark
+    public int[] store_I2_con_offs_nonalloc_direct() {
+        aI[offset + 0] = 0x01020304;
+        aI[offset + 1] = 0x05060708;
+        return aI;
+    }
+
+    @Benchmark
+    public int[] store_I2_zero_offs_allocate_direct() {
+        int[] aI = new int[RANGE];
+        aI[offset + 0] = 0;
+        aI[offset + 1] = 0;
+        return aI;
+    }
+
+    @Benchmark
+    public int[] store_I2_zero_offs_nonalloc_direct() {
+        aI[offset + 0] = 0;
+        aI[offset + 1] = 0;
+        return aI;
+    }
+}
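Note on running the new microbenchmark: one plausible invocation from a configured JDK checkout uses the micro test selector described in doc/testing.md; the exact filter string below is an assumption derived from the benchmark's package and class name:

    make test TEST="micro:vm.compiler.MergeStores"

The --add-exports flags required for jdk.internal.misc and jdk.internal.util are already appended via jvmArgsAppend in the benchmark's @Fork annotation, so no additional JVM arguments should be needed when running through the JMH harness.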