Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
b142e4f
8298935: Superword: handle cyclic dependencies with offset larger than 1
eme64 Jan 31, 2023
5e01c9f
refactored verification code
eme64 Feb 2, 2023
2def5f7
Add IR test for compile option Vectorize
eme64 Feb 3, 2023
744dc23
Extended TestForEachRem.java with distance 2 case that fails on master
eme64 Feb 3, 2023
c668b5f
remove useless comment
eme64 Feb 6, 2023
830a05a
cyclic dependency test is now an IR test
eme64 Feb 6, 2023
b393693
Replace old verification code, and remove first fix
eme64 Feb 6, 2023
ddb9dc5
Filter out dependent packs if we have _do_vector_loop
eme64 Feb 6, 2023
d1f19ba
small improvement for printing
eme64 Feb 6, 2023
b633a3a
Implement fix
eme64 Feb 7, 2023
bf4befa
refactoring to improve readability
eme64 Feb 7, 2023
d170f54
remove duplicate verification
eme64 Feb 7, 2023
32df5de
refactoring again, better comments, _do_vector_loop only if vector al…
eme64 Feb 8, 2023
3fa535b
require avx and sve
eme64 Feb 8, 2023
0a834cd
apply a IR verification only for cpu feature
eme64 Feb 8, 2023
a7e88e1
typos
eme64 Feb 17, 2023
b3d4d29
improve IR annotation for TestCyclicDependency.java, allow running it…
eme64 Feb 17, 2023
f02433b
Improved IR test TestOptionVectorizeIR.java
eme64 Feb 17, 2023
113b9ff
Merge branch 'master' into JDK-8298935
eme64 Feb 17, 2023
796eb3f
fixed a few IR cpuFeature requirements
eme64 Feb 17, 2023
4ebfa80
fixed another IR cpuFeature issue for aarch64 asimd vs sve
eme64 Feb 17, 2023
befb017
Merge branch 'master' into JDK-8298935
eme64 Feb 20, 2023
08cccf4
Regression test for Test.java byte case that crashed on arm
eme64 Feb 21, 2023
8dce09b
Merge branch 'master' into JDK-8298935
eme64 Feb 23, 2023
7033a87
Version 1 of script-generated offset dependency test
eme64 Feb 28, 2023
5955c29
v2 TestDependencyOffsets.java based on MaxVectorSize not SuperWordMax…
eme64 Feb 28, 2023
7462cd1
Fix TestCyclicDependency.java for aarch64 machines with AlignVector =…
eme64 Feb 28, 2023
8349906
Fix TestOptionVectorizeIR.java for aarch64 machines with AlignVector …
eme64 Feb 28, 2023
0cb67e5
TestDependencyOffsets.java: MulVL not supported on NEON / asimd. Repl…
eme64 Mar 1, 2023
366bc31
removed negative rules for TestCyclicDependency.java
eme64 Mar 2, 2023
9b8738a
remove negative IR rules for TestOptionVectorizeIR.java
eme64 Mar 2, 2023
645ed50
Reworked TestDependencyOffsets.java
eme64 Mar 2, 2023
199fcf0
Merge branch 'master' into JDK-8298935
eme64 Mar 2, 2023
fb7f6dd
TestOptionVectorizeIR.java: removed PopulateIndex IR rule - fails on …
eme64 Mar 3, 2023
9e3d480
TestDependencyOffsets.java: parallelize it + various AVX settings
eme64 Mar 8, 2023
a44082b
TestDependencyOffsets.java: add vanilla run
eme64 Mar 8, 2023
0f7e39c
resolve merge conflict after Roland's fix
eme64 Mar 9, 2023
216bb1a
A little renaming and improved comments
eme64 Mar 9, 2023
ef61acd
Fixed wording from last commit
eme64 Mar 9, 2023
731cc7b
Merge master after NULL -> nullptr conversion
eme64 Mar 9, 2023
ff0850e
merge master: resolved conflict in test/hotspot/jtreg/compiler/lib/ir…
eme64 Mar 13, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
257 changes: 202 additions & 55 deletions src/hotspot/share/opto/superword.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,8 @@ bool SuperWord::SLP_extract() {

filter_packs();

DEBUG_ONLY(verify_packs();)

schedule();

// Record eventual count of vector packs for checks in post loop vectorization
Expand Down Expand Up @@ -640,54 +642,10 @@ void SuperWord::find_adjacent_refs() {
}
}

// Create initial pack pairs of memory operations for which
// alignment is set and vectors will be aligned.
bool create_pack = true;
if (memory_alignment(mem_ref, best_iv_adjustment) == 0 || _do_vector_loop) {
if (vectors_should_be_aligned()) {
int vw = vector_width(mem_ref);
int vw_best = vector_width(best_align_to_mem_ref);
if (vw > vw_best) {
// Do not vectorize a memory access with more elements per vector
// if unaligned memory access is not allowed because number of
// iterations in pre-loop will be not enough to align it.
create_pack = false;
} else {
SWPointer p2(best_align_to_mem_ref, this, nullptr, false);
if (!align_to_ref_p.invar_equals(p2)) {
// Do not vectorize memory accesses with different invariants
// if unaligned memory accesses are not allowed.
create_pack = false;
}
}
}
} else {
if (same_memory_slice(best_align_to_mem_ref, mem_ref)) {
// Can't allow vectorization of unaligned memory accesses with the
// same memory slice since it could be overlapped accesses to the same array.
create_pack = false;
} else {
// Allow independent (different type) unaligned memory operations
// if HW supports them.
if (vectors_should_be_aligned()) {
create_pack = false;
} else {
// Check if packs of the same memory slice but
// with a different alignment were created before.
for (uint i = 0; i < align_to_refs.size(); i++) {
MemNode* mr = align_to_refs.at(i)->as_Mem();
if (mr == mem_ref) {
// Skip when we are looking at same memory operation.
continue;
}
if (same_memory_slice(mem_ref, mr) &&
memory_alignment(mr, iv_adjustment) != 0)
create_pack = false;
}
}
}
}
if (create_pack) {
if (can_create_pairs(mem_ref, iv_adjustment, align_to_ref_p,
best_align_to_mem_ref, best_iv_adjustment,
align_to_refs)) {
// Create initial pack pairs of memory operations for which alignment was set.
for (uint i = 0; i < memops.size(); i++) {
Node* s1 = memops.at(i);
int align = alignment(s1);
Expand All @@ -707,7 +665,9 @@ void SuperWord::find_adjacent_refs() {
}
}
}
} else { // Don't create unaligned pack
} else {
// Cannot create pairs for mem_ref. Reject all related memops forever.

// First, remove remaining memory ops of the same memory slice from the list.
for (int i = memops.size() - 1; i >= 0; i--) {
MemNode* s = memops.at(i)->as_Mem();
Expand Down Expand Up @@ -794,6 +754,96 @@ void SuperWord::find_adjacent_refs_trace_1(Node* best_align_to_mem_ref, int best
}
#endif

// Decide whether pack pairs may be created for mem_ref.
//  - If the hardware requires aligned vectors, enforce the strict alignment rules.
//  - Otherwise, only require consistent alignment within a memory slice, so that no
//    memory-dependence can exist between different vector "lanes".
// Returns true if pairs may be created for mem_ref, false if mem_ref must be rejected.
bool SuperWord::can_create_pairs(MemNode* mem_ref, int iv_adjustment, SWPointer &align_to_ref_p,
                                 MemNode* best_align_to_mem_ref, int best_iv_adjustment,
                                 Node_List &align_to_refs) {
  // Alignment of mem_ref relative to the reference we align the loop to. Computed up
  // front because both the strict and the relaxed path consult it.
  const bool aligned_with_best = (memory_alignment(mem_ref, best_iv_adjustment) == 0);

  if (!vectors_should_be_aligned()) {
    // The hardware does not demand alignment. We still have to make sure that the pack
    // for mem_ref is independent, i.e. that all its members are mutually independent.

    if (_do_vector_loop) {
      // Independence of whole packs is checked later, in combine_packs. At this point
      // only adjacent pairs are known to be independent. Deferring the check lets us
      // vectorize without alignment modulo vector_width, e.g. a forward read:
      //   for (int i ...) { v[i] = v[i + 1] + 5; }
      // whereas a forward write is filtered out in combine_packs:
      //   for (int i ...) { v[i + 1] = v[i] + 5; }
      return true;
    }

    // When every mem_ref is aligned modulo vector_width with all other mem_ref's of its
    // memory slice, the VectorLoad / VectorStore regions either overlap exactly or not
    // at all. Hence there cannot be memory-dependencies between different vector "lanes".
    // Later, SuperWord::filter_packs -> SuperWord::profitable -> SuperWord::is_vector_use
    // checks that all inputs are vectors matching on every element (with some reasonable
    // exceptions), so every "lane" is isomorphic to and independent of all other "lanes".
    // This permits vectorizing cases such as:
    //   for (int i ...) { v[i] = v[i] + 5; }       // same alignment
    //   for (int i ...) { v[i] = v[i + 32] + 5; }  // alignment modulo vector_width
    if (!same_memory_slice(mem_ref, best_align_to_mem_ref)) {
      return is_mem_ref_aligned_with_same_memory_slice(mem_ref, iv_adjustment, align_to_refs);
    }
    return aligned_with_best;
  }

  // Strict mode: all vectors must be memory aligned, modulo their vector_width. This is
  // stricter than the hardware probably requires; most hardware requires at most 4-byte
  // alignment.
  //
  // The pre-loop aligns best_align_to_mem_ref. To guarantee that every mem_ref is memory
  // aligned modulo its vector_width, it suffices that each one is aligned to
  // best_align_to_mem_ref modulo its vector_width. Three conditions establish that.

  // (1) The pack must be aligned with best_align_to_mem_ref.
  if (!aligned_with_best) {
    return false;
  }

  // (2) The vector width of mem_ref must not exceed that of best_align_to_mem_ref: the
  //     pre-loop only aligns to the latter, so a wider mem_ref might not be aligned to
  //     its own vector_width.
  const int mem_ref_vw = vector_width(mem_ref);
  const int best_vw    = vector_width(best_align_to_mem_ref);
  if (mem_ref_vw > best_vw) {
    return false;
  }

  // (3) The invariants must agree. Memory accesses are modeled as
  //       address = base + k*iv + constant [+ invar]
  //     and memory_alignment ignores the invariant, so accesses with different
  //     invariants must not be vectorized when unaligned accesses are not allowed.
  SWPointer best_p(best_align_to_mem_ref, this, nullptr, false);
  return align_to_ref_p.invar_equals(best_p);
}

// Check that mem_ref's alignment agrees with every other reference in align_to_refs
// that belongs to the same memory slice.
// Returns false as soon as one such reference has non-zero memory_alignment under
// iv_adjustment, true if no misalignment is found.
bool SuperWord::is_mem_ref_aligned_with_same_memory_slice(MemNode* mem_ref, int iv_adjustment,
                                                          Node_List &align_to_refs) {
  for (uint idx = 0; idx < align_to_refs.size(); idx++) {
    MemNode* other = align_to_refs.at(idx)->as_Mem();
    if (other == mem_ref) {
      // Do not compare mem_ref against itself.
      continue;
    }
    if (!same_memory_slice(other, mem_ref)) {
      // References of a different memory slice are not relevant here.
      continue;
    }
    if (memory_alignment(other, iv_adjustment) != 0) {
      // mem_ref is misaligned with another ref of the same memory slice.
      return false;
    }
  }
  // Every same-slice reference is consistently aligned.
  return true;
}

//------------------------------find_align_to_ref---------------------------
// Find a memory reference to align the loop induction variable to.
// Looks first at stores then at loads, looking for a memory reference
Expand Down Expand Up @@ -1326,6 +1376,44 @@ bool SuperWord::independent(Node* s1, Node* s2) {
return independent_path(shallow, deep);
}

//------------------------------find_dependence---------------------
// Is any s1 in p dependent on any s2 in p? Yes: return such a s2. No: return nullptr.
// We could query independent(s1, s2) for all pairs, but that results
// in O(p.size * p.size) graph traversals. We can do it all in one BFS!
// Start the BFS traversal at all nodes from the pack. Traverse DepPreds
// recursively, for nodes that have at least depth min_d, which is the
// smallest depth of all nodes from the pack. Once we have traversed all
// those nodes, and have not found another node from the pack, we know
// that all nodes in the pack are independent.
Node* SuperWord::find_dependence(Node_List* p) {
if (p->at(0)->is_reduction()) {
return nullptr; // ignore reductions
}
ResourceMark rm;
Unique_Node_List worklist; // traversal queue
int min_d = depth(p->at(0));
visited_clear();
for (uint k = 0; k < p->size(); k++) {
Node* n = p->at(k);
min_d = MIN2(min_d, depth(n));
worklist.push(n); // start traversal at all nodes in p
visited_set(n); // mark node
}
for (uint i = 0; i < worklist.size(); i++) {
Node* n = worklist.at(i);
for (DepPreds preds(n, _dg); !preds.done(); preds.next()) {
Node* pred = preds.current();
if (in_bb(pred) && depth(pred) >= min_d) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @eme64, dependency captures three scenarios:
a) RAW (true) : Store -> Load
b) WAR (anti) : Load -> Store
c) WAW : Store -> Store.
RAR is a non-dependency. The existing routine independent_path is called during statement packing after it's certain that statements are isomorphic. The newly added find_dependence routine optimizes the dependency checking, but it does not seem to be skipping over anti-dependence checks, which may prevent vectorization in some cases. Can you share your thoughts on this?

Copy link
Contributor Author

@eme64 eme64 Feb 20, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks @jatin-bhateja . In essence, I am trusting DepPreds to give me the correct dependency information. Is that not correct? Do you have an example where DepPreds would have a Load depend on a Load, without a store in between?

Also: it seems that SuperWord::independent_path does nothing special about RAR, it just follows all dependencies of DepPreds recursively. This is what is used in SuperWord::independent. So if DepPreds could not be trusted, then we have the same issue here.

In the end, find_dependence should do nothing else but SuperWord::independent, except for a whole pack at once, and not only pairwise for two memops.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are correct Load-Load(RAR) is not captured in dependence graph anyways.

My above comment was in context of scenario b), I verified that newly added find_dependence method will not find any circular intra-pack dependence in this case.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jatin-bhateja Ah, I misunderstood. Thanks for clarifying. So you verified that WAR is not an issue? Thanks!

I agree that it should not be an issue. For a Load -> Store to block vectorization, we would have to find a dependency path from a Load -> Store -> Load (LSL) or a Store -> Load -> Store (SLS), since my verification only checks if any of the memops in the pack depend on another memop of the pack (both loads or both stores).
But both LSL and SLS have a Store -> Load (RAW) and so we cannot vectorize.
Do you agree with this argument?

Copy link
Member

@jatin-bhateja jatin-bhateja Feb 21, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jatin-bhateja Ah, I misunderstood. Thanks for clarifying. So you verified that WAR is not an issue? Thanks!

I agree that it should not be an issue. For a Load -> Store to block vectorization, we would have to find a dependency path from a Load -> Store -> Load (LSL) or a Store -> Load -> Store (SLS), since my verification only checks if any of the memops in the pack depend on another memop of the pack (both loads or both stores). But both LSL and SLS have a Store -> Load (RAW) and so we cannot vectorize. Do you agree with this argument?

Correct, as long as the Load in (Store->Load) is not feeding back into any other Store in the same pack.

Without complicating things, the way I see this is that dependence checking, looking for an intra-pack dependency where the pack is comprised of isomorphic nodes (loads/stores), does a walk over the data dependency edges of each memory node in the pack, and if the traversal lands back on one of the nodes in the pack, it should declare a cyclic dependence.

if (visited_test(pred)) { // marked as in p?
return pred;
}
worklist.push(pred);
}
}
}
return nullptr;
}

//--------------------------have_similar_inputs-----------------------
// For a node pair (s1, s2) which is isomorphic and independent,
// do s1 and s2 have similar input edges?
Expand Down Expand Up @@ -1483,8 +1571,10 @@ bool SuperWord::follow_use_defs(Node_List* p) {
int align = alignment(s1);
Node* t1 = s1->in(j);
Node* t2 = s2->in(j);
if (!in_bb(t1) || !in_bb(t2))
if (!in_bb(t1) || !in_bb(t2) || t1->is_Mem() || t2->is_Mem()) {
// Only follow non-memory nodes in block - we do not want to resurrect misaligned packs.
continue;
}
align = adjust_alignment_for_type_conversion(s1, t1, align);
if (stmts_can_pack(t1, t2, align)) {
if (est_savings(t1, t2) >= 0) {
Expand Down Expand Up @@ -1522,10 +1612,16 @@ bool SuperWord::follow_def_uses(Node_List* p) {
for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
Node* t1 = s1->fast_out(i);
num_s1_uses++;
if (!in_bb(t1)) continue;
if (!in_bb(t1) || t1->is_Mem()) {
// Only follow non-memory nodes in block - we do not want to resurrect misaligned packs.
continue;
}
for (DUIterator_Fast jmax, j = s2->fast_outs(jmax); j < jmax; j++) {
Node* t2 = s2->fast_out(j);
if (!in_bb(t2)) continue;
if (!in_bb(t2) || t2->is_Mem()) {
// Only follow non-memory nodes in block - we do not want to resurrect misaligned packs.
continue;
}
if (t2->Opcode() == Op_AddI && t2 == _lp->as_CountedLoop()->incr()) continue; // don't mess with the iv
if (!opnd_positions_match(s1, t1, s2, t2))
continue;
Expand Down Expand Up @@ -1714,7 +1810,6 @@ void SuperWord::combine_packs() {
for (int j = i + 1; j < _packset.length(); j++) {
Node_List* p2 = _packset.at(j);
if (p2 == nullptr) continue;
if (i == j) continue;
if (p1->at(p1->size()-1) == p2->at(0)) {
for (uint k = 1; k < p2->size(); k++) {
p1->push(p2->at(k));
Expand Down Expand Up @@ -1755,6 +1850,32 @@ void SuperWord::combine_packs() {
}
}

if (_do_vector_loop) {
// Since we did not enforce exact alignment of the packsets, we only know that there
// is no dependence with distance 1, because we have checked independent(s1, s2) for
// all adjacent memops. But there could be a dependence of a different distance.
// Hence: remove the pack if there is a dependence.
for (int i = 0; i < _packset.length(); i++) {
Node_List* p = _packset.at(i);
if (p != nullptr) {
Node* dependence = find_dependence(p);
if (dependence != nullptr) {
#ifndef PRODUCT
if (TraceSuperWord) {
tty->cr();
tty->print_cr("WARNING: Found dependency.");
tty->print_cr("Cannot vectorize despite compile directive Vectorize.");
dependence->dump();
tty->print_cr("In pack[%d]", i);
print_pack(p);
}
#endif
_packset.at_put(i, nullptr);
}
}
}
}

// Compress list.
for (int i = _packset.length() - 1; i >= 0; i--) {
Node_List* p1 = _packset.at(i);
Expand All @@ -1773,7 +1894,6 @@ void SuperWord::combine_packs() {
// Construct the map from nodes to packs. Only valid after the
// point where a node is only in one pack (after combine_packs).
void SuperWord::construct_my_pack_map() {
Node_List* rslt = nullptr;
for (int i = 0; i < _packset.length(); i++) {
Node_List* p = _packset.at(i);
for (uint j = 0; j < p->size(); j++) {
Expand Down Expand Up @@ -2203,6 +2323,29 @@ bool SuperWord::profitable(Node_List* p) {
return true;
}

#ifdef ASSERT
// Debug-only verification: for every pack in the packset, all nodes must be mutually
// independent. If find_dependence reports a node that other pack members depend on,
// dump that node, the pack members that are not independent of it, and the pack itself,
// then fail the assert.
void SuperWord::verify_packs() {
  for (int pack_idx = 0; pack_idx < _packset.length(); pack_idx++) {
    Node_List* pack = _packset.at(pack_idx);
    Node* dependence = find_dependence(pack);
    if (dependence == nullptr) {
      // Pack is fine, nothing to report.
      continue;
    }

    tty->print_cr("Other nodes in pack have dependence on:");
    dependence->dump();

    tty->print_cr("The following nodes are not independent:");
    for (uint member_idx = 0; member_idx < pack->size(); member_idx++) {
      Node* member = pack->at(member_idx);
      if (independent(member, dependence)) {
        continue;
      }
      member->dump();
    }

    tty->print_cr("They are all from pack[%d]", pack_idx);
    print_pack(pack);

    // Reached only with a non-null dependence, so this always fires here.
    assert(dependence == nullptr, "all nodes in pack must be mutually independent");
  }
}
#endif

//------------------------------schedule---------------------------
// Adjust the memory graph for the packed operations
void SuperWord::schedule() {
Expand Down Expand Up @@ -4027,7 +4170,11 @@ void SuperWord::print_packset() {
for (int i = 0; i < _packset.length(); i++) {
tty->print_cr("Pack: %d", i);
Node_List* p = _packset.at(i);
print_pack(p);
if (p == nullptr) {
tty->print_cr(" nullptr");
} else {
print_pack(p);
}
}
#endif
}
Expand Down
13 changes: 13 additions & 0 deletions src/hotspot/share/opto/superword.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -476,6 +476,15 @@ class SuperWord : public ResourceObj {
void find_adjacent_refs_trace_1(Node* best_align_to_mem_ref, int best_iv_adjustment);
void print_loop(bool whole);
#endif
// Check if we can create the pack pairs for mem_ref:
// If required, enforce strict alignment requirements of hardware.
// Else, only enforce alignment within a memory slice, so that there cannot be any
// memory-dependence between different vector "lanes".
bool can_create_pairs(MemNode* mem_ref, int iv_adjustment, SWPointer &align_to_ref_p,
MemNode* best_align_to_mem_ref, int best_iv_adjustment,
Node_List &align_to_refs);
// Check if alignment of mem_ref is consistent with the other packs of the same memory slice.
bool is_mem_ref_aligned_with_same_memory_slice(MemNode* mem_ref, int iv_adjustment, Node_List &align_to_refs);
// Find a memory reference to align the loop induction variable to.
MemNode* find_align_to_ref(Node_List &memops, int &idx);
// Calculate loop's iv adjustment for this memory ops.
Expand Down Expand Up @@ -512,6 +521,8 @@ class SuperWord : public ResourceObj {
bool isomorphic(Node* s1, Node* s2);
// Is there no data path from s1 to s2 or s2 to s1?
bool independent(Node* s1, Node* s2);
// Is any s1 in p dependent on any s2 in p? Yes: return such a s2. No: return nullptr.
Node* find_dependence(Node_List* p);
// For a node pair (s1, s2) which is isomorphic and independent,
// do s1 and s2 have similar input edges?
bool have_similar_inputs(Node* s1, Node* s2);
Expand Down Expand Up @@ -543,6 +554,8 @@ class SuperWord : public ResourceObj {
void filter_packs();
// Merge CMove into new vector-nodes
void merge_packs_to_cmove();
// Verify that for every pack, all nodes are mutually independent
DEBUG_ONLY(void verify_packs();)
// Adjust the memory graph for the packed operations
void schedule();
// Remove "current" from its current position in the memory graph and insert
Expand Down
Loading