From ca5ca85d2408abfcb8a37f16476dba13c3b474d0 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 11 Mar 2024 07:12:15 +0000 Subject: [PATCH] 8325651: C2 SuperWord: refactor the dependency graph Reviewed-by: chagedorn, vlivanov --- src/hotspot/share/opto/superword.cpp | 301 ++---------------- src/hotspot/share/opto/superword.hpp | 157 +-------- .../share/opto/traceAutoVectorizationTag.hpp | 4 +- src/hotspot/share/opto/vectorization.cpp | 161 ++++++++++ src/hotspot/share/opto/vectorization.hpp | 114 ++++++- 5 files changed, 309 insertions(+), 428 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index ef4bae89426..7acc86a6d3f 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -46,7 +46,6 @@ SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) : _node_info(arena(), _vloop.estimated_body_length(), 0, SWNodeInfo::initial), // info needed per node _clone_map(phase()->C->clone_map()), // map of nodes created in cloning _align_to_ref(nullptr), // memory reference to align vectors to - _dg(arena()), // dependence graph _race_possible(false), // cases where SDMU is true _do_vector_loop(phase()->C->do_vector_loop()), // whether to do vectorization/simd style _num_work_vecs(0), // amount of vector work we have @@ -452,12 +451,6 @@ bool SuperWord::SLP_extract() { // Ensure extra info is allocated. initialize_node_info(); - // build _dg - dependence_graph(); - - // compute function depth(Node*) - compute_max_depth(); - // Attempt vectorization find_adjacent_refs(); @@ -749,86 +742,6 @@ int SuperWord::get_iv_adjustment(MemNode* mem_ref) { return iv_adjustment; } -//---------------------------dependence_graph--------------------------- -// Construct dependency graph. 
-// Add dependence edges to load/store nodes for memory dependence -// A.out()->DependNode.in(1) and DependNode.out()->B.prec(x) -void SuperWord::dependence_graph() { - CountedLoopNode *cl = lpt()->_head->as_CountedLoop(); - assert(cl->is_main_loop(), "SLP should only work on main loops"); - - // First, assign a dependence node to each memory node - for (int i = 0; i < body().length(); i++ ) { - Node* n = body().at(i); - if (n->is_Mem() || n->is_memory_phi()) { - _dg.make_node(n); - } - } - - const GrowableArray& mem_slice_head = _vloop_analyzer.memory_slices().heads(); - const GrowableArray& mem_slice_tail = _vloop_analyzer.memory_slices().tails(); - - ResourceMark rm; - GrowableArray slice_nodes; - - // For each memory slice, create the dependences - for (int i = 0; i < mem_slice_head.length(); i++) { - PhiNode* head = mem_slice_head.at(i); - MemNode* tail = mem_slice_tail.at(i); - - // Get slice in predecessor order (last is first) - _vloop_analyzer.memory_slices().get_slice_in_reverse_order(head, tail, slice_nodes); - - // Make the slice dependent on the root - DepMem* slice = _dg.dep(head); - _dg.make_edge(_dg.root(), slice); - - // Create a sink for the slice - DepMem* slice_sink = _dg.make_node(nullptr); - _dg.make_edge(slice_sink, _dg.tail()); - - // Now visit each pair of memory ops, creating the edges - for (int j = slice_nodes.length() - 1; j >= 0 ; j--) { - Node* s1 = slice_nodes.at(j); - - // If no dependency yet, use slice - if (_dg.dep(s1)->in_cnt() == 0) { - _dg.make_edge(slice, s1); - } - VPointer p1(s1->as_Mem(), _vloop); - bool sink_dependent = true; - for (int k = j - 1; k >= 0; k--) { - Node* s2 = slice_nodes.at(k); - if (s1->is_Load() && s2->is_Load()) - continue; - VPointer p2(s2->as_Mem(), _vloop); - - int cmp = p1.cmp(p2); - if (!VPointer::not_equal(cmp)) { - // Possibly same address - _dg.make_edge(s1, s2); - sink_dependent = false; - } - } - if (sink_dependent) { - _dg.make_edge(s1, slice_sink); - } - } - -#ifndef PRODUCT - if 
(is_trace_superword_dependence_graph()) { - tty->print_cr("\nDependence graph for slice: %d", head->_idx); - for (int q = 0; q < slice_nodes.length(); q++) { - _dg.print(slice_nodes.at(q)); - } - tty->cr(); - } -#endif - - slice_nodes.clear(); - } -} - void VLoopMemorySlices::find_memory_slices() { assert(_heads.is_empty(), "not yet computed"); assert(_tails.is_empty(), "not yet computed"); @@ -861,7 +774,7 @@ void VLoopMemorySlices::print() const { #endif // Get all memory nodes of a slice, in reverse order -void VLoopMemorySlices::get_slice_in_reverse_order(PhiNode* head, MemNode* tail, GrowableArray &slice) const { +void VLoopMemorySlices::get_slice_in_reverse_order(PhiNode* head, MemNode* tail, GrowableArray &slice) const { assert(slice.is_empty(), "start empty"); Node* n = tail; Node* prev = nullptr; @@ -871,7 +784,7 @@ void VLoopMemorySlices::get_slice_in_reverse_order(PhiNode* head, MemNode* tail, Node* out = n->fast_out(i); if (out->is_Load()) { if (_vloop.in_bb(out)) { - slice.push(out); + slice.push(out->as_Load()); } } else { // FIXME @@ -889,7 +802,7 @@ void VLoopMemorySlices::get_slice_in_reverse_order(PhiNode* head, MemNode* tail, }//else }//for if (n == head) { break; } - slice.push(n); + slice.push(n->as_Mem()); prev = n; assert(n->is_Mem(), "unexpected node %s", n->Name()); n = n->in(MemNode::Memory); @@ -1001,9 +914,8 @@ bool SuperWord::isomorphic(Node* s1, Node* s2) { } } -//------------------------------independent--------------------------- // Is there no data path from s1 to s2 or s2 to s1? 
-bool SuperWord::independent(Node* s1, Node* s2) { +bool VLoopDependencyGraph::independent(Node* s1, Node* s2) const { int d1 = depth(s1); int d2 = depth(s2); @@ -1024,9 +936,9 @@ bool SuperWord::independent(Node* s1, Node* s2) { worklist.push(deep); for (uint i = 0; i < worklist.size(); i++) { Node* n = worklist.at(i); - for (DepPreds preds(n, _dg); !preds.done(); preds.next()) { + for (PredsIterator preds(*this, n); !preds.done(); preds.next()) { Node* pred = preds.current(); - if (in_bb(pred) && depth(pred) >= min_d) { + if (_vloop.in_bb(pred) && depth(pred) >= min_d) { if (pred == shallow) { return false; // found it -> dependent } @@ -1045,7 +957,7 @@ bool SuperWord::independent(Node* s1, Node* s2) { // is the smallest depth of all nodes from the nodes list. Once we have // traversed all those nodes, and have not found another node from the // nodes list, we know that all nodes in the nodes list are independent. -bool SuperWord::mutually_independent(const Node_List* nodes) const { +bool VLoopDependencyGraph::mutually_independent(const Node_List* nodes) const { ResourceMark rm; Unique_Node_List worklist; VectorSet nodes_set; @@ -1054,14 +966,14 @@ bool SuperWord::mutually_independent(const Node_List* nodes) const { Node* n = nodes->at(k); min_d = MIN2(min_d, depth(n)); worklist.push(n); // start traversal at all nodes in nodes list - nodes_set.set(bb_idx(n)); + nodes_set.set(_body.bb_idx(n)); } for (uint i = 0; i < worklist.size(); i++) { Node* n = worklist.at(i); - for (DepPreds preds(n, _dg); !preds.done(); preds.next()) { + for (PredsIterator preds(*this, n); !preds.done(); preds.next()) { Node* pred = preds.current(); - if (in_bb(pred) && depth(pred) >= min_d) { - if (nodes_set.test(bb_idx(pred))) { + if (_vloop.in_bb(pred) && depth(pred) >= min_d) { + if (nodes_set.test(_body.bb_idx(pred))) { return false; // found one -> dependent } worklist.push(pred); @@ -1982,16 +1894,16 @@ void SuperWord::verify_packs() { } #endif -// The PacksetGraph combines the 
DepPreds graph with the packset. In the PackSet +// The PacksetGraph combines the dependency graph with the packset. In the PackSet // graph, we have two kinds of nodes: // (1) pack-node: Represents all nodes of some pack p in a single node, which // shall later become a vector node. // (2) scalar-node: Represents a node that is not in any pack. -// For any edge (n1, n2) in DepPreds, we add an edge to the PacksetGraph for the -// PacksetGraph nodes corresponding to n1 and n2. -// We work from the DepPreds graph, because it gives us all the data-dependencies, -// as well as more refined memory-dependencies than the C2 graph. DepPreds does -// not have cycles. But packing nodes can introduce cyclic dependencies. Example: +// For any edge (n1, n2) in the dependency graph, we add an edge to the PacksetGraph for +// the PacksetGraph nodes corresponding to n1 and n2. +// We work from the dependency graph, because it gives us all the data-dependencies, +// as well as more refined memory-dependencies than the C2 graph. The dependency graph +// does not have cycles. But packing nodes can introduce cyclic dependencies. Example: // // +--------+ // A -> X | v @@ -2055,11 +1967,10 @@ public: GrowableArray& out(int pid) { return _out.at(pid - 1); } bool schedule_success() const { return _schedule_success; } - // Create nodes (from packs and scalar-nodes), and add edges, based on DepPreds. + // Create nodes (from packs and scalar-nodes), and add edges, based on the dependency graph. 
void build() { const GrowableArray& packset = _slp->packset(); const GrowableArray& body = _slp->body(); - const DepGraph& dg = _slp->dg(); // Map nodes in packsets for (int i = 0; i < packset.length(); i++) { Node_List* p = packset.at(i); @@ -2096,7 +2007,7 @@ public: for (uint k = 0; k < p->size(); k++) { Node* n = p->at(k); assert(pid == get_pid(n), "all nodes in pack have same pid"); - for (DepPreds preds(n, dg); !preds.done(); preds.next()) { + for (VLoopDependencyGraph::PredsIterator preds(_slp->dependency_graph(), n); !preds.done(); preds.next()) { Node* pred = preds.current(); int pred_pid = get_pid_or_zero(pred); if (pred_pid == pid && _slp->is_marked_reduction(n)) { @@ -2118,7 +2029,7 @@ public: if (pid <= max_pid_packset) { continue; // Only scalar-nodes } - for (DepPreds preds(n, dg); !preds.done(); preds.next()) { + for (VLoopDependencyGraph::PredsIterator preds(_slp->dependency_graph(), n); !preds.done(); preds.next()) { Node* pred = preds.current(); int pred_pid = get_pid_or_zero(pred); // Only add edges for mapped nodes (in body) @@ -2209,7 +2120,7 @@ public: }; // The C2 graph (specifically the memory graph), needs to be re-ordered. -// (1) Build the PacksetGraph. It combines the DepPreds graph with the +// (1) Build the PacksetGraph. It combines the dependency graph with the // packset. The PacksetGraph gives us the dependencies that must be // respected after scheduling. // (2) Schedule the PacksetGraph to the memops_schedule, which represents @@ -3042,41 +2953,6 @@ void SuperWord::initialize_node_info() { grow_node_info(bb_idx(last)); } -//------------------------------compute_max_depth--------------------------- -// Compute max depth for expressions from beginning of block -// Use to prune search paths during test for independence. 
-void SuperWord::compute_max_depth() { - int ct = 0; - bool again; - do { - again = false; - for (int i = 0; i < body().length(); i++) { - Node* n = body().at(i); - if (!n->is_Phi()) { - int d_orig = depth(n); - int d_in = 0; - for (DepPreds preds(n, _dg); !preds.done(); preds.next()) { - Node* pred = preds.current(); - if (in_bb(pred)) { - d_in = MAX2(d_in, depth(pred)); - } - } - if (d_in + 1 != d_orig) { - set_depth(n, d_in + 1); - again = true; - } - } - } - ct++; - } while (again); - -#ifndef PRODUCT - if (is_trace_superword_dependence_graph()) { - tty->print_cr("compute_max_depth iterated: %d times", ct); - } -#endif -} - BasicType SuperWord::longer_type_for_conversion(Node* n) { if (!(VectorNode::is_convert_opcode(n->Opcode()) || requires_long_to_int_conversion(n->Opcode())) || @@ -3734,141 +3610,6 @@ void SuperWord::print_stmt(Node* s) { const SWNodeInfo SWNodeInfo::initial; - -// ============================ DepGraph =========================== - -//------------------------------make_node--------------------------- -// Make a new dependence graph node for an ideal node. 
-DepMem* DepGraph::make_node(Node* node) { - DepMem* m = new (_arena) DepMem(node); - if (node != nullptr) { - assert(_map.at_grow(node->_idx) == nullptr, "one init only"); - _map.at_put_grow(node->_idx, m); - } - return m; -} - -//------------------------------make_edge--------------------------- -// Make a new dependence graph edge from dpred -> dsucc -DepEdge* DepGraph::make_edge(DepMem* dpred, DepMem* dsucc) { - DepEdge* e = new (_arena) DepEdge(dpred, dsucc, dsucc->in_head(), dpred->out_head()); - dpred->set_out_head(e); - dsucc->set_in_head(e); - return e; -} - -// ========================== DepMem ======================== - -//------------------------------in_cnt--------------------------- -int DepMem::in_cnt() { - int ct = 0; - for (DepEdge* e = _in_head; e != nullptr; e = e->next_in()) ct++; - return ct; -} - -//------------------------------out_cnt--------------------------- -int DepMem::out_cnt() { - int ct = 0; - for (DepEdge* e = _out_head; e != nullptr; e = e->next_out()) ct++; - return ct; -} - -//------------------------------print----------------------------- -void DepMem::print() { -#ifndef PRODUCT - tty->print(" DepNode %d (", _node->_idx); - for (DepEdge* p = _in_head; p != nullptr; p = p->next_in()) { - Node* pred = p->pred()->node(); - tty->print(" %d", pred != nullptr ? pred->_idx : 0); - } - tty->print(") ["); - for (DepEdge* s = _out_head; s != nullptr; s = s->next_out()) { - Node* succ = s->succ()->node(); - tty->print(" %d", succ != nullptr ? succ->_idx : 0); - } - tty->print_cr(" ]"); -#endif -} - -// =========================== DepEdge ========================= - -//------------------------------DepPreds--------------------------- -void DepEdge::print() { -#ifndef PRODUCT - tty->print_cr("DepEdge: %d [ %d ]", _pred->node()->_idx, _succ->node()->_idx); -#endif -} - -// =========================== DepPreds ========================= -// Iterator over predecessor edges in the dependence graph. 
- -//------------------------------DepPreds--------------------------- -DepPreds::DepPreds(Node* n, const DepGraph& dg) { - _n = n; - _done = false; - if (_n->is_Store() || _n->is_Load()) { - _next_idx = MemNode::Address; - _end_idx = n->req(); - _dep_next = dg.dep(_n)->in_head(); - } else if (_n->is_Mem()) { - _next_idx = 0; - _end_idx = 0; - _dep_next = dg.dep(_n)->in_head(); - } else { - _next_idx = 1; - _end_idx = _n->req(); - _dep_next = nullptr; - } - next(); -} - -//------------------------------next--------------------------- -void DepPreds::next() { - if (_dep_next != nullptr) { - _current = _dep_next->pred()->node(); - _dep_next = _dep_next->next_in(); - } else if (_next_idx < _end_idx) { - _current = _n->in(_next_idx++); - } else { - _done = true; - } -} - -// =========================== DepSuccs ========================= -// Iterator over successor edges in the dependence graph. - -//------------------------------DepSuccs--------------------------- -DepSuccs::DepSuccs(Node* n, DepGraph& dg) { - _n = n; - _done = false; - if (_n->is_Load()) { - _next_idx = 0; - _end_idx = _n->outcnt(); - _dep_next = dg.dep(_n)->out_head(); - } else if (_n->is_Mem() || _n->is_memory_phi()) { - _next_idx = 0; - _end_idx = 0; - _dep_next = dg.dep(_n)->out_head(); - } else { - _next_idx = 0; - _end_idx = _n->outcnt(); - _dep_next = nullptr; - } - next(); -} - -//-------------------------------next--------------------------- -void DepSuccs::next() { - if (_dep_next != nullptr) { - _current = _dep_next->succ()->node(); - _dep_next = _dep_next->next_out(); - } else if (_next_idx < _end_idx) { - _current = _n->raw_out(_next_idx++); - } else { - _done = true; - } -} - // // --------------------------------- vectorization/simd ----------------------------------- // diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index 00a8c915ac7..c007f7bd51e 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ 
-57,128 +57,6 @@ class VPointer; -// ========================= Dependence Graph ===================== - -class DepMem; - -//------------------------------DepEdge--------------------------- -// An edge in the dependence graph. The edges incident to a dependence -// node are threaded through _next_in for incoming edges and _next_out -// for outgoing edges. -class DepEdge : public ArenaObj { - protected: - DepMem* _pred; - DepMem* _succ; - DepEdge* _next_in; // list of in edges, null terminated - DepEdge* _next_out; // list of out edges, null terminated - - public: - DepEdge(DepMem* pred, DepMem* succ, DepEdge* next_in, DepEdge* next_out) : - _pred(pred), _succ(succ), _next_in(next_in), _next_out(next_out) {} - - DepEdge* next_in() { return _next_in; } - DepEdge* next_out() { return _next_out; } - DepMem* pred() { return _pred; } - DepMem* succ() { return _succ; } - - void print(); -}; - -//------------------------------DepMem--------------------------- -// A node in the dependence graph. _in_head starts the threaded list of -// incoming edges, and _out_head starts the list of outgoing edges. 
-class DepMem : public ArenaObj { - protected: - Node* _node; // Corresponding ideal node - DepEdge* _in_head; // Head of list of in edges, null terminated - DepEdge* _out_head; // Head of list of out edges, null terminated - - public: - DepMem(Node* node) : _node(node), _in_head(nullptr), _out_head(nullptr) {} - - Node* node() { return _node; } - DepEdge* in_head() { return _in_head; } - DepEdge* out_head() { return _out_head; } - void set_in_head(DepEdge* hd) { _in_head = hd; } - void set_out_head(DepEdge* hd) { _out_head = hd; } - - int in_cnt(); // Incoming edge count - int out_cnt(); // Outgoing edge count - - void print(); -}; - -//------------------------------DepGraph--------------------------- -class DepGraph { - protected: - Arena* _arena; - GrowableArray _map; - DepMem* _root; - DepMem* _tail; - - public: - DepGraph(Arena* a) : _arena(a), _map(a, 8, 0, nullptr) { - _root = new (_arena) DepMem(nullptr); - _tail = new (_arena) DepMem(nullptr); - } - - DepMem* root() { return _root; } - DepMem* tail() { return _tail; } - - // Return dependence node corresponding to an ideal node - DepMem* dep(Node* node) const { return _map.at(node->_idx); } - - // Make a new dependence graph node for an ideal node. - DepMem* make_node(Node* node); - - // Make a new dependence graph edge dprec->dsucc - DepEdge* make_edge(DepMem* dpred, DepMem* dsucc); - - DepEdge* make_edge(Node* pred, Node* succ) { return make_edge(dep(pred), dep(succ)); } - DepEdge* make_edge(DepMem* pred, Node* succ) { return make_edge(pred, dep(succ)); } - DepEdge* make_edge(Node* pred, DepMem* succ) { return make_edge(dep(pred), succ); } - - void print(Node* n) { dep(n)->print(); } - void print(DepMem* d) { d->print(); } -}; - -//------------------------------DepPreds--------------------------- -// Iterator over predecessors in the dependence graph and -// non-memory-graph inputs of ideal nodes. 
-class DepPreds : public StackObj { -private: - Node* _n; - int _next_idx, _end_idx; - DepEdge* _dep_next; - Node* _current; - bool _done; - -public: - DepPreds(Node* n, const DepGraph& dg); - Node* current() { return _current; } - bool done() { return _done; } - void next(); -}; - -//------------------------------DepSuccs--------------------------- -// Iterator over successors in the dependence graph and -// non-memory-graph outputs of ideal nodes. -class DepSuccs : public StackObj { -private: - Node* _n; - int _next_idx, _end_idx; - DepEdge* _dep_next; - Node* _current; - bool _done; - -public: - DepSuccs(Node* n, DepGraph& dg); - Node* current() { return _current; } - bool done() { return _done; } - void next(); -}; - - // ========================= SuperWord ===================== // -----------------------------SWNodeInfo--------------------------------- @@ -186,10 +64,9 @@ public: class SWNodeInfo { public: int _alignment; // memory alignment for a node - int _depth; // Max expression (DAG) depth from block start Node_List* _my_pack; // pack containing this node - SWNodeInfo() : _alignment(-1), _depth(0), _my_pack(nullptr) {} + SWNodeInfo() : _alignment(-1), _my_pack(nullptr) {} static const SWNodeInfo initial; }; @@ -212,8 +89,6 @@ class SuperWord : public ResourceObj { CloneMap& _clone_map; // map of nodes created in cloning MemNode const* _align_to_ref; // Memory reference that pre-loop will align to - DepGraph _dg; // Dependence graph - public: SuperWord(const VLoopAnalyzer &vloop_analyzer); @@ -280,6 +155,19 @@ class SuperWord : public ResourceObj { return _vloop_analyzer.types().vector_width_in_bytes(n); } + // VLoopDependencyGraph Accessors + const VLoopDependencyGraph& dependency_graph() const { + return _vloop_analyzer.dependency_graph(); + } + + bool independent(Node* n1, Node* n2) const { + return _vloop_analyzer.dependency_graph().independent(n1, n2); + } + + bool mutually_independent(const Node_List* nodes) const { + return 
_vloop_analyzer.dependency_graph().mutually_independent(nodes); + } + #ifndef PRODUCT // TraceAutoVectorization and TraceSuperWord bool is_trace_superword_alignment() const { @@ -287,11 +175,6 @@ class SuperWord : public ResourceObj { return _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT); } - bool is_trace_superword_dependence_graph() const { - return TraceSuperWord || - _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH); - } - bool is_trace_superword_adjacent_memops() const { return TraceSuperWord || _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS); @@ -321,7 +204,6 @@ class SuperWord : public ResourceObj { return TraceSuperWord || is_trace_align_vector() || _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT) || - _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH) || _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS) || _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_REJECTIONS) || _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_PACKSET) || @@ -338,7 +220,6 @@ class SuperWord : public ResourceObj { bool do_vector_loop() { return _do_vector_loop; } const GrowableArray& packset() const { return _packset; } - const DepGraph& dg() const { return _dg; } private: bool _race_possible; // In cases where SDMU is true bool _do_vector_loop; // whether to do vectorization/simd style @@ -362,10 +243,6 @@ class SuperWord : public ResourceObj { int alignment(Node* n) { return _node_info.adr_at(bb_idx(n))->_alignment; } void set_alignment(Node* n, int a) { int i = bb_idx(n); grow_node_info(i); _node_info.adr_at(i)->_alignment = a; } - // Max expression (DAG) depth from beginning of the block for each node - int depth(Node* n) const { return _node_info.adr_at(bb_idx(n))->_depth; } - void set_depth(Node* n, int d) { int i = bb_idx(n); grow_node_info(i); _node_info.adr_at(i)->_depth = d; } - // my_pack public: Node_List* my_pack(Node* n) { return 
!in_bb(n) ? nullptr : _node_info.adr_at(bb_idx(n))->_my_pack; } @@ -387,8 +264,6 @@ private: MemNode* find_align_to_ref(Node_List &memops, int &idx); // Calculate loop's iv adjustment for this memory ops. int get_iv_adjustment(MemNode* mem); - // Construct dependency graph. - void dependence_graph(); // Can s1 and s2 be in a pack with s1 immediately preceding s2 and s1 aligned at "align" bool stmts_can_pack(Node* s1, Node* s2, int align); @@ -398,10 +273,6 @@ private: bool are_adjacent_refs(Node* s1, Node* s2); // Are s1 and s2 similar? bool isomorphic(Node* s1, Node* s2); - // Is there no data path from s1 to s2 or s2 to s1? - bool independent(Node* s1, Node* s2); - // Are all nodes in nodes list mutually independent? - bool mutually_independent(const Node_List* nodes) const; // For a node pair (s1, s2) which is isomorphic and independent, // do s1 and s2 have similar input edges? bool have_similar_inputs(Node* s1, Node* s2); diff --git a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp index 615f9230f3a..e04664caba1 100644 --- a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp +++ b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp @@ -35,8 +35,8 @@ flags(MEMORY_SLICES, "Trace VLoopMemorySlices") \ flags(BODY, "Trace VLoopBody") \ flags(TYPES, "Trace VLoopTypes") \ + flags(DEPENDENCY_GRAPH, "Trace VLoopDependencyGraph") \ flags(SW_ALIGNMENT, "Trace SuperWord alignment analysis") \ - flags(SW_DEPENDENCE_GRAPH, "Trace SuperWord::dependence_graph") \ flags(SW_ADJACENT_MEMOPS, "Trace SuperWord::find_adjacent_refs") \ flags(SW_REJECTIONS, "Trace SuperWord rejections (non vectorizations)") \ flags(SW_PACKSET, "Trace SuperWord packset at different stages") \ @@ -115,14 +115,12 @@ class TraceAutoVectorizationTagValidator { _tags.set_range(0, TRACE_AUTO_VECTORIZATION_TAG_NUM); } else if (SW_VERBOSE == tag) { _tags.at_put(SW_ALIGNMENT, set_bit); - _tags.at_put(SW_DEPENDENCE_GRAPH, set_bit); 
_tags.at_put(SW_ADJACENT_MEMOPS, set_bit); _tags.at_put(SW_REJECTIONS, set_bit); _tags.at_put(SW_PACKSET, set_bit); _tags.at_put(SW_INFO, set_bit); _tags.at_put(SW_VERBOSE, set_bit); } else if (SW_INFO == tag) { - _tags.at_put(SW_DEPENDENCE_GRAPH, set_bit); _tags.at_put(SW_ADJACENT_MEMOPS, set_bit); _tags.at_put(SW_REJECTIONS, set_bit); _tags.at_put(SW_PACKSET, set_bit); diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 4acbaedd21d..f3890eee017 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -161,9 +161,170 @@ VStatus VLoopAnalyzer::setup_submodules_helper() { _types.compute_vector_element_type(); + _dependency_graph.construct(); + return VStatus::make_success(); } +// Construct the dependency graph: +// - Data-dependencies: implicit (taken from C2 node inputs). +// - Memory-dependencies: +// - No edges between different slices. +// - No Load-Load edges. +// - Inside a slice, add all Store-Load, Load-Store, Store-Store edges, +// except if we can prove that the memory does not overlap. +void VLoopDependencyGraph::construct() { + const GrowableArray& mem_slice_heads = _memory_slices.heads(); + const GrowableArray& mem_slice_tails = _memory_slices.tails(); + + ResourceMark rm; + GrowableArray slice_nodes; + GrowableArray memory_pred_edges; + + // For each memory slice, create the memory subgraph + for (int i = 0; i < mem_slice_heads.length(); i++) { + PhiNode* head = mem_slice_heads.at(i); + MemNode* tail = mem_slice_tails.at(i); + + _memory_slices.get_slice_in_reverse_order(head, tail, slice_nodes); + + // In forward order (reverse of reverse), visit all memory nodes in the slice. + for (int j = slice_nodes.length() - 1; j >= 0 ; j--) { + MemNode* n1 = slice_nodes.at(j); + memory_pred_edges.clear(); + + VPointer p1(n1, _vloop); + // For all memory nodes before it, check if we need to add a memory edge. 
+ for (int k = slice_nodes.length() - 1; k > j; k--) { + MemNode* n2 = slice_nodes.at(k); + + // Ignore Load-Load dependencies: + if (n1->is_Load() && n2->is_Load()) { continue; } + + VPointer p2(n2, _vloop); + if (!VPointer::not_equal(p1.cmp(p2))) { + // Possibly overlapping memory + memory_pred_edges.append(_body.bb_idx(n2)); + } + } + if (memory_pred_edges.is_nonempty()) { + // Data edges are taken implicitly from the C2 graph, thus we only add + // a dependency node if we have memory edges. + add_node(n1, memory_pred_edges); + } + } + slice_nodes.clear(); + } + + compute_depth(); + + NOT_PRODUCT( if (_vloop.is_trace_dependency_graph()) { print(); } ) +} + +void VLoopDependencyGraph::add_node(MemNode* n, GrowableArray& memory_pred_edges) { + assert(_dependency_nodes.at_grow(_body.bb_idx(n), nullptr) == nullptr, "not yet created"); + assert(!memory_pred_edges.is_empty(), "no need to create a node without edges"); + DependencyNode* dn = new (_arena) DependencyNode(n, memory_pred_edges, _arena); + _dependency_nodes.at_put_grow(_body.bb_idx(n), dn, nullptr); +} + +// We iterate over the body, which is already ordered by the dependencies, i.e. pred comes +// before use. With a single pass, we can compute the depth of every node, since we can +// assume that the depth of all preds is already computed when we compute the depth of use. 
+void VLoopDependencyGraph::compute_depth() { + for (int i = 0; i < _body.body().length(); i++) { + Node* n = _body.body().at(i); + int max_pred_depth = 0; + if (!n->is_Phi()) { // ignore backedge: its source comes later in the body, so its depth is not computed yet + for (PredsIterator it(*this, n); !it.done(); it.next()) { + Node* pred = it.current(); + if (_vloop.in_bb(pred)) { + max_pred_depth = MAX2(max_pred_depth, depth(pred)); + } + } + } + set_depth(n, max_pred_depth + 1); + } +} + +#ifndef PRODUCT +void VLoopDependencyGraph::print() const { + tty->print_cr("\nVLoopDependencyGraph::print:"); + + tty->print_cr(" Memory pred edges:"); + for (int i = 0; i < _body.body().length(); i++) { + Node* n = _body.body().at(i); + const DependencyNode* dn = dependency_node(n); + if (dn != nullptr) { + tty->print(" DependencyNode[%d %s:", n->_idx, n->Name()); + for (uint j = 0; j < dn->memory_pred_edges_length(); j++) { + Node* pred = _body.body().at(dn->memory_pred_edge(j)); + tty->print(" %d %s", pred->_idx, pred->Name()); + } + tty->print_cr("]"); + } + } + tty->cr(); + + tty->print_cr(" Complete dependency graph:"); + for (int i = 0; i < _body.body().length(); i++) { + Node* n = _body.body().at(i); + tty->print(" d%02d Dependencies[%d %s:", depth(n), n->_idx, n->Name()); + for (PredsIterator it(*this, n); !it.done(); it.next()) { + Node* pred = it.current(); + tty->print(" %d %s", pred->_idx, pred->Name()); + } + tty->print_cr("]"); + } +} +#endif + +VLoopDependencyGraph::DependencyNode::DependencyNode(MemNode* n, + GrowableArray& memory_pred_edges, + Arena* arena) : + _node(n), + _memory_pred_edges_length(memory_pred_edges.length()), + _memory_pred_edges(nullptr) +{ + assert(memory_pred_edges.is_nonempty(), "not empty"); + uint bytes = memory_pred_edges.length() * sizeof(int); + _memory_pred_edges = (int*)arena->Amalloc(bytes); + memcpy(_memory_pred_edges, memory_pred_edges.adr_at(0), bytes); +} + +VLoopDependencyGraph::PredsIterator::PredsIterator(const VLoopDependencyGraph& dependency_graph, + const Node* node) : + _dependency_graph(dependency_graph), +
_node(node), + _dependency_node(dependency_graph.dependency_node(node)), + _current(nullptr), + _next_pred(0), + _end_pred(node->req()), + _next_memory_pred(0), + _end_memory_pred((_dependency_node != nullptr) ? _dependency_node->memory_pred_edges_length() : 0) +{ + if (_node->is_Store() || _node->is_Load()) { + // Load: address + // Store: address, value + _next_pred = MemNode::Address; + } else { + assert(!_node->is_Mem(), "only loads and stores are expected mem nodes"); + _next_pred = 1; // skip control + } + next(); +} + +void VLoopDependencyGraph::PredsIterator::next() { + if (_next_pred < _end_pred) { + _current = _node->in(_next_pred++); + } else if (_next_memory_pred < _end_memory_pred) { + int pred_bb_idx = _dependency_node->memory_pred_edge(_next_memory_pred++); + _current = _dependency_graph._body.body().at(pred_bb_idx); + } else { + _current = nullptr; // done + } +} + #ifndef PRODUCT int VPointer::Tracer::_depth = 0; #endif diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 00f6a9de474..88a46a3d688 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -150,6 +150,10 @@ public: return _vtrace.is_trace(TraceAutoVectorizationTag::TYPES); } + bool is_trace_dependency_graph() const { + return _vtrace.is_trace(TraceAutoVectorizationTag::DEPENDENCY_GRAPH); + } + bool is_trace_pointer_analysis() const { return _vtrace.is_trace(TraceAutoVectorizationTag::POINTER_ANALYSIS); } @@ -308,7 +312,7 @@ public: const GrowableArray& tails() const { return _tails; } // Get all memory nodes of a slice, in reverse order - void get_slice_in_reverse_order(PhiNode* head, MemNode* tail, GrowableArray& slice) const; + void get_slice_in_reverse_order(PhiNode* head, MemNode* tail, GrowableArray& slice) const; bool same_memory_slice(MemNode* m1, MemNode* m2) const; @@ -441,6 +445,109 @@ private: const Type* container_type(Node* n) const; }; +// Submodule of VLoopAnalyzer. 
+// The dependency graph is used to determine if nodes are independent, and can thus potentially +// be executed in parallel. That is a prerequisite for packing nodes into vector operations. +// The dependency graph is a combination: +// - Data-dependencies: they can directly be taken from the C2 node inputs. +// - Memory-dependencies: the edges in the C2 memory-slice are too restrictive: for example all +// stores are serialized, even if their memory does not overlap. Thus, +// we refine the memory-dependencies (see construct method). +class VLoopDependencyGraph : public StackObj { +private: + class DependencyNode; + + Arena* _arena; + const VLoop& _vloop; + const VLoopBody& _body; + const VLoopMemorySlices& _memory_slices; + + // bb_idx -> DependenceNode* + GrowableArray _dependency_nodes; + + // Node depth in DAG: bb_idx -> depth + GrowableArray _depths; + +public: + VLoopDependencyGraph(Arena* arena, + const VLoop& vloop, + const VLoopBody& body, + const VLoopMemorySlices& memory_slices) : + _arena(arena), + _vloop(vloop), + _body(body), + _memory_slices(memory_slices), + _dependency_nodes(arena, + vloop.estimated_body_length(), + vloop.estimated_body_length(), + nullptr), + _depths(arena, + vloop.estimated_body_length(), + vloop.estimated_body_length(), + 0) {} + NONCOPYABLE(VLoopDependencyGraph); + + void construct(); + bool independent(Node* s1, Node* s2) const; + bool mutually_independent(const Node_List* nodes) const; + +private: + void add_node(MemNode* n, GrowableArray& memory_pred_edges); + int depth(const Node* n) const { return _depths.at(_body.bb_idx(n)); } + void set_depth(const Node* n, int d) { _depths.at_put(_body.bb_idx(n), d); } + void compute_depth(); + NOT_PRODUCT( void print() const; ) + + const DependencyNode* dependency_node(const Node* n) const { + return _dependency_nodes.at(_body.bb_idx(n)); + } + + class DependencyNode : public ArenaObj { + private: + MemNode* _node; // Corresponding ideal node + const uint _memory_pred_edges_length; + 
int* _memory_pred_edges; // memory pred-edges, mapping to bb_idx + public: + DependencyNode(MemNode* n, GrowableArray& memory_pred_edges, Arena* arena); + NONCOPYABLE(DependencyNode); + uint memory_pred_edges_length() const { return _memory_pred_edges_length; } + + int memory_pred_edge(uint i) const { + assert(i < _memory_pred_edges_length, "bounds check"); + return _memory_pred_edges[i]; + } + }; + +public: + // Iterator for dependency graph predecessors of a node. + class PredsIterator : public StackObj { + private: + const VLoopDependencyGraph& _dependency_graph; + + const Node* _node; + const DependencyNode* _dependency_node; + + Node* _current; + + // Iterate in node->in(i) + int _next_pred; + int _end_pred; + + // Iterate in dependency_node->memory_pred_edge(i) + int _next_memory_pred; + int _end_memory_pred; + public: + PredsIterator(const VLoopDependencyGraph& dependency_graph, const Node* node); + NONCOPYABLE(PredsIterator); + void next(); + bool done() const { return _current == nullptr; } + Node* current() const { + assert(!done(), "not done yet"); + return _current; + } + }; +}; + // Analyze the loop in preparation for auto-vectorization. This class is // deliberately structured into many submodules, which are as independent // as possible, though some submodules do require other submodules. 
@@ -463,6 +570,7 @@ private: VLoopMemorySlices _memory_slices; VLoopBody _body; VLoopTypes _types; + VLoopDependencyGraph _dependency_graph; public: VLoopAnalyzer(const VLoop& vloop, VSharedData& vshared) : @@ -472,7 +580,8 @@ public: _reductions (&_arena, vloop), _memory_slices (&_arena, vloop), _body (&_arena, vloop, vshared), - _types (&_arena, vloop, _body) + _types (&_arena, vloop, _body), + _dependency_graph(&_arena, vloop, _body, _memory_slices) { _success = setup_submodules(); } @@ -486,6 +595,7 @@ public: const VLoopMemorySlices& memory_slices() const { return _memory_slices; } const VLoopBody& body() const { return _body; } const VLoopTypes& types() const { return _types; } + const VLoopDependencyGraph& dependency_graph() const { return _dependency_graph; } private: bool setup_submodules();