8074981: Integer/FP scalar reduction optimization

Add scalar reduction optimization to C2 to take advantage of vector instructions in modern x86 CPUs.

Reviewed-by: kvn, twisti
This commit is contained in:
Michael C Berg 2015-04-01 18:07:50 -07:00 committed by Vladimir Kozlov
parent 7c5d30b0e3
commit 9e55e44c85
22 changed files with 1599 additions and 20 deletions

View file

@ -65,7 +65,8 @@ SuperWord::SuperWord(PhaseIdealLoop* phase) :
_lpt(NULL), // loop tree node
_lp(NULL), // LoopNode
_bb(NULL), // basic block
_iv(NULL) // induction var
_iv(NULL), // induction var
_race_possible(false) // cases where SDMU is true
{}
//------------------------------transform_loop---------------------------
@ -145,7 +146,6 @@ void SuperWord::transform_loop(IdealLoopTree* lpt) {
void SuperWord::SLP_extract() {
// Ready the block
if (!construct_bb())
return; // Exit if no interesting nodes or complex graph.
@ -640,7 +640,7 @@ bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) {
}
if (isomorphic(s1, s2)) {
if (independent(s1, s2)) {
if (independent(s1, s2) || reduction(s1, s2)) {
if (!exists_at(s1, 0) && !exists_at(s2, 1)) {
if (!s1->is_Mem() || are_adjacent_refs(s1, s2)) {
int s1_align = alignment(s1);
@ -718,6 +718,28 @@ bool SuperWord::independent(Node* s1, Node* s2) {
return independent_path(shallow, deep);
}
//------------------------------reduction---------------------------
// Is there a data path between s1 and s2 and the nodes reductions?
bool SuperWord::reduction(Node* s1, Node* s2) {
bool retValue = false;
int d1 = depth(s1);
int d2 = depth(s2);
if (d1 + 1 == d2) {
if (s1->is_reduction() && s2->is_reduction()) {
// This is an ordered set, so s1 should define s2
for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
Node* t1 = s1->fast_out(i);
if (t1 == s2) {
// both nodes are reductions and connected
retValue = true;
}
}
}
}
return retValue;
}
//------------------------------independent_path------------------------------
// Helper for independent
bool SuperWord::independent_path(Node* shallow, Node* deep, uint dp) {
@ -761,6 +783,7 @@ int SuperWord::data_size(Node* s) {
void SuperWord::extend_packlist() {
bool changed;
do {
packset_sort(_packset.length());
changed = false;
for (int i = 0; i < _packset.length(); i++) {
Node_List* p = _packset.at(i);
@ -769,6 +792,13 @@ void SuperWord::extend_packlist() {
}
} while (changed);
if (_race_possible) {
for (int i = 0; i < _packset.length(); i++) {
Node_List* p = _packset.at(i);
order_def_uses(p);
}
}
#ifndef PRODUCT
if (TraceSuperWord) {
tty->print_cr("\nAfter extend_packlist");
@ -825,10 +855,12 @@ bool SuperWord::follow_def_uses(Node_List* p) {
int align = alignment(s1);
int savings = -1;
int num_s1_uses = 0;
Node* u1 = NULL;
Node* u2 = NULL;
for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
Node* t1 = s1->fast_out(i);
num_s1_uses++;
if (!in_bb(t1)) continue;
for (DUIterator_Fast jmax, j = s2->fast_outs(jmax); j < jmax; j++) {
Node* t2 = s2->fast_out(j);
@ -845,6 +877,9 @@ bool SuperWord::follow_def_uses(Node_List* p) {
}
}
}
if (num_s1_uses > 1) {
_race_possible = true;
}
if (savings >= 0) {
Node_List* pair = new Node_List();
pair->push(u1);
@ -856,9 +891,64 @@ bool SuperWord::follow_def_uses(Node_List* p) {
return changed;
}
//------------------------------order_def_uses---------------------------
// For extended packsets, ordinally arrange uses packset by major component
void SuperWord::order_def_uses(Node_List* p) {
Node* s1 = p->at(0);
if (s1->is_Store()) return;
// reductions are always managed beforehand
if (s1->is_reduction()) return;
for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
Node* t1 = s1->fast_out(i);
// Only allow operand swap on commuting operations
if (!t1->is_Add() && !t1->is_Mul()) {
break;
}
// Now find t1's packset
Node_List* p2 = NULL;
for (int j = 0; j < _packset.length(); j++) {
p2 = _packset.at(j);
Node* first = p2->at(0);
if (t1 == first) {
break;
}
p2 = NULL;
}
// Arrange all sub components by the major component
if (p2 != NULL) {
for (uint j = 1; j < p->size(); j++) {
Node* d1 = p->at(j);
Node* u1 = p2->at(j);
opnd_positions_match(s1, t1, d1, u1);
}
}
}
}
//---------------------------opnd_positions_match-------------------------
// Is the use of d1 in u1 at the same operand position as d2 in u2?
bool SuperWord::opnd_positions_match(Node* d1, Node* u1, Node* d2, Node* u2) {
// check reductions to see if they are marshalled to represent the reduction
// operator in a specified opnd
if (u1->is_reduction() && u2->is_reduction()) {
// ensure reductions have phis and reduction definitions feeding the 1st operand
Node* first = u1->in(2);
if (first->is_Phi() || first->is_reduction()) {
u1->swap_edges(1, 2);
}
// ensure reductions have phis and reduction definitions feeding the 1st operand
first = u2->in(2);
if (first->is_Phi() || first->is_reduction()) {
u2->swap_edges(1, 2);
}
return true;
}
uint ct = u1->req();
if (ct != u2->req()) return false;
uint i1 = 0;
@ -940,7 +1030,8 @@ void SuperWord::combine_packs() {
for (int i = 0; i < _packset.length(); i++) {
Node_List* p1 = _packset.at(i);
if (p1 == NULL) continue;
for (int j = 0; j < _packset.length(); j++) {
// Because of sorting we can start at i + 1
for (int j = i + 1; j < _packset.length(); j++) {
Node_List* p2 = _packset.at(j);
if (p2 == NULL) continue;
if (i == j) continue;
@ -1067,8 +1158,19 @@ void SuperWord::filter_packs() {
//------------------------------implemented---------------------------
// Can code be generated for pack p?
bool SuperWord::implemented(Node_List* p) {
bool retValue = false;
Node* p0 = p->at(0);
return VectorNode::implemented(p0->Opcode(), p->size(), velt_basic_type(p0));
if (p0 != NULL) {
int opc = p0->Opcode();
uint size = p->size();
if (p0->is_reduction()) {
const Type *arith_type = p0->bottom_type();
retValue = ReductionNode::implemented(opc, size, arith_type->basic_type());
} else {
retValue = VectorNode::implemented(opc, size, velt_basic_type(p0));
}
}
return retValue;
}
//------------------------------same_inputs--------------------------
@ -1102,6 +1204,18 @@ bool SuperWord::profitable(Node_List* p) {
if (!is_vector_use(p0, i))
return false;
}
// Check if reductions are connected
if (p0->is_reduction()) {
Node* second_in = p0->in(2);
Node_List* second_pk = my_pack(second_in);
if (second_pk == NULL) {
// Remove reduction flag if no parent pack, it is not profitable
p0->remove_flag(Node::Flag_is_reduction);
return false;
} else if (second_pk->size() != p->size()) {
return false;
}
}
if (VectorNode::is_shift(p0)) {
// For now, return false if shift count is vector or not scalar promotion
// case (different shift counts) because it is not supported yet.
@ -1123,6 +1237,9 @@ bool SuperWord::profitable(Node_List* p) {
for (uint k = 0; k < use->req(); k++) {
Node* n = use->in(k);
if (def == n) {
// reductions can be loop carried dependences
if (def->is_reduction() && use->is_Phi())
continue;
if (!is_vector_use(use, k)) {
return false;
}
@ -1407,16 +1524,33 @@ void SuperWord::output() {
vlen_in_bytes = vn->as_StoreVector()->memory_size();
} else if (n->req() == 3) {
// Promote operands to vector
Node* in1 = vector_opd(p, 1);
Node* in1 = NULL;
bool node_isa_reduction = n->is_reduction();
if (node_isa_reduction) {
// the input to the first reduction operation is retained
in1 = low_adr->in(1);
} else {
in1 = vector_opd(p, 1);
}
Node* in2 = vector_opd(p, 2);
if (VectorNode::is_invariant_vector(in1) && (n->is_Add() || n->is_Mul())) {
if (VectorNode::is_invariant_vector(in1) && (node_isa_reduction == false) && (n->is_Add() || n->is_Mul())) {
// Move invariant vector input into second position to avoid register spilling.
Node* tmp = in1;
in1 = in2;
in2 = tmp;
}
vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
if (node_isa_reduction) {
const Type *arith_type = n->bottom_type();
vn = ReductionNode::make(opc, NULL, in1, in2, arith_type->basic_type());
if (in2->is_Load()) {
vlen_in_bytes = in2->as_LoadVector()->memory_size();
} else {
vlen_in_bytes = in2->as_Vector()->length_in_bytes();
}
} else {
vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
}
} else {
ShouldNotReachHere();
}
@ -1556,6 +1690,8 @@ void SuperWord::insert_extracts(Node_List* p) {
_n_idx_list.pop();
Node* def = use->in(idx);
if (def->is_reduction()) continue;
// Insert extract operation
_igvn.hash_delete(def);
int def_pos = alignment(def) / data_size(def);
@ -1576,6 +1712,7 @@ void SuperWord::insert_extracts(Node_List* p) {
bool SuperWord::is_vector_use(Node* use, int u_idx) {
Node_List* u_pk = my_pack(use);
if (u_pk == NULL) return false;
if (use->is_reduction()) return true;
Node* def = use->in(u_idx);
Node_List* d_pk = my_pack(def);
if (d_pk == NULL) {
@ -1613,7 +1750,7 @@ bool SuperWord::construct_bb() {
// by the visited and post_visited sets,
// and count number of nodes in block.
int bb_ct = 0;
for (uint i = 0; i < lpt()->_body.size(); i++ ) {
for (uint i = 0; i < lpt()->_body.size(); i++) {
Node *n = lpt()->_body.at(i);
set_bb_idx(n, i); // Create a temporary map
if (in_bb(n)) {
@ -1674,6 +1811,7 @@ bool SuperWord::construct_bb() {
// Do a depth first walk over out edges
int rpo_idx = bb_ct - 1;
int size;
int reduction_uses = 0;
while ((size = _stk.length()) > 0) {
Node* n = _stk.top(); // Leave node on stack
if (!visited_test_set(n)) {
@ -1685,6 +1823,14 @@ bool SuperWord::construct_bb() {
if (in_bb(use) && !visited_test(use) &&
// Don't go around backedge
(!use->is_Phi() || n == entry)) {
if (use->is_reduction()) {
// First see if we can map the reduction on the given system we are on, then
// make a data entry operation for each reduction we see.
BasicType bt = use->bottom_type()->basic_type();
if (ReductionNode::implemented(use->Opcode(), Matcher::min_vector_size(bt), bt)) {
reduction_uses++;
}
}
_stk.push(use);
}
}
@ -1708,7 +1854,8 @@ bool SuperWord::construct_bb() {
set_bb_idx(n, j);
}
initialize_bb(); // Ensure extra info is allocated.
// Ensure extra info is allocated.
initialize_bb();
#ifndef PRODUCT
if (TraceSuperWord) {
@ -1726,7 +1873,7 @@ bool SuperWord::construct_bb() {
}
#endif
assert(rpo_idx == -1 && bb_ct == _block.length(), "all block members found");
return (_mem_slice_head.length() > 0) || (_data_entry.length() > 0);
return (_mem_slice_head.length() > 0) || (reduction_uses > 0) || (_data_entry.length() > 0);
}
//------------------------------initialize_bb---------------------------
@ -1959,6 +2106,27 @@ void SuperWord::remove_pack_at(int pos) {
_packset.remove_at(pos);
}
void SuperWord::packset_sort(int n) {
// simple bubble sort so that we capitalize with O(n) when its already sorted
while (n != 0) {
bool swapped = false;
for (int i = 1; i < n; i++) {
Node_List* q_low = _packset.at(i-1);
Node_List* q_i = _packset.at(i);
// only swap when we find something to swap
if (alignment(q_low->at(0)) > alignment(q_i->at(0))) {
Node_List* t = q_i;
*(_packset.adr_at(i)) = q_low;
*(_packset.adr_at(i-1)) = q_i;
swapped = true;
}
}
if (swapped == false) break;
n--;
}
}
//------------------------------executed_first---------------------------
// Return the node executed first in pack p. Uses the RPO block list
// to determine order.