4809552: Optimize Arrays.fill(...)

Reviewed-by: kvn
This commit is contained in:
Tom Rodriguez 2010-08-27 17:33:49 -07:00
parent 519c627fe5
commit 08d9e03b81
17 changed files with 940 additions and 11 deletions

View file

@ -2049,11 +2049,18 @@ bool IdealLoopTree::is_range_check_if(IfNode *iff, PhaseIdealLoop *phase, Invari
if (cmp->Opcode() != Op_CmpU ) {
return false;
}
if (cmp->in(2)->Opcode() != Op_LoadRange) {
return false;
Node* range = cmp->in(2);
if (range->Opcode() != Op_LoadRange) {
const TypeInt* tint = phase->_igvn.type(range)->isa_int();
if (!OptimizeFill || tint == NULL || tint->empty() || tint->_lo < 0) {
// Allow predication on positive values that aren't LoadRanges.
// This allows optimization of loops where the length of the
// array is a known value and doesn't need to be loaded back
// from the array.
return false;
}
}
LoadRangeNode* lr = (LoadRangeNode*)cmp->in(2);
if (!invar.is_invariant(lr)) { // loadRange must be invariant
if (!invar.is_invariant(range)) {
return false;
}
Node *iv = _head->as_CountedLoop()->phi();
@ -2248,9 +2255,9 @@ bool PhaseIdealLoop::loop_predication_impl(IdealLoopTree *loop) {
const Node* cmp = bol->in(1)->as_Cmp();
Node* idx = cmp->in(1);
assert(!invar.is_invariant(idx), "index is variant");
assert(cmp->in(2)->Opcode() == Op_LoadRange, "must be");
Node* ld_rng = cmp->in(2); // LoadRangeNode
assert(invar.is_invariant(ld_rng), "load range must be invariant");
assert(cmp->in(2)->Opcode() == Op_LoadRange || OptimizeFill, "must be");
Node* rng = cmp->in(2);
assert(invar.is_invariant(rng), "range must be invariant");
int scale = 1;
Node* offset = zero;
bool ok = is_scaled_iv_plus_offset(idx, cl->phi(), &scale, &offset);
@ -2271,21 +2278,21 @@ bool PhaseIdealLoop::loop_predication_impl(IdealLoopTree *loop) {
// Perform cloning to keep Invariance state correct since the
// late schedule will place invariant things in the loop.
ld_rng = invar.clone(ld_rng, ctrl);
rng = invar.clone(rng, ctrl);
if (offset && offset != zero) {
assert(invar.is_invariant(offset), "offset must be loop invariant");
offset = invar.clone(offset, ctrl);
}
// Test the lower bound
Node* lower_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, ld_rng, false);
Node* lower_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, rng, false);
IfNode* lower_bound_iff = lower_bound_proj->in(0)->as_If();
_igvn.hash_delete(lower_bound_iff);
lower_bound_iff->set_req(1, lower_bound_bol);
if (TraceLoopPredicate) tty->print_cr("lower bound check if: %d", lower_bound_iff->_idx);
// Test the upper bound
Node* upper_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, ld_rng, true);
Node* upper_bound_bol = rc_predicate(ctrl, scale, offset, init, limit, stride, rng, true);
IfNode* upper_bound_iff = upper_bound_proj->in(0)->as_If();
_igvn.hash_delete(upper_bound_iff);
upper_bound_iff->set_req(1, upper_bound_bol);
@ -2366,3 +2373,348 @@ bool IdealLoopTree::loop_predication( PhaseIdealLoop *phase) {
return hoisted;
}
// Process all the loops in the loop tree and replace any fill
// patterns with an intrinsic version.
//
// Walks every loop in the current loop tree and tries to turn each one
// into a call to an array-fill stub (see intrinsify_fill()).  Returns
// true if at least one loop was replaced, so the caller knows the
// ideal graph changed.
bool PhaseIdealLoop::do_intrinsify_fill() {
  bool changed = false;
  for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) {
    IdealLoopTree* lpt = iter.current();
    // |= (not ||=) so every loop is visited even after a success.
    changed |= intrinsify_fill(lpt);
  }
  return changed;
}
// Examine an inner loop looking for a single store of an invariant
// value in a unit stride loop.  On success the out-parameters describe
// the fill pattern:
//   store       - the single store node in the loop body
//   store_value - the loop-invariant value being stored
//   shift       - the LShiftX scaling the loop phi by the element size
//                 (NULL for byte-sized elements, which need no scaling)
//   con         - the constant offset term of the address expression
//                 (NULL if the address has no constant component)
// Returns true only when the entire loop body is the fill pattern and
// nothing in it is used outside the loop, so the loop can be replaced
// by a call to a fill stub.
// NOTE(review): 'store' is only tested and assigned here, never
// cleared — the caller is expected to pass it in initialized to NULL.
bool PhaseIdealLoop::match_fill_loop(IdealLoopTree* lpt, Node*& store, Node*& store_value,
                                     Node*& shift, Node*& con) {
  const char* msg = NULL;      // non-NULL => reason the match failed
  Node* msg_node = NULL;       // optional node to dump with the message

  store_value = NULL;
  con = NULL;
  shift = NULL;

  // Process the loop looking for stores.  If there are multiple
  // stores or extra control flow, give up at this point.
  CountedLoopNode* head = lpt->_head->as_CountedLoop();
  for (uint i = 0; msg == NULL && i < lpt->_body.size(); i++) {
    Node* n = lpt->_body.at(i);
    if (n->outcnt() == 0) continue; // Ignore dead
    if (n->is_Store()) {
      if (store != NULL) {
        msg = "multiple stores";
        break;
      }
      int opc = n->Opcode();
      // Oop stores need GC barriers/card marks which the fill stubs
      // don't perform, so reject them.
      if (opc == Op_StoreP || opc == Op_StoreN || opc == Op_StoreCM) {
        msg = "oop fills not handled";
        break;
      }
      Node* value = n->in(MemNode::ValueIn);
      if (!lpt->is_invariant(value)) {
        msg = "variant store value";
      }
      store = n;
      store_value = value;
    } else if (n->is_If() && n != head->loopexit()) {
      // Any If other than the counted-loop exit test is extra control
      // flow we can't express as a simple fill.
      msg = "extra control flow";
      msg_node = n;
    }
  }

  if (store == NULL) {
    // No store in loop
    return false;
  }

  if (msg == NULL && head->stride_con() != 1) {
    // could handle negative strides too
    if (head->stride_con() < 0) {
      msg = "negative stride";
    } else {
      msg = "non-unit stride";
    }
  }

  if (msg == NULL && !store->in(MemNode::Address)->is_AddP()) {
    msg = "can't handle store address";
    msg_node = store->in(MemNode::Address);
  }

  // Make sure there is an appropriate fill routine for this element
  // type on the current platform (aligned=false is the conservative
  // query; intrinsify_fill() re-selects with the real alignment).
  BasicType t = store->as_Mem()->memory_type();
  const char* fill_name;
  if (msg == NULL &&
      StubRoutines::select_fill_function(t, false, fill_name) == NULL) {
    msg = "unsupported store";
    msg_node = store;
  }

  if (msg != NULL) {
#ifndef PRODUCT
    if (TraceOptimizeFill) {
      tty->print_cr("not fill intrinsic candidate: %s", msg);
      if (msg_node != NULL) msg_node->dump();
    }
#endif
    return false;
  }

  // Make sure the address expression can be handled.  It should be
  // head->phi * elsize + con.  head->phi might have a ConvI2L.
  Node* elements[4];
  Node* conv = NULL;
  int count = store->in(MemNode::Address)->as_AddP()->unpack_offsets(elements, ARRAY_SIZE(elements));
  for (int e = 0; e < count; e++) {
    Node* n = elements[e];
    if (n->is_Con() && con == NULL) {
      con = n;
    } else if (n->Opcode() == Op_LShiftX && shift == NULL) {
      // Candidate for the element-size scaling of the loop phi.
      Node* value = n->in(1);
#ifdef _LP64
      // On 64-bit the int phi is widened before being shifted.
      if (value->Opcode() == Op_ConvI2L) {
        conv = value;
        value = value->in(1);
      }
#endif
      if (value != head->phi()) {
        msg = "unhandled shift in address";
      } else {
        shift = n;
        assert(type2aelembytes(store->as_Mem()->memory_type(), true) == 1 << shift->in(2)->get_int(), "scale should match");
      }
    } else if (n->Opcode() == Op_ConvI2L && conv == NULL) {
      // Unshifted widened phi (byte-sized elements on 64-bit).
      if (n->in(1) == head->phi()) {
        conv = n;
      } else {
        msg = "unhandled input to ConvI2L";
      }
    } else if (n == head->phi()) {
      // no shift, check below for allowed cases
    } else {
      msg = "unhandled node in address";
      msg_node = n;
    }
  }

  if (count == -1) {
    msg = "malformed address expression";
    msg_node = store;
  }

  // byte sized items won't have a shift
  if (msg == NULL && shift == NULL && t != T_BYTE && t != T_BOOLEAN) {
    msg = "can't find shift";
    msg_node = store;
  }

  if (msg != NULL) {
#ifndef PRODUCT
    if (TraceOptimizeFill) {
      tty->print_cr("not fill intrinsic: %s", msg);
      if (msg_node != NULL) msg_node->dump();
    }
#endif
    return false;
  }

  // Now make sure all the other nodes in the loop can be handled:
  // collect the nodes that are a legitimate part of the fill pattern
  // and reject the loop if anything else appears in the body.
  VectorSet ok(Thread::current()->resource_area());

  // store related values are ok
  ok.set(store->_idx);
  ok.set(store->in(MemNode::Memory)->_idx);

  // Loop structure is ok
  ok.set(head->_idx);
  ok.set(head->loopexit()->_idx);
  ok.set(head->phi()->_idx);
  ok.set(head->incr()->_idx);
  ok.set(head->loopexit()->cmp_node()->_idx);
  ok.set(head->loopexit()->in(1)->_idx);

  // Address elements are ok
  if (con) ok.set(con->_idx);
  if (shift) ok.set(shift->_idx);
  if (conv) ok.set(conv->_idx);

  for (uint i = 0; msg == NULL && i < lpt->_body.size(); i++) {
    Node* n = lpt->_body.at(i);
    if (n->outcnt() == 0) continue; // Ignore dead
    if (ok.test(n->_idx)) continue;
    // Backedge projection is ok
    if (n->is_IfTrue() && n->in(0) == head->loopexit()) continue;
    // Remaining AddP nodes belong to the store address computation.
    if (!n->is_AddP()) {
      msg = "unhandled node";
      msg_node = n;
      break;
    }
  }

  // Make sure no unexpected values are used outside the loop
  for (uint i = 0; msg == NULL && i < lpt->_body.size(); i++) {
    Node* n = lpt->_body.at(i);
    // These values can be replaced with other nodes if they are used
    // outside the loop.
    if (n == store || n == head->loopexit() || n == head->incr()) continue;
    for (SimpleDUIterator iter(n); iter.has_next(); iter.next()) {
      Node* use = iter.get();
      if (!lpt->_body.contains(use)) {
        msg = "node is used outside loop";
        // lpt->_body.dump();
        msg_node = n;
        break;
      }
    }
  }

#ifdef ASSERT
  if (TraceOptimizeFill) {
    if (msg != NULL) {
      tty->print_cr("no fill intrinsic: %s", msg);
      if (msg_node != NULL) msg_node->dump();
    } else {
      tty->print_cr("fill intrinsic for:");
    }
    store->dump();
    if (Verbose) {
      lpt->_body.dump();
    }
  }
#endif

  return msg == NULL;
}
// Replace a counted inner loop that stores a loop-invariant value
// over a contiguous array region (as recognized by match_fill_loop)
// with a call to an array-fill stub routine.  Returns true if the
// loop was replaced.
bool PhaseIdealLoop::intrinsify_fill(IdealLoopTree* lpt) {
  // Only for counted inner loops
  if (!lpt->is_counted() || !lpt->is_inner()) {
    return false;
  }

  // Must have constant stride
  CountedLoopNode* head = lpt->_head->as_CountedLoop();
  if (!head->stride_is_con() || !head->is_normal_loop()) {
    return false;
  }

  // Check that the body only contains a store of a loop invariant
  // value that is indexed by the loop phi.
  Node* store = NULL;
  Node* store_value = NULL;
  Node* shift = NULL;
  Node* offset = NULL;   // constant offset term ('con' from match_fill_loop)
  if (!match_fill_loop(lpt, store, store_value, shift, offset)) {
    return false;
  }

  // Now replace the whole loop body by a call to a fill routine that
  // covers the same region as the loop.
  Node* base = store->in(MemNode::Address)->as_AddP()->in(AddPNode::Base);

  // Build an expression for the beginning of the copy region:
  // from = base + (init_trip << shift) + offset
  Node* index = head->init_trip();
#ifdef _LP64
  index = new (C, 2) ConvI2LNode(index);
  _igvn.register_new_node_with_optimizer(index);
#endif
  if (shift != NULL) {
    // byte arrays don't require a shift but others do.
    index = new (C, 3) LShiftXNode(index, shift->in(2));
    _igvn.register_new_node_with_optimizer(index);
  }
  index = new (C, 4) AddPNode(base, base, index);
  _igvn.register_new_node_with_optimizer(index);
  Node* from = new (C, 4) AddPNode(base, index, offset);
  _igvn.register_new_node_with_optimizer(from);
  // Compute the number of elements to copy (unit stride, so this is
  // simply limit - init_trip).
  Node* len = new (C, 3) SubINode(head->limit(), head->init_trip());
  _igvn.register_new_node_with_optimizer(len);

  // Determine whether the start of the region is heap-word aligned so
  // the faster aligned stub variant can be selected.
  BasicType t = store->as_Mem()->memory_type();
  bool aligned = false;
  if (offset != NULL && head->init_trip()->is_Con()) {
    int element_size = type2aelembytes(t);
    aligned = (offset->find_intptr_t_type()->get_con() + head->init_trip()->get_int() * element_size) % HeapWordSize == 0;
  }

  // Build a call to the fill routine
  const char* fill_name;
  address fill = StubRoutines::select_fill_function(t, aligned, fill_name);
  // match_fill_loop already verified a stub exists for this type.
  assert(fill != NULL, "what?");

  // Convert float/double to int/long for fill routines (bit-preserving
  // moves; the stubs operate on integral values only).
  if (t == T_FLOAT) {
    store_value = new (C, 2) MoveF2INode(store_value);
    _igvn.register_new_node_with_optimizer(store_value);
  } else if (t == T_DOUBLE) {
    store_value = new (C, 2) MoveD2LNode(store_value);
    _igvn.register_new_node_with_optimizer(store_value);
  }

  Node* mem_phi = store->in(MemNode::Memory);
  Node* result_ctrl;
  Node* result_mem;
  const TypeFunc* call_type = OptoRuntime::array_fill_Type();
  int size = call_type->domain()->cnt();
  // CallLeafNoFP: the stub makes no Java-visible calls and uses no FP
  // arguments; memory effect is restricted to the array body slice.
  CallLeafNode *call = new (C, size) CallLeafNoFPNode(call_type, fill,
                                                      fill_name, TypeAryPtr::get_array_body_type(t));
  call->init_req(TypeFunc::Parms+0, from);
  call->init_req(TypeFunc::Parms+1, store_value);
  call->init_req(TypeFunc::Parms+2, len);
  call->init_req( TypeFunc::Control, head->init_control());
  call->init_req( TypeFunc::I_O , C->top() ) ; // does no i/o
  // Memory comes from before the loop, not the loop-internal phi.
  call->init_req( TypeFunc::Memory , mem_phi->in(LoopNode::EntryControl) );
  call->init_req( TypeFunc::ReturnAdr, C->start()->proj_out(TypeFunc::ReturnAdr) );
  call->init_req( TypeFunc::FramePtr, C->start()->proj_out(TypeFunc::FramePtr) );
  _igvn.register_new_node_with_optimizer(call);
  result_ctrl = new (C, 1) ProjNode(call,TypeFunc::Control);
  _igvn.register_new_node_with_optimizer(result_ctrl);
  result_mem = new (C, 1) ProjNode(call,TypeFunc::Memory);
  _igvn.register_new_node_with_optimizer(result_mem);

  // If this fill is tightly coupled to an allocation and overwrites
  // the whole body, allow it to take over the zeroing.
  AllocateNode* alloc = AllocateNode::Ideal_allocation(base, this);
  if (alloc != NULL && alloc->is_AllocateArray()) {
    Node* length = alloc->as_AllocateArray()->Ideal_length();
    // Pointer comparison with the cached intcon(0) node identifies a
    // fill of [0, length), i.e. the entire array.
    if (head->limit() == length &&
        head->init_trip() == _igvn.intcon(0)) {
      if (TraceOptimizeFill) {
        tty->print_cr("Eliminated zeroing in allocation");
      }
      alloc->maybe_set_complete(&_igvn);
    } else {
#ifdef ASSERT
      if (TraceOptimizeFill) {
        tty->print_cr("filling array but bounds don't match");
        alloc->dump();
        head->init_trip()->dump();
        head->limit()->dump();
        length->dump();
      }
#endif
    }
  }

  // Redirect the old control and memory edges that are outside the loop.
  Node* exit = head->loopexit()->proj_out(0);
  _igvn.replace_node(exit, result_ctrl);
  _igvn.replace_node(store, result_mem);
  // Any uses of the increment outside of the loop become the loop limit.
  _igvn.replace_node(head->incr(), head->limit());

  // Disconnect the head from the loop.
  for (uint i = 0; i < lpt->_body.size(); i++) {
    Node* n = lpt->_body.at(i);
    _igvn.replace_node(n, C->top());
  }

  return true;
}