8134802: LCM register pressure scheduling

Calculate register pressure in a block to help instructions scheduling.

Reviewed-by: kvn, dlong
This commit is contained in:
Michael Berg 2015-09-16 13:16:17 -07:00
parent a402bebf6e
commit d49d1ea740
21 changed files with 513 additions and 35 deletions

View file

@ -3024,6 +3024,10 @@ const bool Matcher::match_rule_supported(int opcode) {
return true; // Per default match rules are supported.
}
const int Matcher::float_pressure(int default_pressure_threshold) {
return default_pressure_threshold;
}
int Matcher::regnum_to_fpu_offset(int regnum)
{
Unimplemented();

View file

@ -72,6 +72,7 @@ define_pd_global(bool, OptoPeephole, true);
define_pd_global(bool, UseCISCSpill, true);
define_pd_global(bool, OptoScheduling, false);
define_pd_global(bool, OptoBundling, false);
define_pd_global(bool, OptoRegScheduling, false);
define_pd_global(intx, ReservedCodeCacheSize, 48*M);
define_pd_global(intx, NonProfiledCodeHeapSize, 21*M);

View file

@ -60,6 +60,7 @@ define_pd_global(intx, LoopUnrollLimit, 60);
define_pd_global(bool, OptoPeephole, false);
define_pd_global(bool, UseCISCSpill, false);
define_pd_global(bool, OptoBundling, false);
define_pd_global(bool, OptoRegScheduling, false);
// GL:
// Detected a problem with unscaled compressed oops and
// narrow_oop_use_complex_address() == false.

View file

@ -2064,6 +2064,10 @@ const bool Matcher::match_rule_supported(int opcode) {
return true; // Per default match rules are supported.
}
const int Matcher::float_pressure(int default_pressure_threshold) {
return default_pressure_threshold;
}
int Matcher::regnum_to_fpu_offset(int regnum) {
// No user for this method?
Unimplemented();

View file

@ -64,6 +64,7 @@ define_pd_global(bool, OptoPeephole, false);
define_pd_global(bool, UseCISCSpill, false);
define_pd_global(bool, OptoBundling, false);
define_pd_global(bool, OptoScheduling, true);
define_pd_global(bool, OptoRegScheduling, false);
#ifdef _LP64
// We need to make sure that all generated code is within

View file

@ -1860,6 +1860,10 @@ const bool Matcher::match_rule_supported(int opcode) {
return true; // Per default match rules are supported.
}
const int Matcher::float_pressure(int default_pressure_threshold) {
return default_pressure_threshold;
}
int Matcher::regnum_to_fpu_offset(int regnum) {
return regnum - 32; // The FP registers are in the second chunk
}

View file

@ -48,11 +48,11 @@ define_pd_global(intx, CompileThreshold, 10000);
define_pd_global(intx, OnStackReplacePercentage, 140);
define_pd_global(intx, ConditionalMoveLimit, 3);
define_pd_global(intx, FLOATPRESSURE, 6);
define_pd_global(intx, FreqInlineSize, 325);
define_pd_global(intx, MinJumpTableSize, 10);
#ifdef AMD64
define_pd_global(intx, INTPRESSURE, 13);
define_pd_global(intx, FLOATPRESSURE, 14);
define_pd_global(intx, InteriorEntryAlignment, 16);
define_pd_global(size_t, NewSizeThreadIncrease, ScaleForWordSize(4*K));
define_pd_global(intx, LoopUnrollLimit, 60);
@ -64,6 +64,7 @@ define_pd_global(intx, CodeCacheExpansionSize, 64*K);
define_pd_global(uint64_t, MaxRAM, 128ULL*G);
#else
define_pd_global(intx, INTPRESSURE, 6);
define_pd_global(intx, FLOATPRESSURE, 6);
define_pd_global(intx, InteriorEntryAlignment, 4);
define_pd_global(size_t, NewSizeThreadIncrease, 4*K);
define_pd_global(intx, LoopUnrollLimit, 50); // Design center runs on 1.3.1
@ -82,6 +83,7 @@ define_pd_global(bool, OptoPeephole, true);
define_pd_global(bool, UseCISCSpill, true);
define_pd_global(bool, OptoScheduling, false);
define_pd_global(bool, OptoBundling, false);
define_pd_global(bool, OptoRegScheduling, true);
define_pd_global(intx, ReservedCodeCacheSize, 48*M);
define_pd_global(intx, NonProfiledCodeHeapSize, 21*M);

View file

@ -1712,6 +1712,18 @@ const bool Matcher::match_rule_supported(int opcode) {
return ret_value; // Per default match rules are supported.
}
const int Matcher::float_pressure(int default_pressure_threshold) {
int float_pressure_threshold = default_pressure_threshold;
#ifdef _LP64
if (UseAVX > 2) {
// Increase pressure threshold on machines with AVX3 which have
// 2x more XMM registers.
float_pressure_threshold = default_pressure_threshold * 2;
}
#endif
return float_pressure_threshold;
}
// Max vector size in bytes. 0 if not supported.
const int Matcher::vector_width_in_bytes(BasicType bt) {
assert(is_java_primitive(bt), "only primitive type vectors");

View file

@ -358,6 +358,8 @@ void Block::dump(const PhaseCFG* cfg) const {
PhaseCFG::PhaseCFG(Arena* arena, RootNode* root, Matcher& matcher)
: Phase(CFG)
, _block_arena(arena)
, _regalloc(NULL)
, _scheduling_for_pressure(false)
, _root(root)
, _matcher(matcher)
, _node_to_block_mapping(arena)

View file

@ -37,6 +37,7 @@ class MachCallNode;
class Matcher;
class RootNode;
class VectorSet;
class PhaseChaitin;
struct Tarjan;
//------------------------------Block_Array------------------------------------
@ -383,6 +384,12 @@ class PhaseCFG : public Phase {
// Arena for the blocks to be stored in
Arena* _block_arena;
// Info used for scheduling
PhaseChaitin* _regalloc;
// Register pressure heuristic used?
bool _scheduling_for_pressure;
// The matcher for this compilation
Matcher& _matcher;
@ -433,12 +440,14 @@ class PhaseCFG : public Phase {
// to late. Helper for schedule_late.
Block* hoist_to_cheaper_block(Block* LCA, Block* early, Node* self);
bool schedule_local(Block* block, GrowableArray<int>& ready_cnt, VectorSet& next_call);
bool schedule_local(Block* block, GrowableArray<int>& ready_cnt, VectorSet& next_call, intptr_t* recacl_pressure_nodes);
void set_next_call(Block* block, Node* n, VectorSet& next_call);
void needed_for_next_call(Block* block, Node* this_call, VectorSet& next_call);
// Perform basic-block local scheduling
Node* select(Block* block, Node_List& worklist, GrowableArray<int>& ready_cnt, VectorSet& next_call, uint sched_slot);
Node* select(Block* block, Node_List& worklist, GrowableArray<int>& ready_cnt, VectorSet& next_call, uint sched_slot,
intptr_t* recacl_pressure_nodes);
void adjust_register_pressure(Node* n, Block* block, intptr_t *recalc_pressure_nodes, bool finalize_mode);
// Schedule a call next in the block
uint sched_call(Block* block, uint node_cnt, Node_List& worklist, GrowableArray<int>& ready_cnt, MachCallNode* mcall, VectorSet& next_call);

View file

@ -290,6 +290,9 @@
product_pd(bool, OptoScheduling, \
"Instruction Scheduling after register allocation") \
\
product_pd(bool, OptoRegScheduling, \
"Instruction Scheduling before register allocation for pressure") \
\
product(bool, PartialPeelLoop, true, \
"Partial peel (rotate) loops") \
\

View file

@ -191,7 +191,7 @@ uint LiveRangeMap::find_const(uint lrg) const {
return next;
}
PhaseChaitin::PhaseChaitin(uint unique, PhaseCFG &cfg, Matcher &matcher)
PhaseChaitin::PhaseChaitin(uint unique, PhaseCFG &cfg, Matcher &matcher, bool scheduling_info_generated)
: PhaseRegAlloc(unique, cfg, matcher,
#ifndef PRODUCT
print_chaitin_statistics
@ -205,6 +205,11 @@ PhaseChaitin::PhaseChaitin(uint unique, PhaseCFG &cfg, Matcher &matcher)
, _spilled_twice(Thread::current()->resource_area())
, _lo_degree(0), _lo_stk_degree(0), _hi_degree(0), _simplified(0)
, _oldphi(unique)
, _scheduling_info_generated(scheduling_info_generated)
, _sched_int_pressure(0, INTPRESSURE)
, _sched_float_pressure(0, FLOATPRESSURE)
, _scratch_int_pressure(0, INTPRESSURE)
, _scratch_float_pressure(0, FLOATPRESSURE)
#ifndef PRODUCT
, _trace_spilling(TraceSpilling || C->method_has_option("TraceSpilling"))
#endif
@ -350,7 +355,7 @@ void PhaseChaitin::Register_Allocate() {
// all copy-related live ranges low and then using the max copy-related
// live range as a cut-off for LIVE and the IFG. In other words, I can
// build a subset of LIVE and IFG just for copies.
PhaseLive live(_cfg, _lrg_map.names(), &live_arena);
PhaseLive live(_cfg, _lrg_map.names(), &live_arena, false);
// Need IFG for coalescing and coloring
PhaseIFG ifg(&live_arena);
@ -690,6 +695,29 @@ void PhaseChaitin::de_ssa() {
_lrg_map.reset_uf_map(lr_counter);
}
void PhaseChaitin::mark_ssa() {
// Use ssa names to populate the live range maps or if no mask
// is available, use the 0 entry.
uint max_idx = 0;
for ( uint i = 0; i < _cfg.number_of_blocks(); i++ ) {
Block* block = _cfg.get_block(i);
uint cnt = block->number_of_nodes();
// Handle all the normal Nodes in the block
for ( uint j = 0; j < cnt; j++ ) {
Node *n = block->get_node(j);
// Pre-color to the zero live range, or pick virtual register
const RegMask &rm = n->out_RegMask();
_lrg_map.map(n->_idx, rm.is_NotEmpty() ? n->_idx : 0);
max_idx = (n->_idx > max_idx) ? n->_idx : max_idx;
}
}
_lrg_map.set_max_lrg_id(max_idx+1);
// Reset the Union-Find mapping to be identity
_lrg_map.reset_uf_map(max_idx+1);
}
// Gather LiveRanGe information, including register masks. Modification of
// cisc spillable in_RegMasks should not be done before AggressiveCoalesce.
@ -707,7 +735,9 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) {
for (uint j = 1; j < block->number_of_nodes(); j++) {
Node* n = block->get_node(j);
uint input_edge_start =1; // Skip control most nodes
bool is_machine_node = false;
if (n->is_Mach()) {
is_machine_node = true;
input_edge_start = n->as_Mach()->oper_input_base();
}
uint idx = n->is_Copy();
@ -929,6 +959,7 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) {
// Convert operand number to edge index number
inp = n->as_Mach()->operand_index(inp);
}
// Prepare register mask for each input
for( uint k = input_edge_start; k < cnt; k++ ) {
uint vreg = _lrg_map.live_range_id(n->in(k));
@ -948,6 +979,12 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) {
n->as_Mach()->use_cisc_RegMask();
}
if (is_machine_node && _scheduling_info_generated) {
MachNode* cur_node = n->as_Mach();
// this is cleaned up by register allocation
if (k >= cur_node->num_opnds()) continue;
}
LRG &lrg = lrgs(vreg);
// // Testing for floating point code shape
// Node *test = n->in(k);
@ -989,7 +1026,7 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) {
// double can interfere with TWO aligned pairs, or effectively
// FOUR registers!
#ifdef ASSERT
if (is_vect) {
if (is_vect && !_scheduling_info_generated) {
if (lrg.num_regs() != 0) {
assert(lrgmask.is_aligned_sets(lrg.num_regs()), "vector should be aligned");
assert(!lrg._fat_proj, "sanity");

View file

@ -399,7 +399,6 @@ class PhaseChaitin : public PhaseRegAlloc {
int _trip_cnt;
int _alternate;
LRG &lrgs(uint idx) const { return _ifg->lrgs(idx); }
PhaseLive *_live; // Liveness, used in the interference graph
PhaseIFG *_ifg; // Interference graph (for original chunk)
Node_List **_lrg_nodes; // Array of node; lists for lrgs which spill
@ -464,16 +463,28 @@ class PhaseChaitin : public PhaseRegAlloc {
#endif
public:
PhaseChaitin( uint unique, PhaseCFG &cfg, Matcher &matcher );
PhaseChaitin(uint unique, PhaseCFG &cfg, Matcher &matcher, bool track_liveout_pressure);
~PhaseChaitin() {}
LiveRangeMap _lrg_map;
LRG &lrgs(uint idx) const { return _ifg->lrgs(idx); }
// Do all the real work of allocate
void Register_Allocate();
float high_frequency_lrg() const { return _high_frequency_lrg; }
// Used when scheduling info generated, not in general register allocation
bool _scheduling_info_generated;
void set_ifg(PhaseIFG &ifg) { _ifg = &ifg; }
void set_live(PhaseLive &live) { _live = &live; }
PhaseLive* get_live() { return _live; }
// Populate the live range maps with ssa info for scheduling
void mark_ssa();
#ifndef PRODUCT
bool trace_spilling() const { return _trace_spilling; }
#endif
@ -516,7 +527,11 @@ private:
uint _final_pressure;
// number of live ranges that constitute high register pressure
const uint _high_pressure_limit;
uint _high_pressure_limit;
// initial pressure observed
uint _start_pressure;
public:
// lower the register pressure and look for a low to high pressure
@ -537,6 +552,14 @@ private:
}
}
void init(int limit) {
_current_pressure = 0;
_high_pressure_index = 0;
_final_pressure = 0;
_high_pressure_limit = limit;
_start_pressure = 0;
}
uint high_pressure_index() const {
return _high_pressure_index;
}
@ -545,6 +568,10 @@ private:
return _final_pressure;
}
uint start_pressure() const {
return _start_pressure;
}
uint current_pressure() const {
return _current_pressure;
}
@ -561,6 +588,15 @@ private:
_high_pressure_index = 0;
}
void set_start_pressure(int value) {
_start_pressure = value;
_final_pressure = value;
}
void set_current_pressure(int value) {
_current_pressure = value;
}
void check_pressure_at_fatproj(uint fatproj_location, RegMask& fatproj_mask) {
// this pressure is only valid at this instruction, i.e. we don't need to lower
// the register pressure since the fat proj was never live before (going backwards)
@ -579,12 +615,11 @@ private:
Pressure(uint high_pressure_index, uint high_pressure_limit)
: _current_pressure(0)
, _high_pressure_index(high_pressure_index)
, _final_pressure(0)
, _high_pressure_limit(high_pressure_limit)
, _final_pressure(0) {}
, _start_pressure(0) {}
};
void lower_pressure(Block* b, uint location, LRG& lrg, IndexSet* liveout, Pressure& int_pressure, Pressure& float_pressure);
void raise_pressure(Block* b, LRG& lrg, Pressure& int_pressure, Pressure& float_pressure);
void check_for_high_pressure_transition_at_fatproj(uint& block_reg_pressure, uint location, LRG& lrg, Pressure& pressure, const int op_regtype);
void add_input_to_liveout(Block* b, Node* n, IndexSet* liveout, double cost, Pressure& int_pressure, Pressure& float_pressure);
void compute_initial_block_pressure(Block* b, IndexSet* liveout, Pressure& int_pressure, Pressure& float_pressure, double cost);
@ -600,10 +635,25 @@ private:
// acceptable register sets do not overlap, then they do not interfere.
uint build_ifg_physical( ResourceArea *a );
public:
// Gather LiveRanGe information, including register masks and base pointer/
// derived pointer relationships.
void gather_lrg_masks( bool mod_cisc_masks );
// user visible pressure variables for scheduling
Pressure _sched_int_pressure;
Pressure _sched_float_pressure;
Pressure _scratch_int_pressure;
Pressure _scratch_float_pressure;
// Pressure functions for user context
void lower_pressure(Block* b, uint location, LRG& lrg, IndexSet* liveout, Pressure& int_pressure, Pressure& float_pressure);
void raise_pressure(Block* b, LRG& lrg, Pressure& int_pressure, Pressure& float_pressure);
void compute_entry_block_pressure(Block* b);
void compute_exit_block_pressure(Block* b);
void print_pressure_info(Pressure& pressure, const char *str);
private:
// Force the bases of derived pointers to be alive at GC points.
bool stretch_base_pointer_live_ranges( ResourceArea *a );
// Helper to stretch above; recursively discover the base Node for

View file

@ -2336,7 +2336,7 @@ void Compile::Code_Gen() {
debug_only( cfg.verify(); )
}
PhaseChaitin regalloc(unique(), cfg, matcher);
PhaseChaitin regalloc(unique(), cfg, matcher, false);
_regalloc = &regalloc;
{
TracePhase tp("regalloc", &timers[_t_registerAllocation]);

View file

@ -34,6 +34,7 @@
#include "opto/phaseX.hpp"
#include "opto/rootnode.hpp"
#include "opto/runtime.hpp"
#include "opto/chaitin.hpp"
#include "runtime/deoptimization.hpp"
// Portions of code courtesy of Clifford Click
@ -1363,6 +1364,44 @@ void PhaseCFG::global_code_motion() {
}
}
bool block_size_threshold_ok = false;
intptr_t *recalc_pressure_nodes = NULL;
if (OptoRegScheduling) {
for (uint i = 0; i < number_of_blocks(); i++) {
Block* block = get_block(i);
if (block->number_of_nodes() > 10) {
block_size_threshold_ok = true;
break;
}
}
}
// Enabling the scheduler for register pressure plus finding blocks of size to schedule for it
// is key to enabling this feature.
PhaseChaitin regalloc(C->unique(), *this, _matcher, true);
ResourceArea live_arena; // Arena for liveness
ResourceMark rm_live(&live_arena);
PhaseLive live(*this, regalloc._lrg_map.names(), &live_arena, true);
PhaseIFG ifg(&live_arena);
if (OptoRegScheduling && block_size_threshold_ok) {
regalloc.mark_ssa();
Compile::TracePhase tp("computeLive", &timers[_t_computeLive]);
rm_live.reset_to_mark(); // Reclaim working storage
IndexSet::reset_memory(C, &live_arena);
uint node_size = regalloc._lrg_map.max_lrg_id();
ifg.init(node_size); // Empty IFG
regalloc.set_ifg(ifg);
regalloc.set_live(live);
regalloc.gather_lrg_masks(false); // Collect LRG masks
live.compute(node_size); // Compute liveness
recalc_pressure_nodes = NEW_RESOURCE_ARRAY(intptr_t, node_size);
for (uint i = 0; i < node_size; i++) {
recalc_pressure_nodes[i] = 0;
}
}
_regalloc = &regalloc;
#ifndef PRODUCT
if (trace_opto_pipelining()) {
tty->print("\n---- Start Local Scheduling ----\n");
@ -1375,13 +1414,15 @@ void PhaseCFG::global_code_motion() {
visited.Clear();
for (uint i = 0; i < number_of_blocks(); i++) {
Block* block = get_block(i);
if (!schedule_local(block, ready_cnt, visited)) {
if (!schedule_local(block, ready_cnt, visited, recalc_pressure_nodes)) {
if (!C->failure_reason_is(C2Compiler::retry_no_subsuming_loads())) {
C->record_method_not_compilable("local schedule failed");
}
_regalloc = NULL;
return;
}
}
_regalloc = NULL;
// If we inserted any instructions between a Call and his CatchNode,
// clone the instructions on all paths below the Catch.

View file

@ -439,9 +439,11 @@ void PhaseChaitin::lower_pressure(Block* b, uint location, LRG& lrg, IndexSet* l
}
}
}
if (_scheduling_info_generated == false) {
assert(int_pressure.current_pressure() == count_int_pressure(liveout), "the int pressure is incorrect");
assert(float_pressure.current_pressure() == count_float_pressure(liveout), "the float pressure is incorrect");
}
}
/* Go to the first non-phi index in a block */
static uint first_nonphi_index(Block* b) {
@ -517,6 +519,58 @@ void PhaseChaitin::compute_initial_block_pressure(Block* b, IndexSet* liveout, P
assert(float_pressure.current_pressure() == count_float_pressure(liveout), "the float pressure is incorrect");
}
/*
* Computes the entry register pressure of a block, looking at all live
* ranges in the livein. The register pressure is computed for both float
* and int/pointer registers.
*/
void PhaseChaitin::compute_entry_block_pressure(Block* b) {
IndexSet* livein = _live->livein(b);
IndexSetIterator elements(livein);
uint lid = elements.next();
while (lid != 0) {
LRG& lrg = lrgs(lid);
raise_pressure(b, lrg, _sched_int_pressure, _sched_float_pressure);
lid = elements.next();
}
// Now check phis for locally defined inputs
for (uint j = 0; j < b->number_of_nodes(); j++) {
Node* n = b->get_node(j);
if (n->is_Phi()) {
for (uint k = 1; k < n->req(); k++) {
Node* phi_in = n->in(k);
// Because we are talking about phis, raise register pressure once for each
// instance of a phi to account for a single value
if (_cfg.get_block_for_node(phi_in) == b) {
LRG& lrg = lrgs(phi_in->_idx);
raise_pressure(b, lrg, _sched_int_pressure, _sched_float_pressure);
break;
}
}
}
}
_sched_int_pressure.set_start_pressure(_sched_int_pressure.current_pressure());
_sched_float_pressure.set_start_pressure(_sched_float_pressure.current_pressure());
}
/*
* Computes the exit register pressure of a block, looking at all live
* ranges in the liveout. The register pressure is computed for both float
* and int/pointer registers.
*/
void PhaseChaitin::compute_exit_block_pressure(Block* b) {
IndexSet* livein = _live->live(b);
IndexSetIterator elements(livein);
_sched_int_pressure.set_current_pressure(0);
_sched_float_pressure.set_current_pressure(0);
uint lid = elements.next();
while (lid != 0) {
LRG& lrg = lrgs(lid);
raise_pressure(b, lrg, _sched_int_pressure, _sched_float_pressure);
lid = elements.next();
}
}
/*
* Remove dead node if it's not used.
* We only remove projection nodes if the node "defining" the projection is
@ -737,6 +791,16 @@ void PhaseChaitin::adjust_high_pressure_index(Block* b, uint& block_hrp_index, P
block_hrp_index = i;
}
void PhaseChaitin::print_pressure_info(Pressure& pressure, const char *str) {
if (str != NULL) {
tty->print_cr("# *** %s ***", str);
}
tty->print_cr("# start pressure is = %d", pressure.start_pressure());
tty->print_cr("# max pressure is = %d", pressure.final_pressure());
tty->print_cr("# end pressure is = %d", pressure.current_pressure());
tty->print_cr("#");
}
/* Build an interference graph:
* That is, if 2 live ranges are simultaneously alive but in their acceptable
* register sets do not overlap, then they do not interfere. The IFG is built

View file

@ -31,6 +31,7 @@
#include "opto/cfgnode.hpp"
#include "opto/machnode.hpp"
#include "opto/runtime.hpp"
#include "opto/chaitin.hpp"
#include "runtime/sharedRuntime.hpp"
// Optimization - Graph Style
@ -443,7 +444,13 @@ void PhaseCFG::implicit_null_check(Block* block, Node *proj, Node *val, int allo
// remaining cases (most), choose the instruction with the greatest latency
// (that is, the most number of pseudo-cycles required to the end of the
// routine). If there is a tie, choose the instruction with the most inputs.
Node* PhaseCFG::select(Block* block, Node_List &worklist, GrowableArray<int> &ready_cnt, VectorSet &next_call, uint sched_slot) {
Node* PhaseCFG::select(
Block* block,
Node_List &worklist,
GrowableArray<int> &ready_cnt,
VectorSet &next_call,
uint sched_slot,
intptr_t* recalc_pressure_nodes) {
// If only a single entry on the stack, use it
uint cnt = worklist.size();
@ -458,6 +465,7 @@ Node* PhaseCFG::select(Block* block, Node_List &worklist, GrowableArray<int> &re
uint score = 0; // Bigger is better
int idx = -1; // Index in worklist
int cand_cnt = 0; // Candidate count
bool block_size_threshold_ok = (block->number_of_nodes() > 10) ? true : false;
for( uint i=0; i<cnt; i++ ) { // Inspect entire worklist
// Order in worklist is used to break ties.
@ -539,6 +547,46 @@ Node* PhaseCFG::select(Block* block, Node_List &worklist, GrowableArray<int> &re
uint n_latency = get_latency_for_node(n);
uint n_score = n->req(); // Many inputs get high score to break ties
if (OptoRegScheduling && block_size_threshold_ok) {
if (recalc_pressure_nodes[n->_idx] == 0x7fff7fff) {
_regalloc->_scratch_int_pressure.init(_regalloc->_sched_int_pressure.high_pressure_limit());
_regalloc->_scratch_float_pressure.init(_regalloc->_sched_float_pressure.high_pressure_limit());
// simulate the notion that we just picked this node to schedule
n->add_flag(Node::Flag_is_scheduled);
// now caculate its effect upon the graph if we did
adjust_register_pressure(n, block, recalc_pressure_nodes, false);
// return its state for finalize in case somebody else wins
n->remove_flag(Node::Flag_is_scheduled);
// now save the two final pressure components of register pressure, limiting pressure calcs to short size
short int_pressure = (short)_regalloc->_scratch_int_pressure.current_pressure();
short float_pressure = (short)_regalloc->_scratch_float_pressure.current_pressure();
recalc_pressure_nodes[n->_idx] = int_pressure;
recalc_pressure_nodes[n->_idx] |= (float_pressure << 16);
}
if (_scheduling_for_pressure) {
latency = n_latency;
if (n_choice != 3) {
// Now evaluate each register pressure component based on threshold in the score.
// In general the defining register type will dominate the score, ergo we will not see register pressure grow on both banks
// on a single instruction, but we might see it shrink on both banks.
// For each use of register that has a register class that is over the high pressure limit, we build n_score up for
// live ranges that terminate on this instruction.
if (_regalloc->_sched_int_pressure.current_pressure() > _regalloc->_sched_int_pressure.high_pressure_limit()) {
short int_pressure = (short)recalc_pressure_nodes[n->_idx];
n_score = (int_pressure < 0) ? ((score + n_score) - int_pressure) : (int_pressure > 0) ? 1 : n_score;
}
if (_regalloc->_sched_float_pressure.current_pressure() > _regalloc->_sched_float_pressure.high_pressure_limit()) {
short float_pressure = (short)(recalc_pressure_nodes[n->_idx] >> 16);
n_score = (float_pressure < 0) ? ((score + n_score) - float_pressure) : (float_pressure > 0) ? 1 : n_score;
}
} else {
// make sure we choose these candidates
score = 0;
}
}
}
// Keep best latency found
cand_cnt++;
if (choice < n_choice ||
@ -562,6 +610,100 @@ Node* PhaseCFG::select(Block* block, Node_List &worklist, GrowableArray<int> &re
return n;
}
//-------------------------adjust_register_pressure----------------------------
void PhaseCFG::adjust_register_pressure(Node* n, Block* block, intptr_t* recalc_pressure_nodes, bool finalize_mode) {
PhaseLive* liveinfo = _regalloc->get_live();
IndexSet* liveout = liveinfo->live(block);
// first adjust the register pressure for the sources
for (uint i = 1; i < n->req(); i++) {
bool lrg_ends = false;
Node *src_n = n->in(i);
if (src_n == NULL) continue;
if (!src_n->is_Mach()) continue;
uint src = _regalloc->_lrg_map.find(src_n);
if (src == 0) continue;
LRG& lrg_src = _regalloc->lrgs(src);
// detect if the live range ends or not
if (liveout->member(src) == false) {
lrg_ends = true;
for (DUIterator_Fast jmax, j = src_n->fast_outs(jmax); j < jmax; j++) {
Node* m = src_n->fast_out(j); // Get user
if (m == n) continue;
if (!m->is_Mach()) continue;
MachNode *mach = m->as_Mach();
bool src_matches = false;
int iop = mach->ideal_Opcode();
switch (iop) {
case Op_StoreB:
case Op_StoreC:
case Op_StoreCM:
case Op_StoreD:
case Op_StoreF:
case Op_StoreI:
case Op_StoreL:
case Op_StoreP:
case Op_StoreN:
case Op_StoreVector:
case Op_StoreNKlass:
for (uint k = 1; k < m->req(); k++) {
Node *in = m->in(k);
if (in == src_n) {
src_matches = true;
break;
}
}
break;
default:
src_matches = true;
break;
}
// If we have a store as our use, ignore the non source operands
if (src_matches == false) continue;
// Mark every unscheduled use which is not n with a recalculation
if ((get_block_for_node(m) == block) && (!m->is_scheduled())) {
if (finalize_mode && !m->is_Phi()) {
recalc_pressure_nodes[m->_idx] = 0x7fff7fff;
}
lrg_ends = false;
}
}
}
// if none, this live range ends and we can adjust register pressure
if (lrg_ends) {
if (finalize_mode) {
_regalloc->lower_pressure(block, 0, lrg_src, NULL, _regalloc->_sched_int_pressure, _regalloc->_sched_float_pressure);
} else {
_regalloc->lower_pressure(block, 0, lrg_src, NULL, _regalloc->_scratch_int_pressure, _regalloc->_scratch_float_pressure);
}
}
}
// now add the register pressure from the dest and evaluate which heuristic we should use:
// 1.) The default, latency scheduling
// 2.) Register pressure scheduling based on the high pressure limit threshold for int or float register stacks
uint dst = _regalloc->_lrg_map.find(n);
if (dst != 0) {
LRG& lrg_dst = _regalloc->lrgs(dst);
if (finalize_mode) {
_regalloc->raise_pressure(block, lrg_dst, _regalloc->_sched_int_pressure, _regalloc->_sched_float_pressure);
// check to see if we fall over the register pressure cliff here
if (_regalloc->_sched_int_pressure.current_pressure() > _regalloc->_sched_int_pressure.high_pressure_limit()) {
_scheduling_for_pressure = true;
} else if (_regalloc->_sched_float_pressure.current_pressure() > _regalloc->_sched_float_pressure.high_pressure_limit()) {
_scheduling_for_pressure = true;
} else {
// restore latency scheduling mode
_scheduling_for_pressure = false;
}
} else {
_regalloc->raise_pressure(block, lrg_dst, _regalloc->_scratch_int_pressure, _regalloc->_scratch_float_pressure);
}
}
}
//------------------------------set_next_call----------------------------------
void PhaseCFG::set_next_call(Block* block, Node* n, VectorSet& next_call) {
@ -711,7 +853,7 @@ uint PhaseCFG::sched_call(Block* block, uint node_cnt, Node_List& worklist, Grow
//------------------------------schedule_local---------------------------------
// Topological sort within a block. Someday become a real scheduler.
bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, VectorSet& next_call) {
bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, VectorSet& next_call, intptr_t *recalc_pressure_nodes) {
// Already "sorted" are the block start Node (as the first entry), and
// the block-ending Node and any trailing control projections. We leave
// these alone. PhiNodes and ParmNodes are made to follow the block start
@ -733,10 +875,24 @@ bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, Vecto
return true;
}
bool block_size_threshold_ok = (block->number_of_nodes() > 10) ? true : false;
// We track the uses of local definitions as input dependences so that
// we know when a given instruction is avialable to be scheduled.
uint i;
if (OptoRegScheduling && block_size_threshold_ok) {
for (i = 1; i < block->number_of_nodes(); i++) { // setup nodes for pressure calc
Node *n = block->get_node(i);
n->remove_flag(Node::Flag_is_scheduled);
if (!n->is_Phi()) {
recalc_pressure_nodes[n->_idx] = 0x7fff7fff;
}
}
}
// Move PhiNodes and ParmNodes from 1 to cnt up to the start
uint node_cnt = block->end_idx();
uint phi_cnt = 1;
uint i;
for( i = 1; i<node_cnt; i++ ) { // Scan for Phi
Node *n = block->get_node(i);
if( n->is_Phi() || // Found a PhiNode or ParmNode
@ -744,6 +900,10 @@ bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, Vecto
// Move guy at 'phi_cnt' to the end; makes a hole at phi_cnt
block->map_node(block->get_node(phi_cnt), i);
block->map_node(n, phi_cnt++); // swap Phi/Parm up front
if (OptoRegScheduling && block_size_threshold_ok) {
// mark n as scheduled
n->add_flag(Node::Flag_is_scheduled);
}
} else { // All others
// Count block-local inputs to 'n'
uint cnt = n->len(); // Input count
@ -797,6 +957,12 @@ bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, Vecto
Node* m = n->fast_out(j);
if (get_block_for_node(m) == block) { // Local-block user
int m_cnt = ready_cnt.at(m->_idx)-1;
if (OptoRegScheduling && block_size_threshold_ok) {
// mark m as scheduled
if (m_cnt < 0) {
m->add_flag(Node::Flag_is_scheduled);
}
}
ready_cnt.at_put(m->_idx, m_cnt); // Fix ready count
}
}
@ -827,6 +993,18 @@ bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, Vecto
worklist.push(d);
}
if (OptoRegScheduling && block_size_threshold_ok) {
// To stage register pressure calculations we need to examine the live set variables
// breaking them up by register class to compartmentalize the calculations.
uint float_pressure = Matcher::float_pressure(FLOATPRESSURE);
_regalloc->_sched_int_pressure.init(INTPRESSURE);
_regalloc->_sched_float_pressure.init(float_pressure);
_regalloc->_scratch_int_pressure.init(INTPRESSURE);
_regalloc->_scratch_float_pressure.init(float_pressure);
_regalloc->compute_entry_block_pressure(block);
}
// Warm up the 'next_call' heuristic bits
needed_for_next_call(block, block->head(), next_call);
@ -858,9 +1036,18 @@ bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, Vecto
#endif
// Select and pop a ready guy from worklist
Node* n = select(block, worklist, ready_cnt, next_call, phi_cnt);
Node* n = select(block, worklist, ready_cnt, next_call, phi_cnt, recalc_pressure_nodes);
block->map_node(n, phi_cnt++); // Schedule him next
if (OptoRegScheduling && block_size_threshold_ok) {
n->add_flag(Node::Flag_is_scheduled);
// Now adjust the resister pressure with the node we selected
if (!n->is_Phi()) {
adjust_register_pressure(n, block, recalc_pressure_nodes, true);
}
}
#ifndef PRODUCT
if (trace_opto_pipelining()) {
tty->print("# select %d: %s", n->_idx, n->Name());
@ -925,6 +1112,12 @@ bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, Vecto
return false;
}
if (OptoRegScheduling && block_size_threshold_ok) {
_regalloc->compute_exit_block_pressure(block);
block->_reg_pressure = _regalloc->_sched_int_pressure.final_pressure();
block->_freg_pressure = _regalloc->_sched_float_pressure.final_pressure();
}
#ifndef PRODUCT
if (trace_opto_pipelining()) {
tty->print_cr("#");
@ -933,11 +1126,17 @@ bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, Vecto
tty->print("# ");
block->get_node(i)->fast_dump();
}
tty->print_cr("# ");
if (OptoRegScheduling && block_size_threshold_ok) {
tty->print_cr("# pressure info : %d", block->_pre_order);
_regalloc->print_pressure_info(_regalloc->_sched_int_pressure, "int register info");
_regalloc->print_pressure_info(_regalloc->_sched_float_pressure, "float register info");
}
tty->cr();
}
#endif
return true;
}

View file

@ -41,7 +41,14 @@
// block is put on the worklist.
// The locally live-in stuff is computed once and added to predecessor
// live-out sets. This separate compilation is done in the outer loop below.
PhaseLive::PhaseLive( const PhaseCFG &cfg, const LRG_List &names, Arena *arena ) : Phase(LIVE), _cfg(cfg), _names(names), _arena(arena), _live(0) {
PhaseLive::PhaseLive(const PhaseCFG &cfg, const LRG_List &names, Arena *arena, bool keep_deltas)
: Phase(LIVE),
_cfg(cfg),
_names(names),
_arena(arena),
_live(0),
_livein(0),
_keep_deltas(keep_deltas) {
}
void PhaseLive::compute(uint maxlrg) {
@ -56,6 +63,13 @@ void PhaseLive::compute(uint maxlrg) {
_live[i].initialize(_maxlrg);
}
if (_keep_deltas) {
_livein = (IndexSet*)_arena->Amalloc(sizeof(IndexSet) * _cfg.number_of_blocks());
for (i = 0; i < _cfg.number_of_blocks(); i++) {
_livein[i].initialize(_maxlrg);
}
}
// Init the sparse arrays for delta-sets.
ResourceMark rm; // Nuke temp storage on exit
@ -124,7 +138,10 @@ void PhaseLive::compute(uint maxlrg) {
// PhiNode uses go in the live-out set of prior blocks.
for (uint k = i; k > 0; k--) {
add_liveout(p, _names.at(block->get_node(k-1)->in(l)->_idx), first_pass);
Node *phi = block->get_node(k - 1);
if (l < phi->req()) {
add_liveout(p, _names.at(phi->in(l)->_idx), first_pass);
}
}
}
freeset(block);
@ -200,8 +217,11 @@ IndexSet *PhaseLive::getfreeset( ) {
}
// Free an IndexSet from a block.
void PhaseLive::freeset( const Block *p ) {
void PhaseLive::freeset( Block *p ) {
IndexSet *f = _deltas[p->_pre_order-1];
if ( _keep_deltas ) {
add_livein(p, f);
}
f->set_next(_free_IndexSet);
_free_IndexSet = f; // Drop onto free list
_deltas[p->_pre_order-1] = NULL;
@ -249,10 +269,23 @@ void PhaseLive::add_liveout( Block *p, IndexSet *lo, VectorSet &first_pass ) {
}
}
// Add a vector of live-in values to a given blocks live-in set.
void PhaseLive::add_livein(Block *p, IndexSet *lo) {
IndexSet *livein = &_livein[p->_pre_order-1];
IndexSetIterator elements(lo);
uint r;
while ((r = elements.next()) != 0) {
livein->insert(r); // Then add to live-in set
}
}
#ifndef PRODUCT
// Dump the live-out set for a block
void PhaseLive::dump( const Block *b ) const {
tty->print("Block %d: ",b->_pre_order);
if ( _keep_deltas ) {
tty->print("LiveIn: "); _livein[b->_pre_order-1].dump();
}
tty->print("LiveOut: "); _live[b->_pre_order-1].dump();
uint cnt = b->number_of_nodes();
for( uint i=0; i<cnt; i++ ) {

View file

@ -46,7 +46,8 @@ typedef GrowableArray<uint> LRG_List;
class PhaseLive : public Phase {
// Array of Sets of values live at the start of a block.
// Indexed by block pre-order number.
IndexSet *_live;
IndexSet *_live; // live out
IndexSet *_livein; // live in
// Array of Sets of values defined locally in the block
// Indexed by block pre-order number.
@ -62,15 +63,17 @@ class PhaseLive : public Phase {
const LRG_List &_names; // Mapping from Nodes to live ranges
uint _maxlrg; // Largest live-range number
Arena *_arena;
bool _keep_deltas; // Retain live in information
IndexSet *getset( Block *p );
IndexSet *getfreeset( );
void freeset( const Block *p );
void freeset( Block *p );
void add_liveout( Block *p, uint r, VectorSet &first_pass );
void add_liveout( Block *p, IndexSet *lo, VectorSet &first_pass );
void add_livein( Block *p, IndexSet *lo );
public:
PhaseLive(const PhaseCFG &cfg, const LRG_List &names, Arena *arena);
PhaseLive(const PhaseCFG &cfg, const LRG_List &names, Arena *arena, bool keep_deltas);
~PhaseLive() {}
// Compute liveness info
void compute(uint maxlrg);
@ -79,6 +82,7 @@ public:
// Return the live-out set for this block
IndexSet *live( const Block * b ) { return &_live[b->_pre_order-1]; }
IndexSet *livein( const Block * b ) { return &_livein[b->_pre_order - 1]; }
#ifndef PRODUCT
void dump( const Block *b ) const;

View file

@ -269,6 +269,9 @@ public:
// should generate this one.
static const bool match_rule_supported(int opcode);
// Some uarchs have different sized float register resources
static const int float_pressure(int default_pressure_threshold);
// Used to determine if we have fast l2f conversion
// USII has it, USIII doesn't
static const bool convL2FSupported(void);

View file

@ -674,7 +674,8 @@ public:
Flag_avoid_back_to_back_after = Flag_avoid_back_to_back_before << 1,
Flag_has_call = Flag_avoid_back_to_back_after << 1,
Flag_is_reduction = Flag_has_call << 1,
Flag_is_expensive = Flag_is_reduction << 1,
Flag_is_scheduled = Flag_is_reduction,
Flag_is_expensive = Flag_is_scheduled << 1,
_max_flags = (Flag_is_expensive << 1) - 1 // allow flags combination
};
@ -861,6 +862,9 @@ public:
// It must have the loop's phi as input and provide a def to the phi.
bool is_reduction() const { return (_flags & Flag_is_reduction) != 0; }
// Used in lcm to mark nodes that have scheduled
bool is_scheduled() const { return (_flags & Flag_is_scheduled) != 0; }
//----------------- Optimization
// Get the worst-case Type output for this Node.