8134802: LCM register pressure scheduling

Calculate register pressure in a block to help instructions scheduling. Reviewed-by: kvn, dlong
2025-09-24 04:54:40 +02:00 · 2015-09-16 13:16:17 -07:00 · 2015-09-16 13:16:17 -07:00 · d49d1ea740
commit d49d1ea740
parent a402bebf6e
21 changed files with 513 additions and 35 deletions
--- a/hotspot/src/cpu/aarch64/vm/aarch64.ad
+++ b/hotspot/src/cpu/aarch64/vm/aarch64.ad
@ -3024,6 +3024,10 @@ const bool Matcher::match_rule_supported(int opcode) {
  return true;  // Per default match rules are supported.
 }
 const int Matcher::float_pressure(int default_pressure_threshold) {
  return default_pressure_threshold;
 }
 int Matcher::regnum_to_fpu_offset(int regnum)
 {
  Unimplemented();
--- a/hotspot/src/cpu/aarch64/vm/c2_globals_aarch64.hpp
+++ b/hotspot/src/cpu/aarch64/vm/c2_globals_aarch64.hpp
@ -72,6 +72,7 @@ define_pd_global(bool, OptoPeephole,                 true);
 define_pd_global(bool, UseCISCSpill,                 true);
 define_pd_global(bool, OptoScheduling,               false);
 define_pd_global(bool, OptoBundling,                 false);
 define_pd_global(bool, OptoRegScheduling,            false);
 define_pd_global(intx, ReservedCodeCacheSize,        48*M);
 define_pd_global(intx, NonProfiledCodeHeapSize,      21*M);
--- a/hotspot/src/cpu/ppc/vm/c2_globals_ppc.hpp
+++ b/hotspot/src/cpu/ppc/vm/c2_globals_ppc.hpp
@ -60,6 +60,7 @@ define_pd_global(intx, LoopUnrollLimit,              60);
 define_pd_global(bool, OptoPeephole,                 false);
 define_pd_global(bool, UseCISCSpill,                 false);
 define_pd_global(bool, OptoBundling,                 false);
 define_pd_global(bool, OptoRegScheduling,            false);
 // GL:
 // Detected a problem with unscaled compressed oops and
 // narrow_oop_use_complex_address() == false.
--- a/hotspot/src/cpu/ppc/vm/ppc.ad
+++ b/hotspot/src/cpu/ppc/vm/ppc.ad
@ -2064,6 +2064,10 @@ const bool Matcher::match_rule_supported(int opcode) {
  return true;  // Per default match rules are supported.
 }
 const int Matcher::float_pressure(int default_pressure_threshold) {
  return default_pressure_threshold;
 }
 int Matcher::regnum_to_fpu_offset(int regnum) {
  // No user for this method?
  Unimplemented();
--- a/hotspot/src/cpu/sparc/vm/c2_globals_sparc.hpp
+++ b/hotspot/src/cpu/sparc/vm/c2_globals_sparc.hpp
@ -64,6 +64,7 @@ define_pd_global(bool, OptoPeephole,                 false);
 define_pd_global(bool, UseCISCSpill,                 false);
 define_pd_global(bool, OptoBundling,                 false);
 define_pd_global(bool, OptoScheduling,               true);
 define_pd_global(bool, OptoRegScheduling,            false);
 #ifdef _LP64
 // We need to make sure that all generated code is within
--- a/hotspot/src/cpu/sparc/vm/sparc.ad
+++ b/hotspot/src/cpu/sparc/vm/sparc.ad
@ -1860,6 +1860,10 @@ const bool Matcher::match_rule_supported(int opcode) {
  return true;  // Per default match rules are supported.
 }
 const int Matcher::float_pressure(int default_pressure_threshold) {
  return default_pressure_threshold;
 }
 int Matcher::regnum_to_fpu_offset(int regnum) {
  return regnum - 32; // The FP registers are in the second chunk
 }
--- a/hotspot/src/cpu/x86/vm/c2_globals_x86.hpp
+++ b/hotspot/src/cpu/x86/vm/c2_globals_x86.hpp
@ -48,11 +48,11 @@ define_pd_global(intx, CompileThreshold,             10000);
 define_pd_global(intx, OnStackReplacePercentage,     140);
 define_pd_global(intx, ConditionalMoveLimit,         3);
 define_pd_global(intx, FLOATPRESSURE,                6);
 define_pd_global(intx, FreqInlineSize,               325);
 define_pd_global(intx, MinJumpTableSize,             10);
 #ifdef AMD64
 define_pd_global(intx, INTPRESSURE,                  13);
 define_pd_global(intx, FLOATPRESSURE,                14);
 define_pd_global(intx, InteriorEntryAlignment,       16);
 define_pd_global(size_t, NewSizeThreadIncrease,      ScaleForWordSize(4*K));
 define_pd_global(intx, LoopUnrollLimit,              60);
@ -64,6 +64,7 @@ define_pd_global(intx, CodeCacheExpansionSize,       64*K);
 define_pd_global(uint64_t, MaxRAM,                   128ULL*G);
 #else
 define_pd_global(intx, INTPRESSURE,                  6);
 define_pd_global(intx, FLOATPRESSURE,                6);
 define_pd_global(intx, InteriorEntryAlignment,       4);
 define_pd_global(size_t, NewSizeThreadIncrease,      4*K);
 define_pd_global(intx, LoopUnrollLimit,              50);     // Design center runs on 1.3.1
@ -82,6 +83,7 @@ define_pd_global(bool, OptoPeephole,                 true);
 define_pd_global(bool, UseCISCSpill,                 true);
 define_pd_global(bool, OptoScheduling,               false);
 define_pd_global(bool, OptoBundling,                 false);
 define_pd_global(bool, OptoRegScheduling,            true);
 define_pd_global(intx, ReservedCodeCacheSize,        48*M);
 define_pd_global(intx, NonProfiledCodeHeapSize,      21*M);
--- a/hotspot/src/cpu/x86/vm/x86.ad
+++ b/hotspot/src/cpu/x86/vm/x86.ad
@ -1712,6 +1712,18 @@ const bool Matcher::match_rule_supported(int opcode) {
  return ret_value;  // Per default match rules are supported.
 }
 const int Matcher::float_pressure(int default_pressure_threshold) {
  int float_pressure_threshold = default_pressure_threshold;
 #ifdef _LP64
  if (UseAVX > 2) {
    // Increase pressure threshold on machines with AVX3 which have
    // 2x more XMM registers.
    float_pressure_threshold = default_pressure_threshold * 2;
  }
 #endif
  return float_pressure_threshold;
 }
 // Max vector size in bytes. 0 if not supported.
 const int Matcher::vector_width_in_bytes(BasicType bt) {
  assert(is_java_primitive(bt), "only primitive type vectors");
--- a/hotspot/src/share/vm/opto/block.cpp
+++ b/hotspot/src/share/vm/opto/block.cpp
@ -358,6 +358,8 @@ void Block::dump(const PhaseCFG* cfg) const {
 PhaseCFG::PhaseCFG(Arena* arena, RootNode* root, Matcher& matcher)
 : Phase(CFG)
 , _block_arena(arena)
 , _regalloc(NULL)
 , _scheduling_for_pressure(false)
 , _root(root)
 , _matcher(matcher)
 , _node_to_block_mapping(arena)
--- a/hotspot/src/share/vm/opto/block.hpp
+++ b/hotspot/src/share/vm/opto/block.hpp
@ -37,6 +37,7 @@ class MachCallNode;
 class Matcher;
 class RootNode;
 class VectorSet;
 class PhaseChaitin;
 struct Tarjan;
 //------------------------------Block_Array------------------------------------
@ -383,6 +384,12 @@ class PhaseCFG : public Phase {
  // Arena for the blocks to be stored in
  Arena* _block_arena;
  // Info used for scheduling
  PhaseChaitin* _regalloc;
  // Register pressure heuristic used?
  bool _scheduling_for_pressure;
  // The matcher for this compilation
  Matcher& _matcher;
@ -433,12 +440,14 @@ class PhaseCFG : public Phase {
  // to late. Helper for schedule_late.
  Block* hoist_to_cheaper_block(Block* LCA, Block* early, Node* self);
-  bool schedule_local(Block* block, GrowableArray<int>& ready_cnt, VectorSet& next_call);
+  bool schedule_local(Block* block, GrowableArray<int>& ready_cnt, VectorSet& next_call, intptr_t* recacl_pressure_nodes);
  void set_next_call(Block* block, Node* n, VectorSet& next_call);
  void needed_for_next_call(Block* block, Node* this_call, VectorSet& next_call);
  // Perform basic-block local scheduling
-  Node* select(Block* block, Node_List& worklist, GrowableArray<int>& ready_cnt, VectorSet& next_call, uint sched_slot);
+  Node* select(Block* block, Node_List& worklist, GrowableArray<int>& ready_cnt, VectorSet& next_call, uint sched_slot,
               intptr_t* recacl_pressure_nodes);
  void adjust_register_pressure(Node* n, Block* block, intptr_t *recalc_pressure_nodes, bool finalize_mode);
  // Schedule a call next in the block
  uint sched_call(Block* block, uint node_cnt, Node_List& worklist, GrowableArray<int>& ready_cnt, MachCallNode* mcall, VectorSet& next_call);
--- a/hotspot/src/share/vm/opto/c2_globals.hpp
+++ b/hotspot/src/share/vm/opto/c2_globals.hpp
@ -290,6 +290,9 @@
  product_pd(bool, OptoScheduling,                                          \
          "Instruction Scheduling after register allocation")               \
                                                                            \
  product_pd(bool, OptoRegScheduling,                                       \
          "Instruction Scheduling before register allocation for pressure") \
                                                                            \
  product(bool, PartialPeelLoop, true,                                      \
          "Partial peel (rotate) loops")                                    \
                                                                            \
--- a/hotspot/src/share/vm/opto/chaitin.cpp
+++ b/hotspot/src/share/vm/opto/chaitin.cpp
@ -191,7 +191,7 @@ uint LiveRangeMap::find_const(uint lrg) const {
  return next;
 }
-PhaseChaitin::PhaseChaitin(uint unique, PhaseCFG &cfg, Matcher &matcher)
+PhaseChaitin::PhaseChaitin(uint unique, PhaseCFG &cfg, Matcher &matcher, bool scheduling_info_generated)
  : PhaseRegAlloc(unique, cfg, matcher,
 #ifndef PRODUCT
       print_chaitin_statistics
@ -205,6 +205,11 @@ PhaseChaitin::PhaseChaitin(uint unique, PhaseCFG &cfg, Matcher &matcher)
  , _spilled_twice(Thread::current()->resource_area())
  , _lo_degree(0), _lo_stk_degree(0), _hi_degree(0), _simplified(0)
  , _oldphi(unique)
  , _scheduling_info_generated(scheduling_info_generated)
  , _sched_int_pressure(0, INTPRESSURE)
  , _sched_float_pressure(0, FLOATPRESSURE)
  , _scratch_int_pressure(0, INTPRESSURE)
  , _scratch_float_pressure(0, FLOATPRESSURE)
 #ifndef PRODUCT
  , _trace_spilling(TraceSpilling || C->method_has_option("TraceSpilling"))
 #endif
@ -350,7 +355,7 @@ void PhaseChaitin::Register_Allocate() {
  // all copy-related live ranges low and then using the max copy-related
  // live range as a cut-off for LIVE and the IFG.  In other words, I can
  // build a subset of LIVE and IFG just for copies.
-  PhaseLive live(_cfg, _lrg_map.names(), &live_arena);
+  PhaseLive live(_cfg, _lrg_map.names(), &live_arena, false);
  // Need IFG for coalescing and coloring
  PhaseIFG ifg(&live_arena);
@ -690,6 +695,29 @@ void PhaseChaitin::de_ssa() {
  _lrg_map.reset_uf_map(lr_counter);
 }
 void PhaseChaitin::mark_ssa() {
  // Use ssa names to populate the live range maps or if no mask
  // is available, use the 0 entry.
  uint max_idx = 0;
  for ( uint i = 0; i < _cfg.number_of_blocks(); i++ ) {
    Block* block = _cfg.get_block(i);
    uint cnt = block->number_of_nodes();
    // Handle all the normal Nodes in the block
    for ( uint j = 0; j < cnt; j++ ) {
      Node *n = block->get_node(j);
      // Pre-color to the zero live range, or pick virtual register
      const RegMask &rm = n->out_RegMask();
      _lrg_map.map(n->_idx, rm.is_NotEmpty() ? n->_idx : 0);
      max_idx = (n->_idx > max_idx) ? n->_idx : max_idx;
    }
  }
  _lrg_map.set_max_lrg_id(max_idx+1);
  // Reset the Union-Find mapping to be identity
  _lrg_map.reset_uf_map(max_idx+1);
 }
 // Gather LiveRanGe information, including register masks.  Modification of
 // cisc spillable in_RegMasks should not be done before AggressiveCoalesce.
@ -707,7 +735,9 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) {
    for (uint j = 1; j < block->number_of_nodes(); j++) {
      Node* n = block->get_node(j);
      uint input_edge_start =1; // Skip control most nodes
      bool is_machine_node = false;
      if (n->is_Mach()) {
        is_machine_node = true;
        input_edge_start = n->as_Mach()->oper_input_base();
      }
      uint idx = n->is_Copy();
@ -929,6 +959,7 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) {
          // Convert operand number to edge index number
          inp = n->as_Mach()->operand_index(inp);
      }
      // Prepare register mask for each input
      for( uint k = input_edge_start; k < cnt; k++ ) {
        uint vreg = _lrg_map.live_range_id(n->in(k));
@ -948,6 +979,12 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) {
          n->as_Mach()->use_cisc_RegMask();
        }
        if (is_machine_node && _scheduling_info_generated) {
          MachNode* cur_node = n->as_Mach();
          // this is cleaned up by register allocation
          if (k >= cur_node->num_opnds()) continue;
        }
        LRG &lrg = lrgs(vreg);
        // // Testing for floating point code shape
        // Node *test = n->in(k);
@ -989,7 +1026,7 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) {
        // double can interfere with TWO aligned pairs, or effectively
        // FOUR registers!
 #ifdef ASSERT
-        if (is_vect) {
+        if (is_vect && !_scheduling_info_generated) {
          if (lrg.num_regs() != 0) {
            assert(lrgmask.is_aligned_sets(lrg.num_regs()), "vector should be aligned");
            assert(!lrg._fat_proj, "sanity");
--- a/hotspot/src/share/vm/opto/chaitin.hpp
+++ b/hotspot/src/share/vm/opto/chaitin.hpp
@ -399,7 +399,6 @@ class PhaseChaitin : public PhaseRegAlloc {
  int _trip_cnt;
  int _alternate;
  LRG &lrgs(uint idx) const { return _ifg->lrgs(idx); }
  PhaseLive *_live;             // Liveness, used in the interference graph
  PhaseIFG *_ifg;               // Interference graph (for original chunk)
  Node_List **_lrg_nodes;       // Array of node; lists for lrgs which spill
@ -464,16 +463,28 @@ class PhaseChaitin : public PhaseRegAlloc {
 #endif
 public:
-  PhaseChaitin( uint unique, PhaseCFG &cfg, Matcher &matcher );
+  PhaseChaitin(uint unique, PhaseCFG &cfg, Matcher &matcher, bool track_liveout_pressure);
  ~PhaseChaitin() {}
  LiveRangeMap _lrg_map;
  LRG &lrgs(uint idx) const { return _ifg->lrgs(idx); }
  // Do all the real work of allocate
  void Register_Allocate();
  float high_frequency_lrg() const { return _high_frequency_lrg; }
  // Used when scheduling info generated, not in general register allocation
  bool _scheduling_info_generated;
  void set_ifg(PhaseIFG &ifg) { _ifg = &ifg;  }
  void set_live(PhaseLive &live) { _live = &live; }
  PhaseLive* get_live() { return _live; }
  // Populate the live range maps with ssa info for scheduling
  void mark_ssa();
 #ifndef PRODUCT
  bool trace_spilling() const { return _trace_spilling; }
 #endif
@ -516,7 +527,11 @@ private:
      uint _final_pressure;
      // number of live ranges that constitute high register pressure
-      const uint _high_pressure_limit;
+      uint _high_pressure_limit;
      // initial pressure observed
      uint _start_pressure;
    public:
      // lower the register pressure and look for a low to high pressure
@ -537,6 +552,14 @@ private:
        }
      }
      void init(int limit) {
        _current_pressure = 0;
        _high_pressure_index = 0;
        _final_pressure = 0;
        _high_pressure_limit = limit;
        _start_pressure = 0;
      }
      uint high_pressure_index() const {
        return _high_pressure_index;
      }
@ -545,6 +568,10 @@ private:
        return _final_pressure;
      }
      uint start_pressure() const {
        return _start_pressure;
      }
      uint current_pressure() const {
        return _current_pressure;
      }
@ -561,6 +588,15 @@ private:
        _high_pressure_index = 0;
      }
      void set_start_pressure(int value) {
        _start_pressure = value;
        _final_pressure = value;
      }
      void set_current_pressure(int value) {
        _current_pressure = value;
      }
      void check_pressure_at_fatproj(uint fatproj_location, RegMask& fatproj_mask) {
        // this pressure is only valid at this instruction, i.e. we don't need to lower
        // the register pressure since the fat proj was never live before (going backwards)
@ -577,14 +613,13 @@ private:
      }
      Pressure(uint high_pressure_index, uint high_pressure_limit)
-      : _current_pressure(0)
+        : _current_pressure(0)
-      , _high_pressure_index(high_pressure_index)
+        , _high_pressure_index(high_pressure_index)
-      , _high_pressure_limit(high_pressure_limit)
+        , _final_pressure(0)
-      , _final_pressure(0) {}
+        , _high_pressure_limit(high_pressure_limit)
        , _start_pressure(0) {}
  };
  void lower_pressure(Block* b, uint location, LRG& lrg, IndexSet* liveout, Pressure& int_pressure, Pressure& float_pressure);
  void raise_pressure(Block* b, LRG& lrg, Pressure& int_pressure, Pressure& float_pressure);
  void check_for_high_pressure_transition_at_fatproj(uint& block_reg_pressure, uint location, LRG& lrg, Pressure& pressure, const int op_regtype);
  void add_input_to_liveout(Block* b, Node* n, IndexSet* liveout, double cost, Pressure& int_pressure, Pressure& float_pressure);
  void compute_initial_block_pressure(Block* b, IndexSet* liveout, Pressure& int_pressure, Pressure& float_pressure, double cost);
@ -600,10 +635,25 @@ private:
  // acceptable register sets do not overlap, then they do not interfere.
  uint build_ifg_physical( ResourceArea *a );
 public:
  // Gather LiveRanGe information, including register masks and base pointer/
  // derived pointer relationships.
  void gather_lrg_masks( bool mod_cisc_masks );
  // user visible pressure variables for scheduling
  Pressure _sched_int_pressure;
  Pressure _sched_float_pressure;
  Pressure _scratch_int_pressure;
  Pressure _scratch_float_pressure;
  // Pressure functions for user context
  void lower_pressure(Block* b, uint location, LRG& lrg, IndexSet* liveout, Pressure& int_pressure, Pressure& float_pressure);
  void raise_pressure(Block* b, LRG& lrg, Pressure& int_pressure, Pressure& float_pressure);
  void compute_entry_block_pressure(Block* b);
  void compute_exit_block_pressure(Block* b);
  void print_pressure_info(Pressure& pressure, const char *str);
 private:
  // Force the bases of derived pointers to be alive at GC points.
  bool stretch_base_pointer_live_ranges( ResourceArea *a );
  // Helper to stretch above; recursively discover the base Node for
--- a/hotspot/src/share/vm/opto/compile.cpp
+++ b/hotspot/src/share/vm/opto/compile.cpp
@ -2336,7 +2336,7 @@ void Compile::Code_Gen() {
    debug_only( cfg.verify(); )
  }
-  PhaseChaitin regalloc(unique(), cfg, matcher);
+  PhaseChaitin regalloc(unique(), cfg, matcher, false);
  _regalloc = &regalloc;
  {
    TracePhase tp("regalloc", &timers[_t_registerAllocation]);
--- a/hotspot/src/share/vm/opto/gcm.cpp
+++ b/hotspot/src/share/vm/opto/gcm.cpp
@ -34,6 +34,7 @@
 #include "opto/phaseX.hpp"
 #include "opto/rootnode.hpp"
 #include "opto/runtime.hpp"
 #include "opto/chaitin.hpp"
 #include "runtime/deoptimization.hpp"
 // Portions of code courtesy of Clifford Click
@ -1363,6 +1364,44 @@ void PhaseCFG::global_code_motion() {
    }
  }
  bool block_size_threshold_ok = false;
  intptr_t *recalc_pressure_nodes = NULL;
  if (OptoRegScheduling) {
    for (uint i = 0; i < number_of_blocks(); i++) {
      Block* block = get_block(i);
      if (block->number_of_nodes() > 10) {
        block_size_threshold_ok = true;
        break;
      }
    }
  }
  // Enabling the scheduler for register pressure plus finding blocks of size to schedule for it
  // is key to enabling this feature.
  PhaseChaitin regalloc(C->unique(), *this, _matcher, true);
  ResourceArea live_arena;      // Arena for liveness
  ResourceMark rm_live(&live_arena);
  PhaseLive live(*this, regalloc._lrg_map.names(), &live_arena, true);
  PhaseIFG ifg(&live_arena);
  if (OptoRegScheduling && block_size_threshold_ok) {
    regalloc.mark_ssa();
    Compile::TracePhase tp("computeLive", &timers[_t_computeLive]);
    rm_live.reset_to_mark();           // Reclaim working storage
    IndexSet::reset_memory(C, &live_arena);
    uint node_size = regalloc._lrg_map.max_lrg_id();
    ifg.init(node_size); // Empty IFG
    regalloc.set_ifg(ifg);
    regalloc.set_live(live);
    regalloc.gather_lrg_masks(false);    // Collect LRG masks
    live.compute(node_size); // Compute liveness
    recalc_pressure_nodes = NEW_RESOURCE_ARRAY(intptr_t, node_size);
    for (uint i = 0; i < node_size; i++) {
      recalc_pressure_nodes[i] = 0;
    }
  }
  _regalloc = &regalloc;
 #ifndef PRODUCT
  if (trace_opto_pipelining()) {
    tty->print("\n---- Start Local Scheduling ----\n");
@ -1375,13 +1414,15 @@ void PhaseCFG::global_code_motion() {
  visited.Clear();
  for (uint i = 0; i < number_of_blocks(); i++) {
    Block* block = get_block(i);
-    if (!schedule_local(block, ready_cnt, visited)) {
+    if (!schedule_local(block, ready_cnt, visited, recalc_pressure_nodes)) {
      if (!C->failure_reason_is(C2Compiler::retry_no_subsuming_loads())) {
        C->record_method_not_compilable("local schedule failed");
      }
      _regalloc = NULL;
      return;
    }
  }
  _regalloc = NULL;
  // If we inserted any instructions between a Call and his CatchNode,
  // clone the instructions on all paths below the Catch.
--- a/hotspot/src/share/vm/opto/ifg.cpp
+++ b/hotspot/src/share/vm/opto/ifg.cpp
@ -439,8 +439,10 @@ void PhaseChaitin::lower_pressure(Block* b, uint location, LRG& lrg, IndexSet* l
      }
    }
  }
-  assert(int_pressure.current_pressure() == count_int_pressure(liveout), "the int pressure is incorrect");
+  if (_scheduling_info_generated == false) {
-  assert(float_pressure.current_pressure() == count_float_pressure(liveout), "the float pressure is incorrect");
+    assert(int_pressure.current_pressure() == count_int_pressure(liveout), "the int pressure is incorrect");
    assert(float_pressure.current_pressure() == count_float_pressure(liveout), "the float pressure is incorrect");
  }
 }
 /* Go to the first non-phi index in a block */
@ -517,6 +519,58 @@ void PhaseChaitin::compute_initial_block_pressure(Block* b, IndexSet* liveout, P
  assert(float_pressure.current_pressure() == count_float_pressure(liveout), "the float pressure is incorrect");
 }
 /*
 * Computes the entry register pressure of a block, looking at all live
 * ranges in the livein. The register pressure is computed for both float
 * and int/pointer registers.
 */
 void PhaseChaitin::compute_entry_block_pressure(Block* b) {
  IndexSet* livein = _live->livein(b);
  IndexSetIterator elements(livein);
  uint lid = elements.next();
  while (lid != 0) {
    LRG& lrg = lrgs(lid);
    raise_pressure(b, lrg, _sched_int_pressure, _sched_float_pressure);
    lid = elements.next();
  }
  // Now check phis for locally defined inputs
  for (uint j = 0; j < b->number_of_nodes(); j++) {
    Node* n = b->get_node(j);
    if (n->is_Phi()) {
      for (uint k = 1; k < n->req(); k++) {
        Node* phi_in = n->in(k);
        // Because we are talking about phis, raise register pressure once for each
        // instance of a phi to account for a single value
        if (_cfg.get_block_for_node(phi_in) == b) {
          LRG& lrg = lrgs(phi_in->_idx);
          raise_pressure(b, lrg, _sched_int_pressure, _sched_float_pressure);
          break;
        }
      }
    }
  }
  _sched_int_pressure.set_start_pressure(_sched_int_pressure.current_pressure());
  _sched_float_pressure.set_start_pressure(_sched_float_pressure.current_pressure());
 }
 /*
 * Computes the exit register pressure of a block, looking at all live
 * ranges in the liveout. The register pressure is computed for both float
 * and int/pointer registers.
 */
 void PhaseChaitin::compute_exit_block_pressure(Block* b) {
  IndexSet* livein = _live->live(b);
  IndexSetIterator elements(livein);
  _sched_int_pressure.set_current_pressure(0);
  _sched_float_pressure.set_current_pressure(0);
  uint lid = elements.next();
  while (lid != 0) {
    LRG& lrg = lrgs(lid);
    raise_pressure(b, lrg, _sched_int_pressure, _sched_float_pressure);
    lid = elements.next();
  }
 }
 /*
 * Remove dead node if it's not used.
 * We only remove projection nodes if the node "defining" the projection is
@ -737,6 +791,16 @@ void PhaseChaitin::adjust_high_pressure_index(Block* b, uint& block_hrp_index, P
  block_hrp_index = i;
 }
 void PhaseChaitin::print_pressure_info(Pressure& pressure, const char *str) {
  if (str != NULL) {
    tty->print_cr("#  *** %s ***", str);
  }
  tty->print_cr("#     start pressure is = %d", pressure.start_pressure());
  tty->print_cr("#     max pressure is = %d", pressure.final_pressure());
  tty->print_cr("#     end pressure is = %d", pressure.current_pressure());
  tty->print_cr("#");
 }
 /* Build an interference graph:
 *   That is, if 2 live ranges are simultaneously alive but in their acceptable
 *   register sets do not overlap, then they do not interfere. The IFG is built
--- a/hotspot/src/share/vm/opto/lcm.cpp
+++ b/hotspot/src/share/vm/opto/lcm.cpp
@ -31,6 +31,7 @@
 #include "opto/cfgnode.hpp"
 #include "opto/machnode.hpp"
 #include "opto/runtime.hpp"
 #include "opto/chaitin.hpp"
 #include "runtime/sharedRuntime.hpp"
 // Optimization - Graph Style
@ -443,7 +444,13 @@ void PhaseCFG::implicit_null_check(Block* block, Node *proj, Node *val, int allo
 // remaining cases (most), choose the instruction with the greatest latency
 // (that is, the most number of pseudo-cycles required to the end of the
 // routine). If there is a tie, choose the instruction with the most inputs.
-Node* PhaseCFG::select(Block* block, Node_List &worklist, GrowableArray<int> &ready_cnt, VectorSet &next_call, uint sched_slot) {
+Node* PhaseCFG::select(
  Block* block,
  Node_List &worklist,
  GrowableArray<int> &ready_cnt,
  VectorSet &next_call,
  uint sched_slot,
  intptr_t* recalc_pressure_nodes) {
  // If only a single entry on the stack, use it
  uint cnt = worklist.size();
@ -458,6 +465,7 @@ Node* PhaseCFG::select(Block* block, Node_List &worklist, GrowableArray<int> &re
  uint score   = 0; // Bigger is better
  int idx = -1;     // Index in worklist
  int cand_cnt = 0; // Candidate count
  bool block_size_threshold_ok = (block->number_of_nodes() > 10) ? true : false;
  for( uint i=0; i<cnt; i++ ) { // Inspect entire worklist
    // Order in worklist is used to break ties.
@ -537,7 +545,47 @@ Node* PhaseCFG::select(Block* block, Node_List &worklist, GrowableArray<int> &re
    }
    uint n_latency = get_latency_for_node(n);
-    uint n_score   = n->req();   // Many inputs get high score to break ties
+    uint n_score = n->req();   // Many inputs get high score to break ties
    if (OptoRegScheduling && block_size_threshold_ok) {
      if (recalc_pressure_nodes[n->_idx] == 0x7fff7fff) {
        _regalloc->_scratch_int_pressure.init(_regalloc->_sched_int_pressure.high_pressure_limit());
        _regalloc->_scratch_float_pressure.init(_regalloc->_sched_float_pressure.high_pressure_limit());
        // simulate the notion that we just picked this node to schedule
        n->add_flag(Node::Flag_is_scheduled);
        // now caculate its effect upon the graph if we did
        adjust_register_pressure(n, block, recalc_pressure_nodes, false);
        // return its state for finalize in case somebody else wins
        n->remove_flag(Node::Flag_is_scheduled);
        // now save the two final pressure components of register pressure, limiting pressure calcs to short size
        short int_pressure = (short)_regalloc->_scratch_int_pressure.current_pressure();
        short float_pressure = (short)_regalloc->_scratch_float_pressure.current_pressure();
        recalc_pressure_nodes[n->_idx] = int_pressure;
        recalc_pressure_nodes[n->_idx] |= (float_pressure << 16);
      }
      if (_scheduling_for_pressure) {
        latency = n_latency;
        if (n_choice != 3) {
          // Now evaluate each register pressure component based on threshold in the score.
          // In general the defining register type will dominate the score, ergo we will not see register pressure grow on both banks
          // on a single instruction, but we might see it shrink on both banks.
          // For each use of register that has a register class that is over the high pressure limit, we build n_score up for
          // live ranges that terminate on this instruction.
          if (_regalloc->_sched_int_pressure.current_pressure() > _regalloc->_sched_int_pressure.high_pressure_limit()) {
            short int_pressure = (short)recalc_pressure_nodes[n->_idx];
            n_score = (int_pressure < 0) ? ((score + n_score) - int_pressure) : (int_pressure > 0) ? 1 : n_score;
          }
          if (_regalloc->_sched_float_pressure.current_pressure() > _regalloc->_sched_float_pressure.high_pressure_limit()) {
            short float_pressure = (short)(recalc_pressure_nodes[n->_idx] >> 16);
            n_score = (float_pressure < 0) ? ((score + n_score) - float_pressure) : (float_pressure > 0) ? 1 : n_score;
          }
        } else {
          // make sure we choose these candidates
          score = 0;
        }
      }
    }
    // Keep best latency found
    cand_cnt++;
@ -562,6 +610,100 @@ Node* PhaseCFG::select(Block* block, Node_List &worklist, GrowableArray<int> &re
  return n;
 }
 //-------------------------adjust_register_pressure----------------------------
 void PhaseCFG::adjust_register_pressure(Node* n, Block* block, intptr_t* recalc_pressure_nodes, bool finalize_mode) {
  PhaseLive* liveinfo = _regalloc->get_live();
  IndexSet* liveout = liveinfo->live(block);
  // first adjust the register pressure for the sources
  for (uint i = 1; i < n->req(); i++) {
    bool lrg_ends = false;
    Node *src_n = n->in(i);
    if (src_n == NULL) continue;
    if (!src_n->is_Mach()) continue;
    uint src = _regalloc->_lrg_map.find(src_n);
    if (src == 0) continue;
    LRG& lrg_src = _regalloc->lrgs(src);
    // detect if the live range ends or not
    if (liveout->member(src) == false) {
      lrg_ends = true;
      for (DUIterator_Fast jmax, j = src_n->fast_outs(jmax); j < jmax; j++) {
        Node* m = src_n->fast_out(j); // Get user
        if (m == n) continue;
        if (!m->is_Mach()) continue;
        MachNode *mach = m->as_Mach();
        bool src_matches = false;
        int iop = mach->ideal_Opcode();
        switch (iop) {
        case Op_StoreB:
        case Op_StoreC:
        case Op_StoreCM:
        case Op_StoreD:
        case Op_StoreF:
        case Op_StoreI:
        case Op_StoreL:
        case Op_StoreP:
        case Op_StoreN:
        case Op_StoreVector:
        case Op_StoreNKlass:
          for (uint k = 1; k < m->req(); k++) {
            Node *in = m->in(k);
            if (in == src_n) {
              src_matches = true;
              break;
            }
          }
          break;
        default:
          src_matches = true;
          break;
        }
        // If we have a store as our use, ignore the non source operands
        if (src_matches == false) continue;
        // Mark every unscheduled use which is not n with a recalculation
        if ((get_block_for_node(m) == block) && (!m->is_scheduled())) {
          if (finalize_mode && !m->is_Phi()) {
            recalc_pressure_nodes[m->_idx] = 0x7fff7fff;
          }
          lrg_ends = false;
        }
      }
    }
    // if none, this live range ends and we can adjust register pressure
    if (lrg_ends) {
      if (finalize_mode) {
        _regalloc->lower_pressure(block, 0, lrg_src, NULL, _regalloc->_sched_int_pressure, _regalloc->_sched_float_pressure);
      } else {
        _regalloc->lower_pressure(block, 0, lrg_src, NULL, _regalloc->_scratch_int_pressure, _regalloc->_scratch_float_pressure);
      }
    }
  }
  // now add the register pressure from the dest and evaluate which heuristic we should use:
  // 1.) The default, latency scheduling
  // 2.) Register pressure scheduling based on the high pressure limit threshold for int or float register stacks
  uint dst = _regalloc->_lrg_map.find(n);
  if (dst != 0) {
    LRG& lrg_dst = _regalloc->lrgs(dst);
    if (finalize_mode) {
      _regalloc->raise_pressure(block, lrg_dst, _regalloc->_sched_int_pressure, _regalloc->_sched_float_pressure);
      // check to see if we fall over the register pressure cliff here
      if (_regalloc->_sched_int_pressure.current_pressure() > _regalloc->_sched_int_pressure.high_pressure_limit()) {
        _scheduling_for_pressure = true;
      } else if (_regalloc->_sched_float_pressure.current_pressure() > _regalloc->_sched_float_pressure.high_pressure_limit()) {
        _scheduling_for_pressure = true;
      } else {
        // restore latency scheduling mode
        _scheduling_for_pressure = false;
      }
    } else {
      _regalloc->raise_pressure(block, lrg_dst, _regalloc->_scratch_int_pressure, _regalloc->_scratch_float_pressure);
    }
  }
 }
 //------------------------------set_next_call----------------------------------
 void PhaseCFG::set_next_call(Block* block, Node* n, VectorSet& next_call) {
@ -644,7 +786,7 @@ uint PhaseCFG::sched_call(Block* block, uint node_cnt, Node_List& worklist, Grow
        continue;
      }
      if( m->is_Phi() ) continue;
-      int m_cnt = ready_cnt.at(m->_idx)-1;
+      int m_cnt = ready_cnt.at(m->_idx) - 1;
      ready_cnt.at_put(m->_idx, m_cnt);
      if( m_cnt == 0 )
        worklist.push(m);
@ -711,7 +853,7 @@ uint PhaseCFG::sched_call(Block* block, uint node_cnt, Node_List& worklist, Grow
 //------------------------------schedule_local---------------------------------
 // Topological sort within a block.  Someday become a real scheduler.
-bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, VectorSet& next_call) {
+bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, VectorSet& next_call, intptr_t *recalc_pressure_nodes) {
  // Already "sorted" are the block start Node (as the first entry), and
  // the block-ending Node and any trailing control projections.  We leave
  // these alone.  PhiNodes and ParmNodes are made to follow the block start
@ -733,10 +875,24 @@ bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, Vecto
    return true;
  }
  bool block_size_threshold_ok = (block->number_of_nodes() > 10) ? true : false;
  // We track the uses of local definitions as input dependences so that
  // we know when a given instruction is avialable to be scheduled.
  uint i;
  if (OptoRegScheduling && block_size_threshold_ok) {
    for (i = 1; i < block->number_of_nodes(); i++) { // setup nodes for pressure calc
      Node *n = block->get_node(i);
      n->remove_flag(Node::Flag_is_scheduled);
      if (!n->is_Phi()) {
        recalc_pressure_nodes[n->_idx] = 0x7fff7fff;
      }
    }
  }
  // Move PhiNodes and ParmNodes from 1 to cnt up to the start
  uint node_cnt = block->end_idx();
  uint phi_cnt = 1;
  uint i;
  for( i = 1; i<node_cnt; i++ ) { // Scan for Phi
    Node *n = block->get_node(i);
    if( n->is_Phi() ||          // Found a PhiNode or ParmNode
@ -744,6 +900,10 @@ bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, Vecto
      // Move guy at 'phi_cnt' to the end; makes a hole at phi_cnt
      block->map_node(block->get_node(phi_cnt), i);
      block->map_node(n, phi_cnt++);  // swap Phi/Parm up front
      if (OptoRegScheduling && block_size_threshold_ok) {
        // mark n as scheduled
        n->add_flag(Node::Flag_is_scheduled);
      }
    } else {                    // All others
      // Count block-local inputs to 'n'
      uint cnt = n->len();      // Input count
@ -791,12 +951,18 @@ bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, Vecto
  // All the prescheduled guys do not hold back internal nodes
  uint i3;
-  for(i3 = 0; i3<phi_cnt; i3++ ) {  // For all pre-scheduled
+  for (i3 = 0; i3 < phi_cnt; i3++) {  // For all pre-scheduled
    Node *n = block->get_node(i3);       // Get pre-scheduled
    for (DUIterator_Fast jmax, j = n->fast_outs(jmax); j < jmax; j++) {
      Node* m = n->fast_out(j);
      if (get_block_for_node(m) == block) { // Local-block user
        int m_cnt = ready_cnt.at(m->_idx)-1;
        if (OptoRegScheduling && block_size_threshold_ok) {
          // mark m as scheduled
          if (m_cnt < 0) {
            m->add_flag(Node::Flag_is_scheduled);
          }
        }
        ready_cnt.at_put(m->_idx, m_cnt);   // Fix ready count
      }
    }
@ -827,6 +993,18 @@ bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, Vecto
    worklist.push(d);
  }
  if (OptoRegScheduling && block_size_threshold_ok) {
    // To stage register pressure calculations we need to examine the live set variables
    // breaking them up by register class to compartmentalize the calculations.
    uint float_pressure = Matcher::float_pressure(FLOATPRESSURE);
    _regalloc->_sched_int_pressure.init(INTPRESSURE);
    _regalloc->_sched_float_pressure.init(float_pressure);
    _regalloc->_scratch_int_pressure.init(INTPRESSURE);
    _regalloc->_scratch_float_pressure.init(float_pressure);
    _regalloc->compute_entry_block_pressure(block);
  }
  // Warm up the 'next_call' heuristic bits
  needed_for_next_call(block, block->head(), next_call);
@ -858,9 +1036,18 @@ bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, Vecto
 #endif
    // Select and pop a ready guy from worklist
-    Node* n = select(block, worklist, ready_cnt, next_call, phi_cnt);
+    Node* n = select(block, worklist, ready_cnt, next_call, phi_cnt, recalc_pressure_nodes);
    block->map_node(n, phi_cnt++);    // Schedule him next
    if (OptoRegScheduling && block_size_threshold_ok) {
      n->add_flag(Node::Flag_is_scheduled);
      // Now adjust the resister pressure with the node we selected
      if (!n->is_Phi()) {
        adjust_register_pressure(n, block, recalc_pressure_nodes, true);
      }
    }
 #ifndef PRODUCT
    if (trace_opto_pipelining()) {
      tty->print("#    select %d: %s", n->_idx, n->Name());
@ -906,7 +1093,7 @@ bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, Vecto
        assert(m->is_MachProj() && n->is_Mach() && n->as_Mach()->has_call(), "unexpected node types");
        continue;
      }
-      int m_cnt = ready_cnt.at(m->_idx)-1;
+      int m_cnt = ready_cnt.at(m->_idx) - 1;
      ready_cnt.at_put(m->_idx, m_cnt);
      if( m_cnt == 0 )
        worklist.push(m);
@ -925,6 +1112,12 @@ bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, Vecto
    return false;
  }
  if (OptoRegScheduling && block_size_threshold_ok) {
    _regalloc->compute_exit_block_pressure(block);
    block->_reg_pressure = _regalloc->_sched_int_pressure.final_pressure();
    block->_freg_pressure = _regalloc->_sched_float_pressure.final_pressure();
  }
 #ifndef PRODUCT
  if (trace_opto_pipelining()) {
    tty->print_cr("#");
@ -933,11 +1126,17 @@ bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, Vecto
      tty->print("# ");
      block->get_node(i)->fast_dump();
    }
    tty->print_cr("# ");
    if (OptoRegScheduling && block_size_threshold_ok) {
      tty->print_cr("# pressure info : %d", block->_pre_order);
      _regalloc->print_pressure_info(_regalloc->_sched_int_pressure, "int register info");
      _regalloc->print_pressure_info(_regalloc->_sched_float_pressure, "float register info");
    }
    tty->cr();
  }
 #endif
  return true;
 }
--- a/hotspot/src/share/vm/opto/live.cpp
+++ b/hotspot/src/share/vm/opto/live.cpp
@ -41,7 +41,14 @@
 // block is put on the worklist.
 //   The locally live-in stuff is computed once and added to predecessor
 // live-out sets.  This separate compilation is done in the outer loop below.
-PhaseLive::PhaseLive( const PhaseCFG &cfg, const LRG_List &names, Arena *arena ) : Phase(LIVE), _cfg(cfg), _names(names), _arena(arena), _live(0) {
+PhaseLive::PhaseLive(const PhaseCFG &cfg, const LRG_List &names, Arena *arena, bool keep_deltas)
  : Phase(LIVE),
  _cfg(cfg),
  _names(names),
  _arena(arena),
  _live(0),
  _livein(0),
  _keep_deltas(keep_deltas) {
 }
 void PhaseLive::compute(uint maxlrg) {
@ -56,6 +63,13 @@ void PhaseLive::compute(uint maxlrg) {
    _live[i].initialize(_maxlrg);
  }
  if (_keep_deltas) {
    _livein = (IndexSet*)_arena->Amalloc(sizeof(IndexSet) * _cfg.number_of_blocks());
    for (i = 0; i < _cfg.number_of_blocks(); i++) {
      _livein[i].initialize(_maxlrg);
    }
  }
  // Init the sparse arrays for delta-sets.
  ResourceMark rm;              // Nuke temp storage on exit
@ -124,7 +138,10 @@ void PhaseLive::compute(uint maxlrg) {
      // PhiNode uses go in the live-out set of prior blocks.
      for (uint k = i; k > 0; k--) {
-        add_liveout(p, _names.at(block->get_node(k-1)->in(l)->_idx), first_pass);
+        Node *phi = block->get_node(k - 1);
        if (l < phi->req()) {
          add_liveout(p, _names.at(phi->in(l)->_idx), first_pass);
        }
      }
    }
    freeset(block);
@ -200,8 +217,11 @@ IndexSet *PhaseLive::getfreeset( ) {
 }
 // Free an IndexSet from a block.
-void PhaseLive::freeset( const Block *p ) {
+void PhaseLive::freeset( Block *p ) {
  IndexSet *f = _deltas[p->_pre_order-1];
  if ( _keep_deltas ) {
    add_livein(p, f);
  }
  f->set_next(_free_IndexSet);
  _free_IndexSet = f;           // Drop onto free list
  _deltas[p->_pre_order-1] = NULL;
@ -249,10 +269,23 @@ void PhaseLive::add_liveout( Block *p, IndexSet *lo, VectorSet &first_pass ) {
  }
 }
 // Add a vector of live-in values to a given blocks live-in set.
 void PhaseLive::add_livein(Block *p, IndexSet *lo) {
  IndexSet *livein = &_livein[p->_pre_order-1];
  IndexSetIterator elements(lo);
  uint r;
  while ((r = elements.next()) != 0) {
    livein->insert(r);         // Then add to live-in set
  }
 }
 #ifndef PRODUCT
 // Dump the live-out set for a block
 void PhaseLive::dump( const Block *b ) const {
  tty->print("Block %d: ",b->_pre_order);
  if ( _keep_deltas ) {
    tty->print("LiveIn: ");  _livein[b->_pre_order-1].dump();
  }
  tty->print("LiveOut: ");  _live[b->_pre_order-1].dump();
  uint cnt = b->number_of_nodes();
  for( uint i=0; i<cnt; i++ ) {
--- a/hotspot/src/share/vm/opto/live.hpp
+++ b/hotspot/src/share/vm/opto/live.hpp
@ -46,7 +46,8 @@ typedef GrowableArray<uint> LRG_List;
 class PhaseLive : public Phase {
  // Array of Sets of values live at the start of a block.
  // Indexed by block pre-order number.
-  IndexSet *_live;
+  IndexSet *_live; // live out
  IndexSet *_livein; // live in
  // Array of Sets of values defined locally in the block
  // Indexed by block pre-order number.
@ -62,15 +63,17 @@ class PhaseLive : public Phase {
  const LRG_List &_names;       // Mapping from Nodes to live ranges
  uint _maxlrg;                 // Largest live-range number
  Arena *_arena;
  bool _keep_deltas;            // Retain live in information
  IndexSet *getset( Block *p );
  IndexSet *getfreeset( );
-  void freeset( const Block *p );
+  void freeset( Block *p );
  void add_liveout( Block *p, uint r, VectorSet &first_pass );
  void add_liveout( Block *p, IndexSet *lo, VectorSet &first_pass );
  void add_livein( Block *p, IndexSet *lo );
 public:
-  PhaseLive(const PhaseCFG &cfg, const LRG_List &names, Arena *arena);
+  PhaseLive(const PhaseCFG &cfg, const LRG_List &names, Arena *arena, bool keep_deltas);
  ~PhaseLive() {}
  // Compute liveness info
  void compute(uint maxlrg);
@ -79,6 +82,7 @@ public:
  // Return the live-out set for this block
  IndexSet *live( const Block * b ) { return &_live[b->_pre_order-1]; }
  IndexSet *livein( const Block * b ) { return &_livein[b->_pre_order - 1]; }
 #ifndef PRODUCT
  void dump( const Block *b ) const;
--- a/hotspot/src/share/vm/opto/matcher.hpp
+++ b/hotspot/src/share/vm/opto/matcher.hpp
@ -269,6 +269,9 @@ public:
  // should generate this one.
  static const bool match_rule_supported(int opcode);
  // Some uarchs have different sized float register resources
  static const int float_pressure(int default_pressure_threshold);
  // Used to determine if we have fast l2f conversion
  // USII has it, USIII doesn't
  static const bool convL2FSupported(void);
--- a/hotspot/src/share/vm/opto/node.hpp
+++ b/hotspot/src/share/vm/opto/node.hpp
@ -674,7 +674,8 @@ public:
    Flag_avoid_back_to_back_after    = Flag_avoid_back_to_back_before << 1,
    Flag_has_call                    = Flag_avoid_back_to_back_after << 1,
    Flag_is_reduction                = Flag_has_call << 1,
-    Flag_is_expensive                = Flag_is_reduction << 1,
+    Flag_is_scheduled                = Flag_is_reduction,
    Flag_is_expensive                = Flag_is_scheduled << 1,
    _max_flags = (Flag_is_expensive << 1) - 1 // allow flags combination
  };
@ -861,6 +862,9 @@ public:
  // It must have the loop's phi as input and provide a def to the phi.
  bool is_reduction() const { return (_flags & Flag_is_reduction) != 0; }
  // Used in lcm to mark nodes that have scheduled
  bool is_scheduled() const { return (_flags & Flag_is_scheduled) != 0; }
 //----------------- Optimization
  // Get the worst-case Type output for this Node.