8134802: LCM register pressure scheduling

Calculate register pressure in a block to help instructions scheduling. Reviewed-by: kvn, dlong
2025-09-24 04:54:40 +02:00 · 2015-09-16 13:16:17 -07:00 · 2015-09-16 13:16:17 -07:00 · d49d1ea740
commit d49d1ea740
parent a402bebf6e
21 changed files with 513 additions and 35 deletions
--- a/hotspot/src/cpu/aarch64/vm/aarch64.ad
+++ b/hotspot/src/cpu/aarch64/vm/aarch64.ad
@ -3024,6 +3024,10 @@ const bool Matcher::match_rule_supported(int opcode) {
  return true;  // Per default match rules are supported.
 }

+const int Matcher::float_pressure(int default_pressure_threshold) {
+  return default_pressure_threshold;
+}
+
 int Matcher::regnum_to_fpu_offset(int regnum)
 {
  Unimplemented();
--- a/hotspot/src/cpu/aarch64/vm/c2_globals_aarch64.hpp
+++ b/hotspot/src/cpu/aarch64/vm/c2_globals_aarch64.hpp
@ -72,6 +72,7 @@ define_pd_global(bool, OptoPeephole,                 true);
 define_pd_global(bool, UseCISCSpill,                 true);
 define_pd_global(bool, OptoScheduling,               false);
 define_pd_global(bool, OptoBundling,                 false);
+define_pd_global(bool, OptoRegScheduling,            false);

 define_pd_global(intx, ReservedCodeCacheSize,        48*M);
 define_pd_global(intx, NonProfiledCodeHeapSize,      21*M);
--- a/hotspot/src/cpu/ppc/vm/c2_globals_ppc.hpp
+++ b/hotspot/src/cpu/ppc/vm/c2_globals_ppc.hpp
@ -60,6 +60,7 @@ define_pd_global(intx, LoopUnrollLimit,              60);
 define_pd_global(bool, OptoPeephole,                 false);
 define_pd_global(bool, UseCISCSpill,                 false);
 define_pd_global(bool, OptoBundling,                 false);
+define_pd_global(bool, OptoRegScheduling,            false);
 // GL:
 // Detected a problem with unscaled compressed oops and
 // narrow_oop_use_complex_address() == false.
--- a/hotspot/src/cpu/ppc/vm/ppc.ad
+++ b/hotspot/src/cpu/ppc/vm/ppc.ad
@ -2064,6 +2064,10 @@ const bool Matcher::match_rule_supported(int opcode) {
  return true;  // Per default match rules are supported.
 }

+const int Matcher::float_pressure(int default_pressure_threshold) {
+  return default_pressure_threshold;
+}
+
 int Matcher::regnum_to_fpu_offset(int regnum) {
  // No user for this method?
  Unimplemented();
--- a/hotspot/src/cpu/sparc/vm/c2_globals_sparc.hpp
+++ b/hotspot/src/cpu/sparc/vm/c2_globals_sparc.hpp
@ -64,6 +64,7 @@ define_pd_global(bool, OptoPeephole,                 false);
 define_pd_global(bool, UseCISCSpill,                 false);
 define_pd_global(bool, OptoBundling,                 false);
 define_pd_global(bool, OptoScheduling,               true);
+define_pd_global(bool, OptoRegScheduling,            false);

 #ifdef _LP64
 // We need to make sure that all generated code is within
--- a/hotspot/src/cpu/sparc/vm/sparc.ad
+++ b/hotspot/src/cpu/sparc/vm/sparc.ad
@ -1860,6 +1860,10 @@ const bool Matcher::match_rule_supported(int opcode) {
  return true;  // Per default match rules are supported.
 }

+const int Matcher::float_pressure(int default_pressure_threshold) {
+  return default_pressure_threshold;
+}
+
 int Matcher::regnum_to_fpu_offset(int regnum) {
  return regnum - 32; // The FP registers are in the second chunk
 }
--- a/hotspot/src/cpu/x86/vm/c2_globals_x86.hpp
+++ b/hotspot/src/cpu/x86/vm/c2_globals_x86.hpp
@ -48,11 +48,11 @@ define_pd_global(intx, CompileThreshold,             10000);

 define_pd_global(intx, OnStackReplacePercentage,     140);
 define_pd_global(intx, ConditionalMoveLimit,         3);
-define_pd_global(intx, FLOATPRESSURE,                6);
 define_pd_global(intx, FreqInlineSize,               325);
 define_pd_global(intx, MinJumpTableSize,             10);
 #ifdef AMD64
 define_pd_global(intx, INTPRESSURE,                  13);
+define_pd_global(intx, FLOATPRESSURE,                14);
 define_pd_global(intx, InteriorEntryAlignment,       16);
 define_pd_global(size_t, NewSizeThreadIncrease,      ScaleForWordSize(4*K));
 define_pd_global(intx, LoopUnrollLimit,              60);
@ -64,6 +64,7 @@ define_pd_global(intx, CodeCacheExpansionSize,       64*K);
 define_pd_global(uint64_t, MaxRAM,                   128ULL*G);
 #else
 define_pd_global(intx, INTPRESSURE,                  6);
+define_pd_global(intx, FLOATPRESSURE,                6);
 define_pd_global(intx, InteriorEntryAlignment,       4);
 define_pd_global(size_t, NewSizeThreadIncrease,      4*K);
 define_pd_global(intx, LoopUnrollLimit,              50);     // Design center runs on 1.3.1
@ -82,6 +83,7 @@ define_pd_global(bool, OptoPeephole,                 true);
 define_pd_global(bool, UseCISCSpill,                 true);
 define_pd_global(bool, OptoScheduling,               false);
 define_pd_global(bool, OptoBundling,                 false);
+define_pd_global(bool, OptoRegScheduling,            true);

 define_pd_global(intx, ReservedCodeCacheSize,        48*M);
 define_pd_global(intx, NonProfiledCodeHeapSize,      21*M);
--- a/hotspot/src/cpu/x86/vm/x86.ad
+++ b/hotspot/src/cpu/x86/vm/x86.ad
@ -1712,6 +1712,18 @@ const bool Matcher::match_rule_supported(int opcode) {
  return ret_value;  // Per default match rules are supported.
 }

+const int Matcher::float_pressure(int default_pressure_threshold) {
+  int float_pressure_threshold = default_pressure_threshold;
+#ifdef _LP64
+  if (UseAVX > 2) {
+    // Increase pressure threshold on machines with AVX3 which have
+    // 2x more XMM registers.
+    float_pressure_threshold = default_pressure_threshold * 2;
+  }
+#endif
+  return float_pressure_threshold;
+}
+
 // Max vector size in bytes. 0 if not supported.
 const int Matcher::vector_width_in_bytes(BasicType bt) {
  assert(is_java_primitive(bt), "only primitive type vectors");
--- a/hotspot/src/share/vm/opto/block.cpp
+++ b/hotspot/src/share/vm/opto/block.cpp
@ -358,6 +358,8 @@ void Block::dump(const PhaseCFG* cfg) const {
 PhaseCFG::PhaseCFG(Arena* arena, RootNode* root, Matcher& matcher)
 : Phase(CFG)
 , _block_arena(arena)
+, _regalloc(NULL)
+, _scheduling_for_pressure(false)
 , _root(root)
 , _matcher(matcher)
 , _node_to_block_mapping(arena)
--- a/hotspot/src/share/vm/opto/block.hpp
+++ b/hotspot/src/share/vm/opto/block.hpp
@ -37,6 +37,7 @@ class MachCallNode;
 class Matcher;
 class RootNode;
 class VectorSet;
+class PhaseChaitin;
 struct Tarjan;

 //------------------------------Block_Array------------------------------------
@ -383,6 +384,12 @@ class PhaseCFG : public Phase {
  // Arena for the blocks to be stored in
  Arena* _block_arena;

+  // Info used for scheduling
+  PhaseChaitin* _regalloc;
+
+  // Register pressure heuristic used?
+  bool _scheduling_for_pressure;
+
  // The matcher for this compilation
  Matcher& _matcher;

@ -433,12 +440,14 @@ class PhaseCFG : public Phase {
  // to late. Helper for schedule_late.
  Block* hoist_to_cheaper_block(Block* LCA, Block* early, Node* self);

-  bool schedule_local(Block* block, GrowableArray<int>& ready_cnt, VectorSet& next_call);
+  bool schedule_local(Block* block, GrowableArray<int>& ready_cnt, VectorSet& next_call, intptr_t* recacl_pressure_nodes);
  void set_next_call(Block* block, Node* n, VectorSet& next_call);
  void needed_for_next_call(Block* block, Node* this_call, VectorSet& next_call);

  // Perform basic-block local scheduling
-  Node* select(Block* block, Node_List& worklist, GrowableArray<int>& ready_cnt, VectorSet& next_call, uint sched_slot);
+  Node* select(Block* block, Node_List& worklist, GrowableArray<int>& ready_cnt, VectorSet& next_call, uint sched_slot,
+               intptr_t* recacl_pressure_nodes);
+  void adjust_register_pressure(Node* n, Block* block, intptr_t *recalc_pressure_nodes, bool finalize_mode);

  // Schedule a call next in the block
  uint sched_call(Block* block, uint node_cnt, Node_List& worklist, GrowableArray<int>& ready_cnt, MachCallNode* mcall, VectorSet& next_call);
--- a/hotspot/src/share/vm/opto/c2_globals.hpp
+++ b/hotspot/src/share/vm/opto/c2_globals.hpp
@ -290,6 +290,9 @@
  product_pd(bool, OptoScheduling,                                          \
          "Instruction Scheduling after register allocation")               \
                                                                            \
+  product_pd(bool, OptoRegScheduling,                                       \
+          "Instruction Scheduling before register allocation for pressure") \
+                                                                            \
  product(bool, PartialPeelLoop, true,                                      \
          "Partial peel (rotate) loops")                                    \
                                                                            \
--- a/hotspot/src/share/vm/opto/chaitin.cpp
+++ b/hotspot/src/share/vm/opto/chaitin.cpp
@ -191,7 +191,7 @@ uint LiveRangeMap::find_const(uint lrg) const {
  return next;
 }

-PhaseChaitin::PhaseChaitin(uint unique, PhaseCFG &cfg, Matcher &matcher)
+PhaseChaitin::PhaseChaitin(uint unique, PhaseCFG &cfg, Matcher &matcher, bool scheduling_info_generated)
  : PhaseRegAlloc(unique, cfg, matcher,
 #ifndef PRODUCT
       print_chaitin_statistics
@ -205,6 +205,11 @@ PhaseChaitin::PhaseChaitin(uint unique, PhaseCFG &cfg, Matcher &matcher)
  , _spilled_twice(Thread::current()->resource_area())
  , _lo_degree(0), _lo_stk_degree(0), _hi_degree(0), _simplified(0)
  , _oldphi(unique)
+  , _scheduling_info_generated(scheduling_info_generated)
+  , _sched_int_pressure(0, INTPRESSURE)
+  , _sched_float_pressure(0, FLOATPRESSURE)
+  , _scratch_int_pressure(0, INTPRESSURE)
+  , _scratch_float_pressure(0, FLOATPRESSURE)
 #ifndef PRODUCT
  , _trace_spilling(TraceSpilling || C->method_has_option("TraceSpilling"))
 #endif
@ -350,7 +355,7 @@ void PhaseChaitin::Register_Allocate() {
  // all copy-related live ranges low and then using the max copy-related
  // live range as a cut-off for LIVE and the IFG.  In other words, I can
  // build a subset of LIVE and IFG just for copies.
-  PhaseLive live(_cfg, _lrg_map.names(), &live_arena);
+  PhaseLive live(_cfg, _lrg_map.names(), &live_arena, false);

  // Need IFG for coalescing and coloring
  PhaseIFG ifg(&live_arena);
@ -690,6 +695,29 @@ void PhaseChaitin::de_ssa() {
  _lrg_map.reset_uf_map(lr_counter);
 }

+void PhaseChaitin::mark_ssa() {
+  // Use ssa names to populate the live range maps or if no mask
+  // is available, use the 0 entry.
+  uint max_idx = 0;
+  for ( uint i = 0; i < _cfg.number_of_blocks(); i++ ) {
+    Block* block = _cfg.get_block(i);
+    uint cnt = block->number_of_nodes();
+
+    // Handle all the normal Nodes in the block
+    for ( uint j = 0; j < cnt; j++ ) {
+      Node *n = block->get_node(j);
+      // Pre-color to the zero live range, or pick virtual register
+      const RegMask &rm = n->out_RegMask();
+      _lrg_map.map(n->_idx, rm.is_NotEmpty() ? n->_idx : 0);
+      max_idx = (n->_idx > max_idx) ? n->_idx : max_idx;
+    }
+  }
+  _lrg_map.set_max_lrg_id(max_idx+1);
+
+  // Reset the Union-Find mapping to be identity
+  _lrg_map.reset_uf_map(max_idx+1);
+}
+

 // Gather LiveRanGe information, including register masks.  Modification of
 // cisc spillable in_RegMasks should not be done before AggressiveCoalesce.
@ -707,7 +735,9 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) {
    for (uint j = 1; j < block->number_of_nodes(); j++) {
      Node* n = block->get_node(j);
      uint input_edge_start =1; // Skip control most nodes
+      bool is_machine_node = false;
      if (n->is_Mach()) {
+        is_machine_node = true;
        input_edge_start = n->as_Mach()->oper_input_base();
      }
      uint idx = n->is_Copy();
@ -929,6 +959,7 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) {
          // Convert operand number to edge index number
          inp = n->as_Mach()->operand_index(inp);
      }
+
      // Prepare register mask for each input
      for( uint k = input_edge_start; k < cnt; k++ ) {
        uint vreg = _lrg_map.live_range_id(n->in(k));
@ -948,6 +979,12 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) {
          n->as_Mach()->use_cisc_RegMask();
        }

+        if (is_machine_node && _scheduling_info_generated) {
+          MachNode* cur_node = n->as_Mach();
+          // this is cleaned up by register allocation
+          if (k >= cur_node->num_opnds()) continue;
+        }
+
        LRG &lrg = lrgs(vreg);
        // // Testing for floating point code shape
        // Node *test = n->in(k);
@ -989,7 +1026,7 @@ void PhaseChaitin::gather_lrg_masks( bool after_aggressive ) {
        // double can interfere with TWO aligned pairs, or effectively
        // FOUR registers!
 #ifdef ASSERT
-        if (is_vect) {
+        if (is_vect && !_scheduling_info_generated) {
          if (lrg.num_regs() != 0) {
            assert(lrgmask.is_aligned_sets(lrg.num_regs()), "vector should be aligned");
            assert(!lrg._fat_proj, "sanity");
--- a/hotspot/src/share/vm/opto/chaitin.hpp
+++ b/hotspot/src/share/vm/opto/chaitin.hpp
@ -399,7 +399,6 @@ class PhaseChaitin : public PhaseRegAlloc {
  int _trip_cnt;
  int _alternate;

-  LRG &lrgs(uint idx) const { return _ifg->lrgs(idx); }
  PhaseLive *_live;             // Liveness, used in the interference graph
  PhaseIFG *_ifg;               // Interference graph (for original chunk)
  Node_List **_lrg_nodes;       // Array of node; lists for lrgs which spill
@ -464,16 +463,28 @@ class PhaseChaitin : public PhaseRegAlloc {
 #endif

 public:
-  PhaseChaitin( uint unique, PhaseCFG &cfg, Matcher &matcher );
+  PhaseChaitin(uint unique, PhaseCFG &cfg, Matcher &matcher, bool track_liveout_pressure);
  ~PhaseChaitin() {}

  LiveRangeMap _lrg_map;

+  LRG &lrgs(uint idx) const { return _ifg->lrgs(idx); }
+
  // Do all the real work of allocate
  void Register_Allocate();

  float high_frequency_lrg() const { return _high_frequency_lrg; }

+  // Used when scheduling info generated, not in general register allocation
+  bool _scheduling_info_generated;
+
+  void set_ifg(PhaseIFG &ifg) { _ifg = &ifg;  }
+  void set_live(PhaseLive &live) { _live = &live; }
+  PhaseLive* get_live() { return _live; }
+
+  // Populate the live range maps with ssa info for scheduling
+  void mark_ssa();
+
 #ifndef PRODUCT
  bool trace_spilling() const { return _trace_spilling; }
 #endif
@ -516,7 +527,11 @@ private:
      uint _final_pressure;

      // number of live ranges that constitute high register pressure
-      const uint _high_pressure_limit;
+      uint _high_pressure_limit;
+
+      // initial pressure observed
+      uint _start_pressure;
+
    public:

      // lower the register pressure and look for a low to high pressure
@ -537,6 +552,14 @@ private:
        }
      }

+      void init(int limit) {
+        _current_pressure = 0;
+        _high_pressure_index = 0;
+        _final_pressure = 0;
+        _high_pressure_limit = limit;
+        _start_pressure = 0;
+      }
+
      uint high_pressure_index() const {
        return _high_pressure_index;
      }
@ -545,6 +568,10 @@ private:
        return _final_pressure;
      }

+      uint start_pressure() const {
+        return _start_pressure;
+      }
+
      uint current_pressure() const {
        return _current_pressure;
      }
@ -561,6 +588,15 @@ private:
        _high_pressure_index = 0;
      }

+      void set_start_pressure(int value) {
+        _start_pressure = value;
+        _final_pressure = value;
+      }
+
+      void set_current_pressure(int value) {
+        _current_pressure = value;
+      }
+
      void check_pressure_at_fatproj(uint fatproj_location, RegMask& fatproj_mask) {
        // this pressure is only valid at this instruction, i.e. we don't need to lower
        // the register pressure since the fat proj was never live before (going backwards)
@ -579,12 +615,11 @@ private:
      Pressure(uint high_pressure_index, uint high_pressure_limit)
        : _current_pressure(0)
        , _high_pressure_index(high_pressure_index)
+        , _final_pressure(0)
        , _high_pressure_limit(high_pressure_limit)
-      , _final_pressure(0) {}
+        , _start_pressure(0) {}
  };

-  void lower_pressure(Block* b, uint location, LRG& lrg, IndexSet* liveout, Pressure& int_pressure, Pressure& float_pressure);
-  void raise_pressure(Block* b, LRG& lrg, Pressure& int_pressure, Pressure& float_pressure);
  void check_for_high_pressure_transition_at_fatproj(uint& block_reg_pressure, uint location, LRG& lrg, Pressure& pressure, const int op_regtype);
  void add_input_to_liveout(Block* b, Node* n, IndexSet* liveout, double cost, Pressure& int_pressure, Pressure& float_pressure);
  void compute_initial_block_pressure(Block* b, IndexSet* liveout, Pressure& int_pressure, Pressure& float_pressure, double cost);
@ -600,10 +635,25 @@ private:
  // acceptable register sets do not overlap, then they do not interfere.
  uint build_ifg_physical( ResourceArea *a );

+public:
  // Gather LiveRanGe information, including register masks and base pointer/
  // derived pointer relationships.
  void gather_lrg_masks( bool mod_cisc_masks );

+  // user visible pressure variables for scheduling
+  Pressure _sched_int_pressure;
+  Pressure _sched_float_pressure;
+  Pressure _scratch_int_pressure;
+  Pressure _scratch_float_pressure;
+
+  // Pressure functions for user context
+  void lower_pressure(Block* b, uint location, LRG& lrg, IndexSet* liveout, Pressure& int_pressure, Pressure& float_pressure);
+  void raise_pressure(Block* b, LRG& lrg, Pressure& int_pressure, Pressure& float_pressure);
+  void compute_entry_block_pressure(Block* b);
+  void compute_exit_block_pressure(Block* b);
+  void print_pressure_info(Pressure& pressure, const char *str);
+
+private:
  // Force the bases of derived pointers to be alive at GC points.
  bool stretch_base_pointer_live_ranges( ResourceArea *a );
  // Helper to stretch above; recursively discover the base Node for
--- a/hotspot/src/share/vm/opto/compile.cpp
+++ b/hotspot/src/share/vm/opto/compile.cpp
@ -2336,7 +2336,7 @@ void Compile::Code_Gen() {
    debug_only( cfg.verify(); )
  }

-  PhaseChaitin regalloc(unique(), cfg, matcher);
+  PhaseChaitin regalloc(unique(), cfg, matcher, false);
  _regalloc = &regalloc;
  {
    TracePhase tp("regalloc", &timers[_t_registerAllocation]);
--- a/hotspot/src/share/vm/opto/gcm.cpp
+++ b/hotspot/src/share/vm/opto/gcm.cpp
@ -34,6 +34,7 @@
 #include "opto/phaseX.hpp"
 #include "opto/rootnode.hpp"
 #include "opto/runtime.hpp"
+#include "opto/chaitin.hpp"
 #include "runtime/deoptimization.hpp"

 // Portions of code courtesy of Clifford Click
@ -1363,6 +1364,44 @@ void PhaseCFG::global_code_motion() {
    }
  }

+  bool block_size_threshold_ok = false;
+  intptr_t *recalc_pressure_nodes = NULL;
+  if (OptoRegScheduling) {
+    for (uint i = 0; i < number_of_blocks(); i++) {
+      Block* block = get_block(i);
+      if (block->number_of_nodes() > 10) {
+        block_size_threshold_ok = true;
+        break;
+      }
+    }
+  }
+
+  // Enabling the scheduler for register pressure plus finding blocks of size to schedule for it
+  // is key to enabling this feature.
+  PhaseChaitin regalloc(C->unique(), *this, _matcher, true);
+  ResourceArea live_arena;      // Arena for liveness
+  ResourceMark rm_live(&live_arena);
+  PhaseLive live(*this, regalloc._lrg_map.names(), &live_arena, true);
+  PhaseIFG ifg(&live_arena);
+  if (OptoRegScheduling && block_size_threshold_ok) {
+    regalloc.mark_ssa();
+    Compile::TracePhase tp("computeLive", &timers[_t_computeLive]);
+    rm_live.reset_to_mark();           // Reclaim working storage
+    IndexSet::reset_memory(C, &live_arena);
+    uint node_size = regalloc._lrg_map.max_lrg_id();
+    ifg.init(node_size); // Empty IFG
+    regalloc.set_ifg(ifg);
+    regalloc.set_live(live);
+    regalloc.gather_lrg_masks(false);    // Collect LRG masks
+    live.compute(node_size); // Compute liveness
+
+    recalc_pressure_nodes = NEW_RESOURCE_ARRAY(intptr_t, node_size);
+    for (uint i = 0; i < node_size; i++) {
+      recalc_pressure_nodes[i] = 0;
+    }
+  }
+  _regalloc = &regalloc;
+
 #ifndef PRODUCT
  if (trace_opto_pipelining()) {
    tty->print("\n---- Start Local Scheduling ----\n");
@ -1375,13 +1414,15 @@ void PhaseCFG::global_code_motion() {
  visited.Clear();
  for (uint i = 0; i < number_of_blocks(); i++) {
    Block* block = get_block(i);
-    if (!schedule_local(block, ready_cnt, visited)) {
+    if (!schedule_local(block, ready_cnt, visited, recalc_pressure_nodes)) {
      if (!C->failure_reason_is(C2Compiler::retry_no_subsuming_loads())) {
        C->record_method_not_compilable("local schedule failed");
      }
+      _regalloc = NULL;
      return;
    }
  }
+  _regalloc = NULL;

  // If we inserted any instructions between a Call and his CatchNode,
  // clone the instructions on all paths below the Catch.
--- a/hotspot/src/share/vm/opto/ifg.cpp
+++ b/hotspot/src/share/vm/opto/ifg.cpp
@ -439,9 +439,11 @@ void PhaseChaitin::lower_pressure(Block* b, uint location, LRG& lrg, IndexSet* l
      }
    }
  }
+  if (_scheduling_info_generated == false) {
    assert(int_pressure.current_pressure() == count_int_pressure(liveout), "the int pressure is incorrect");
    assert(float_pressure.current_pressure() == count_float_pressure(liveout), "the float pressure is incorrect");
  }
+}

 /* Go to the first non-phi index in a block */
 static uint first_nonphi_index(Block* b) {
@ -517,6 +519,58 @@ void PhaseChaitin::compute_initial_block_pressure(Block* b, IndexSet* liveout, P
  assert(float_pressure.current_pressure() == count_float_pressure(liveout), "the float pressure is incorrect");
 }

+/*
+* Computes the entry register pressure of a block, looking at all live
+* ranges in the livein. The register pressure is computed for both float
+* and int/pointer registers.
+*/
+void PhaseChaitin::compute_entry_block_pressure(Block* b) {
+  IndexSet* livein = _live->livein(b);
+  IndexSetIterator elements(livein);
+  uint lid = elements.next();
+  while (lid != 0) {
+    LRG& lrg = lrgs(lid);
+    raise_pressure(b, lrg, _sched_int_pressure, _sched_float_pressure);
+    lid = elements.next();
+  }
+  // Now check phis for locally defined inputs
+  for (uint j = 0; j < b->number_of_nodes(); j++) {
+    Node* n = b->get_node(j);
+    if (n->is_Phi()) {
+      for (uint k = 1; k < n->req(); k++) {
+        Node* phi_in = n->in(k);
+        // Because we are talking about phis, raise register pressure once for each
+        // instance of a phi to account for a single value
+        if (_cfg.get_block_for_node(phi_in) == b) {
+          LRG& lrg = lrgs(phi_in->_idx);
+          raise_pressure(b, lrg, _sched_int_pressure, _sched_float_pressure);
+          break;
+        }
+      }
+    }
+  }
+  _sched_int_pressure.set_start_pressure(_sched_int_pressure.current_pressure());
+  _sched_float_pressure.set_start_pressure(_sched_float_pressure.current_pressure());
+}
+
+/*
+* Computes the exit register pressure of a block, looking at all live
+* ranges in the liveout. The register pressure is computed for both float
+* and int/pointer registers.
+*/
+void PhaseChaitin::compute_exit_block_pressure(Block* b) {
+  IndexSet* livein = _live->live(b);
+  IndexSetIterator elements(livein);
+  _sched_int_pressure.set_current_pressure(0);
+  _sched_float_pressure.set_current_pressure(0);
+  uint lid = elements.next();
+  while (lid != 0) {
+    LRG& lrg = lrgs(lid);
+    raise_pressure(b, lrg, _sched_int_pressure, _sched_float_pressure);
+    lid = elements.next();
+  }
+}
+
 /*
 * Remove dead node if it's not used.
 * We only remove projection nodes if the node "defining" the projection is
@ -737,6 +791,16 @@ void PhaseChaitin::adjust_high_pressure_index(Block* b, uint& block_hrp_index, P
  block_hrp_index = i;
 }

+void PhaseChaitin::print_pressure_info(Pressure& pressure, const char *str) {
+  if (str != NULL) {
+    tty->print_cr("#  *** %s ***", str);
+  }
+  tty->print_cr("#     start pressure is = %d", pressure.start_pressure());
+  tty->print_cr("#     max pressure is = %d", pressure.final_pressure());
+  tty->print_cr("#     end pressure is = %d", pressure.current_pressure());
+  tty->print_cr("#");
+}
+
 /* Build an interference graph:
 *   That is, if 2 live ranges are simultaneously alive but in their acceptable
 *   register sets do not overlap, then they do not interfere. The IFG is built
--- a/hotspot/src/share/vm/opto/lcm.cpp
+++ b/hotspot/src/share/vm/opto/lcm.cpp
@ -31,6 +31,7 @@
 #include "opto/cfgnode.hpp"
 #include "opto/machnode.hpp"
 #include "opto/runtime.hpp"
+#include "opto/chaitin.hpp"
 #include "runtime/sharedRuntime.hpp"

 // Optimization - Graph Style
@ -443,7 +444,13 @@ void PhaseCFG::implicit_null_check(Block* block, Node *proj, Node *val, int allo
 // remaining cases (most), choose the instruction with the greatest latency
 // (that is, the most number of pseudo-cycles required to the end of the
 // routine). If there is a tie, choose the instruction with the most inputs.
-Node* PhaseCFG::select(Block* block, Node_List &worklist, GrowableArray<int> &ready_cnt, VectorSet &next_call, uint sched_slot) {
+Node* PhaseCFG::select(
+  Block* block,
+  Node_List &worklist,
+  GrowableArray<int> &ready_cnt,
+  VectorSet &next_call,
+  uint sched_slot,
+  intptr_t* recalc_pressure_nodes) {

  // If only a single entry on the stack, use it
  uint cnt = worklist.size();
@ -458,6 +465,7 @@ Node* PhaseCFG::select(Block* block, Node_List &worklist, GrowableArray<int> &re
  uint score   = 0; // Bigger is better
  int idx = -1;     // Index in worklist
  int cand_cnt = 0; // Candidate count
+  bool block_size_threshold_ok = (block->number_of_nodes() > 10) ? true : false;

  for( uint i=0; i<cnt; i++ ) { // Inspect entire worklist
    // Order in worklist is used to break ties.
@ -539,6 +547,46 @@ Node* PhaseCFG::select(Block* block, Node_List &worklist, GrowableArray<int> &re
    uint n_latency = get_latency_for_node(n);
    uint n_score = n->req();   // Many inputs get high score to break ties

+    if (OptoRegScheduling && block_size_threshold_ok) {
+      if (recalc_pressure_nodes[n->_idx] == 0x7fff7fff) {
+        _regalloc->_scratch_int_pressure.init(_regalloc->_sched_int_pressure.high_pressure_limit());
+        _regalloc->_scratch_float_pressure.init(_regalloc->_sched_float_pressure.high_pressure_limit());
+        // simulate the notion that we just picked this node to schedule
+        n->add_flag(Node::Flag_is_scheduled);
+        // now caculate its effect upon the graph if we did
+        adjust_register_pressure(n, block, recalc_pressure_nodes, false);
+        // return its state for finalize in case somebody else wins
+        n->remove_flag(Node::Flag_is_scheduled);
+        // now save the two final pressure components of register pressure, limiting pressure calcs to short size
+        short int_pressure = (short)_regalloc->_scratch_int_pressure.current_pressure();
+        short float_pressure = (short)_regalloc->_scratch_float_pressure.current_pressure();
+        recalc_pressure_nodes[n->_idx] = int_pressure;
+        recalc_pressure_nodes[n->_idx] |= (float_pressure << 16);
+      }
+
+      if (_scheduling_for_pressure) {
+        latency = n_latency;
+        if (n_choice != 3) {
+          // Now evaluate each register pressure component based on threshold in the score.
+          // In general the defining register type will dominate the score, ergo we will not see register pressure grow on both banks
+          // on a single instruction, but we might see it shrink on both banks.
+          // For each use of register that has a register class that is over the high pressure limit, we build n_score up for
+          // live ranges that terminate on this instruction.
+          if (_regalloc->_sched_int_pressure.current_pressure() > _regalloc->_sched_int_pressure.high_pressure_limit()) {
+            short int_pressure = (short)recalc_pressure_nodes[n->_idx];
+            n_score = (int_pressure < 0) ? ((score + n_score) - int_pressure) : (int_pressure > 0) ? 1 : n_score;
+          }
+          if (_regalloc->_sched_float_pressure.current_pressure() > _regalloc->_sched_float_pressure.high_pressure_limit()) {
+            short float_pressure = (short)(recalc_pressure_nodes[n->_idx] >> 16);
+            n_score = (float_pressure < 0) ? ((score + n_score) - float_pressure) : (float_pressure > 0) ? 1 : n_score;
+          }
+        } else {
+          // make sure we choose these candidates
+          score = 0;
+        }
+      }
+    }
+
    // Keep best latency found
    cand_cnt++;
    if (choice < n_choice ||
@ -562,6 +610,100 @@ Node* PhaseCFG::select(Block* block, Node_List &worklist, GrowableArray<int> &re
  return n;
 }

+//-------------------------adjust_register_pressure----------------------------
+void PhaseCFG::adjust_register_pressure(Node* n, Block* block, intptr_t* recalc_pressure_nodes, bool finalize_mode) {
+  PhaseLive* liveinfo = _regalloc->get_live();
+  IndexSet* liveout = liveinfo->live(block);
+  // first adjust the register pressure for the sources
+  for (uint i = 1; i < n->req(); i++) {
+    bool lrg_ends = false;
+    Node *src_n = n->in(i);
+    if (src_n == NULL) continue;
+    if (!src_n->is_Mach()) continue;
+    uint src = _regalloc->_lrg_map.find(src_n);
+    if (src == 0) continue;
+    LRG& lrg_src = _regalloc->lrgs(src);
+    // detect if the live range ends or not
+    if (liveout->member(src) == false) {
+      lrg_ends = true;
+      for (DUIterator_Fast jmax, j = src_n->fast_outs(jmax); j < jmax; j++) {
+        Node* m = src_n->fast_out(j); // Get user
+        if (m == n) continue;
+        if (!m->is_Mach()) continue;
+        MachNode *mach = m->as_Mach();
+        bool src_matches = false;
+        int iop = mach->ideal_Opcode();
+
+        switch (iop) {
+        case Op_StoreB:
+        case Op_StoreC:
+        case Op_StoreCM:
+        case Op_StoreD:
+        case Op_StoreF:
+        case Op_StoreI:
+        case Op_StoreL:
+        case Op_StoreP:
+        case Op_StoreN:
+        case Op_StoreVector:
+        case Op_StoreNKlass:
+          for (uint k = 1; k < m->req(); k++) {
+            Node *in = m->in(k);
+            if (in == src_n) {
+              src_matches = true;
+              break;
+            }
+          }
+          break;
+
+        default:
+          src_matches = true;
+          break;
+        }
+
+        // If we have a store as our use, ignore the non source operands
+        if (src_matches == false) continue;
+
+        // Mark every unscheduled use which is not n with a recalculation
+        if ((get_block_for_node(m) == block) && (!m->is_scheduled())) {
+          if (finalize_mode && !m->is_Phi()) {
+            recalc_pressure_nodes[m->_idx] = 0x7fff7fff;
+          }
+          lrg_ends = false;
+        }
+      }
+    }
+    // if none, this live range ends and we can adjust register pressure
+    if (lrg_ends) {
+      if (finalize_mode) {
+        _regalloc->lower_pressure(block, 0, lrg_src, NULL, _regalloc->_sched_int_pressure, _regalloc->_sched_float_pressure);
+      } else {
+        _regalloc->lower_pressure(block, 0, lrg_src, NULL, _regalloc->_scratch_int_pressure, _regalloc->_scratch_float_pressure);
+      }
+    }
+  }
+
+  // now add the register pressure from the dest and evaluate which heuristic we should use:
+  // 1.) The default, latency scheduling
+  // 2.) Register pressure scheduling based on the high pressure limit threshold for int or float register stacks
+  uint dst = _regalloc->_lrg_map.find(n);
+  if (dst != 0) {
+    LRG& lrg_dst = _regalloc->lrgs(dst);
+    if (finalize_mode) {
+      _regalloc->raise_pressure(block, lrg_dst, _regalloc->_sched_int_pressure, _regalloc->_sched_float_pressure);
+      // check to see if we fall over the register pressure cliff here
+      if (_regalloc->_sched_int_pressure.current_pressure() > _regalloc->_sched_int_pressure.high_pressure_limit()) {
+        _scheduling_for_pressure = true;
+      } else if (_regalloc->_sched_float_pressure.current_pressure() > _regalloc->_sched_float_pressure.high_pressure_limit()) {
+        _scheduling_for_pressure = true;
+      } else {
+        // restore latency scheduling mode
+        _scheduling_for_pressure = false;
+      }
+    } else {
+      _regalloc->raise_pressure(block, lrg_dst, _regalloc->_scratch_int_pressure, _regalloc->_scratch_float_pressure);
+    }
+  }
+}

 //------------------------------set_next_call----------------------------------
 void PhaseCFG::set_next_call(Block* block, Node* n, VectorSet& next_call) {
@ -711,7 +853,7 @@ uint PhaseCFG::sched_call(Block* block, uint node_cnt, Node_List& worklist, Grow

 //------------------------------schedule_local---------------------------------
 // Topological sort within a block.  Someday become a real scheduler.
-bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, VectorSet& next_call) {
+bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, VectorSet& next_call, intptr_t *recalc_pressure_nodes) {
  // Already "sorted" are the block start Node (as the first entry), and
  // the block-ending Node and any trailing control projections.  We leave
  // these alone.  PhiNodes and ParmNodes are made to follow the block start
@ -733,10 +875,24 @@ bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, Vecto
    return true;
  }

+  bool block_size_threshold_ok = (block->number_of_nodes() > 10) ? true : false;
+
+  // We track the uses of local definitions as input dependences so that
+  // we know when a given instruction is avialable to be scheduled.
+  uint i;
+  if (OptoRegScheduling && block_size_threshold_ok) {
+    for (i = 1; i < block->number_of_nodes(); i++) { // setup nodes for pressure calc
+      Node *n = block->get_node(i);
+      n->remove_flag(Node::Flag_is_scheduled);
+      if (!n->is_Phi()) {
+        recalc_pressure_nodes[n->_idx] = 0x7fff7fff;
+      }
+    }
+  }
+
  // Move PhiNodes and ParmNodes from 1 to cnt up to the start
  uint node_cnt = block->end_idx();
  uint phi_cnt = 1;
-  uint i;
  for( i = 1; i<node_cnt; i++ ) { // Scan for Phi
    Node *n = block->get_node(i);
    if( n->is_Phi() ||          // Found a PhiNode or ParmNode
@ -744,6 +900,10 @@ bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, Vecto
      // Move guy at 'phi_cnt' to the end; makes a hole at phi_cnt
      block->map_node(block->get_node(phi_cnt), i);
      block->map_node(n, phi_cnt++);  // swap Phi/Parm up front
+      if (OptoRegScheduling && block_size_threshold_ok) {
+        // mark n as scheduled
+        n->add_flag(Node::Flag_is_scheduled);
+      }
    } else {                    // All others
      // Count block-local inputs to 'n'
      uint cnt = n->len();      // Input count
@ -797,6 +957,12 @@ bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, Vecto
      Node* m = n->fast_out(j);
      if (get_block_for_node(m) == block) { // Local-block user
        int m_cnt = ready_cnt.at(m->_idx)-1;
+        if (OptoRegScheduling && block_size_threshold_ok) {
+          // mark m as scheduled
+          if (m_cnt < 0) {
+            m->add_flag(Node::Flag_is_scheduled);
+          }
+        }
        ready_cnt.at_put(m->_idx, m_cnt);   // Fix ready count
      }
    }
@ -827,6 +993,18 @@ bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, Vecto
    worklist.push(d);
  }

+  if (OptoRegScheduling && block_size_threshold_ok) {
+    // To stage register pressure calculations we need to examine the live set variables
+    // breaking them up by register class to compartmentalize the calculations.
+    uint float_pressure = Matcher::float_pressure(FLOATPRESSURE);
+    _regalloc->_sched_int_pressure.init(INTPRESSURE);
+    _regalloc->_sched_float_pressure.init(float_pressure);
+    _regalloc->_scratch_int_pressure.init(INTPRESSURE);
+    _regalloc->_scratch_float_pressure.init(float_pressure);
+
+    _regalloc->compute_entry_block_pressure(block);
+  }
+
  // Warm up the 'next_call' heuristic bits
  needed_for_next_call(block, block->head(), next_call);

@ -858,9 +1036,18 @@ bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, Vecto
 #endif

    // Select and pop a ready guy from worklist
-    Node* n = select(block, worklist, ready_cnt, next_call, phi_cnt);
+    Node* n = select(block, worklist, ready_cnt, next_call, phi_cnt, recalc_pressure_nodes);
    block->map_node(n, phi_cnt++);    // Schedule him next

+    if (OptoRegScheduling && block_size_threshold_ok) {
+      n->add_flag(Node::Flag_is_scheduled);
+
+      // Now adjust the resister pressure with the node we selected
+      if (!n->is_Phi()) {
+        adjust_register_pressure(n, block, recalc_pressure_nodes, true);
+      }
+    }
+
 #ifndef PRODUCT
    if (trace_opto_pipelining()) {
      tty->print("#    select %d: %s", n->_idx, n->Name());
@ -925,6 +1112,12 @@ bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, Vecto
    return false;
  }

+  if (OptoRegScheduling && block_size_threshold_ok) {
+    _regalloc->compute_exit_block_pressure(block);
+    block->_reg_pressure = _regalloc->_sched_int_pressure.final_pressure();
+    block->_freg_pressure = _regalloc->_sched_float_pressure.final_pressure();
+  }
+
 #ifndef PRODUCT
  if (trace_opto_pipelining()) {
    tty->print_cr("#");
@ -933,11 +1126,17 @@ bool PhaseCFG::schedule_local(Block* block, GrowableArray<int>& ready_cnt, Vecto
      tty->print("# ");
      block->get_node(i)->fast_dump();
    }
+    tty->print_cr("# ");
+
+    if (OptoRegScheduling && block_size_threshold_ok) {
+      tty->print_cr("# pressure info : %d", block->_pre_order);
+      _regalloc->print_pressure_info(_regalloc->_sched_int_pressure, "int register info");
+      _regalloc->print_pressure_info(_regalloc->_sched_float_pressure, "float register info");
+    }
    tty->cr();
  }
 #endif

-
  return true;
 }

--- a/hotspot/src/share/vm/opto/live.cpp
+++ b/hotspot/src/share/vm/opto/live.cpp
@ -41,7 +41,14 @@
 // block is put on the worklist.
 //   The locally live-in stuff is computed once and added to predecessor
 // live-out sets.  This separate compilation is done in the outer loop below.
-PhaseLive::PhaseLive( const PhaseCFG &cfg, const LRG_List &names, Arena *arena ) : Phase(LIVE), _cfg(cfg), _names(names), _arena(arena), _live(0) {
+PhaseLive::PhaseLive(const PhaseCFG &cfg, const LRG_List &names, Arena *arena, bool keep_deltas)
+  : Phase(LIVE),
+  _cfg(cfg),
+  _names(names),
+  _arena(arena),
+  _live(0),
+  _livein(0),
+  _keep_deltas(keep_deltas) {
 }

 void PhaseLive::compute(uint maxlrg) {
@ -56,6 +63,13 @@ void PhaseLive::compute(uint maxlrg) {
    _live[i].initialize(_maxlrg);
  }

+  if (_keep_deltas) {
+    _livein = (IndexSet*)_arena->Amalloc(sizeof(IndexSet) * _cfg.number_of_blocks());
+    for (i = 0; i < _cfg.number_of_blocks(); i++) {
+      _livein[i].initialize(_maxlrg);
+    }
+  }
+
  // Init the sparse arrays for delta-sets.
  ResourceMark rm;              // Nuke temp storage on exit

@ -124,7 +138,10 @@ void PhaseLive::compute(uint maxlrg) {

      // PhiNode uses go in the live-out set of prior blocks.
      for (uint k = i; k > 0; k--) {
-        add_liveout(p, _names.at(block->get_node(k-1)->in(l)->_idx), first_pass);
+        Node *phi = block->get_node(k - 1);
+        if (l < phi->req()) {
+          add_liveout(p, _names.at(phi->in(l)->_idx), first_pass);
+        }
      }
    }
    freeset(block);
@ -200,8 +217,11 @@ IndexSet *PhaseLive::getfreeset( ) {
 }

 // Free an IndexSet from a block.
-void PhaseLive::freeset( const Block *p ) {
+void PhaseLive::freeset( Block *p ) {
  IndexSet *f = _deltas[p->_pre_order-1];
+  if ( _keep_deltas ) {
+    add_livein(p, f);
+  }
  f->set_next(_free_IndexSet);
  _free_IndexSet = f;           // Drop onto free list
  _deltas[p->_pre_order-1] = NULL;
@ -249,10 +269,23 @@ void PhaseLive::add_liveout( Block *p, IndexSet *lo, VectorSet &first_pass ) {
  }
 }

+// Add a vector of live-in values to a given blocks live-in set.
+void PhaseLive::add_livein(Block *p, IndexSet *lo) {
+  IndexSet *livein = &_livein[p->_pre_order-1];
+  IndexSetIterator elements(lo);
+  uint r;
+  while ((r = elements.next()) != 0) {
+    livein->insert(r);         // Then add to live-in set
+  }
+}
+
 #ifndef PRODUCT
 // Dump the live-out set for a block
 void PhaseLive::dump( const Block *b ) const {
  tty->print("Block %d: ",b->_pre_order);
+  if ( _keep_deltas ) {
+    tty->print("LiveIn: ");  _livein[b->_pre_order-1].dump();
+  }
  tty->print("LiveOut: ");  _live[b->_pre_order-1].dump();
  uint cnt = b->number_of_nodes();
  for( uint i=0; i<cnt; i++ ) {
--- a/hotspot/src/share/vm/opto/live.hpp
+++ b/hotspot/src/share/vm/opto/live.hpp
@ -46,7 +46,8 @@ typedef GrowableArray<uint> LRG_List;
 class PhaseLive : public Phase {
  // Array of Sets of values live at the start of a block.
  // Indexed by block pre-order number.
-  IndexSet *_live;
+  IndexSet *_live; // live out
+  IndexSet *_livein; // live in

  // Array of Sets of values defined locally in the block
  // Indexed by block pre-order number.
@ -62,15 +63,17 @@ class PhaseLive : public Phase {
  const LRG_List &_names;       // Mapping from Nodes to live ranges
  uint _maxlrg;                 // Largest live-range number
  Arena *_arena;
+  bool _keep_deltas;            // Retain live in information

  IndexSet *getset( Block *p );
  IndexSet *getfreeset( );
-  void freeset( const Block *p );
+  void freeset( Block *p );
  void add_liveout( Block *p, uint r, VectorSet &first_pass );
  void add_liveout( Block *p, IndexSet *lo, VectorSet &first_pass );
+  void add_livein( Block *p, IndexSet *lo );

 public:
-  PhaseLive(const PhaseCFG &cfg, const LRG_List &names, Arena *arena);
+  PhaseLive(const PhaseCFG &cfg, const LRG_List &names, Arena *arena, bool keep_deltas);
  ~PhaseLive() {}
  // Compute liveness info
  void compute(uint maxlrg);
@ -79,6 +82,7 @@ public:

  // Return the live-out set for this block
  IndexSet *live( const Block * b ) { return &_live[b->_pre_order-1]; }
+  IndexSet *livein( const Block * b ) { return &_livein[b->_pre_order - 1]; }

 #ifndef PRODUCT
  void dump( const Block *b ) const;
--- a/hotspot/src/share/vm/opto/matcher.hpp
+++ b/hotspot/src/share/vm/opto/matcher.hpp
@ -269,6 +269,9 @@ public:
  // should generate this one.
  static const bool match_rule_supported(int opcode);

+  // Some uarchs have different sized float register resources
+  static const int float_pressure(int default_pressure_threshold);
+
  // Used to determine if we have fast l2f conversion
  // USII has it, USIII doesn't
  static const bool convL2FSupported(void);
--- a/hotspot/src/share/vm/opto/node.hpp
+++ b/hotspot/src/share/vm/opto/node.hpp
@ -674,7 +674,8 @@ public:
    Flag_avoid_back_to_back_after    = Flag_avoid_back_to_back_before << 1,
    Flag_has_call                    = Flag_avoid_back_to_back_after << 1,
    Flag_is_reduction                = Flag_has_call << 1,
-    Flag_is_expensive                = Flag_is_reduction << 1,
+    Flag_is_scheduled                = Flag_is_reduction,
+    Flag_is_expensive                = Flag_is_scheduled << 1,
    _max_flags = (Flag_is_expensive << 1) - 1 // allow flags combination
  };

@ -861,6 +862,9 @@ public:
  // It must have the loop's phi as input and provide a def to the phi.
  bool is_reduction() const { return (_flags & Flag_is_reduction) != 0; }

+  // Used in lcm to mark nodes that have scheduled
+  bool is_scheduled() const { return (_flags & Flag_is_scheduled) != 0; }
+
 //----------------- Optimization

  // Get the worst-case Type output for this Node.