Merge

2025-09-16 17:14:41 +02:00 · 2012-01-11 17:58:26 -05:00 · 2012-01-11 17:58:26 -05:00 · 8efd785f67
commit 8efd785f67
parent 007126d010 5c89631aef
119 changed files with 6257 additions and 4107 deletions
--- a/hotspot/agent/src/share/classes/sun/jvm/hotspot/oops/InstanceKlass.java
+++ b/hotspot/agent/src/share/classes/sun/jvm/hotspot/oops/InstanceKlass.java
@ -63,6 +63,8 @@ public class InstanceKlass extends Klass {
  private static int CLASS_STATE_FULLY_INITIALIZED;
  private static int CLASS_STATE_INITIALIZATION_ERROR;

+  private static int IS_MARKED_DEPENDENT_MASK;
+
  private static synchronized void initialize(TypeDataBase db) throws WrongTypeException {
    Type type            = db.lookupType("instanceKlass");
    arrayKlasses         = new OopField(type.getOopField("_array_klasses"), Oop.getHeaderSize());
@ -90,7 +92,7 @@ public class InstanceKlass extends Klass {
    staticFieldSize      = new CIntField(type.getCIntegerField("_static_field_size"), Oop.getHeaderSize());
    staticOopFieldCount   = new CIntField(type.getCIntegerField("_static_oop_field_count"), Oop.getHeaderSize());
    nonstaticOopMapSize  = new CIntField(type.getCIntegerField("_nonstatic_oop_map_size"), Oop.getHeaderSize());
-    isMarkedDependent    = new CIntField(type.getCIntegerField("_is_marked_dependent"), Oop.getHeaderSize());
+    miscFlags            = new CIntField(type.getCIntegerField("_misc_flags"), Oop.getHeaderSize());
    initState            = new CIntField(type.getCIntegerField("_init_state"), Oop.getHeaderSize());
    vtableLen            = new CIntField(type.getCIntegerField("_vtable_len"), Oop.getHeaderSize());
    itableLen            = new CIntField(type.getCIntegerField("_itable_len"), Oop.getHeaderSize());
@ -118,6 +120,8 @@ public class InstanceKlass extends Klass {
    CLASS_STATE_FULLY_INITIALIZED = db.lookupIntConstant("instanceKlass::fully_initialized").intValue();
    CLASS_STATE_INITIALIZATION_ERROR = db.lookupIntConstant("instanceKlass::initialization_error").intValue();

+    IS_MARKED_DEPENDENT_MASK = db.lookupIntConstant("instanceKlass::IS_MARKED_DEPENDENT").intValue();
+
  }

  InstanceKlass(OopHandle handle, ObjectHeap heap) {
@ -151,7 +155,7 @@ public class InstanceKlass extends Klass {
  private static CIntField staticFieldSize;
  private static CIntField staticOopFieldCount;
  private static CIntField nonstaticOopMapSize;
-  private static CIntField isMarkedDependent;
+  private static CIntField miscFlags;
  private static CIntField initState;
  private static CIntField vtableLen;
  private static CIntField itableLen;
@ -333,7 +337,7 @@ public class InstanceKlass extends Klass {
  public long      getNonstaticFieldSize()  { return                nonstaticFieldSize.getValue(this); }
  public long      getStaticOopFieldCount() { return                staticOopFieldCount.getValue(this); }
  public long      getNonstaticOopMapSize() { return                nonstaticOopMapSize.getValue(this); }
-  public boolean   getIsMarkedDependent()   { return                isMarkedDependent.getValue(this) != 0; }
+  // True when the bits selected by IS_MARKED_DEPENDENT_MASK are set in the _misc_flags field.
+  public boolean   getIsMarkedDependent()   { return                (miscFlags.getValue(this) & IS_MARKED_DEPENDENT_MASK) != 0; }
  public long      getVtableLen()           { return                vtableLen.getValue(this); }
  public long      getItableLen()           { return                itableLen.getValue(this); }
  public Symbol    getGenericSignature()    { return getSymbol(genericSignature); }
@ -524,7 +528,7 @@ public class InstanceKlass extends Klass {
      visitor.doCInt(staticFieldSize, true);
      visitor.doCInt(staticOopFieldCount, true);
      visitor.doCInt(nonstaticOopMapSize, true);
-      visitor.doCInt(isMarkedDependent, true);
+      visitor.doCInt(miscFlags, true);
      visitor.doCInt(initState, true);
      visitor.doCInt(vtableLen, true);
      visitor.doCInt(itableLen, true);
--- a/hotspot/make/bsd/makefiles/adlc.make
+++ b/hotspot/make/bsd/makefiles/adlc.make
@ -39,9 +39,16 @@ OS = $(Platform_os_family)

 SOURCE.AD = $(OUTDIR)/$(OS)_$(Platform_arch_model).ad 

+# SOURCES.AD lists the .ad input files. When the arch model name differs
+# from the arch name, the shared $(Platform_arch).ad file is listed as well,
+# between the model-specific file and the OS/CPU-specific file.
+ifeq ("${Platform_arch_model}", "${Platform_arch}")
  SOURCES.AD = \
  $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch_model).ad) \
  $(call altsrc-replace,$(HS_COMMON_SRC)/os_cpu/$(OS)_$(ARCH)/vm/$(OS)_$(Platform_arch_model).ad)
+else
+  SOURCES.AD = \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch_model).ad) \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch).ad) \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/os_cpu/$(OS)_$(ARCH)/vm/$(OS)_$(Platform_arch_model).ad)
+endif

 EXEC	= $(OUTDIR)/adlc

--- a/hotspot/make/linux/makefiles/adlc.make
+++ b/hotspot/make/linux/makefiles/adlc.make
@ -39,9 +39,16 @@ OS = $(Platform_os_family)

 SOURCE.AD = $(OUTDIR)/$(OS)_$(Platform_arch_model).ad 

+# SOURCES.AD lists the .ad input files. When the arch model name differs
+# from the arch name, the shared $(Platform_arch).ad file is listed as well,
+# between the model-specific file and the OS/CPU-specific file.
+ifeq ("${Platform_arch_model}", "${Platform_arch}")
  SOURCES.AD = \
  $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch_model).ad) \
  $(call altsrc-replace,$(HS_COMMON_SRC)/os_cpu/$(OS)_$(ARCH)/vm/$(OS)_$(Platform_arch_model).ad)
+else
+  SOURCES.AD = \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch_model).ad) \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch).ad) \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/os_cpu/$(OS)_$(ARCH)/vm/$(OS)_$(Platform_arch_model).ad)
+endif

 EXEC	= $(OUTDIR)/adlc

--- a/hotspot/make/solaris/makefiles/adlc.make
+++ b/hotspot/make/solaris/makefiles/adlc.make
@ -40,9 +40,16 @@ OS = $(Platform_os_family)

 SOURCE.AD = $(OUTDIR)/$(OS)_$(Platform_arch_model).ad 

+# SOURCES.AD lists the .ad input files. When the arch model name differs
+# from the arch name, the shared $(Platform_arch).ad file is listed as well,
+# between the model-specific file and the OS/CPU-specific file.
+ifeq ("${Platform_arch_model}", "${Platform_arch}")
  SOURCES.AD = \
  $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch_model).ad) \
  $(call altsrc-replace,$(HS_COMMON_SRC)/os_cpu/$(OS)_$(ARCH)/vm/$(OS)_$(Platform_arch_model).ad)
+else
+  SOURCES.AD = \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch_model).ad) \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/cpu/$(ARCH)/vm/$(Platform_arch).ad) \
+  $(call altsrc-replace,$(HS_COMMON_SRC)/os_cpu/$(OS)_$(ARCH)/vm/$(OS)_$(Platform_arch_model).ad)
+endif

 EXEC	= $(OUTDIR)/adlc

--- a/hotspot/make/windows/makefiles/adlc.make
+++ b/hotspot/make/windows/makefiles/adlc.make
@ -53,6 +53,17 @@ CPP_INCLUDE_DIRS=\
  /I "$(WorkSpace)\src\os\windows\vm" \
  /I "$(WorkSpace)\src\cpu\$(Platform_arch)\vm"

+# SOURCES_AD lists the .ad inputs of the $(Platform_arch_model).ad rule, which
+# concatenates them in the order given here. When the arch model name differs
+# from the arch name, the shared $(Platform_arch).ad file is listed as well,
+# between the model-specific file and the windows-specific file.
+!if "$(Platform_arch_model)" == "$(Platform_arch)"
+SOURCES_AD=\
+  $(WorkSpace)/src/cpu/$(Platform_arch)/vm/$(Platform_arch_model).ad \
+  $(WorkSpace)/src/os_cpu/windows_$(Platform_arch)/vm/windows_$(Platform_arch_model).ad
+!else
+SOURCES_AD=\
+  $(WorkSpace)/src/cpu/$(Platform_arch)/vm/$(Platform_arch_model).ad \
+  $(WorkSpace)/src/cpu/$(Platform_arch)/vm/$(Platform_arch).ad \
+  $(WorkSpace)/src/os_cpu/windows_$(Platform_arch)/vm/windows_$(Platform_arch_model).ad
+!endif
+
 # NOTE! If you add any files here, you must also update GENERATED_NAMES_IN_DIR
 # and ProjectCreatorIDEOptions in projectcreator.make. 
 GENERATED_NAMES=\
@ -105,7 +116,6 @@ $(GENERATED_NAMES_IN_DIR): $(Platform_arch_model).ad adlc.exe
 	$(ADLC) $(ADLCFLAGS) $(Platform_arch_model).ad
 	mv $(GENERATED_NAMES) $(AdlcOutDir)/

-$(Platform_arch_model).ad: $(WorkSpace)/src/cpu/$(Platform_arch)/vm/$(Platform_arch_model).ad $(WorkSpace)/src/os_cpu/windows_$(Platform_arch)/vm/windows_$(Platform_arch_model).ad
+$(Platform_arch_model).ad: $(SOURCES_AD)
 	rm -f $(Platform_arch_model).ad
-	cat $(WorkSpace)/src/cpu/$(Platform_arch)/vm/$(Platform_arch_model).ad  \
-	    $(WorkSpace)/src/os_cpu/windows_$(Platform_arch)/vm/windows_$(Platform_arch_model).ad >$(Platform_arch_model).ad
+	cat $(SOURCES_AD) >$(Platform_arch_model).ad
--- a/hotspot/src/cpu/sparc/vm/assembler_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/assembler_sparc.cpp
@ -3036,10 +3036,8 @@ void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                        RegisterOrConstant super_check_offset) {
-  int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
-                   Klass::secondary_super_cache_offset_in_bytes());
-  int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
-                    Klass::super_check_offset_offset_in_bytes());
+  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
+  int sco_offset = in_bytes(Klass::super_check_offset_offset());

  bool must_load_sco  = (super_check_offset.constant_or_zero() == -1);
  bool need_slow_path = (must_load_sco ||
@ -3159,10 +3157,8 @@ void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
-  int ss_offset = (klassOopDesc::header_size() * HeapWordSize +
-                   Klass::secondary_supers_offset_in_bytes());
-  int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
-                   Klass::secondary_super_cache_offset_in_bytes());
+  int ss_offset = in_bytes(Klass::secondary_supers_offset());
+  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
@ -3336,7 +3332,7 @@ void MacroAssembler::biased_locking_enter(Register obj_reg, Register mark_reg,
  cmp_and_brx_short(temp_reg, markOopDesc::biased_lock_pattern, Assembler::notEqual, Assembler::pn, cas_label);

  load_klass(obj_reg, temp_reg);
-  ld_ptr(Address(temp_reg, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()), temp_reg);
+  ld_ptr(Address(temp_reg, Klass::prototype_header_offset()), temp_reg);
  or3(G2_thread, temp_reg, temp_reg);
  xor3(mark_reg, temp_reg, temp_reg);
  andcc(temp_reg, ~((int) markOopDesc::age_mask_in_place), temp_reg);
@ -3413,7 +3409,7 @@ void MacroAssembler::biased_locking_enter(Register obj_reg, Register mark_reg,
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  load_klass(obj_reg, temp_reg);
-  ld_ptr(Address(temp_reg, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()), temp_reg);
+  ld_ptr(Address(temp_reg, Klass::prototype_header_offset()), temp_reg);
  or3(G2_thread, temp_reg, temp_reg);
  casn(mark_addr.base(), mark_reg, temp_reg);
  // If the biasing toward our thread failed, this means that
@ -3443,7 +3439,7 @@ void MacroAssembler::biased_locking_enter(Register obj_reg, Register mark_reg,
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  load_klass(obj_reg, temp_reg);
-  ld_ptr(Address(temp_reg, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()), temp_reg);
+  ld_ptr(Address(temp_reg, Klass::prototype_header_offset()), temp_reg);
  casn(mark_addr.base(), mark_reg, temp_reg);
  // Fall through to the normal CAS-based lock, because no matter what
  // the result of the above CAS, some thread must have succeeded in
--- a/hotspot/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/c1_CodeStubs_sparc.cpp
@ -302,7 +302,7 @@ void PatchingStub::emit_code(LIR_Assembler* ce) {
    assert(_obj != noreg, "must be a valid register");
    assert(_oop_index >= 0, "must have oop index");
    __ load_heap_oop(_obj, java_lang_Class::klass_offset_in_bytes(), G3);
-    __ ld_ptr(G3, instanceKlass::init_thread_offset_in_bytes() + sizeof(klassOopDesc), G3);
+    __ ld_ptr(G3, in_bytes(instanceKlass::init_thread_offset()), G3);
    __ cmp_and_brx_short(G2_thread, G3, Assembler::notEqual, Assembler::pn, call_patch);

    // load_klass patches may execute the patched code before it's
@ -471,7 +471,7 @@ void G1UnsafeGetObjSATBBarrierStub::emit_code(LIR_Assembler* ce) {

  __ load_klass(src_reg, tmp_reg);

-  Address ref_type_adr(tmp_reg, instanceKlass::reference_type_offset_in_bytes() + sizeof(oopDesc));
+  Address ref_type_adr(tmp_reg, instanceKlass::reference_type_offset());
  __ ld(ref_type_adr, tmp_reg);

  // _reference_type field is of type ReferenceType (enum)
--- a/hotspot/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/c1_LIRAssembler_sparc.cpp
@ -2202,8 +2202,7 @@ void LIR_Assembler::emit_arraycopy(LIR_OpArrayCopy* op) {
          } else if (!(flags & LIR_OpArrayCopy::dst_objarray)) {
            __ load_klass(dst, tmp);
          }
-          int lh_offset = klassOopDesc::header_size() * HeapWordSize +
-            Klass::layout_helper_offset_in_bytes();
+          int lh_offset = in_bytes(Klass::layout_helper_offset());

          __ lduw(tmp, lh_offset, tmp2);

@ -2238,12 +2237,10 @@ void LIR_Assembler::emit_arraycopy(LIR_OpArrayCopy* op) {
        __ mov(length, len);
        __ load_klass(dst, tmp);

-        int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
-                         objArrayKlass::element_klass_offset_in_bytes());
+        int ek_offset = in_bytes(objArrayKlass::element_klass_offset());
        __ ld_ptr(tmp, ek_offset, super_k);

-        int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
-                          Klass::super_check_offset_offset_in_bytes());
+        int sco_offset = in_bytes(Klass::super_check_offset_offset());
        __ lduw(super_k, sco_offset, chk_off);

        __ call_VM_leaf(tmp, copyfunc_addr);
@ -2455,8 +2452,8 @@ void LIR_Assembler::emit_alloc_obj(LIR_OpAllocObj* op) {
         op->obj()->as_register()   == O0 &&
         op->klass()->as_register() == G5, "must be");
  if (op->init_check()) {
-    __ ld(op->klass()->as_register(),
-          instanceKlass::init_state_offset_in_bytes() + sizeof(oopDesc),
+    __ ldub(op->klass()->as_register(),
+          in_bytes(instanceKlass::init_state_offset()),
          op->tmp1()->as_register());
    add_debug_info_for_null_check_here(op->stub()->info());
    __ cmp(op->tmp1()->as_register(), instanceKlass::fully_initialized);
@ -2627,7 +2624,7 @@ void LIR_Assembler::emit_typecheck_helper(LIR_OpTypeCheck *op, Label* success, L
  } else {
    bool need_slow_path = true;
    if (k->is_loaded()) {
-      if (k->super_check_offset() != sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes())
+      if ((int) k->super_check_offset() != in_bytes(Klass::secondary_super_cache_offset()))
        need_slow_path = false;
      // perform the fast part of the checking logic
      __ check_klass_subtype_fast_path(klass_RInfo, k_RInfo, Rtmp1, noreg,
@ -2731,7 +2728,7 @@ void LIR_Assembler::emit_opTypeCheck(LIR_OpTypeCheck* op) {
    __ load_klass(value, klass_RInfo);

    // get instance klass
-    __ ld_ptr(Address(k_RInfo, objArrayKlass::element_klass_offset_in_bytes() + sizeof(oopDesc)), k_RInfo);
+    __ ld_ptr(Address(k_RInfo, objArrayKlass::element_klass_offset()), k_RInfo);
    // perform the fast part of the checking logic
    __ check_klass_subtype_fast_path(klass_RInfo, k_RInfo, Rtmp1, O7, success_target, failure_target, NULL);

--- a/hotspot/src/cpu/sparc/vm/c1_MacroAssembler_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/c1_MacroAssembler_sparc.cpp
@ -181,7 +181,7 @@ void C1_MacroAssembler::try_allocate(
 void C1_MacroAssembler::initialize_header(Register obj, Register klass, Register len, Register t1, Register t2) {
  assert_different_registers(obj, klass, len, t1, t2);
  if (UseBiasedLocking && !len->is_valid()) {
-    ld_ptr(klass, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes(), t1);
+    ld_ptr(klass, in_bytes(Klass::prototype_header_offset()), t1);
  } else {
    set((intx)markOopDesc::prototype(), t1);
  }
@ -252,7 +252,7 @@ void C1_MacroAssembler::initialize_object(
 #ifdef ASSERT
  {
    Label ok;
-    ld(klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes(), t1);
+    ld(klass, in_bytes(Klass::layout_helper_offset()), t1);
    if (var_size_in_bytes != noreg) {
      cmp_and_brx_short(t1, var_size_in_bytes, Assembler::equal, Assembler::pt, ok);
    } else {
--- a/hotspot/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/c1_Runtime1_sparc.cpp
@ -398,14 +398,14 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {

          if (id == fast_new_instance_init_check_id) {
            // make sure the klass is initialized
-            __ ld(G5_klass, instanceKlass::init_state_offset_in_bytes() + sizeof(oopDesc), G3_t1);
+            __ ldub(G5_klass, in_bytes(instanceKlass::init_state_offset()), G3_t1);
            __ cmp_and_br_short(G3_t1, instanceKlass::fully_initialized, Assembler::notEqual, Assembler::pn, slow_path);
          }
 #ifdef ASSERT
          // assert object can be fast path allocated
          {
            Label ok, not_ok;
-          __ ld(G5_klass, Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc), G1_obj_size);
+          __ ld(G5_klass, in_bytes(Klass::layout_helper_offset()), G1_obj_size);
          // make sure it's an instance (LH > 0)
          __ cmp_and_br_short(G1_obj_size, 0, Assembler::lessEqual, Assembler::pn, not_ok);
          __ btst(Klass::_lh_instance_slow_path_bit, G1_obj_size);
@ -425,7 +425,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
          __ bind(retry_tlab);

          // get the instance size
-          __ ld(G5_klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes(), G1_obj_size);
+          __ ld(G5_klass, in_bytes(Klass::layout_helper_offset()), G1_obj_size);

          __ tlab_allocate(O0_obj, G1_obj_size, 0, G3_t1, slow_path);

@ -437,7 +437,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {

          __ bind(try_eden);
          // get the instance size
-          __ ld(G5_klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes(), G1_obj_size);
+          __ ld(G5_klass, in_bytes(Klass::layout_helper_offset()), G1_obj_size);
          __ eden_allocate(O0_obj, G1_obj_size, 0, G3_t1, G4_t2, slow_path);
          __ incr_allocated_bytes(G1_obj_size, G3_t1, G4_t2);

@ -471,8 +471,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
        Register G4_length = G4; // Incoming
        Register O0_obj   = O0; // Outgoing

-        Address klass_lh(G5_klass, ((klassOopDesc::header_size() * HeapWordSize)
-                                    + Klass::layout_helper_offset_in_bytes()));
+        Address klass_lh(G5_klass, Klass::layout_helper_offset());
        assert(Klass::_lh_header_size_shift % BitsPerByte == 0, "bytewise");
        assert(Klass::_lh_header_size_mask == 0xFF, "bytewise");
        // Use this offset to pick out an individual byte of the layout_helper:
@ -592,7 +591,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
        Label register_finalizer;
        Register t = O1;
        __ load_klass(O0, t);
-        __ ld(t, Klass::access_flags_offset_in_bytes() + sizeof(oopDesc), t);
+        __ ld(t, in_bytes(Klass::access_flags_offset()), t);
        __ set(JVM_ACC_HAS_FINALIZER, G3);
        __ andcc(G3, t, G0);
        __ br(Assembler::notZero, false, Assembler::pt, register_finalizer);
--- a/hotspot/src/cpu/sparc/vm/cppInterpreter_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/cppInterpreter_sparc.cpp
@ -766,7 +766,7 @@ address InterpreterGenerator::generate_native_entry(bool synchronized) {
      // get native function entry point(O0 is a good temp until the very end)
       ld_ptr(Address(G5_method, 0, in_bytes(methodOopDesc::native_function_offset())), O0);
    // for static methods insert the mirror argument
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());

    __ ld_ptr(Address(G5_method, 0, in_bytes(methodOopDesc:: constants_offset())), O1);
    __ ld_ptr(Address(O1, 0, constantPoolOopDesc::pool_holder_offset_in_bytes()), O1);
@ -1173,7 +1173,7 @@ void CppInterpreterGenerator::generate_compute_interpreter_state(const Register
    __ btst(JVM_ACC_SYNCHRONIZED, O1);
    __ br( Assembler::zero, false, Assembler::pt, done);

-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
    __ delayed()->btst(JVM_ACC_STATIC, O1);
    __ ld_ptr(XXX_STATE(_locals), O1);
    __ br( Assembler::zero, true, Assembler::pt, got_obj);
--- a/hotspot/src/cpu/sparc/vm/methodHandles_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/methodHandles_sparc.cpp
@ -1098,7 +1098,7 @@ void MethodHandles::generate_method_handle_stub(MacroAssembler* _masm, MethodHan
  Address G3_amh_argument ( G3_method_handle, java_lang_invoke_AdapterMethodHandle::argument_offset_in_bytes());
  Address G3_amh_conversion(G3_method_handle, java_lang_invoke_AdapterMethodHandle::conversion_offset_in_bytes());

-  const int java_mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+  const int java_mirror_offset = in_bytes(Klass::java_mirror_offset());

  if (have_entry(ek)) {
    __ nop();  // empty stubs make SG sick
--- a/hotspot/src/cpu/sparc/vm/sparc.ad
+++ b/hotspot/src/cpu/sparc/vm/sparc.ad
@ -6773,6 +6773,16 @@ instruct unnecessary_membar_volatile() %{
  ins_pipe(empty);
 %}

+// Matches the MemBarStoreStore node with an empty encoding: zero cost,
+// zero size, and no instruction emitted.
+// NOTE(review): presumably safe because the SPARC memory model in use
+// already orders stores against each other -- confirm.
+instruct membar_storestore() %{
+  match(MemBarStoreStore);
+  ins_cost(0);
+
+  size(0);
+  format %{ "!MEMBAR-storestore (empty encoding)" %}
+  ins_encode( );
+  ins_pipe(empty);
+%}
+
 //----------Register Move Instructions-----------------------------------------
 instruct roundDouble_nop(regD dst) %{
  match(Set dst (RoundDouble dst));
--- a/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/stubGenerator_sparc.cpp
@ -3046,8 +3046,7 @@ class StubGenerator: public StubCodeGenerator {
    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
    //

-    int lh_offset = klassOopDesc::header_size() * HeapWordSize +
-                    Klass::layout_helper_offset_in_bytes();
+    int lh_offset = in_bytes(Klass::layout_helper_offset());

    // Load 32-bits signed value. Use br() instruction with it to check icc.
    __ lduw(G3_src_klass, lh_offset, G5_lh);
@ -3194,15 +3193,13 @@ class StubGenerator: public StubCodeGenerator {
                                 G4_dst_klass, G3_src_klass);

      // Generate the type check.
-      int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
-                        Klass::super_check_offset_offset_in_bytes());
+      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ lduw(G4_dst_klass, sco_offset, sco_temp);
      generate_type_check(G3_src_klass, sco_temp, G4_dst_klass,
                          O5_temp, L_plain_copy);

      // Fetch destination element klass from the objArrayKlass header.
-      int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
-                       objArrayKlass::element_klass_offset_in_bytes());
+      int ek_offset = in_bytes(objArrayKlass::element_klass_offset());

      // the checkcast_copy loop needs two extra arguments:
      __ ld_ptr(G4_dst_klass, ek_offset, O4);   // dest elem klass
@ -3414,6 +3411,9 @@ class StubGenerator: public StubCodeGenerator {
      generate_throw_exception("WrongMethodTypeException throw_exception",
                               CAST_FROM_FN_PTR(address, SharedRuntime::throw_WrongMethodTypeException),
                               G5_method_type, G3_method_handle);
+
+    // Build this early so it's available for the interpreter.
+    StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
  }


@ -3427,7 +3427,6 @@ class StubGenerator: public StubCodeGenerator {
    StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
    StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
    StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
-    StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));

    StubRoutines::_handler_for_unsafe_access_entry =
      generate_handler_for_unsafe_access();
--- a/hotspot/src/cpu/sparc/vm/templateInterpreter_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/templateInterpreter_sparc.cpp
@ -366,7 +366,7 @@ void InterpreterGenerator::lock_method(void) {

  // get synchronization object to O0
  { Label done;
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
    __ btst(JVM_ACC_STATIC, O0);
    __ br( Assembler::zero, true, Assembler::pt, done);
    __ delayed()->ld_ptr(Llocals, Interpreter::local_offset_in_bytes(0), O0); // get receiver for not-static case
@ -396,7 +396,6 @@ void TemplateInterpreterGenerator::generate_stack_overflow_check(Register Rframe
                                                         Register Rscratch,
                                                         Register Rscratch2) {
  const int page_size = os::vm_page_size();
-  Address saved_exception_pc(G2_thread, JavaThread::saved_exception_pc_offset());
  Label after_frame_check;

  assert_different_registers(Rframe_size, Rscratch, Rscratch2);
@ -436,11 +435,19 @@ void TemplateInterpreterGenerator::generate_stack_overflow_check(Register Rframe
  // the bottom of the stack
  __ cmp_and_brx_short(SP, Rscratch, Assembler::greater, Assembler::pt, after_frame_check);

-  // Save the return address as the exception pc
-  __ st_ptr(O7, saved_exception_pc);
-
  // the stack will overflow, throw an exception
-  __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_StackOverflowError));
+
+  // Note that SP is restored to sender's sp (in the delay slot). This
+  // is necessary if the sender's frame is an extended compiled frame
+  // (see gen_c2i_adapter()) and safer anyway in case of JSR292
+  // adaptations.
+
+  // Note also that the restored frame is not necessarily interpreted.
+  // Use the shared runtime version of the StackOverflowError.
+  assert(StubRoutines::throw_StackOverflowError_entry() != NULL, "stub not yet generated");
+  AddressLiteral stub(StubRoutines::throw_StackOverflowError_entry());
+  __ jump_to(stub, Rscratch);
+  __ delayed()->mov(O5_savedSP, SP);

  // if you get to here, then there is enough stack space
  __ bind( after_frame_check );
@ -984,7 +991,7 @@ address InterpreterGenerator::generate_native_entry(bool synchronized) {
    // get native function entry point(O0 is a good temp until the very end)
    __ delayed()->ld_ptr(Lmethod, in_bytes(methodOopDesc::native_function_offset()), O0);
    // for static methods insert the mirror argument
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());

    __ ld_ptr(Lmethod, methodOopDesc:: constants_offset(), O1);
    __ ld_ptr(O1, constantPoolOopDesc::pool_holder_offset_in_bytes(), O1);
--- a/hotspot/src/cpu/sparc/vm/templateTable_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/templateTable_sparc.cpp
@ -888,7 +888,7 @@ void TemplateTable::aastore() {

  // do fast instanceof cache test

-  __ ld_ptr(O4,     sizeof(oopDesc) + objArrayKlass::element_klass_offset_in_bytes(),  O4);
+  __ ld_ptr(O4,     in_bytes(objArrayKlass::element_klass_offset()),  O4);

  assert(Otos_i == O0, "just checking");

@ -2031,7 +2031,7 @@ void TemplateTable::_return(TosState state) {
    __ access_local_ptr(G3_scratch, Otos_i);
    __ load_klass(Otos_i, O2);
    __ set(JVM_ACC_HAS_FINALIZER, G3);
-    __ ld(O2, Klass::access_flags_offset_in_bytes() + sizeof(oopDesc), O2);
+    __ ld(O2, in_bytes(Klass::access_flags_offset()), O2);
    __ andcc(G3, O2, G0);
    Label skip_register_finalizer;
    __ br(Assembler::zero, false, Assembler::pn, skip_register_finalizer);
@ -3350,13 +3350,13 @@ void TemplateTable::_new() {
  __ ld_ptr(Rscratch, Roffset, RinstanceKlass);

  // make sure klass is fully initialized:
-  __ ld(RinstanceKlass, instanceKlass::init_state_offset_in_bytes() + sizeof(oopDesc), G3_scratch);
+  __ ldub(RinstanceKlass, in_bytes(instanceKlass::init_state_offset()), G3_scratch);
  __ cmp(G3_scratch, instanceKlass::fully_initialized);
  __ br(Assembler::notEqual, false, Assembler::pn, slow_case);
-  __ delayed()->ld(RinstanceKlass, Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc), Roffset);
+  __ delayed()->ld(RinstanceKlass, in_bytes(Klass::layout_helper_offset()), Roffset);

  // get instance_size in instanceKlass (already aligned)
-  //__ ld(RinstanceKlass, Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc), Roffset);
+  //__ ld(RinstanceKlass, in_bytes(Klass::layout_helper_offset()), Roffset);

  // make sure klass does not have has_finalizer, or is abstract, or interface or java/lang/Class
  __ btst(Klass::_lh_instance_slow_path_bit, Roffset);
@ -3483,7 +3483,7 @@ void TemplateTable::_new() {
  __ bind(initialize_header);

  if (UseBiasedLocking) {
-    __ ld_ptr(RinstanceKlass, Klass::prototype_header_offset_in_bytes() + sizeof(oopDesc), G4_scratch);
+    __ ld_ptr(RinstanceKlass, in_bytes(Klass::prototype_header_offset()), G4_scratch);
  } else {
    __ set((intptr_t)markOopDesc::prototype(), G4_scratch);
  }
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp
@ -503,7 +503,31 @@ class Assembler : public AbstractAssembler  {
    REX_WR     = 0x4C,
    REX_WRB    = 0x4D,
    REX_WRX    = 0x4E,
-    REX_WRXB   = 0x4F
+    REX_WRXB   = 0x4F,
+
+    VEX_3bytes = 0xC4,
+    VEX_2bytes = 0xC5
+  };
+
+  // Single-bit constants for the B, X, R and W flags of a VEX prefix.
+  // NOTE(review): VEX_R and VEX_W share the value 0x80; presumably they are
+  // applied to different prefix bytes -- confirm against the vex_prefix()
+  // implementation.
+  enum VexPrefix {
+    VEX_B = 0x20,
+    VEX_X = 0x40,
+    VEX_R = 0x80,
+    VEX_W = 0x80
+  };
+
+  // Selects the SIMD prefix (none, 66, F3 or F2) of an instruction; this is
+  // the type of the 'pre' argument of the prefix helpers declared in this class.
+  // NOTE(review): the numeric values presumably match the hardware encoding
+  // of the VEX 'pp' field -- confirm against the Intel SDM.
+  enum VexSimdPrefix {
+    VEX_SIMD_NONE = 0x0,
+    VEX_SIMD_66   = 0x1,
+    VEX_SIMD_F3   = 0x2,
+    VEX_SIMD_F2   = 0x3
+  };
+
+  // Selects the opcode escape sequence (none, 0F, 0F 38 or 0F 3A) of an
+  // instruction; this is the type of the 'opc' argument of the prefix
+  // helpers declared in this class.
+  // NOTE(review): the numeric values presumably match the hardware encoding
+  // of the VEX 'mmmmm' field -- confirm against the Intel SDM.
+  enum VexOpcode {
+    VEX_OPCODE_NONE  = 0x0,
+    VEX_OPCODE_0F    = 0x1,
+    VEX_OPCODE_0F_38 = 0x2,
+    VEX_OPCODE_0F_3A = 0x3
  };

  enum WhichOperand {
@ -546,12 +570,99 @@ private:
  void prefixq(Address adr);

  void prefix(Address adr, Register reg,  bool byteinst = false);
-  void prefixq(Address adr, Register reg);
-
  void prefix(Address adr, XMMRegister reg);
+  void prefixq(Address adr, Register reg);
+  void prefixq(Address adr, XMMRegister reg);

  void prefetch_prefix(Address src);

+  void rex_prefix(Address adr, XMMRegister xreg,
+                  VexSimdPrefix pre, VexOpcode opc, bool rex_w);
+  int  rex_prefix_and_encode(int dst_enc, int src_enc,
+                             VexSimdPrefix pre, VexOpcode opc, bool rex_w);
+
+  void vex_prefix(bool vex_r, bool vex_b, bool vex_x, bool vex_w,
+                  int nds_enc, VexSimdPrefix pre, VexOpcode opc,
+                  bool vector256);
+
+  void vex_prefix(Address adr, int nds_enc, int xreg_enc,
+                  VexSimdPrefix pre, VexOpcode opc,
+                  bool vex_w, bool vector256);
+
+  void vex_prefix(XMMRegister dst, XMMRegister nds, Address src,  // register-typed convenience overload
+                  VexSimdPrefix pre, bool vector256 = false) {
+     vex_prefix(src, nds->encoding(), dst->encoding(),  // forwards the register encodings to the (Address, int, int, ...) overload
+                pre, VEX_OPCODE_0F, false, vector256);  // opcode fixed to VEX_OPCODE_0F, vex_w fixed to false
+  }
+
+  int  vex_prefix_and_encode(int dst_enc, int nds_enc, int src_enc,
+                             VexSimdPrefix pre, VexOpcode opc,
+                             bool vex_w, bool vector256);
+
+  int  vex_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,  // register-typed convenience overload
+                             VexSimdPrefix pre, bool vector256 = false) {
+     return vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(),  // forwards the three encodings to the int-based overload
+                                  pre, VEX_OPCODE_0F, false, vector256);  // opcode fixed to VEX_OPCODE_0F, vex_w fixed to false
+  }
+
+  void simd_prefix(XMMRegister xreg, XMMRegister nds, Address adr,
+                   VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
+                   bool rex_w = false, bool vector256 = false);
+
+  void simd_prefix(XMMRegister dst, Address src,  // two-operand form without an nds register
+                   VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
+    simd_prefix(dst, xnoreg, src, pre, opc);  // xnoreg stands in for nds; rex_w and vector256 keep their defaults
+  }
+  void simd_prefix(Address dst, XMMRegister src, VexSimdPrefix pre) {  // Address-destination form
+    simd_prefix(src, dst, pre);  // same as the (XMMRegister, Address) overload with the operands swapped and the default opcode
+  }
+  void simd_prefix_q(XMMRegister dst, XMMRegister nds, Address src,  // variant of simd_prefix() that requests rex_w
+                     VexSimdPrefix pre) {
+    bool rex_w = true;
+    simd_prefix(dst, nds, src, pre, VEX_OPCODE_0F, rex_w);  // opcode fixed to VEX_OPCODE_0F; vector256 keeps its default
+  }
+
+
+  int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, XMMRegister src,
+                             VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F,
+                             bool rex_w = false, bool vector256 = false);
+
+  int simd_prefix_and_encode(XMMRegister dst, XMMRegister src,  // two-operand form without an nds register
+                             VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
+    return simd_prefix_and_encode(dst, xnoreg, src, pre, opc);  // xnoreg stands in for nds; rex_w and vector256 keep their defaults
+  }
+
+  // Move/convert 32-bit integer value.
+  int simd_prefix_and_encode(XMMRegister dst, XMMRegister nds, Register src,
+                             VexSimdPrefix pre) {
+    // It is OK to cast from Register to XMMRegister to pass the argument here
+    // since only the encoding is used in simd_prefix_and_encode() and the number
+    // of Gen and Xmm registers is the same.
+    return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre);  // opcode, rex_w and vector256 keep their defaults
+  }
+  int simd_prefix_and_encode(XMMRegister dst, Register src, VexSimdPrefix pre) {  // general-register source, no nds register
+    return simd_prefix_and_encode(dst, xnoreg, src, pre);  // delegates to the (XMMRegister, XMMRegister, Register) overload
+  }
+  int simd_prefix_and_encode(Register dst, XMMRegister src,  // general-register destination, no nds register
+                             VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
+    return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, opc);  // dst is re-typed by its encoding only
+  }
+
+  // Move/convert 64-bit integer value.
+  int simd_prefix_and_encode_q(XMMRegister dst, XMMRegister nds, Register src,  // variant of simd_prefix_and_encode() that requests rex_w
+                               VexSimdPrefix pre) {
+    bool rex_w = true;
+    return simd_prefix_and_encode(dst, nds, as_XMMRegister(src->encoding()), pre, VEX_OPCODE_0F, rex_w);  // src is re-typed by its encoding only
+  }
+  int simd_prefix_and_encode_q(XMMRegister dst, Register src, VexSimdPrefix pre) {  // general-register source, no nds register
+    return simd_prefix_and_encode_q(dst, xnoreg, src, pre);  // delegates to the three-register _q overload
+  }
+  int simd_prefix_and_encode_q(Register dst, XMMRegister src,  // general-register destination, no nds register
+                             VexSimdPrefix pre, VexOpcode opc = VEX_OPCODE_0F) {
+    bool rex_w = true;
+    return simd_prefix_and_encode(as_XMMRegister(dst->encoding()), xnoreg, src, pre, opc, rex_w);  // dst is re-typed by its encoding only
+  }
+
  // Helper functions for groups of instructions
  void emit_arith_b(int op1, int op2, Register dst, int imm8);

@ -764,6 +875,7 @@ private:
  void addss(XMMRegister dst, Address src);
  void addss(XMMRegister dst, XMMRegister src);

+  void andl(Address  dst, int32_t imm32);
  void andl(Register dst, int32_t imm32);
  void andl(Register dst, Address src);
  void andl(Register dst, Register src);
@ -774,9 +886,11 @@ private:
  void andq(Register dst, Register src);

  // Bitwise Logical AND of Packed Double-Precision Floating-Point Values
-  void andpd(XMMRegister dst, Address src);
  void andpd(XMMRegister dst, XMMRegister src);

+  // Bitwise Logical AND of Packed Single-Precision Floating-Point Values
+  void andps(XMMRegister dst, XMMRegister src);
+
  void bsfl(Register dst, Register src);
  void bsrl(Register dst, Register src);

@ -837,9 +951,11 @@ private:

  // Ordered Compare Scalar Double-Precision Floating-Point Values and set EFLAGS
  void comisd(XMMRegister dst, Address src);
+  void comisd(XMMRegister dst, XMMRegister src);

  // Ordered Compare Scalar Single-Precision Floating-Point Values and set EFLAGS
  void comiss(XMMRegister dst, Address src);
+  void comiss(XMMRegister dst, XMMRegister src);

  // Identify processor type and features
  void cpuid() {
@ -849,14 +965,19 @@ private:

  // Convert Scalar Double-Precision Floating-Point Value to Scalar Single-Precision Floating-Point Value
  void cvtsd2ss(XMMRegister dst, XMMRegister src);
+  void cvtsd2ss(XMMRegister dst, Address src);

  // Convert Doubleword Integer to Scalar Double-Precision Floating-Point Value
  void cvtsi2sdl(XMMRegister dst, Register src);
+  void cvtsi2sdl(XMMRegister dst, Address src);
  void cvtsi2sdq(XMMRegister dst, Register src);
+  void cvtsi2sdq(XMMRegister dst, Address src);

  // Convert Doubleword Integer to Scalar Single-Precision Floating-Point Value
  void cvtsi2ssl(XMMRegister dst, Register src);
+  void cvtsi2ssl(XMMRegister dst, Address src);
  void cvtsi2ssq(XMMRegister dst, Register src);
+  void cvtsi2ssq(XMMRegister dst, Address src);

  // Convert Packed Signed Doubleword Integers to Packed Double-Precision Floating-Point Value
  void cvtdq2pd(XMMRegister dst, XMMRegister src);
@ -866,6 +987,7 @@ private:

  // Convert Scalar Single-Precision Floating-Point Value to Scalar Double-Precision Floating-Point Value
  void cvtss2sd(XMMRegister dst, XMMRegister src);
+  void cvtss2sd(XMMRegister dst, Address src);

  // Convert with Truncation Scalar Double-Precision Floating-Point Value to Doubleword Integer
  void cvttsd2sil(Register dst, Address src);
@ -1140,8 +1262,6 @@ private:
  void movdq(Register dst, XMMRegister src);

  // Move Aligned Double Quadword
-  void movdqa(Address     dst, XMMRegister src);
-  void movdqa(XMMRegister dst, Address src);
  void movdqa(XMMRegister dst, XMMRegister src);

  // Move Unaligned Double Quadword
@ -1261,10 +1381,18 @@ private:
  void orq(Register dst, Address src);
  void orq(Register dst, Register src);

+  // Pack with unsigned saturation
+  void packuswb(XMMRegister dst, XMMRegister src);
+  void packuswb(XMMRegister dst, Address src);
+
  // SSE4.2 string instructions
  void pcmpestri(XMMRegister xmm1, XMMRegister xmm2, int imm8);
  void pcmpestri(XMMRegister xmm1, Address src, int imm8);

+  // SSE4.1 packed move
+  void pmovzxbw(XMMRegister dst, XMMRegister src);
+  void pmovzxbw(XMMRegister dst, Address src);
+
 #ifndef _LP64 // no 32bit push/pop on amd64
  void popl(Address dst);
 #endif
@ -1292,6 +1420,7 @@ private:

  // POR - Bitwise logical OR
  void por(XMMRegister dst, XMMRegister src);
+  void por(XMMRegister dst, Address src);

  // Shuffle Packed Doublewords
  void pshufd(XMMRegister dst, XMMRegister src, int mode);
@ -1313,6 +1442,11 @@ private:

  // Interleave Low Bytes
  void punpcklbw(XMMRegister dst, XMMRegister src);
+  void punpcklbw(XMMRegister dst, Address src);
+
+  // Interleave Low Doublewords
+  void punpckldq(XMMRegister dst, XMMRegister src);
+  void punpckldq(XMMRegister dst, Address src);

 #ifndef _LP64 // no 32bit push/pop on amd64
  void pushl(Address src);
@ -1429,6 +1563,13 @@ private:
  void xchgq(Register reg, Address adr);
  void xchgq(Register dst, Register src);

+  // Get Value of Extended Control Register
+  void xgetbv() {  // emits the fixed three-byte sequence 0F 01 D0; takes no operands
+    emit_byte(0x0F);
+    emit_byte(0x01);
+    emit_byte(0xD0);
+  }
+
  void xorl(Register dst, int32_t imm32);
  void xorl(Register dst, Address src);
  void xorl(Register dst, Register src);
@ -1437,14 +1578,44 @@ private:
  void xorq(Register dst, Register src);

  // Bitwise Logical XOR of Packed Double-Precision Floating-Point Values
-  void xorpd(XMMRegister dst, Address src);
  void xorpd(XMMRegister dst, XMMRegister src);

  // Bitwise Logical XOR of Packed Single-Precision Floating-Point Values
-  void xorps(XMMRegister dst, Address src);
  void xorps(XMMRegister dst, XMMRegister src);

  void set_byte_if_not_zero(Register dst); // sets reg to 1 if not zero, otherwise 0
+
+  // AVX 3-operand instructions (encoded with a VEX prefix)
+  void vaddsd(XMMRegister dst, XMMRegister nds, Address src);
+  void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vaddss(XMMRegister dst, XMMRegister nds, Address src);
+  void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vandpd(XMMRegister dst, XMMRegister nds, Address src);
+  void vandps(XMMRegister dst, XMMRegister nds, Address src);
+  void vdivsd(XMMRegister dst, XMMRegister nds, Address src);
+  void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vdivss(XMMRegister dst, XMMRegister nds, Address src);
+  void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vmulsd(XMMRegister dst, XMMRegister nds, Address src);
+  void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vmulss(XMMRegister dst, XMMRegister nds, Address src);
+  void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vsubsd(XMMRegister dst, XMMRegister nds, Address src);
+  void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vsubss(XMMRegister dst, XMMRegister nds, Address src);
+  void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vxorpd(XMMRegister dst, XMMRegister nds, Address src);
+  void vxorps(XMMRegister dst, XMMRegister nds, Address src);
+
+
+ protected:
+  // The following instructions require 16-byte address alignment in SSE mode.
+  // They should be called only from the corresponding MacroAssembler instructions.
+  void andpd(XMMRegister dst, Address src);
+  void andps(XMMRegister dst, Address src);
+  void xorpd(XMMRegister dst, Address src);
+  void xorps(XMMRegister dst, Address src);
+
 };


@ -2175,9 +2346,15 @@ class MacroAssembler: public Assembler {
  void andpd(XMMRegister dst, Address src) { Assembler::andpd(dst, src); }
  void andpd(XMMRegister dst, AddressLiteral src);

+  void andps(XMMRegister dst, XMMRegister src) { Assembler::andps(dst, src); }  // forwards to the Assembler method of the same name
+  void andps(XMMRegister dst, Address src) { Assembler::andps(dst, src); }  // forwards to the Assembler method of the same name
+  void andps(XMMRegister dst, AddressLiteral src);  // declared only; defined elsewhere
+
+  void comiss(XMMRegister dst, XMMRegister src) { Assembler::comiss(dst, src); }  // register-register form; forwards to Assembler::comiss
  void comiss(XMMRegister dst, Address src) { Assembler::comiss(dst, src); }
  void comiss(XMMRegister dst, AddressLiteral src);

+  void comisd(XMMRegister dst, XMMRegister src) { Assembler::comisd(dst, src); }  // register-register form; forwards to Assembler::comisd
  void comisd(XMMRegister dst, Address src) { Assembler::comisd(dst, src); }
  void comisd(XMMRegister dst, AddressLiteral src);

@ -2218,48 +2395,48 @@ public:

  void addsd(XMMRegister dst, XMMRegister src)    { Assembler::addsd(dst, src); }
  void addsd(XMMRegister dst, Address src)        { Assembler::addsd(dst, src); }
-  void addsd(XMMRegister dst, AddressLiteral src) { Assembler::addsd(dst, as_Address(src)); }
+  void addsd(XMMRegister dst, AddressLiteral src);

  void addss(XMMRegister dst, XMMRegister src)    { Assembler::addss(dst, src); }
  void addss(XMMRegister dst, Address src)        { Assembler::addss(dst, src); }
-  void addss(XMMRegister dst, AddressLiteral src) { Assembler::addss(dst, as_Address(src)); }
+  void addss(XMMRegister dst, AddressLiteral src);

  void divsd(XMMRegister dst, XMMRegister src)    { Assembler::divsd(dst, src); }
  void divsd(XMMRegister dst, Address src)        { Assembler::divsd(dst, src); }
-  void divsd(XMMRegister dst, AddressLiteral src) { Assembler::divsd(dst, as_Address(src)); }
+  void divsd(XMMRegister dst, AddressLiteral src);

  void divss(XMMRegister dst, XMMRegister src)    { Assembler::divss(dst, src); }
  void divss(XMMRegister dst, Address src)        { Assembler::divss(dst, src); }
-  void divss(XMMRegister dst, AddressLiteral src) { Assembler::divss(dst, as_Address(src)); }
+  void divss(XMMRegister dst, AddressLiteral src);

  void movsd(XMMRegister dst, XMMRegister src) { Assembler::movsd(dst, src); }
  void movsd(Address dst, XMMRegister src)     { Assembler::movsd(dst, src); }
  void movsd(XMMRegister dst, Address src)     { Assembler::movsd(dst, src); }
-  void movsd(XMMRegister dst, AddressLiteral src) { Assembler::movsd(dst, as_Address(src)); }
+  void movsd(XMMRegister dst, AddressLiteral src);

  void mulsd(XMMRegister dst, XMMRegister src)    { Assembler::mulsd(dst, src); }
  void mulsd(XMMRegister dst, Address src)        { Assembler::mulsd(dst, src); }
-  void mulsd(XMMRegister dst, AddressLiteral src) { Assembler::mulsd(dst, as_Address(src)); }
+  void mulsd(XMMRegister dst, AddressLiteral src);

  void mulss(XMMRegister dst, XMMRegister src)    { Assembler::mulss(dst, src); }
  void mulss(XMMRegister dst, Address src)        { Assembler::mulss(dst, src); }
-  void mulss(XMMRegister dst, AddressLiteral src) { Assembler::mulss(dst, as_Address(src)); }
+  void mulss(XMMRegister dst, AddressLiteral src);

  void sqrtsd(XMMRegister dst, XMMRegister src)    { Assembler::sqrtsd(dst, src); }
  void sqrtsd(XMMRegister dst, Address src)        { Assembler::sqrtsd(dst, src); }
-  void sqrtsd(XMMRegister dst, AddressLiteral src) { Assembler::sqrtsd(dst, as_Address(src)); }
+  void sqrtsd(XMMRegister dst, AddressLiteral src);

  void sqrtss(XMMRegister dst, XMMRegister src)    { Assembler::sqrtss(dst, src); }
  void sqrtss(XMMRegister dst, Address src)        { Assembler::sqrtss(dst, src); }
-  void sqrtss(XMMRegister dst, AddressLiteral src) { Assembler::sqrtss(dst, as_Address(src)); }
+  void sqrtss(XMMRegister dst, AddressLiteral src);

  void subsd(XMMRegister dst, XMMRegister src)    { Assembler::subsd(dst, src); }
  void subsd(XMMRegister dst, Address src)        { Assembler::subsd(dst, src); }
-  void subsd(XMMRegister dst, AddressLiteral src) { Assembler::subsd(dst, as_Address(src)); }
+  void subsd(XMMRegister dst, AddressLiteral src);

  void subss(XMMRegister dst, XMMRegister src)    { Assembler::subss(dst, src); }
  void subss(XMMRegister dst, Address src)        { Assembler::subss(dst, src); }
-  void subss(XMMRegister dst, AddressLiteral src) { Assembler::subss(dst, as_Address(src)); }
+  void subss(XMMRegister dst, AddressLiteral src);

  void ucomiss(XMMRegister dst, XMMRegister src) { Assembler::ucomiss(dst, src); }
  void ucomiss(XMMRegister dst, Address src)     { Assembler::ucomiss(dst, src); }
@ -2279,6 +2456,53 @@ public:
  void xorps(XMMRegister dst, Address src)     { Assembler::xorps(dst, src); }
  void xorps(XMMRegister dst, AddressLiteral src);

+  // AVX 3-operand instructions: the inline forms forward to the same-named Assembler method; the AddressLiteral forms are only declared here.
+
+  void vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vaddsd(dst, nds, src); }
+  void vaddsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vaddsd(dst, nds, src); }
+  void vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vaddss(dst, nds, src); }
+  void vaddss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vaddss(dst, nds, src); }
+  void vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vandpd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vandpd(dst, nds, src); }
+  void vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vandps(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vandps(dst, nds, src); }
+  void vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vdivsd(dst, nds, src); }
+  void vdivsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vdivsd(dst, nds, src); }
+  void vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vdivss(dst, nds, src); }
+  void vdivss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vdivss(dst, nds, src); }
+  void vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vmulsd(dst, nds, src); }
+  void vmulsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vmulsd(dst, nds, src); }
+  void vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vmulss(dst, nds, src); }
+  void vmulss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vmulss(dst, nds, src); }
+  void vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vsubsd(dst, nds, src); }
+  void vsubsd(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vsubsd(dst, nds, src); }
+  void vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) { Assembler::vsubss(dst, nds, src); }
+  void vsubss(XMMRegister dst, XMMRegister nds, Address src)     { Assembler::vsubss(dst, nds, src); }
+  void vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vxorpd(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vxorpd(dst, nds, src); }
+  void vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+  void vxorps(XMMRegister dst, XMMRegister nds, Address src) { Assembler::vxorps(dst, nds, src); }
+  void vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src);
+
+
  // Data

  void cmov32( Condition cc, Register dst, Address  src);
--- a/hotspot/src/cpu/x86/vm/assembler_x86.inline.hpp
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.inline.hpp
@ -86,6 +86,7 @@ inline void Assembler::prefix(Address adr, Register reg,  bool byteinst) {}
 inline void Assembler::prefixq(Address adr, Register reg) {}

 inline void Assembler::prefix(Address adr, XMMRegister reg) {}
+inline void Assembler::prefixq(Address adr, XMMRegister reg) {}  // empty body: emits nothing in this preprocessor branch
 #else
 inline void Assembler::emit_long64(jlong x) {
  *(jlong*) _code_pos = x;
--- a/hotspot/src/cpu/x86/vm/c1_CodeStubs_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/c1_CodeStubs_x86.cpp
@ -320,7 +320,7 @@ void PatchingStub::emit_code(LIR_Assembler* ce) {
    // begin_initialized_entry_offset has to fit in a byte. Also, we know it's not null.
    __ load_heap_oop_not_null(tmp2, Address(_obj, java_lang_Class::klass_offset_in_bytes()));
    __ get_thread(tmp);
-    __ cmpptr(tmp, Address(tmp2, instanceKlass::init_thread_offset_in_bytes() + sizeof(klassOopDesc)));
+    __ cmpptr(tmp, Address(tmp2, instanceKlass::init_thread_offset()));
    __ pop(tmp2);
    __ pop(tmp);
    __ jcc(Assembler::notEqual, call_patch);
@ -519,7 +519,7 @@ void G1UnsafeGetObjSATBBarrierStub::emit_code(LIR_Assembler* ce) {

  __ load_klass(tmp_reg, src_reg);

-  Address ref_type_adr(tmp_reg, instanceKlass::reference_type_offset_in_bytes() + sizeof(oopDesc));
+  Address ref_type_adr(tmp_reg, instanceKlass::reference_type_offset());
  __ cmpl(ref_type_adr, REF_NONE);
  __ jcc(Assembler::equal, _continuation);

--- a/hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp
@ -1557,8 +1557,8 @@ void LIR_Assembler::emit_opConvert(LIR_OpConvert* op) {

 void LIR_Assembler::emit_alloc_obj(LIR_OpAllocObj* op) {
  if (op->init_check()) {
-    __ cmpl(Address(op->klass()->as_register(),
-                    instanceKlass::init_state_offset_in_bytes() + sizeof(oopDesc)),
+    __ cmpb(Address(op->klass()->as_register(),
+                    instanceKlass::init_state_offset()),
            instanceKlass::fully_initialized);
    add_debug_info_for_null_check_here(op->stub()->info());
    __ jcc(Assembler::notEqual, *op->stub()->entry());
@ -1730,7 +1730,7 @@ void LIR_Assembler::emit_typecheck_helper(LIR_OpTypeCheck *op, Label* success, L
 #else
      __ cmpoop(Address(klass_RInfo, k->super_check_offset()), k->constant_encoding());
 #endif // _LP64
-      if (sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes() != k->super_check_offset()) {
+      if ((juint)in_bytes(Klass::secondary_super_cache_offset()) != k->super_check_offset()) {
        __ jcc(Assembler::notEqual, *failure_target);
        // successful cast, fall through to profile or jump
      } else {
@ -1842,7 +1842,7 @@ void LIR_Assembler::emit_opTypeCheck(LIR_OpTypeCheck* op) {
    __ load_klass(klass_RInfo, value);

    // get instance klass (it's already uncompressed)
-    __ movptr(k_RInfo, Address(k_RInfo, objArrayKlass::element_klass_offset_in_bytes() + sizeof(oopDesc)));
+    __ movptr(k_RInfo, Address(k_RInfo, objArrayKlass::element_klass_offset()));
    // perform the fast part of the checking logic
    __ check_klass_subtype_fast_path(klass_RInfo, k_RInfo, Rtmp1, success_target, failure_target, NULL);
    // call out-of-line instance of __ check_klass_subtype_slow_path(...):
@ -3289,8 +3289,7 @@ void LIR_Assembler::emit_arraycopy(LIR_OpArrayCopy* op) {
          } else if (!(flags & LIR_OpArrayCopy::dst_objarray)) {
            __ load_klass(tmp, dst);
          }
-          int lh_offset = klassOopDesc::header_size() * HeapWordSize +
-            Klass::layout_helper_offset_in_bytes();
+          int lh_offset = in_bytes(Klass::layout_helper_offset());
          Address klass_lh_addr(tmp, lh_offset);
          jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
          __ cmpl(klass_lh_addr, objArray_lh);
@ -3307,9 +3306,9 @@ void LIR_Assembler::emit_arraycopy(LIR_OpArrayCopy* op) {

 #ifndef _LP64
        __ movptr(tmp, dst_klass_addr);
-        __ movptr(tmp, Address(tmp, objArrayKlass::element_klass_offset_in_bytes() + sizeof(oopDesc)));
+        __ movptr(tmp, Address(tmp, objArrayKlass::element_klass_offset()));
        __ push(tmp);
-        __ movl(tmp, Address(tmp, Klass::super_check_offset_offset_in_bytes() + sizeof(oopDesc)));
+        __ movl(tmp, Address(tmp, Klass::super_check_offset_offset()));
        __ push(tmp);
        __ push(length);
        __ lea(tmp, Address(dst, dst_pos, scale, arrayOopDesc::base_offset_in_bytes(basic_type)));
@ -3333,15 +3332,15 @@ void LIR_Assembler::emit_arraycopy(LIR_OpArrayCopy* op) {
        // Allocate abi space for args but be sure to keep stack aligned
        __ subptr(rsp, 6*wordSize);
        __ load_klass(c_rarg3, dst);
-        __ movptr(c_rarg3, Address(c_rarg3, objArrayKlass::element_klass_offset_in_bytes() + sizeof(oopDesc)));
+        __ movptr(c_rarg3, Address(c_rarg3, objArrayKlass::element_klass_offset()));
        store_parameter(c_rarg3, 4);
-        __ movl(c_rarg3, Address(c_rarg3, Klass::super_check_offset_offset_in_bytes() + sizeof(oopDesc)));
+        __ movl(c_rarg3, Address(c_rarg3, Klass::super_check_offset_offset()));
        __ call(RuntimeAddress(copyfunc_addr));
        __ addptr(rsp, 6*wordSize);
 #else
        __ load_klass(c_rarg4, dst);
-        __ movptr(c_rarg4, Address(c_rarg4, objArrayKlass::element_klass_offset_in_bytes() + sizeof(oopDesc)));
-        __ movl(c_rarg3, Address(c_rarg4, Klass::super_check_offset_offset_in_bytes() + sizeof(oopDesc)));
+        __ movptr(c_rarg4, Address(c_rarg4, objArrayKlass::element_klass_offset()));
+        __ movl(c_rarg3, Address(c_rarg4, Klass::super_check_offset_offset()));
        __ call(RuntimeAddress(copyfunc_addr));
 #endif

--- a/hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/c1_MacroAssembler_x86.cpp
@ -150,7 +150,7 @@ void C1_MacroAssembler::initialize_header(Register obj, Register klass, Register
  assert_different_registers(obj, klass, len);
  if (UseBiasedLocking && !len->is_valid()) {
    assert_different_registers(obj, klass, len, t1, t2);
-    movptr(t1, Address(klass, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+    movptr(t1, Address(klass, Klass::prototype_header_offset()));
    movptr(Address(obj, oopDesc::mark_offset_in_bytes()), t1);
  } else {
    // This assumes that all prototype bits fit in an int32_t
--- a/hotspot/src/cpu/x86/vm/c1_Runtime1_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/c1_Runtime1_x86.cpp
@ -1011,7 +1011,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {

          if (id == fast_new_instance_init_check_id) {
            // make sure the klass is initialized
-            __ cmpl(Address(klass, instanceKlass::init_state_offset_in_bytes() + sizeof(oopDesc)), instanceKlass::fully_initialized);
+            __ cmpb(Address(klass, instanceKlass::init_state_offset()), instanceKlass::fully_initialized);
            __ jcc(Assembler::notEqual, slow_path);
          }

@ -1019,7 +1019,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
          // assert object can be fast path allocated
          {
            Label ok, not_ok;
-            __ movl(obj_size, Address(klass, Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc)));
+            __ movl(obj_size, Address(klass, Klass::layout_helper_offset()));
            __ cmpl(obj_size, 0);  // make sure it's an instance (LH > 0)
            __ jcc(Assembler::lessEqual, not_ok);
            __ testl(obj_size, Klass::_lh_instance_slow_path_bit);
@ -1040,7 +1040,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
          __ bind(retry_tlab);

          // get the instance size (size is positive so movl is fine for 64bit)
-          __ movl(obj_size, Address(klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes()));
+          __ movl(obj_size, Address(klass, Klass::layout_helper_offset()));

          __ tlab_allocate(obj, obj_size, 0, t1, t2, slow_path);

@ -1052,7 +1052,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {

          __ bind(try_eden);
          // get the instance size (size is positive so movl is fine for 64bit)
-          __ movl(obj_size, Address(klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes()));
+          __ movl(obj_size, Address(klass, Klass::layout_helper_offset()));

          __ eden_allocate(obj, obj_size, 0, t1, slow_path);
          __ incr_allocated_bytes(thread, obj_size, 0);
@ -1119,7 +1119,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
        {
          Label ok;
          Register t0 = obj;
-          __ movl(t0, Address(klass, Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc)));
+          __ movl(t0, Address(klass, Klass::layout_helper_offset()));
          __ sarl(t0, Klass::_lh_array_tag_shift);
          int tag = ((id == new_type_array_id)
                     ? Klass::_lh_array_tag_type_value
@ -1153,7 +1153,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {

          // get the allocation size: round_up(hdr + length << (layout_helper & 0x1F))
          // since size is positive movl does right thing on 64bit
-          __ movl(t1, Address(klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes()));
+          __ movl(t1, Address(klass, Klass::layout_helper_offset()));
          // since size is positive movl does right thing on 64bit
          __ movl(arr_size, length);
          assert(t1 == rcx, "fixed register usage");
@ -1167,7 +1167,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
          __ tlab_allocate(obj, arr_size, 0, t1, t2, slow_path);  // preserves arr_size

          __ initialize_header(obj, klass, length, t1, t2);
-          __ movb(t1, Address(klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes() + (Klass::_lh_header_size_shift / BitsPerByte)));
+          __ movb(t1, Address(klass, in_bytes(Klass::layout_helper_offset()) + (Klass::_lh_header_size_shift / BitsPerByte)));
          assert(Klass::_lh_header_size_shift % BitsPerByte == 0, "bytewise");
          assert(Klass::_lh_header_size_mask <= 0xFF, "bytewise");
          __ andptr(t1, Klass::_lh_header_size_mask);
@ -1180,7 +1180,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
          __ bind(try_eden);
          // get the allocation size: round_up(hdr + length << (layout_helper & 0x1F))
          // since size is positive movl does right thing on 64bit
-          __ movl(t1, Address(klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes()));
+          __ movl(t1, Address(klass, Klass::layout_helper_offset()));
          // since size is positive movl does right thing on 64bit
          __ movl(arr_size, length);
          assert(t1 == rcx, "fixed register usage");
@ -1195,7 +1195,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
          __ incr_allocated_bytes(thread, arr_size, 0);

          __ initialize_header(obj, klass, length, t1, t2);
-          __ movb(t1, Address(klass, klassOopDesc::header_size() * HeapWordSize + Klass::layout_helper_offset_in_bytes() + (Klass::_lh_header_size_shift / BitsPerByte)));
+          __ movb(t1, Address(klass, in_bytes(Klass::layout_helper_offset()) + (Klass::_lh_header_size_shift / BitsPerByte)));
          assert(Klass::_lh_header_size_shift % BitsPerByte == 0, "bytewise");
          assert(Klass::_lh_header_size_mask <= 0xFF, "bytewise");
          __ andptr(t1, Klass::_lh_header_size_mask);
@ -1267,7 +1267,7 @@ OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) {
        Label register_finalizer;
        Register t = rsi;
        __ load_klass(t, rax);
-        __ movl(t, Address(t, Klass::access_flags_offset_in_bytes() + sizeof(oopDesc)));
+        __ movl(t, Address(t, Klass::access_flags_offset()));
        __ testl(t, JVM_ACC_HAS_FINALIZER);
        __ jcc(Assembler::notZero, register_finalizer);
        __ ret(0);
--- a/hotspot/src/cpu/x86/vm/cppInterpreter_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/cppInterpreter_x86.cpp
@ -511,7 +511,7 @@ void CppInterpreterGenerator::generate_compute_interpreter_state(const Register
    // get synchronization object

    Label done;
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
    __ movl(rax, access_flags);
    __ testl(rax, JVM_ACC_STATIC);
    __ movptr(rax, Address(locals, 0));                   // get receiver (assume this is frequent case)
@ -763,7 +763,7 @@ void InterpreterGenerator::lock_method(void) {
 #endif // ASSERT
  // get synchronization object
  { Label done;
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
    __ movl(rax, access_flags);
    __ movptr(rdi, STATE(_locals));                                     // prepare to get receiver (assume common case)
    __ testl(rax, JVM_ACC_STATIC);
@ -1180,7 +1180,7 @@ address InterpreterGenerator::generate_native_entry(bool synchronized) {

  // pass mirror handle if static call
  { Label L;
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
    __ movl(t, Address(method, methodOopDesc::access_flags_offset()));
    __ testl(t, JVM_ACC_STATIC);
    __ jcc(Assembler::zero, L);
--- a/hotspot/src/cpu/x86/vm/methodHandles_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/methodHandles_x86.cpp
@ -1160,7 +1160,7 @@ void MethodHandles::generate_method_handle_stub(MacroAssembler* _masm, MethodHan
  Address rcx_amh_conversion( rcx_recv, java_lang_invoke_AdapterMethodHandle::conversion_offset_in_bytes() );
  Address vmarg;                // __ argument_address(vmargslot)

-  const int java_mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+  const int java_mirror_offset = in_bytes(Klass::java_mirror_offset());

  if (have_entry(ek)) {
    __ nop();                   // empty stubs make SG sick
--- a/hotspot/src/cpu/x86/vm/nativeInst_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/nativeInst_x86.cpp
@ -237,6 +237,18 @@ int NativeMovRegMem::instruction_start() const {
  int off = 0;
  u_char instr_0 = ubyte_at(off);

+  // See comment in Assembler::locate_operand() about VEX prefixes.
+  if (instr_0 == instruction_VEX_prefix_2bytes) {
+    assert((UseAVX > 0), "shouldn't have VEX prefix");
+    NOT_LP64(assert((0xC0 & ubyte_at(1)) == 0xC0, "shouldn't have LDS and LES instructions"));
+    return 2;
+  }
+  if (instr_0 == instruction_VEX_prefix_3bytes) {
+    assert((UseAVX > 0), "shouldn't have VEX prefix");
+    NOT_LP64(assert((0xC0 & ubyte_at(1)) == 0xC0, "shouldn't have LDS and LES instructions"));
+    return 3;
+  }
+
  // First check to see if we have a (prefixed or not) xor
  if (instr_0 >= instruction_prefix_wide_lo && // 0x40
      instr_0 <= instruction_prefix_wide_hi) { // 0x4f
--- a/hotspot/src/cpu/x86/vm/nativeInst_x86.hpp
+++ b/hotspot/src/cpu/x86/vm/nativeInst_x86.hpp
@ -287,6 +287,9 @@ class NativeMovRegMem: public NativeInstruction {
    instruction_code_xmm_store          = 0x11,
    instruction_code_xmm_lpd            = 0x12,

+    instruction_VEX_prefix_2bytes       = Assembler::VEX_2bytes,
+    instruction_VEX_prefix_3bytes       = Assembler::VEX_3bytes,
+
    instruction_size                    = 4,
    instruction_offset                  = 0,
    data_offset                         = 2,
--- a/hotspot/src/cpu/x86/vm/register_definitions_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/register_definitions_x86.cpp
@ -53,6 +53,7 @@ REGISTER_DEFINITION(Register, r14);
 REGISTER_DEFINITION(Register, r15);
 #endif // AMD64

+REGISTER_DEFINITION(XMMRegister, xnoreg);
 REGISTER_DEFINITION(XMMRegister, xmm0 );
 REGISTER_DEFINITION(XMMRegister, xmm1 );
 REGISTER_DEFINITION(XMMRegister, xmm2 );
@ -115,6 +116,7 @@ REGISTER_DEFINITION(Register, r12_heapbase);
 REGISTER_DEFINITION(Register, r15_thread);
 #endif // AMD64

+REGISTER_DEFINITION(MMXRegister, mnoreg );
 REGISTER_DEFINITION(MMXRegister, mmx0 );
 REGISTER_DEFINITION(MMXRegister, mmx1 );
 REGISTER_DEFINITION(MMXRegister, mmx2 );
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_32.cpp
@ -1374,8 +1374,7 @@ class StubGenerator: public StubCodeGenerator {
    //                                  L_success, L_failure, NULL);
    assert_different_registers(sub_klass, temp);

-    int sc_offset = (klassOopDesc::header_size() * HeapWordSize +
-                     Klass::secondary_super_cache_offset_in_bytes());
+    int sc_offset = in_bytes(Klass::secondary_super_cache_offset());

    // if the pointers are equal, we are done (e.g., String[] elements)
    __ cmpptr(sub_klass, super_klass_addr);
@ -1787,8 +1786,7 @@ class StubGenerator: public StubCodeGenerator {
    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
    //

-    int lh_offset = klassOopDesc::header_size() * HeapWordSize +
-                    Klass::layout_helper_offset_in_bytes();
+    int lh_offset = in_bytes(Klass::layout_helper_offset());
    Address src_klass_lh_addr(rcx_src_klass, lh_offset);

    // Handle objArrays completely differently...
@ -1914,10 +1912,8 @@ class StubGenerator: public StubCodeGenerator {
    // live at this point:  rcx_src_klass, dst[_pos], src[_pos]
    {
      // Handy offsets:
-      int  ek_offset = (klassOopDesc::header_size() * HeapWordSize +
-                        objArrayKlass::element_klass_offset_in_bytes());
-      int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
-                        Klass::super_check_offset_offset_in_bytes());
+      int  ek_offset = in_bytes(objArrayKlass::element_klass_offset());
+      int sco_offset = in_bytes(Klass::super_check_offset_offset());

      Register rsi_dst_klass = rsi;
      Register rdi_temp      = rdi;
@ -2323,6 +2319,9 @@ class StubGenerator: public StubCodeGenerator {
      generate_throw_exception("WrongMethodTypeException throw_exception",
                               CAST_FROM_FN_PTR(address, SharedRuntime::throw_WrongMethodTypeException),
                               rax, rcx);
+
+    // Build this early so it's available for the interpreter.
+    StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
  }


@ -2334,7 +2333,6 @@ class StubGenerator: public StubCodeGenerator {
    StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
    StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
    StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
-    StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));

    //------------------------------------------------------------------------------------------------------------------------
    // entry points that are platform specific
--- a/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
+++ b/hotspot/src/cpu/x86/vm/stubGenerator_x86_64.cpp
@ -2261,8 +2261,7 @@ class StubGenerator: public StubCodeGenerator {
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
    { Label L;
-      int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
-                        Klass::super_check_offset_offset_in_bytes());
+      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ cmpl(ckoff, Address(ckval, sco_offset));
      __ jcc(Assembler::equal, L);
      __ stop("super_check_offset inconsistent");
@ -2572,8 +2571,7 @@ class StubGenerator: public StubCodeGenerator {
    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
    //

-    const int lh_offset = klassOopDesc::header_size() * HeapWordSize +
-                          Klass::layout_helper_offset_in_bytes();
+    const int lh_offset = in_bytes(Klass::layout_helper_offset());

    // Handle objArrays completely differently...
    const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
@ -2722,15 +2720,13 @@ class StubGenerator: public StubCodeGenerator {
      assert_clean_int(count, sco_temp);

      // Generate the type check.
-      const int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
-                              Klass::super_check_offset_offset_in_bytes());
+      const int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ movl(sco_temp, Address(r11_dst_klass, sco_offset));
      assert_clean_int(sco_temp, rax);
      generate_type_check(r10_src_klass, sco_temp, r11_dst_klass, L_plain_copy);

      // Fetch destination element klass from the objArrayKlass header.
-      int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
-                       objArrayKlass::element_klass_offset_in_bytes());
+      int ek_offset = in_bytes(objArrayKlass::element_klass_offset());
      __ movptr(r11_dst_klass, Address(r11_dst_klass, ek_offset));
      __ movl(  sco_temp,      Address(r11_dst_klass, sco_offset));
      assert_clean_int(sco_temp, rax);
@ -3072,6 +3068,13 @@ class StubGenerator: public StubCodeGenerator {
      generate_throw_exception("WrongMethodTypeException throw_exception",
                               CAST_FROM_FN_PTR(address, SharedRuntime::throw_WrongMethodTypeException),
                               rax, rcx);
+
+    // Build this early so it's available for the interpreter.
+    StubRoutines::_throw_StackOverflowError_entry =
+      generate_throw_exception("StackOverflowError throw_exception",
+                               CAST_FROM_FN_PTR(address,
+                                                SharedRuntime::
+                                                throw_StackOverflowError));
  }

  void generate_all() {
@ -3098,12 +3101,6 @@ class StubGenerator: public StubCodeGenerator {
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

-    StubRoutines::_throw_StackOverflowError_entry =
-      generate_throw_exception("StackOverflowError throw_exception",
-                               CAST_FROM_FN_PTR(address,
-                                                SharedRuntime::
-                                                throw_StackOverflowError));
-
    // entry points that are platform specific
    StubRoutines::x86::_f2i_fixup = generate_f2i_fixup();
    StubRoutines::x86::_f2l_fixup = generate_f2l_fixup();
--- a/hotspot/src/cpu/x86/vm/templateInterpreter_x86_32.cpp
+++ b/hotspot/src/cpu/x86/vm/templateInterpreter_x86_32.cpp
@ -522,9 +522,18 @@ void InterpreterGenerator::generate_stack_overflow_check(void) {

  __ pop(rsi);  // get saved bcp / (c++ prev state ).

-  __ pop(rax);  // get return address
-  __ jump(ExternalAddress(Interpreter::throw_StackOverflowError_entry()));
+  // Restore sender's sp as SP. This is necessary if the sender's
+  // frame is an extended compiled frame (see gen_c2i_adapter())
+  // and safer anyway in case of JSR292 adaptations.

+  __ pop(rax); // return address must be moved if SP is changed
+  __ mov(rsp, rsi);
+  __ push(rax);
+
+  // Note: the restored frame is not necessarily interpreted.
+  // Use the shared runtime version of the StackOverflowError.
+  assert(StubRoutines::throw_StackOverflowError_entry() != NULL, "stub not yet generated");
+  __ jump(ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));
  // all done with frame size check
  __ bind(after_frame_check_pop);
  __ pop(rsi);
@ -552,7 +561,7 @@ void InterpreterGenerator::lock_method(void) {
  #endif // ASSERT
  // get synchronization object
  { Label done;
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
    __ movl(rax, access_flags);
    __ testl(rax, JVM_ACC_STATIC);
    __ movptr(rax, Address(rdi, Interpreter::local_offset_in_bytes(0)));  // get receiver (assume this is frequent case)
@ -1012,7 +1021,7 @@ address InterpreterGenerator::generate_native_entry(bool synchronized) {

  // pass mirror handle if static call
  { Label L;
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() + Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
    __ movl(t, Address(method, methodOopDesc::access_flags_offset()));
    __ testl(t, JVM_ACC_STATIC);
    __ jcc(Assembler::zero, L);
--- a/hotspot/src/cpu/x86/vm/templateInterpreter_x86_64.cpp
+++ b/hotspot/src/cpu/x86/vm/templateInterpreter_x86_64.cpp
@ -467,8 +467,18 @@ void InterpreterGenerator::generate_stack_overflow_check(void) {
  __ cmpptr(rsp, rax);
  __ jcc(Assembler::above, after_frame_check);

-  __ pop(rax); // get return address
-  __ jump(ExternalAddress(Interpreter::throw_StackOverflowError_entry()));
+  // Restore sender's sp as SP. This is necessary if the sender's
+  // frame is an extended compiled frame (see gen_c2i_adapter())
+  // and safer anyway in case of JSR292 adaptations.
+
+  __ pop(rax); // return address must be moved if SP is changed
+  __ mov(rsp, r13);
+  __ push(rax);
+
+  // Note: the restored frame is not necessarily interpreted.
+  // Use the shared runtime version of the StackOverflowError.
+  assert(StubRoutines::throw_StackOverflowError_entry() != NULL, "stub not yet generated");
+  __ jump(ExternalAddress(StubRoutines::throw_StackOverflowError_entry()));

  // all done with frame size check
  __ bind(after_frame_check);
@ -505,8 +515,7 @@ void InterpreterGenerator::lock_method(void) {

  // get synchronization object
  {
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() +
-                              Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
    Label done;
    __ movl(rax, access_flags);
    __ testl(rax, JVM_ACC_STATIC);
@ -1006,8 +1015,7 @@ address InterpreterGenerator::generate_native_entry(bool synchronized) {
  // pass mirror handle if static call
  {
    Label L;
-    const int mirror_offset = klassOopDesc::klass_part_offset_in_bytes() +
-                              Klass::java_mirror_offset_in_bytes();
+    const int mirror_offset = in_bytes(Klass::java_mirror_offset());
    __ movl(t, Address(method, methodOopDesc::access_flags_offset()));
    __ testl(t, JVM_ACC_STATIC);
    __ jcc(Assembler::zero, L);
--- a/hotspot/src/cpu/x86/vm/templateTable_x86_32.cpp
+++ b/hotspot/src/cpu/x86/vm/templateTable_x86_32.cpp
@ -980,7 +980,7 @@ void TemplateTable::aastore() {
  __ load_klass(rbx, rax);
  // Move superklass into EAX
  __ load_klass(rax, rdx);
-  __ movptr(rax, Address(rax, sizeof(oopDesc) + objArrayKlass::element_klass_offset_in_bytes()));
+  __ movptr(rax, Address(rax, objArrayKlass::element_klass_offset()));
  // Compress array+index*wordSize+12 into a single register.  Frees ECX.
  __ lea(rdx, element_address);

@ -2033,7 +2033,7 @@ void TemplateTable::_return(TosState state) {
    assert(state == vtos, "only valid state");
    __ movptr(rax, aaddress(0));
    __ load_klass(rdi, rax);
-    __ movl(rdi, Address(rdi, Klass::access_flags_offset_in_bytes() + sizeof(oopDesc)));
+    __ movl(rdi, Address(rdi, Klass::access_flags_offset()));
    __ testl(rdi, JVM_ACC_HAS_FINALIZER);
    Label skip_register_finalizer;
    __ jcc(Assembler::zero, skip_register_finalizer);
@ -3188,11 +3188,11 @@ void TemplateTable::_new() {

  // make sure klass is initialized & doesn't have finalizer
  // make sure klass is fully initialized
-  __ cmpl(Address(rcx, instanceKlass::init_state_offset_in_bytes() + sizeof(oopDesc)), instanceKlass::fully_initialized);
+  __ cmpb(Address(rcx, instanceKlass::init_state_offset()), instanceKlass::fully_initialized);
  __ jcc(Assembler::notEqual, slow_case);

  // get instance_size in instanceKlass (scaled to a count of bytes)
-  __ movl(rdx, Address(rcx, Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc)));
+  __ movl(rdx, Address(rcx, Klass::layout_helper_offset()));
  // test to see if it has a finalizer or is malformed in some way
  __ testl(rdx, Klass::_lh_instance_slow_path_bit);
  __ jcc(Assembler::notZero, slow_case);
@ -3293,7 +3293,7 @@ void TemplateTable::_new() {
    __ bind(initialize_header);
    if (UseBiasedLocking) {
      __ pop(rcx);   // get saved klass back in the register.
-      __ movptr(rbx, Address(rcx, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+      __ movptr(rbx, Address(rcx, Klass::prototype_header_offset()));
      __ movptr(Address(rax, oopDesc::mark_offset_in_bytes ()), rbx);
    } else {
      __ movptr(Address(rax, oopDesc::mark_offset_in_bytes ()),
--- a/hotspot/src/cpu/x86/vm/templateTable_x86_64.cpp
+++ b/hotspot/src/cpu/x86/vm/templateTable_x86_64.cpp
@ -1004,8 +1004,7 @@ void TemplateTable::aastore() {
  // Move superklass into rax
  __ load_klass(rax, rdx);
  __ movptr(rax, Address(rax,
-                         sizeof(oopDesc) +
-                         objArrayKlass::element_klass_offset_in_bytes()));
+                         objArrayKlass::element_klass_offset()));
  // Compress array + index*oopSize + 12 into a single register.  Frees rcx.
  __ lea(rdx, element_address);

@ -2067,7 +2066,7 @@ void TemplateTable::_return(TosState state) {
    assert(state == vtos, "only valid state");
    __ movptr(c_rarg1, aaddress(0));
    __ load_klass(rdi, c_rarg1);
-    __ movl(rdi, Address(rdi, Klass::access_flags_offset_in_bytes() + sizeof(oopDesc)));
+    __ movl(rdi, Address(rdi, Klass::access_flags_offset()));
    __ testl(rdi, JVM_ACC_HAS_FINALIZER);
    Label skip_register_finalizer;
    __ jcc(Assembler::zero, skip_register_finalizer);
@ -3235,16 +3234,15 @@ void TemplateTable::_new() {

  // make sure klass is initialized & doesn't have finalizer
  // make sure klass is fully initialized
-  __ cmpl(Address(rsi,
-                  instanceKlass::init_state_offset_in_bytes() +
-                  sizeof(oopDesc)),
+  __ cmpb(Address(rsi,
+                  instanceKlass::init_state_offset()),
          instanceKlass::fully_initialized);
  __ jcc(Assembler::notEqual, slow_case);

  // get instance_size in instanceKlass (scaled to a count of bytes)
  __ movl(rdx,
          Address(rsi,
-                  Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc)));
+                  Klass::layout_helper_offset()));
  // test to see if it has a finalizer or is malformed in some way
  __ testl(rdx, Klass::_lh_instance_slow_path_bit);
  __ jcc(Assembler::notZero, slow_case);
@ -3337,7 +3335,7 @@ void TemplateTable::_new() {
    // initialize object header only.
    __ bind(initialize_header);
    if (UseBiasedLocking) {
-      __ movptr(rscratch1, Address(rsi, Klass::prototype_header_offset_in_bytes() + klassOopDesc::klass_part_offset_in_bytes()));
+      __ movptr(rscratch1, Address(rsi, Klass::prototype_header_offset()));
      __ movptr(Address(rax, oopDesc::mark_offset_in_bytes()), rscratch1);
    } else {
      __ movptr(Address(rax, oopDesc::mark_offset_in_bytes()),
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp
@ -50,7 +50,7 @@ const char*           VM_Version::_features_str = "";
 VM_Version::CpuidInfo VM_Version::_cpuid_info   = { 0, };

 static BufferBlob* stub_blob;
-static const int stub_size = 500;
+static const int stub_size = 550;

 extern "C" {
  typedef void (*getPsrInfo_stub_t)(void*);
@ -73,7 +73,7 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
    const uint32_t CPU_FAMILY_486   = (4 << CPU_FAMILY_SHIFT);

    Label detect_486, cpu486, detect_586, std_cpuid1, std_cpuid4;
-    Label ext_cpuid1, ext_cpuid5, ext_cpuid7, done;
+    Label sef_cpuid, ext_cpuid, ext_cpuid1, ext_cpuid5, ext_cpuid7, done;

    StubCodeMark mark(this, "VM_Version", "getPsrInfo_stub");
 #   define __ _masm->
@ -229,6 +229,41 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
    __ movl(Address(rsi, 8), rcx);
    __ movl(Address(rsi,12), rdx);

+    //
+    // Check if OS has enabled XGETBV instruction to access XCR0
+    // (OSXSAVE feature flag) and CPU supports AVX
+    //
+    __ andl(rcx, 0x18000000);
+    __ cmpl(rcx, 0x18000000);
+    __ jccb(Assembler::notEqual, sef_cpuid);
+
+    //
+    // XCR0, XFEATURE_ENABLED_MASK register
+    //
+    __ xorl(rcx, rcx);   // zero for XCR0 register
+    __ xgetbv();
+    __ lea(rsi, Address(rbp, in_bytes(VM_Version::xem_xcr0_offset())));
+    __ movl(Address(rsi, 0), rax);
+    __ movl(Address(rsi, 4), rdx);
+
+    //
+    // cpuid(0x7) Structured Extended Features
+    //
+    __ bind(sef_cpuid);
+    __ movl(rax, 7);
+    __ cmpl(rax, Address(rbp, in_bytes(VM_Version::std_cpuid0_offset()))); // Is cpuid(0x7) supported?
+    __ jccb(Assembler::greater, ext_cpuid);
+
+    __ xorl(rcx, rcx);
+    __ cpuid();
+    __ lea(rsi, Address(rbp, in_bytes(VM_Version::sef_cpuid7_offset())));
+    __ movl(Address(rsi, 0), rax);
+    __ movl(Address(rsi, 4), rbx);
+
+    //
+    // Extended cpuid(0x80000000)
+    //
+    __ bind(ext_cpuid);
    __ movl(rax, 0x80000000);
    __ cpuid();
    __ cmpl(rax, 0x80000000);     // Is cpuid(0x80000001) supported?
@ -373,13 +408,19 @@ void VM_Version::get_processor_features() {
  if (UseSSE < 1)
    _cpuFeatures &= ~CPU_SSE;

+  if (UseAVX < 2)
+    _cpuFeatures &= ~CPU_AVX2;
+
+  if (UseAVX < 1)
+    _cpuFeatures &= ~CPU_AVX;
+
  if (logical_processors_per_package() == 1) {
    // HT processor could be installed on a system which doesn't support HT.
    _cpuFeatures &= ~CPU_HT;
  }

  char buf[256];
-  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
               cores_per_cpu(), threads_per_core(),
               cpu_family(), _model, _stepping,
               (supports_cmov() ? ", cmov" : ""),
@ -393,6 +434,8 @@ void VM_Version::get_processor_features() {
               (supports_sse4_1() ? ", sse4.1" : ""),
               (supports_sse4_2() ? ", sse4.2" : ""),
               (supports_popcnt() ? ", popcnt" : ""),
+               (supports_avx()    ? ", avx" : ""),
+               (supports_avx2()   ? ", avx2" : ""),
               (supports_mmx_ext() ? ", mmxext" : ""),
               (supports_3dnow_prefetch() ? ", 3dnowpref" : ""),
               (supports_lzcnt()   ? ", lzcnt": ""),
@ -417,6 +460,13 @@ void VM_Version::get_processor_features() {
  if (!supports_sse ()) // Drop to 0 if no SSE  support
    UseSSE = 0;

+  if (UseAVX > 2) UseAVX=2;
+  if (UseAVX < 0) UseAVX=0;
+  if (!supports_avx2()) // Drop to 1 if no AVX2 support
+    UseAVX = MIN2((intx)1,UseAVX);
+  if (!supports_avx ()) // Drop to 0 if no AVX  support
+    UseAVX = 0;
+
  // On new cpus instructions which update whole XMM register should be used
  // to prevent partial register stall due to dependencies on high half.
  //
@ -551,6 +601,9 @@ void VM_Version::get_processor_features() {
    if (FLAG_IS_DEFAULT(UsePopCountInstruction)) {
      UsePopCountInstruction = true;
    }
+  } else if (UsePopCountInstruction) {
+    warning("POPCNT instruction is not available on this CPU");
+    FLAG_SET_DEFAULT(UsePopCountInstruction, false);
  }

 #ifdef COMPILER2
@ -622,7 +675,11 @@ void VM_Version::get_processor_features() {
  if (PrintMiscellaneous && Verbose) {
    tty->print_cr("Logical CPUs per core: %u",
                  logical_processors_per_package());
-    tty->print_cr("UseSSE=%d",UseSSE);
+    tty->print("UseSSE=%d",UseSSE);
+    if (UseAVX > 0) {
+      tty->print("  UseAVX=%d",UseAVX);
+    }
+    tty->cr();
    tty->print("Allocation");
    if (AllocatePrefetchStyle <= 0 || UseSSE == 0 && !supports_3dnow_prefetch()) {
      tty->print_cr(": no prefetching");
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp
@ -78,7 +78,10 @@ public:
               sse4_2   : 1,
                        : 2,
               popcnt   : 1,
-                        : 8;
+                        : 3,
+               osxsave  : 1,
+               avx      : 1,
+                        : 3;
    } bits;
  };

@ -168,6 +171,15 @@ public:
    } bits;
  };

+  union ExtCpuid7Edx {                  // one 32-bit register image: raw word (value) overlaid with bit-fields (bits)
+    uint32_t value;                     // whole register as stored
+    struct {
+      uint32_t               : 8,       // 8 unnamed bits, not decoded here
+              tsc_invariance : 1,       // single-bit flag; read as ext_cpuid7_edx.bits.tsc_invariance when the feature mask is built
+                             : 23;      // remaining unnamed bits (8 + 1 + 23 = 32)
+    } bits;
+  };
+
  union ExtCpuid8Ecx {
    uint32_t value;
    struct {
@ -176,15 +188,34 @@ public:
    } bits;
  };

-  union ExtCpuid7Edx {
+  union SefCpuid7Eax {                  // type of CpuidInfo::sef_cpuid7_eax; no bit-fields are decoded for this word
+    uint32_t value;                     // raw 32-bit register contents
+  };
+
+  union SefCpuid7Ebx {         // type of CpuidInfo::sef_cpuid7_ebx; bit numbers below assume LSB-first bit-field allocation (TODO confirm per compiler)
    uint32_t value;
    struct {
-      uint32_t               : 8,
-              tsc_invariance : 1,
+      uint32_t fsgsbase : 1,   // bit 0
+                        : 2,   // bits 1-2: unnamed, not decoded
+                   bmi1 : 1,   // bit 3
+                        : 1,   // bit 4: unnamed, not decoded
+                   avx2 : 1,   // bit 5: read as sef_cpuid7_ebx.bits.avx2 when deciding whether to report CPU_AVX2
+                        : 2,   // bits 6-7: unnamed, not decoded
+                   bmi2 : 1,   // bit 8
                        : 23;
    } bits;
  };

+  union XemXcr0Eax {           // type of CpuidInfo::xem_xcr0_eax (low word of XCR0); bit numbers assume LSB-first bit-field allocation (TODO confirm)
+    uint32_t value;            // raw 32-bit word as stored by the detection stub
+    struct {
+      uint32_t x87 : 1,        // bit 0; not consulted by the feature code visible in this change
+               sse : 1,        // bit 1; must be set (together with ymm) before CPU_AVX is reported
+               ymm : 1,        // bit 2; must be set (together with sse) before CPU_AVX is reported
+                   : 29;       // remaining unnamed bits (1 + 1 + 1 + 29 = 32)
+    } bits;
+  };
+
 protected:
  static int _cpu;
  static int _model;
@ -211,7 +242,9 @@ protected:
    CPU_POPCNT = (1 << 13),
    CPU_LZCNT  = (1 << 14),
    CPU_TSC    = (1 << 15),
-    CPU_TSCINV = (1 << 16)
+    CPU_TSCINV = (1 << 16),
+    CPU_AVX    = (1 << 17),
+    CPU_AVX2   = (1 << 18)
  } cpuFeatureFlags;

  enum {
@ -250,6 +283,12 @@ protected:
    uint32_t     dcp_cpuid4_ecx; // unused currently
    uint32_t     dcp_cpuid4_edx; // unused currently

+    // cpuid function 7 (structured extended features)
+    SefCpuid7Eax sef_cpuid7_eax;
+    SefCpuid7Ebx sef_cpuid7_ebx;
+    uint32_t     sef_cpuid7_ecx; // unused currently
+    uint32_t     sef_cpuid7_edx; // unused currently
+
    // cpuid function 0xB (processor topology)
    // ecx = 0
    uint32_t     tpl_cpuidB0_eax;
@ -303,6 +342,10 @@ protected:
    uint32_t     ext_cpuid8_ebx; // reserved
    ExtCpuid8Ecx ext_cpuid8_ecx;
    uint32_t     ext_cpuid8_edx; // reserved
+
+    // extended control register XCR0 (the XFEATURE_ENABLED_MASK register)
+    XemXcr0Eax   xem_xcr0_eax;
+    uint32_t     xem_xcr0_edx; // reserved
  };

  // The actual cpuid info block
@ -360,6 +403,14 @@ protected:
      result |= CPU_SSE4_2;
    if (_cpuid_info.std_cpuid1_ecx.bits.popcnt != 0)
      result |= CPU_POPCNT;
+    if (_cpuid_info.std_cpuid1_ecx.bits.avx != 0 &&
+        _cpuid_info.std_cpuid1_ecx.bits.osxsave != 0 &&
+        _cpuid_info.xem_xcr0_eax.bits.sse != 0 &&
+        _cpuid_info.xem_xcr0_eax.bits.ymm != 0) {
+      result |= CPU_AVX;
+      if (_cpuid_info.sef_cpuid7_ebx.bits.avx2 != 0)
+        result |= CPU_AVX2;
+    }
    if (_cpuid_info.std_cpuid1_edx.bits.tsc != 0)
      result |= CPU_TSC;
    if (_cpuid_info.ext_cpuid7_edx.bits.tsc_invariance != 0)
@ -386,6 +437,7 @@ public:
  static ByteSize std_cpuid0_offset() { return byte_offset_of(CpuidInfo, std_max_function); }
  static ByteSize std_cpuid1_offset() { return byte_offset_of(CpuidInfo, std_cpuid1_eax); }
  static ByteSize dcp_cpuid4_offset() { return byte_offset_of(CpuidInfo, dcp_cpuid4_eax); }
+  static ByteSize sef_cpuid7_offset() { return byte_offset_of(CpuidInfo, sef_cpuid7_eax); }  // byte offset of the cpuid(7) record (starts at sef_cpuid7_eax) inside CpuidInfo
  static ByteSize ext_cpuid1_offset() { return byte_offset_of(CpuidInfo, ext_cpuid1_eax); }
  static ByteSize ext_cpuid5_offset() { return byte_offset_of(CpuidInfo, ext_cpuid5_eax); }
  static ByteSize ext_cpuid7_offset() { return byte_offset_of(CpuidInfo, ext_cpuid7_eax); }
@ -393,6 +445,7 @@ public:
  static ByteSize tpl_cpuidB0_offset() { return byte_offset_of(CpuidInfo, tpl_cpuidB0_eax); }
  static ByteSize tpl_cpuidB1_offset() { return byte_offset_of(CpuidInfo, tpl_cpuidB1_eax); }
  static ByteSize tpl_cpuidB2_offset() { return byte_offset_of(CpuidInfo, tpl_cpuidB2_eax); }
+  static ByteSize xem_xcr0_offset() { return byte_offset_of(CpuidInfo, xem_xcr0_eax); }  // byte offset of the XCR0 record (starts at xem_xcr0_eax) inside CpuidInfo

  // Initialization
  static void initialize();
@ -483,6 +536,8 @@ public:
  static bool supports_sse4_1()   { return (_cpuFeatures & CPU_SSE4_1) != 0; }
  static bool supports_sse4_2()   { return (_cpuFeatures & CPU_SSE4_2) != 0; }
  static bool supports_popcnt()   { return (_cpuFeatures & CPU_POPCNT) != 0; }
+  static bool supports_avx()      { return (_cpuFeatures & CPU_AVX) != 0; }  // true iff the CPU_AVX bit is set in _cpuFeatures
+  static bool supports_avx2()     { return (_cpuFeatures & CPU_AVX2) != 0; }  // true iff the CPU_AVX2 bit is set in _cpuFeatures
  static bool supports_tsc()      { return (_cpuFeatures & CPU_TSC)    != 0; }

  // Intel features
--- a/hotspot/src/cpu/x86/vm/x86.ad
+++ b/hotspot/src/cpu/x86/vm/x86.ad
@ -0,0 +1,777 @@
+//
+// Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+//
+// This code is free software; you can redistribute it and/or modify it
+// under the terms of the GNU General Public License version 2 only, as
+// published by the Free Software Foundation.
+//
+// This code is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+// version 2 for more details (a copy is included in the LICENSE file that
+// accompanied this code).
+//
+// You should have received a copy of the GNU General Public License version
+// 2 along with this work; if not, write to the Free Software Foundation,
+// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+//
+// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+// or visit www.oracle.com if you need additional information or have any
+// questions.
+//
+//
+
+// X86 Common Architecture Description File
+
+source %{
+  // Float/double sign masks come from different places per platform: StubRoutines::x86 under _LP64, the *_pool symbols otherwise.
+#ifdef _LP64
+  static address float_signmask()  { return StubRoutines::x86::float_sign_mask(); }
+  static address float_signflip()  { return StubRoutines::x86::float_sign_flip(); }
+  static address double_signmask() { return StubRoutines::x86::double_sign_mask(); }
+  static address double_signflip() { return StubRoutines::x86::double_sign_flip(); }
+#else
+  static address float_signmask()  { return (address)float_signmask_pool; }
+  static address float_signflip()  { return (address)float_signflip_pool; }
+  static address double_signmask() { return (address)double_signmask_pool; }
+  static address double_signflip() { return (address)double_signflip_pool; }
+#endif
+%}
+
+// INSTRUCTIONS -- Platform independent definitions (same for 32- and 64-bit)
+
+instruct addF_reg(regF dst, regF src) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));  // SSE encoding only; the vaddF_* rules are the ones predicated on UseAVX > 0
+  match(Set dst (AddF dst src));  // dst is both the left input and the result
+
+  format %{ "addss   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ addss($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct addF_mem(regF dst, memory src) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (AddF dst (LoadF src)));
+
+  format %{ "addss   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ addss($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct addF_imm(regF dst, immF con) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (AddF dst con));
+  format %{ "addss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ addss($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vaddF_reg(regF dst, regF src1, regF src2) %{
+  predicate(UseAVX > 0);  // AVX counterpart of addF_reg, which requires UseAVX == 0
+  match(Set dst (AddF src1 src2));  // dst is a separate operand from both inputs in the match rule
+
+  format %{ "vaddss  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vaddF_mem(regF dst, regF src1, memory src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AddF src1 (LoadF src2)));
+
+  format %{ "vaddss  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vaddss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vaddF_imm(regF dst, regF src, immF con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AddF src con));
+
+  format %{ "vaddss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vaddss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct addD_reg(regD dst, regD src) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (AddD dst src));
+
+  format %{ "addsd   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ addsd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct addD_mem(regD dst, memory src) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (AddD dst (LoadD src)));
+
+  format %{ "addsd   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ addsd($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct addD_imm(regD dst, immD con) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (AddD dst con));
+  format %{ "addsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ addsd($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vaddD_reg(regD dst, regD src1, regD src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AddD src1 src2));
+
+  format %{ "vaddsd  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vaddD_mem(regD dst, regD src1, memory src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AddD src1 (LoadD src2)));
+
+  format %{ "vaddsd  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vaddsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vaddD_imm(regD dst, regD src, immD con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AddD src con));
+
+  format %{ "vaddsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vaddsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct subF_reg(regF dst, regF src) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (SubF dst src));
+
+  format %{ "subss   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ subss($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct subF_mem(regF dst, memory src) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (SubF dst (LoadF src)));
+
+  format %{ "subss   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ subss($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct subF_imm(regF dst, immF con) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (SubF dst con));
+  format %{ "subss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ subss($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vsubF_reg(regF dst, regF src1, regF src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (SubF src1 src2));
+
+  format %{ "vsubss  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vsubF_mem(regF dst, regF src1, memory src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (SubF src1 (LoadF src2)));
+
+  format %{ "vsubss  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vsubss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vsubF_imm(regF dst, regF src, immF con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (SubF src con));
+
+  format %{ "vsubss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vsubss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct subD_reg(regD dst, regD src) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (SubD dst src));
+
+  format %{ "subsd   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ subsd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct subD_mem(regD dst, memory src) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (SubD dst (LoadD src)));
+
+  format %{ "subsd   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ subsd($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct subD_imm(regD dst, immD con) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (SubD dst con));
+  format %{ "subsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ subsd($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vsubD_reg(regD dst, regD src1, regD src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (SubD src1 src2));
+
+  format %{ "vsubsd  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vsubD_mem(regD dst, regD src1, memory src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (SubD src1 (LoadD src2)));
+
+  format %{ "vsubsd  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vsubsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vsubD_imm(regD dst, regD src, immD con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (SubD src con));
+
+  format %{ "vsubsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vsubsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct mulF_reg(regF dst, regF src) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (MulF dst src));
+
+  format %{ "mulss   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ mulss($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct mulF_mem(regF dst, memory src) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (MulF dst (LoadF src)));
+
+  format %{ "mulss   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ mulss($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct mulF_imm(regF dst, immF con) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (MulF dst con));
+  format %{ "mulss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ mulss($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmulF_reg(regF dst, regF src1, regF src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulF src1 src2));
+
+  format %{ "vmulss  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmulF_mem(regF dst, regF src1, memory src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulF src1 (LoadF src2)));
+
+  format %{ "vmulss  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vmulss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmulF_imm(regF dst, regF src, immF con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulF src con));
+
+  format %{ "vmulss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vmulss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct mulD_reg(regD dst, regD src) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (MulD dst src));
+
+  format %{ "mulsd   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ mulsd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct mulD_mem(regD dst, memory src) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (MulD dst (LoadD src)));
+
+  format %{ "mulsd   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ mulsd($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct mulD_imm(regD dst, immD con) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (MulD dst con));
+  format %{ "mulsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ mulsd($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmulD_reg(regD dst, regD src1, regD src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulD src1 src2));
+
+  format %{ "vmulsd  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmulD_mem(regD dst, regD src1, memory src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulD src1 (LoadD src2)));
+
+  format %{ "vmulsd  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vmulsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vmulD_imm(regD dst, regD src, immD con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulD src con));
+
+  format %{ "vmulsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vmulsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct divF_reg(regF dst, regF src) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (DivF dst src));
+
+  format %{ "divss   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ divss($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct divF_mem(regF dst, memory src) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (DivF dst (LoadF src)));
+
+  format %{ "divss   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ divss($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct divF_imm(regF dst, immF con) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (DivF dst con));
+  format %{ "divss   $dst, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ divss($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vdivF_reg(regF dst, regF src1, regF src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (DivF src1 src2));
+
+  format %{ "vdivss  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vdivF_mem(regF dst, regF src1, memory src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (DivF src1 (LoadF src2)));
+
+  format %{ "vdivss  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vdivss($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vdivF_imm(regF dst, regF src, immF con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (DivF src con));
+
+  format %{ "vdivss  $dst, $src, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vdivss($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct divD_reg(regD dst, regD src) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (DivD dst src));
+
+  format %{ "divsd   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ divsd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct divD_mem(regD dst, memory src) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (DivD dst (LoadD src)));
+
+  format %{ "divsd   $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ divsd($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct divD_imm(regD dst, immD con) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (DivD dst con));
+  format %{ "divsd   $dst, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ divsd($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vdivD_reg(regD dst, regD src1, regD src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (DivD src1 src2));
+
+  format %{ "vdivsd  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vdivD_mem(regD dst, regD src1, memory src2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (DivD src1 (LoadD src2)));
+
+  format %{ "vdivsd  $dst, $src1, $src2" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vdivsd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vdivD_imm(regD dst, regD src, immD con) %{
+  predicate(UseAVX > 0);
+  match(Set dst (DivD src con));
+
+  format %{ "vdivsd  $dst, $src, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ vdivsd($dst$$XMMRegister, $src$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct absF_reg(regF dst) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (AbsF dst));
+  ins_cost(150);
+  format %{ "andps   $dst, [0x7fffffff]\t# abs float by sign masking" %}
+  ins_encode %{
+    __ andps($dst$$XMMRegister, ExternalAddress(float_signmask()));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vabsF_reg(regF dst, regF src) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AbsF src));
+  ins_cost(150);
+  format %{ "vandps  $dst, $src, [0x7fffffff]\t# abs float by sign masking" %}
+  ins_encode %{
+    __ vandps($dst$$XMMRegister, $src$$XMMRegister,
+              ExternalAddress(float_signmask()));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct absD_reg(regD dst) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (AbsD dst));
+  ins_cost(150);
+  format %{ "andpd   $dst, [0x7fffffffffffffff]\t"
+            "# abs double by sign masking" %}
+  ins_encode %{
+    __ andpd($dst$$XMMRegister, ExternalAddress(double_signmask()));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vabsD_reg(regD dst, regD src) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AbsD src));
+  ins_cost(150);
+  format %{ "vandpd  $dst, $src, [0x7fffffffffffffff]\t"
+            "# abs double by sign masking" %}
+  ins_encode %{
+    __ vandpd($dst$$XMMRegister, $src$$XMMRegister,
+              ExternalAddress(double_signmask()));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct negF_reg(regF dst) %{
+  predicate((UseSSE>=1) && (UseAVX == 0));
+  match(Set dst (NegF dst));
+  ins_cost(150);
+  format %{ "xorps   $dst, [0x80000000]\t# neg float by sign flipping" %}
+  ins_encode %{
+    __ xorps($dst$$XMMRegister, ExternalAddress(float_signflip()));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vnegF_reg(regF dst, regF src) %{
+  predicate(UseAVX > 0);
+  match(Set dst (NegF src));
+  ins_cost(150);
+  format %{ "vxorps  $dst, $src, [0x80000000]\t# neg float by sign flipping" %}
+  ins_encode %{
+    __ vxorps($dst$$XMMRegister, $src$$XMMRegister,
+              ExternalAddress(float_signflip()));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct negD_reg(regD dst) %{
+  predicate((UseSSE>=2) && (UseAVX == 0));
+  match(Set dst (NegD dst));
+  ins_cost(150);
+  format %{ "xorpd   $dst, [0x8000000000000000]\t"
+            "# neg double by sign flipping" %}
+  ins_encode %{
+    __ xorpd($dst$$XMMRegister, ExternalAddress(double_signflip()));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct vnegD_reg(regD dst, regD src) %{
+  predicate(UseAVX > 0);
+  match(Set dst (NegD src));
+  ins_cost(150);
+  format %{ "vxorpd  $dst, $src, [0x8000000000000000]\t"
+            "# neg double by sign flipping" %}
+  ins_encode %{
+    __ vxorpd($dst$$XMMRegister, $src$$XMMRegister,
+              ExternalAddress(double_signflip()));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct sqrtF_reg(regF dst, regF src) %{
+  predicate(UseSSE>=1);  // unlike the add/sub/mul/div rules, this predicate has no UseAVX term
+  match(Set dst (ConvD2F (SqrtD (ConvF2D src))));  // whole float->double->SqrtD->float tree is encoded as one sqrtss
+
+  format %{ "sqrtss  $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ sqrtss($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct sqrtF_mem(regF dst, memory src) %{
+  predicate(UseSSE>=1);
+  match(Set dst (ConvD2F (SqrtD (ConvF2D (LoadF src)))));
+
+  format %{ "sqrtss  $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ sqrtss($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct sqrtF_imm(regF dst, immF con) %{
+  predicate(UseSSE>=1);
+  match(Set dst (ConvD2F (SqrtD (ConvF2D con))));
+  format %{ "sqrtss  $dst, [$constantaddress]\t# load from constant table: float=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ sqrtss($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct sqrtD_reg(regD dst, regD src) %{
+  predicate(UseSSE>=2);
+  match(Set dst (SqrtD src));
+
+  format %{ "sqrtsd  $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ sqrtsd($dst$$XMMRegister, $src$$XMMRegister);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct sqrtD_mem(regD dst, memory src) %{
+  predicate(UseSSE>=2);
+  match(Set dst (SqrtD (LoadD src)));
+
+  format %{ "sqrtsd  $dst, $src" %}
+  ins_cost(150);
+  ins_encode %{
+    __ sqrtsd($dst$$XMMRegister, $src$$Address);
+  %}
+  ins_pipe(pipe_slow);
+%}
+
+instruct sqrtD_imm(regD dst, immD con) %{
+  predicate(UseSSE>=2);
+  match(Set dst (SqrtD con));
+  format %{ "sqrtsd  $dst, [$constantaddress]\t# load from constant table: double=$con" %}
+  ins_cost(150);
+  ins_encode %{
+    __ sqrtsd($dst$$XMMRegister, $constantaddress($con));
+  %}
+  ins_pipe(pipe_slow);
+%}
+
--- a/hotspot/src/cpu/x86/vm/x86_32.ad
+++ b/hotspot/src/cpu/x86/vm/x86_32.ad
--- a/hotspot/src/cpu/x86/vm/x86_64.ad
+++ b/hotspot/src/cpu/x86/vm/x86_64.ad
--- a/hotspot/src/os/bsd/vm/os_bsd.cpp
+++ b/hotspot/src/os/bsd/vm/os_bsd.cpp
@ -2835,7 +2835,7 @@ void os::realign_memory(char *addr, size_t bytes, size_t alignment_hint) {
 #endif
 }

-void os::free_memory(char *addr, size_t bytes) {
+void os::free_memory(char *addr, size_t bytes, size_t alignment_hint) {  // alignment_hint is not referenced in this body
  ::madvise(addr, bytes, MADV_DONTNEED);
 }

--- a/hotspot/src/os/linux/vm/os_linux.cpp
+++ b/hotspot/src/os/linux/vm/os_linux.cpp
@ -2546,8 +2546,8 @@ void os::realign_memory(char *addr, size_t bytes, size_t alignment_hint) {
  }
 }

-void os::free_memory(char *addr, size_t bytes) {
-  commit_memory(addr, bytes, false);
+void os::free_memory(char *addr, size_t bytes, size_t alignment_hint) {
+  commit_memory(addr, bytes, alignment_hint, false);
 }

 void os::numa_make_global(char *addr, size_t bytes) {
--- a/hotspot/src/os/solaris/vm/os_solaris.cpp
+++ b/hotspot/src/os/solaris/vm/os_solaris.cpp
@ -2821,7 +2821,7 @@ bool os::commit_memory(char* addr, size_t bytes, size_t alignment_hint,
 }

 // Uncommit the pages in a specified region.
-void os::free_memory(char* addr, size_t bytes) {
+void os::free_memory(char* addr, size_t bytes, size_t alignment_hint) {
  if (madvise(addr, bytes, MADV_FREE) < 0) {
    debug_only(warning("MADV_FREE failed."));
    return;
--- a/hotspot/src/os/windows/vm/os_windows.cpp
+++ b/hotspot/src/os/windows/vm/os_windows.cpp
@ -3137,7 +3137,7 @@ bool os::unguard_memory(char* addr, size_t bytes) {
 }

 void os::realign_memory(char *addr, size_t bytes, size_t alignment_hint) { }
-void os::free_memory(char *addr, size_t bytes)         { }
+void os::free_memory(char *addr, size_t bytes, size_t alignment_hint)    { }
 void os::numa_make_global(char *addr, size_t bytes)    { }
 void os::numa_make_local(char *addr, size_t bytes, int lgrp_hint)    { }
 bool os::numa_topology_changed()                       { return false; }
--- a/hotspot/src/share/vm/adlc/formssel.cpp
+++ b/hotspot/src/share/vm/adlc/formssel.cpp
@ -627,6 +627,7 @@ bool InstructForm::is_wide_memory_kill(FormDict &globals) const {
  if( strcmp(_matrule->_opType,"MemBarAcquire") == 0 ) return true;
  if( strcmp(_matrule->_opType,"MemBarReleaseLock") == 0 ) return true;
  if( strcmp(_matrule->_opType,"MemBarAcquireLock") == 0 ) return true;
+  if( strcmp(_matrule->_opType,"MemBarStoreStore") == 0 ) return true;

  return false;
 }
@ -3978,7 +3979,8 @@ bool MatchRule::is_ideal_membar() const {
    !strcmp(_opType,"MemBarAcquireLock") ||
    !strcmp(_opType,"MemBarReleaseLock") ||
    !strcmp(_opType,"MemBarVolatile" ) ||
-    !strcmp(_opType,"MemBarCPUOrder" ) ;
+    !strcmp(_opType,"MemBarCPUOrder" ) ||
+    !strcmp(_opType,"MemBarStoreStore" );
 }

 bool MatchRule::is_ideal_loadPC() const {
--- a/hotspot/src/share/vm/asm/assembler.cpp
+++ b/hotspot/src/share/vm/asm/assembler.cpp
@ -61,6 +61,7 @@ AbstractAssembler::AbstractAssembler(CodeBuffer* code) {
  _code_limit  = cs->limit();
  _code_pos    = cs->end();
  _oop_recorder= code->oop_recorder();
+  DEBUG_ONLY( _short_branch_delta = 0; )
  if (_code_begin == NULL)  {
    vm_exit_out_of_memory(0, err_msg("CodeCache: no room for %s",
                                     code->name()));
--- a/hotspot/src/share/vm/asm/assembler.hpp
+++ b/hotspot/src/share/vm/asm/assembler.hpp
@ -241,6 +241,33 @@ class AbstractAssembler : public ResourceObj  {
  // Make it return true on platforms which need to verify
  // instruction boundaries for some operations.
  inline static bool pd_check_instruction_mark();
+
+  // Add delta to short branch distance to verify that it still fits into imm8.
+  int _short_branch_delta;
+
+  int  short_branch_delta() const { return _short_branch_delta; }
+  void set_short_branch_delta()   { _short_branch_delta = 32; }  // arm: fixed delta of 32
+  void clear_short_branch_delta() { _short_branch_delta = 0; }   // disarm: delta back to 0
+
+  class ShortBranchVerifier: public StackObj {  // sets the delta on construction, clears it on destruction
+   private:
+    AbstractAssembler* _assm;
+
+   public:
+    ShortBranchVerifier(AbstractAssembler* assm) : _assm(assm) {
+      assert(assm->short_branch_delta() == 0, "overlapping instructions");  // fires if the delta is already set (presumably a nested verifier)
+      _assm->set_short_branch_delta();
+    }
+    ~ShortBranchVerifier() {
+      _assm->clear_short_branch_delta();
+    }
+  };
+  #else
+  // Dummy in product: same constructor signature, empty body.
+  class ShortBranchVerifier: public StackObj {
+   public:
+    ShortBranchVerifier(AbstractAssembler* assm) {}
+  };
  #endif

  // Label functions
--- a/hotspot/src/share/vm/c1/c1_LIR.cpp
+++ b/hotspot/src/share/vm/c1/c1_LIR.cpp
@ -854,6 +854,9 @@ void LIR_OpVisitState::visit(LIR_Op* op) {
      if (opTypeCheck->_info_for_exception)       do_info(opTypeCheck->_info_for_exception);
      if (opTypeCheck->_info_for_patch)           do_info(opTypeCheck->_info_for_patch);
      if (opTypeCheck->_object->is_valid())       do_input(opTypeCheck->_object);
+      if (op->code() == lir_store_check && opTypeCheck->_object->is_valid()) {
+        do_temp(opTypeCheck->_object);
+      }
      if (opTypeCheck->_array->is_valid())        do_input(opTypeCheck->_array);
      if (opTypeCheck->_tmp1->is_valid())         do_temp(opTypeCheck->_tmp1);
      if (opTypeCheck->_tmp2->is_valid())         do_temp(opTypeCheck->_tmp2);
--- a/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp
+++ b/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp
@ -1256,8 +1256,7 @@ void LIRGenerator::do_getClass(Intrinsic* x) {
    info = state_for(x);
  }
  __ move(new LIR_Address(rcvr.result(), oopDesc::klass_offset_in_bytes(), T_OBJECT), result, info);
-  __ move_wide(new LIR_Address(result, Klass::java_mirror_offset_in_bytes() +
-                               klassOopDesc::klass_part_offset_in_bytes(), T_OBJECT), result);
+  __ move_wide(new LIR_Address(result, in_bytes(Klass::java_mirror_offset()), T_OBJECT), result);
 }


--- a/hotspot/src/share/vm/c1/c1_Optimizer.cpp
+++ b/hotspot/src/share/vm/c1/c1_Optimizer.cpp
@ -122,18 +122,32 @@ void CE_Eliminator::block_do(BlockBegin* block) {
  if (sux != f_goto->default_sux()) return;

  // check if at least one word was pushed on sux_state
+  // inlining depths must match
+  ValueStack* if_state = if_->state();
  ValueStack* sux_state = sux->state();
-  if (sux_state->stack_size() <= if_->state()->stack_size()) return;
+  if (if_state->scope()->level() > sux_state->scope()->level()) {
+    while (sux_state->scope() != if_state->scope()) {
+      if_state = if_state->caller_state();
+      assert(if_state != NULL, "states do not match up");
+    }
+  } else if (if_state->scope()->level() < sux_state->scope()->level()) {
+    while (sux_state->scope() != if_state->scope()) {
+      sux_state = sux_state->caller_state();
+      assert(sux_state != NULL, "states do not match up");
+    }
+  }
+
+  if (sux_state->stack_size() <= if_state->stack_size()) return;

  // check if phi function is present at end of successor stack and that
  // only this phi was pushed on the stack
-  Value sux_phi = sux_state->stack_at(if_->state()->stack_size());
+  Value sux_phi = sux_state->stack_at(if_state->stack_size());
  if (sux_phi == NULL || sux_phi->as_Phi() == NULL || sux_phi->as_Phi()->block() != sux) return;
-  if (sux_phi->type()->size() != sux_state->stack_size() - if_->state()->stack_size()) return;
+  if (sux_phi->type()->size() != sux_state->stack_size() - if_state->stack_size()) return;

  // get the values that were pushed in the true- and false-branch
-  Value t_value = t_goto->state()->stack_at(if_->state()->stack_size());
-  Value f_value = f_goto->state()->stack_at(if_->state()->stack_size());
+  Value t_value = t_goto->state()->stack_at(if_state->stack_size());
+  Value f_value = f_goto->state()->stack_at(if_state->stack_size());

  // backend does not support floats
  assert(t_value->type()->base() == f_value->type()->base(), "incompatible types");
@ -180,11 +194,7 @@ void CE_Eliminator::block_do(BlockBegin* block) {
  Goto* goto_ = new Goto(sux, state_before, if_->is_safepoint() || t_goto->is_safepoint() || f_goto->is_safepoint());

  // prepare state for Goto
-  ValueStack* goto_state = if_->state();
-  while (sux_state->scope() != goto_state->scope()) {
-    goto_state = goto_state->caller_state();
-    assert(goto_state != NULL, "states do not match up");
-  }
+  ValueStack* goto_state = if_state;
  goto_state = goto_state->copy(ValueStack::StateAfter, goto_state->bci());
  goto_state->push(result->type(), result);
  assert(goto_state->is_same(sux_state), "states must match now");
--- a/hotspot/src/share/vm/ci/ciInstanceKlass.cpp
+++ b/hotspot/src/share/vm/ci/ciInstanceKlass.cpp
@ -54,7 +54,7 @@ ciInstanceKlass::ciInstanceKlass(KlassHandle h_k) :
  _flags = ciFlags(access_flags);
  _has_finalizer = access_flags.has_finalizer();
  _has_subklass = ik->subklass() != NULL;
-  _init_state = (instanceKlass::ClassState)ik->get_init_state();
+  _init_state = ik->init_state();
  _nonstatic_field_size = ik->nonstatic_field_size();
  _has_nonstatic_fields = ik->has_nonstatic_fields();
  _nonstatic_fields = NULL; // initialized lazily by compute_nonstatic_fields:
@ -118,7 +118,7 @@ ciInstanceKlass::ciInstanceKlass(ciSymbol* name,
 void ciInstanceKlass::compute_shared_init_state() {
  GUARDED_VM_ENTRY(
    instanceKlass* ik = get_instanceKlass();
-    _init_state = (instanceKlass::ClassState)ik->get_init_state();
+    _init_state = ik->init_state();
  )
 }

--- a/hotspot/src/share/vm/classfile/classFileParser.cpp
+++ b/hotspot/src/share/vm/classfile/classFileParser.cpp
@ -1051,7 +1051,7 @@ static FieldAllocationType basic_type_to_atype(bool is_static, BasicType type) {

 class FieldAllocationCount: public ResourceObj {
 public:
-  unsigned int count[MAX_FIELD_ALLOCATION_TYPE];
+  u2 count[MAX_FIELD_ALLOCATION_TYPE];

  FieldAllocationCount() {
    for (int i = 0; i < MAX_FIELD_ALLOCATION_TYPE; i++) {
@ -1061,6 +1061,8 @@ class FieldAllocationCount: public ResourceObj {

  FieldAllocationType update(bool is_static, BasicType type) {
    FieldAllocationType atype = basic_type_to_atype(is_static, type);
+    // Make sure there is no overflow with injected fields.
+    assert(count[atype] < 0xFFFF, "More than 65535 fields");
    count[atype]++;
    return atype;
  }
@ -1071,7 +1073,7 @@ typeArrayHandle ClassFileParser::parse_fields(Symbol* class_name,
                                              constantPoolHandle cp, bool is_interface,
                                              FieldAllocationCount *fac,
                                              objArrayHandle* fields_annotations,
-                                              int* java_fields_count_ptr, TRAPS) {
+                                              u2* java_fields_count_ptr, TRAPS) {
  ClassFileStream* cfs = stream();
  typeArrayHandle nullHandle;
  cfs->guarantee_more(2, CHECK_(nullHandle));  // length
@ -2866,7 +2868,7 @@ instanceKlassHandle ClassFileParser::parseClassFile(Symbol* name,
      local_interfaces = parse_interfaces(cp, itfs_len, class_loader, protection_domain, _class_name, CHECK_(nullHandle));
    }

-    int java_fields_count = 0;
+    u2 java_fields_count = 0;
    // Fields (offsets are filled in later)
    FieldAllocationCount fac;
    objArrayHandle fields_annotations;
--- a/hotspot/src/share/vm/classfile/classFileParser.hpp
+++ b/hotspot/src/share/vm/classfile/classFileParser.hpp
@ -91,7 +91,7 @@ class ClassFileParser VALUE_OBJ_CLASS_SPEC {
                               constantPoolHandle cp, bool is_interface,
                               FieldAllocationCount *fac,
                               objArrayHandle* fields_annotations,
-                               int* java_fields_count_ptr, TRAPS);
+                               u2* java_fields_count_ptr, TRAPS);

  // Method parsing
  methodHandle parse_method(constantPoolHandle cp, bool is_interface,
--- a/hotspot/src/share/vm/code/dependencies.cpp
+++ b/hotspot/src/share/vm/code/dependencies.cpp
@ -1631,7 +1631,7 @@ void KlassDepChange::initialize() {
  for (ContextStream str(*this); str.next(); ) {
    klassOop d = str.klass();
    assert(!instanceKlass::cast(d)->is_marked_dependent(), "checking");
-    instanceKlass::cast(d)->set_is_marked_dependent(true);
+    instanceKlass::cast(d)->set_is_marked_dependent();
  }
 }

@ -1640,7 +1640,7 @@ KlassDepChange::~KlassDepChange() {
  // Unmark transitive interfaces
  for (ContextStream str(*this); str.next(); ) {
    klassOop d = str.klass();
-    instanceKlass::cast(d)->set_is_marked_dependent(false);
+    instanceKlass::cast(d)->clear_is_marked_dependent();
  }
 }

--- a/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/compactibleFreeListSpace.cpp
+++ b/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/compactibleFreeListSpace.cpp
@ -2598,7 +2598,7 @@ void CompactibleFreeListSpace::printFLCensus(size_t sweep_count) const {
 AdaptiveWeightedAverage CFLS_LAB::_blocks_to_claim[]    =
  VECTOR_257(AdaptiveWeightedAverage(OldPLABWeight, (float)CMSParPromoteBlocksToClaim));
 size_t CFLS_LAB::_global_num_blocks[]  = VECTOR_257(0);
-int    CFLS_LAB::_global_num_workers[] = VECTOR_257(0);
+uint   CFLS_LAB::_global_num_workers[] = VECTOR_257(0);

 CFLS_LAB::CFLS_LAB(CompactibleFreeListSpace* cfls) :
  _cfls(cfls)
@ -2732,7 +2732,7 @@ void CFLS_LAB::retire(int tid) {
        // Update globals stats for num_blocks used
        _global_num_blocks[i] += (_num_blocks[i] - num_retire);
        _global_num_workers[i]++;
-        assert(_global_num_workers[i] <= (ssize_t)ParallelGCThreads, "Too big");
+        assert(_global_num_workers[i] <= ParallelGCThreads, "Too big");
        if (num_retire > 0) {
          _cfls->_indexedFreeList[i].prepend(&_indexedFreeList[i]);
          // Reset this list.
--- a/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/compactibleFreeListSpace.hpp
+++ b/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/compactibleFreeListSpace.hpp
@ -631,7 +631,7 @@ class CFLS_LAB : public CHeapObj {
  static AdaptiveWeightedAverage
                 _blocks_to_claim  [CompactibleFreeListSpace::IndexSetSize];
  static size_t _global_num_blocks [CompactibleFreeListSpace::IndexSetSize];
-  static int    _global_num_workers[CompactibleFreeListSpace::IndexSetSize];
+  static uint   _global_num_workers[CompactibleFreeListSpace::IndexSetSize];
  size_t        _num_blocks        [CompactibleFreeListSpace::IndexSetSize];

  // Internal work method
--- a/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp
+++ b/hotspot/src/share/vm/gc_implementation/concurrentMarkSweep/concurrentMarkSweepGeneration.cpp
@ -3779,7 +3779,7 @@ class CMSConcMarkingTask: public YieldingFlexibleGangTask {
    terminator()->reset_for_reuse(active_workers);
  }

-  void work(int i);
+  void work(uint worker_id);
  bool should_yield() {
    return    ConcurrentMarkSweepThread::should_yield()
           && !_collector->foregroundGCIsActive()
@ -3852,7 +3852,7 @@ void CMSConcMarkingTerminator::yield() {
 //    . if neither is available, offer termination
 // -- Terminate and return result
 //
-void CMSConcMarkingTask::work(int i) {
+void CMSConcMarkingTask::work(uint worker_id) {
  elapsedTimer _timer;
  ResourceMark rm;
  HandleMark hm;
@ -3860,37 +3860,40 @@ void CMSConcMarkingTask::work(int i) {
  DEBUG_ONLY(_collector->verify_overflow_empty();)

  // Before we begin work, our work queue should be empty
-  assert(work_queue(i)->size() == 0, "Expected to be empty");
+  assert(work_queue(worker_id)->size() == 0, "Expected to be empty");
  // Scan the bitmap covering _cms_space, tracing through grey objects.
  _timer.start();
-  do_scan_and_mark(i, _cms_space);
+  do_scan_and_mark(worker_id, _cms_space);
  _timer.stop();
  if (PrintCMSStatistics != 0) {
    gclog_or_tty->print_cr("Finished cms space scanning in %dth thread: %3.3f sec",
-      i, _timer.seconds()); // XXX: need xxx/xxx type of notation, two timers
+      worker_id, _timer.seconds());
+      // XXX: need xxx/xxx type of notation, two timers
  }

  // ... do the same for the _perm_space
  _timer.reset();
  _timer.start();
-  do_scan_and_mark(i, _perm_space);
+  do_scan_and_mark(worker_id, _perm_space);
  _timer.stop();
  if (PrintCMSStatistics != 0) {
    gclog_or_tty->print_cr("Finished perm space scanning in %dth thread: %3.3f sec",
-      i, _timer.seconds()); // XXX: need xxx/xxx type of notation, two timers
+      worker_id, _timer.seconds());
+      // XXX: need xxx/xxx type of notation, two timers
  }

  // ... do work stealing
  _timer.reset();
  _timer.start();
-  do_work_steal(i);
+  do_work_steal(worker_id);
  _timer.stop();
  if (PrintCMSStatistics != 0) {
    gclog_or_tty->print_cr("Finished work stealing in %dth thread: %3.3f sec",
-      i, _timer.seconds()); // XXX: need xxx/xxx type of notation, two timers
+      worker_id, _timer.seconds());
+      // XXX: need xxx/xxx type of notation, two timers
  }
  assert(_collector->_markStack.isEmpty(), "Should have been emptied");
-  assert(work_queue(i)->size() == 0, "Should have been emptied");
+  assert(work_queue(worker_id)->size() == 0, "Should have been emptied");
  // Note that under the current task protocol, the
  // following assertion is true even of the spaces
  // expanded since the completion of the concurrent
@ -3946,7 +3949,7 @@ void CMSConcMarkingTask::do_scan_and_mark(int i, CompactibleFreeListSpace* sp) {
  // We allow that there may be no tasks to do here because
  // we are restarting after a stack overflow.
  assert(pst->valid() || n_tasks == 0, "Uninitialized use?");
-  int nth_task = 0;
+  uint nth_task = 0;

  HeapWord* aligned_start = sp->bottom();
  if (sp->used_region().contains(_restart_addr)) {
@ -5075,7 +5078,7 @@ class CMSParRemarkTask: public AbstractGangTask {
  ParallelTaskTerminator* terminator() { return &_term; }
  int n_workers() { return _n_workers; }

-  void work(int i);
+  void work(uint worker_id);

 private:
  // Work method in support of parallel rescan ... of young gen spaces
@ -5096,7 +5099,7 @@ class CMSParRemarkTask: public AbstractGangTask {
 // also is passed to do_dirty_card_rescan_tasks() and to
 // do_work_steal() to select the i-th task_queue.

-void CMSParRemarkTask::work(int i) {
+void CMSParRemarkTask::work(uint worker_id) {
  elapsedTimer _timer;
  ResourceMark rm;
  HandleMark   hm;
@ -5107,7 +5110,7 @@ void CMSParRemarkTask::work(int i) {
  Par_MarkRefsIntoAndScanClosure par_mrias_cl(_collector,
    _collector->_span, _collector->ref_processor(),
    &(_collector->_markBitMap),
-    work_queue(i), &(_collector->_revisitStack));
+    work_queue(worker_id), &(_collector->_revisitStack));

  // Rescan young gen roots first since these are likely
  // coarsely partitioned and may, on that account, constitute
@ -5128,15 +5131,15 @@ void CMSParRemarkTask::work(int i) {
    assert(ect <= _collector->_eden_chunk_capacity, "out of bounds");
    assert(sct <= _collector->_survivor_chunk_capacity, "out of bounds");

-    do_young_space_rescan(i, &par_mrias_cl, to_space, NULL, 0);
-    do_young_space_rescan(i, &par_mrias_cl, from_space, sca, sct);
-    do_young_space_rescan(i, &par_mrias_cl, eden_space, eca, ect);
+    do_young_space_rescan(worker_id, &par_mrias_cl, to_space, NULL, 0);
+    do_young_space_rescan(worker_id, &par_mrias_cl, from_space, sca, sct);
+    do_young_space_rescan(worker_id, &par_mrias_cl, eden_space, eca, ect);

    _timer.stop();
    if (PrintCMSStatistics != 0) {
      gclog_or_tty->print_cr(
        "Finished young gen rescan work in %dth thread: %3.3f sec",
-        i, _timer.seconds());
+        worker_id, _timer.seconds());
    }
  }

@ -5158,7 +5161,7 @@ void CMSParRemarkTask::work(int i) {
  if (PrintCMSStatistics != 0) {
    gclog_or_tty->print_cr(
      "Finished remaining root rescan work in %dth thread: %3.3f sec",
-      i, _timer.seconds());
+      worker_id, _timer.seconds());
  }

  // ---------- rescan dirty cards ------------
@ -5167,26 +5170,26 @@ void CMSParRemarkTask::work(int i) {

  // Do the rescan tasks for each of the two spaces
  // (cms_space and perm_space) in turn.
-  // "i" is passed to select the "i-th" task_queue
-  do_dirty_card_rescan_tasks(_cms_space, i, &par_mrias_cl);
-  do_dirty_card_rescan_tasks(_perm_space, i, &par_mrias_cl);
+  // "worker_id" is passed to select the task_queue for "worker_id"
+  do_dirty_card_rescan_tasks(_cms_space, worker_id, &par_mrias_cl);
+  do_dirty_card_rescan_tasks(_perm_space, worker_id, &par_mrias_cl);
  _timer.stop();
  if (PrintCMSStatistics != 0) {
    gclog_or_tty->print_cr(
      "Finished dirty card rescan work in %dth thread: %3.3f sec",
-      i, _timer.seconds());
+      worker_id, _timer.seconds());
  }

  // ---------- steal work from other threads ...
  // ---------- ... and drain overflow list.
  _timer.reset();
  _timer.start();
-  do_work_steal(i, &par_mrias_cl, _collector->hash_seed(i));
+  do_work_steal(worker_id, &par_mrias_cl, _collector->hash_seed(worker_id));
  _timer.stop();
  if (PrintCMSStatistics != 0) {
    gclog_or_tty->print_cr(
      "Finished work stealing in %dth thread: %3.3f sec",
-      i, _timer.seconds());
+      worker_id, _timer.seconds());
  }
 }

@ -5207,8 +5210,8 @@ CMSParRemarkTask::do_young_space_rescan(int i,
  SequentialSubTasksDone* pst = space->par_seq_tasks();
  assert(pst->valid(), "Uninitialized use?");

-  int nth_task = 0;
-  int n_tasks  = pst->n_tasks();
+  uint nth_task = 0;
+  uint n_tasks  = pst->n_tasks();

  HeapWord *start, *end;
  while (!pst->is_task_claimed(/* reference */ nth_task)) {
@ -5220,12 +5223,12 @@ CMSParRemarkTask::do_young_space_rescan(int i,
    } else if (nth_task == 0) {
      start = space->bottom();
      end   = chunk_array[nth_task];
-    } else if (nth_task < (jint)chunk_top) {
+    } else if (nth_task < (uint)chunk_top) {
      assert(nth_task >= 1, "Control point invariant");
      start = chunk_array[nth_task - 1];
      end   = chunk_array[nth_task];
    } else {
-      assert(nth_task == (jint)chunk_top, "Control point invariant");
+      assert(nth_task == (uint)chunk_top, "Control point invariant");
      start = chunk_array[chunk_top - 1];
      end   = space->top();
    }
@ -5288,7 +5291,7 @@ CMSParRemarkTask::do_dirty_card_rescan_tasks(

  SequentialSubTasksDone* pst = sp->conc_par_seq_tasks();
  assert(pst->valid(), "Uninitialized use?");
-  int nth_task = 0;
+  uint nth_task = 0;
  const int alignment = CardTableModRefBS::card_size * BitsPerWord;
  MemRegion span = sp->used_region();
  HeapWord* start_addr = span.start();
@ -5736,26 +5739,26 @@ public:
                     CMSParKeepAliveClosure* keep_alive,
                     int* seed);

-  virtual void work(int i);
+  virtual void work(uint worker_id);
 };

-void CMSRefProcTaskProxy::work(int i) {
+void CMSRefProcTaskProxy::work(uint worker_id) {
  assert(_collector->_span.equals(_span), "Inconsistency in _span");
  CMSParKeepAliveClosure par_keep_alive(_collector, _span,
                                        _mark_bit_map,
                                        &_collector->_revisitStack,
-                                        work_queue(i));
+                                        work_queue(worker_id));
  CMSParDrainMarkingStackClosure par_drain_stack(_collector, _span,
                                                 _mark_bit_map,
                                                 &_collector->_revisitStack,
-                                                 work_queue(i));
+                                                 work_queue(worker_id));
  CMSIsAliveClosure is_alive_closure(_span, _mark_bit_map);
-  _task.work(i, is_alive_closure, par_keep_alive, par_drain_stack);
+  _task.work(worker_id, is_alive_closure, par_keep_alive, par_drain_stack);
  if (_task.marks_oops_alive()) {
-    do_work_steal(i, &par_drain_stack, &par_keep_alive,
-                  _collector->hash_seed(i));
+    do_work_steal(worker_id, &par_drain_stack, &par_keep_alive,
+                  _collector->hash_seed(worker_id));
  }
-  assert(work_queue(i)->size() == 0, "work_queue should be empty");
+  assert(work_queue(worker_id)->size() == 0, "work_queue should be empty");
  assert(_collector->_overflow_list == NULL, "non-empty _overflow_list");
 }

@ -5769,9 +5772,9 @@ public:
      _task(task)
  { }

-  virtual void work(int i)
+  virtual void work(uint worker_id)
  {
-    _task.work(i);
+    _task.work(worker_id);
  }
 };

--- a/hotspot/src/share/vm/gc_implementation/g1/collectionSetChooser.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/collectionSetChooser.cpp
@ -264,7 +264,7 @@ prepareForAddMarkedHeapRegionsPar(size_t n_regions, size_t chunkSize) {
    // or some improperly initialized variable with leads to no
    // active threads, protect against that in a product build.
    n_threads = MAX2(G1CollectedHeap::heap()->workers()->active_workers(),
-                     1);
+                     1U);
  }
  size_t max_waste = n_threads * chunkSize;
  // it should be aligned with respect to chunkSize
--- a/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.cpp
@ -458,8 +458,8 @@ bool ConcurrentMark::not_yet_marked(oop obj) const {
 #pragma warning( disable:4355 ) // 'this' : used in base member initializer list
 #endif // _MSC_VER

-size_t ConcurrentMark::scale_parallel_threads(size_t n_par_threads) {
-  return MAX2((n_par_threads + 2) / 4, (size_t)1);
+uint ConcurrentMark::scale_parallel_threads(uint n_par_threads) {
+  return MAX2((n_par_threads + 2) / 4, 1U);
 }

 ConcurrentMark::ConcurrentMark(ReservedSpace rs,
@ -486,7 +486,7 @@ ConcurrentMark::ConcurrentMark(ReservedSpace rs,
  _regionStack(),
  // _finger set in set_non_marking_state

-  _max_task_num(MAX2(ParallelGCThreads, (size_t)1)),
+  _max_task_num(MAX2((uint)ParallelGCThreads, 1U)),
  // _active_tasks set in set_non_marking_state
  // _tasks set inside the constructor
  _task_queues(new CMTaskQueueSet((int) _max_task_num)),
@ -506,7 +506,6 @@ ConcurrentMark::ConcurrentMark(ReservedSpace rs,
  _cleanup_times(),
  _total_counting_time(0.0),
  _total_rs_scrub_time(0.0),
-
  _parallel_workers(NULL) {
  CMVerboseLevel verbose_level = (CMVerboseLevel) G1MarkingVerboseLevel;
  if (verbose_level < no_verbose) {
@ -568,7 +567,7 @@ ConcurrentMark::ConcurrentMark(ReservedSpace rs,
      // notice that ConcGCThreads overwrites G1MarkingOverheadPercent
      // if both are set

-      _parallel_marking_threads = ConcGCThreads;
+      _parallel_marking_threads = (uint) ConcGCThreads;
      _max_parallel_marking_threads = _parallel_marking_threads;
      _sleep_factor             = 0.0;
      _marking_task_overhead    = 1.0;
@ -589,12 +588,12 @@ ConcurrentMark::ConcurrentMark(ReservedSpace rs,
      double sleep_factor =
                         (1.0 - marking_task_overhead) / marking_task_overhead;

-      _parallel_marking_threads = (size_t) marking_thread_num;
+      _parallel_marking_threads = (uint) marking_thread_num;
      _max_parallel_marking_threads = _parallel_marking_threads;
      _sleep_factor             = sleep_factor;
      _marking_task_overhead    = marking_task_overhead;
    } else {
-      _parallel_marking_threads = scale_parallel_threads(ParallelGCThreads);
+      _parallel_marking_threads = scale_parallel_threads((uint)ParallelGCThreads);
      _max_parallel_marking_threads = _parallel_marking_threads;
      _sleep_factor             = 0.0;
      _marking_task_overhead    = 1.0;
@ -618,7 +617,7 @@ ConcurrentMark::ConcurrentMark(ReservedSpace rs,

    guarantee(parallel_marking_threads() > 0, "peace of mind");
    _parallel_workers = new FlexibleWorkGang("G1 Parallel Marking Threads",
-         (int) _max_parallel_marking_threads, false, true);
+         _max_parallel_marking_threads, false, true);
    if (_parallel_workers == NULL) {
      vm_exit_during_initialization("Failed necessary allocation.");
    } else {
@ -691,7 +690,7 @@ void ConcurrentMark::reset() {
  set_concurrent_marking_in_progress();
 }

-void ConcurrentMark::set_phase(size_t active_tasks, bool concurrent) {
+void ConcurrentMark::set_phase(uint active_tasks, bool concurrent) {
  assert(active_tasks <= _max_task_num, "we should not have more");

  _active_tasks = active_tasks;
@ -727,12 +726,8 @@ void ConcurrentMark::set_non_marking_state() {
 }

 ConcurrentMark::~ConcurrentMark() {
-  for (int i = 0; i < (int) _max_task_num; ++i) {
-    delete _task_queues->queue(i);
-    delete _tasks[i];
-  }
-  delete _task_queues;
-  FREE_C_HEAP_ARRAY(CMTask*, _max_task_num);
+  // The ConcurrentMark instance is never freed.
+  ShouldNotReachHere();
 }

 // This closure is used to mark refs into the g1 generation
@ -1048,7 +1043,7 @@ private:
  ConcurrentMarkThread* _cmt;

 public:
-  void work(int worker_i) {
+  void work(uint worker_id) {
    assert(Thread::current()->is_ConcurrentGC_thread(),
           "this should only be done by a conc GC thread");
    ResourceMark rm;
@ -1057,8 +1052,8 @@ public:

    ConcurrentGCThread::stsJoin();

-    assert((size_t) worker_i < _cm->active_tasks(), "invariant");
-    CMTask* the_task = _cm->task(worker_i);
+    assert(worker_id < _cm->active_tasks(), "invariant");
+    CMTask* the_task = _cm->task(worker_id);
    the_task->record_start_time();
    if (!_cm->has_aborted()) {
      do {
@ -1076,7 +1071,7 @@ public:
        double elapsed_time_sec = end_time_sec - start_time_sec;
        _cm->clear_has_overflown();

-        bool ret = _cm->do_yield_check(worker_i);
+        bool ret = _cm->do_yield_check(worker_id);

        jlong sleep_time_ms;
        if (!_cm->has_aborted() && the_task->has_aborted()) {
@ -1105,7 +1100,7 @@ public:
    ConcurrentGCThread::stsLeave();

    double end_vtime = os::elapsedVTime();
-    _cm->update_accum_task_vtime(worker_i, end_vtime - start_vtime);
+    _cm->update_accum_task_vtime(worker_id, end_vtime - start_vtime);
  }

  CMConcurrentMarkingTask(ConcurrentMark* cm,
@ -1117,9 +1112,9 @@ public:

 // Calculates the number of active workers for a concurrent
 // phase.
-size_t ConcurrentMark::calc_parallel_marking_threads() {
+uint ConcurrentMark::calc_parallel_marking_threads() {
  if (G1CollectedHeap::use_parallel_gc_threads()) {
-    size_t n_conc_workers = 0;
+    uint n_conc_workers = 0;
    if (!UseDynamicNumberOfGCThreads ||
        (!FLAG_IS_DEFAULT(ConcGCThreads) &&
         !ForceDynamicNumberOfGCThreads)) {
@ -1159,7 +1154,7 @@ void ConcurrentMark::markFromRoots() {
  assert(parallel_marking_threads() <= max_parallel_marking_threads(),
    "Maximum number of marking threads exceeded");

-  size_t active_workers = MAX2((size_t) 1, parallel_marking_threads());
+  uint active_workers = MAX2(1U, parallel_marking_threads());

  // Parallel task terminator is set in "set_phase()"
  set_phase(active_workers, true /* concurrent */);
@ -1503,7 +1498,7 @@ class G1ParFinalCountTask: public AbstractGangTask {
 protected:
  G1CollectedHeap* _g1h;
  CMBitMap* _bm;
-  size_t _n_workers;
+  uint    _n_workers;
  size_t *_live_bytes;
  size_t *_used_bytes;
  BitMap* _region_bm;
@ -1535,13 +1530,13 @@ public:
    FREE_C_HEAP_ARRAY(size_t, _used_bytes);
  }

-  void work(int i) {
+  void work(uint worker_id) {
    CalcLiveObjectsClosure calccl(true /*final*/,
                                  _bm, _g1h->concurrent_mark(),
                                  _region_bm, _card_bm);
    calccl.no_yield();
    if (G1CollectedHeap::use_parallel_gc_threads()) {
-      _g1h->heap_region_par_iterate_chunked(&calccl, i,
+      _g1h->heap_region_par_iterate_chunked(&calccl, worker_id,
                                            (int) _n_workers,
                                            HeapRegion::FinalCountClaimValue);
    } else {
@ -1549,19 +1544,19 @@ public:
    }
    assert(calccl.complete(), "Shouldn't have yielded!");

-    assert((size_t) i < _n_workers, "invariant");
-    _live_bytes[i] = calccl.tot_live();
-    _used_bytes[i] = calccl.tot_used();
+    assert(worker_id < _n_workers, "invariant");
+    _live_bytes[worker_id] = calccl.tot_live();
+    _used_bytes[worker_id] = calccl.tot_used();
  }
  size_t live_bytes()  {
    size_t live_bytes = 0;
-    for (size_t i = 0; i < _n_workers; ++i)
+    for (uint i = 0; i < _n_workers; ++i)
      live_bytes += _live_bytes[i];
    return live_bytes;
  }
  size_t used_bytes()  {
    size_t used_bytes = 0;
-    for (size_t i = 0; i < _n_workers; ++i)
+    for (uint i = 0; i < _n_workers; ++i)
      used_bytes += _used_bytes[i];
    return used_bytes;
  }
@ -1646,18 +1641,18 @@ public:
    AbstractGangTask("G1 note end"), _g1h(g1h),
    _max_live_bytes(0), _freed_bytes(0), _cleanup_list(cleanup_list) { }

-  void work(int i) {
+  void work(uint worker_id) {
    double start = os::elapsedTime();
    FreeRegionList local_cleanup_list("Local Cleanup List");
    OldRegionSet old_proxy_set("Local Cleanup Old Proxy Set");
    HumongousRegionSet humongous_proxy_set("Local Cleanup Humongous Proxy Set");
    HRRSCleanupTask hrrs_cleanup_task;
-    G1NoteEndOfConcMarkClosure g1_note_end(_g1h, i, &local_cleanup_list,
+    G1NoteEndOfConcMarkClosure g1_note_end(_g1h, worker_id, &local_cleanup_list,
                                           &old_proxy_set,
                                           &humongous_proxy_set,
                                           &hrrs_cleanup_task);
    if (G1CollectedHeap::use_parallel_gc_threads()) {
-      _g1h->heap_region_par_iterate_chunked(&g1_note_end, i,
+      _g1h->heap_region_par_iterate_chunked(&g1_note_end, worker_id,
                                            _g1h->workers()->active_workers(),
                                            HeapRegion::NoteEndClaimValue);
    } else {
@ -1701,8 +1696,8 @@ public:
    double end = os::elapsedTime();
    if (G1PrintParCleanupStats) {
      gclog_or_tty->print("     Worker thread %d [%8.3f..%8.3f = %8.3f ms] "
-                          "claimed %d regions (tot = %8.3f ms, max = %8.3f ms).\n",
-                          i, start, end, (end-start)*1000.0,
+                          "claimed %u regions (tot = %8.3f ms, max = %8.3f ms).\n",
+                          worker_id, start, end, (end-start)*1000.0,
                          g1_note_end.regions_claimed(),
                          g1_note_end.claimed_region_time_sec()*1000.0,
                          g1_note_end.max_region_time_sec()*1000.0);
@ -1724,9 +1719,9 @@ public:
    _region_bm(region_bm), _card_bm(card_bm)
  {}

-  void work(int i) {
+  void work(uint worker_id) {
    if (G1CollectedHeap::use_parallel_gc_threads()) {
-      _g1rs->scrub_par(_region_bm, _card_bm, i,
+      _g1rs->scrub_par(_region_bm, _card_bm, worker_id,
                       HeapRegion::ScrubRemSetClaimValue);
    } else {
      _g1rs->scrub(_region_bm, _card_bm);
@ -1766,7 +1761,7 @@ void ConcurrentMark::cleanup() {

  HeapRegionRemSet::reset_for_cleanup_tasks();

-  size_t n_workers;
+  uint n_workers;

  // Do counting once more with the world stopped for good measure.
  G1ParFinalCountTask g1_par_count_task(g1h, nextMarkBitMap(),
@ -1778,7 +1773,7 @@ void ConcurrentMark::cleanup() {

    g1h->set_par_threads();
    n_workers = g1h->n_par_threads();
-    assert(g1h->n_par_threads() == (int) n_workers,
+    assert(g1h->n_par_threads() == n_workers,
           "Should not have been reset");
    g1h->workers()->run_task(&g1_par_count_task);
    // Done with the parallel phase so reset to 0.
@ -2169,13 +2164,13 @@ public:
    AbstractGangTask("Process reference objects in parallel"),
    _proc_task(proc_task), _g1h(g1h), _cm(cm) { }

-  virtual void work(int i) {
-    CMTask* marking_task = _cm->task(i);
+  virtual void work(uint worker_id) {
+    CMTask* marking_task = _cm->task(worker_id);
    G1CMIsAliveClosure g1_is_alive(_g1h);
    G1CMParKeepAliveAndDrainClosure g1_par_keep_alive(_cm, marking_task);
    G1CMParDrainMarkingStackClosure g1_par_drain(_cm, marking_task);

-    _proc_task.work(i, g1_is_alive, g1_par_keep_alive, g1_par_drain);
+    _proc_task.work(worker_id, g1_is_alive, g1_par_keep_alive, g1_par_drain);
  }
 };

@ -2201,8 +2196,8 @@ public:
    AbstractGangTask("Enqueue reference objects in parallel"),
    _enq_task(enq_task) { }

-  virtual void work(int i) {
-    _enq_task.work(i);
+  virtual void work(uint worker_id) {
+    _enq_task.work(worker_id);
  }
 };

@ -2249,8 +2244,8 @@ void ConcurrentMark::weakRefsWork(bool clear_all_soft_refs) {

    // We use the work gang from the G1CollectedHeap and we utilize all
    // the worker threads.
-    int active_workers = g1h->workers() ? g1h->workers()->active_workers() : 1;
-    active_workers = MAX2(MIN2(active_workers, (int)_max_task_num), 1);
+    uint active_workers = g1h->workers() ? g1h->workers()->active_workers() : 1U;
+    active_workers = MAX2(MIN2(active_workers, _max_task_num), 1U);

    G1CMRefProcTaskExecutor par_task_executor(g1h, this,
                                              g1h->workers(), active_workers);
@ -2314,11 +2309,11 @@ private:
  ConcurrentMark *_cm;

 public:
-  void work(int worker_i) {
+  void work(uint worker_id) {
    // Since all available tasks are actually started, we should
    // only proceed if we're supposed to be actived.
-    if ((size_t)worker_i < _cm->active_tasks()) {
-      CMTask* task = _cm->task(worker_i);
+    if (worker_id < _cm->active_tasks()) {
+      CMTask* task = _cm->task(worker_id);
      task->record_start_time();
      do {
        task->do_marking_step(1000000000.0 /* something very large */,
@ -2347,10 +2342,10 @@ void ConcurrentMark::checkpointRootsFinalWork() {
  if (G1CollectedHeap::use_parallel_gc_threads()) {
    G1CollectedHeap::StrongRootsScope srs(g1h);
    // this is remark, so we'll use up all active threads
-    int active_workers = g1h->workers()->active_workers();
+    uint active_workers = g1h->workers()->active_workers();
    if (active_workers == 0) {
      assert(active_workers > 0, "Should have been set earlier");
-      active_workers = ParallelGCThreads;
+      active_workers = (uint) ParallelGCThreads;
      g1h->workers()->set_active_workers(active_workers);
    }
    set_phase(active_workers, false /* concurrent */);
@ -2366,7 +2361,7 @@ void ConcurrentMark::checkpointRootsFinalWork() {
  } else {
    G1CollectedHeap::StrongRootsScope srs(g1h);
    // this is remark, so we'll use up all available threads
-    int active_workers = 1;
+    uint active_workers = 1;
    set_phase(active_workers, false /* concurrent */);

    CMRemarkTask remarkTask(this, active_workers);
@ -2921,7 +2916,7 @@ class CSetMarkOopClosure: public OopClosure {
  int              _ms_size;
  int              _ms_ind;
  int              _array_increment;
-  int              _worker_i;
+  uint             _worker_id;

  bool push(oop obj, int arr_ind = 0) {
    if (_ms_ind == _ms_size) {
@ -2971,7 +2966,7 @@ class CSetMarkOopClosure: public OopClosure {
  }

 public:
-  CSetMarkOopClosure(ConcurrentMark* cm, int ms_size, int worker_i) :
+  CSetMarkOopClosure(ConcurrentMark* cm, int ms_size, uint worker_id) :
    _g1h(G1CollectedHeap::heap()),
    _cm(cm),
    _bm(cm->nextMarkBitMap()),
@ -2979,7 +2974,7 @@ public:
    _ms(NEW_C_HEAP_ARRAY(oop, ms_size)),
    _array_ind_stack(NEW_C_HEAP_ARRAY(jint, ms_size)),
    _array_increment(MAX2(ms_size/8, 16)),
-    _worker_i(worker_i) { }
+    _worker_id(worker_id) { }

  ~CSetMarkOopClosure() {
    FREE_C_HEAP_ARRAY(oop, _ms);
@ -3024,14 +3019,14 @@ class CSetMarkBitMapClosure: public BitMapClosure {
  CMBitMap*          _bitMap;
  ConcurrentMark*    _cm;
  CSetMarkOopClosure _oop_cl;
-  int                _worker_i;
+  uint               _worker_id;

 public:
-  CSetMarkBitMapClosure(ConcurrentMark* cm, int ms_size, int worker_i) :
+  CSetMarkBitMapClosure(ConcurrentMark* cm, int ms_size, uint worker_id) :
     _g1h(G1CollectedHeap::heap()),
     _bitMap(cm->nextMarkBitMap()),
-    _oop_cl(cm, ms_size, worker_i),
-    _worker_i(worker_i) { }
+    _oop_cl(cm, ms_size, worker_id),
+    _worker_id(worker_id) { }

  bool do_bit(size_t offset) {
    // convert offset into a HeapWord*
@ -3056,17 +3051,17 @@ public:
 class CompleteMarkingInCSetHRClosure: public HeapRegionClosure {
  CMBitMap*             _bm;
  CSetMarkBitMapClosure _bit_cl;
-  int                   _worker_i;
+  uint                  _worker_id;

  enum SomePrivateConstants {
    MSSize = 1000
  };

 public:
-  CompleteMarkingInCSetHRClosure(ConcurrentMark* cm, int worker_i) :
+  CompleteMarkingInCSetHRClosure(ConcurrentMark* cm, uint worker_id) :
     _bm(cm->nextMarkBitMap()),
-    _bit_cl(cm, MSSize, worker_i),
-    _worker_i(worker_i) { }
+    _bit_cl(cm, MSSize, worker_id),
+    _worker_id(worker_id) { }

  bool doHeapRegion(HeapRegion* hr) {
    if (hr->claimHeapRegion(HeapRegion::CompleteMarkCSetClaimValue)) {
@ -3109,9 +3104,9 @@ public:
    AbstractGangTask("Complete Mark in CSet"),
    _g1h(g1h), _cm(cm) { }

-  void work(int worker_i) {
-    CompleteMarkingInCSetHRClosure cmplt(_cm, worker_i);
-    HeapRegion* hr = _g1h->start_cset_region_for_worker(worker_i);
+  void work(uint worker_id) {
+    CompleteMarkingInCSetHRClosure cmplt(_cm, worker_id);
+    HeapRegion* hr = _g1h->start_cset_region_for_worker(worker_id);
    _g1h->collection_set_iterate_from(hr, &cmplt);
  }
 };
@ -3307,13 +3302,13 @@ void ConcurrentMark::print_worker_threads_on(outputStream* st) const {
 // the CMS bit map. Called at the first checkpoint.

 // We take a break if someone is trying to stop the world.
-bool ConcurrentMark::do_yield_check(int worker_i) {
+bool ConcurrentMark::do_yield_check(uint worker_id) {
  if (should_yield()) {
-    if (worker_i == 0) {
+    if (worker_id == 0) {
      _g1h->g1_policy()->record_concurrent_pause();
    }
    cmThread()->yield();
-    if (worker_i == 0) {
+    if (worker_id == 0) {
      _g1h->g1_policy()->record_concurrent_pause_end();
    }
    return true;
--- a/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/concurrentMark.hpp
@ -374,9 +374,9 @@ class ConcurrentMark: public CHeapObj {
 protected:
  ConcurrentMarkThread* _cmThread;   // the thread doing the work
  G1CollectedHeap*      _g1h;        // the heap.
-  size_t                _parallel_marking_threads; // the number of marking
+  uint                  _parallel_marking_threads; // the number of marking
                                                   // threads we're use
-  size_t                _max_parallel_marking_threads; // max number of marking
+  uint                  _max_parallel_marking_threads; // max number of marking
                                                   // threads we'll ever use
  double                _sleep_factor; // how much we have to sleep, with
                                       // respect to the work we just did, to
@ -412,8 +412,8 @@ protected:
                                    // last claimed region

  // marking tasks
-  size_t                  _max_task_num; // maximum task number
-  size_t                  _active_tasks; // task num currently active
+  uint                    _max_task_num; // maximum task number
+  uint                    _active_tasks; // task num currently active
  CMTask**                _tasks;        // task queue array (max_task_num len)
  CMTaskQueueSet*         _task_queues;  // task queue set
  ParallelTaskTerminator  _terminator;   // for termination
@ -492,7 +492,7 @@ protected:

  // It should be called to indicate which phase we're in (concurrent
  // mark or remark) and how many threads are currently active.
-  void set_phase(size_t active_tasks, bool concurrent);
+  void set_phase(uint active_tasks, bool concurrent);
  // We do this after we're done with marking so that the marking data
  // structures are initialised to a sensible and predictable state.
  void set_non_marking_state();
@ -505,8 +505,8 @@ protected:
  }

  // accessor methods
-  size_t parallel_marking_threads() { return _parallel_marking_threads; }
-  size_t max_parallel_marking_threads() { return _max_parallel_marking_threads;}
+  uint parallel_marking_threads() { return _parallel_marking_threads; }
+  uint max_parallel_marking_threads() { return _max_parallel_marking_threads;}
  double sleep_factor()             { return _sleep_factor; }
  double marking_task_overhead()    { return _marking_task_overhead;}
  double cleanup_sleep_factor()     { return _cleanup_sleep_factor; }
@ -514,7 +514,7 @@ protected:

  HeapWord*               finger()        { return _finger;   }
  bool                    concurrent()    { return _concurrent; }
-  size_t                  active_tasks()  { return _active_tasks; }
+  uint                    active_tasks()  { return _active_tasks; }
  ParallelTaskTerminator* terminator()    { return &_terminator; }

  // It claims the next available region to be scanned by a marking
@ -715,10 +715,10 @@ public:
  // Returns the number of GC threads to be used in a concurrent
  // phase based on the number of GC threads being used in a STW
  // phase.
-  size_t scale_parallel_threads(size_t n_par_threads);
+  uint scale_parallel_threads(uint n_par_threads);

  // Calculates the number of GC threads to be used in a concurrent phase.
-  size_t calc_parallel_marking_threads();
+  uint calc_parallel_marking_threads();

  // The following three are interaction between CM and
  // G1CollectedHeap
@ -873,7 +873,7 @@ public:
    return _prevMarkBitMap->isMarked(addr);
  }

-  inline bool do_yield_check(int worker_i = 0);
+  inline bool do_yield_check(uint worker_id = 0);
  inline bool should_yield();

  // Called to abort the marking cycle after a Full GC takes palce.
--- a/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.cpp
@ -1165,9 +1165,9 @@ public:
      _g1(g1)
  { }

-  void work(int i) {
-    RebuildRSOutOfRegionClosure rebuild_rs(_g1, i);
-    _g1->heap_region_par_iterate_chunked(&rebuild_rs, i,
+  void work(uint worker_id) {
+    RebuildRSOutOfRegionClosure rebuild_rs(_g1, worker_id);
+    _g1->heap_region_par_iterate_chunked(&rebuild_rs, worker_id,
                                          _g1->workers()->active_workers(),
                                         HeapRegion::RebuildRSClaimValue);
  }
@ -1374,7 +1374,7 @@ bool G1CollectedHeap::do_collection(bool explicit_gc,

    // Rebuild remembered sets of all regions.
    if (G1CollectedHeap::use_parallel_gc_threads()) {
-      int n_workers =
+      uint n_workers =
        AdaptiveSizePolicy::calc_active_workers(workers()->total_workers(),
                                       workers()->active_workers(),
                                       Threads::number_of_non_daemon_threads());
@ -2519,11 +2519,11 @@ void G1CollectedHeap::heap_region_iterate_from(HeapRegion* r,

 void
 G1CollectedHeap::heap_region_par_iterate_chunked(HeapRegionClosure* cl,
-                                                 int worker,
-                                                 int no_of_par_workers,
+                                                 uint worker,
+                                                 uint no_of_par_workers,
                                                 jint claim_value) {
  const size_t regions = n_regions();
-  const size_t max_workers = (G1CollectedHeap::use_parallel_gc_threads() ?
+  const uint max_workers = (G1CollectedHeap::use_parallel_gc_threads() ?
                             no_of_par_workers :
                             1);
  assert(UseDynamicNumberOfGCThreads ||
@ -2739,7 +2739,7 @@ HeapRegion* G1CollectedHeap::start_cset_region_for_worker(int worker_i) {
  result = g1_policy()->collection_set();
  if (G1CollectedHeap::use_parallel_gc_threads()) {
    size_t cs_size = g1_policy()->cset_region_length();
-    int active_workers = workers()->active_workers();
+    uint active_workers = workers()->active_workers();
    assert(UseDynamicNumberOfGCThreads ||
             active_workers == workers()->total_workers(),
             "Unless dynamic should use total workers");
@ -3075,10 +3075,10 @@ public:
    return _failures;
  }

-  void work(int worker_i) {
+  void work(uint worker_id) {
    HandleMark hm;
    VerifyRegionClosure blk(_allow_dirty, true, _vo);
-    _g1h->heap_region_par_iterate_chunked(&blk, worker_i,
+    _g1h->heap_region_par_iterate_chunked(&blk, worker_id,
                                          _g1h->workers()->active_workers(),
                                          HeapRegion::ParVerifyClaimValue);
    if (blk.failures()) {
@ -4725,7 +4725,7 @@ protected:
  G1CollectedHeap*       _g1h;
  RefToScanQueueSet      *_queues;
  ParallelTaskTerminator _terminator;
-  int _n_workers;
+  uint _n_workers;

  Mutex _stats_lock;
  Mutex* stats_lock() { return &_stats_lock; }
@ -4765,18 +4765,18 @@ public:
    _n_workers = active_workers;
  }

-  void work(int i) {
-    if (i >= _n_workers) return;  // no work needed this round
+  void work(uint worker_id) {
+    if (worker_id >= _n_workers) return;  // no work needed this round

    double start_time_ms = os::elapsedTime() * 1000.0;
-    _g1h->g1_policy()->record_gc_worker_start_time(i, start_time_ms);
+    _g1h->g1_policy()->record_gc_worker_start_time(worker_id, start_time_ms);

    ResourceMark rm;
    HandleMark   hm;

    ReferenceProcessor*             rp = _g1h->ref_processor_stw();

-    G1ParScanThreadState            pss(_g1h, i);
+    G1ParScanThreadState            pss(_g1h, worker_id);
    G1ParScanHeapEvacClosure        scan_evac_cl(_g1h, &pss, rp);
    G1ParScanHeapEvacFailureClosure evac_failure_cl(_g1h, &pss, rp);
    G1ParScanPartialArrayClosure    partial_scan_cl(_g1h, &pss, rp);
@ -4808,7 +4808,7 @@ public:
                                  scan_root_cl,
                                  &push_heap_rs_cl,
                                  scan_perm_cl,
-                                  i);
+                                  worker_id);
    pss.end_strong_roots();

    {
@ -4817,8 +4817,8 @@ public:
      evac.do_void();
      double elapsed_ms = (os::elapsedTime()-start)*1000.0;
      double term_ms = pss.term_time()*1000.0;
-      _g1h->g1_policy()->record_obj_copy_time(i, elapsed_ms-term_ms);
-      _g1h->g1_policy()->record_termination(i, term_ms, pss.term_attempts());
+      _g1h->g1_policy()->record_obj_copy_time(worker_id, elapsed_ms-term_ms);
+      _g1h->g1_policy()->record_termination(worker_id, term_ms, pss.term_attempts());
    }
    _g1h->g1_policy()->record_thread_age_table(pss.age_table());
    _g1h->update_surviving_young_words(pss.surviving_young_words()+1);
@ -4828,12 +4828,12 @@ public:

    if (ParallelGCVerbose) {
      MutexLocker x(stats_lock());
-      pss.print_termination_stats(i);
+      pss.print_termination_stats(worker_id);
    }

    assert(pss.refs()->is_empty(), "should be empty");
    double end_time_ms = os::elapsedTime() * 1000.0;
-    _g1h->g1_policy()->record_gc_worker_end_time(i, end_time_ms);
+    _g1h->g1_policy()->record_gc_worker_end_time(worker_id, end_time_ms);
  }
 };

@ -5091,14 +5091,14 @@ public:
    _terminator(terminator)
  {}

-  virtual void work(int i) {
+  virtual void work(uint worker_id) {
    // The reference processing task executed by a single worker.
    ResourceMark rm;
    HandleMark   hm;

    G1STWIsAliveClosure is_alive(_g1h);

-    G1ParScanThreadState pss(_g1h, i);
+    G1ParScanThreadState pss(_g1h, worker_id);

    G1ParScanHeapEvacClosure        scan_evac_cl(_g1h, &pss, NULL);
    G1ParScanHeapEvacFailureClosure evac_failure_cl(_g1h, &pss, NULL);
@ -5130,7 +5130,7 @@ public:
    G1ParEvacuateFollowersClosure drain_queue(_g1h, &pss, _task_queues, _terminator);

    // Call the reference processing task's work routine.
-    _proc_task.work(i, is_alive, keep_alive, drain_queue);
+    _proc_task.work(worker_id, is_alive, keep_alive, drain_queue);

    // Note we cannot assert that the refs array is empty here as not all
    // of the processing tasks (specifically phase2 - pp2_work) execute
@ -5165,8 +5165,8 @@ public:
    _enq_task(enq_task)
  { }

-  virtual void work(int i) {
-    _enq_task.work(i);
+  virtual void work(uint worker_id) {
+    _enq_task.work(worker_id);
  }
 };

@ -5195,7 +5195,7 @@ protected:
  G1CollectedHeap* _g1h;
  RefToScanQueueSet      *_queues;
  ParallelTaskTerminator _terminator;
-  int _n_workers;
+  uint _n_workers;

 public:
  G1ParPreserveCMReferentsTask(G1CollectedHeap* g1h,int workers, RefToScanQueueSet *task_queues) :
@ -5206,11 +5206,11 @@ public:
    _n_workers(workers)
  { }

-  void work(int i) {
+  void work(uint worker_id) {
    ResourceMark rm;
    HandleMark   hm;

-    G1ParScanThreadState            pss(_g1h, i);
+    G1ParScanThreadState            pss(_g1h, worker_id);
    G1ParScanHeapEvacClosure        scan_evac_cl(_g1h, &pss, NULL);
    G1ParScanHeapEvacFailureClosure evac_failure_cl(_g1h, &pss, NULL);
    G1ParScanPartialArrayClosure    partial_scan_cl(_g1h, &pss, NULL);
@ -5246,17 +5246,17 @@ public:

    ReferenceProcessor* rp = _g1h->ref_processor_cm();

-    int limit = ReferenceProcessor::number_of_subclasses_of_ref() * rp->max_num_q();
-    int stride = MIN2(MAX2(_n_workers, 1), limit);
+    uint limit = ReferenceProcessor::number_of_subclasses_of_ref() * rp->max_num_q();
+    uint stride = MIN2(MAX2(_n_workers, 1U), limit);

    // limit is set using max_num_q() - which was set using ParallelGCThreads.
    // So this must be true - but assert just in case someone decides to
    // change the worker ids.
-    assert(0 <= i && i < limit, "sanity");
+    assert(0 <= worker_id && worker_id < limit, "sanity");
    assert(!rp->discovery_is_atomic(), "check this code");

    // Select discovered lists [i, i+stride, i+2*stride,...,limit)
-    for (int idx = i; idx < limit; idx += stride) {
+    for (uint idx = worker_id; idx < limit; idx += stride) {
      DiscoveredList& ref_list = rp->discovered_refs()[idx];

      DiscoveredListIterator iter(ref_list, &keep_alive, &always_alive);
@ -5310,7 +5310,7 @@ void G1CollectedHeap::process_discovered_references() {
  // referents points to another object which is also referenced by an
  // object discovered by the STW ref processor.

-  int active_workers = (G1CollectedHeap::use_parallel_gc_threads() ?
+  uint active_workers = (G1CollectedHeap::use_parallel_gc_threads() ?
                        workers()->active_workers() : 1);

  assert(!G1CollectedHeap::use_parallel_gc_threads() ||
@ -5416,7 +5416,7 @@ void G1CollectedHeap::enqueue_discovered_references() {
  } else {
    // Parallel reference enqueuing

-    int active_workers = (ParallelGCThreads > 0 ? workers()->active_workers() : 1);
+    uint active_workers = (ParallelGCThreads > 0 ? workers()->active_workers() : 1);
    assert(active_workers == workers()->active_workers(),
           "Need to reset active_workers");
    assert(rp->num_q() == active_workers, "sanity");
@ -5445,7 +5445,7 @@ void G1CollectedHeap::evacuate_collection_set() {
  concurrent_g1_refine()->set_use_cache(false);
  concurrent_g1_refine()->clear_hot_cache_claimed_index();

-  int n_workers;
+  uint n_workers;
  if (G1CollectedHeap::use_parallel_gc_threads()) {
    n_workers =
      AdaptiveSizePolicy::calc_active_workers(workers()->total_workers(),
@ -5658,7 +5658,7 @@ public:
    AbstractGangTask("G1 Par Cleanup CT Task"),
    _ct_bs(ct_bs), _g1h(g1h) { }

-  void work(int i) {
+  void work(uint worker_id) {
    HeapRegion* r;
    while (r = _g1h->pop_dirty_cards_region()) {
      clear_cards(r);
@ -6141,7 +6141,7 @@ void G1CollectedHeap::set_par_threads() {
  // Don't change the number of workers.  Use the value previously set
  // in the workgroup.
  assert(G1CollectedHeap::use_parallel_gc_threads(), "shouldn't be here otherwise");
-  int n_workers = workers()->active_workers();
+  uint n_workers = workers()->active_workers();
  assert(UseDynamicNumberOfGCThreads ||
           n_workers == workers()->total_workers(),
      "Otherwise should be using the total number of workers");
--- a/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1CollectedHeap.hpp
@ -995,7 +995,7 @@ public:
  // Initialize weak reference processing.
  virtual void ref_processing_init();

-  void set_par_threads(int t) {
+  void set_par_threads(uint t) {
    SharedHeap::set_par_threads(t);
    // Done in SharedHeap but oddly there are
    // two _process_strong_tasks's in a G1CollectedHeap
@ -1298,8 +1298,8 @@ public:
  // chunk.)  For now requires that "doHeapRegion" always returns "false",
  // i.e., that a closure never attempt to abort a traversal.
  void heap_region_par_iterate_chunked(HeapRegionClosure* blk,
-                                       int worker,
-                                       int no_of_par_workers,
+                                       uint worker,
+                                       uint no_of_par_workers,
                                       jint claim_value);

  // It resets all the region claim values to the default.
--- a/hotspot/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1CollectorPolicy.cpp
@ -136,7 +136,6 @@ G1CollectorPolicy::G1CollectorPolicy() :
  _stop_world_start(0.0),
  _all_stop_world_times_ms(new NumberSeq()),
  _all_yield_times_ms(new NumberSeq()),
-  _using_new_ratio_calculations(false),

  _summary(new Summary()),

@ -230,7 +229,9 @@ G1CollectorPolicy::G1CollectorPolicy() :
  _inc_cset_bytes_used_before(0),
  _inc_cset_max_finger(NULL),
  _inc_cset_recorded_rs_lengths(0),
+  _inc_cset_recorded_rs_lengths_diffs(0),
  _inc_cset_predicted_elapsed_time_ms(0.0),
+  _inc_cset_predicted_elapsed_time_ms_diffs(0.0),

 #ifdef _MSC_VER // the use of 'this' below gets a warning, make it go away
 #pragma warning( disable:4355 ) // 'this' : used in base member initializer list
@ -407,11 +408,7 @@ G1CollectorPolicy::G1CollectorPolicy() :

  initialize_all();
  _collectionSetChooser = new CollectionSetChooser();
-}
-
-// Increment "i", mod "len"
-static void inc_mod(int& i, int len) {
-  i++; if (i == len) i = 0;
+  _young_gen_sizer = new G1YoungGenSizer(); // Must be after call to initialize_flags
 }

 void G1CollectorPolicy::initialize_flags() {
@ -423,39 +420,74 @@ void G1CollectorPolicy::initialize_flags() {
  CollectorPolicy::initialize_flags();
 }

-// The easiest way to deal with the parsing of the NewSize /
-// MaxNewSize / etc. parameteres is to re-use the code in the
-// TwoGenerationCollectorPolicy class. This is similar to what
-// ParallelScavenge does with its GenerationSizer class (see
-// ParallelScavengeHeap::initialize()). We might change this in the
-// future, but it's a good start.
-class G1YoungGenSizer : public TwoGenerationCollectorPolicy {
-private:
-  size_t size_to_region_num(size_t byte_size) {
-    return MAX2((size_t) 1, byte_size / HeapRegion::GrainBytes);
+// Selects the sizer kind from the young gen flags given on the command
+// line (NewRatio, NewSize, MaxNewSize). NewRatio is only honoured when
+// neither NewSize nor MaxNewSize was given; otherwise a warning is printed
+// and the size flags win. Bounds that depend on the heap size are filled
+// in later by heap_size_changed().
+G1YoungGenSizer::G1YoungGenSizer() : _sizer_kind(SizerDefaults), _adaptive_size(true) {
+  assert(G1DefaultMinNewGenPercent <= G1DefaultMaxNewGenPercent, "Min larger than max");
+  assert(G1DefaultMinNewGenPercent > 0 && G1DefaultMinNewGenPercent < 100, "Min out of bounds");
+  assert(G1DefaultMaxNewGenPercent > 0 && G1DefaultMaxNewGenPercent < 100, "Max out of bounds");
+
+  if (FLAG_IS_CMDLINE(NewRatio)) {
+    if (FLAG_IS_CMDLINE(NewSize) || FLAG_IS_CMDLINE(MaxNewSize)) {
+      warning("-XX:NewSize and -XX:MaxNewSize override -XX:NewRatio");
+    } else {
+      _sizer_kind = SizerNewRatio;
+      _adaptive_size = false;
+      return;
+    }
  }

-public:
-  G1YoungGenSizer() {
-    initialize_flags();
-    initialize_size_info();
+  if (FLAG_IS_CMDLINE(NewSize)) {
+    _min_desired_young_length = MAX2((size_t) 1, NewSize / HeapRegion::GrainBytes);
+    if (FLAG_IS_CMDLINE(MaxNewSize)) {
+      _max_desired_young_length = MAX2((size_t) 1, MaxNewSize / HeapRegion::GrainBytes);
+      _sizer_kind = SizerMaxAndNewSize;
+      // The young gen is only adaptive when there is a range to choose
+      // from; NewSize == MaxNewSize means a fixed young gen size.
+      _adaptive_size = _min_desired_young_length != _max_desired_young_length;
+    } else {
+      _sizer_kind = SizerNewSizeOnly;
    }
-  size_t min_young_region_num() {
-    return size_to_region_num(_min_gen0_size);
+  } else if (FLAG_IS_CMDLINE(MaxNewSize)) {
+    _max_desired_young_length = MAX2((size_t) 1, MaxNewSize / HeapRegion::GrainBytes);
+    _sizer_kind = SizerMaxNewSizeOnly;
  }
-  size_t initial_young_region_num() {
-    return size_to_region_num(_initial_gen0_size);
 }
-  size_t max_young_region_num() {
-    return size_to_region_num(_max_gen0_size);
-  }
-};

-void G1CollectorPolicy::update_young_list_size_using_newratio(size_t number_of_heap_regions) {
-  assert(number_of_heap_regions > 0, "Heap must be initialized");
-  size_t young_size = number_of_heap_regions / (NewRatio + 1);
-  _min_desired_young_length = young_size;
-  _max_desired_young_length = young_size;
+size_t G1YoungGenSizer::calculate_default_min_length(size_t new_number_of_heap_regions) {
+  size_t default_value = (new_number_of_heap_regions * G1DefaultMinNewGenPercent) / 100;
+  return MAX2((size_t)1, default_value);
+}
+
+size_t G1YoungGenSizer::calculate_default_max_length(size_t new_number_of_heap_regions) {
+  size_t default_value = (new_number_of_heap_regions * G1DefaultMaxNewGenPercent) / 100;
+  return MAX2((size_t)1, default_value);
+}
+
+void G1YoungGenSizer::heap_size_changed(size_t new_number_of_heap_regions) {
+  assert(new_number_of_heap_regions > 0, "Heap must be initialized");
+
+  switch (_sizer_kind) {
+    case SizerDefaults:
+      _min_desired_young_length = calculate_default_min_length(new_number_of_heap_regions);
+      _max_desired_young_length = calculate_default_max_length(new_number_of_heap_regions);
+      break;
+    case SizerNewSizeOnly:
+      _max_desired_young_length = calculate_default_max_length(new_number_of_heap_regions);
+      _max_desired_young_length = MAX2(_min_desired_young_length, _max_desired_young_length);
+      break;
+    case SizerMaxNewSizeOnly:
+      _min_desired_young_length = calculate_default_min_length(new_number_of_heap_regions);
+      _min_desired_young_length = MIN2(_min_desired_young_length, _max_desired_young_length);
+      break;
+    case SizerMaxAndNewSize:
+      // Do nothing. Values set on the command line, don't update them at runtime.
+      break;
+    case SizerNewRatio:
+      _min_desired_young_length = new_number_of_heap_regions / (NewRatio + 1);
+      _max_desired_young_length = _min_desired_young_length;
+      break;
+    default:
+      ShouldNotReachHere();
+  }
+
+  assert(_min_desired_young_length <= _max_desired_young_length, "Invalid min/max young gen size values");
 }

 void G1CollectorPolicy::init() {
@ -466,28 +498,10 @@ void G1CollectorPolicy::init() {

  initialize_gc_policy_counters();

-  G1YoungGenSizer sizer;
-  _min_desired_young_length = sizer.min_young_region_num();
-  _max_desired_young_length = sizer.max_young_region_num();
-
-  if (FLAG_IS_CMDLINE(NewRatio)) {
-    if (FLAG_IS_CMDLINE(NewSize) || FLAG_IS_CMDLINE(MaxNewSize)) {
-      warning("-XX:NewSize and -XX:MaxNewSize override -XX:NewRatio");
-    } else {
-      // Treat NewRatio as a fixed size that is only recalculated when the heap size changes
-      update_young_list_size_using_newratio(_g1->n_regions());
-      _using_new_ratio_calculations = true;
-    }
-  }
-
-  assert(_min_desired_young_length <= _max_desired_young_length, "Invalid min/max young gen size values");
-
-  set_adaptive_young_list_length(_min_desired_young_length < _max_desired_young_length);
  if (adaptive_young_list_length()) {
    _young_list_fixed_length = 0;
  } else {
-    assert(_min_desired_young_length == _max_desired_young_length, "Min and max young size differ");
-    _young_list_fixed_length = _min_desired_young_length;
+    _young_list_fixed_length = _young_gen_sizer->min_desired_young_length();
  }
  _free_regions_at_end_of_collection = _g1->free_regions();
  update_young_list_target_length();
@ -541,11 +555,7 @@ void G1CollectorPolicy::record_new_heap_size(size_t new_number_of_regions) {
  // smaller than 1.0) we'll get 1.
  _reserve_regions = (size_t) ceil(reserve_regions_d);

-  if (_using_new_ratio_calculations) {
-    // -XX:NewRatio was specified so we need to update the
-    // young gen length when the heap size has changed.
-    update_young_list_size_using_newratio(new_number_of_regions);
-  }
+  _young_gen_sizer->heap_size_changed(new_number_of_regions);
 }

 size_t G1CollectorPolicy::calculate_young_list_desired_min_length(
@ -563,14 +573,14 @@ size_t G1CollectorPolicy::calculate_young_list_desired_min_length(
  }
  desired_min_length += base_min_length;
  // make sure we don't go below any user-defined minimum bound
-  return MAX2(_min_desired_young_length, desired_min_length);
+  return MAX2(_young_gen_sizer->min_desired_young_length(), desired_min_length);
 }

 size_t G1CollectorPolicy::calculate_young_list_desired_max_length() {
  // Here, we might want to also take into account any additional
  // constraints (i.e., user-defined minimum bound). Currently, we
  // effectively don't set this bound.
-  return _max_desired_young_length;
+  return _young_gen_sizer->max_desired_young_length();
 }

 void G1CollectorPolicy::update_young_list_target_length(size_t rs_lengths) {
@ -1551,10 +1561,19 @@ void G1CollectorPolicy::record_collection_pause_end(int no_of_gc_threads) {
      }
    }

-    // It turns out that, sometimes, _max_rs_lengths can get smaller
-    // than _recorded_rs_lengths which causes rs_length_diff to get
-    // very large and mess up the RSet length predictions. We'll be
-    // defensive until we work out why this happens.
+    // This is defensive. For a while _max_rs_lengths could get
+    // smaller than _recorded_rs_lengths which was causing
+    // rs_length_diff to get very large and mess up the RSet length
+    // predictions. The reason was unsafe concurrent updates to the
+    // _inc_cset_recorded_rs_lengths field which the code below guards
+    // against (see CR 7118202). This bug has now been fixed (see CR
+    // 7119027). However, I'm still worried that
+    // _inc_cset_recorded_rs_lengths might still end up somewhat
+    // inaccurate. The concurrent refinement thread calculates an
+    // RSet's length concurrently with other CR threads updating it
+    // which might cause it to calculate the length incorrectly (if,
+    // say, it's in mid-coarsening). So I'll leave in the defensive
+    // conditional below just in case.
    size_t rs_length_diff = 0;
    if (_max_rs_lengths > _recorded_rs_lengths) {
      rs_length_diff = _max_rs_lengths - _recorded_rs_lengths;
@ -2321,17 +2340,19 @@ public:
    _g1(G1CollectedHeap::heap())
  {}

-  void work(int i) {
-    ParKnownGarbageHRClosure parKnownGarbageCl(_hrSorted, _chunk_size, i);
+  void work(uint worker_id) {
+    ParKnownGarbageHRClosure parKnownGarbageCl(_hrSorted,
+                                               _chunk_size,
+                                               worker_id);
    // Back to zero for the claim value.
-    _g1->heap_region_par_iterate_chunked(&parKnownGarbageCl, i,
+    _g1->heap_region_par_iterate_chunked(&parKnownGarbageCl, worker_id,
                                         _g1->workers()->active_workers(),
                                         HeapRegion::InitialClaimValue);
    jint regions_added = parKnownGarbageCl.marked_regions_added();
    _hrSorted->incNumMarkedHeapRegions(regions_added);
    if (G1PrintParCleanupStats) {
      gclog_or_tty->print_cr("     Thread %d called %d times, added %d regions to list.",
-                 i, parKnownGarbageCl.invokes(), regions_added);
+                 worker_id, parKnownGarbageCl.invokes(), regions_added);
    }
  }
 };
@ -2436,10 +2457,45 @@ void G1CollectorPolicy::start_incremental_cset_building() {

  _inc_cset_max_finger = 0;
  _inc_cset_recorded_rs_lengths = 0;
-  _inc_cset_predicted_elapsed_time_ms = 0;
+  _inc_cset_recorded_rs_lengths_diffs = 0;
+  _inc_cset_predicted_elapsed_time_ms = 0.0;
+  _inc_cset_predicted_elapsed_time_ms_diffs = 0.0;
  _inc_cset_build_state = Active;
 }

+void G1CollectorPolicy::finalize_incremental_cset_building() {
+  assert(_inc_cset_build_state == Active, "Precondition");
+  assert(SafepointSynchronize::is_at_safepoint(), "should be at a safepoint");
+
+  // The two "main" fields, _inc_cset_recorded_rs_lengths and
+  // _inc_cset_predicted_elapsed_time_ms, are updated by the thread
+  // that adds a new region to the CSet. Further updates by the
+  // concurrent refinement thread that samples the young RSet lengths
+  // are accumulated in the *_diffs fields. Here we add the diffs to
+  // the "main" fields.
+
+  if (_inc_cset_recorded_rs_lengths_diffs >= 0) {
+    _inc_cset_recorded_rs_lengths += _inc_cset_recorded_rs_lengths_diffs;
+  } else {
+    // This is defensive. The diff should in theory be always positive
+    // as RSets can only grow between GCs. However, given that we
+    // sample their size concurrently with other threads updating them
+    // it's possible that we might get the wrong size back, which
+    // could make the calculations somewhat inaccurate.
+    size_t diffs = (size_t) (-_inc_cset_recorded_rs_lengths_diffs);
+    if (_inc_cset_recorded_rs_lengths >= diffs) {
+      _inc_cset_recorded_rs_lengths -= diffs;
+    } else {
+      _inc_cset_recorded_rs_lengths = 0;
+    }
+  }
+  _inc_cset_predicted_elapsed_time_ms +=
+                                     _inc_cset_predicted_elapsed_time_ms_diffs;
+
+  _inc_cset_recorded_rs_lengths_diffs = 0;
+  _inc_cset_predicted_elapsed_time_ms_diffs = 0.0;
+}
+
 void G1CollectorPolicy::add_to_incremental_cset_info(HeapRegion* hr, size_t rs_length) {
  // This routine is used when:
  // * adding survivor regions to the incremental cset at the end of an
@ -2455,10 +2511,8 @@ void G1CollectorPolicy::add_to_incremental_cset_info(HeapRegion* hr, size_t rs_l

  double region_elapsed_time_ms = predict_region_elapsed_time_ms(hr, true);
  size_t used_bytes = hr->used();
-
  _inc_cset_recorded_rs_lengths += rs_length;
  _inc_cset_predicted_elapsed_time_ms += region_elapsed_time_ms;
-
  _inc_cset_bytes_used_before += used_bytes;

  // Cache the values we have added to the aggregated informtion
@ -2469,37 +2523,33 @@ void G1CollectorPolicy::add_to_incremental_cset_info(HeapRegion* hr, size_t rs_l
  hr->set_predicted_elapsed_time_ms(region_elapsed_time_ms);
 }

-void G1CollectorPolicy::remove_from_incremental_cset_info(HeapRegion* hr) {
-  // This routine is currently only called as part of the updating of
-  // existing policy information for regions in the incremental cset that
-  // is performed by the concurrent refine thread(s) as part of young list
-  // RSet sampling. Therefore we should not be at a safepoint.
-
-  assert(!SafepointSynchronize::is_at_safepoint(), "should not be at safepoint");
-  assert(hr->is_young(), "it should be");
-
-  size_t used_bytes = hr->used();
-  size_t old_rs_length = hr->recorded_rs_length();
-  double old_elapsed_time_ms = hr->predicted_elapsed_time_ms();
-
-  // Subtract the old recorded/predicted policy information for
-  // the given heap region from the collection set info.
-  _inc_cset_recorded_rs_lengths -= old_rs_length;
-  _inc_cset_predicted_elapsed_time_ms -= old_elapsed_time_ms;
-
-  _inc_cset_bytes_used_before -= used_bytes;
-
-  // Clear the values cached in the heap region
-  hr->set_recorded_rs_length(0);
-  hr->set_predicted_elapsed_time_ms(0);
-}
-
-void G1CollectorPolicy::update_incremental_cset_info(HeapRegion* hr, size_t new_rs_length) {
-  // Update the collection set information that is dependent on the new RS length
+void G1CollectorPolicy::update_incremental_cset_info(HeapRegion* hr,
+                                                     size_t new_rs_length) {
+  // Update the CSet information that is dependent on the new RS length
  assert(hr->is_young(), "Precondition");
+  assert(!SafepointSynchronize::is_at_safepoint(),
+                                               "should not be at a safepoint");

-  remove_from_incremental_cset_info(hr);
-  add_to_incremental_cset_info(hr, new_rs_length);
+  // We could have updated _inc_cset_recorded_rs_lengths and
+  // _inc_cset_predicted_elapsed_time_ms directly but we'd need to do
+  // that atomically, as this code is executed by a concurrent
+  // refinement thread, potentially concurrently with a mutator thread
+  // allocating a new region and also updating the same fields. To
+  // avoid the atomic operations we accumulate these updates on two
+  // separate fields (*_diffs) and we'll just add them to the "main"
+  // fields at the start of a GC.
+
+  ssize_t old_rs_length = (ssize_t) hr->recorded_rs_length();
+  ssize_t rs_lengths_diff = (ssize_t) new_rs_length - old_rs_length;
+  _inc_cset_recorded_rs_lengths_diffs += rs_lengths_diff;
+
+  double old_elapsed_time_ms = hr->predicted_elapsed_time_ms();
+  double new_region_elapsed_time_ms = predict_region_elapsed_time_ms(hr, true);
+  double elapsed_ms_diff = new_region_elapsed_time_ms - old_elapsed_time_ms;
+  _inc_cset_predicted_elapsed_time_ms_diffs += elapsed_ms_diff;
+
+  hr->set_recorded_rs_length(new_rs_length);
+  hr->set_predicted_elapsed_time_ms(new_region_elapsed_time_ms);
 }

 void G1CollectorPolicy::add_region_to_incremental_cset_common(HeapRegion* hr) {
@ -2591,6 +2641,7 @@ void G1CollectorPolicy::choose_collection_set(double target_pause_time_ms) {
  double non_young_start_time_sec = os::elapsedTime();

  YoungList* young_list = _g1->young_list();
+  finalize_incremental_cset_building();

  guarantee(target_pause_time_ms > 0.0,
            err_msg("target_pause_time_ms = %1.6lf should be positive",
--- a/hotspot/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1CollectorPolicy.hpp
@ -83,6 +83,72 @@ public:
  virtual MainBodySummary*    main_body_summary()    { return this; }
 };

+// There are three command line options related to the young gen size:
+// NewSize, MaxNewSize and NewRatio (There is also -Xmn, but that is
+// just a short form for NewSize==MaxNewSize). G1 will use its internal
+// heuristics to calculate the actual young gen size, so these options
+// basically only limit the range within which G1 can pick a young gen
+// size. Also, these are general options taking byte sizes. G1 will
+// internally work with a number of regions instead. So, some rounding
+// will occur.
+//
+// If nothing related to the young gen size is set on the command
+// line we should allow the young gen to be between
+// G1DefaultMinNewGenPercent and G1DefaultMaxNewGenPercent of the
+// heap size. This means that every time the heap size changes the
+// limits for the young gen size will be updated.
+//
+// If only -XX:NewSize is set we should use the specified value as the
+// minimum size for young gen. Still using G1DefaultMaxNewGenPercent
+// of the heap as maximum.
+//
+// If only -XX:MaxNewSize is set we should use the specified value as the
+// maximum size for young gen. Still using G1DefaultMinNewGenPercent
+// of the heap as minimum.
+//
+// If -XX:NewSize and -XX:MaxNewSize are both specified we use these values.
+// No updates when the heap size changes. There is a special case when
+// NewSize==MaxNewSize. This is interpreted as "fixed" and will use a
+// different heuristic for calculating the collection set when we do mixed
+// collection.
+//
+// If only -XX:NewRatio is set we should use the specified ratio of the heap
+// as both min and max. This will be interpreted as "fixed" just like the
+// NewSize==MaxNewSize case above. But we will update the min and max
+// every time the heap size changes.
+//
+// NewSize and MaxNewSize override NewRatio. So, NewRatio is ignored if it is
+// combined with either NewSize or MaxNewSize. (A warning message is printed.)
+class G1YoungGenSizer : public CHeapObj { // Holds the min/max desired young gen lengths and an "adaptive" flag, exposed via the accessors below.
+private:
+  enum SizerKind { // NOTE(review): presumably one value per command-line combination described in the comment preceding this class -- confirm in the .cpp.
+    SizerDefaults,
+    SizerNewSizeOnly,
+    SizerMaxNewSizeOnly,
+    SizerMaxAndNewSize,
+    SizerNewRatio
+  };
+  SizerKind _sizer_kind; // Which SizerKind this sizer is operating under.
+  size_t _min_desired_young_length; // Lower bound returned by min_desired_young_length(); presumably a region count -- TODO confirm.
+  size_t _max_desired_young_length; // Upper bound returned by max_desired_young_length(); presumably a region count -- TODO confirm.
+  bool _adaptive_size; // Value reported by adaptive_young_list_length().
+  size_t calculate_default_min_length(size_t new_number_of_heap_regions); // Defined out of line; takes the new number of heap regions.
+  size_t calculate_default_max_length(size_t new_number_of_heap_regions); // Defined out of line; takes the new number of heap regions.
+
+public:
+  G1YoungGenSizer(); // Defined out of line.
+  void heap_size_changed(size_t new_number_of_heap_regions); // NOTE(review): presumably refreshes the min/max lengths for the new heap size -- body not visible here.
+  size_t min_desired_young_length() { // Accessor for the current minimum desired young length.
+    return _min_desired_young_length;
+  }
+  size_t max_desired_young_length() { // Accessor for the current maximum desired young length.
+    return _max_desired_young_length;
+  }
+  bool adaptive_young_list_length() { // Accessor for the adaptive-size flag.
+    return _adaptive_size;
+  }
+};
+
 class G1CollectorPolicy: public CollectorPolicy {
 private:
  // either equal to the number of parallel threads, if ParallelGCThreads
@ -167,9 +233,6 @@ private:
  // indicates whether we are in young or mixed GC mode
  bool _gcs_are_young;

-  // if true, then it tries to dynamically adjust the length of the
-  // young list
-  bool _adaptive_young_list_length;
  size_t _young_list_target_length;
  size_t _young_list_fixed_length;
  size_t _prev_eden_capacity; // used for logging
@ -227,9 +290,7 @@ private:

  TruncatedSeq* _young_gc_eff_seq;

-  bool   _using_new_ratio_calculations;
-  size_t _min_desired_young_length; // as set on the command line or default calculations
-  size_t _max_desired_young_length; // as set on the command line or default calculations
+  G1YoungGenSizer* _young_gen_sizer;

  size_t _eden_cset_region_length;
  size_t _survivor_cset_region_length;
@ -588,16 +649,29 @@ private:
  // Used to record the highest end of heap region in collection set
  HeapWord* _inc_cset_max_finger;

-  // The RSet lengths recorded for regions in the collection set
-  // (updated by the periodic sampling of the regions in the
-  // young list/collection set).
+  // The RSet lengths recorded for regions in the CSet. It is updated
+  // by the thread that adds a new region to the CSet. We assume that
+  // only one thread can be allocating a new CSet region (currently,
+  // it does so after taking the Heap_lock) hence no need to
+  // synchronize updates to this field.
  size_t _inc_cset_recorded_rs_lengths;

-  // The predicted elapsed time it will take to collect the regions
-  // in the collection set (updated by the periodic sampling of the
-  // regions in the young list/collection set).
+  // A concurrent refinement thread periodically samples the young
+  // region RSets and needs to update _inc_cset_recorded_rs_lengths as
+  // the RSets grow. Instead of having to synchronize updates to that
+  // field we accumulate them in this field and add it to
+  // _inc_cset_recorded_rs_lengths at the start of a GC.
+  ssize_t _inc_cset_recorded_rs_lengths_diffs;
+
+  // The predicted elapsed time it will take to collect the regions in
+  // the CSet. This is updated by the thread that adds a new region to
+  // the CSet. See the comment for _inc_cset_recorded_rs_lengths about
+  // MT-safety assumptions.
  double _inc_cset_predicted_elapsed_time_ms;

+  // See the comment for _inc_cset_recorded_rs_lengths_diffs.
+  double _inc_cset_predicted_elapsed_time_ms_diffs;
+
  // Stash a pointer to the g1 heap.
  G1CollectedHeap* _g1;

@ -682,8 +756,6 @@ private:
  // Count the number of bytes used in the CS.
  void count_CS_bytes_used();

-  void update_young_list_size_using_newratio(size_t number_of_heap_regions);
-
 public:

  G1CollectorPolicy();
@ -710,8 +782,6 @@ public:
  // This should be called after the heap is resized.
  void record_new_heap_size(size_t new_number_of_regions);

-public:
-
  void init();

  // Create jstat counters for the policy.
@ -894,6 +964,10 @@ public:
  // Initialize incremental collection set info.
  void start_incremental_cset_building();

+  // Perform any final calculations on the incremental CSet fields
+  // before we can use them.
+  void finalize_incremental_cset_building();
+
  void clear_incremental_cset() {
    _inc_cset_head = NULL;
    _inc_cset_tail = NULL;
@ -902,10 +976,9 @@ public:
  // Stop adding regions to the incremental collection set
  void stop_incremental_cset_building() { _inc_cset_build_state = Inactive; }

-  // Add/remove information about hr to the aggregated information
-  // for the incrementally built collection set.
+  // Add information about hr to the aggregated information for the
+  // incrementally built collection set.
  void add_to_incremental_cset_info(HeapRegion* hr, size_t rs_length);
-  void remove_from_incremental_cset_info(HeapRegion* hr);

  // Update information about hr in the aggregated information for
  // the incrementally built collection set.
@ -998,10 +1071,7 @@ public:
  }

  bool adaptive_young_list_length() {
-    return _adaptive_young_list_length;
-  }
-  void set_adaptive_young_list_length(bool adaptive_young_list_length) {
-    _adaptive_young_list_length = adaptive_young_list_length;
+    return _young_gen_sizer->adaptive_young_list_length();
  }

  inline double get_gc_eff_factor() {
--- a/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.cpp
@ -558,11 +558,11 @@ void G1RemSet::scrub(BitMap* region_bm, BitMap* card_bm) {
 }

 void G1RemSet::scrub_par(BitMap* region_bm, BitMap* card_bm,
-                                int worker_num, int claim_val) {
+                                uint worker_num, int claim_val) {
  ScrubRSClosure scrub_cl(region_bm, card_bm);
  _g1->heap_region_par_iterate_chunked(&scrub_cl,
                                       worker_num,
-                                       (int) n_workers(),
+                                       n_workers(),
                                       claim_val);
 }

--- a/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.hpp
@ -40,7 +40,7 @@ class G1RemSet: public CHeapObj {
 protected:
  G1CollectedHeap* _g1;
  unsigned _conc_refine_cards;
-  size_t n_workers();
+  uint n_workers();

 protected:
  enum SomePrivateConstants {
@ -122,7 +122,7 @@ public:
  // parallel thread id of the current thread, and "claim_val" is the
  // value that should be used to claim heap regions.
  void scrub_par(BitMap* region_bm, BitMap* card_bm,
-                 int worker_num, int claim_val);
+                 uint worker_num, int claim_val);

  // Refine the card corresponding to "card_ptr".  If "sts" is non-NULL,
  // join and leave around parts that must be atomic wrt GC.  (NULL means
--- a/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.inline.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1RemSet.inline.hpp
@ -29,7 +29,7 @@
 #include "gc_implementation/g1/heapRegionRemSet.hpp"
 #include "oops/oop.inline.hpp"

-inline size_t G1RemSet::n_workers() {
+inline uint G1RemSet::n_workers() {
  if (_g1->workers() != NULL) {
    return _g1->workers()->total_workers();
  } else {
--- a/hotspot/src/share/vm/gc_implementation/g1/g1_globals.hpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/g1_globals.hpp
@ -289,7 +289,15 @@
                                                                            \
  develop(uintx, G1ConcMarkForceOverflow, 0,                                \
          "The number of times we'll force an overflow during "             \
-          "concurrent marking")
+          "concurrent marking")                                             \
+                                                                            \
+  develop(uintx, G1DefaultMinNewGenPercent, 20,                             \
+          "Percentage (0-100) of the heap size to use as minimum "          \
+          "young gen size.")                                                \
+                                                                            \
+  develop(uintx, G1DefaultMaxNewGenPercent, 50,                             \
+          "Percentage (0-100) of the heap size to use as maximum "          \
+          "young gen size.")

 G1_FLAGS(DECLARE_DEVELOPER_FLAG, DECLARE_PD_DEVELOPER_FLAG, DECLARE_PRODUCT_FLAG, DECLARE_PD_PRODUCT_FLAG, DECLARE_DIAGNOSTIC_FLAG, DECLARE_EXPERIMENTAL_FLAG, DECLARE_NOTPRODUCT_FLAG, DECLARE_MANAGEABLE_FLAG, DECLARE_PRODUCT_RW_FLAG)

--- a/hotspot/src/share/vm/gc_implementation/g1/heapRegion.cpp
+++ b/hotspot/src/share/vm/gc_implementation/g1/heapRegion.cpp
@ -94,7 +94,8 @@ public:
 #endif // PRODUCT
  }

-  template <class T> void do_oop_work(T* p) {
+  template <class T>
+  void do_oop_work(T* p) {
    assert(_containing_obj != NULL, "Precondition");
    assert(!_g1h->is_obj_dead_cond(_containing_obj, _vo),
           "Precondition");
@ -102,8 +103,10 @@ public:
    if (!oopDesc::is_null(heap_oop)) {
      oop obj = oopDesc::decode_heap_oop_not_null(heap_oop);
      bool failed = false;
-      if (!_g1h->is_in_closed_subset(obj) ||
-          _g1h->is_obj_dead_cond(obj, _vo)) {
+      if (!_g1h->is_in_closed_subset(obj) || _g1h->is_obj_dead_cond(obj, _vo)) {
+        MutexLockerEx x(ParGCRareEvent_lock,
+                        Mutex::_no_safepoint_check_flag);
+
        if (!_failures) {
          gclog_or_tty->print_cr("");
          gclog_or_tty->print_cr("----------");
@ -133,6 +136,7 @@ public:
          print_object(gclog_or_tty, obj);
        }
        gclog_or_tty->print_cr("----------");
+        gclog_or_tty->flush();
        _failures = true;
        failed = true;
        _n_failures++;
@ -155,6 +159,9 @@ public:
                                  cv_field == dirty
                               : cv_obj == dirty || cv_field == dirty));
          if (is_bad) {
+            MutexLockerEx x(ParGCRareEvent_lock,
+                            Mutex::_no_safepoint_check_flag);
+
            if (!_failures) {
              gclog_or_tty->print_cr("");
              gclog_or_tty->print_cr("----------");
@ -174,6 +181,7 @@ public:
            gclog_or_tty->print_cr("Obj head CTE = %d, field CTE = %d.",
                          cv_obj, cv_field);
            gclog_or_tty->print_cr("----------");
+            gclog_or_tty->flush();
            _failures = true;
            if (!failed) _n_failures++;
          }
--- a/hotspot/src/share/vm/gc_implementation/parNew/parCardTableModRefBS.cpp
+++ b/hotspot/src/share/vm/gc_implementation/parNew/parCardTableModRefBS.cpp
@ -56,14 +56,14 @@ void CardTableModRefBS::non_clean_card_iterate_parallel_work(Space* sp, MemRegio
                          lowest_non_clean_base_chunk_index,
                          lowest_non_clean_chunk_size);

-  int n_strides = n_threads * ParGCStridesPerThread;
+  uint n_strides = n_threads * ParGCStridesPerThread;
  SequentialSubTasksDone* pst = sp->par_seq_tasks();
  // Sets the condition for completion of the subtask (how many threads
  // need to finish in order to be done).
  pst->set_n_threads(n_threads);
  pst->set_n_tasks(n_strides);

-  int stride = 0;
+  uint stride = 0;
  while (!pst->is_task_claimed(/* reference */ stride)) {
    process_stride(sp, mr, stride, n_strides, cl, ct,
                   lowest_non_clean,
--- a/hotspot/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp
+++ b/hotspot/src/share/vm/gc_implementation/parNew/parNewGeneration.cpp
@ -590,7 +590,7 @@ void ParNewGenTask::set_for_termination(int active_workers) {
 // called after a task is started.  So "i" is based on
 // first-come-first-served.

-void ParNewGenTask::work(int i) {
+void ParNewGenTask::work(uint worker_id) {
  GenCollectedHeap* gch = GenCollectedHeap::heap();
  // Since this is being done in a separate thread, need new resource
  // and handle marks.
@ -601,8 +601,8 @@ void ParNewGenTask::work(int i) {

  Generation* old_gen = gch->next_gen(_gen);

-  ParScanThreadState& par_scan_state = _state_set->thread_state(i);
-  assert(_state_set->is_valid(i), "Should not have been called");
+  ParScanThreadState& par_scan_state = _state_set->thread_state(worker_id);
+  assert(_state_set->is_valid(worker_id), "Should not have been called");

  par_scan_state.set_young_old_boundary(_young_old_boundary);

@ -755,7 +755,7 @@ public:
                         ParScanThreadStateSet& state_set);

 private:
-  virtual void work(int i);
+  virtual void work(uint worker_id);
  virtual void set_for_termination(int active_workers) {
    _state_set.terminator()->reset_for_reuse(active_workers);
  }
@ -781,13 +781,13 @@ ParNewRefProcTaskProxy::ParNewRefProcTaskProxy(
 {
 }

-void ParNewRefProcTaskProxy::work(int i)
+void ParNewRefProcTaskProxy::work(uint worker_id)
 {
  ResourceMark rm;
  HandleMark hm;
-  ParScanThreadState& par_scan_state = _state_set.thread_state(i);
+  ParScanThreadState& par_scan_state = _state_set.thread_state(worker_id);
  par_scan_state.set_young_old_boundary(_young_old_boundary);
-  _task.work(i, par_scan_state.is_alive_closure(),
+  _task.work(worker_id, par_scan_state.is_alive_closure(),
             par_scan_state.keep_alive_closure(),
             par_scan_state.evacuate_followers_closure());
 }
@ -802,9 +802,9 @@ public:
      _task(task)
  { }

-  virtual void work(int i)
+  virtual void work(uint worker_id)
  {
-    _task.work(i);
+    _task.work(worker_id);
  }
 };

--- a/hotspot/src/share/vm/gc_implementation/parNew/parNewGeneration.hpp
+++ b/hotspot/src/share/vm/gc_implementation/parNew/parNewGeneration.hpp
@ -239,7 +239,7 @@ public:

  HeapWord* young_old_boundary() { return _young_old_boundary; }

-  void work(int i);
+  void work(uint worker_id);

  // Reset the terminator in ParScanThreadStateSet for
  // "active_workers" threads.
--- a/hotspot/src/share/vm/gc_implementation/shared/mutableNUMASpace.cpp
+++ b/hotspot/src/share/vm/gc_implementation/shared/mutableNUMASpace.cpp
@ -282,7 +282,7 @@ void MutableNUMASpace::bias_region(MemRegion mr, int lgrp_id) {
    // large page can be broken down if we require small pages.
    os::realign_memory((char*)aligned_region.start(), aligned_region.byte_size(), page_size());
    // Then we uncommit the pages in the range.
-    os::free_memory((char*)aligned_region.start(), aligned_region.byte_size());
+    os::free_memory((char*)aligned_region.start(), aligned_region.byte_size(), page_size());
    // And make them local/first-touch biased.
    os::numa_make_local((char*)aligned_region.start(), aligned_region.byte_size(), lgrp_id);
  }
@ -297,7 +297,7 @@ void MutableNUMASpace::free_region(MemRegion mr) {
    assert((intptr_t)aligned_region.start()     % page_size() == 0 &&
           (intptr_t)aligned_region.byte_size() % page_size() == 0, "Bad alignment");
    assert(region().contains(aligned_region), "Sanity");
-    os::free_memory((char*)aligned_region.start(), aligned_region.byte_size());
+    os::free_memory((char*)aligned_region.start(), aligned_region.byte_size(), page_size());
  }
 }

@ -954,7 +954,7 @@ void MutableNUMASpace::LGRPSpace::scan_pages(size_t page_size, size_t page_count
    if (e != scan_end) {
      if ((page_expected.size != page_size || page_expected.lgrp_id != lgrp_id())
          && page_expected.size != 0) {
-        os::free_memory(s, pointer_delta(e, s, sizeof(char)));
+        os::free_memory(s, pointer_delta(e, s, sizeof(char)), page_size);
      }
      page_expected = page_found;
    }
--- a/hotspot/src/share/vm/gc_implementation/shared/mutableSpace.cpp
+++ b/hotspot/src/share/vm/gc_implementation/shared/mutableSpace.cpp
@ -51,7 +51,7 @@ void MutableSpace::numa_setup_pages(MemRegion mr, bool clear_space) {
      size_t size = pointer_delta(end, start, sizeof(char));
      if (clear_space) {
        // Prefer page reallocation to migration.
-        os::free_memory((char*)start, size);
+        os::free_memory((char*)start, size, page_size);
      }
      os::numa_make_global((char*)start, size);
    }
--- a/hotspot/src/share/vm/gc_interface/collectedHeap.cpp
+++ b/hotspot/src/share/vm/gc_interface/collectedHeap.cpp
@ -478,18 +478,22 @@ oop CollectedHeap::Class_obj_allocate(KlassHandle klass, int size, KlassHandle r
 void CollectedHeap::test_is_in() {
  CollectedHeap* heap = Universe::heap();

+  uintptr_t epsilon    = (uintptr_t) MinObjAlignment;
+  uintptr_t heap_start = (uintptr_t) heap->_reserved.start();
+  uintptr_t heap_end   = (uintptr_t) heap->_reserved.end();
+
  // Test that NULL is not in the heap.
  assert(!heap->is_in(NULL), "NULL is unexpectedly in the heap");

  // Test that a pointer to before the heap start is reported as outside the heap.
-  assert(heap->_reserved.start() >= (void*)MinObjAlignment, "sanity");
-  void* before_heap = (void*)((intptr_t)heap->_reserved.start() - MinObjAlignment);
+  assert(heap_start >= ((uintptr_t)NULL + epsilon), "sanity");
+  void* before_heap = (void*)(heap_start - epsilon);
  assert(!heap->is_in(before_heap),
      err_msg("before_heap: " PTR_FORMAT " is unexpectedly in the heap", before_heap));

  // Test that a pointer to after the heap end is reported as outside the heap.
-  assert(heap->_reserved.end() <= (void*)(uintptr_t(-1) - (uint)MinObjAlignment), "sanity");
-  void* after_heap = (void*)((intptr_t)heap->_reserved.end() + MinObjAlignment);
+  assert(heap_end <= ((uintptr_t)-1 - epsilon), "sanity");
+  void* after_heap = (void*)(heap_end + epsilon);
  assert(!heap->is_in(after_heap),
      err_msg("after_heap: " PTR_FORMAT " is unexpectedly in the heap", after_heap));
 }
--- a/hotspot/src/share/vm/gc_interface/collectedHeap.hpp
+++ b/hotspot/src/share/vm/gc_interface/collectedHeap.hpp
@ -69,7 +69,7 @@ class CollectedHeap : public CHeapObj {
  MemRegion _reserved;
  BarrierSet* _barrier_set;
  bool _is_gc_active;
-  int _n_par_threads;
+  uint _n_par_threads;

  unsigned int _total_collections;          // ... started
  unsigned int _total_full_collections;     // ... started
@ -309,10 +309,10 @@ class CollectedHeap : public CHeapObj {
  GCCause::Cause gc_cause() { return _gc_cause; }

  // Number of threads currently working on GC tasks.
-  int n_par_threads() { return _n_par_threads; }
+  uint n_par_threads() { return _n_par_threads; }

  // May be overridden to set additional parallelism.
-  virtual void set_par_threads(int t) { _n_par_threads = t; };
+  virtual void set_par_threads(uint t) { _n_par_threads = t; };

  // Preload classes into the shared portion of the heap, and then dump
  // that data to a file so that it can be loaded directly by another
--- a/hotspot/src/share/vm/memory/dump.cpp
+++ b/hotspot/src/share/vm/memory/dump.cpp
@ -1402,7 +1402,7 @@ class LinkClassesClosure : public ObjectClosure {
        instanceKlass* ik = (instanceKlass*) k;
        // Link the class to cause the bytecodes to be rewritten and the
        // cpcache to be created.
-        if (ik->get_init_state() < instanceKlass::linked) {
+        if (ik->init_state() < instanceKlass::linked) {
          ik->link_class(THREAD);
          guarantee(!HAS_PENDING_EXCEPTION, "exception in class rewriting");
        }
@ -1535,7 +1535,7 @@ void GenCollectedHeap::preload_and_dump(TRAPS) {
        // are loaded in order that the related data structures (klass,
        // cpCache, Sting constants) are located together.

-        if (ik->get_init_state() < instanceKlass::linked) {
+        if (ik->init_state() < instanceKlass::linked) {
          ik->link_class(THREAD);
          guarantee(!(HAS_PENDING_EXCEPTION), "exception in class rewriting");
        }
--- a/hotspot/src/share/vm/memory/genCollectedHeap.cpp
+++ b/hotspot/src/share/vm/memory/genCollectedHeap.cpp
@ -703,7 +703,7 @@ HeapWord* GenCollectedHeap::satisfy_failed_allocation(size_t size, bool is_tlab)
  return collector_policy()->satisfy_failed_allocation(size, is_tlab);
 }

-void GenCollectedHeap::set_par_threads(int t) {
+void GenCollectedHeap::set_par_threads(uint t) {
  SharedHeap::set_par_threads(t);
  _gen_process_strong_tasks->set_n_threads(t);
 }
--- a/hotspot/src/share/vm/memory/genCollectedHeap.hpp
+++ b/hotspot/src/share/vm/memory/genCollectedHeap.hpp
@ -419,8 +419,7 @@ public:
  // asserted to be this type.
  static GenCollectedHeap* heap();

-  void set_par_threads(int t);
-
+  void set_par_threads(uint t);

  // Invoke the "do_oop" method of one of the closures "not_older_gens"
  // or "older_gens" on root locations for the generation at
--- a/hotspot/src/share/vm/memory/referenceProcessor.cpp
+++ b/hotspot/src/share/vm/memory/referenceProcessor.cpp
@ -88,9 +88,9 @@ void ReferenceProcessor::enable_discovery(bool verify_disabled, bool check_no_re

 ReferenceProcessor::ReferenceProcessor(MemRegion span,
                                       bool      mt_processing,
-                                       int       mt_processing_degree,
+                                       uint      mt_processing_degree,
                                       bool      mt_discovery,
-                                       int       mt_discovery_degree,
+                                       uint      mt_discovery_degree,
                                       bool      atomic_discovery,
                                       BoolObjectClosure* is_alive_non_header,
                                       bool      discovered_list_needs_barrier)  :
@ -105,7 +105,7 @@ ReferenceProcessor::ReferenceProcessor(MemRegion span,
  _span = span;
  _discovery_is_atomic = atomic_discovery;
  _discovery_is_mt     = mt_discovery;
-  _num_q               = MAX2(1, mt_processing_degree);
+  _num_q               = MAX2(1U, mt_processing_degree);
  _max_num_q           = MAX2(_num_q, mt_discovery_degree);
  _discovered_refs     = NEW_C_HEAP_ARRAY(DiscoveredList,
                                          _max_num_q * number_of_subclasses_of_ref());
@ -118,7 +118,7 @@ ReferenceProcessor::ReferenceProcessor(MemRegion span,
  _discoveredPhantomRefs = &_discoveredFinalRefs[_max_num_q];

  // Initialize all entries to NULL
-  for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
+  for (uint i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
    _discovered_refs[i].set_head(NULL);
    _discovered_refs[i].set_length(0);
  }
@ -133,7 +133,7 @@ ReferenceProcessor::ReferenceProcessor(MemRegion span,
 #ifndef PRODUCT
 void ReferenceProcessor::verify_no_references_recorded() {
  guarantee(!_discovering_refs, "Discovering refs?");
-  for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
+  for (uint i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
    guarantee(_discovered_refs[i].is_empty(),
              "Found non-empty discovered list");
  }
@ -141,7 +141,7 @@ void ReferenceProcessor::verify_no_references_recorded() {
 #endif

 void ReferenceProcessor::weak_oops_do(OopClosure* f) {
-  for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
+  for (uint i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
    if (UseCompressedOops) {
      f->do_oop((narrowOop*)_discovered_refs[i].adr_head());
    } else {
@ -437,7 +437,7 @@ void ReferenceProcessor::enqueue_discovered_reflists(HeapWord* pending_list_addr
    task_executor->execute(tsk);
  } else {
    // Serial code: call the parent class's implementation
-    for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
+    for (uint i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
      enqueue_discovered_reflist(_discovered_refs[i], pending_list_addr);
      _discovered_refs[i].set_head(NULL);
      _discovered_refs[i].set_length(0);
@ -696,7 +696,7 @@ ReferenceProcessor::abandon_partial_discovered_list(DiscoveredList& refs_list) {

 void ReferenceProcessor::abandon_partial_discovery() {
  // loop over the lists
-  for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
+  for (uint i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
    if (TraceReferenceGC && PrintGCDetails && ((i % _max_num_q) == 0)) {
      gclog_or_tty->print_cr("\nAbandoning %s discovered list", list_name(i));
    }
@ -787,7 +787,7 @@ void ReferenceProcessor::balance_queues(DiscoveredList ref_lists[])
    gclog_or_tty->print_cr("\nBalance ref_lists ");
  }

-  for (int i = 0; i < _max_num_q; ++i) {
+  for (uint i = 0; i < _max_num_q; ++i) {
    total_refs += ref_lists[i].length();
    if (TraceReferenceGC && PrintGCDetails) {
      gclog_or_tty->print("%d ", ref_lists[i].length());
@ -797,8 +797,8 @@ void ReferenceProcessor::balance_queues(DiscoveredList ref_lists[])
    gclog_or_tty->print_cr(" = %d", total_refs);
  }
  size_t avg_refs = total_refs / _num_q + 1;
-  int to_idx = 0;
-  for (int from_idx = 0; from_idx < _max_num_q; from_idx++) {
+  uint to_idx = 0;
+  for (uint from_idx = 0; from_idx < _max_num_q; from_idx++) {
    bool move_all = false;
    if (from_idx >= _num_q) {
      move_all = ref_lists[from_idx].length() > 0;
@ -857,7 +857,7 @@ void ReferenceProcessor::balance_queues(DiscoveredList ref_lists[])
  }
 #ifdef ASSERT
  size_t balanced_total_refs = 0;
-  for (int i = 0; i < _max_num_q; ++i) {
+  for (uint i = 0; i < _max_num_q; ++i) {
    balanced_total_refs += ref_lists[i].length();
    if (TraceReferenceGC && PrintGCDetails) {
      gclog_or_tty->print("%d ", ref_lists[i].length());
@ -903,7 +903,7 @@ ReferenceProcessor::process_discovered_reflist(
  }
  if (PrintReferenceGC && PrintGCDetails) {
    size_t total = 0;
-    for (int i = 0; i < _max_num_q; ++i) {
+    for (uint i = 0; i < _max_num_q; ++i) {
      total += refs_lists[i].length();
    }
    gclog_or_tty->print(", %u refs", total);
@ -919,7 +919,7 @@ ReferenceProcessor::process_discovered_reflist(
      RefProcPhase1Task phase1(*this, refs_lists, policy, true /*marks_oops_alive*/);
      task_executor->execute(phase1);
    } else {
-      for (int i = 0; i < _max_num_q; i++) {
+      for (uint i = 0; i < _max_num_q; i++) {
        process_phase1(refs_lists[i], policy,
                       is_alive, keep_alive, complete_gc);
      }
@ -935,7 +935,7 @@ ReferenceProcessor::process_discovered_reflist(
    RefProcPhase2Task phase2(*this, refs_lists, !discovery_is_atomic() /*marks_oops_alive*/);
    task_executor->execute(phase2);
  } else {
-    for (int i = 0; i < _max_num_q; i++) {
+    for (uint i = 0; i < _max_num_q; i++) {
      process_phase2(refs_lists[i], is_alive, keep_alive, complete_gc);
    }
  }
@ -946,7 +946,7 @@ ReferenceProcessor::process_discovered_reflist(
    RefProcPhase3Task phase3(*this, refs_lists, clear_referent, true /*marks_oops_alive*/);
    task_executor->execute(phase3);
  } else {
-    for (int i = 0; i < _max_num_q; i++) {
+    for (uint i = 0; i < _max_num_q; i++) {
      process_phase3(refs_lists[i], clear_referent,
                     is_alive, keep_alive, complete_gc);
    }
@ -955,7 +955,7 @@ ReferenceProcessor::process_discovered_reflist(

 void ReferenceProcessor::clean_up_discovered_references() {
  // loop over the lists
-  for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
+  for (uint i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
    if (TraceReferenceGC && PrintGCDetails && ((i % _max_num_q) == 0)) {
      gclog_or_tty->print_cr(
        "\nScrubbing %s discovered list of Null referents",
@ -1000,7 +1000,7 @@ void ReferenceProcessor::clean_up_discovered_reflist(DiscoveredList& refs_list)
 }

 inline DiscoveredList* ReferenceProcessor::get_discovered_list(ReferenceType rt) {
-  int id = 0;
+  uint id = 0;
  // Determine the queue index to use for this object.
  if (_discovery_is_mt) {
    // During a multi-threaded discovery phase,
@ -1282,7 +1282,7 @@ void ReferenceProcessor::preclean_discovered_references(
  {
    TraceTime tt("Preclean SoftReferences", PrintGCDetails && PrintReferenceGC,
              false, gclog_or_tty);
-    for (int i = 0; i < _max_num_q; i++) {
+    for (uint i = 0; i < _max_num_q; i++) {
      if (yield->should_return()) {
        return;
      }
@ -1295,7 +1295,7 @@ void ReferenceProcessor::preclean_discovered_references(
  {
    TraceTime tt("Preclean WeakReferences", PrintGCDetails && PrintReferenceGC,
              false, gclog_or_tty);
-    for (int i = 0; i < _max_num_q; i++) {
+    for (uint i = 0; i < _max_num_q; i++) {
      if (yield->should_return()) {
        return;
      }
@ -1308,7 +1308,7 @@ void ReferenceProcessor::preclean_discovered_references(
  {
    TraceTime tt("Preclean FinalReferences", PrintGCDetails && PrintReferenceGC,
              false, gclog_or_tty);
-    for (int i = 0; i < _max_num_q; i++) {
+    for (uint i = 0; i < _max_num_q; i++) {
      if (yield->should_return()) {
        return;
      }
@ -1321,7 +1321,7 @@ void ReferenceProcessor::preclean_discovered_references(
  {
    TraceTime tt("Preclean PhantomReferences", PrintGCDetails && PrintReferenceGC,
              false, gclog_or_tty);
-    for (int i = 0; i < _max_num_q; i++) {
+    for (uint i = 0; i < _max_num_q; i++) {
      if (yield->should_return()) {
        return;
      }
@ -1386,7 +1386,7 @@ ReferenceProcessor::preclean_discovered_reflist(DiscoveredList&    refs_list,
  )
 }

-const char* ReferenceProcessor::list_name(int i) {
+const char* ReferenceProcessor::list_name(uint i) {
   assert(i >= 0 && i <= _max_num_q * number_of_subclasses_of_ref(),
          "Out of bounds index");

@ -1410,7 +1410,7 @@ void ReferenceProcessor::verify_ok_to_handle_reflists() {
 #ifndef PRODUCT
 void ReferenceProcessor::clear_discovered_references() {
  guarantee(!_discovering_refs, "Discovering refs?");
-  for (int i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
+  for (uint i = 0; i < _max_num_q * number_of_subclasses_of_ref(); i++) {
    clear_discovered_references(_discovered_refs[i]);
  }
 }
--- a/hotspot/src/share/vm/memory/referenceProcessor.hpp
+++ b/hotspot/src/share/vm/memory/referenceProcessor.hpp
@ -231,7 +231,7 @@ class ReferenceProcessor : public CHeapObj {
  bool        _enqueuing_is_done;       // true if all weak references enqueued
  bool        _processing_is_mt;        // true during phases when
                                        // reference processing is MT.
-  int         _next_id;                 // round-robin mod _num_q counter in
+  uint        _next_id;                 // round-robin mod _num_q counter in
                                        // support of work distribution

  // For collectors that do not keep GC liveness information
@ -252,9 +252,9 @@ class ReferenceProcessor : public CHeapObj {
  // The discovered ref lists themselves

  // The active MT'ness degree of the queues below
-  int             _num_q;
+  uint             _num_q;
  // The maximum MT'ness degree of the queues below
-  int             _max_num_q;
+  uint             _max_num_q;

  // Master array of discovered oops
  DiscoveredList* _discovered_refs;
@ -268,9 +268,9 @@ class ReferenceProcessor : public CHeapObj {
 public:
  static int number_of_subclasses_of_ref() { return (REF_PHANTOM - REF_OTHER); }

-  int num_q()                              { return _num_q; }
-  int max_num_q()                          { return _max_num_q; }
-  void set_active_mt_degree(int v)         { _num_q = v; }
+  uint num_q()                             { return _num_q; }
+  uint max_num_q()                         { return _max_num_q; }
+  void set_active_mt_degree(uint v)        { _num_q = v; }

  DiscoveredList* discovered_refs()        { return _discovered_refs; }

@ -368,7 +368,7 @@ class ReferenceProcessor : public CHeapObj {

  // Returns the name of the discovered reference list
  // occupying the i / _num_q slot.
-  const char* list_name(int i);
+  const char* list_name(uint i);

  void enqueue_discovered_reflists(HeapWord* pending_list_addr, AbstractRefProcTaskExecutor* task_executor);

@ -388,8 +388,8 @@ class ReferenceProcessor : public CHeapObj {
                                   YieldClosure*      yield);

  // round-robin mod _num_q (not: _not_ mode _max_num_q)
-  int next_id() {
-    int id = _next_id;
+  uint next_id() {
+    uint id = _next_id;
    if (++_next_id == _num_q) {
      _next_id = 0;
    }
@ -434,8 +434,8 @@ class ReferenceProcessor : public CHeapObj {

  // Default parameters give you a vanilla reference processor.
  ReferenceProcessor(MemRegion span,
-                     bool mt_processing = false, int mt_processing_degree = 1,
-                     bool mt_discovery  = false, int mt_discovery_degree  = 1,
+                     bool mt_processing = false, uint mt_processing_degree = 1,
+                     bool mt_discovery  = false, uint mt_discovery_degree  = 1,
                     bool atomic_discovery = true,
                     BoolObjectClosure* is_alive_non_header = NULL,
                     bool discovered_list_needs_barrier = false);
--- a/hotspot/src/share/vm/memory/sharedHeap.cpp
+++ b/hotspot/src/share/vm/memory/sharedHeap.cpp
@ -94,7 +94,7 @@ bool SharedHeap::heap_lock_held_for_gc() {
             && _thread_holds_heap_lock_for_gc);
 }

-void SharedHeap::set_par_threads(int t) {
+void SharedHeap::set_par_threads(uint t) {
  assert(t == 0 || !UseSerialGC, "Cannot have parallel threads");
  _n_par_threads = t;
  _process_strong_tasks->set_n_threads(t);
--- a/hotspot/src/share/vm/memory/sharedHeap.hpp
+++ b/hotspot/src/share/vm/memory/sharedHeap.hpp
@ -287,7 +287,7 @@ public:

  // Sets the number of parallel threads that will be doing tasks
  // (such as process strong roots) subsequently.
-  virtual void set_par_threads(int t);
+  virtual void set_par_threads(uint t);

  int n_termination();
  void set_n_termination(int t);
--- a/hotspot/src/share/vm/oops/arrayKlass.hpp
+++ b/hotspot/src/share/vm/oops/arrayKlass.hpp
@ -73,7 +73,7 @@ class arrayKlass: public Klass {
  oop* adr_component_mirror()           { return (oop*)&this->_component_mirror;}

  // Compiler/Interpreter offset
-  static ByteSize component_mirror_offset() { return byte_offset_of(arrayKlass, _component_mirror); }
+  static ByteSize component_mirror_offset() { return in_ByteSize(sizeof(klassOopDesc) + offset_of(arrayKlass, _component_mirror)); }

  virtual klassOop java_super() const;//{ return SystemDictionary::Object_klass(); }

--- a/hotspot/src/share/vm/oops/instanceKlass.cpp
+++ b/hotspot/src/share/vm/oops/instanceKlass.cpp
@ -208,7 +208,7 @@ void instanceKlass::eager_initialize_impl(instanceKlassHandle this_oop) {
  // abort if someone beat us to the initialization
  if (!this_oop->is_not_initialized()) return;  // note: not equivalent to is_initialized()

-  ClassState old_state = this_oop->_init_state;
+  ClassState old_state = this_oop->init_state();
  link_class_impl(this_oop, true, THREAD);
  if (HAS_PENDING_EXCEPTION) {
    CLEAR_PENDING_EXCEPTION;
@ -2479,7 +2479,7 @@ void instanceKlass::set_init_state(ClassState state) {
  bool good_state = as_klassOop()->is_shared() ? (_init_state <= state)
                                               : (_init_state < state);
  assert(good_state || state == allocated, "illegal state transition");
-  _init_state = state;
+  _init_state = (u1)state;
 }
 #endif

--- a/hotspot/src/share/vm/oops/instanceKlass.hpp
+++ b/hotspot/src/share/vm/oops/instanceKlass.hpp
@ -227,16 +227,12 @@ class instanceKlass: public Klass {
  // (including inherited fields but after header_size()).
  int             _nonstatic_field_size;
  int             _static_field_size;    // number words used by static fields (oop and non-oop) in this klass
-  int             _static_oop_field_count;// number of static oop fields in this klass
+  u2              _static_oop_field_count;// number of static oop fields in this klass
+  u2              _java_fields_count;    // The number of declared Java fields
  int             _nonstatic_oop_map_size;// size in words of nonstatic oop map blocks
-  int             _java_fields_count;    // The number of declared Java fields
-  bool            _is_marked_dependent;  // used for marking during flushing and deoptimization
-  bool            _rewritten;            // methods rewritten.
-  bool            _has_nonstatic_fields; // for sizing with UseCompressedOops
-  bool            _should_verify_class;  // allow caching of preverification
+
  u2              _minor_version;        // minor version number of class file
  u2              _major_version;        // major version number of class file
-  ClassState      _init_state;           // state of class
  Thread*         _init_thread;          // Pointer to current thread doing initialization (to handle recusive initialization)
  int             _vtable_len;           // length of Java vtable (in words)
  int             _itable_len;           // length of Java itable (in words)
@ -260,6 +256,24 @@ class instanceKlass: public Klass {
  JvmtiCachedClassFieldMap* _jvmti_cached_class_field_map;  // JVMTI: used during heap iteration
  volatile u2     _idnum_allocated_count;         // JNI/JVMTI: increments with the addition of methods, old ids don't change

+  // Class states are defined as ClassState (see above).
+  // Place the _init_state here to utilize the 2 unused bytes after
+  // _idnum_allocated_count.
+  u1              _init_state;                    // state of class
+
+  // Compact the following four boolean flags into 1 bit each.  These four flags
+  // were previously separate boolean fields of 1 byte each.  Since there are
+  // 2 bytes unused after the _idnum_allocated_count field, place the
+  // _misc_flags field after _idnum_allocated_count to utilize the unused bits
+  // and save a total of 4 bytes.
+  enum {
+    IS_MARKED_DEPENDENT  = 0x1, // used for marking during flushing and deoptimization
+    REWRITTEN            = 0x2, // methods rewritten.
+    HAS_NONSTATIC_FIELDS = 0x4, // for sizing with UseCompressedOops
+    SHOULD_VERIFY_CLASS  = 0x8  // allow caching of preverification
+  };
+  u1              _misc_flags;
+
  // embedded Java vtable follows here
  // embedded Java itables follows here
  // embedded static fields follows here
@ -269,8 +283,14 @@ class instanceKlass: public Klass {
  friend class SystemDictionary;

 public:
-  bool has_nonstatic_fields() const        { return _has_nonstatic_fields; }
-  void set_has_nonstatic_fields(bool b)    { _has_nonstatic_fields = b; }
+  bool has_nonstatic_fields() const        { return (_misc_flags & HAS_NONSTATIC_FIELDS) != 0; }
+  void set_has_nonstatic_fields(bool b) {
+    if (b) {
+      _misc_flags |= HAS_NONSTATIC_FIELDS;
+    } else {
+      _misc_flags &= ~HAS_NONSTATIC_FIELDS;
+    }
+  }

  // field sizes
  int nonstatic_field_size() const         { return _nonstatic_field_size; }
@ -279,8 +299,8 @@ class instanceKlass: public Klass {
  int static_field_size() const            { return _static_field_size; }
  void set_static_field_size(int size)     { _static_field_size = size; }

-  int static_oop_field_count() const        { return _static_oop_field_count; }
-  void set_static_oop_field_count(int size) { _static_oop_field_count = size; }
+  int static_oop_field_count() const       { return (int)_static_oop_field_count; }
+  void set_static_oop_field_count(u2 size) { _static_oop_field_count = size; }

  // Java vtable
  int  vtable_length() const               { return _vtable_len; }
@ -320,14 +340,14 @@ class instanceKlass: public Klass {
  Symbol* field_signature   (int index) const { return field(index)->signature(constants()); }

  // Number of Java declared fields
-  int java_fields_count() const           { return _java_fields_count; }
+  int java_fields_count() const           { return (int)_java_fields_count; }

  // Number of fields including any injected fields
  int all_fields_count() const            { return _fields->length() / sizeof(FieldInfo::field_slots); }

  typeArrayOop fields() const              { return _fields; }

-  void set_fields(typeArrayOop f, int java_fields_count) {
+  void set_fields(typeArrayOop f, u2 java_fields_count) {
    oop_store_without_check((oop*) &_fields, (oop) f);
    _java_fields_count = java_fields_count;
  }
@ -377,16 +397,24 @@ class instanceKlass: public Klass {
  bool is_being_initialized() const        { return _init_state == being_initialized; }
  bool is_in_error_state() const           { return _init_state == initialization_error; }
  bool is_reentrant_initialization(Thread *thread)  { return thread == _init_thread; }
-  int  get_init_state()                    { return _init_state; } // Useful for debugging
-  bool is_rewritten() const                { return _rewritten; }
+  ClassState  init_state()                 { return (ClassState)_init_state; }
+  bool is_rewritten() const                { return (_misc_flags & REWRITTEN) != 0; }

  // defineClass specified verification
-  bool should_verify_class() const         { return _should_verify_class; }
-  void set_should_verify_class(bool value) { _should_verify_class = value; }
+  bool should_verify_class() const         { return (_misc_flags & SHOULD_VERIFY_CLASS) != 0; }
+  void set_should_verify_class(bool value) {
+    if (value) {
+      _misc_flags |= SHOULD_VERIFY_CLASS;
+    } else {
+      _misc_flags &= ~SHOULD_VERIFY_CLASS;
+    }
+  }
+

  // marking
-  bool is_marked_dependent() const         { return _is_marked_dependent; }
-  void set_is_marked_dependent(bool value) { _is_marked_dependent = value; }
+  bool is_marked_dependent() const         { return (_misc_flags & IS_MARKED_DEPENDENT) != 0; }
+  void set_is_marked_dependent()           { _misc_flags |= IS_MARKED_DEPENDENT; }
+  void clear_is_marked_dependent()         { _misc_flags &= ~IS_MARKED_DEPENDENT; }

  // initialization (virtuals from Klass)
  bool should_be_initialized() const;  // means that initialize should be called
@ -405,7 +433,7 @@ class instanceKlass: public Klass {
  ReferenceType reference_type() const     { return _reference_type; }
  void set_reference_type(ReferenceType t) { _reference_type = t; }

-  static int reference_type_offset_in_bytes() { return offset_of(instanceKlass, _reference_type); }
+  static ByteSize reference_type_offset() { return in_ByteSize(sizeof(klassOopDesc) + offset_of(instanceKlass, _reference_type)); }

  // find local field, returns true if found
  bool find_local_field(Symbol* name, Symbol* sig, fieldDescriptor* fd) const;
@ -616,8 +644,8 @@ class instanceKlass: public Klass {
  void set_breakpoints(BreakpointInfo* bps) { _breakpoints = bps; };

  // support for stub routines
-  static int init_state_offset_in_bytes()    { return offset_of(instanceKlass, _init_state); }
-  static int init_thread_offset_in_bytes()   { return offset_of(instanceKlass, _init_thread); }
+  static ByteSize init_state_offset()  { return in_ByteSize(sizeof(klassOopDesc) + offset_of(instanceKlass, _init_state)); }
+  static ByteSize init_thread_offset() { return in_ByteSize(sizeof(klassOopDesc) + offset_of(instanceKlass, _init_thread)); }

  // subclass/subinterface checks
  bool implements_interface(klassOop k) const;
@ -754,9 +782,9 @@ private:
 #ifdef ASSERT
  void set_init_state(ClassState state);
 #else
-  void set_init_state(ClassState state) { _init_state = state; }
+  void set_init_state(ClassState state) { _init_state = (u1)state; }
 #endif
-  void set_rewritten()                  { _rewritten = true; }
+  void set_rewritten()                  { _misc_flags |= REWRITTEN; }
  void set_init_thread(Thread *thread)  { _init_thread = thread; }

  u2 idnum_allocated_count() const      { return _idnum_allocated_count; }
--- a/hotspot/src/share/vm/oops/instanceKlassKlass.cpp
+++ b/hotspot/src/share/vm/oops/instanceKlassKlass.cpp
@ -399,7 +399,7 @@ instanceKlassKlass::allocate_instance_klass(Symbol* name, int vtable_len, int it
    ik->set_inner_classes(NULL);
    ik->set_static_oop_field_count(0);
    ik->set_nonstatic_field_size(0);
-    ik->set_is_marked_dependent(false);
+    ik->clear_is_marked_dependent();
    ik->set_init_state(instanceKlass::allocated);
    ik->set_init_thread(NULL);
    ik->set_reference_type(rt);
--- a/hotspot/src/share/vm/oops/klass.cpp
+++ b/hotspot/src/share/vm/oops/klass.cpp
@ -144,7 +144,7 @@ klassOop Klass::base_create_klass_oop(KlassHandle& klass, int size,
    }
    kl->set_secondary_supers(NULL);
    oop_store_without_check((oop*) &kl->_primary_supers[0], k);
-    kl->set_super_check_offset(primary_supers_offset_in_bytes() + sizeof(oopDesc));
+    kl->set_super_check_offset(in_bytes(primary_supers_offset()));
  }

  kl->set_java_mirror(NULL);
--- a/hotspot/src/share/vm/oops/klass.hpp
+++ b/hotspot/src/share/vm/oops/klass.hpp
@ -318,7 +318,7 @@ class Klass : public Klass_vtbl {
  // Can this klass be a primary super?  False for interfaces and arrays of
  // interfaces.  False also for arrays or classes with long super chains.
  bool can_be_primary_super() const {
-    const juint secondary_offset = secondary_super_cache_offset_in_bytes() + sizeof(oopDesc);
+    const juint secondary_offset = in_bytes(secondary_super_cache_offset());
    return super_check_offset() != secondary_offset;
  }
  virtual bool can_be_primary_super_slow() const;
@ -328,7 +328,7 @@ class Klass : public Klass_vtbl {
    if (!can_be_primary_super()) {
      return primary_super_limit();
    } else {
-      juint d = (super_check_offset() - (primary_supers_offset_in_bytes() + sizeof(oopDesc))) / sizeof(klassOop);
+      juint d = (super_check_offset() - in_bytes(primary_supers_offset())) / sizeof(klassOop);
      assert(d < primary_super_limit(), "oob");
      assert(_primary_supers[d] == as_klassOop(), "proper init");
      return d;
@ -378,15 +378,15 @@ class Klass : public Klass_vtbl {
  virtual void set_alloc_size(juint n) = 0;

  // Compiler support
-  static int super_offset_in_bytes()         { return offset_of(Klass, _super); }
-  static int super_check_offset_offset_in_bytes() { return offset_of(Klass, _super_check_offset); }
-  static int primary_supers_offset_in_bytes(){ return offset_of(Klass, _primary_supers); }
-  static int secondary_super_cache_offset_in_bytes() { return offset_of(Klass, _secondary_super_cache); }
-  static int secondary_supers_offset_in_bytes() { return offset_of(Klass, _secondary_supers); }
-  static int java_mirror_offset_in_bytes()   { return offset_of(Klass, _java_mirror); }
-  static int modifier_flags_offset_in_bytes(){ return offset_of(Klass, _modifier_flags); }
-  static int layout_helper_offset_in_bytes() { return offset_of(Klass, _layout_helper); }
-  static int access_flags_offset_in_bytes()  { return offset_of(Klass, _access_flags); }
+  static ByteSize super_offset()                 { return in_ByteSize(sizeof(klassOopDesc) + offset_of(Klass, _super)); }
+  static ByteSize super_check_offset_offset()    { return in_ByteSize(sizeof(klassOopDesc) + offset_of(Klass, _super_check_offset)); }
+  static ByteSize primary_supers_offset()        { return in_ByteSize(sizeof(klassOopDesc) + offset_of(Klass, _primary_supers)); }
+  static ByteSize secondary_super_cache_offset() { return in_ByteSize(sizeof(klassOopDesc) + offset_of(Klass, _secondary_super_cache)); }
+  static ByteSize secondary_supers_offset()      { return in_ByteSize(sizeof(klassOopDesc) + offset_of(Klass, _secondary_supers)); }
+  static ByteSize java_mirror_offset()           { return in_ByteSize(sizeof(klassOopDesc) + offset_of(Klass, _java_mirror)); }
+  static ByteSize modifier_flags_offset()        { return in_ByteSize(sizeof(klassOopDesc) + offset_of(Klass, _modifier_flags)); }
+  static ByteSize layout_helper_offset()         { return in_ByteSize(sizeof(klassOopDesc) + offset_of(Klass, _layout_helper)); }
+  static ByteSize access_flags_offset()          { return in_ByteSize(sizeof(klassOopDesc) + offset_of(Klass, _access_flags)); }

  // Unpacking layout_helper:
  enum {
@ -483,7 +483,7 @@ class Klass : public Klass_vtbl {
  bool is_subtype_of(klassOop k) const {
    juint    off = k->klass_part()->super_check_offset();
    klassOop sup = *(klassOop*)( (address)as_klassOop() + off );
-    const juint secondary_offset = secondary_super_cache_offset_in_bytes() + sizeof(oopDesc);
+    const juint secondary_offset = in_bytes(secondary_super_cache_offset());
    if (sup == k) {
      return true;
    } else if (off != secondary_offset) {
@ -679,7 +679,7 @@ class Klass : public Klass_vtbl {
  // are potential problems in setting the bias pattern for
  // JVM-internal oops.
  inline void set_prototype_header(markOop header);
-  static int prototype_header_offset_in_bytes() { return offset_of(Klass, _prototype_header); }
+  static ByteSize prototype_header_offset() { return in_ByteSize(sizeof(klassOopDesc) + offset_of(Klass, _prototype_header)); }

  int  biased_lock_revocation_count() const { return (int) _biased_lock_revocation_count; }
  // Atomically increments biased_lock_revocation_count and returns updated value
--- a/hotspot/src/share/vm/oops/klassOop.hpp
+++ b/hotspot/src/share/vm/oops/klassOop.hpp
@ -38,14 +38,8 @@

 class klassOopDesc : public oopDesc {
 public:
-  // size operation
-  static int header_size()                       { return sizeof(klassOopDesc)/HeapWordSize; }
-
-  // support for code generation
-  static int klass_part_offset_in_bytes()        { return sizeof(klassOopDesc); }
-
  // returns the Klass part containing dispatching behavior
-  Klass* klass_part() const                      { return (Klass*)((address)this + klass_part_offset_in_bytes()); }
+  Klass* klass_part() const                      { return (Klass*)((address)this + sizeof(klassOopDesc)); }

  // Convenience wrapper
  inline oop java_mirror() const;
--- a/hotspot/src/share/vm/oops/objArrayKlass.hpp
+++ b/hotspot/src/share/vm/oops/objArrayKlass.hpp
@ -47,7 +47,7 @@ class objArrayKlass : public arrayKlass {
  oop* bottom_klass_addr()            { return (oop*)&_bottom_klass; }

  // Compiler/Interpreter offset
-  static int element_klass_offset_in_bytes() { return offset_of(objArrayKlass, _element_klass); }
+  static ByteSize element_klass_offset() { return in_ByteSize(sizeof(klassOopDesc) + offset_of(objArrayKlass, _element_klass)); }

  // Dispatched operation
  bool can_be_primary_super_slow() const;
--- a/hotspot/src/share/vm/opto/callnode.hpp
+++ b/hotspot/src/share/vm/opto/callnode.hpp
@ -791,6 +791,10 @@ public:
  // are defined in graphKit.cpp, which sets up the bidirectional relation.)
  InitializeNode* initialization();

+  // Return the corresponding storestore barrier (or null if none).
+  // Walks out edges to find it...
+  MemBarStoreStoreNode* storestore();
+
  // Convenience for initialization->maybe_set_complete(phase)
  bool maybe_set_complete(PhaseGVN* phase);
 };
--- a/hotspot/src/share/vm/opto/classes.hpp
+++ b/hotspot/src/share/vm/opto/classes.hpp
@ -166,6 +166,7 @@ macro(MemBarCPUOrder)
 macro(MemBarRelease)
 macro(MemBarReleaseLock)
 macro(MemBarVolatile)
+macro(MemBarStoreStore)
 macro(MergeMem)
 macro(MinI)
 macro(ModD)
--- a/hotspot/src/share/vm/opto/compile.cpp
+++ b/hotspot/src/share/vm/opto/compile.cpp
@ -1282,12 +1282,11 @@ const TypePtr *Compile::flatten_alias_type( const TypePtr *tj ) const {
  if( tk ) {
    // If we are referencing a field within a Klass, we need
    // to assume the worst case of an Object.  Both exact and
-    // inexact types must flatten to the same alias class.
-    // Since the flattened result for a klass is defined to be
-    // precisely java.lang.Object, use a constant ptr.
+    // inexact types must flatten to the same alias class so
+    // use NotNull as the PTR.
    if ( offset == Type::OffsetBot || (offset >= 0 && (size_t)offset < sizeof(Klass)) ) {

-      tj = tk = TypeKlassPtr::make(TypePtr::Constant,
+      tj = tk = TypeKlassPtr::make(TypePtr::NotNull,
                                   TypeKlassPtr::OBJECT->klass(),
                                   offset);
    }
@ -1307,10 +1306,12 @@ const TypePtr *Compile::flatten_alias_type( const TypePtr *tj ) const {
    // these 2 disparate memories into the same alias class.  Since the
    // primary supertype array is read-only, there's no chance of confusion
    // where we bypass an array load and an array store.
-    uint off2 = offset - Klass::primary_supers_offset_in_bytes();
+    int primary_supers_offset = in_bytes(Klass::primary_supers_offset());
    if (offset == Type::OffsetBot ||
-        off2 < Klass::primary_super_limit()*wordSize ) {
-      offset = sizeof(oopDesc) +Klass::secondary_super_cache_offset_in_bytes();
+        (offset >= primary_supers_offset &&
+         offset < (int)(primary_supers_offset + Klass::primary_super_limit() * wordSize)) ||
+        offset == (int)in_bytes(Klass::secondary_super_cache_offset())) {
+      offset = in_bytes(Klass::secondary_super_cache_offset());
      tj = tk = TypeKlassPtr::make( TypePtr::NotNull, tk->klass(), offset );
    }
  }
@ -1489,13 +1490,13 @@ Compile::AliasType* Compile::find_alias_type(const TypePtr* adr_type, bool no_cr
        alias_type(idx)->set_rewritable(false);
    }
    if (flat->isa_klassptr()) {
-      if (flat->offset() == Klass::super_check_offset_offset_in_bytes() + (int)sizeof(oopDesc))
+      if (flat->offset() == in_bytes(Klass::super_check_offset_offset()))
        alias_type(idx)->set_rewritable(false);
-      if (flat->offset() == Klass::modifier_flags_offset_in_bytes() + (int)sizeof(oopDesc))
+      if (flat->offset() == in_bytes(Klass::modifier_flags_offset()))
        alias_type(idx)->set_rewritable(false);
-      if (flat->offset() == Klass::access_flags_offset_in_bytes() + (int)sizeof(oopDesc))
+      if (flat->offset() == in_bytes(Klass::access_flags_offset()))
        alias_type(idx)->set_rewritable(false);
-      if (flat->offset() == Klass::java_mirror_offset_in_bytes() + (int)sizeof(oopDesc))
+      if (flat->offset() == in_bytes(Klass::java_mirror_offset()))
        alias_type(idx)->set_rewritable(false);
    }
    // %%% (We would like to finalize JavaThread::threadObj_offset(),
@ -2521,7 +2522,7 @@ static void final_graph_reshaping_impl( Node *n, Final_Reshape_Counts &frc ) {
            break;
          }
        }
-        assert(p != NULL, "must be found");
+        assert(proj != NULL, "must be found");
        p->subsume_by(proj);
      }
    }
--- a/hotspot/src/share/vm/opto/escape.cpp
+++ b/hotspot/src/share/vm/opto/escape.cpp
@ -1595,6 +1595,7 @@ bool ConnectionGraph::compute_escape() {
  GrowableArray<Node*> alloc_worklist;
  GrowableArray<Node*> addp_worklist;
  GrowableArray<Node*> ptr_cmp_worklist;
+  GrowableArray<Node*> storestore_worklist;
  PhaseGVN* igvn = _igvn;

  // Push all useful nodes onto CG list and set their type.
@ -1618,6 +1619,11 @@ bool ConnectionGraph::compute_escape() {
               (n->Opcode() == Op_CmpP || n->Opcode() == Op_CmpN)) {
      // Compare pointers nodes
      ptr_cmp_worklist.append(n);
+    } else if (n->is_MemBarStoreStore()) {
+      // Collect all MemBarStoreStore nodes so that depending on the
+      // escape status of the associated Allocate node some of them
+      // may be eliminated.
+      storestore_worklist.append(n);
    }
    for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
      Node* m = n->fast_out(i);   // Get user
@ -1724,11 +1730,20 @@ bool ConnectionGraph::compute_escape() {
  uint alloc_length = alloc_worklist.length();
  for (uint next = 0; next < alloc_length; ++next) {
    Node* n = alloc_worklist.at(next);
-    if (ptnode_adr(n->_idx)->escape_state() == PointsToNode::NoEscape) {
+    PointsToNode::EscapeState es = ptnode_adr(n->_idx)->escape_state();
+    if (es == PointsToNode::NoEscape) {
      has_non_escaping_obj = true;
      if (n->is_Allocate()) {
        find_init_values(n, &visited, igvn);
+        // The object allocated by this Allocate node will never be
+        // seen by another thread. Mark it so that when it is
+        // expanded no MemBarStoreStore is added.
+        n->as_Allocate()->initialization()->set_does_not_escape();
      }
+    } else if ((es == PointsToNode::ArgEscape) && n->is_Allocate()) {
+      // Same as above. Mark this Allocate node so that when it is
+      // expanded no MemBarStoreStore is added.
+      n->as_Allocate()->initialization()->set_does_not_escape();
    }
  }

@ -1874,6 +1889,25 @@ bool ConnectionGraph::compute_escape() {
      igvn->hash_delete(_pcmp_eq);
  }

+  // For MemBarStoreStore nodes added in library_call.cpp, check
+  // escape status of associated AllocateNode and optimize out
+  // MemBarStoreStore node if the allocated object never escapes.
+  while (storestore_worklist.length() != 0) {
+    Node *n = storestore_worklist.pop();
+    MemBarStoreStoreNode *storestore = n ->as_MemBarStoreStore();
+    Node *alloc = storestore->in(MemBarNode::Precedent)->in(0);
+    assert (alloc->is_Allocate(), "storestore should point to AllocateNode");
+    PointsToNode::EscapeState es = ptnode_adr(alloc->_idx)->escape_state();
+    if (es == PointsToNode::NoEscape || es == PointsToNode::ArgEscape) {
+      MemBarNode* mb = MemBarNode::make(C, Op_MemBarCPUOrder, Compile::AliasIdxBot);
+      mb->init_req(TypeFunc::Memory, storestore->in(TypeFunc::Memory));
+      mb->init_req(TypeFunc::Control, storestore->in(TypeFunc::Control));
+
+      _igvn->register_new_node_with_optimizer(mb);
+      _igvn->replace_node(storestore, mb);
+    }
+  }
+
 #ifndef PRODUCT
  if (PrintEscapeAnalysis) {
    dump(); // Dump ConnectionGraph
--- a/hotspot/src/share/vm/opto/graphKit.cpp
+++ b/hotspot/src/share/vm/opto/graphKit.cpp
@ -2304,9 +2304,9 @@ Node* GraphKit::gen_subtype_check(Node* subklass, Node* superklass) {
  // will always succeed.  We could leave a dependency behind to ensure this.

  // First load the super-klass's check-offset
-  Node *p1 = basic_plus_adr( superklass, superklass, sizeof(oopDesc) + Klass::super_check_offset_offset_in_bytes() );
+  Node *p1 = basic_plus_adr( superklass, superklass, in_bytes(Klass::super_check_offset_offset()) );
  Node *chk_off = _gvn.transform( new (C, 3) LoadINode( NULL, memory(p1), p1, _gvn.type(p1)->is_ptr() ) );
-  int cacheoff_con = sizeof(oopDesc) + Klass::secondary_super_cache_offset_in_bytes();
+  int cacheoff_con = in_bytes(Klass::secondary_super_cache_offset());
  bool might_be_cache = (find_int_con(chk_off, cacheoff_con) == cacheoff_con);

  // Load from the sub-klass's super-class display list, or a 1-word cache of
@ -2934,7 +2934,7 @@ Node* GraphKit::get_layout_helper(Node* klass_node, jint& constant_value) {
    }
  }
  constant_value = Klass::_lh_neutral_value;  // put in a known value
-  Node* lhp = basic_plus_adr(klass_node, klass_node, Klass::layout_helper_offset_in_bytes() + sizeof(oopDesc));
+  Node* lhp = basic_plus_adr(klass_node, klass_node, in_bytes(Klass::layout_helper_offset()));
  return make_load(NULL, lhp, TypeInt::INT, T_INT);
 }

@ -3337,6 +3337,19 @@ InitializeNode* AllocateNode::initialization() {
  return NULL;
 }

+// Trace Allocate -> Proj[Parm] -> MemBarStoreStore
+MemBarStoreStoreNode* AllocateNode::storestore() {
+  ProjNode* rawoop = proj_out(AllocateNode::RawAddress);
+  if (rawoop == NULL)  return NULL;
+  for (DUIterator_Fast imax, i = rawoop->fast_outs(imax); i < imax; i++) {
+    Node* storestore = rawoop->fast_out(i);
+    if (storestore->is_MemBarStoreStore()) {
+      return storestore->as_MemBarStoreStore();
+    }
+  }
+  return NULL;
+}
+
 //----------------------------- loop predicates ---------------------------

 //------------------------------add_predicate_impl----------------------------
--- a/hotspot/src/share/vm/opto/library_call.cpp
+++ b/hotspot/src/share/vm/opto/library_call.cpp
@ -2165,8 +2165,7 @@ void LibraryCallKit::insert_g1_pre_barrier(Node* base_oop, Node* offset, Node* p
  IdealKit ideal(this);
 #define __ ideal.

-  const int reference_type_offset = instanceKlass::reference_type_offset_in_bytes() +
-                                        sizeof(oopDesc);
+  const int reference_type_offset = in_bytes(instanceKlass::reference_type_offset());

  Node* referent_off = __ ConX(java_lang_ref_Reference::referent_offset);

@ -2806,8 +2805,10 @@ bool LibraryCallKit::inline_unsafe_allocate() {
  // Note:  The argument might still be an illegal value like
  // Serializable.class or Object[].class.   The runtime will handle it.
  // But we must make an explicit check for initialization.
-  Node* insp = basic_plus_adr(kls, instanceKlass::init_state_offset_in_bytes() + sizeof(oopDesc));
-  Node* inst = make_load(NULL, insp, TypeInt::INT, T_INT);
+  Node* insp = basic_plus_adr(kls, in_bytes(instanceKlass::init_state_offset()));
+  // Use T_BOOLEAN for instanceKlass::_init_state so the compiler
+  // can generate code to load it as an unsigned byte.
+  Node* inst = make_load(NULL, insp, TypeInt::UBYTE, T_BOOLEAN);
  Node* bits = intcon(instanceKlass::fully_initialized);
  Node* test = _gvn.transform( new (C, 3) SubINode(inst, bits) );
  // The 'test' is non-zero if we need to take a slow path.
@ -2954,7 +2955,7 @@ bool LibraryCallKit::inline_native_isInterrupted() {
 //---------------------------load_mirror_from_klass----------------------------
 // Given a klass oop, load its java mirror (a java.lang.Class oop).
 Node* LibraryCallKit::load_mirror_from_klass(Node* klass) {
-  Node* p = basic_plus_adr(klass, Klass::java_mirror_offset_in_bytes() + sizeof(oopDesc));
+  Node* p = basic_plus_adr(klass, in_bytes(Klass::java_mirror_offset()));
  return make_load(NULL, p, TypeInstPtr::MIRROR, T_OBJECT);
 }

@ -2994,7 +2995,7 @@ Node* LibraryCallKit::load_klass_from_mirror_common(Node* mirror,
 Node* LibraryCallKit::generate_access_flags_guard(Node* kls, int modifier_mask, int modifier_bits, RegionNode* region) {
  // Branch around if the given klass has the given modifier bit set.
  // Like generate_guard, adds a new path onto the region.
-  Node* modp = basic_plus_adr(kls, Klass::access_flags_offset_in_bytes() + sizeof(oopDesc));
+  Node* modp = basic_plus_adr(kls, in_bytes(Klass::access_flags_offset()));
  Node* mods = make_load(NULL, modp, TypeInt::INT, T_INT);
  Node* mask = intcon(modifier_mask);
  Node* bits = intcon(modifier_bits);
@ -3115,7 +3116,7 @@ bool LibraryCallKit::inline_native_Class_query(vmIntrinsics::ID id) {
    break;

  case vmIntrinsics::_getModifiers:
-    p = basic_plus_adr(kls, Klass::modifier_flags_offset_in_bytes() + sizeof(oopDesc));
+    p = basic_plus_adr(kls, in_bytes(Klass::modifier_flags_offset()));
    query_value = make_load(NULL, p, TypeInt::INT, T_INT);
    break;

@ -3155,7 +3156,7 @@ bool LibraryCallKit::inline_native_Class_query(vmIntrinsics::ID id) {
      // A guard was added.  If the guard is taken, it was an array.
      phi->add_req(makecon(TypeInstPtr::make(env()->Object_klass()->java_mirror())));
    // If we fall through, it's a plain class.  Get its _super.
-    p = basic_plus_adr(kls, Klass::super_offset_in_bytes() + sizeof(oopDesc));
+    p = basic_plus_adr(kls, in_bytes(Klass::super_offset()));
    kls = _gvn.transform( LoadKlassNode::make(_gvn, immutable_memory(), p, TypeRawPtr::BOTTOM, TypeKlassPtr::OBJECT_OR_NULL) );
    null_ctl = top();
    kls = null_check_oop(kls, &null_ctl);
@ -3173,7 +3174,7 @@ bool LibraryCallKit::inline_native_Class_query(vmIntrinsics::ID id) {
    if (generate_array_guard(kls, region) != NULL) {
      // Be sure to pin the oop load to the guard edge just created:
      Node* is_array_ctrl = region->in(region->req()-1);
-      Node* cma = basic_plus_adr(kls, in_bytes(arrayKlass::component_mirror_offset()) + sizeof(oopDesc));
+      Node* cma = basic_plus_adr(kls, in_bytes(arrayKlass::component_mirror_offset()));
      Node* cmo = make_load(is_array_ctrl, cma, TypeInstPtr::MIRROR, T_OBJECT);
      phi->add_req(cmo);
    }
@ -3181,7 +3182,7 @@ bool LibraryCallKit::inline_native_Class_query(vmIntrinsics::ID id) {
    break;

  case vmIntrinsics::_getClassAccessFlags:
-    p = basic_plus_adr(kls, Klass::access_flags_offset_in_bytes() + sizeof(oopDesc));
+    p = basic_plus_adr(kls, in_bytes(Klass::access_flags_offset()));
    query_value = make_load(NULL, p, TypeInt::INT, T_INT);
    break;

@ -4194,12 +4195,17 @@ void LibraryCallKit::copy_to_clone(Node* obj, Node* alloc_obj, Node* obj_size, b
  Node* raw_obj = alloc_obj->in(1);
  assert(alloc_obj->is_CheckCastPP() && raw_obj->is_Proj() && raw_obj->in(0)->is_Allocate(), "");

+  AllocateNode* alloc = NULL;
  if (ReduceBulkZeroing) {
    // We will be completely responsible for initializing this object -
    // mark Initialize node as complete.
-    AllocateNode* alloc = AllocateNode::Ideal_allocation(alloc_obj, &_gvn);
+    alloc = AllocateNode::Ideal_allocation(alloc_obj, &_gvn);
    // The object was just allocated - there should be no any stores!
    guarantee(alloc != NULL && alloc->maybe_set_complete(&_gvn), "");
+    // Mark as complete_with_arraycopy so that on AllocateNode
+    // expansion, we know this AllocateNode is initialized by an array
+    // copy and a StoreStore barrier exists after the array copy.
+    alloc->initialization()->set_complete_with_arraycopy();
  }

  // Copy the fastest available way.
@ -4261,8 +4267,19 @@ void LibraryCallKit::copy_to_clone(Node* obj, Node* alloc_obj, Node* obj_size, b
  }

  // Do not let reads from the cloned object float above the arraycopy.
+  if (alloc != NULL) {
+    // Do not let stores that initialize this object be reordered with
+    // a subsequent store that would make this object accessible by
+    // other threads.
+    // Record what AllocateNode this StoreStore protects so that
+    // escape analysis can go from the MemBarStoreStoreNode to the
+    // AllocateNode and eliminate the MemBarStoreStoreNode if possible
+    // based on the escape status of the AllocateNode.
+    insert_mem_bar(Op_MemBarStoreStore, alloc->proj_out(AllocateNode::RawAddress));
+  } else {
    insert_mem_bar(Op_MemBarCPUOrder);
  }
+}

 //------------------------inline_native_clone----------------------------
 // Here are the simple edge cases:
@ -4857,7 +4874,7 @@ LibraryCallKit::generate_arraycopy(const TypePtr* adr_type,
      PreserveJVMState pjvms(this);
      set_control(not_subtype_ctrl);
      // (At this point we can assume disjoint_bases, since types differ.)
-      int ek_offset = objArrayKlass::element_klass_offset_in_bytes() + sizeof(oopDesc);
+      int ek_offset = in_bytes(objArrayKlass::element_klass_offset());
      Node* p1 = basic_plus_adr(dest_klass, ek_offset);
      Node* n1 = LoadKlassNode::make(_gvn, immutable_memory(), p1, TypeRawPtr::BOTTOM);
      Node* dest_elem_klass = _gvn.transform(n1);
@ -5004,7 +5021,16 @@ LibraryCallKit::generate_arraycopy(const TypePtr* adr_type,
  // the membar also.
  //
  // Do not let reads from the cloned object float above the arraycopy.
-  if (InsertMemBarAfterArraycopy || alloc != NULL)
+  if (alloc != NULL) {
+    // Do not let stores that initialize this object be reordered with
+    // a subsequent store that would make this object accessible by
+    // other threads.
+    // Record what AllocateNode this StoreStore protects so that
+    // escape analysis can go from the MemBarStoreStoreNode to the
+    // AllocateNode and eliminate the MemBarStoreStoreNode if possible
+    // based on the escape status of the AllocateNode.
+    insert_mem_bar(Op_MemBarStoreStore, alloc->proj_out(AllocateNode::RawAddress));
+  } else if (InsertMemBarAfterArraycopy)
    insert_mem_bar(Op_MemBarCPUOrder);
 }

@ -5308,7 +5334,7 @@ LibraryCallKit::generate_checkcast_arraycopy(const TypePtr* adr_type,
  // for the target array.  This is an optimistic check.  It will
  // look in each non-null element's class, at the desired klass's
  // super_check_offset, for the desired klass.
-  int sco_offset = Klass::super_check_offset_offset_in_bytes() + sizeof(oopDesc);
+  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Node* p3 = basic_plus_adr(dest_elem_klass, sco_offset);
  Node* n3 = new(C, 3) LoadINode(NULL, memory(p3), p3, _gvn.type(p3)->is_ptr());
  Node* check_offset = ConvI2X(_gvn.transform(n3));
--- a/hotspot/src/share/vm/opto/macro.cpp
+++ b/hotspot/src/share/vm/opto/macro.cpp
@ -1088,6 +1088,12 @@ void PhaseMacroExpand::expand_allocate_common(
  Node* klass_node        = alloc->in(AllocateNode::KlassNode);
  Node* initial_slow_test = alloc->in(AllocateNode::InitialTest);

+  Node* storestore = alloc->storestore();
+  if (storestore != NULL) {
+    // Break this link that is no longer useful and confuses register allocation
+    storestore->set_req(MemBarNode::Precedent, top());
+  }
+
  assert(ctrl != NULL, "must have control");
  // We need a Region and corresponding Phi's to merge the slow-path and fast-path results.
  // they will not be used if "always_slow" is set
@ -1289,10 +1295,66 @@ void PhaseMacroExpand::expand_allocate_common(
                                   0, new_alloc_bytes, T_LONG);
    }

+    InitializeNode* init = alloc->initialization();
    fast_oop_rawmem = initialize_object(alloc,
                                        fast_oop_ctrl, fast_oop_rawmem, fast_oop,
                                        klass_node, length, size_in_bytes);

+    // If initialization is performed by an array copy, any required
+    // MemBarStoreStore was already added. If the object does not
+    // escape no need for a MemBarStoreStore. Otherwise we need a
+    // MemBarStoreStore so that stores that initialize this object
+    // can't be reordered with a subsequent store that makes this
+    // object accessible by other threads.
+    if (init == NULL || (!init->is_complete_with_arraycopy() && !init->does_not_escape())) {
+      if (init == NULL || init->req() < InitializeNode::RawStores) {
+        // No InitializeNode or no stores captured by zeroing
+        // elimination. Simply add the MemBarStoreStore after object
+        // initialization.
+        MemBarNode* mb = MemBarNode::make(C, Op_MemBarStoreStore, Compile::AliasIdxBot, fast_oop_rawmem);
+        transform_later(mb);
+
+        mb->init_req(TypeFunc::Memory, fast_oop_rawmem);
+        mb->init_req(TypeFunc::Control, fast_oop_ctrl);
+        fast_oop_ctrl = new (C, 1) ProjNode(mb,TypeFunc::Control);
+        transform_later(fast_oop_ctrl);
+        fast_oop_rawmem = new (C, 1) ProjNode(mb,TypeFunc::Memory);
+        transform_later(fast_oop_rawmem);
+      } else {
+        // Add the MemBarStoreStore after the InitializeNode so that
+        // all stores performing the initialization that were moved
+        // before the InitializeNode happen before the storestore
+        // barrier.
+
+        Node* init_ctrl = init->proj_out(TypeFunc::Control);
+        Node* init_mem = init->proj_out(TypeFunc::Memory);
+
+        MemBarNode* mb = MemBarNode::make(C, Op_MemBarStoreStore, Compile::AliasIdxBot);
+        transform_later(mb);
+
+        Node* ctrl = new (C, 1) ProjNode(init,TypeFunc::Control);
+        transform_later(ctrl);
+        Node* mem = new (C, 1) ProjNode(init,TypeFunc::Memory);
+        transform_later(mem);
+
+        // The MemBarStoreStore depends on control and memory coming
+        // from the InitializeNode
+        mb->init_req(TypeFunc::Memory, mem);
+        mb->init_req(TypeFunc::Control, ctrl);
+
+        ctrl = new (C, 1) ProjNode(mb,TypeFunc::Control);
+        transform_later(ctrl);
+        mem = new (C, 1) ProjNode(mb,TypeFunc::Memory);
+        transform_later(mem);
+
+        // All nodes that depended on the InitializeNode for control
+        // and memory must now depend on the MemBarNode that itself
+        // depends on the InitializeNode
+        _igvn.replace_node(init_ctrl, ctrl);
+        _igvn.replace_node(init_mem, mem);
+      }
+    }
+
    if (C->env()->dtrace_extended_probes()) {
      // Slow-path call
      int size = TypeFunc::Parms + 2;
@ -1326,6 +1388,7 @@ void PhaseMacroExpand::expand_allocate_common(
    result_phi_rawmem->init_req(fast_result_path, fast_oop_rawmem);
  } else {
    slow_region = ctrl;
+    result_phi_i_o = i_o; // Rename it to use in the following code.
  }

  // Generate slow-path call
@ -1350,6 +1413,10 @@ void PhaseMacroExpand::expand_allocate_common(
  copy_call_debug_info((CallNode *) alloc,  call);
  if (!always_slow) {
    call->set_cnt(PROB_UNLIKELY_MAG(4));  // Same effect as RC_UNCOMMON.
+  } else {
+    // Hook i_o projection to avoid its elimination during allocation
+    // replacement (when only a slow call is generated).
+    call->set_req(TypeFunc::I_O, result_phi_i_o);
  }
  _igvn.replace_node(alloc, call);
  transform_later(call);
@ -1366,8 +1433,10 @@ void PhaseMacroExpand::expand_allocate_common(
  //
  extract_call_projections(call);

-  // An allocate node has separate memory projections for the uses on the control and i_o paths
-  // Replace uses of the control memory projection with result_phi_rawmem (unless we are only generating a slow call)
+  // An allocate node has separate memory projections for the uses on
+  // the control and i_o paths. Replace the control memory projection with
+  // result_phi_rawmem (unless we are only generating a slow call when
+  // both memory projections are combined)
  if (!always_slow && _memproj_fallthrough != NULL) {
    for (DUIterator_Fast imax, i = _memproj_fallthrough->fast_outs(imax); i < imax; i++) {
      Node *use = _memproj_fallthrough->fast_out(i);
@ -1378,8 +1447,8 @@ void PhaseMacroExpand::expand_allocate_common(
      --i;
    }
  }
-  // Now change uses of _memproj_catchall to use _memproj_fallthrough and delete _memproj_catchall so
-  // we end up with a call that has only 1 memory projection
+  // Now change uses of _memproj_catchall to use _memproj_fallthrough and delete
+  // _memproj_catchall so we end up with a call that has only 1 memory projection.
  if (_memproj_catchall != NULL ) {
    if (_memproj_fallthrough == NULL) {
      _memproj_fallthrough = new (C, 1) ProjNode(call, TypeFunc::Memory);
@ -1393,17 +1462,18 @@ void PhaseMacroExpand::expand_allocate_common(
      // back up iterator
      --i;
    }
+    assert(_memproj_catchall->outcnt() == 0, "all uses must be deleted");
+    _igvn.remove_dead_node(_memproj_catchall);
  }

-  // An allocate node has separate i_o projections for the uses on the control and i_o paths
-  // Replace uses of the control i_o projection with result_phi_i_o (unless we are only generating a slow call)
-  if (_ioproj_fallthrough == NULL) {
-    _ioproj_fallthrough = new (C, 1) ProjNode(call, TypeFunc::I_O);
-    transform_later(_ioproj_fallthrough);
-  } else if (!always_slow) {
+  // An allocate node has separate i_o projections for the uses on the control
+  // and i_o paths. Always replace the control i_o projection with result i_o
+  // otherwise incoming i_o becomes dead when only a slow call is generated
+  // (it is different from memory projections where both projections are
+  // combined in such case).
+  if (_ioproj_fallthrough != NULL) {
    for (DUIterator_Fast imax, i = _ioproj_fallthrough->fast_outs(imax); i < imax; i++) {
      Node *use = _ioproj_fallthrough->fast_out(i);
-
      _igvn.hash_delete(use);
      imax -= replace_input(use, _ioproj_fallthrough, result_phi_i_o);
      _igvn._worklist.push(use);
@ -1411,9 +1481,13 @@ void PhaseMacroExpand::expand_allocate_common(
      --i;
    }
  }
-  // Now change uses of _ioproj_catchall to use _ioproj_fallthrough and delete _ioproj_catchall so
-  // we end up with a call that has only 1 control projection
+  // Now change uses of _ioproj_catchall to use _ioproj_fallthrough and delete
+  // _ioproj_catchall so we end up with a call that has only 1 i_o projection.
  if (_ioproj_catchall != NULL ) {
+    if (_ioproj_fallthrough == NULL) {
+      _ioproj_fallthrough = new (C, 1) ProjNode(call, TypeFunc::I_O);
+      transform_later(_ioproj_fallthrough);
+    }
    for (DUIterator_Fast imax, i = _ioproj_catchall->fast_outs(imax); i < imax; i++) {
      Node *use = _ioproj_catchall->fast_out(i);
      _igvn.hash_delete(use);
@ -1422,11 +1496,25 @@ void PhaseMacroExpand::expand_allocate_common(
      // back up iterator
      --i;
    }
+    assert(_ioproj_catchall->outcnt() == 0, "all uses must be deleted");
+    _igvn.remove_dead_node(_ioproj_catchall);
  }

  // if we generated only a slow call, we are done
-  if (always_slow)
+  if (always_slow) {
+    // Now we can unhook i_o.
+    if (result_phi_i_o->outcnt() > 1) {
+      call->set_req(TypeFunc::I_O, top());
+    } else {
+      assert(result_phi_i_o->unique_ctrl_out() == call, "");
+      // Case of new array with negative size known during compilation.
+      // The AllocateArrayNode::Ideal() optimization disconnects the unreachable
+      // following code since the call to the runtime will throw an exception.
+      // As a result there will be no users of i_o after the call.
+      // Leave i_o attached to this call to avoid problems in the preceding graph.
+    }
    return;
+  }


  if (_fallthroughcatchproj != NULL) {
@ -1470,7 +1558,7 @@ PhaseMacroExpand::initialize_object(AllocateNode* alloc,
  Node* mark_node = NULL;
  // For now only enable fast locking for non-array types
  if (UseBiasedLocking && (length == NULL)) {
-    mark_node = make_load(control, rawmem, klass_node, Klass::prototype_header_offset_in_bytes() + sizeof(oopDesc), TypeRawPtr::BOTTOM, T_ADDRESS);
+    mark_node = make_load(control, rawmem, klass_node, in_bytes(Klass::prototype_header_offset()), TypeRawPtr::BOTTOM, T_ADDRESS);
  } else {
    mark_node = makecon(TypeRawPtr::make((address)markOopDesc::prototype()));
  }
@ -1958,7 +2046,7 @@ void PhaseMacroExpand::expand_lock_node(LockNode *lock) {
 #endif
      klass_node->init_req(0, ctrl);
    }
-    Node *proto_node = make_load(ctrl, mem, klass_node, Klass::prototype_header_offset_in_bytes() + sizeof(oopDesc), TypeX_X, TypeX_X->basic_type());
+    Node *proto_node = make_load(ctrl, mem, klass_node, in_bytes(Klass::prototype_header_offset()), TypeX_X, TypeX_X->basic_type());

    Node* thread = transform_later(new (C, 1) ThreadLocalNode());
    Node* cast_thread = transform_later(new (C, 2) CastP2XNode(ctrl, thread));
--- a/hotspot/src/share/vm/opto/matcher.cpp
+++ b/hotspot/src/share/vm/opto/matcher.cpp
@ -1372,8 +1372,11 @@ static bool match_into_reg( const Node *n, Node *m, Node *control, int i, bool s
    return false;
  } else {                      // Not a constant
    // Stop recursion if they have different Controls.
-    // Slot 0 of constants is not really a Control.
-    if( control && m->in(0) && control != m->in(0) ) {
+    Node* m_control = m->in(0);
+    // The control of a load's memory can post-dominate the load's control,
+    // so use it since a load can't float above its memory.
+    Node* mem_control = (m->is_Load()) ? m->in(MemNode::Memory)->in(0) : NULL;
+    if (control && m_control && control != m_control && control != mem_control) {

      // Actually, we can live with the most conservative control we
      // find, if it post-dominates the others.  This allows us to
@ -1386,8 +1389,10 @@ static bool match_into_reg( const Node *n, Node *m, Node *control, int i, bool s
        if (x->is_Region())     // Bail out at merge points
          return true;
        x = x->in(0);
-        if( x == m->in(0) )     // Does 'control' post-dominate
+        if (x == m_control)     // Does 'control' post-dominate
          break;                // m->in(0)?  If so, we can use it
+        if (x == mem_control)   // Does 'control' post-dominate
+          break;                // mem_control?  If so, we can use it
      }
      if (j == max_scan)        // No post-domination before scan end?
        return true;            // Then break the match tree up
--- a/hotspot/src/share/vm/opto/memnode.cpp
+++ b/hotspot/src/share/vm/opto/memnode.cpp
@ -1473,19 +1473,19 @@ Node *LoadNode::Ideal(PhaseGVN *phase, bool can_reshape) {
 const Type*
 LoadNode::load_array_final_field(const TypeKlassPtr *tkls,
                                 ciKlass* klass) const {
-  if (tkls->offset() == Klass::modifier_flags_offset_in_bytes() + (int)sizeof(oopDesc)) {
+  if (tkls->offset() == in_bytes(Klass::modifier_flags_offset())) {
    // The field is Klass::_modifier_flags.  Return its (constant) value.
    // (Folds up the 2nd indirection in aClassConstant.getModifiers().)
    assert(this->Opcode() == Op_LoadI, "must load an int from _modifier_flags");
    return TypeInt::make(klass->modifier_flags());
  }
-  if (tkls->offset() == Klass::access_flags_offset_in_bytes() + (int)sizeof(oopDesc)) {
+  if (tkls->offset() == in_bytes(Klass::access_flags_offset())) {
    // The field is Klass::_access_flags.  Return its (constant) value.
    // (Folds up the 2nd indirection in Reflection.getClassAccessFlags(aClassConstant).)
    assert(this->Opcode() == Op_LoadI, "must load an int from _access_flags");
    return TypeInt::make(klass->access_flags());
  }
-  if (tkls->offset() == Klass::layout_helper_offset_in_bytes() + (int)sizeof(oopDesc)) {
+  if (tkls->offset() == in_bytes(Klass::layout_helper_offset())) {
    // The field is Klass::_layout_helper.  Return its constant value if known.
    assert(this->Opcode() == Op_LoadI, "must load an int from _layout_helper");
    return TypeInt::make(klass->layout_helper());
@ -1636,14 +1636,14 @@ const Type *LoadNode::Value( PhaseTransform *phase ) const {
      // We are loading a field from a Klass metaobject whose identity
      // is known at compile time (the type is "exact" or "precise").
      // Check for fields we know are maintained as constants by the VM.
-      if (tkls->offset() == Klass::super_check_offset_offset_in_bytes() + (int)sizeof(oopDesc)) {
+      if (tkls->offset() == in_bytes(Klass::super_check_offset_offset())) {
        // The field is Klass::_super_check_offset.  Return its (constant) value.
        // (Folds up type checking code.)
        assert(Opcode() == Op_LoadI, "must load an int from _super_check_offset");
        return TypeInt::make(klass->super_check_offset());
      }
      // Compute index into primary_supers array
-      juint depth = (tkls->offset() - (Klass::primary_supers_offset_in_bytes() + (int)sizeof(oopDesc))) / sizeof(klassOop);
+      juint depth = (tkls->offset() - in_bytes(Klass::primary_supers_offset())) / sizeof(klassOop);
      // Check for overflowing; use unsigned compare to handle the negative case.
      if( depth < ciKlass::primary_super_limit() ) {
        // The field is an element of Klass::_primary_supers.  Return its (constant) value.
@ -1654,14 +1654,14 @@ const Type *LoadNode::Value( PhaseTransform *phase ) const {
      }
      const Type* aift = load_array_final_field(tkls, klass);
      if (aift != NULL)  return aift;
-      if (tkls->offset() == in_bytes(arrayKlass::component_mirror_offset()) + (int)sizeof(oopDesc)
+      if (tkls->offset() == in_bytes(arrayKlass::component_mirror_offset())
          && klass->is_array_klass()) {
        // The field is arrayKlass::_component_mirror.  Return its (constant) value.
        // (Folds up aClassConstant.getComponentType, common in Arrays.copyOf.)
        assert(Opcode() == Op_LoadP, "must load an oop from _component_mirror");
        return TypeInstPtr::make(klass->as_array_klass()->component_mirror());
      }
-      if (tkls->offset() == Klass::java_mirror_offset_in_bytes() + (int)sizeof(oopDesc)) {
+      if (tkls->offset() == in_bytes(Klass::java_mirror_offset())) {
        // The field is Klass::_java_mirror.  Return its (constant) value.
        // (Folds up the 2nd indirection in anObjConstant.getClass().)
        assert(Opcode() == Op_LoadP, "must load an oop from _java_mirror");
@ -1679,7 +1679,7 @@ const Type *LoadNode::Value( PhaseTransform *phase ) const {
      if( inner->is_instance_klass() &&
          !inner->as_instance_klass()->flags().is_interface() ) {
        // Compute index into primary_supers array
-        juint depth = (tkls->offset() - (Klass::primary_supers_offset_in_bytes() + (int)sizeof(oopDesc))) / sizeof(klassOop);
+        juint depth = (tkls->offset() - in_bytes(Klass::primary_supers_offset())) / sizeof(klassOop);
        // Check for overflowing; use unsigned compare to handle the negative case.
        if( depth < ciKlass::primary_super_limit() &&
            depth <= klass->super_depth() ) { // allow self-depth checks to handle self-check case
@ -1695,7 +1695,7 @@ const Type *LoadNode::Value( PhaseTransform *phase ) const {
    // If the type is enough to determine that the thing is not an array,
    // we can give the layout_helper a positive interval type.
    // This will help short-circuit some reflective code.
-    if (tkls->offset() == Klass::layout_helper_offset_in_bytes() + (int)sizeof(oopDesc)
+    if (tkls->offset() == in_bytes(Klass::layout_helper_offset())
        && !klass->is_array_klass() // not directly typed as an array
        && !klass->is_interface()  // specifically not Serializable & Cloneable
        && !klass->is_java_lang_Object()   // not the supertype of all T[]
@ -1938,7 +1938,7 @@ const Type *LoadNode::klass_value_common( PhaseTransform *phase ) const {
    if( !klass->is_loaded() )
      return _type;             // Bail out if not loaded
    if( klass->is_obj_array_klass() &&
-        (uint)tkls->offset() == objArrayKlass::element_klass_offset_in_bytes() + sizeof(oopDesc)) {
+        tkls->offset() == in_bytes(objArrayKlass::element_klass_offset())) {
      ciKlass* elem = klass->as_obj_array_klass()->element_klass();
      // // Always returning precise element type is incorrect,
      // // e.g., element type could be object and array may contain strings
@ -1949,7 +1949,7 @@ const Type *LoadNode::klass_value_common( PhaseTransform *phase ) const {
      return TypeKlassPtr::make(tkls->ptr(), elem, 0/*offset*/);
    }
    if( klass->is_instance_klass() && tkls->klass_is_exact() &&
-        (uint)tkls->offset() == Klass::super_offset_in_bytes() + sizeof(oopDesc)) {
+        tkls->offset() == in_bytes(Klass::super_offset())) {
      ciKlass* sup = klass->as_instance_klass()->super();
      // The field is Klass::_super.  Return its (constant) value.
      // (Folds up the 2nd indirection in aClassConstant.getSuperClass().)
@ -2013,11 +2013,11 @@ Node* LoadNode::klass_identity_common(PhaseTransform *phase ) {
              tkls->klass()->is_array_klass())
          && adr2->is_AddP()
          ) {
-        int mirror_field = Klass::java_mirror_offset_in_bytes();
+        int mirror_field = in_bytes(Klass::java_mirror_offset());
        if (offset == java_lang_Class::array_klass_offset_in_bytes()) {
          mirror_field = in_bytes(arrayKlass::component_mirror_offset());
        }
-        if (tkls->offset() == mirror_field + (int)sizeof(oopDesc)) {
+        if (tkls->offset() == mirror_field) {
          return adr2->in(AddPNode::Base);
        }
      }
@ -2721,6 +2721,7 @@ MemBarNode* MemBarNode::make(Compile* C, int opcode, int atp, Node* pn) {
  case Op_MemBarVolatile:  return new(C, len) MemBarVolatileNode(C, atp, pn);
  case Op_MemBarCPUOrder:  return new(C, len) MemBarCPUOrderNode(C, atp, pn);
  case Op_Initialize:      return new(C, len) InitializeNode(C,     atp, pn);
+  case Op_MemBarStoreStore: return new(C, len) MemBarStoreStoreNode(C,  atp, pn);
  default:                 ShouldNotReachHere(); return NULL;
  }
 }
@ -2870,7 +2871,7 @@ Node *MemBarNode::match( const ProjNode *proj, const Matcher *m ) {

 //---------------------------InitializeNode------------------------------------
 InitializeNode::InitializeNode(Compile* C, int adr_type, Node* rawoop)
-  : _is_complete(Incomplete),
+  : _is_complete(Incomplete), _does_not_escape(false),
    MemBarNode(C, adr_type, rawoop)
 {
  init_class_id(Class_Initialize);
--- a/hotspot/src/share/vm/opto/memnode.hpp
+++ b/hotspot/src/share/vm/opto/memnode.hpp
@ -918,6 +918,15 @@ public:
  virtual int Opcode() const;
 };

+class MemBarStoreStoreNode: public MemBarNode {
+public:
+  MemBarStoreStoreNode(Compile* C, int alias_idx, Node* precedent)
+    : MemBarNode(C, alias_idx, precedent) {
+    init_class_id(Class_MemBarStoreStore);
+  }
+  virtual int Opcode() const;
+};
+
 // Ordering between a volatile store and a following volatile load.
 // Requires multi-CPU visibility?
 class MemBarVolatileNode: public MemBarNode {
@ -950,6 +959,8 @@ class InitializeNode: public MemBarNode {
  };
  int _is_complete;

+  bool _does_not_escape;
+
 public:
  enum {
    Control    = TypeFunc::Control,
@ -989,6 +1000,9 @@ public:
  void set_complete(PhaseGVN* phase);
  void set_complete_with_arraycopy() { _is_complete = Complete | WithArraycopy; }

+  bool does_not_escape() { return _does_not_escape; }
+  void set_does_not_escape() { _does_not_escape = true; }
+
 #ifdef ASSERT
  // ensure all non-degenerate stores are ordered and non-overlapping
  bool stores_are_sane(PhaseTransform* phase);
--- a/Show more
+++ b/Show more