8149421: Vectorized Post Loops

Add vectorised post loop for counted loops with vectors.

Reviewed-by: kvn
Michael Berg 2016-02-12 16:12:15 -08:00
parent 9d5f03332e
commit 3ade110066
10 changed files with 213 additions and 50 deletions
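
In loop-shape terms: SuperWord now marks the loops it vectorizes, and before such a main loop is unrolled any further, a copy of its single-vector-width ("atomic") form is inserted as an additional post loop. Leftover iterations then drain through vector code one vector at a time, and only the last few run the scalar post loop. A minimal sketch of the resulting shape (illustrative C++ only; the function, vector width 4, and unroll factor 4 are assumptions — HotSpot performs this on its ideal graph, not on source code):

void saxpy(float *a, const float *b, float s, int n) {
  int i = 0;
  // pre-loop: scalar iterations until alignment is reached (elided), then
  // the main loop: vectorized and further unrolled, e.g. 4 vectors x 4 lanes
  for (; i + 16 <= n; i += 16) {
    for (int j = 0; j < 16; j++) a[i + j] += s * b[i + j];  // emitted as vector ops
  }
  // vectorized post ("drain") loop: one vector width per trip, entered only
  // when a zero-trip guard proves at least one full vector remains
  for (; i + 4 <= n; i += 4) {
    for (int j = 0; j < 4; j++) a[i + j] += s * b[i + j];   // one vector op
  }
  // scalar post loop: fewer than one vector width of iterations left
  for (; i < n; i++) a[i] += s * b[i];
}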

View file

@@ -54,6 +54,7 @@ define_pd_global(intx, INTPRESSURE, 25);
 define_pd_global(intx, InteriorEntryAlignment, 16);
 define_pd_global(intx, NewSizeThreadIncrease, ScaleForWordSize(4*K));
 define_pd_global(intx, LoopUnrollLimit, 60);
+define_pd_global(intx, LoopPercentProfileLimit, 10);
 // InitialCodeCacheSize derived from specjbb2000 run.
 define_pd_global(intx, InitialCodeCacheSize, 2496*K); // Integral multiple of CodeCacheExpansionSize
 define_pd_global(intx, CodeCacheExpansionSize, 64*K);

View file

@@ -54,6 +54,7 @@ define_pd_global(intx, RegisterCostAreaRatio, 16000);
 define_pd_global(bool, UseTLAB, true);
 define_pd_global(bool, ResizeTLAB, true);
 define_pd_global(intx, LoopUnrollLimit, 60);
+define_pd_global(intx, LoopPercentProfileLimit, 10);
 // Peephole and CISC spilling both break the graph, and so make the
 // scheduler sick.

View file

@@ -52,6 +52,7 @@ define_pd_global(intx, RegisterCostAreaRatio, 12000);
 define_pd_global(bool, UseTLAB, true);
 define_pd_global(bool, ResizeTLAB, true);
 define_pd_global(intx, LoopUnrollLimit, 60); // Design center runs on 1.3.1
+define_pd_global(intx, LoopPercentProfileLimit, 10);
 define_pd_global(intx, MinJumpTableSize, 5);
 // Peephole and CISC spilling both break the graph, and so makes the

View file

@@ -2361,7 +2361,7 @@ void Assembler::movdqa(XMMRegister dst, Address src) {
 void Assembler::movdqu(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   simd_prefix(dst, xnoreg, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x6F);
@@ -2398,7 +2398,7 @@ void Assembler::vmovdqu(XMMRegister dst, XMMRegister src) {
 void Assembler::vmovdqu(XMMRegister dst, Address src) {
   assert(UseAVX > 0, "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  InstructionAttr attributes(AVX_256bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   vex_prefix(src, 0, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x6F);
@@ -2486,7 +2486,7 @@ void Assembler::evmovdqul(XMMRegister dst, XMMRegister src, int vector_len) {
 void Assembler::evmovdqul(XMMRegister dst, Address src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false , /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   vex_prefix(src, 0, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x6F);
@@ -2515,7 +2515,7 @@ void Assembler::evmovdquq(XMMRegister dst, XMMRegister src, int vector_len) {
 void Assembler::evmovdquq(XMMRegister dst, Address src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
   vex_prefix(src, 0, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x6F);
@@ -2640,7 +2640,7 @@ void Assembler::movsbl(Register dst, Register src) { // movsxb
 void Assembler::movsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x10);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -2649,7 +2649,7 @@ void Assembler::movsd(XMMRegister dst, XMMRegister src) {
 void Assembler::movsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   simd_prefix(dst, xnoreg, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x10);
@@ -2668,7 +2668,7 @@ void Assembler::movsd(Address dst, XMMRegister src) {
 void Assembler::movss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x10);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -2677,7 +2677,7 @@ void Assembler::movss(XMMRegister dst, XMMRegister src) {
 void Assembler::movss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   simd_prefix(dst, xnoreg, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x10);
@@ -2782,7 +2782,7 @@ void Assembler::mull(Register src) {
 void Assembler::mulsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   simd_prefix(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
@@ -2791,7 +2791,7 @@ void Assembler::mulsd(XMMRegister dst, Address src) {
 void Assembler::mulsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -2800,7 +2800,7 @@ void Assembler::mulsd(XMMRegister dst, XMMRegister src) {
 void Assembler::mulss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   simd_prefix(dst, dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
@@ -2809,7 +2809,7 @@ void Assembler::mulss(XMMRegister dst, Address src) {
 void Assembler::mulss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -3993,7 +3993,7 @@ void Assembler::smovl() {
 void Assembler::sqrtsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x51);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4002,7 +4002,7 @@ void Assembler::sqrtsd(XMMRegister dst, XMMRegister src) {
 void Assembler::sqrtsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   simd_prefix(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x51);
@@ -4011,7 +4011,7 @@ void Assembler::sqrtsd(XMMRegister dst, Address src) {
 void Assembler::sqrtss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x51);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4024,7 +4024,7 @@ void Assembler::std() {
 void Assembler::sqrtss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   simd_prefix(dst, dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x51);
@@ -4078,7 +4078,7 @@ void Assembler::subl(Register dst, Register src) {
 void Assembler::subsd(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4087,7 +4087,7 @@ void Assembler::subsd(XMMRegister dst, XMMRegister src) {
 void Assembler::subsd(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse2(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   simd_prefix(dst, dst, src, VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
@@ -4096,7 +4096,7 @@ void Assembler::subsd(XMMRegister dst, Address src) {
 void Assembler::subss(XMMRegister dst, XMMRegister src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false , /* uses_vl */ false);
   int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -4105,7 +4105,7 @@ void Assembler::subss(XMMRegister dst, XMMRegister src) {
 void Assembler::subss(XMMRegister dst, Address src) {
   NOT_LP64(assert(VM_Version::supports_sse(), ""));
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   simd_prefix(dst, dst, src, VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
@@ -4293,7 +4293,7 @@ void Assembler::xorb(Register dst, Address src) {
 void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
@@ -4303,7 +4303,7 @@ void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, Address src) {
 void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
@@ -4313,7 +4313,7 @@ void Assembler::vaddsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
 void Assembler::vaddss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
@@ -4323,7 +4323,7 @@ void Assembler::vaddss(XMMRegister dst, XMMRegister nds, Address src) {
 void Assembler::vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x58);
@@ -4333,7 +4333,7 @@ void Assembler::vaddss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
 void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
@@ -4343,7 +4343,7 @@ void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, Address src) {
 void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
@@ -4353,7 +4353,7 @@ void Assembler::vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
 void Assembler::vdivss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
@@ -4363,7 +4363,7 @@ void Assembler::vdivss(XMMRegister dst, XMMRegister nds, Address src) {
 void Assembler::vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5E);
@@ -4373,7 +4373,7 @@ void Assembler::vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
 void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
@@ -4383,7 +4383,7 @@ void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) {
 void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
@@ -4393,7 +4393,7 @@ void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
 void Assembler::vmulss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
@@ -4403,7 +4403,7 @@ void Assembler::vmulss(XMMRegister dst, XMMRegister nds, Address src) {
 void Assembler::vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x59);
@@ -4413,7 +4413,7 @@ void Assembler::vmulss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
 void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_64bit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
@@ -4423,7 +4423,7 @@ void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, Address src) {
 void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ VM_Version::supports_evex(), /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
@@ -4433,7 +4433,7 @@ void Assembler::vsubsd(XMMRegister dst, XMMRegister nds, XMMRegister src) {
 void Assembler::vsubss(XMMRegister dst, XMMRegister nds, Address src) {
   assert(VM_Version::supports_avx(), "");
   InstructionMark im(this);
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_32bit);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   vex_prefix(src, nds_enc, dst->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
@@ -4443,7 +4443,7 @@ void Assembler::vsubss(XMMRegister dst, XMMRegister nds, Address src) {
 void Assembler::vsubss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
   assert(VM_Version::supports_avx(), "");
-  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ false);
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
   int nds_enc = nds->is_valid() ? nds->encoding() : 0;
   int encode = vex_prefix_and_encode(dst->encoding(), nds_enc, src->encoding(), VEX_SIMD_F3, VEX_OPCODE_0F, &attributes);
   emit_int8(0x5C);
@@ -5901,7 +5901,7 @@ void Assembler::vpbroadcastw(XMMRegister dst, XMMRegister src) {
 // duplicate 1-byte integer data from src into 16||32|64 locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastb(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x78);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5911,7 +5911,7 @@ void Assembler::evpbroadcastb(XMMRegister dst, Address src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   assert(dst != xnoreg, "sanity");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_8bit);
   // swap src<->dst for encoding
   vex_prefix(src, dst->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
@@ -5922,7 +5922,7 @@ void Assembler::evpbroadcastb(XMMRegister dst, Address src, int vector_len) {
 // duplicate 2-byte integer data from src into 8|16||32 locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x79);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -5932,7 +5932,7 @@ void Assembler::evpbroadcastw(XMMRegister dst, Address src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
   assert(dst != xnoreg, "sanity");
   InstructionMark im(this);
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   attributes.set_address_attributes(/* tuple_type */ EVEX_T1S, /* input_size_in_bits */ EVEX_16bit);
   // swap src<->dst for encoding
   vex_prefix(src, dst->encoding(), dst->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
@@ -6027,7 +6027,7 @@ void Assembler::evpbroadcastsd(XMMRegister dst, Address src, int vector_len) {
 // duplicate 1-byte integer data from src into 16||32|64 locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastb(XMMRegister dst, Register src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x7A);
   emit_int8((unsigned char)(0xC0 | encode));
@@ -6036,7 +6036,7 @@ void Assembler::evpbroadcastb(XMMRegister dst, Register src, int vector_len) {
 // duplicate 2-byte integer data from src into 8|16||32 locations in dest : requires AVX512BW and AVX512VL
 void Assembler::evpbroadcastw(XMMRegister dst, Register src, int vector_len) {
   assert(VM_Version::supports_evex(), "");
-  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
   int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
   emit_int8(0x7B);
   emit_int8((unsigned char)(0xC0 | encode));
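
Taken together, the assembler changes above flip no_mask_reg from true to false for the scalar SSE/AVX move and arithmetic forms — per the parameter comments added to InstructionAttr below, false selects mask register k1 rather than k0 when an EVEX encoding is chosen — while the evpbroadcast forms switch legacy_mode to _legacy_mode_bw, presumably so they fall back to a pre-EVEX encoding when AVX512BW is unavailable (their own comments note they require AVX512BW and AVX512VL).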

View file

@@ -2044,11 +2044,11 @@ private:
 class InstructionAttr {
 public:
   InstructionAttr(
-    int vector_len,
-    bool rex_vex_w,
-    bool legacy_mode,
-    bool no_reg_mask,
-    bool uses_vl)
+    int vector_len,   // The length of vector to be applied in encoding - for both AVX and EVEX
+    bool rex_vex_w,   // Width of data: if 32-bits or less, false, else if 64-bit or specially defined, true
+    bool legacy_mode, // Details if either this instruction is conditionally encoded to AVX or earlier if true else possibly EVEX
+    bool no_reg_mask, // when true, k0 is used when EVEX encoding is chosen, else k1 is used under the same condition
+    bool uses_vl)     // This instruction may have legacy constraints based on vector length for EVEX
   :
     _avx_vector_len(vector_len),
     _rex_vex_w(rex_vex_w),

View file

@@ -46,6 +46,7 @@ define_pd_global(intx, OnStackReplacePercentage, 140);
 define_pd_global(intx, ConditionalMoveLimit, 3);
 define_pd_global(intx, FreqInlineSize, 325);
 define_pd_global(intx, MinJumpTableSize, 10);
+define_pd_global(intx, LoopPercentProfileLimit, 30);
 #ifdef AMD64
 define_pd_global(intx, INTPRESSURE, 13);
 define_pd_global(intx, FLOATPRESSURE, 14);

View file

@@ -182,6 +182,10 @@
           "Unroll loop bodies with node count less than this")             \
           range(0, max_jint / 4)                                           \
                                                                            \
+  product_pd(intx, LoopPercentProfileLimit,                                \
+          "Unroll loop bodies with % node count of profile limit")         \
+          range(10, 100)                                                   \
+                                                                           \
   product(intx, LoopMaxUnroll, 16,                                         \
           "Maximum number of unrolls for main loop")                       \
           range(0, max_jint)                                               \

View file

@@ -666,7 +666,8 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
     if (future_unroll_ct > LoopMaxUnroll) return false;
   } else {
     // obey user constraints on vector mapped loops with additional unrolling applied
-    if ((future_unroll_ct / cl->slp_max_unroll()) > LoopMaxUnroll) return false;
+    int unroll_constraint = (cl->slp_max_unroll()) ? cl->slp_max_unroll() : 1;
+    if ((future_unroll_ct / unroll_constraint) > LoopMaxUnroll) return false;
   }

   // Check for initial stride being a small enough constant
@@ -689,7 +690,7 @@ bool IdealLoopTree::policy_unroll(PhaseIdealLoop *phase) {
   // Progress defined as current size less than 20% larger than previous size.
   if (UseSuperWord && cl->node_count_before_unroll() > 0 &&
       future_unroll_ct > LoopUnrollMin &&
-      (future_unroll_ct - 1) * 10.0 > cl->profile_trip_cnt() &&
+      (future_unroll_ct - 1) * (100 / LoopPercentProfileLimit) > cl->profile_trip_cnt() &&
       1.2 * cl->node_count_before_unroll() < (double)_body.size()) {
     return false;
   }
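
The hard-coded factor of 10.0 above becomes 100 / LoopPercentProfileLimit, so each platform tunes how far unrolling may run ahead of the profiled trip count: the platform files at the top of this commit set 10 (reproducing the old behavior), while the x86 file (the one with #ifdef AMD64) sets 30. A standalone sketch of the arithmetic — the trip count and doubling schedule are assumptions, and note that 100 / limit is integer division, so 30 yields a factor of 3:

// Not HotSpot code: shows when the profile check first vetoes more unrolling.
#include <cstdio>
#include <initializer_list>

int main() {
  const double profile_trip_cnt = 50.0;   // assumed profiled trip count
  for (int limit : {10, 30}) {            // defaults added by this commit
    int factor = 100 / limit;             // integer division: 10 -> 10, 30 -> 3
    int future_unroll_ct = 2;
    // unrolling is vetoed once (future_unroll_ct - 1) * factor exceeds
    // the profiled trip count; find where that first happens
    while ((future_unroll_ct - 1) * factor <= profile_trip_cnt)
      future_unroll_ct *= 2;              // unroll count doubles each round
    std::printf("limit=%d: first veto at unroll=%d\n", limit, future_unroll_ct);
  }
  return 0;
}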
@@ -1260,6 +1261,146 @@ void PhaseIdealLoop::insert_pre_post_loops( IdealLoopTree *loop, Node_List &old_
   loop->record_for_igvn();
 }

+//------------------------------insert_vector_post_loop------------------------
+// Insert a copy of the atomic unrolled vectorized main loop as a post loop,
+// unroll_policy has already informed us that more unrolling is about to happen to
+// the main loop. The resultant post loop will serve as a vectorized drain loop.
+void PhaseIdealLoop::insert_vector_post_loop(IdealLoopTree *loop, Node_List &old_new) {
+  if (!loop->_head->is_CountedLoop()) return;
+
+  CountedLoopNode *cl = loop->_head->as_CountedLoop();
+
+  // only process vectorized main loops
+  if (!cl->is_vectorized_loop() || !cl->is_main_loop()) return;
+
+  int slp_max_unroll_factor = cl->slp_max_unroll();
+  int cur_unroll = cl->unrolled_count();
+
+  if (slp_max_unroll_factor == 0) return;
+
+  // only process atomic unroll vector loops (not super unrolled after vectorization)
+  if (cur_unroll != slp_max_unroll_factor) return;
+
+  // we only ever process this one time
+  if (cl->has_atomic_post_loop()) return;
+
+#ifndef PRODUCT
+  if (TraceLoopOpts) {
+    tty->print("PostVector ");
+    loop->dump_head();
+  }
+#endif
+  C->set_major_progress();
+
+  // Find common pieces of the loop being guarded with pre & post loops
+  CountedLoopNode *main_head = loop->_head->as_CountedLoop();
+  CountedLoopEndNode *main_end = main_head->loopexit();
+  guarantee(main_end != NULL, "no loop exit node");
+  // diagnostic to show loop end is not properly formed
+  assert(main_end->outcnt() == 2, "1 true, 1 false path only");
+  uint dd_main_head = dom_depth(main_head);
+  uint max = main_head->outcnt();
+
+  // mark this loop as processed
+  main_head->mark_has_atomic_post_loop();
+
+  Node *pre_header = main_head->in(LoopNode::EntryControl);
+  Node *init = main_head->init_trip();
+  Node *incr = main_end->incr();
+  Node *limit = main_end->limit();
+  Node *stride = main_end->stride();
+  Node *cmp = main_end->cmp_node();
+  BoolTest::mask b_test = main_end->test_trip();
+
+  //------------------------------
+  // Step A: Create a new post-Loop.
+  Node* main_exit = main_end->proj_out(false);
+  assert(main_exit->Opcode() == Op_IfFalse, "");
+  int dd_main_exit = dom_depth(main_exit);
+
+  // Step A1: Clone the loop body of main. The clone becomes the vector post-loop.
+  // The main loop pre-header illegally has 2 control users (old & new loops).
+  clone_loop(loop, old_new, dd_main_exit);
+  assert(old_new[main_end->_idx]->Opcode() == Op_CountedLoopEnd, "");
+  CountedLoopNode *post_head = old_new[main_head->_idx]->as_CountedLoop();
+  post_head->set_normal_loop();
+  post_head->set_post_loop(main_head);
+
+  // Reduce the post-loop trip count.
+  CountedLoopEndNode* post_end = old_new[main_end->_idx]->as_CountedLoopEnd();
+  post_end->_prob = PROB_FAIR;
+
+  // Build the main-loop normal exit.
+  IfFalseNode *new_main_exit = new IfFalseNode(main_end);
+  _igvn.register_new_node_with_optimizer(new_main_exit);
+  set_idom(new_main_exit, main_end, dd_main_exit);
+  set_loop(new_main_exit, loop->_parent);
+
+  // Step A2: Build a zero-trip guard for the vector post-loop. After leaving the
+  // main-loop, the vector post-loop may not execute at all. We 'opaque' the incr
+  // (the vectorized main-loop trip-counter exit value) because we will be changing
+  // the exit value (via additional unrolling) so we cannot constant-fold away the zero
+  // trip guard until all unrolling is done.
+  Node *zer_opaq = new Opaque1Node(C, incr);
+  Node *zer_cmp = new CmpINode(zer_opaq, limit);
+  Node *zer_bol = new BoolNode(zer_cmp, b_test);
+  register_new_node(zer_opaq, new_main_exit);
+  register_new_node(zer_cmp, new_main_exit);
+  register_new_node(zer_bol, new_main_exit);
+
+  // Build the IfNode
+  IfNode *zer_iff = new IfNode(new_main_exit, zer_bol, PROB_FAIR, COUNT_UNKNOWN);
+  _igvn.register_new_node_with_optimizer(zer_iff);
+  set_idom(zer_iff, new_main_exit, dd_main_exit);
+  set_loop(zer_iff, loop->_parent);
+
+  // Plug in the false-path, taken if we need to skip vector post-loop
+  _igvn.replace_input_of(main_exit, 0, zer_iff);
+  set_idom(main_exit, zer_iff, dd_main_exit);
+  set_idom(main_exit->unique_out(), zer_iff, dd_main_exit);
+
+  // Make the true-path, must enter the vector post loop
+  Node *zer_taken = new IfTrueNode(zer_iff);
+  _igvn.register_new_node_with_optimizer(zer_taken);
+  set_idom(zer_taken, zer_iff, dd_main_exit);
+  set_loop(zer_taken, loop->_parent);
+
+  // Plug in the true path
+  _igvn.hash_delete(post_head);
+  post_head->set_req(LoopNode::EntryControl, zer_taken);
+  set_idom(post_head, zer_taken, dd_main_exit);
+
+  Arena *a = Thread::current()->resource_area();
+  VectorSet visited(a);
+  Node_Stack clones(a, main_head->back_control()->outcnt());
+
+  // Step A3: Make the fall-in values to the vector post-loop come from the
+  // fall-out values of the main-loop.
+  for (DUIterator_Fast imax, i = main_head->fast_outs(imax); i < imax; i++) {
+    Node* main_phi = main_head->fast_out(i);
+    if (main_phi->is_Phi() && main_phi->in(0) == main_head && main_phi->outcnt() > 0) {
+      Node *cur_phi = old_new[main_phi->_idx];
+      Node *fallnew = clone_up_backedge_goo(main_head->back_control(),
+                                            post_head->init_control(),
+                                            main_phi->in(LoopNode::LoopBackControl),
+                                            visited, clones);
+      _igvn.hash_delete(cur_phi);
+      cur_phi->set_req(LoopNode::EntryControl, fallnew);
+    }
+  }
+
+  // CastII for the new post loop:
+  bool inserted = cast_incr_before_loop(zer_opaq->in(1), zer_taken, post_head);
+  assert(inserted, "no castII inserted");
+
+  // It's difficult to be precise about the trip-counts
+  // for post loops. They are usually very short,
+  // so guess that unit vector trips is a reasonable value.
+  post_head->set_profile_trip_cnt((float)slp_max_unroll_factor);
+
+  // Now force out all loop-invariant dominating tests. The optimizer
+  // finds some, but we _know_ they are all useless.
+  peeled_dom_test_elim(loop, old_new);
+  loop->record_for_igvn();
+}
+
 //------------------------------is_invariant-----------------------------
 // Return true if n is invariant
 bool IdealLoopTree::is_invariant(Node* n) const {
@@ -2608,6 +2749,9 @@ bool IdealLoopTree::iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_
   // and we'd rather unroll the post-RCE'd loop SO... do not unroll if
   // peeling.
   if (should_unroll && !should_peel) {
+    if (SuperWordLoopUnrollAnalysis) {
+      phase->insert_vector_post_loop(this, old_new);
+    }
     phase->do_unroll(this, old_new, true);
   }
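
In control-flow terms, the guard wiring in insert_vector_post_loop above produces roughly the following shape (a sketch using the names from the code; the Opaque1 node hides incr from the optimizer so the guard cannot be constant-folded while later unrolling is still changing the main loop's exit value):

//  main loop (vectorized) --> new_main_exit (IfFalse of main_end)
//                                   |
//    zer_iff: If(Bool(CmpI(Opaque1(incr), limit), b_test))
//      true (zer_taken)                     false
//           |                                 |
//  post_head: single-width vector     main_exit: skip the drain loop,
//  "drain" loop                       on to the existing scalar post loop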

View file

@@ -67,7 +67,9 @@ protected:
          HasReductions=128,
          WasSlpAnalyzed=256,
          PassedSlpAnalysis=512,
-         DoUnrollOnly=1024 };
+         DoUnrollOnly=1024,
+         VectorizedLoop=2048,
+         HasAtomicPostLoop=4096 };
   char _unswitch_count;
   enum { _unswitch_max=3 };
@@ -86,6 +88,8 @@ public:
   void mark_was_slp() { _loop_flags |= WasSlpAnalyzed; }
   void mark_passed_slp() { _loop_flags |= PassedSlpAnalysis; }
   void mark_do_unroll_only() { _loop_flags |= DoUnrollOnly; }
+  void mark_loop_vectorized() { _loop_flags |= VectorizedLoop; }
+  void mark_has_atomic_post_loop() { _loop_flags |= HasAtomicPostLoop; }

   int unswitch_max() { return _unswitch_max; }
   int unswitch_count() { return _unswitch_count; }
@@ -221,6 +225,8 @@ public:
   int has_passed_slp () const { return (_loop_flags&PassedSlpAnalysis) == PassedSlpAnalysis; }
   int do_unroll_only () const { return (_loop_flags&DoUnrollOnly) == DoUnrollOnly; }
   int is_main_no_pre_loop() const { return _loop_flags & MainHasNoPreLoop; }
+  int is_vectorized_loop () const { return (_loop_flags & VectorizedLoop) == VectorizedLoop; }
+  int has_atomic_post_loop () const { return (_loop_flags & HasAtomicPostLoop) == HasAtomicPostLoop; }
   void set_main_no_pre_loop() { _loop_flags |= MainHasNoPreLoop; }
   int main_idx() const { return _main_idx; }
@@ -893,6 +899,8 @@ public:
   // Add pre and post loops around the given loop. These loops are used
   // during RCE, unrolling and aligning loops.
   void insert_pre_post_loops( IdealLoopTree *loop, Node_List &old_new, bool peel_only );
+  // Add a vector post loop between a vector main loop and the current post loop
+  void insert_vector_post_loop(IdealLoopTree *loop, Node_List &old_new);
   // If Node n lives in the back_ctrl block, we clone a private version of n
   // in preheader_ctrl block and return that, otherwise return n.
   Node *clone_up_backedge_goo( Node *back_ctrl, Node *preheader_ctrl, Node *n, VectorSet &visited, Node_Stack &clones );

View file

@@ -2253,6 +2253,9 @@ void SuperWord::output() {
         C->set_major_progress();
       }
       cl->mark_do_unroll_only();
+      if (do_reserve_copy()) {
+        cl->mark_loop_vectorized();
+      }
     }
   }
 }