8074981: Integer/FP scalar reduction optimization

Add scalar reduction optimization to C2 to take advantage of vector instructions in modern x86 CPUs. Reviewed-by: kvn, twisti
2025-08-28 15:24:43 +02:00 · 2015-04-01 18:07:50 -07:00 · 2015-04-01 18:07:50 -07:00 · 9e55e44c85
commit 9e55e44c85
parent 7c5d30b0e3
22 changed files with 1599 additions and 20 deletions
--- a/hotspot/make/build.sh
+++ b/hotspot/make/build.sh
@ -40,7 +40,7 @@ if [ $# -lt 1 ]; then
    exit 1
 fi

-if [ "${JAVA_HOME-}" = ""  -o  ! -d "${JAVA_HOME-}" -o ! -d ${JAVA_HOME-}/jre/lib/ ]; then
+if [ "${JAVA_HOME-}" = ""  -o  ! -d "${JAVA_HOME-}" ]; then
    echo "JAVA_HOME needs to be set to a valid JDK path"
    echo "JAVA_HOME: ${JAVA_HOME-}"
    exit 1
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp
@ -3359,6 +3359,20 @@ void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vecto


 // Integer vector arithmetic
+void Assembler::vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38);
+  emit_int8(0x01);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
+  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
+  int encode = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38);
+  emit_int8(0x02);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::paddb(XMMRegister dst, XMMRegister src) {
  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
  emit_simd_arith(0xFC, dst, src, VEX_SIMD_66);
@ -3379,6 +3393,20 @@ void Assembler::paddq(XMMRegister dst, XMMRegister src) {
  emit_simd_arith(0xD4, dst, src, VEX_SIMD_66);
 }

+void Assembler::phaddw(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse3(), ""));
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_int8(0x01);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::phaddd(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse3(), ""));
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
+  emit_int8(0x02);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
  emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector256);
@ -3804,6 +3832,17 @@ void Assembler::vinsertf128h(XMMRegister dst, Address src) {
  emit_int8(0x01);
 }

+void Assembler::vextractf128h(XMMRegister dst, XMMRegister src) {
+  assert(VM_Version::supports_avx(), "");
+  bool vector256 = true;
+  int encode = vex_prefix_and_encode(src, xnoreg, dst, VEX_SIMD_66, vector256, VEX_OPCODE_0F_3A);
+  emit_int8(0x19);
+  emit_int8((unsigned char)(0xC0 | encode));
+  // 0x00 - insert into lower 128 bits
+  // 0x01 - insert into upper 128 bits
+  emit_int8(0x01);
+}
+
 void Assembler::vextractf128h(Address dst, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp
@ -1777,6 +1777,12 @@ private:
  void vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
  void vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);

+  // Add horizontal packed integers
+  void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
+  void phaddw(XMMRegister dst, XMMRegister src);
+  void phaddd(XMMRegister dst, XMMRegister src);
+
  // Add packed integers
  void paddb(XMMRegister dst, XMMRegister src);
  void paddw(XMMRegister dst, XMMRegister src);
@ -1869,6 +1875,7 @@ private:
  // Copy low 128bit into high 128bit of YMM registers.
  void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
  void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vextractf128h(XMMRegister dst, XMMRegister src);

  // Load/store high 128bit of YMM registers which does not destroy other half.
  void vinsertf128h(XMMRegister dst, Address src);
--- a/hotspot/src/cpu/x86/vm/x86.ad
+++ b/hotspot/src/cpu/x86/vm/x86.ad
@ -623,6 +623,22 @@ const bool Matcher::match_rule_supported(int opcode) {
      if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
        return false;
    break;
+    case Op_AddReductionVL:
+      if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here
+        return false;
+    case Op_AddReductionVI:
+      if (UseSSE < 3) // requires at least SSE3
+        return false;
+    case Op_MulReductionVI:
+      if (UseSSE < 4) // requires at least SSE4
+        return false;
+    case Op_AddReductionVF:
+    case Op_AddReductionVD:
+    case Op_MulReductionVF:
+    case Op_MulReductionVD:
+      if (UseSSE < 1) // requires at least SSE
+        return false;
+    break;
    case Op_CompareAndSwapL:
 #ifdef _LP64
    case Op_CompareAndSwapP:
@ -2532,6 +2548,574 @@ instruct Repl4D_zero(vecY dst, immD0 zero) %{
  ins_pipe( fpu_reg_reg );
 %}

+// ====================REDUCTION ARITHMETIC=======================================
+
+instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
+  predicate(UseSSE > 2 && UseAVX == 0);
+  match(Set dst (AddReductionVI src1 src2));
+  effect(TEMP tmp2, TEMP tmp);
+  format %{ "movdqu  $tmp2,$src2\n\t"
+            "phaddd  $tmp2,$tmp2\n\t"
+            "movd    $tmp,$src1\n\t"
+            "paddd   $tmp,$tmp2\n\t"
+            "movd    $dst,$tmp\t! add reduction2I" %}
+  ins_encode %{
+    __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
+    __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
+    __ movdl($tmp$$XMMRegister, $src1$$Register);
+    __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
+    __ movdl($dst$$Register, $tmp$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AddReductionVI src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "vphaddd $tmp,$src2,$src2\n\t"
+            "movd    $tmp2,$src1\n\t"
+            "vpaddd  $tmp2,$tmp2,$tmp\n\t"
+            "movd    $dst,$tmp2\t! add reduction2I" %}
+  ins_encode %{
+    __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, false);
+    __ movdl($tmp2$$XMMRegister, $src1$$Register);
+    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, false);
+    __ movdl($dst$$Register, $tmp2$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
+  predicate(UseSSE > 2 && UseAVX == 0);
+  match(Set dst (AddReductionVI src1 src2));
+  effect(TEMP tmp2, TEMP tmp);
+  format %{ "movdqu  $tmp2,$src2\n\t"
+            "phaddd  $tmp2,$tmp2\n\t"
+            "phaddd  $tmp2,$tmp2\n\t"
+            "movd    $tmp,$src1\n\t"
+            "paddd   $tmp,$tmp2\n\t"
+            "movd    $dst,$tmp\t! add reduction4I" %}
+  ins_encode %{
+    __ movdqu($tmp2$$XMMRegister, $src2$$XMMRegister);
+    __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
+    __ phaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister);
+    __ movdl($tmp$$XMMRegister, $src1$$Register);
+    __ paddd($tmp$$XMMRegister, $tmp2$$XMMRegister);
+    __ movdl($dst$$Register, $tmp$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AddReductionVI src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "vphaddd $tmp,$src2,$src2\n\t"
+            "vphaddd $tmp,$tmp,$tmp2\n\t"
+            "movd    $tmp2,$src1\n\t"
+            "vpaddd  $tmp2,$tmp2,$tmp\n\t"
+            "movd    $dst,$tmp2\t! add reduction4I" %}
+  ins_encode %{
+    __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, false);
+    __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
+    __ movdl($tmp2$$XMMRegister, $src1$$Register);
+    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, false);
+    __ movdl($dst$$Register, $tmp2$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AddReductionVI src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "vphaddd $tmp,$src2,$src2\n\t"
+            "vphaddd $tmp,$tmp,$tmp2\n\t"
+            "vextractf128  $tmp2,$tmp\n\t"
+            "vpaddd  $tmp,$tmp,$tmp2\n\t"
+            "movd    $tmp2,$src1\n\t"
+            "vpaddd  $tmp2,$tmp2,$tmp\n\t"
+            "movd    $dst,$tmp2\t! add reduction8I" %}
+  ins_encode %{
+    __ vphaddd($tmp$$XMMRegister, $src2$$XMMRegister, $src2$$XMMRegister, true);
+    __ vphaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, true);
+    __ vextractf128h($tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ vpaddd($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
+    __ movdl($tmp2$$XMMRegister, $src1$$Register);
+    __ vpaddd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister, false);
+    __ movdl($dst$$Register, $tmp2$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rsadd2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
+  predicate(UseSSE >= 1 && UseAVX == 0);
+  match(Set dst (AddReductionVF src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "movdqu  $tmp,$src1\n\t"
+            "addss   $tmp,$src2\n\t"
+            "pshufd  $tmp2,$src2,0x01\n\t"
+            "addss   $tmp,$tmp2\n\t"
+            "movdqu  $dst,$tmp\t! add reduction2F" %}
+  ins_encode %{
+    __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
+    __ addss($tmp$$XMMRegister, $src2$$XMMRegister);
+    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);
+    __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
+    __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rvadd2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AddReductionVF src1 src2));
+  effect(TEMP tmp2, TEMP tmp);
+  format %{ "vaddss  $tmp2,$src1,$src2\n\t"
+            "pshufd  $tmp,$src2,0x01\n\t"
+            "vaddss  $dst,$tmp2,$tmp\t! add reduction2F" %}
+  ins_encode %{
+    __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
+    __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rsadd4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
+  predicate(UseSSE >= 1 && UseAVX == 0);
+  match(Set dst (AddReductionVF src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "movdqu  $tmp,$src1\n\t"
+            "addss   $tmp,$src2\n\t"
+            "pshufd  $tmp2,$src2,0x01\n\t"
+            "addss   $tmp,$tmp2\n\t"
+            "pshufd  $tmp2,$src2,0x02\n\t"
+            "addss   $tmp,$tmp2\n\t"
+            "pshufd  $tmp2,$src2,0x03\n\t"
+            "addss   $tmp,$tmp2\n\t"
+            "movdqu  $dst,$tmp\t! add reduction4F" %}
+  ins_encode %{
+    __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
+    __ addss($tmp$$XMMRegister, $src2$$XMMRegister);
+    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);
+    __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
+    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x02);
+    __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
+    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x03);
+    __ addss($tmp$$XMMRegister, $tmp2$$XMMRegister);
+    __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rvadd4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AddReductionVF src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "vaddss  $tmp2,$src1,$src2\n\t"
+            "pshufd  $tmp,$src2,0x01\n\t"
+            "vaddss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$src2,0x02\n\t"
+            "vaddss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$src2,0x03\n\t"
+            "vaddss  $dst,$tmp2,$tmp\t! add reduction4F" %}
+  ins_encode %{
+    __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
+    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
+    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
+    __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct radd8F_reduction_reg(regF dst, regF src1, vecY src2, regF tmp, regF tmp2, regF tmp3) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AddReductionVF src1 src2));
+  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
+  format %{ "vaddss  $tmp2,$src1,$src2\n\t"
+            "pshufd  $tmp,$src2,0x01\n\t"
+            "vaddss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$src2,0x02\n\t"
+            "vaddss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$src2,0x03\n\t"
+            "vaddss  $tmp2,$tmp2,$tmp\n\t"
+            "vextractf128  $tmp3,$src2\n\t"
+            "vaddss  $tmp2,$tmp2,$tmp3\n\t"
+            "pshufd  $tmp,$tmp3,0x01\n\t"
+            "vaddss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$tmp3,0x02\n\t"
+            "vaddss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$tmp3,0x03\n\t"
+            "vaddss  $dst,$tmp2,$tmp\t! add reduction8F" %}
+  ins_encode %{
+    __ vaddss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
+    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
+    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
+    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister);
+    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
+    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
+    __ vaddss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
+    __ vaddss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rsadd2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp) %{
+  predicate(UseSSE >= 1 && UseAVX == 0);
+  match(Set dst (AddReductionVD src1 src2));
+  effect(TEMP tmp, TEMP dst);
+  format %{ "movdqu  $tmp,$src1\n\t"
+            "addsd   $tmp,$src2\n\t"
+            "pshufd  $dst,$src2,0xE\n\t"
+            "addsd   $dst,$tmp\t! add reduction2D" %}
+  ins_encode %{
+    __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
+    __ addsd($tmp$$XMMRegister, $src2$$XMMRegister);
+    __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xE);
+    __ addsd($dst$$XMMRegister, $tmp$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rvadd2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp, regD tmp2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AddReductionVD src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "vaddsd  $tmp2,$src1,$src2\n\t"
+            "pshufd  $tmp,$src2,0xE\n\t"
+            "vaddsd  $dst,$tmp2,$tmp\t! add reduction2D" %}
+  ins_encode %{
+    __ vaddsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
+    __ vaddsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rvadd4D_reduction_reg(regD dst, regD src1, vecY src2, regD tmp, regD tmp2, regD tmp3) %{
+  predicate(UseAVX > 0);
+  match(Set dst (AddReductionVD src1 src2));
+  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
+  format %{ "vaddsd  $tmp2,$src1,$src2\n\t"
+            "pshufd  $tmp,$src2,0xE\n\t"
+            "vaddsd  $tmp2,$tmp2,$tmp\n\t"
+            "vextractf128  $tmp3,$src2\n\t"
+            "vaddsd  $tmp2,$tmp2,$tmp3\n\t"
+            "pshufd  $tmp,$tmp3,0xE\n\t"
+            "vaddsd  $dst,$tmp2,$tmp\t! add reduction4D" %}
+  ins_encode %{
+    __ vaddsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
+    __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister);
+    __ vaddsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
+    __ vaddsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
+  predicate(UseSSE > 3 && UseAVX == 0);
+  match(Set dst (MulReductionVI src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "pshufd  $tmp2,$src2,0x1\n\t"
+            "pmulld  $tmp2,$src2\n\t"
+            "movd    $tmp,$src1\n\t"
+            "pmulld  $tmp2,$tmp\n\t"
+            "movd    $dst,$tmp2\t! mul reduction2I" %}
+  ins_encode %{
+    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
+    __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
+    __ movdl($tmp$$XMMRegister, $src1$$Register);
+    __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ movdl($dst$$Register, $tmp2$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulReductionVI src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "pshufd  $tmp2,$src2,0x1\n\t"
+            "vpmulld $tmp,$src2,$tmp2\n\t"
+            "movd    $tmp2,$src1\n\t"
+            "vpmulld $tmp2,$tmp,$tmp2\n\t"
+            "movd    $dst,$tmp2\t! mul reduction2I" %}
+  ins_encode %{
+    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x1);
+    __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, false);
+    __ movdl($tmp2$$XMMRegister, $src1$$Register);
+    __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
+    __ movdl($dst$$Register, $tmp2$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
+  predicate(UseSSE > 3 && UseAVX == 0);
+  match(Set dst (MulReductionVI src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "pshufd  $tmp2,$src2,0xE\n\t"
+            "pmulld  $tmp2,$src2\n\t"
+            "pshufd  $tmp,$tmp2,0x1\n\t"
+            "pmulld  $tmp2,$tmp\n\t"
+            "movd    $tmp,$src1\n\t"
+            "pmulld  $tmp2,$tmp\n\t"
+            "movd    $dst,$tmp2\t! mul reduction4I" %}
+  ins_encode %{
+    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
+    __ pmulld($tmp2$$XMMRegister, $src2$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp2$$XMMRegister, 0x1);
+    __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ movdl($tmp$$XMMRegister, $src1$$Register);
+    __ pmulld($tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ movdl($dst$$Register, $tmp2$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulReductionVI src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "pshufd  $tmp2,$src2,0xE\n\t"
+            "vpmulld $tmp,$src2,$tmp2\n\t"
+            "pshufd  $tmp2,$tmp,0x1\n\t"
+            "vpmulld $tmp,$tmp,$tmp2\n\t"
+            "movd    $tmp2,$src1\n\t"
+            "vpmulld $tmp2,$tmp,$tmp2\n\t"
+            "movd    $dst,$tmp2\t! mul reduction4I" %}
+  ins_encode %{
+    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0xE);
+    __ vpmulld($tmp$$XMMRegister, $src2$$XMMRegister, $tmp2$$XMMRegister, false);
+    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
+    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
+    __ movdl($tmp2$$XMMRegister, $src1$$Register);
+    __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
+    __ movdl($dst$$Register, $tmp2$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulReductionVI src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "vextractf128  $tmp,$src2\n\t"
+            "vpmulld $tmp,$tmp,$src2\n\t"
+            "pshufd  $tmp2,$tmp,0xE\n\t"
+            "vpmulld $tmp,$tmp,$tmp2\n\t"
+            "pshufd  $tmp2,$tmp,0x1\n\t"
+            "vpmulld $tmp,$tmp,$tmp2\n\t"
+            "movd    $tmp2,$src1\n\t"
+            "vpmulld $tmp2,$tmp,$tmp2\n\t"
+            "movd    $dst,$tmp2\t! mul reduction8I" %}
+  ins_encode %{
+    __ vextractf128h($tmp$$XMMRegister, $src2$$XMMRegister);
+    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $src2$$XMMRegister, false);
+    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0xE);
+    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
+    __ pshufd($tmp2$$XMMRegister, $tmp$$XMMRegister, 0x1);
+    __ vpmulld($tmp$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
+    __ movdl($tmp2$$XMMRegister, $src1$$Register);
+    __ vpmulld($tmp2$$XMMRegister, $tmp$$XMMRegister, $tmp2$$XMMRegister, false);
+    __ movdl($dst$$Register, $tmp2$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rsmul2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
+  predicate(UseSSE >= 1 && UseAVX == 0);
+  match(Set dst (MulReductionVF src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "movdqu  $tmp,$src1\n\t"
+            "mulss   $tmp,$src2\n\t"
+            "pshufd  $tmp2,$src2,0x01\n\t"
+            "mulss   $tmp,$tmp2\n\t"
+            "movdqu  $dst,$tmp\t! add reduction2F" %}
+  ins_encode %{
+    __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
+    __ mulss($tmp$$XMMRegister, $src2$$XMMRegister);
+    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);
+    __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
+    __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rvmul2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulReductionVF src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "vmulss  $tmp2,$src1,$src2\n\t"
+            "pshufd  $tmp,$src2,0x01\n\t"
+            "vmulss  $dst,$tmp2,$tmp\t! add reduction2F" %}
+  ins_encode %{
+    __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
+    __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rsmul4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
+  predicate(UseSSE >= 1 && UseAVX == 0);
+  match(Set dst (MulReductionVF src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "movdqu  $tmp,$src1\n\t"
+            "mulss   $tmp,$src2\n\t"
+            "pshufd  $tmp2,$src2,0x01\n\t"
+            "mulss   $tmp,$tmp2\n\t"
+            "pshufd  $tmp2,$src2,0x02\n\t"
+            "mulss   $tmp,$tmp2\n\t"
+            "pshufd  $tmp2,$src2,0x03\n\t"
+            "mulss   $tmp,$tmp2\n\t"
+            "movdqu  $dst,$tmp\t! add reduction4F" %}
+  ins_encode %{
+    __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
+    __ mulss($tmp$$XMMRegister, $src2$$XMMRegister);
+    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);
+    __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
+    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x02);
+    __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
+    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x03);
+    __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
+    __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rvmul4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulReductionVF src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "vmulss  $tmp2,$src1,$src2\n\t"
+            "pshufd  $tmp,$src2,0x01\n\t"
+            "vmulss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$src2,0x02\n\t"
+            "vmulss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$src2,0x03\n\t"
+            "vmulss  $dst,$tmp2,$tmp\t! add reduction4F" %}
+  ins_encode %{
+    __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
+    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
+    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
+    __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rvmul8F_reduction_reg(regF dst, regF src1, vecY src2, regF tmp, regF tmp2, regF tmp3) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulReductionVF src1 src2));
+  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
+  format %{ "vmulss  $tmp2,$src1,$src2\n\t"
+            "pshufd  $tmp,$src2,0x01\n\t"
+            "vmulss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$src2,0x02\n\t"
+            "vmulss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$src2,0x03\n\t"
+            "vmulss  $tmp2,$tmp2,$tmp\n\t"
+            "vextractf128  $tmp3,$src2\n\t"
+            "vmulss  $tmp2,$tmp2,$tmp3\n\t"
+            "pshufd  $tmp,$tmp3,0x01\n\t"
+            "vmulss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$tmp3,0x02\n\t"
+            "vmulss  $tmp2,$tmp2,$tmp\n\t"
+            "pshufd  $tmp,$tmp3,0x03\n\t"
+            "vmulss  $dst,$tmp2,$tmp\t! mul reduction8F" %}
+  ins_encode %{
+    __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);
+    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);
+    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);
+    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister);
+    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x01);
+    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x02);
+    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0x03);
+    __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rsmul2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp) %{
+  predicate(UseSSE >= 1 && UseAVX == 0);
+  match(Set dst (MulReductionVD src1 src2));
+  effect(TEMP tmp, TEMP dst);
+  format %{ "movdqu  $tmp,$src1\n\t"
+            "mulsd   $tmp,$src2\n\t"
+            "pshufd  $dst,$src2,0xE\n\t"
+            "mulsd   $dst,$tmp\t! add reduction2D" %}
+  ins_encode %{
+    __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);
+    __ mulsd($tmp$$XMMRegister, $src2$$XMMRegister);
+    __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xE);
+    __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rvmul2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp, regD tmp2) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulReductionVD src1 src2));
+  effect(TEMP tmp, TEMP tmp2);
+  format %{ "vmulsd  $tmp2,$src1,$src2\n\t"
+            "pshufd  $tmp,$src2,0xE\n\t"
+            "vmulsd  $dst,$tmp2,$tmp\t! mul reduction2D" %}
+  ins_encode %{
+    __ vmulsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
+    __ vmulsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct rvmul4D_reduction_reg(regD dst, regD src1, vecY src2, regD tmp, regD tmp2, regD tmp3) %{
+  predicate(UseAVX > 0);
+  match(Set dst (MulReductionVD src1 src2));
+  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
+  format %{ "vmulsd  $tmp2,$src1,$src2\n\t"
+            "pshufd  $tmp,$src2,0xE\n\t"
+            "vmulsd  $tmp2,$tmp2,$tmp\n\t"
+            "vextractf128  $tmp3,$src2\n\t"
+            "vmulsd  $tmp2,$tmp2,$tmp3\n\t"
+            "pshufd  $tmp,$tmp3,0xE\n\t"
+            "vmulsd  $dst,$tmp2,$tmp\t! mul reduction4D" %}
+  ins_encode %{
+    __ vmulsd($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0xE);
+    __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+    __ vextractf128h($tmp3$$XMMRegister, $src2$$XMMRegister);
+    __ vmulsd($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp3$$XMMRegister);
+    __ pshufd($tmp$$XMMRegister, $tmp3$$XMMRegister, 0xE);
+    __ vmulsd($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // ====================VECTOR ARITHMETIC=======================================

 // --------------------------------- ADD --------------------------------------
--- a/hotspot/src/share/vm/adlc/formssel.cpp
+++ b/hotspot/src/share/vm/adlc/formssel.cpp
@ -4043,6 +4043,13 @@ int MatchRule::is_expensive() const {
        strcmp(opType,"ReplicateL")==0 ||
        strcmp(opType,"ReplicateF")==0 ||
        strcmp(opType,"ReplicateD")==0 ||
+        strcmp(opType,"AddReductionVI")==0 ||
+        strcmp(opType,"AddReductionVL")==0 ||
+        strcmp(opType,"AddReductionVF")==0 ||
+        strcmp(opType,"AddReductionVD")==0 ||
+        strcmp(opType,"MulReductionVI")==0 ||
+        strcmp(opType,"MulReductionVF")==0 ||
+        strcmp(opType,"MulReductionVD")==0 ||
        0 /* 0 to line up columns nicely */ )
      return 1;
  }
@ -4135,6 +4142,10 @@ bool MatchRule::is_vector() const {
    "MulVS","MulVI","MulVF","MulVD",
    "DivVF","DivVD",
    "AndV" ,"XorV" ,"OrV",
+    "AddReductionVI", "AddReductionVL",
+    "AddReductionVF", "AddReductionVD",
+    "MulReductionVI",
+    "MulReductionVF", "MulReductionVD",
    "LShiftCntV","RShiftCntV",
    "LShiftVB","LShiftVS","LShiftVI","LShiftVL",
    "RShiftVB","RShiftVS","RShiftVI","RShiftVL",
--- a/hotspot/src/share/vm/opto/c2_globals.hpp
+++ b/hotspot/src/share/vm/opto/c2_globals.hpp
@ -324,6 +324,9 @@
  develop(bool, SuperWordRTDepCheck, false,                                 \
          "Enable runtime dependency checks.")                              \
                                                                            \
+  product(bool, SuperWordReductions, true,                                  \
+          "Enable reductions support in superword.")                        \
+                                                                            \
  notproduct(bool, TraceSuperWord, false,                                   \
          "Trace superword transforms")                                     \
                                                                            \
--- a/hotspot/src/share/vm/opto/classes.hpp
+++ b/hotspot/src/share/vm/opto/classes.hpp
@ -266,9 +266,13 @@ macro(Vector)
 macro(AddVB)
 macro(AddVS)
 macro(AddVI)
+macro(AddReductionVI)
 macro(AddVL)
+macro(AddReductionVL)
 macro(AddVF)
+macro(AddReductionVF)
 macro(AddVD)
+macro(AddReductionVD)
 macro(SubVB)
 macro(SubVS)
 macro(SubVI)
@ -277,8 +281,11 @@ macro(SubVF)
 macro(SubVD)
 macro(MulVS)
 macro(MulVI)
+macro(MulReductionVI)
 macro(MulVF)
+macro(MulReductionVF)
 macro(MulVD)
+macro(MulReductionVD)
 macro(DivVF)
 macro(DivVD)
 macro(LShiftCntV)
--- a/hotspot/src/share/vm/opto/compile.cpp
+++ b/hotspot/src/share/vm/opto/compile.cpp
@ -3049,6 +3049,15 @@ void Compile::final_graph_reshaping_impl( Node *n, Final_Reshape_Counts &frc) {
  case Op_StoreVector:
    break;

+  case Op_AddReductionVI:
+  case Op_AddReductionVL:
+  case Op_AddReductionVF:
+  case Op_AddReductionVD:
+  case Op_MulReductionVI:
+  case Op_MulReductionVF:
+  case Op_MulReductionVD:
+    break;
+
  case Op_PackB:
  case Op_PackS:
  case Op_PackI:
--- a/hotspot/src/share/vm/opto/loopTransform.cpp
+++ b/hotspot/src/share/vm/opto/loopTransform.cpp
@ -38,6 +38,7 @@
 #include "opto/rootnode.hpp"
 #include "opto/runtime.hpp"
 #include "opto/subnode.hpp"
+#include "opto/vectornode.hpp"

 //------------------------------is_loop_exit-----------------------------------
 // Given an IfNode, return the loop-exiting projection or NULL if both
@ -1524,6 +1525,44 @@ void PhaseIdealLoop::do_maximally_unroll( IdealLoopTree *loop, Node_List &old_ne
  }
 }

+void PhaseIdealLoop::mark_reductions(IdealLoopTree *loop) {
+  if (SuperWordReductions == false) return;
+
+  CountedLoopNode* loop_head = loop->_head->as_CountedLoop();
+  if (loop_head->unrolled_count() > 1) {
+    return;
+  }
+
+  Node* trip_phi = loop_head->phi();
+  for (DUIterator_Fast imax, i = loop_head->fast_outs(imax); i < imax; i++) {
+    Node* phi = loop_head->fast_out(i);
+    if (phi->is_Phi() && phi->outcnt() > 0 && phi != trip_phi) {
+      // For definitions which are loop inclusive and not tripcounts.
+      Node* def_node = phi->in(LoopNode::LoopBackControl);
+
+      if (def_node != NULL) {
+        Node* n_ctrl = get_ctrl(def_node);
+        if (n_ctrl != NULL && loop->is_member(get_loop(n_ctrl))) {
+          // Now test it to see if it fits the standard pattern for a reduction operator.
+          int opc = def_node->Opcode();
+          if (opc != ReductionNode::opcode(opc, def_node->bottom_type()->basic_type())) {
+            if (!def_node->is_reduction()) { // Not marked yet
+              // To be a reduction, the arithmetic node must have the phi as input and provide a def to it
+              for (unsigned j = 1; j < def_node->req(); j++) {
+                Node* in = def_node->in(j);
+                if (in == phi) {
+                  def_node->add_flag(Node::Flag_is_reduction);
+                  break;
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
 //------------------------------dominates_backedge---------------------------------
 // Returns true if ctrl is executed on every complete iteration
 bool IdealLoopTree::dominates_backedge(Node* ctrl) {
@ -2361,8 +2400,10 @@ bool IdealLoopTree::iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_
    // an even number of trips).  If we are peeling, we might enable some RCE
    // and we'd rather unroll the post-RCE'd loop SO... do not unroll if
    // peeling.
-    if (should_unroll && !should_peel)
-      phase->do_unroll(this,old_new, true);
+    if (should_unroll && !should_peel) {
+      phase->mark_reductions(this);
+      phase->do_unroll(this, old_new, true);
+    }

    // Adjust the pre-loop limits to align the main body
    // iterations.
--- a/hotspot/src/share/vm/opto/loopnode.hpp
+++ b/hotspot/src/share/vm/opto/loopnode.hpp
@ -872,6 +872,9 @@ public:
  // Unroll the loop body one step - make each trip do 2 iterations.
  void do_unroll( IdealLoopTree *loop, Node_List &old_new, bool adjust_min_trip );

+  // Mark vector reduction candidates before loop unrolling
+  void mark_reductions( IdealLoopTree *loop );
+
  // Return true if exp is a constant times an induction var
  bool is_scaled_iv(Node* exp, Node* iv, int* p_scale);

--- a/hotspot/src/share/vm/opto/node.hpp
+++ b/hotspot/src/share/vm/opto/node.hpp
@ -673,7 +673,8 @@ public:
    Flag_avoid_back_to_back_before   = Flag_may_be_short_branch << 1,
    Flag_avoid_back_to_back_after    = Flag_avoid_back_to_back_before << 1,
    Flag_has_call                    = Flag_avoid_back_to_back_after << 1,
-    Flag_is_expensive                = Flag_has_call << 1,
+    Flag_is_reduction                = Flag_has_call << 1,
+    Flag_is_expensive                = Flag_is_reduction << 1,
    _max_flags = (Flag_is_expensive << 1) - 1 // allow flags combination
  };

@ -701,6 +702,10 @@ public:

  const jushort flags() const { return _flags; }

+  void add_flag(jushort fl) { init_flags(fl); }
+
+  void remove_flag(jushort fl) { clear_flag(fl); }
+
  // Return a dense integer opcode number
  virtual int Opcode() const;

@ -852,6 +857,10 @@ public:
  // The node is expensive: the best control is set during loop opts
  bool is_expensive() const { return (_flags & Flag_is_expensive) != 0 && in(0) != NULL; }

+  // An arithmetic node which accumulates a data in a loop.
+  // It must have the loop's phi as input and provide a def to the phi.
+  bool is_reduction() const { return (_flags & Flag_is_reduction) != 0; }
+
 //----------------- Optimization

  // Get the worst-case Type output for this Node.
--- a/hotspot/src/share/vm/opto/superword.cpp
+++ b/hotspot/src/share/vm/opto/superword.cpp
@ -65,7 +65,8 @@ SuperWord::SuperWord(PhaseIdealLoop* phase) :
  _lpt(NULL),                             // loop tree node
  _lp(NULL),                              // LoopNode
  _bb(NULL),                              // basic block
-  _iv(NULL)                               // induction var
+  _iv(NULL),                              // induction var
+  _race_possible(false)                   // cases where SDMU is true
 {}

 //------------------------------transform_loop---------------------------
@ -145,7 +146,6 @@ void SuperWord::transform_loop(IdealLoopTree* lpt) {
 void SuperWord::SLP_extract() {

  // Ready the block
-
  if (!construct_bb())
    return; // Exit if no interesting nodes or complex graph.

@ -640,7 +640,7 @@ bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) {
  }

  if (isomorphic(s1, s2)) {
-    if (independent(s1, s2)) {
+    if (independent(s1, s2) || reduction(s1, s2)) {
      if (!exists_at(s1, 0) && !exists_at(s2, 1)) {
        if (!s1->is_Mem() || are_adjacent_refs(s1, s2)) {
          int s1_align = alignment(s1);
@ -718,6 +718,28 @@ bool SuperWord::independent(Node* s1, Node* s2) {
  return independent_path(shallow, deep);
 }

+//------------------------------reduction---------------------------
+// Is there a data path between s1 and s2 and the nodes reductions?
+bool SuperWord::reduction(Node* s1, Node* s2) {
+  bool retValue = false;
+  int d1 = depth(s1);
+  int d2 = depth(s2);
+  if (d1 + 1 == d2) {
+    if (s1->is_reduction() && s2->is_reduction()) {
+      // This is an ordered set, so s1 should define s2
+      for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
+        Node* t1 = s1->fast_out(i);
+        if (t1 == s2) {
+          // both nodes are reductions and connected
+          retValue = true;
+        }
+      }
+    }
+  }
+
+  return retValue;
+}
+
 //------------------------------independent_path------------------------------
 // Helper for independent
 bool SuperWord::independent_path(Node* shallow, Node* deep, uint dp) {
@ -761,6 +783,7 @@ int SuperWord::data_size(Node* s) {
 void SuperWord::extend_packlist() {
  bool changed;
  do {
+    packset_sort(_packset.length());
    changed = false;
    for (int i = 0; i < _packset.length(); i++) {
      Node_List* p = _packset.at(i);
@ -769,6 +792,13 @@ void SuperWord::extend_packlist() {
    }
  } while (changed);

+  if (_race_possible) {
+    for (int i = 0; i < _packset.length(); i++) {
+      Node_List* p = _packset.at(i);
+      order_def_uses(p);
+    }
+  }
+
 #ifndef PRODUCT
  if (TraceSuperWord) {
    tty->print_cr("\nAfter extend_packlist");
@ -825,10 +855,12 @@ bool SuperWord::follow_def_uses(Node_List* p) {

  int align = alignment(s1);
  int savings = -1;
+  int num_s1_uses = 0;
  Node* u1 = NULL;
  Node* u2 = NULL;
  for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
    Node* t1 = s1->fast_out(i);
+    num_s1_uses++;
    if (!in_bb(t1)) continue;
    for (DUIterator_Fast jmax, j = s2->fast_outs(jmax); j < jmax; j++) {
      Node* t2 = s2->fast_out(j);
@ -845,6 +877,9 @@ bool SuperWord::follow_def_uses(Node_List* p) {
      }
    }
  }
+  if (num_s1_uses > 1) {
+    _race_possible = true;
+  }
  if (savings >= 0) {
    Node_List* pair = new Node_List();
    pair->push(u1);
@ -856,9 +891,64 @@ bool SuperWord::follow_def_uses(Node_List* p) {
  return changed;
 }

+//------------------------------order_def_uses---------------------------
+// For extended packsets, ordinally arrange uses packset by major component
+void SuperWord::order_def_uses(Node_List* p) {
+  Node* s1 = p->at(0);
+
+  if (s1->is_Store()) return;
+
+  // reductions are always managed beforehand
+  if (s1->is_reduction()) return;
+
+  for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
+    Node* t1 = s1->fast_out(i);
+
+    // Only allow operand swap on commuting operations
+    if (!t1->is_Add() && !t1->is_Mul()) {
+      break;
+    }
+
+    // Now find t1's packset
+    Node_List* p2 = NULL;
+    for (int j = 0; j < _packset.length(); j++) {
+      p2 = _packset.at(j);
+      Node* first = p2->at(0);
+      if (t1 == first) {
+        break;
+      }
+      p2 = NULL;
+    }
+    // Arrange all sub components by the major component
+    if (p2 != NULL) {
+      for (uint j = 1; j < p->size(); j++) {
+        Node* d1 = p->at(j);
+        Node* u1 = p2->at(j);
+        opnd_positions_match(s1, t1, d1, u1);
+      }
+    }
+  }
+}
+
 //---------------------------opnd_positions_match-------------------------
 // Is the use of d1 in u1 at the same operand position as d2 in u2?
 bool SuperWord::opnd_positions_match(Node* d1, Node* u1, Node* d2, Node* u2) {
+  // check reductions to see if they are marshalled to represent the reduction
+  // operator in a specified opnd
+  if (u1->is_reduction() && u2->is_reduction()) {
+    // ensure reductions have phis and reduction definitions feeding the 1st operand
+    Node* first = u1->in(2);
+    if (first->is_Phi() || first->is_reduction()) {
+      u1->swap_edges(1, 2);
+    }
+    // ensure reductions have phis and reduction definitions feeding the 1st operand
+    first = u2->in(2);
+    if (first->is_Phi() || first->is_reduction()) {
+      u2->swap_edges(1, 2);
+    }
+    return true;
+  }
+
  uint ct = u1->req();
  if (ct != u2->req()) return false;
  uint i1 = 0;
@ -940,7 +1030,8 @@ void SuperWord::combine_packs() {
    for (int i = 0; i < _packset.length(); i++) {
      Node_List* p1 = _packset.at(i);
      if (p1 == NULL) continue;
-      for (int j = 0; j < _packset.length(); j++) {
+      // Because of sorting we can start at i + 1
+      for (int j = i + 1; j < _packset.length(); j++) {
        Node_List* p2 = _packset.at(j);
        if (p2 == NULL) continue;
        if (i == j) continue;
@ -1067,8 +1158,19 @@ void SuperWord::filter_packs() {
 //------------------------------implemented---------------------------
 // Can code be generated for pack p?
 bool SuperWord::implemented(Node_List* p) {
+  bool retValue = false;
  Node* p0 = p->at(0);
-  return VectorNode::implemented(p0->Opcode(), p->size(), velt_basic_type(p0));
+  if (p0 != NULL) {
+    int opc = p0->Opcode();
+    uint size = p->size();
+    if (p0->is_reduction()) {
+      const Type *arith_type = p0->bottom_type();
+      retValue = ReductionNode::implemented(opc, size, arith_type->basic_type());
+    } else {
+      retValue = VectorNode::implemented(opc, size, velt_basic_type(p0));
+    }
+  }
+  return retValue;
 }

 //------------------------------same_inputs--------------------------
@ -1102,6 +1204,18 @@ bool SuperWord::profitable(Node_List* p) {
    if (!is_vector_use(p0, i))
      return false;
  }
+  // Check if reductions are connected
+  if (p0->is_reduction()) {
+    Node* second_in = p0->in(2);
+    Node_List* second_pk = my_pack(second_in);
+    if (second_pk == NULL) {
+      // Remove reduction flag if no parent pack, it is not profitable
+      p0->remove_flag(Node::Flag_is_reduction);
+      return false;
+    } else if (second_pk->size() != p->size()) {
+      return false;
+    }
+  }
  if (VectorNode::is_shift(p0)) {
    // For now, return false if shift count is vector or not scalar promotion
    // case (different shift counts) because it is not supported yet.
@ -1123,6 +1237,9 @@ bool SuperWord::profitable(Node_List* p) {
        for (uint k = 0; k < use->req(); k++) {
          Node* n = use->in(k);
          if (def == n) {
+            // reductions can be loop carried dependences
+            if (def->is_reduction() && use->is_Phi())
+              continue;
            if (!is_vector_use(use, k)) {
              return false;
            }
@ -1407,16 +1524,33 @@ void SuperWord::output() {
        vlen_in_bytes = vn->as_StoreVector()->memory_size();
      } else if (n->req() == 3) {
        // Promote operands to vector
-        Node* in1 = vector_opd(p, 1);
+        Node* in1 = NULL;
+        bool node_isa_reduction = n->is_reduction();
+        if (node_isa_reduction) {
+          // the input to the first reduction operation is retained
+          in1 = low_adr->in(1);
+        } else {
+          in1 = vector_opd(p, 1);
+        }
        Node* in2 = vector_opd(p, 2);
-        if (VectorNode::is_invariant_vector(in1) && (n->is_Add() || n->is_Mul())) {
+        if (VectorNode::is_invariant_vector(in1) && (node_isa_reduction == false) && (n->is_Add() || n->is_Mul())) {
          // Move invariant vector input into second position to avoid register spilling.
          Node* tmp = in1;
          in1 = in2;
          in2 = tmp;
        }
+        if (node_isa_reduction) {
+          const Type *arith_type = n->bottom_type();
+          vn = ReductionNode::make(opc, NULL, in1, in2, arith_type->basic_type());
+          if (in2->is_Load()) {
+            vlen_in_bytes = in2->as_LoadVector()->memory_size();
+          } else {
+            vlen_in_bytes = in2->as_Vector()->length_in_bytes();
+          }
+        } else {
          vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
          vlen_in_bytes = vn->as_Vector()->length_in_bytes();
+        }
      } else {
        ShouldNotReachHere();
      }
@ -1556,6 +1690,8 @@ void SuperWord::insert_extracts(Node_List* p) {
    _n_idx_list.pop();
    Node* def = use->in(idx);

+    if (def->is_reduction()) continue;
+
    // Insert extract operation
    _igvn.hash_delete(def);
    int def_pos = alignment(def) / data_size(def);
@ -1576,6 +1712,7 @@ void SuperWord::insert_extracts(Node_List* p) {
 bool SuperWord::is_vector_use(Node* use, int u_idx) {
  Node_List* u_pk = my_pack(use);
  if (u_pk == NULL) return false;
+  if (use->is_reduction()) return true;
  Node* def = use->in(u_idx);
  Node_List* d_pk = my_pack(def);
  if (d_pk == NULL) {
@ -1613,7 +1750,7 @@ bool SuperWord::construct_bb() {
  // by the visited and post_visited sets,
  // and count number of nodes in block.
  int bb_ct = 0;
-  for (uint i = 0; i < lpt()->_body.size(); i++ ) {
+  for (uint i = 0; i < lpt()->_body.size(); i++) {
    Node *n = lpt()->_body.at(i);
    set_bb_idx(n, i); // Create a temporary map
    if (in_bb(n)) {
@ -1674,6 +1811,7 @@ bool SuperWord::construct_bb() {
  // Do a depth first walk over out edges
  int rpo_idx = bb_ct - 1;
  int size;
+  int reduction_uses = 0;
  while ((size = _stk.length()) > 0) {
    Node* n = _stk.top(); // Leave node on stack
    if (!visited_test_set(n)) {
@ -1685,6 +1823,14 @@ bool SuperWord::construct_bb() {
        if (in_bb(use) && !visited_test(use) &&
            // Don't go around backedge
            (!use->is_Phi() || n == entry)) {
+          if (use->is_reduction()) {
+            // First see if we can map the reduction on the given system we are on, then
+            // make a data entry operation for each reduction we see.
+            BasicType bt = use->bottom_type()->basic_type();
+            if (ReductionNode::implemented(use->Opcode(), Matcher::min_vector_size(bt), bt)) {
+              reduction_uses++;
+            }
+          }
          _stk.push(use);
        }
      }
@ -1708,7 +1854,8 @@ bool SuperWord::construct_bb() {
    set_bb_idx(n, j);
  }

-  initialize_bb(); // Ensure extra info is allocated.
+  // Ensure extra info is allocated.
+  initialize_bb();

 #ifndef PRODUCT
  if (TraceSuperWord) {
@ -1726,7 +1873,7 @@ bool SuperWord::construct_bb() {
  }
 #endif
  assert(rpo_idx == -1 && bb_ct == _block.length(), "all block members found");
-  return (_mem_slice_head.length() > 0) || (_data_entry.length() > 0);
+  return (_mem_slice_head.length() > 0) || (reduction_uses > 0) || (_data_entry.length() > 0);
 }

 //------------------------------initialize_bb---------------------------
@ -1959,6 +2106,27 @@ void SuperWord::remove_pack_at(int pos) {
  _packset.remove_at(pos);
 }

+void SuperWord::packset_sort(int n) {
+  // simple bubble sort so that we capitalize with O(n) when its already sorted
+  while (n != 0) {
+    bool swapped = false;
+    for (int i = 1; i < n; i++) {
+      Node_List* q_low = _packset.at(i-1);
+      Node_List* q_i = _packset.at(i);
+
+      // only swap when we find something to swap
+      if (alignment(q_low->at(0)) > alignment(q_i->at(0))) {
+        Node_List* t = q_i;
+        *(_packset.adr_at(i)) = q_low;
+        *(_packset.adr_at(i-1)) = q_i;
+        swapped = true;
+      }
+    }
+    if (swapped == false) break;
+    n--;
+  }
+}
+
 //------------------------------executed_first---------------------------
 // Return the node executed first in pack p.  Uses the RPO block list
 // to determine order.
--- a/hotspot/src/share/vm/opto/superword.hpp
+++ b/hotspot/src/share/vm/opto/superword.hpp
@ -249,6 +249,7 @@ class SuperWord : public ResourceObj {
  LoopNode*      _lp;              // Current LoopNode
  Node*          _bb;              // Current basic block
  PhiNode*       _iv;              // Induction var
+  bool           _race_possible;   // In cases where SDMU is true

  // Accessors
  Arena* arena()                   { return _arena; }
@ -337,6 +338,8 @@ class SuperWord : public ResourceObj {
  bool isomorphic(Node* s1, Node* s2);
  // Is there no data path from s1 to s2 or s2 to s1?
  bool independent(Node* s1, Node* s2);
+  // Is there a data path between s1 and s2 and both are reductions?
+  bool reduction(Node* s1, Node* s2);
  // Helper for independent
  bool independent_path(Node* shallow, Node* deep, uint dp=0);
  void set_alignment(Node* s1, Node* s2, int align);
@ -347,6 +350,8 @@ class SuperWord : public ResourceObj {
  bool follow_use_defs(Node_List* p);
  // Extend the packset by visiting uses of nodes in pack p
  bool follow_def_uses(Node_List* p);
+  // For extended packsets, ordinally arrange uses packset by major component
+  void order_def_uses(Node_List* p);
  // Estimate the savings from executing s1 and s2 as a pack
  int est_savings(Node* s1, Node* s2);
  int adjacent_profit(Node* s1, Node* s2);
@ -419,9 +424,12 @@ class SuperWord : public ResourceObj {
  void print_bb();
  void print_stmt(Node* s);
  char* blank(uint depth);
+
+  void packset_sort(int n);
 };


+
 //------------------------------SWPointer---------------------------
 // Information about an address for dependence checking and vector alignment
 class SWPointer VALUE_OBJ_CLASS_SPEC {
--- a/hotspot/src/share/vm/opto/vectornode.cpp
+++ b/hotspot/src/share/vm/opto/vectornode.cpp
@ -250,7 +250,6 @@ VectorNode* VectorNode::make(int opc, Node* n1, Node* n2, uint vlen, BasicType b
  int vopc = VectorNode::opcode(opc, bt);
  // This method should not be called for unimplemented vectors.
  guarantee(vopc > 0, err_msg_res("Vector for '%s' is not implemented", NodeClassNames[opc]));
-
  switch (vopc) {
  case Op_AddVB: return new AddVBNode(n1, n2, vt);
  case Op_AddVS: return new AddVSNode(n1, n2, vt);
@ -441,3 +440,72 @@ Node* ExtractNode::make(Node* v, uint position, BasicType bt) {
  return NULL;
 }

+int ReductionNode::opcode(int opc, BasicType bt) {
+  int vopc = opc;
+  switch (opc) {
+    case Op_AddI:
+      assert(bt == T_INT, "must be");
+      vopc = Op_AddReductionVI;
+      break;
+    case Op_AddL:
+      assert(bt == T_LONG, "must be");
+      vopc = Op_AddReductionVL;
+      break;
+    case Op_AddF:
+      assert(bt == T_FLOAT, "must be");
+      vopc = Op_AddReductionVF;
+      break;
+    case Op_AddD:
+      assert(bt == T_DOUBLE, "must be");
+      vopc = Op_AddReductionVD;
+      break;
+    case Op_MulI:
+      assert(bt == T_INT, "must be");
+      vopc = Op_MulReductionVI;
+      break;
+    case Op_MulF:
+      assert(bt == T_FLOAT, "must be");
+      vopc = Op_MulReductionVF;
+      break;
+    case Op_MulD:
+      assert(bt == T_DOUBLE, "must be");
+      vopc = Op_MulReductionVD;
+      break;
+    // TODO: add MulL for targets that support it
+    default:
+      break;
+  }
+  return vopc;
+}
+
+// Return the appropriate reduction node.
+ReductionNode* ReductionNode::make(int opc, Node *ctrl, Node* n1, Node* n2, BasicType bt) {
+
+  int vopc = opcode(opc, bt);
+
+  // This method should not be called for unimplemented vectors.
+  guarantee(vopc != opc, err_msg_res("Vector for '%s' is not implemented", NodeClassNames[opc]));
+
+  switch (vopc) {
+  case Op_AddReductionVI: return new AddReductionVINode(ctrl, n1, n2);
+  case Op_AddReductionVL: return new AddReductionVLNode(ctrl, n1, n2);
+  case Op_AddReductionVF: return new AddReductionVFNode(ctrl, n1, n2);
+  case Op_AddReductionVD: return new AddReductionVDNode(ctrl, n1, n2);
+  case Op_MulReductionVI: return new MulReductionVINode(ctrl, n1, n2);
+  case Op_MulReductionVF: return new MulReductionVFNode(ctrl, n1, n2);
+  case Op_MulReductionVD: return new MulReductionVDNode(ctrl, n1, n2);
+  }
+  fatal(err_msg_res("Missed vector creation for '%s'", NodeClassNames[vopc]));
+  return NULL;
+}
+
+bool ReductionNode::implemented(int opc, uint vlen, BasicType bt) {
+  if (is_java_primitive(bt) &&
+      (vlen > 1) && is_power_of_2(vlen) &&
+      Matcher::vector_size_supported(bt, vlen)) {
+    int vopc = ReductionNode::opcode(opc, bt);
+    return vopc != opc && Matcher::match_rule_supported(vopc);
+  }
+  return false;
+}
+
--- a/hotspot/src/share/vm/opto/vectornode.hpp
+++ b/hotspot/src/share/vm/opto/vectornode.hpp
@ -90,6 +90,37 @@ class AddVINode : public VectorNode {
  virtual int Opcode() const;
 };

+//------------------------------ReductionNode------------------------------------
+// Perform reduction of a vector
+class ReductionNode : public Node {
+ public:
+  ReductionNode(Node *ctrl, Node* in1, Node* in2) : Node(ctrl, in1, in2) {}
+
+  static ReductionNode* make(int opc, Node *ctrl, Node* in1, Node* in2, BasicType bt);
+  static int  opcode(int opc, BasicType bt);
+  static bool implemented(int opc, uint vlen, BasicType bt);
+};
+
+//------------------------------AddReductionVINode--------------------------------------
+// Vector add int as a reduction
+class AddReductionVINode : public ReductionNode {
+public:
+  AddReductionVINode(Node * ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
+  virtual int Opcode() const;
+  virtual const Type* bottom_type() const { return TypeInt::INT; }
+  virtual uint ideal_reg() const { return Op_RegI; }
+};
+
+//------------------------------AddReductionVLNode--------------------------------------
+// Vector add long as a reduction
+class AddReductionVLNode : public ReductionNode {
+public:
+  AddReductionVLNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
+  virtual int Opcode() const;
+  virtual const Type* bottom_type() const { return TypeLong::LONG; }
+  virtual uint ideal_reg() const { return Op_RegL; }
+};
+
 //------------------------------AddVLNode--------------------------------------
 // Vector add long
 class AddVLNode : public VectorNode {
@ -106,6 +137,16 @@ class AddVFNode : public VectorNode {
  virtual int Opcode() const;
 };

+//------------------------------AddReductionVFNode--------------------------------------
+// Vector add float as a reduction
+class AddReductionVFNode : public ReductionNode {
+public:
+  AddReductionVFNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
+  virtual int Opcode() const;
+  virtual const Type* bottom_type() const { return Type::FLOAT; }
+  virtual uint ideal_reg() const { return Op_RegF; }
+};
+
 //------------------------------AddVDNode--------------------------------------
 // Vector add double
 class AddVDNode : public VectorNode {
@ -114,6 +155,16 @@ class AddVDNode : public VectorNode {
  virtual int Opcode() const;
 };

+//------------------------------AddReductionVDNode--------------------------------------
+// Vector add double as a reduction
+class AddReductionVDNode : public ReductionNode {
+public:
+  AddReductionVDNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
+  virtual int Opcode() const;
+  virtual const Type* bottom_type() const { return Type::DOUBLE; }
+  virtual uint ideal_reg() const { return Op_RegD; }
+};
+
 //------------------------------SubVBNode--------------------------------------
 // Vector subtract byte
 class SubVBNode : public VectorNode {
@ -178,6 +229,16 @@ class MulVINode : public VectorNode {
  virtual int Opcode() const;
 };

+//------------------------------MulReductionVINode--------------------------------------
+// Vector multiply int as a reduction
+class MulReductionVINode : public ReductionNode {
+public:
+  MulReductionVINode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
+  virtual int Opcode() const;
+  virtual const Type* bottom_type() const { return TypeInt::INT; }
+  virtual uint ideal_reg() const { return Op_RegI; }
+};
+
 //------------------------------MulVFNode--------------------------------------
 // Vector multiply float
 class MulVFNode : public VectorNode {
@ -186,6 +247,16 @@ class MulVFNode : public VectorNode {
  virtual int Opcode() const;
 };

+//------------------------------MulReductionVFNode--------------------------------------
+// Vector multiply float as a reduction
+class MulReductionVFNode : public ReductionNode {
+public:
+  MulReductionVFNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
+  virtual int Opcode() const;
+  virtual const Type* bottom_type() const { return Type::FLOAT; }
+  virtual uint ideal_reg() const { return Op_RegF; }
+};
+
 //------------------------------MulVDNode--------------------------------------
 // Vector multiply double
 class MulVDNode : public VectorNode {
@ -194,6 +265,16 @@ class MulVDNode : public VectorNode {
  virtual int Opcode() const;
 };

+//------------------------------MulReductionVDNode--------------------------------------
+// Vector multiply double as a reduction
+class MulReductionVDNode : public ReductionNode {
+public:
+  MulReductionVDNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
+  virtual int Opcode() const;
+  virtual const Type* bottom_type() const { return Type::DOUBLE; }
+  virtual uint ideal_reg() const { return Op_RegD; }
+};
+
 //------------------------------DivVFNode--------------------------------------
 // Vector divide float
 class DivVFNode : public VectorNode {
--- a/hotspot/src/share/vm/runtime/vmStructs.cpp
+++ b/hotspot/src/share/vm/runtime/vmStructs.cpp
@ -1982,13 +1982,18 @@ typedef CompactHashtable<Symbol*, char>       SymbolCompactHashTable;
  declare_c2_type(PowDNode, Node)                                         \
  declare_c2_type(ReverseBytesINode, Node)                                \
  declare_c2_type(ReverseBytesLNode, Node)                                \
+  declare_c2_type(ReductionNode, Node)                                    \
  declare_c2_type(VectorNode, Node)                                       \
  declare_c2_type(AddVBNode, VectorNode)                                  \
  declare_c2_type(AddVSNode, VectorNode)                                  \
  declare_c2_type(AddVINode, VectorNode)                                  \
+  declare_c2_type(AddReductionVINode, ReductionNode)                      \
  declare_c2_type(AddVLNode, VectorNode)                                  \
+  declare_c2_type(AddReductionVLNode, ReductionNode)                      \
  declare_c2_type(AddVFNode, VectorNode)                                  \
+  declare_c2_type(AddReductionVFNode, ReductionNode)                      \
  declare_c2_type(AddVDNode, VectorNode)                                  \
+  declare_c2_type(AddReductionVDNode, ReductionNode)                      \
  declare_c2_type(SubVBNode, VectorNode)                                  \
  declare_c2_type(SubVSNode, VectorNode)                                  \
  declare_c2_type(SubVINode, VectorNode)                                  \
@ -1997,8 +2002,11 @@ typedef CompactHashtable<Symbol*, char>       SymbolCompactHashTable;
  declare_c2_type(SubVDNode, VectorNode)                                  \
  declare_c2_type(MulVSNode, VectorNode)                                  \
  declare_c2_type(MulVINode, VectorNode)                                  \
+  declare_c2_type(MulReductionVINode, ReductionNode)                      \
  declare_c2_type(MulVFNode, VectorNode)                                  \
+  declare_c2_type(MulReductionVFNode, ReductionNode)                      \
  declare_c2_type(MulVDNode, VectorNode)                                  \
+  declare_c2_type(MulReductionVDNode, ReductionNode)                      \
  declare_c2_type(DivVFNode, VectorNode)                                  \
  declare_c2_type(DivVDNode, VectorNode)                                  \
  declare_c2_type(LShiftVBNode, VectorNode)                               \
--- a/hotspot/test/compiler/loopopts/superword/ProdRed_Double.java
+++ b/hotspot/test/compiler/loopopts/superword/ProdRed_Double.java
@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 8074981
+ * @summary Add C2 x86 Superword support for scalar product reduction optimizations : float test
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 ProdRed_Double
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 ProdRed_Double
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 ProdRed_Double
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 ProdRed_Double
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 ProdRed_Double
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 ProdRed_Double
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 ProdRed_Double
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 ProdRed_Double
+ */
+
+public class ProdRed_Double
+{
+  public static void main(String[] args) throws Exception {
+    double[] a = new double[256*1024];
+    double[] b = new double[256*1024];
+    prodReductionInit(a,b);
+    double valid = 2000;
+    double total = 0;
+    for(int j = 0; j < 2000; j++) {
+      total = j + 1;
+      total = prodReductionImplement(a,b, total);
+    }
+    if(total == valid) {
+      System.out.println("Success");
+    } else {
+      System.out.println("Invalid sum of elements variable in total: " + total);
+      System.out.println("Expected value = " + valid);
+      throw new Exception("Failed");
+    }
+  }
+
+  public static void prodReductionInit(double[] a, double[] b)
+  {
+    for(int i = 0; i < a.length; i++)
+    {
+      a[i] = i + 2;
+      b[i] = i + 1;
+    }
+  }
+
+  public static double prodReductionImplement(double[] a, double[] b, double total)
+  {
+    for(int i = 0; i < a.length; i++)
+    {
+      total *= a[i] - b[i];
+    }
+    return total;
+  }
+
+}
--- a/hotspot/test/compiler/loopopts/superword/ProdRed_Float.java
+++ b/hotspot/test/compiler/loopopts/superword/ProdRed_Float.java
@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 8074981
+ * @summary Add C2 x86 Superword support for scalar product reduction optimizations : float test
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 ProdRed_Float
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 ProdRed_Float
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 ProdRed_Float
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 ProdRed_Float
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 ProdRed_Float
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 ProdRed_Float
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 ProdRed_Float
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 ProdRed_Float
+ */
+
+public class ProdRed_Float
+{
+  public static void main(String[] args) throws Exception {
+    float[] a = new float[256*1024];
+    float[] b = new float[256*1024];
+    prodReductionInit(a,b);
+    float valid = 2000;
+    float total = 0;
+    for(int j = 0; j < 2000; j++) {
+      total = j + 1;
+      total = prodReductionImplement(a,b, total);
+    }
+    if(total == valid) {
+      System.out.println("Success");
+    } else {
+      System.out.println("Invalid sum of elements variable in total: " + total);
+      System.out.println("Expected value = " + valid);
+      throw new Exception("Failed");
+    }
+  }
+
+  public static void prodReductionInit(float[] a, float[] b)
+  {
+    for(int i = 0; i < a.length; i++)
+    {
+      a[i] = i + 2;
+      b[i] = i + 1;
+    }
+  }
+
+  public static float prodReductionImplement(float[] a, float[] b, float total)
+  {
+    for(int i = 0; i < a.length; i++)
+    {
+      total *= a[i] - b[i];
+    }
+    return total;
+  }
+
+}
--- a/hotspot/test/compiler/loopopts/superword/ProdRed_Int.java
+++ b/hotspot/test/compiler/loopopts/superword/ProdRed_Int.java
@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 8074981
+ * @summary Add C2 x86 Superword support for scalar product reduction optimizations : int test
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 ProdRed_Int
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 ProdRed_Int
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 ProdRed_Int
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 ProdRed_Int
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 ProdRed_Int
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 ProdRed_Int
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 ProdRed_Int
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 ProdRed_Int
+ */
+
+public class ProdRed_Int
+{
+  public static void main(String[] args) throws Exception {
+    int[] a = new int[256*1024];
+    int[] b = new int[256*1024];
+    prodReductionInit(a,b);
+    int valid = 419430401;
+    int total = 1;
+    for(int j = 0; j < 2000; j++) {
+      total = prodReductionImplement(a,b,total);
+    }
+    if(total == valid) {
+      System.out.println("Success");
+    } else {
+      System.out.println("Invalid sum of elements variable in total: " + total);
+      System.out.println("Expected value = " + valid);
+      throw new Exception("Failed");
+    }
+  }
+
+  public static void prodReductionInit(int[] a, int[] b)
+  {
+    for(int i = 0; i < a.length; i++)
+    {
+      a[i] = i + 2;
+      b[i] = i + 1;
+    }
+  }
+
+  public static int prodReductionImplement(int[] a, int[] b, int total)
+  {
+    for(int i = 0; i < a.length; i++)
+    {
+      total *= a[i] + b[i];
+    }
+    return total;
+  }
+
+}
--- a/hotspot/test/compiler/loopopts/superword/SumRed_Double.java
+++ b/hotspot/test/compiler/loopopts/superword/SumRed_Double.java
@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 8074981
+ * @summary Add C2 x86 Superword support for scalar sum reduction optimizations : double test
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRed_Double
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRed_Double
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Double
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Double
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Double
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Double
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRed_Double
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRed_Double
+ */
+
+public class SumRed_Double
+{
+  public static void main(String[] args) throws Exception {
+    double[] a = new double[256*1024];
+    double[] b = new double[256*1024];
+    double[] c = new double[256*1024];
+    double[] d = new double[256*1024];
+    sumReductionInit(a,b,c);
+    double total = 0;
+    double valid = 3.6028590866691944E19;
+    for(int j = 0; j < 2000; j++) {
+      total = sumReductionImplement(a,b,c,d,total);
+    }
+    if(total == valid) {
+      System.out.println("Success");
+    } else {
+      System.out.println("Invalid sum of elements variable in total: " + total);
+      System.out.println("Expected value = " + valid);
+      throw new Exception("Failed");
+    }
+  }
+
+  public static void sumReductionInit(
+    double[] a,
+    double[] b,
+    double[] c)
+  {
+    for(int j = 0; j < 1; j++)
+    {
+      for(int i = 0; i < a.length; i++)
+      {
+        a[i] = i * 1 + j;
+        b[i] = i * 1 - j;
+        c[i] = i + j;
+      }
+    }
+  }
+
+  public static double sumReductionImplement(
+    double[] a,
+    double[] b,
+    double[] c,
+    double[] d,
+    double total)
+  {
+    for(int i = 0; i < a.length; i++)
+    {
+      d[i]= (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
+      total += d[i];
+    }
+    return total;
+  }
+
+}
--- a/hotspot/test/compiler/loopopts/superword/SumRed_Float.java
+++ b/hotspot/test/compiler/loopopts/superword/SumRed_Float.java
@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 8074981
+ * @summary Add C2 x86 Superword support for scalar sum reduction optimizations : float test
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRed_Float
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRed_Float
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Float
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Float
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Float
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Float
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRed_Float
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRed_Float
+ */
+
+public class SumRed_Float
+{
+  public static void main(String[] args) throws Exception {
+    float[] a = new float[256*1024];
+    float[] b = new float[256*1024];
+    float[] c = new float[256*1024];
+    float[] d = new float[256*1024];
+    sumReductionInit(a,b,c);
+    float total = 0;
+    float valid = (float)4.611686E18;
+    for(int j = 0; j < 2000; j++) {
+      total = sumReductionImplement(a,b,c,d,total);
+    }
+    if(total == valid) {
+      System.out.println("Success");
+    } else {
+      System.out.println("Invalid sum of elements variable in total: " + total);
+      System.out.println("Expected value = " + valid);
+      throw new Exception("Failed");
+    }
+  }
+
+  public static void sumReductionInit(
+    float[] a,
+    float[] b,
+    float[] c)
+  {
+    for(int j = 0; j < 1; j++)
+    {
+      for(int i = 0; i < a.length; i++)
+      {
+        a[i] = i * 1 + j;
+        b[i] = i * 1 - j;
+        c[i] = i + j;
+      }
+    }
+  }
+
+  public static float sumReductionImplement(
+    float[] a,
+    float[] b,
+    float[] c,
+    float[] d,
+    float total)
+  {
+    for(int i = 0; i < a.length; i++)
+    {
+      d[i]= (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
+      total += d[i];
+    }
+    return total;
+  }
+
+}
--- a/hotspot/test/compiler/loopopts/superword/SumRed_Int.java
+++ b/hotspot/test/compiler/loopopts/superword/SumRed_Int.java
@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+/**
+ * @test
+ * @bug 8074981
+ * @summary Add C2 x86 Superword support for scalar sum reduction optimizations : int test
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRed_Int
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRed_Int
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Int
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Int
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Int
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Int
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRed_Int
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRed_Int
+ */
+
+public class SumRed_Int
+{
+  public static void main(String[] args) throws Exception {
+    int[] a = new int[256*1024];
+    int[] b = new int[256*1024];
+    int[] c = new int[256*1024];
+    int[] d = new int[256*1024];
+    sumReductionInit(a,b,c);
+    int total = 0;
+    int valid = 262144000;
+    for(int j = 0; j < 2000; j++) {
+      total = sumReductionImplement(a,b,c,d,total);
+    }
+    if(total == valid) {
+      System.out.println("Success");
+    } else {
+      System.out.println("Invalid sum of elements variable in total: " + total);
+      System.out.println("Expected value = " + valid);
+      throw new Exception("Failed");
+    }
+  }
+
+  public static void sumReductionInit(
+    int[] a,
+    int[] b,
+    int[] c)
+  {
+    for(int j = 0; j < 1; j++)
+    {
+      for(int i = 0; i < a.length; i++)
+      {
+        a[i] = i * 1 + j;
+        b[i] = i * 1 - j;
+        c[i] = i + j;
+      }
+    }
+  }
+
+  public static int sumReductionImplement(
+    int[] a,
+    int[] b,
+    int[] c,
+    int[] d,
+    int total)
+  {
+    for(int i = 0; i < a.length; i++)
+    {
+      d[i]= (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
+      total += d[i];
+    }
+    return total;
+  }
+
+}