mirror of
https://github.com/openjdk/jdk.git
synced 2025-08-28 15:24:43 +02:00
8074981: Integer/FP scalar reduction optimization
Add scalar reduction optimization to C2 to take advantage of vector instructions in modern x86 CPUs. Reviewed-by: kvn, twisti
This commit is contained in:
parent
7c5d30b0e3
commit
9e55e44c85
22 changed files with 1599 additions and 20 deletions
|
@ -40,7 +40,7 @@ if [ $# -lt 1 ]; then
|
|||
exit 1
|
||||
fi
|
||||
|
||||
if [ "${JAVA_HOME-}" = "" -o ! -d "${JAVA_HOME-}" -o ! -d ${JAVA_HOME-}/jre/lib/ ]; then
|
||||
if [ "${JAVA_HOME-}" = "" -o ! -d "${JAVA_HOME-}" ]; then
|
||||
echo "JAVA_HOME needs to be set to a valid JDK path"
|
||||
echo "JAVA_HOME: ${JAVA_HOME-}"
|
||||
exit 1
|
||||
|
|
|
@ -3359,6 +3359,20 @@ void Assembler::vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vecto
|
|||
|
||||
|
||||
// Integer vector arithmetic
|
||||
// Emits the VEX-encoded horizontal add of packed 16-bit integers (VPHADDW):
// prefix built for (dst, nds, src) with the 66 / 0F 38 escape, opcode 0x01,
// then a register-direct ModRM byte.
// vector256 selects the 256-bit form; the assert only accepts that form when
// AVX2 is reported, otherwise plain AVX is sufficient.
void Assembler::vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(),
         "256 bit integer vectors requires AVX2");
  const int reg_bits = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38);
  emit_int8(0x01);                               // opcode
  emit_int8((unsigned char)(0xC0 | reg_bits));   // ModRM with mod == 11 (register operands)
}
|
||||
|
||||
// Emits the VEX-encoded horizontal add of packed 32-bit integers (VPHADDD):
// prefix built for (dst, nds, src) with the 66 / 0F 38 escape, opcode 0x02,
// then a register-direct ModRM byte.
// vector256 selects the 256-bit form; the assert only accepts that form when
// AVX2 is reported, otherwise plain AVX is sufficient.
void Assembler::vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
  assert((VM_Version::supports_avx() && !vector256) || VM_Version::supports_avx2(),
         "256 bit integer vectors requires AVX2");
  const int reg_bits = vex_prefix_and_encode(dst, nds, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38);
  emit_int8(0x02);                               // opcode
  emit_int8((unsigned char)(0xC0 | reg_bits));   // ModRM with mod == 11 (register operands)
}
|
||||
|
||||
void Assembler::paddb(XMMRegister dst, XMMRegister src) {
|
||||
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
|
||||
emit_simd_arith(0xFC, dst, src, VEX_SIMD_66);
|
||||
|
@ -3379,6 +3393,20 @@ void Assembler::paddq(XMMRegister dst, XMMRegister src) {
|
|||
emit_simd_arith(0xD4, dst, src, VEX_SIMD_66);
|
||||
}
|
||||
|
||||
// Emits the legacy-SSE horizontal add of packed 16-bit integers (PHADDW dst, src):
// 66 / 0F 38 prefix, opcode 0x01, register-direct ModRM. dst is both the first
// source and the destination.
void Assembler::phaddw(XMMRegister dst, XMMRegister src) {
  // The feature check is unconditional: x86-64 does not imply this extension,
  // so hiding the assert behind NOT_LP64 left 64-bit builds unchecked.
  // NOTE(review): PHADDW is listed by Intel as an SSSE3 instruction; consider
  // tightening this to an SSSE3 check together with the matcher predicates.
  assert(VM_Version::supports_sse3(), "");
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
  emit_int8(0x01);                             // opcode
  emit_int8((unsigned char)(0xC0 | encode));   // ModRM with mod == 11 (register operands)
}
|
||||
|
||||
// Emits the legacy-SSE horizontal add of packed 32-bit integers (PHADDD dst, src):
// 66 / 0F 38 prefix, opcode 0x02, register-direct ModRM. dst is both the first
// source and the destination.
void Assembler::phaddd(XMMRegister dst, XMMRegister src) {
  // The feature check is unconditional: x86-64 does not imply this extension,
  // so hiding the assert behind NOT_LP64 left 64-bit builds unchecked.
  // NOTE(review): PHADDD is listed by Intel as an SSSE3 instruction; consider
  // tightening this to an SSSE3 check together with the matcher predicates.
  assert(VM_Version::supports_sse3(), "");
  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_38);
  emit_int8(0x02);                             // opcode
  emit_int8((unsigned char)(0xC0 | encode));   // ModRM with mod == 11 (register operands)
}
|
||||
|
||||
void Assembler::vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256) {
|
||||
assert(VM_Version::supports_avx() && !vector256 || VM_Version::supports_avx2(), "256 bit integer vectors requires AVX2");
|
||||
emit_vex_arith(0xFC, dst, nds, src, VEX_SIMD_66, vector256);
|
||||
|
@ -3804,6 +3832,17 @@ void Assembler::vinsertf128h(XMMRegister dst, Address src) {
|
|||
emit_int8(0x01);
|
||||
}
|
||||
|
||||
// Register-to-register VEXTRACTF128 with immediate 0x01: copies the upper
// 128 bits of the 256-bit register src into dst.
void Assembler::vextractf128h(XMMRegister dst, XMMRegister src) {
  assert(VM_Version::supports_avx(), "");
  // src is passed first and dst last: the prefix helper is handed the 256-bit
  // source as its leading register operand and the destination as the trailing one.
  const int reg_bits = vex_prefix_and_encode(src, xnoreg, dst, VEX_SIMD_66,
                                             true /* vector256 */, VEX_OPCODE_0F_3A);
  emit_int8(0x19);                               // opcode
  emit_int8((unsigned char)(0xC0 | reg_bits));   // ModRM with mod == 11 (register operands)
  // Immediate selects the half to extract:
  // 0x00 - extract from lower 128 bits
  // 0x01 - extract from upper 128 bits
  emit_int8(0x01);
}
|
||||
|
||||
void Assembler::vextractf128h(Address dst, XMMRegister src) {
|
||||
assert(VM_Version::supports_avx(), "");
|
||||
InstructionMark im(this);
|
||||
|
|
|
@ -1777,6 +1777,12 @@ private:
|
|||
void vxorpd(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
void vxorps(XMMRegister dst, XMMRegister nds, Address src, bool vector256);
|
||||
|
||||
// Add horizontal packed integers
|
||||
void vphaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void vphaddd(XMMRegister dst, XMMRegister nds, XMMRegister src, bool vector256);
|
||||
void phaddw(XMMRegister dst, XMMRegister src);
|
||||
void phaddd(XMMRegister dst, XMMRegister src);
|
||||
|
||||
// Add packed integers
|
||||
void paddb(XMMRegister dst, XMMRegister src);
|
||||
void paddw(XMMRegister dst, XMMRegister src);
|
||||
|
@ -1869,6 +1875,7 @@ private:
|
|||
// Copy low 128bit into high 128bit of YMM registers.
|
||||
void vinsertf128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
|
||||
void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src);
|
||||
void vextractf128h(XMMRegister dst, XMMRegister src);
|
||||
|
||||
// Load/store high 128bit of YMM registers which does not destroy other half.
|
||||
void vinsertf128h(XMMRegister dst, Address src);
|
||||
|
|
|
@ -623,6 +623,22 @@ const bool Matcher::match_rule_supported(int opcode) {
|
|||
if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
|
||||
return false;
|
||||
break;
|
||||
case Op_AddReductionVL:
|
||||
if (UseAVX < 3) // only EVEX : vector connectivity becomes an issue here
|
||||
return false;
|
||||
case Op_AddReductionVI:
|
||||
if (UseSSE < 3) // requires at least SSE3
|
||||
return false;
|
||||
case Op_MulReductionVI:
|
||||
if (UseSSE < 4) // requires at least SSE4
|
||||
return false;
|
||||
case Op_AddReductionVF:
|
||||
case Op_AddReductionVD:
|
||||
case Op_MulReductionVF:
|
||||
case Op_MulReductionVD:
|
||||
if (UseSSE < 1) // requires at least SSE
|
||||
return false;
|
||||
break;
|
||||
case Op_CompareAndSwapL:
|
||||
#ifdef _LP64
|
||||
case Op_CompareAndSwapP:
|
||||
|
@ -2532,6 +2548,574 @@ instruct Repl4D_zero(vecY dst, immD0 zero) %{
|
|||
ins_pipe( fpu_reg_reg );
|
||||
%}
|
||||
|
||||
// ====================REDUCTION ARITHMETIC=======================================
|
||||
|
||||
// Integer add-reduction of a 2-lane vector (vecD), SSE-only path:
// dst = src1 + src2[0] + src2[1].
instruct rsadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
  predicate(UseSSE > 2 && UseAVX == 0);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp2, TEMP tmp);
  format %{ "movdqu $tmp2,$src2\n\t"
            "phaddd $tmp2,$tmp2\n\t"
            "movd $tmp,$src1\n\t"
            "paddd $tmp,$tmp2\n\t"
            "movd $dst,$tmp\t! add reduction2I" %}
  ins_encode %{
    const XMMRegister lanes = $tmp2$$XMMRegister;
    const XMMRegister total = $tmp$$XMMRegister;
    __ movdqu(lanes, $src2$$XMMRegister);   // work on a copy so src2 is left untouched
    __ phaddd(lanes, lanes);                // lane 0 = src2[0] + src2[1]
    __ movdl(total, $src1$$Register);       // bring the scalar input into an XMM register
    __ paddd(total, lanes);
    __ movdl($dst$$Register, total);        // low 32 bits hold the result
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Integer add-reduction of a 2-lane vector (vecD), AVX path:
// dst = src1 + src2[0] + src2[1].
instruct rvadd2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vphaddd $tmp,$src2,$src2\n\t"
            "movd $tmp2,$src1\n\t"
            "vpaddd $tmp2,$tmp2,$tmp\n\t"
            "movd $dst,$tmp2\t! add reduction2I" %}
  ins_encode %{
    const XMMRegister vec   = $src2$$XMMRegister;
    const XMMRegister lanes = $tmp$$XMMRegister;
    const XMMRegister total = $tmp2$$XMMRegister;
    __ vphaddd(lanes, vec, vec, false);      // lane 0 = src2[0] + src2[1]
    __ movdl(total, $src1$$Register);        // bring the scalar input into an XMM register
    __ vpaddd(total, total, lanes, false);
    __ movdl($dst$$Register, total);         // low 32 bits hold the result
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Integer add-reduction of a 4-lane vector (vecX), SSE-only path:
// dst = src1 + src2[0] + src2[1] + src2[2] + src2[3].
instruct rsadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseSSE > 2 && UseAVX == 0);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp2, TEMP tmp);
  format %{ "movdqu $tmp2,$src2\n\t"
            "phaddd $tmp2,$tmp2\n\t"
            "phaddd $tmp2,$tmp2\n\t"
            "movd $tmp,$src1\n\t"
            "paddd $tmp,$tmp2\n\t"
            "movd $dst,$tmp\t! add reduction4I" %}
  ins_encode %{
    const XMMRegister lanes = $tmp2$$XMMRegister;
    const XMMRegister total = $tmp$$XMMRegister;
    __ movdqu(lanes, $src2$$XMMRegister);   // work on a copy so src2 is left untouched
    __ phaddd(lanes, lanes);                // pairwise sums
    __ phaddd(lanes, lanes);                // lane 0 = sum of all four elements
    __ movdl(total, $src1$$Register);       // bring the scalar input into an XMM register
    __ paddd(total, lanes);
    __ movdl($dst$$Register, total);        // low 32 bits hold the result
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Integer add-reduction of a 4-lane vector (vecX), AVX path:
// dst = src1 + src2[0] + src2[1] + src2[2] + src2[3].
instruct rvadd4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vphaddd $tmp,$src2,$src2\n\t"
            "vphaddd $tmp,$tmp,$tmp2\n\t"
            "movd $tmp2,$src1\n\t"
            "vpaddd $tmp2,$tmp2,$tmp\n\t"
            "movd $dst,$tmp2\t! add reduction4I" %}
  ins_encode %{
    const XMMRegister vec     = $src2$$XMMRegister;
    const XMMRegister lanes   = $tmp$$XMMRegister;
    const XMMRegister scratch = $tmp2$$XMMRegister;
    __ vphaddd(lanes, vec, vec, false);            // pairwise sums
    // NOTE(review): scratch ($tmp2) is read here before this sequence writes it.
    // It appears to feed only the upper result lanes, and only lane 0 is
    // consumed below - confirm against the VPHADDD definition.
    __ vphaddd(lanes, lanes, scratch, false);      // lane 0 = sum of all four elements
    __ movdl(scratch, $src1$$Register);            // bring the scalar input into an XMM register
    __ vpaddd(scratch, scratch, lanes, false);
    __ movdl($dst$$Register, scratch);             // low 32 bits hold the result
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Integer add-reduction of an 8-lane vector (vecY), AVX path:
// dst = src1 + sum of the eight elements of src2.
instruct rvadd8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vphaddd $tmp,$src2,$src2\n\t"
            "vphaddd $tmp,$tmp,$tmp2\n\t"
            "vextractf128 $tmp2,$tmp\n\t"
            "vpaddd $tmp,$tmp,$tmp2\n\t"
            "movd $tmp2,$src1\n\t"
            "vpaddd $tmp2,$tmp2,$tmp\n\t"
            "movd $dst,$tmp2\t! add reduction8I" %}
  ins_encode %{
    const XMMRegister vec     = $src2$$XMMRegister;
    const XMMRegister lanes   = $tmp$$XMMRegister;
    const XMMRegister scratch = $tmp2$$XMMRegister;
    // The two horizontal adds use the 256-bit form (vector256 == true).
    __ vphaddd(lanes, vec, vec, true);
    // NOTE(review): scratch ($tmp2) is read here before this sequence writes it;
    // only lane 0 of each 128-bit half is consumed below - confirm it is unaffected.
    __ vphaddd(lanes, lanes, scratch, true);
    __ vextractf128h(scratch, lanes);              // upper-half partial sum
    __ vpaddd(lanes, lanes, scratch, false);       // combine lower and upper halves
    __ movdl(scratch, $src1$$Register);            // bring the scalar input into an XMM register
    __ vpaddd(scratch, scratch, lanes, false);
    __ movdl($dst$$Register, scratch);             // low 32 bits hold the result
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Float add-reduction of a 2-lane vector (vecD), SSE-only path.
// Lanes are accumulated strictly in order: (src1 + src2[0]) + src2[1].
instruct rsadd2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
  predicate(UseSSE >= 1 && UseAVX == 0);
  match(Set dst (AddReductionVF src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "movdqu $tmp,$src1\n\t"
            "addss $tmp,$src2\n\t"
            "pshufd $tmp2,$src2,0x01\n\t"
            "addss $tmp,$tmp2\n\t"
            "movdqu $dst,$tmp\t! add reduction2F" %}
  ins_encode %{
    const XMMRegister vec  = $src2$$XMMRegister;
    const XMMRegister acc  = $tmp$$XMMRegister;
    const XMMRegister lane = $tmp2$$XMMRegister;
    __ movdqu(acc, $src1$$XMMRegister);   // accumulate in a temp, src1 stays intact
    __ addss(acc, vec);                   // + src2[0]
    __ pshufd(lane, vec, 0x01);           // move src2[1] into lane 0
    __ addss(acc, lane);
    __ movdqu($dst$$XMMRegister, acc);
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Float add-reduction of a 2-lane vector (vecD), AVX path.
// Lanes are accumulated strictly in order: (src1 + src2[0]) + src2[1].
instruct rvadd2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddReductionVF src1 src2));
  effect(TEMP tmp2, TEMP tmp);
  format %{ "vaddss $tmp2,$src1,$src2\n\t"
            "pshufd $tmp,$src2,0x01\n\t"
            "vaddss $dst,$tmp2,$tmp\t! add reduction2F" %}
  ins_encode %{
    const XMMRegister vec  = $src2$$XMMRegister;
    const XMMRegister acc  = $tmp2$$XMMRegister;
    const XMMRegister lane = $tmp$$XMMRegister;
    __ vaddss(acc, $src1$$XMMRegister, vec);   // src1 + src2[0]
    __ pshufd(lane, vec, 0x01);                // move src2[1] into lane 0
    __ vaddss($dst$$XMMRegister, acc, lane);
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Float add-reduction of a 4-lane vector (vecX), SSE-only path.
// Lanes are accumulated strictly in order, starting from src1.
instruct rsadd4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseSSE >= 1 && UseAVX == 0);
  match(Set dst (AddReductionVF src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "movdqu $tmp,$src1\n\t"
            "addss $tmp,$src2\n\t"
            "pshufd $tmp2,$src2,0x01\n\t"
            "addss $tmp,$tmp2\n\t"
            "pshufd $tmp2,$src2,0x02\n\t"
            "addss $tmp,$tmp2\n\t"
            "pshufd $tmp2,$src2,0x03\n\t"
            "addss $tmp,$tmp2\n\t"
            "movdqu $dst,$tmp\t! add reduction4F" %}
  ins_encode %{
    const XMMRegister vec  = $src2$$XMMRegister;
    const XMMRegister acc  = $tmp$$XMMRegister;
    const XMMRegister lane = $tmp2$$XMMRegister;
    __ movdqu(acc, $src1$$XMMRegister);   // accumulate in a temp, src1 stays intact
    __ addss(acc, vec);                   // + src2[0]
    __ pshufd(lane, vec, 0x01);           // src2[1] -> lane 0
    __ addss(acc, lane);
    __ pshufd(lane, vec, 0x02);           // src2[2] -> lane 0
    __ addss(acc, lane);
    __ pshufd(lane, vec, 0x03);           // src2[3] -> lane 0
    __ addss(acc, lane);
    __ movdqu($dst$$XMMRegister, acc);
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Float add-reduction of a 4-lane vector (vecX), AVX path.
// Lanes are accumulated strictly in order, starting from src1.
instruct rvadd4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddReductionVF src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vaddss $tmp2,$src1,$src2\n\t"
            "pshufd $tmp,$src2,0x01\n\t"
            "vaddss $tmp2,$tmp2,$tmp\n\t"
            "pshufd $tmp,$src2,0x02\n\t"
            "vaddss $tmp2,$tmp2,$tmp\n\t"
            "pshufd $tmp,$src2,0x03\n\t"
            "vaddss $dst,$tmp2,$tmp\t! add reduction4F" %}
  ins_encode %{
    const XMMRegister vec  = $src2$$XMMRegister;
    const XMMRegister acc  = $tmp2$$XMMRegister;
    const XMMRegister lane = $tmp$$XMMRegister;
    __ vaddss(acc, $src1$$XMMRegister, vec);   // src1 + src2[0]
    __ pshufd(lane, vec, 0x01);                // src2[1] -> lane 0
    __ vaddss(acc, acc, lane);
    __ pshufd(lane, vec, 0x02);                // src2[2] -> lane 0
    __ vaddss(acc, acc, lane);
    __ pshufd(lane, vec, 0x03);                // src2[3] -> lane 0
    __ vaddss($dst$$XMMRegister, acc, lane);
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Float add-reduction of an 8-lane vector (vecY), AVX path.
// The lower four lanes are accumulated in order, then the upper 128 bits are
// extracted into tmp3 and its four lanes are accumulated the same way.
instruct radd8F_reduction_reg(regF dst, regF src1, vecY src2, regF tmp, regF tmp2, regF tmp3) %{
  predicate(UseAVX > 0);
  match(Set dst (AddReductionVF src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "vaddss $tmp2,$src1,$src2\n\t"
            "pshufd $tmp,$src2,0x01\n\t"
            "vaddss $tmp2,$tmp2,$tmp\n\t"
            "pshufd $tmp,$src2,0x02\n\t"
            "vaddss $tmp2,$tmp2,$tmp\n\t"
            "pshufd $tmp,$src2,0x03\n\t"
            "vaddss $tmp2,$tmp2,$tmp\n\t"
            "vextractf128 $tmp3,$src2\n\t"
            "vaddss $tmp2,$tmp2,$tmp3\n\t"
            "pshufd $tmp,$tmp3,0x01\n\t"
            "vaddss $tmp2,$tmp2,$tmp\n\t"
            "pshufd $tmp,$tmp3,0x02\n\t"
            "vaddss $tmp2,$tmp2,$tmp\n\t"
            "pshufd $tmp,$tmp3,0x03\n\t"
            "vaddss $dst,$tmp2,$tmp\t! add reduction8F" %}
  ins_encode %{
    const XMMRegister vec  = $src2$$XMMRegister;
    const XMMRegister acc  = $tmp2$$XMMRegister;
    const XMMRegister lane = $tmp$$XMMRegister;
    const XMMRegister high = $tmp3$$XMMRegister;

    // Lower 128 bits of src2.
    __ vaddss(acc, $src1$$XMMRegister, vec);
    __ pshufd(lane, vec, 0x01);
    __ vaddss(acc, acc, lane);
    __ pshufd(lane, vec, 0x02);
    __ vaddss(acc, acc, lane);
    __ pshufd(lane, vec, 0x03);
    __ vaddss(acc, acc, lane);

    // Upper 128 bits of src2.
    __ vextractf128h(high, vec);
    __ vaddss(acc, acc, high);
    __ pshufd(lane, high, 0x01);
    __ vaddss(acc, acc, lane);
    __ pshufd(lane, high, 0x02);
    __ vaddss(acc, acc, lane);
    __ pshufd(lane, high, 0x03);
    __ vaddss($dst$$XMMRegister, acc, lane);
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Double add-reduction of a 2-lane vector (vecX), SSE-only path.
// Computes src2[1] + (src1 + src2[0]); dst is declared TEMP because it is
// written before the sequence has finished reading its inputs.
instruct rsadd2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp) %{
  predicate(UseSSE >= 1 && UseAVX == 0);
  match(Set dst (AddReductionVD src1 src2));
  effect(TEMP tmp, TEMP dst);
  format %{ "movdqu $tmp,$src1\n\t"
            "addsd $tmp,$src2\n\t"
            "pshufd $dst,$src2,0xE\n\t"
            "addsd $dst,$tmp\t! add reduction2D" %}
  ins_encode %{
    const XMMRegister vec    = $src2$$XMMRegister;
    const XMMRegister acc    = $tmp$$XMMRegister;
    const XMMRegister result = $dst$$XMMRegister;
    __ movdqu(acc, $src1$$XMMRegister);   // accumulate in a temp, src1 stays intact
    __ addsd(acc, vec);                   // + src2[0]
    __ pshufd(result, vec, 0xE);          // upper double of src2 -> low 64 bits of dst
    __ addsd(result, acc);
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Double add-reduction of a 2-lane vector (vecX), AVX path:
// (src1 + src2[0]) + src2[1].
instruct rvadd2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp, regD tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (AddReductionVD src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vaddsd $tmp2,$src1,$src2\n\t"
            "pshufd $tmp,$src2,0xE\n\t"
            "vaddsd $dst,$tmp2,$tmp\t! add reduction2D" %}
  ins_encode %{
    const XMMRegister vec  = $src2$$XMMRegister;
    const XMMRegister acc  = $tmp2$$XMMRegister;
    const XMMRegister lane = $tmp$$XMMRegister;
    __ vaddsd(acc, $src1$$XMMRegister, vec);   // src1 + src2[0]
    __ pshufd(lane, vec, 0xE);                 // upper double of src2 -> low 64 bits
    __ vaddsd($dst$$XMMRegister, acc, lane);
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Double add-reduction of a 4-lane vector (vecY), AVX path.
// The two doubles of the lower 128 bits are accumulated first, then the upper
// 128 bits are extracted into tmp3 and its two doubles are accumulated.
instruct rvadd4D_reduction_reg(regD dst, regD src1, vecY src2, regD tmp, regD tmp2, regD tmp3) %{
  predicate(UseAVX > 0);
  match(Set dst (AddReductionVD src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "vaddsd $tmp2,$src1,$src2\n\t"
            "pshufd $tmp,$src2,0xE\n\t"
            "vaddsd $tmp2,$tmp2,$tmp\n\t"
            "vextractf128 $tmp3,$src2\n\t"
            "vaddsd $tmp2,$tmp2,$tmp3\n\t"
            "pshufd $tmp,$tmp3,0xE\n\t"
            "vaddsd $dst,$tmp2,$tmp\t! add reduction4D" %}
  ins_encode %{
    const XMMRegister vec  = $src2$$XMMRegister;
    const XMMRegister acc  = $tmp2$$XMMRegister;
    const XMMRegister lane = $tmp$$XMMRegister;
    const XMMRegister high = $tmp3$$XMMRegister;

    // Lower 128 bits of src2.
    __ vaddsd(acc, $src1$$XMMRegister, vec);
    __ pshufd(lane, vec, 0xE);
    __ vaddsd(acc, acc, lane);

    // Upper 128 bits of src2.
    __ vextractf128h(high, vec);
    __ vaddsd(acc, acc, high);
    __ pshufd(lane, high, 0xE);
    __ vaddsd($dst$$XMMRegister, acc, lane);
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Integer multiply-reduction of a 2-lane vector (vecD), SSE4 path:
// dst = src1 * src2[0] * src2[1].
instruct rsmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
  predicate(UseSSE > 3 && UseAVX == 0);
  match(Set dst (MulReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd $tmp2,$src2,0x1\n\t"
            "pmulld $tmp2,$src2\n\t"
            "movd $tmp,$src1\n\t"
            "pmulld $tmp2,$tmp\n\t"
            "movd $dst,$tmp2\t! mul reduction2I" %}
  ins_encode %{
    const XMMRegister vec    = $src2$$XMMRegister;
    const XMMRegister prod   = $tmp2$$XMMRegister;
    const XMMRegister scalar = $tmp$$XMMRegister;
    __ pshufd(prod, vec, 0x1);              // src2[1] -> lane 0
    __ pmulld(prod, vec);                   // lane 0 = src2[1] * src2[0]
    __ movdl(scalar, $src1$$Register);      // bring the scalar input into an XMM register
    __ pmulld(prod, scalar);
    __ movdl($dst$$Register, prod);         // low 32 bits hold the result
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Integer multiply-reduction of a 2-lane vector (vecD), AVX path:
// dst = src1 * src2[0] * src2[1].
instruct rvmul2I_reduction_reg(rRegI dst, rRegI src1, vecD src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd $tmp2,$src2,0x1\n\t"
            "vpmulld $tmp,$src2,$tmp2\n\t"
            "movd $tmp2,$src1\n\t"
            "vpmulld $tmp2,$tmp,$tmp2\n\t"
            "movd $dst,$tmp2\t! mul reduction2I" %}
  ins_encode %{
    const XMMRegister vec     = $src2$$XMMRegister;
    const XMMRegister prod    = $tmp$$XMMRegister;
    const XMMRegister scratch = $tmp2$$XMMRegister;
    __ pshufd(scratch, vec, 0x1);                 // src2[1] -> lane 0
    __ vpmulld(prod, vec, scratch, false);        // lane 0 = src2[0] * src2[1]
    __ movdl(scratch, $src1$$Register);           // scratch is reused for the scalar input
    __ vpmulld(scratch, prod, scratch, false);
    __ movdl($dst$$Register, scratch);            // low 32 bits hold the result
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Integer multiply-reduction of a 4-lane vector (vecX), SSE4 path:
// dst = src1 * src2[0] * src2[1] * src2[2] * src2[3].
instruct rsmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseSSE > 3 && UseAVX == 0);
  match(Set dst (MulReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd $tmp2,$src2,0xE\n\t"
            "pmulld $tmp2,$src2\n\t"
            "pshufd $tmp,$tmp2,0x1\n\t"
            "pmulld $tmp2,$tmp\n\t"
            "movd $tmp,$src1\n\t"
            "pmulld $tmp2,$tmp\n\t"
            "movd $dst,$tmp2\t! mul reduction4I" %}
  ins_encode %{
    const XMMRegister vec     = $src2$$XMMRegister;
    const XMMRegister prod    = $tmp2$$XMMRegister;
    const XMMRegister scratch = $tmp$$XMMRegister;
    __ pshufd(prod, vec, 0xE);                // upper two lanes -> lower two lanes
    __ pmulld(prod, vec);                     // lanes 0,1 = pairwise products
    __ pshufd(scratch, prod, 0x1);            // lane 1 -> lane 0
    __ pmulld(prod, scratch);                 // lane 0 = product of all four elements
    __ movdl(scratch, $src1$$Register);       // scratch is reused for the scalar input
    __ pmulld(prod, scratch);
    __ movdl($dst$$Register, prod);           // low 32 bits hold the result
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Integer multiply-reduction of a 4-lane vector (vecX), AVX path:
// dst = src1 * src2[0] * src2[1] * src2[2] * src2[3].
instruct rvmul4I_reduction_reg(rRegI dst, rRegI src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "pshufd $tmp2,$src2,0xE\n\t"
            "vpmulld $tmp,$src2,$tmp2\n\t"
            "pshufd $tmp2,$tmp,0x1\n\t"
            "vpmulld $tmp,$tmp,$tmp2\n\t"
            "movd $tmp2,$src1\n\t"
            "vpmulld $tmp2,$tmp,$tmp2\n\t"
            "movd $dst,$tmp2\t! mul reduction4I" %}
  ins_encode %{
    const XMMRegister vec     = $src2$$XMMRegister;
    const XMMRegister prod    = $tmp$$XMMRegister;
    const XMMRegister scratch = $tmp2$$XMMRegister;
    __ pshufd(scratch, vec, 0xE);                 // upper two lanes -> lower two lanes
    __ vpmulld(prod, vec, scratch, false);        // lanes 0,1 = pairwise products
    __ pshufd(scratch, prod, 0x1);                // lane 1 -> lane 0
    __ vpmulld(prod, prod, scratch, false);       // lane 0 = product of all four elements
    __ movdl(scratch, $src1$$Register);           // scratch is reused for the scalar input
    __ vpmulld(scratch, prod, scratch, false);
    __ movdl($dst$$Register, scratch);            // low 32 bits hold the result
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Integer multiply-reduction of an 8-lane vector (vecY), AVX path:
// dst = src1 * product of the eight elements of src2.
// The upper 128 bits are folded onto the lower 128 bits first; everything
// after that works on 128-bit operands (vector256 == false).
instruct rvmul8I_reduction_reg(rRegI dst, rRegI src1, vecY src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulReductionVI src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vextractf128 $tmp,$src2\n\t"
            "vpmulld $tmp,$tmp,$src2\n\t"
            "pshufd $tmp2,$tmp,0xE\n\t"
            "vpmulld $tmp,$tmp,$tmp2\n\t"
            "pshufd $tmp2,$tmp,0x1\n\t"
            "vpmulld $tmp,$tmp,$tmp2\n\t"
            "movd $tmp2,$src1\n\t"
            "vpmulld $tmp2,$tmp,$tmp2\n\t"
            "movd $dst,$tmp2\t! mul reduction8I" %}
  ins_encode %{
    const XMMRegister vec     = $src2$$XMMRegister;
    const XMMRegister prod    = $tmp$$XMMRegister;
    const XMMRegister scratch = $tmp2$$XMMRegister;
    __ vextractf128h(prod, vec);                  // upper half of src2
    __ vpmulld(prod, prod, vec, false);           // upper half * lower half, lane by lane
    __ pshufd(scratch, prod, 0xE);                // upper two lanes -> lower two lanes
    __ vpmulld(prod, prod, scratch, false);
    __ pshufd(scratch, prod, 0x1);                // lane 1 -> lane 0
    __ vpmulld(prod, prod, scratch, false);       // lane 0 = product of all eight elements
    __ movdl(scratch, $src1$$Register);           // scratch is reused for the scalar input
    __ vpmulld(scratch, prod, scratch, false);
    __ movdl($dst$$Register, scratch);            // low 32 bits hold the result
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Float multiply-reduction of a 2-lane vector (vecD), SSE-only path.
// Lanes are folded strictly in order: (src1 * src2[0]) * src2[1].
// The format trailer is labelled "mul reduction2F" so disassembly listings
// identify this as a multiply (it previously read "add reduction2F").
instruct rsmul2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
  predicate(UseSSE >= 1 && UseAVX == 0);
  match(Set dst (MulReductionVF src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "movdqu $tmp,$src1\n\t"
            "mulss $tmp,$src2\n\t"
            "pshufd $tmp2,$src2,0x01\n\t"
            "mulss $tmp,$tmp2\n\t"
            "movdqu $dst,$tmp\t! mul reduction2F" %}
  ins_encode %{
    __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);         // accumulate in a temp, src1 stays intact
    __ mulss($tmp$$XMMRegister, $src2$$XMMRegister);          // * src2[0]
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);  // src2[1] -> lane 0
    __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Float multiply-reduction of a 2-lane vector (vecD), AVX path.
// Lanes are folded strictly in order: (src1 * src2[0]) * src2[1].
// The format trailer is labelled "mul reduction2F" so disassembly listings
// identify this as a multiply (it previously read "add reduction2F").
instruct rvmul2F_reduction_reg(regF dst, regF src1, vecD src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulReductionVF src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vmulss $tmp2,$src1,$src2\n\t"
            "pshufd $tmp,$src2,0x01\n\t"
            "vmulss $dst,$tmp2,$tmp\t! mul reduction2F" %}
  ins_encode %{
    __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);  // src1 * src2[0]
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);                 // src2[1] -> lane 0
    __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Float multiply-reduction of a 4-lane vector (vecX), SSE-only path.
// Lanes are folded strictly in order, starting from src1.
// The format trailer is labelled "mul reduction4F" so disassembly listings
// identify this as a multiply (it previously read "add reduction4F").
instruct rsmul4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseSSE >= 1 && UseAVX == 0);
  match(Set dst (MulReductionVF src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "movdqu $tmp,$src1\n\t"
            "mulss $tmp,$src2\n\t"
            "pshufd $tmp2,$src2,0x01\n\t"
            "mulss $tmp,$tmp2\n\t"
            "pshufd $tmp2,$src2,0x02\n\t"
            "mulss $tmp,$tmp2\n\t"
            "pshufd $tmp2,$src2,0x03\n\t"
            "mulss $tmp,$tmp2\n\t"
            "movdqu $dst,$tmp\t! mul reduction4F" %}
  ins_encode %{
    __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);         // accumulate in a temp, src1 stays intact
    __ mulss($tmp$$XMMRegister, $src2$$XMMRegister);          // * src2[0]
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x01);  // src2[1] -> lane 0
    __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x02);  // src2[2] -> lane 0
    __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ pshufd($tmp2$$XMMRegister, $src2$$XMMRegister, 0x03);  // src2[3] -> lane 0
    __ mulss($tmp$$XMMRegister, $tmp2$$XMMRegister);
    __ movdqu($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Float multiply-reduction of a 4-lane vector (vecX), AVX path.
// Lanes are folded strictly in order, starting from src1.
// The format trailer is labelled "mul reduction4F" so disassembly listings
// identify this as a multiply (it previously read "add reduction4F").
instruct rvmul4F_reduction_reg(regF dst, regF src1, vecX src2, regF tmp, regF tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulReductionVF src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vmulss $tmp2,$src1,$src2\n\t"
            "pshufd $tmp,$src2,0x01\n\t"
            "vmulss $tmp2,$tmp2,$tmp\n\t"
            "pshufd $tmp,$src2,0x02\n\t"
            "vmulss $tmp2,$tmp2,$tmp\n\t"
            "pshufd $tmp,$src2,0x03\n\t"
            "vmulss $dst,$tmp2,$tmp\t! mul reduction4F" %}
  ins_encode %{
    __ vmulss($tmp2$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister);  // src1 * src2[0]
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x01);                 // src2[1] -> lane 0
    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x02);                 // src2[2] -> lane 0
    __ vmulss($tmp2$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
    __ pshufd($tmp$$XMMRegister, $src2$$XMMRegister, 0x03);                 // src2[3] -> lane 0
    __ vmulss($dst$$XMMRegister, $tmp2$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Float multiply-reduction of an 8-lane vector (vecY), AVX path.
// The lower four lanes are folded in order, then the upper 128 bits are
// extracted into tmp3 and its four lanes are folded the same way.
instruct rvmul8F_reduction_reg(regF dst, regF src1, vecY src2, regF tmp, regF tmp2, regF tmp3) %{
  predicate(UseAVX > 0);
  match(Set dst (MulReductionVF src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "vmulss $tmp2,$src1,$src2\n\t"
            "pshufd $tmp,$src2,0x01\n\t"
            "vmulss $tmp2,$tmp2,$tmp\n\t"
            "pshufd $tmp,$src2,0x02\n\t"
            "vmulss $tmp2,$tmp2,$tmp\n\t"
            "pshufd $tmp,$src2,0x03\n\t"
            "vmulss $tmp2,$tmp2,$tmp\n\t"
            "vextractf128 $tmp3,$src2\n\t"
            "vmulss $tmp2,$tmp2,$tmp3\n\t"
            "pshufd $tmp,$tmp3,0x01\n\t"
            "vmulss $tmp2,$tmp2,$tmp\n\t"
            "pshufd $tmp,$tmp3,0x02\n\t"
            "vmulss $tmp2,$tmp2,$tmp\n\t"
            "pshufd $tmp,$tmp3,0x03\n\t"
            "vmulss $dst,$tmp2,$tmp\t! mul reduction8F" %}
  ins_encode %{
    const XMMRegister vec  = $src2$$XMMRegister;
    const XMMRegister acc  = $tmp2$$XMMRegister;
    const XMMRegister lane = $tmp$$XMMRegister;
    const XMMRegister high = $tmp3$$XMMRegister;

    // Lower 128 bits of src2.
    __ vmulss(acc, $src1$$XMMRegister, vec);
    __ pshufd(lane, vec, 0x01);
    __ vmulss(acc, acc, lane);
    __ pshufd(lane, vec, 0x02);
    __ vmulss(acc, acc, lane);
    __ pshufd(lane, vec, 0x03);
    __ vmulss(acc, acc, lane);

    // Upper 128 bits of src2.
    __ vextractf128h(high, vec);
    __ vmulss(acc, acc, high);
    __ pshufd(lane, high, 0x01);
    __ vmulss(acc, acc, lane);
    __ pshufd(lane, high, 0x02);
    __ vmulss(acc, acc, lane);
    __ pshufd(lane, high, 0x03);
    __ vmulss($dst$$XMMRegister, acc, lane);
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Double multiply-reduction of a 2-lane vector (vecX), SSE-only path.
// Computes src2[1] * (src1 * src2[0]); dst is declared TEMP because it is
// written before the sequence has finished reading its inputs.
// The format trailer is labelled "mul reduction2D" so disassembly listings
// identify this as a multiply (it previously read "add reduction2D").
instruct rsmul2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp) %{
  predicate(UseSSE >= 1 && UseAVX == 0);
  match(Set dst (MulReductionVD src1 src2));
  effect(TEMP tmp, TEMP dst);
  format %{ "movdqu $tmp,$src1\n\t"
            "mulsd $tmp,$src2\n\t"
            "pshufd $dst,$src2,0xE\n\t"
            "mulsd $dst,$tmp\t! mul reduction2D" %}
  ins_encode %{
    __ movdqu($tmp$$XMMRegister, $src1$$XMMRegister);        // accumulate in a temp, src1 stays intact
    __ mulsd($tmp$$XMMRegister, $src2$$XMMRegister);         // * src2[0]
    __ pshufd($dst$$XMMRegister, $src2$$XMMRegister, 0xE);   // upper double of src2 -> low 64 bits of dst
    __ mulsd($dst$$XMMRegister, $tmp$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Double multiply-reduction of a 2-lane vector (vecX), AVX path:
// (src1 * src2[0]) * src2[1].
instruct rvmul2D_reduction_reg(regD dst, regD src1, vecX src2, regD tmp, regD tmp2) %{
  predicate(UseAVX > 0);
  match(Set dst (MulReductionVD src1 src2));
  effect(TEMP tmp, TEMP tmp2);
  format %{ "vmulsd $tmp2,$src1,$src2\n\t"
            "pshufd $tmp,$src2,0xE\n\t"
            "vmulsd $dst,$tmp2,$tmp\t! mul reduction2D" %}
  ins_encode %{
    const XMMRegister vec  = $src2$$XMMRegister;
    const XMMRegister acc  = $tmp2$$XMMRegister;
    const XMMRegister lane = $tmp$$XMMRegister;
    __ vmulsd(acc, $src1$$XMMRegister, vec);   // src1 * src2[0]
    __ pshufd(lane, vec, 0xE);                 // upper double of src2 -> low 64 bits
    __ vmulsd($dst$$XMMRegister, acc, lane);
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Double multiply-reduction of a 4-lane vector (vecY), AVX path.
// The two doubles of the lower 128 bits are folded first, then the upper
// 128 bits are extracted into tmp3 and its two doubles are folded.
instruct rvmul4D_reduction_reg(regD dst, regD src1, vecY src2, regD tmp, regD tmp2, regD tmp3) %{
  predicate(UseAVX > 0);
  match(Set dst (MulReductionVD src1 src2));
  effect(TEMP tmp, TEMP tmp2, TEMP tmp3);
  format %{ "vmulsd $tmp2,$src1,$src2\n\t"
            "pshufd $tmp,$src2,0xE\n\t"
            "vmulsd $tmp2,$tmp2,$tmp\n\t"
            "vextractf128 $tmp3,$src2\n\t"
            "vmulsd $tmp2,$tmp2,$tmp3\n\t"
            "pshufd $tmp,$tmp3,0xE\n\t"
            "vmulsd $dst,$tmp2,$tmp\t! mul reduction4D" %}
  ins_encode %{
    const XMMRegister vec  = $src2$$XMMRegister;
    const XMMRegister acc  = $tmp2$$XMMRegister;
    const XMMRegister lane = $tmp$$XMMRegister;
    const XMMRegister high = $tmp3$$XMMRegister;

    // Lower 128 bits of src2.
    __ vmulsd(acc, $src1$$XMMRegister, vec);
    __ pshufd(lane, vec, 0xE);
    __ vmulsd(acc, acc, lane);

    // Upper 128 bits of src2.
    __ vextractf128h(high, vec);
    __ vmulsd(acc, acc, high);
    __ pshufd(lane, high, 0xE);
    __ vmulsd($dst$$XMMRegister, acc, lane);
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// ====================VECTOR ARITHMETIC=======================================
|
||||
|
||||
// --------------------------------- ADD --------------------------------------
|
||||
|
|
|
@ -4043,6 +4043,13 @@ int MatchRule::is_expensive() const {
|
|||
strcmp(opType,"ReplicateL")==0 ||
|
||||
strcmp(opType,"ReplicateF")==0 ||
|
||||
strcmp(opType,"ReplicateD")==0 ||
|
||||
strcmp(opType,"AddReductionVI")==0 ||
|
||||
strcmp(opType,"AddReductionVL")==0 ||
|
||||
strcmp(opType,"AddReductionVF")==0 ||
|
||||
strcmp(opType,"AddReductionVD")==0 ||
|
||||
strcmp(opType,"MulReductionVI")==0 ||
|
||||
strcmp(opType,"MulReductionVF")==0 ||
|
||||
strcmp(opType,"MulReductionVD")==0 ||
|
||||
0 /* 0 to line up columns nicely */ )
|
||||
return 1;
|
||||
}
|
||||
|
@ -4135,6 +4142,10 @@ bool MatchRule::is_vector() const {
|
|||
"MulVS","MulVI","MulVF","MulVD",
|
||||
"DivVF","DivVD",
|
||||
"AndV" ,"XorV" ,"OrV",
|
||||
"AddReductionVI", "AddReductionVL",
|
||||
"AddReductionVF", "AddReductionVD",
|
||||
"MulReductionVI",
|
||||
"MulReductionVF", "MulReductionVD",
|
||||
"LShiftCntV","RShiftCntV",
|
||||
"LShiftVB","LShiftVS","LShiftVI","LShiftVL",
|
||||
"RShiftVB","RShiftVS","RShiftVI","RShiftVL",
|
||||
|
|
|
@ -324,6 +324,9 @@
|
|||
develop(bool, SuperWordRTDepCheck, false, \
|
||||
"Enable runtime dependency checks.") \
|
||||
\
|
||||
product(bool, SuperWordReductions, true, \
|
||||
"Enable reductions support in superword.") \
|
||||
\
|
||||
notproduct(bool, TraceSuperWord, false, \
|
||||
"Trace superword transforms") \
|
||||
\
|
||||
|
|
|
@ -266,9 +266,13 @@ macro(Vector)
|
|||
macro(AddVB)
|
||||
macro(AddVS)
|
||||
macro(AddVI)
|
||||
macro(AddReductionVI)
|
||||
macro(AddVL)
|
||||
macro(AddReductionVL)
|
||||
macro(AddVF)
|
||||
macro(AddReductionVF)
|
||||
macro(AddVD)
|
||||
macro(AddReductionVD)
|
||||
macro(SubVB)
|
||||
macro(SubVS)
|
||||
macro(SubVI)
|
||||
|
@ -277,8 +281,11 @@ macro(SubVF)
|
|||
macro(SubVD)
|
||||
macro(MulVS)
|
||||
macro(MulVI)
|
||||
macro(MulReductionVI)
|
||||
macro(MulVF)
|
||||
macro(MulReductionVF)
|
||||
macro(MulVD)
|
||||
macro(MulReductionVD)
|
||||
macro(DivVF)
|
||||
macro(DivVD)
|
||||
macro(LShiftCntV)
|
||||
|
|
|
@ -3049,6 +3049,15 @@ void Compile::final_graph_reshaping_impl( Node *n, Final_Reshape_Counts &frc) {
|
|||
case Op_StoreVector:
|
||||
break;
|
||||
|
||||
case Op_AddReductionVI:
|
||||
case Op_AddReductionVL:
|
||||
case Op_AddReductionVF:
|
||||
case Op_AddReductionVD:
|
||||
case Op_MulReductionVI:
|
||||
case Op_MulReductionVF:
|
||||
case Op_MulReductionVD:
|
||||
break;
|
||||
|
||||
case Op_PackB:
|
||||
case Op_PackS:
|
||||
case Op_PackI:
|
||||
|
|
|
@ -38,6 +38,7 @@
|
|||
#include "opto/rootnode.hpp"
|
||||
#include "opto/runtime.hpp"
|
||||
#include "opto/subnode.hpp"
|
||||
#include "opto/vectornode.hpp"
|
||||
|
||||
//------------------------------is_loop_exit-----------------------------------
|
||||
// Given an IfNode, return the loop-exiting projection or NULL if both
|
||||
|
@ -1524,6 +1525,44 @@ void PhaseIdealLoop::do_maximally_unroll( IdealLoopTree *loop, Node_List &old_ne
|
|||
}
|
||||
}
|
||||
|
||||
// Tag reduction candidates of a counted loop with Node::Flag_is_reduction.
// A candidate is the node feeding a (non trip-count) loop phi over the
// backedge, provided that it is controlled inside the loop, its opcode has a
// ReductionNode counterpart, and it takes that same phi as one of its inputs.
// Nothing is done when SuperWordReductions is off or when the loop has
// already been unrolled.
void PhaseIdealLoop::mark_reductions(IdealLoopTree *loop) {
  if (!SuperWordReductions) return;

  CountedLoopNode* head = loop->_head->as_CountedLoop();
  if (head->unrolled_count() > 1) return;

  Node* iv_phi = head->phi();
  for (DUIterator_Fast imax, i = head->fast_outs(imax); i < imax; i++) {
    Node* phi = head->fast_out(i);

    // Only live phis other than the trip counter are of interest.
    if (!phi->is_Phi() || phi->outcnt() == 0 || phi == iv_phi) continue;

    // The value carried around the loop by this phi.
    Node* def = phi->in(LoopNode::LoopBackControl);
    if (def == NULL) continue;

    // The definition has to live inside this loop.
    Node* def_ctrl = get_ctrl(def);
    if (def_ctrl == NULL || !loop->is_member(get_loop(def_ctrl))) continue;

    // ReductionNode::opcode() hands back its argument when the scalar
    // operation has no reduction form.
    int opc = def->Opcode();
    if (opc == ReductionNode::opcode(opc, def->bottom_type()->basic_type())) continue;

    // Already marked.
    if (def->is_reduction()) continue;

    // To be a reduction, the arithmetic node must consume the phi it defines.
    for (unsigned j = 1; j < def->req(); j++) {
      if (def->in(j) == phi) {
        def->add_flag(Node::Flag_is_reduction);
        break;
      }
    }
  }
}
|
||||
|
||||
//------------------------------dominates_backedge---------------------------------
|
||||
// Returns true if ctrl is executed on every complete iteration
|
||||
bool IdealLoopTree::dominates_backedge(Node* ctrl) {
|
||||
|
@ -2361,8 +2400,10 @@ bool IdealLoopTree::iteration_split_impl( PhaseIdealLoop *phase, Node_List &old_
|
|||
// an even number of trips). If we are peeling, we might enable some RCE
|
||||
// and we'd rather unroll the post-RCE'd loop SO... do not unroll if
|
||||
// peeling.
|
||||
if (should_unroll && !should_peel)
|
||||
phase->do_unroll(this,old_new, true);
|
||||
if (should_unroll && !should_peel) {
|
||||
phase->mark_reductions(this);
|
||||
phase->do_unroll(this, old_new, true);
|
||||
}
|
||||
|
||||
// Adjust the pre-loop limits to align the main body
|
||||
// iterations.
|
||||
|
|
|
@ -872,6 +872,9 @@ public:
|
|||
// Unroll the loop body one step - make each trip do 2 iterations.
|
||||
void do_unroll( IdealLoopTree *loop, Node_List &old_new, bool adjust_min_trip );
|
||||
|
||||
// Mark vector reduction candidates before loop unrolling
|
||||
void mark_reductions( IdealLoopTree *loop );
|
||||
|
||||
// Return true if exp is a constant times an induction var
|
||||
bool is_scaled_iv(Node* exp, Node* iv, int* p_scale);
|
||||
|
||||
|
|
|
@ -673,7 +673,8 @@ public:
|
|||
Flag_avoid_back_to_back_before = Flag_may_be_short_branch << 1,
|
||||
Flag_avoid_back_to_back_after = Flag_avoid_back_to_back_before << 1,
|
||||
Flag_has_call = Flag_avoid_back_to_back_after << 1,
|
||||
Flag_is_expensive = Flag_has_call << 1,
|
||||
Flag_is_reduction = Flag_has_call << 1,
|
||||
Flag_is_expensive = Flag_is_reduction << 1,
|
||||
_max_flags = (Flag_is_expensive << 1) - 1 // allow flags combination
|
||||
};
|
||||
|
||||
|
@ -701,6 +702,10 @@ public:
|
|||
|
||||
const jushort flags() const { return _flags; }
|
||||
|
||||
void add_flag(jushort fl) { init_flags(fl); }
|
||||
|
||||
void remove_flag(jushort fl) { clear_flag(fl); }
|
||||
|
||||
// Return a dense integer opcode number
|
||||
virtual int Opcode() const;
|
||||
|
||||
|
@ -852,6 +857,10 @@ public:
|
|||
// The node is expensive: the best control is set during loop opts
|
||||
bool is_expensive() const { return (_flags & Flag_is_expensive) != 0 && in(0) != NULL; }
|
||||
|
||||
// An arithmetic node which accumulates data in a loop.
|
||||
// It must have the loop's phi as input and provide a def to the phi.
|
||||
bool is_reduction() const { return (_flags & Flag_is_reduction) != 0; }
|
||||
|
||||
//----------------- Optimization
|
||||
|
||||
// Get the worst-case Type output for this Node.
|
||||
|
|
|
@ -65,7 +65,8 @@ SuperWord::SuperWord(PhaseIdealLoop* phase) :
|
|||
_lpt(NULL), // loop tree node
|
||||
_lp(NULL), // LoopNode
|
||||
_bb(NULL), // basic block
|
||||
_iv(NULL) // induction var
|
||||
_iv(NULL), // induction var
|
||||
_race_possible(false) // cases where SDMU is true
|
||||
{}
|
||||
|
||||
//------------------------------transform_loop---------------------------
|
||||
|
@ -145,7 +146,6 @@ void SuperWord::transform_loop(IdealLoopTree* lpt) {
|
|||
void SuperWord::SLP_extract() {
|
||||
|
||||
// Ready the block
|
||||
|
||||
if (!construct_bb())
|
||||
return; // Exit if no interesting nodes or complex graph.
|
||||
|
||||
|
@ -640,7 +640,7 @@ bool SuperWord::stmts_can_pack(Node* s1, Node* s2, int align) {
|
|||
}
|
||||
|
||||
if (isomorphic(s1, s2)) {
|
||||
if (independent(s1, s2)) {
|
||||
if (independent(s1, s2) || reduction(s1, s2)) {
|
||||
if (!exists_at(s1, 0) && !exists_at(s2, 1)) {
|
||||
if (!s1->is_Mem() || are_adjacent_refs(s1, s2)) {
|
||||
int s1_align = alignment(s1);
|
||||
|
@ -718,6 +718,28 @@ bool SuperWord::independent(Node* s1, Node* s2) {
|
|||
return independent_path(shallow, deep);
|
||||
}
|
||||
|
||||
//------------------------------reduction---------------------------
|
||||
// Is there a data path between s1 and s2 and the nodes reductions?
|
||||
bool SuperWord::reduction(Node* s1, Node* s2) {
|
||||
bool retValue = false;
|
||||
int d1 = depth(s1);
|
||||
int d2 = depth(s2);
|
||||
if (d1 + 1 == d2) {
|
||||
if (s1->is_reduction() && s2->is_reduction()) {
|
||||
// This is an ordered set, so s1 should define s2
|
||||
for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
|
||||
Node* t1 = s1->fast_out(i);
|
||||
if (t1 == s2) {
|
||||
// both nodes are reductions and connected
|
||||
retValue = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return retValue;
|
||||
}
|
||||
|
||||
//------------------------------independent_path------------------------------
|
||||
// Helper for independent
|
||||
bool SuperWord::independent_path(Node* shallow, Node* deep, uint dp) {
|
||||
|
@ -761,6 +783,7 @@ int SuperWord::data_size(Node* s) {
|
|||
void SuperWord::extend_packlist() {
|
||||
bool changed;
|
||||
do {
|
||||
packset_sort(_packset.length());
|
||||
changed = false;
|
||||
for (int i = 0; i < _packset.length(); i++) {
|
||||
Node_List* p = _packset.at(i);
|
||||
|
@ -769,6 +792,13 @@ void SuperWord::extend_packlist() {
|
|||
}
|
||||
} while (changed);
|
||||
|
||||
if (_race_possible) {
|
||||
for (int i = 0; i < _packset.length(); i++) {
|
||||
Node_List* p = _packset.at(i);
|
||||
order_def_uses(p);
|
||||
}
|
||||
}
|
||||
|
||||
#ifndef PRODUCT
|
||||
if (TraceSuperWord) {
|
||||
tty->print_cr("\nAfter extend_packlist");
|
||||
|
@ -825,10 +855,12 @@ bool SuperWord::follow_def_uses(Node_List* p) {
|
|||
|
||||
int align = alignment(s1);
|
||||
int savings = -1;
|
||||
int num_s1_uses = 0;
|
||||
Node* u1 = NULL;
|
||||
Node* u2 = NULL;
|
||||
for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
|
||||
Node* t1 = s1->fast_out(i);
|
||||
num_s1_uses++;
|
||||
if (!in_bb(t1)) continue;
|
||||
for (DUIterator_Fast jmax, j = s2->fast_outs(jmax); j < jmax; j++) {
|
||||
Node* t2 = s2->fast_out(j);
|
||||
|
@ -845,6 +877,9 @@ bool SuperWord::follow_def_uses(Node_List* p) {
|
|||
}
|
||||
}
|
||||
}
|
||||
if (num_s1_uses > 1) {
|
||||
_race_possible = true;
|
||||
}
|
||||
if (savings >= 0) {
|
||||
Node_List* pair = new Node_List();
|
||||
pair->push(u1);
|
||||
|
@ -856,9 +891,64 @@ bool SuperWord::follow_def_uses(Node_List* p) {
|
|||
return changed;
|
||||
}
|
||||
|
||||
//------------------------------order_def_uses---------------------------
|
||||
// For extended packsets, ordinally arrange uses packset by major component
|
||||
void SuperWord::order_def_uses(Node_List* p) {
|
||||
Node* s1 = p->at(0);
|
||||
|
||||
if (s1->is_Store()) return;
|
||||
|
||||
// reductions are always managed beforehand
|
||||
if (s1->is_reduction()) return;
|
||||
|
||||
for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
|
||||
Node* t1 = s1->fast_out(i);
|
||||
|
||||
// Only allow operand swap on commuting operations
|
||||
if (!t1->is_Add() && !t1->is_Mul()) {
|
||||
break;
|
||||
}
|
||||
|
||||
// Now find t1's packset
|
||||
Node_List* p2 = NULL;
|
||||
for (int j = 0; j < _packset.length(); j++) {
|
||||
p2 = _packset.at(j);
|
||||
Node* first = p2->at(0);
|
||||
if (t1 == first) {
|
||||
break;
|
||||
}
|
||||
p2 = NULL;
|
||||
}
|
||||
// Arrange all sub components by the major component
|
||||
if (p2 != NULL) {
|
||||
for (uint j = 1; j < p->size(); j++) {
|
||||
Node* d1 = p->at(j);
|
||||
Node* u1 = p2->at(j);
|
||||
opnd_positions_match(s1, t1, d1, u1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------opnd_positions_match-------------------------
|
||||
// Is the use of d1 in u1 at the same operand position as d2 in u2?
|
||||
bool SuperWord::opnd_positions_match(Node* d1, Node* u1, Node* d2, Node* u2) {
|
||||
// check reductions to see if they are marshalled to represent the reduction
|
||||
// operator in a specified opnd
|
||||
if (u1->is_reduction() && u2->is_reduction()) {
|
||||
// ensure reductions have phis and reduction definitions feeding the 1st operand
|
||||
Node* first = u1->in(2);
|
||||
if (first->is_Phi() || first->is_reduction()) {
|
||||
u1->swap_edges(1, 2);
|
||||
}
|
||||
// ensure reductions have phis and reduction definitions feeding the 1st operand
|
||||
first = u2->in(2);
|
||||
if (first->is_Phi() || first->is_reduction()) {
|
||||
u2->swap_edges(1, 2);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
uint ct = u1->req();
|
||||
if (ct != u2->req()) return false;
|
||||
uint i1 = 0;
|
||||
|
@ -940,7 +1030,8 @@ void SuperWord::combine_packs() {
|
|||
for (int i = 0; i < _packset.length(); i++) {
|
||||
Node_List* p1 = _packset.at(i);
|
||||
if (p1 == NULL) continue;
|
||||
for (int j = 0; j < _packset.length(); j++) {
|
||||
// Because of sorting we can start at i + 1
|
||||
for (int j = i + 1; j < _packset.length(); j++) {
|
||||
Node_List* p2 = _packset.at(j);
|
||||
if (p2 == NULL) continue;
|
||||
if (i == j) continue;
|
||||
|
@ -1067,8 +1158,19 @@ void SuperWord::filter_packs() {
|
|||
//------------------------------implemented---------------------------
|
||||
// Can code be generated for pack p?
|
||||
bool SuperWord::implemented(Node_List* p) {
|
||||
bool retValue = false;
|
||||
Node* p0 = p->at(0);
|
||||
return VectorNode::implemented(p0->Opcode(), p->size(), velt_basic_type(p0));
|
||||
if (p0 != NULL) {
|
||||
int opc = p0->Opcode();
|
||||
uint size = p->size();
|
||||
if (p0->is_reduction()) {
|
||||
const Type *arith_type = p0->bottom_type();
|
||||
retValue = ReductionNode::implemented(opc, size, arith_type->basic_type());
|
||||
} else {
|
||||
retValue = VectorNode::implemented(opc, size, velt_basic_type(p0));
|
||||
}
|
||||
}
|
||||
return retValue;
|
||||
}
|
||||
|
||||
//------------------------------same_inputs--------------------------
|
||||
|
@ -1102,6 +1204,18 @@ bool SuperWord::profitable(Node_List* p) {
|
|||
if (!is_vector_use(p0, i))
|
||||
return false;
|
||||
}
|
||||
// Check if reductions are connected
|
||||
if (p0->is_reduction()) {
|
||||
Node* second_in = p0->in(2);
|
||||
Node_List* second_pk = my_pack(second_in);
|
||||
if (second_pk == NULL) {
|
||||
// Remove reduction flag if no parent pack, it is not profitable
|
||||
p0->remove_flag(Node::Flag_is_reduction);
|
||||
return false;
|
||||
} else if (second_pk->size() != p->size()) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (VectorNode::is_shift(p0)) {
|
||||
// For now, return false if shift count is vector or not scalar promotion
|
||||
// case (different shift counts) because it is not supported yet.
|
||||
|
@ -1123,6 +1237,9 @@ bool SuperWord::profitable(Node_List* p) {
|
|||
for (uint k = 0; k < use->req(); k++) {
|
||||
Node* n = use->in(k);
|
||||
if (def == n) {
|
||||
// reductions can be loop carried dependences
|
||||
if (def->is_reduction() && use->is_Phi())
|
||||
continue;
|
||||
if (!is_vector_use(use, k)) {
|
||||
return false;
|
||||
}
|
||||
|
@ -1407,16 +1524,33 @@ void SuperWord::output() {
|
|||
vlen_in_bytes = vn->as_StoreVector()->memory_size();
|
||||
} else if (n->req() == 3) {
|
||||
// Promote operands to vector
|
||||
Node* in1 = vector_opd(p, 1);
|
||||
Node* in1 = NULL;
|
||||
bool node_isa_reduction = n->is_reduction();
|
||||
if (node_isa_reduction) {
|
||||
// the input to the first reduction operation is retained
|
||||
in1 = low_adr->in(1);
|
||||
} else {
|
||||
in1 = vector_opd(p, 1);
|
||||
}
|
||||
Node* in2 = vector_opd(p, 2);
|
||||
if (VectorNode::is_invariant_vector(in1) && (n->is_Add() || n->is_Mul())) {
|
||||
if (VectorNode::is_invariant_vector(in1) && (node_isa_reduction == false) && (n->is_Add() || n->is_Mul())) {
|
||||
// Move invariant vector input into second position to avoid register spilling.
|
||||
Node* tmp = in1;
|
||||
in1 = in2;
|
||||
in2 = tmp;
|
||||
}
|
||||
if (node_isa_reduction) {
|
||||
const Type *arith_type = n->bottom_type();
|
||||
vn = ReductionNode::make(opc, NULL, in1, in2, arith_type->basic_type());
|
||||
if (in2->is_Load()) {
|
||||
vlen_in_bytes = in2->as_LoadVector()->memory_size();
|
||||
} else {
|
||||
vlen_in_bytes = in2->as_Vector()->length_in_bytes();
|
||||
}
|
||||
} else {
|
||||
vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
|
||||
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
|
||||
}
|
||||
} else {
|
||||
ShouldNotReachHere();
|
||||
}
|
||||
|
@ -1556,6 +1690,8 @@ void SuperWord::insert_extracts(Node_List* p) {
|
|||
_n_idx_list.pop();
|
||||
Node* def = use->in(idx);
|
||||
|
||||
if (def->is_reduction()) continue;
|
||||
|
||||
// Insert extract operation
|
||||
_igvn.hash_delete(def);
|
||||
int def_pos = alignment(def) / data_size(def);
|
||||
|
@ -1576,6 +1712,7 @@ void SuperWord::insert_extracts(Node_List* p) {
|
|||
bool SuperWord::is_vector_use(Node* use, int u_idx) {
|
||||
Node_List* u_pk = my_pack(use);
|
||||
if (u_pk == NULL) return false;
|
||||
if (use->is_reduction()) return true;
|
||||
Node* def = use->in(u_idx);
|
||||
Node_List* d_pk = my_pack(def);
|
||||
if (d_pk == NULL) {
|
||||
|
@ -1613,7 +1750,7 @@ bool SuperWord::construct_bb() {
|
|||
// by the visited and post_visited sets,
|
||||
// and count number of nodes in block.
|
||||
int bb_ct = 0;
|
||||
for (uint i = 0; i < lpt()->_body.size(); i++ ) {
|
||||
for (uint i = 0; i < lpt()->_body.size(); i++) {
|
||||
Node *n = lpt()->_body.at(i);
|
||||
set_bb_idx(n, i); // Create a temporary map
|
||||
if (in_bb(n)) {
|
||||
|
@ -1674,6 +1811,7 @@ bool SuperWord::construct_bb() {
|
|||
// Do a depth first walk over out edges
|
||||
int rpo_idx = bb_ct - 1;
|
||||
int size;
|
||||
int reduction_uses = 0;
|
||||
while ((size = _stk.length()) > 0) {
|
||||
Node* n = _stk.top(); // Leave node on stack
|
||||
if (!visited_test_set(n)) {
|
||||
|
@ -1685,6 +1823,14 @@ bool SuperWord::construct_bb() {
|
|||
if (in_bb(use) && !visited_test(use) &&
|
||||
// Don't go around backedge
|
||||
(!use->is_Phi() || n == entry)) {
|
||||
if (use->is_reduction()) {
|
||||
// First see if we can map the reduction on the given system we are on, then
|
||||
// make a data entry operation for each reduction we see.
|
||||
BasicType bt = use->bottom_type()->basic_type();
|
||||
if (ReductionNode::implemented(use->Opcode(), Matcher::min_vector_size(bt), bt)) {
|
||||
reduction_uses++;
|
||||
}
|
||||
}
|
||||
_stk.push(use);
|
||||
}
|
||||
}
|
||||
|
@ -1708,7 +1854,8 @@ bool SuperWord::construct_bb() {
|
|||
set_bb_idx(n, j);
|
||||
}
|
||||
|
||||
initialize_bb(); // Ensure extra info is allocated.
|
||||
// Ensure extra info is allocated.
|
||||
initialize_bb();
|
||||
|
||||
#ifndef PRODUCT
|
||||
if (TraceSuperWord) {
|
||||
|
@ -1726,7 +1873,7 @@ bool SuperWord::construct_bb() {
|
|||
}
|
||||
#endif
|
||||
assert(rpo_idx == -1 && bb_ct == _block.length(), "all block members found");
|
||||
return (_mem_slice_head.length() > 0) || (_data_entry.length() > 0);
|
||||
return (_mem_slice_head.length() > 0) || (reduction_uses > 0) || (_data_entry.length() > 0);
|
||||
}
|
||||
|
||||
//------------------------------initialize_bb---------------------------
|
||||
|
@ -1959,6 +2106,27 @@ void SuperWord::remove_pack_at(int pos) {
|
|||
_packset.remove_at(pos);
|
||||
}
|
||||
|
||||
void SuperWord::packset_sort(int n) {
|
||||
// simple bubble sort so that we capitalize with O(n) when its already sorted
|
||||
while (n != 0) {
|
||||
bool swapped = false;
|
||||
for (int i = 1; i < n; i++) {
|
||||
Node_List* q_low = _packset.at(i-1);
|
||||
Node_List* q_i = _packset.at(i);
|
||||
|
||||
// only swap when we find something to swap
|
||||
if (alignment(q_low->at(0)) > alignment(q_i->at(0))) {
|
||||
Node_List* t = q_i;
|
||||
*(_packset.adr_at(i)) = q_low;
|
||||
*(_packset.adr_at(i-1)) = q_i;
|
||||
swapped = true;
|
||||
}
|
||||
}
|
||||
if (swapped == false) break;
|
||||
n--;
|
||||
}
|
||||
}
|
||||
|
||||
//------------------------------executed_first---------------------------
|
||||
// Return the node executed first in pack p. Uses the RPO block list
|
||||
// to determine order.
|
||||
|
|
|
@ -249,6 +249,7 @@ class SuperWord : public ResourceObj {
|
|||
LoopNode* _lp; // Current LoopNode
|
||||
Node* _bb; // Current basic block
|
||||
PhiNode* _iv; // Induction var
|
||||
bool _race_possible; // In cases where SDMU is true
|
||||
|
||||
// Accessors
|
||||
Arena* arena() { return _arena; }
|
||||
|
@ -337,6 +338,8 @@ class SuperWord : public ResourceObj {
|
|||
bool isomorphic(Node* s1, Node* s2);
|
||||
// Is there no data path from s1 to s2 or s2 to s1?
|
||||
bool independent(Node* s1, Node* s2);
|
||||
// Is there a data path between s1 and s2 and both are reductions?
|
||||
bool reduction(Node* s1, Node* s2);
|
||||
// Helper for independent
|
||||
bool independent_path(Node* shallow, Node* deep, uint dp=0);
|
||||
void set_alignment(Node* s1, Node* s2, int align);
|
||||
|
@ -347,6 +350,8 @@ class SuperWord : public ResourceObj {
|
|||
bool follow_use_defs(Node_List* p);
|
||||
// Extend the packset by visiting uses of nodes in pack p
|
||||
bool follow_def_uses(Node_List* p);
|
||||
// For extended packsets, ordinally arrange uses packset by major component
|
||||
void order_def_uses(Node_List* p);
|
||||
// Estimate the savings from executing s1 and s2 as a pack
|
||||
int est_savings(Node* s1, Node* s2);
|
||||
int adjacent_profit(Node* s1, Node* s2);
|
||||
|
@ -419,9 +424,12 @@ class SuperWord : public ResourceObj {
|
|||
void print_bb();
|
||||
void print_stmt(Node* s);
|
||||
char* blank(uint depth);
|
||||
|
||||
void packset_sort(int n);
|
||||
};
|
||||
|
||||
|
||||
|
||||
//------------------------------SWPointer---------------------------
|
||||
// Information about an address for dependence checking and vector alignment
|
||||
class SWPointer VALUE_OBJ_CLASS_SPEC {
|
||||
|
|
|
@ -250,7 +250,6 @@ VectorNode* VectorNode::make(int opc, Node* n1, Node* n2, uint vlen, BasicType b
|
|||
int vopc = VectorNode::opcode(opc, bt);
|
||||
// This method should not be called for unimplemented vectors.
|
||||
guarantee(vopc > 0, err_msg_res("Vector for '%s' is not implemented", NodeClassNames[opc]));
|
||||
|
||||
switch (vopc) {
|
||||
case Op_AddVB: return new AddVBNode(n1, n2, vt);
|
||||
case Op_AddVS: return new AddVSNode(n1, n2, vt);
|
||||
|
@ -441,3 +440,72 @@ Node* ExtractNode::make(Node* v, uint position, BasicType bt) {
|
|||
return NULL;
|
||||
}
|
||||
|
||||
int ReductionNode::opcode(int opc, BasicType bt) {
|
||||
int vopc = opc;
|
||||
switch (opc) {
|
||||
case Op_AddI:
|
||||
assert(bt == T_INT, "must be");
|
||||
vopc = Op_AddReductionVI;
|
||||
break;
|
||||
case Op_AddL:
|
||||
assert(bt == T_LONG, "must be");
|
||||
vopc = Op_AddReductionVL;
|
||||
break;
|
||||
case Op_AddF:
|
||||
assert(bt == T_FLOAT, "must be");
|
||||
vopc = Op_AddReductionVF;
|
||||
break;
|
||||
case Op_AddD:
|
||||
assert(bt == T_DOUBLE, "must be");
|
||||
vopc = Op_AddReductionVD;
|
||||
break;
|
||||
case Op_MulI:
|
||||
assert(bt == T_INT, "must be");
|
||||
vopc = Op_MulReductionVI;
|
||||
break;
|
||||
case Op_MulF:
|
||||
assert(bt == T_FLOAT, "must be");
|
||||
vopc = Op_MulReductionVF;
|
||||
break;
|
||||
case Op_MulD:
|
||||
assert(bt == T_DOUBLE, "must be");
|
||||
vopc = Op_MulReductionVD;
|
||||
break;
|
||||
// TODO: add MulL for targets that support it
|
||||
default:
|
||||
break;
|
||||
}
|
||||
return vopc;
|
||||
}
|
||||
|
||||
// Return the appropriate reduction node.
|
||||
ReductionNode* ReductionNode::make(int opc, Node *ctrl, Node* n1, Node* n2, BasicType bt) {
|
||||
|
||||
int vopc = opcode(opc, bt);
|
||||
|
||||
// This method should not be called for unimplemented vectors.
|
||||
guarantee(vopc != opc, err_msg_res("Vector for '%s' is not implemented", NodeClassNames[opc]));
|
||||
|
||||
switch (vopc) {
|
||||
case Op_AddReductionVI: return new AddReductionVINode(ctrl, n1, n2);
|
||||
case Op_AddReductionVL: return new AddReductionVLNode(ctrl, n1, n2);
|
||||
case Op_AddReductionVF: return new AddReductionVFNode(ctrl, n1, n2);
|
||||
case Op_AddReductionVD: return new AddReductionVDNode(ctrl, n1, n2);
|
||||
case Op_MulReductionVI: return new MulReductionVINode(ctrl, n1, n2);
|
||||
case Op_MulReductionVF: return new MulReductionVFNode(ctrl, n1, n2);
|
||||
case Op_MulReductionVD: return new MulReductionVDNode(ctrl, n1, n2);
|
||||
}
|
||||
fatal(err_msg_res("Missed vector creation for '%s'", NodeClassNames[vopc]));
|
||||
return NULL;
|
||||
}
|
||||
|
||||
bool ReductionNode::implemented(int opc, uint vlen, BasicType bt) {
|
||||
if (is_java_primitive(bt) &&
|
||||
(vlen > 1) && is_power_of_2(vlen) &&
|
||||
Matcher::vector_size_supported(bt, vlen)) {
|
||||
int vopc = ReductionNode::opcode(opc, bt);
|
||||
return vopc != opc && Matcher::match_rule_supported(vopc);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
|
|
@ -90,6 +90,37 @@ class AddVINode : public VectorNode {
|
|||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------ReductionNode------------------------------------
|
||||
// Perform reduction of a vector.
// Common base class of the reduction nodes. It derives directly from Node
// (not VectorNode) and takes a control input plus two data inputs.
|
||||
class ReductionNode : public Node {
|
||||
public:
|
||||
ReductionNode(Node *ctrl, Node* in1, Node* in2) : Node(ctrl, in1, in2) {}
|
||||
|
||||
// Create the reduction node for scalar opcode 'opc' and element type 'bt'.
static ReductionNode* make(int opc, Node *ctrl, Node* in1, Node* in2, BasicType bt);
|
||||
// Map scalar opcode 'opc' to its reduction opcode for element type 'bt'.
static int opcode(int opc, BasicType bt);
|
||||
// Query whether a reduction of 'vlen' elements of type 'bt' is available for 'opc'.
static bool implemented(int opc, uint vlen, BasicType bt);
|
||||
};
|
||||
|
||||
//------------------------------AddReductionVINode--------------------------------------
|
||||
// Vector add int as a reduction.
// The node itself is scalar-typed: bottom_type() is TypeInt::INT and
// ideal_reg() is Op_RegI.
|
||||
class AddReductionVINode : public ReductionNode {
|
||||
public:
|
||||
AddReductionVINode(Node * ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
virtual int Opcode() const;
|
||||
virtual const Type* bottom_type() const { return TypeInt::INT; }
|
||||
virtual uint ideal_reg() const { return Op_RegI; }
|
||||
};
|
||||
|
||||
//------------------------------AddReductionVLNode--------------------------------------
|
||||
// Vector add long as a reduction.
// The node itself is scalar-typed: bottom_type() is TypeLong::LONG and
// ideal_reg() is Op_RegL.
|
||||
class AddReductionVLNode : public ReductionNode {
|
||||
public:
|
||||
AddReductionVLNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
virtual int Opcode() const;
|
||||
virtual const Type* bottom_type() const { return TypeLong::LONG; }
|
||||
virtual uint ideal_reg() const { return Op_RegL; }
|
||||
};
|
||||
|
||||
//------------------------------AddVLNode--------------------------------------
|
||||
// Vector add long
|
||||
class AddVLNode : public VectorNode {
|
||||
|
@ -106,6 +137,16 @@ class AddVFNode : public VectorNode {
|
|||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------AddReductionVFNode--------------------------------------
|
||||
// Vector add float as a reduction.
// The node itself is scalar-typed: bottom_type() is Type::FLOAT and
// ideal_reg() is Op_RegF.
|
||||
class AddReductionVFNode : public ReductionNode {
|
||||
public:
|
||||
AddReductionVFNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
virtual int Opcode() const;
|
||||
virtual const Type* bottom_type() const { return Type::FLOAT; }
|
||||
virtual uint ideal_reg() const { return Op_RegF; }
|
||||
};
|
||||
|
||||
//------------------------------AddVDNode--------------------------------------
|
||||
// Vector add double
|
||||
class AddVDNode : public VectorNode {
|
||||
|
@ -114,6 +155,16 @@ class AddVDNode : public VectorNode {
|
|||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------AddReductionVDNode--------------------------------------
|
||||
// Vector add double as a reduction.
// The node itself is scalar-typed: bottom_type() is Type::DOUBLE and
// ideal_reg() is Op_RegD.
|
||||
class AddReductionVDNode : public ReductionNode {
|
||||
public:
|
||||
AddReductionVDNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
virtual int Opcode() const;
|
||||
virtual const Type* bottom_type() const { return Type::DOUBLE; }
|
||||
virtual uint ideal_reg() const { return Op_RegD; }
|
||||
};
|
||||
|
||||
//------------------------------SubVBNode--------------------------------------
|
||||
// Vector subtract byte
|
||||
class SubVBNode : public VectorNode {
|
||||
|
@ -178,6 +229,16 @@ class MulVINode : public VectorNode {
|
|||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------MulReductionVINode--------------------------------------
|
||||
// Vector multiply int as a reduction.
// The node itself is scalar-typed: bottom_type() is TypeInt::INT and
// ideal_reg() is Op_RegI.
|
||||
class MulReductionVINode : public ReductionNode {
|
||||
public:
|
||||
MulReductionVINode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
virtual int Opcode() const;
|
||||
virtual const Type* bottom_type() const { return TypeInt::INT; }
|
||||
virtual uint ideal_reg() const { return Op_RegI; }
|
||||
};
|
||||
|
||||
//------------------------------MulVFNode--------------------------------------
|
||||
// Vector multiply float
|
||||
class MulVFNode : public VectorNode {
|
||||
|
@ -186,6 +247,16 @@ class MulVFNode : public VectorNode {
|
|||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------MulReductionVFNode--------------------------------------
|
||||
// Vector multiply float as a reduction.
// The node itself is scalar-typed: bottom_type() is Type::FLOAT and
// ideal_reg() is Op_RegF.
|
||||
class MulReductionVFNode : public ReductionNode {
|
||||
public:
|
||||
MulReductionVFNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
virtual int Opcode() const;
|
||||
virtual const Type* bottom_type() const { return Type::FLOAT; }
|
||||
virtual uint ideal_reg() const { return Op_RegF; }
|
||||
};
|
||||
|
||||
//------------------------------MulVDNode--------------------------------------
|
||||
// Vector multiply double
|
||||
class MulVDNode : public VectorNode {
|
||||
|
@ -194,6 +265,16 @@ class MulVDNode : public VectorNode {
|
|||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------MulReductionVDNode--------------------------------------
|
||||
// Vector multiply double as a reduction.
// The node itself is scalar-typed: bottom_type() is Type::DOUBLE and
// ideal_reg() is Op_RegD.
|
||||
class MulReductionVDNode : public ReductionNode {
|
||||
public:
|
||||
MulReductionVDNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {}
|
||||
virtual int Opcode() const;
|
||||
virtual const Type* bottom_type() const { return Type::DOUBLE; }
|
||||
virtual uint ideal_reg() const { return Op_RegD; }
|
||||
};
|
||||
|
||||
//------------------------------DivVFNode--------------------------------------
|
||||
// Vector divide float
|
||||
class DivVFNode : public VectorNode {
|
||||
|
|
|
@ -1982,13 +1982,18 @@ typedef CompactHashtable<Symbol*, char> SymbolCompactHashTable;
|
|||
declare_c2_type(PowDNode, Node) \
|
||||
declare_c2_type(ReverseBytesINode, Node) \
|
||||
declare_c2_type(ReverseBytesLNode, Node) \
|
||||
declare_c2_type(ReductionNode, Node) \
|
||||
declare_c2_type(VectorNode, Node) \
|
||||
declare_c2_type(AddVBNode, VectorNode) \
|
||||
declare_c2_type(AddVSNode, VectorNode) \
|
||||
declare_c2_type(AddVINode, VectorNode) \
|
||||
declare_c2_type(AddReductionVINode, ReductionNode) \
|
||||
declare_c2_type(AddVLNode, VectorNode) \
|
||||
declare_c2_type(AddReductionVLNode, ReductionNode) \
|
||||
declare_c2_type(AddVFNode, VectorNode) \
|
||||
declare_c2_type(AddReductionVFNode, ReductionNode) \
|
||||
declare_c2_type(AddVDNode, VectorNode) \
|
||||
declare_c2_type(AddReductionVDNode, ReductionNode) \
|
||||
declare_c2_type(SubVBNode, VectorNode) \
|
||||
declare_c2_type(SubVSNode, VectorNode) \
|
||||
declare_c2_type(SubVINode, VectorNode) \
|
||||
|
@ -1997,8 +2002,11 @@ typedef CompactHashtable<Symbol*, char> SymbolCompactHashTable;
|
|||
declare_c2_type(SubVDNode, VectorNode) \
|
||||
declare_c2_type(MulVSNode, VectorNode) \
|
||||
declare_c2_type(MulVINode, VectorNode) \
|
||||
declare_c2_type(MulReductionVINode, ReductionNode) \
|
||||
declare_c2_type(MulVFNode, VectorNode) \
|
||||
declare_c2_type(MulReductionVFNode, ReductionNode) \
|
||||
declare_c2_type(MulVDNode, VectorNode) \
|
||||
declare_c2_type(MulReductionVDNode, ReductionNode) \
|
||||
declare_c2_type(DivVFNode, VectorNode) \
|
||||
declare_c2_type(DivVDNode, VectorNode) \
|
||||
declare_c2_type(LShiftVBNode, VectorNode) \
|
||||
|
|
82
hotspot/test/compiler/loopopts/superword/ProdRed_Double.java
Normal file
82
hotspot/test/compiler/loopopts/superword/ProdRed_Double.java
Normal file
|
@ -0,0 +1,82 @@
|
|||
/*
|
||||
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @bug 8074981
|
||||
* @summary Add C2 x86 Superword support for scalar product reduction optimizations : double test
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 ProdRed_Double
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 ProdRed_Double
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 ProdRed_Double
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 ProdRed_Double
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 ProdRed_Double
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 ProdRed_Double
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 ProdRed_Double
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 ProdRed_Double
|
||||
*/
|
||||
|
||||
public class ProdRed_Double
|
||||
{
|
||||
public static void main(String[] args) throws Exception {
|
||||
double[] a = new double[256*1024];
|
||||
double[] b = new double[256*1024];
|
||||
prodReductionInit(a,b);
|
||||
double valid = 2000;
|
||||
double total = 0;
|
||||
for(int j = 0; j < 2000; j++) {
|
||||
total = j + 1;
|
||||
total = prodReductionImplement(a,b, total);
|
||||
}
|
||||
if(total == valid) {
|
||||
System.out.println("Success");
|
||||
} else {
|
||||
System.out.println("Invalid sum of elements variable in total: " + total);
|
||||
System.out.println("Expected value = " + valid);
|
||||
throw new Exception("Failed");
|
||||
}
|
||||
}
|
||||
|
||||
public static void prodReductionInit(double[] a, double[] b)
|
||||
{
|
||||
for(int i = 0; i < a.length; i++)
|
||||
{
|
||||
a[i] = i + 2;
|
||||
b[i] = i + 1;
|
||||
}
|
||||
}
|
||||
|
||||
public static double prodReductionImplement(double[] a, double[] b, double total)
|
||||
{
|
||||
for(int i = 0; i < a.length; i++)
|
||||
{
|
||||
total *= a[i] - b[i];
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
}
|
82
hotspot/test/compiler/loopopts/superword/ProdRed_Float.java
Normal file
82
hotspot/test/compiler/loopopts/superword/ProdRed_Float.java
Normal file
|
@ -0,0 +1,82 @@
|
|||
/*
|
||||
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @bug 8074981
|
||||
* @summary Add C2 x86 Superword support for scalar product reduction optimizations : float test
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 ProdRed_Float
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 ProdRed_Float
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 ProdRed_Float
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 ProdRed_Float
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 ProdRed_Float
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 ProdRed_Float
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 ProdRed_Float
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 ProdRed_Float
|
||||
*/
|
||||
|
||||
public class ProdRed_Float
|
||||
{
|
||||
public static void main(String[] args) throws Exception {
|
||||
float[] a = new float[256*1024];
|
||||
float[] b = new float[256*1024];
|
||||
prodReductionInit(a,b);
|
||||
float valid = 2000;
|
||||
float total = 0;
|
||||
for(int j = 0; j < 2000; j++) {
|
||||
total = j + 1;
|
||||
total = prodReductionImplement(a,b, total);
|
||||
}
|
||||
if(total == valid) {
|
||||
System.out.println("Success");
|
||||
} else {
|
||||
System.out.println("Invalid sum of elements variable in total: " + total);
|
||||
System.out.println("Expected value = " + valid);
|
||||
throw new Exception("Failed");
|
||||
}
|
||||
}
|
||||
|
||||
public static void prodReductionInit(float[] a, float[] b)
|
||||
{
|
||||
for(int i = 0; i < a.length; i++)
|
||||
{
|
||||
a[i] = i + 2;
|
||||
b[i] = i + 1;
|
||||
}
|
||||
}
|
||||
|
||||
public static float prodReductionImplement(float[] a, float[] b, float total)
|
||||
{
|
||||
for(int i = 0; i < a.length; i++)
|
||||
{
|
||||
total *= a[i] - b[i];
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
}
|
81
hotspot/test/compiler/loopopts/superword/ProdRed_Int.java
Normal file
81
hotspot/test/compiler/loopopts/superword/ProdRed_Int.java
Normal file
|
@ -0,0 +1,81 @@
|
|||
/*
|
||||
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @bug 8074981
|
||||
* @summary Add C2 x86 Superword support for scalar product reduction optimizations : int test
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 ProdRed_Int
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 ProdRed_Int
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 ProdRed_Int
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 ProdRed_Int
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 ProdRed_Int
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 ProdRed_Int
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 ProdRed_Int
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 ProdRed_Int
|
||||
*/
|
||||
|
||||
public class ProdRed_Int
|
||||
{
|
||||
public static void main(String[] args) throws Exception {
|
||||
int[] a = new int[256*1024];
|
||||
int[] b = new int[256*1024];
|
||||
prodReductionInit(a,b);
|
||||
int valid = 419430401;
|
||||
int total = 1;
|
||||
for(int j = 0; j < 2000; j++) {
|
||||
total = prodReductionImplement(a,b,total);
|
||||
}
|
||||
if(total == valid) {
|
||||
System.out.println("Success");
|
||||
} else {
|
||||
System.out.println("Invalid sum of elements variable in total: " + total);
|
||||
System.out.println("Expected value = " + valid);
|
||||
throw new Exception("Failed");
|
||||
}
|
||||
}
|
||||
|
||||
public static void prodReductionInit(int[] a, int[] b)
|
||||
{
|
||||
for(int i = 0; i < a.length; i++)
|
||||
{
|
||||
a[i] = i + 2;
|
||||
b[i] = i + 1;
|
||||
}
|
||||
}
|
||||
|
||||
public static int prodReductionImplement(int[] a, int[] b, int total)
|
||||
{
|
||||
for(int i = 0; i < a.length; i++)
|
||||
{
|
||||
total *= a[i] + b[i];
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
}
|
96
hotspot/test/compiler/loopopts/superword/SumRed_Double.java
Normal file
96
hotspot/test/compiler/loopopts/superword/SumRed_Double.java
Normal file
|
@ -0,0 +1,96 @@
|
|||
/*
|
||||
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @bug 8074981
|
||||
* @summary Add C2 x86 Superword support for scalar sum reduction optimizations : double test
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRed_Double
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRed_Double
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Double
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Double
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Double
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Double
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRed_Double
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRed_Double
|
||||
*/
|
||||
|
||||
public class SumRed_Double
|
||||
{
|
||||
public static void main(String[] args) throws Exception {
|
||||
double[] a = new double[256*1024];
|
||||
double[] b = new double[256*1024];
|
||||
double[] c = new double[256*1024];
|
||||
double[] d = new double[256*1024];
|
||||
sumReductionInit(a,b,c);
|
||||
double total = 0;
|
||||
double valid = 3.6028590866691944E19;
|
||||
for(int j = 0; j < 2000; j++) {
|
||||
total = sumReductionImplement(a,b,c,d,total);
|
||||
}
|
||||
if(total == valid) {
|
||||
System.out.println("Success");
|
||||
} else {
|
||||
System.out.println("Invalid sum of elements variable in total: " + total);
|
||||
System.out.println("Expected value = " + valid);
|
||||
throw new Exception("Failed");
|
||||
}
|
||||
}
|
||||
|
||||
public static void sumReductionInit(
|
||||
double[] a,
|
||||
double[] b,
|
||||
double[] c)
|
||||
{
|
||||
for(int j = 0; j < 1; j++)
|
||||
{
|
||||
for(int i = 0; i < a.length; i++)
|
||||
{
|
||||
a[i] = i * 1 + j;
|
||||
b[i] = i * 1 - j;
|
||||
c[i] = i + j;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static double sumReductionImplement(
|
||||
double[] a,
|
||||
double[] b,
|
||||
double[] c,
|
||||
double[] d,
|
||||
double total)
|
||||
{
|
||||
for(int i = 0; i < a.length; i++)
|
||||
{
|
||||
d[i]= (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
|
||||
total += d[i];
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
}
|
96
hotspot/test/compiler/loopopts/superword/SumRed_Float.java
Normal file
96
hotspot/test/compiler/loopopts/superword/SumRed_Float.java
Normal file
|
@ -0,0 +1,96 @@
|
|||
/*
|
||||
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @bug 8074981
|
||||
* @summary Add C2 x86 Superword support for scalar sum reduction optimizations : float test
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRed_Float
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRed_Float
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Float
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Float
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Float
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Float
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRed_Float
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRed_Float
|
||||
*/
|
||||
|
||||
public class SumRed_Float
|
||||
{
|
||||
public static void main(String[] args) throws Exception {
|
||||
float[] a = new float[256*1024];
|
||||
float[] b = new float[256*1024];
|
||||
float[] c = new float[256*1024];
|
||||
float[] d = new float[256*1024];
|
||||
sumReductionInit(a,b,c);
|
||||
float total = 0;
|
||||
float valid = (float)4.611686E18;
|
||||
for(int j = 0; j < 2000; j++) {
|
||||
total = sumReductionImplement(a,b,c,d,total);
|
||||
}
|
||||
if(total == valid) {
|
||||
System.out.println("Success");
|
||||
} else {
|
||||
System.out.println("Invalid sum of elements variable in total: " + total);
|
||||
System.out.println("Expected value = " + valid);
|
||||
throw new Exception("Failed");
|
||||
}
|
||||
}
|
||||
|
||||
public static void sumReductionInit(
|
||||
float[] a,
|
||||
float[] b,
|
||||
float[] c)
|
||||
{
|
||||
for(int j = 0; j < 1; j++)
|
||||
{
|
||||
for(int i = 0; i < a.length; i++)
|
||||
{
|
||||
a[i] = i * 1 + j;
|
||||
b[i] = i * 1 - j;
|
||||
c[i] = i + j;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static float sumReductionImplement(
|
||||
float[] a,
|
||||
float[] b,
|
||||
float[] c,
|
||||
float[] d,
|
||||
float total)
|
||||
{
|
||||
for(int i = 0; i < a.length; i++)
|
||||
{
|
||||
d[i]= (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
|
||||
total += d[i];
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
}
|
96
hotspot/test/compiler/loopopts/superword/SumRed_Int.java
Normal file
96
hotspot/test/compiler/loopopts/superword/SumRed_Int.java
Normal file
|
@ -0,0 +1,96 @@
|
|||
/*
|
||||
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @bug 8074981
|
||||
* @summary Add C2 x86 Superword support for scalar sum reduction optimizations : int test
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRed_Int
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=2 -XX:CompileThresholdScaling=0.1 SumRed_Int
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Int
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=4 -XX:CompileThresholdScaling=0.1 SumRed_Int
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Int
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=8 -XX:CompileThresholdScaling=0.1 SumRed_Int
|
||||
*
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRed_Int
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:-SuperWordReductions -XX:LoopUnrollLimit=250 -XX:LoopMaxUnroll=16 -XX:CompileThresholdScaling=0.1 SumRed_Int
|
||||
*/
|
||||
|
||||
public class SumRed_Int
|
||||
{
|
||||
public static void main(String[] args) throws Exception {
|
||||
int[] a = new int[256*1024];
|
||||
int[] b = new int[256*1024];
|
||||
int[] c = new int[256*1024];
|
||||
int[] d = new int[256*1024];
|
||||
sumReductionInit(a,b,c);
|
||||
int total = 0;
|
||||
int valid = 262144000;
|
||||
for(int j = 0; j < 2000; j++) {
|
||||
total = sumReductionImplement(a,b,c,d,total);
|
||||
}
|
||||
if(total == valid) {
|
||||
System.out.println("Success");
|
||||
} else {
|
||||
System.out.println("Invalid sum of elements variable in total: " + total);
|
||||
System.out.println("Expected value = " + valid);
|
||||
throw new Exception("Failed");
|
||||
}
|
||||
}
|
||||
|
||||
public static void sumReductionInit(
|
||||
int[] a,
|
||||
int[] b,
|
||||
int[] c)
|
||||
{
|
||||
for(int j = 0; j < 1; j++)
|
||||
{
|
||||
for(int i = 0; i < a.length; i++)
|
||||
{
|
||||
a[i] = i * 1 + j;
|
||||
b[i] = i * 1 - j;
|
||||
c[i] = i + j;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static int sumReductionImplement(
|
||||
int[] a,
|
||||
int[] b,
|
||||
int[] c,
|
||||
int[] d,
|
||||
int total)
|
||||
{
|
||||
for(int i = 0; i < a.length; i++)
|
||||
{
|
||||
d[i]= (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]);
|
||||
total += d[i];
|
||||
}
|
||||
return total;
|
||||
}
|
||||
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue