8214751: X86: Support for VNNI Instructions
Co-authored-by: Razvan A Lupusoru <razvan.a.lupusoru@intel.com>
Reviewed-by: kvn
parent 40d7e4c2e9
commit 05e175bf1b
18 changed files with 491 additions and 4 deletions

src/hotspot/cpu/x86/assembler_x86.cpp

@@ -3966,6 +3966,34 @@ void Assembler::vpmovzxwd(XMMRegister dst, XMMRegister src, int vector_len) {
   emit_int8((unsigned char)(0xC0 | encode));
 }
 
+void Assembler::pmaddwd(XMMRegister dst, XMMRegister src) {
+  NOT_LP64(assert(VM_Version::supports_sse2(), ""));
+  InstructionAttr attributes(AVX_128bit, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
+  int encode = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0xF5);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::vpmaddwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(vector_len == AVX_128bit ? VM_Version::supports_avx() :
+         (vector_len == AVX_256bit ? VM_Version::supports_avx2() :
+         (vector_len == AVX_512bit ? VM_Version::supports_evex() : 0)), "");
+  InstructionAttr attributes(vector_len, /* rex_w */ false, /* legacy_mode */ _legacy_mode_bw, /* no_mask_reg */ true, /* uses_vl */ true);
+  int encode = simd_prefix_and_encode(dst, nds, src, VEX_SIMD_66, VEX_OPCODE_0F, &attributes);
+  emit_int8((unsigned char)0xF5);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::evpdpwssd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
+  assert(VM_Version::supports_evex(), "");
+  assert(VM_Version::supports_vnni(), "must support vnni");
+  InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
+  attributes.set_is_evex_instruction();
+  int encode = vex_prefix_and_encode(dst->encoding(), nds->encoding(), src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int8(0x52);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 // generic
 void Assembler::pop(Register dst) {
   int encode = prefix_and_encode(dst->encoding());
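Note: the three emitters above wire up PMADDWD/VPMADDWD (opcode 0xF5 in the 0F map) and the AVX-512 VNNI instruction VPDPWSSD (opcode 0x52 in the 0F 38 map). Per 32-bit lane, all of them multiply adjacent signed 16-bit pairs and sum the products; VPDPWSSD additionally accumulates into the destination. A scalar sketch of the lane semantics (names are illustrative, not part of the patch):

    #include <cstdint>

    // One 32-bit lane of PMADDWD: two adjacent 16x16->32 products, summed.
    int32_t pmaddwd_lane(int16_t a_lo, int16_t a_hi, int16_t b_lo, int16_t b_hi) {
      return (int32_t)a_lo * b_lo + (int32_t)a_hi * b_hi;
    }

    // One 32-bit lane of VPDPWSSD: the same dot product, accumulated into dst.
    // (Non-saturating; the saturating variant is the separate VPDPWSSDS.)
    int32_t vpdpwssd_lane(int32_t dst, int16_t a_lo, int16_t a_hi,
                          int16_t b_lo, int16_t b_hi) {
      return dst + pmaddwd_lane(a_lo, a_hi, b_lo, b_hi);
    }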

src/hotspot/cpu/x86/assembler_x86.hpp

@@ -1668,6 +1668,12 @@ private:
   void evpmovdb(Address dst, XMMRegister src, int vector_len);
 
+  // Multiply add
+  void pmaddwd(XMMRegister dst, XMMRegister src);
+  void vpmaddwd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+  // Multiply add accumulate
+  void evpdpwssd(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
+
 #ifndef _LP64 // no 32bit push/pop on amd64
   void popl(Address dst);
 #endif

src/hotspot/cpu/x86/vm_version_x86.cpp

@@ -1289,7 +1289,7 @@ void VM_Version::get_processor_features() {
     if (FLAG_IS_DEFAULT(UseXMMForArrayCopy)) {
       UseXMMForArrayCopy = true; // use SSE2 movq on new Intel cpus
     }
-    if (supports_sse4_2() && supports_ht()) { // Newest Intel cpus
+    if ((supports_sse4_2() && supports_ht()) || supports_avx()) { // Newest Intel cpus
       if (FLAG_IS_DEFAULT(UseUnalignedLoadStores)) {
         UseUnalignedLoadStores = true; // use movdqu on newest Intel cpus
       }

src/hotspot/cpu/x86/vm_version_x86.hpp

@@ -336,6 +336,7 @@ protected:
 #define CPU_AVX512_VPOPCNTDQ ((uint64_t)UCONST64(0x2000000000)) // Vector popcount
 #define CPU_VPCLMULQDQ ((uint64_t)UCONST64(0x4000000000)) // Vector carryless multiplication
 #define CPU_VAES ((uint64_t)UCONST64(0x8000000000)) // Vector AES instructions
+#define CPU_VNNI ((uint64_t)UCONST64(0x16000000000)) // Vector Neural Network Instructions
 
 enum Extended_Family {
   // AMD

@@ -548,6 +549,8 @@ protected:
         result |= CPU_VPCLMULQDQ;
       if (_cpuid_info.sef_cpuid7_ecx.bits.vaes != 0)
         result |= CPU_VAES;
+      if (_cpuid_info.sef_cpuid7_ecx.bits.avx512_vnni != 0)
+        result |= CPU_VNNI;
     }
   }
   if(_cpuid_info.sef_cpuid7_ebx.bits.bmi1 != 0)
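Note: the avx512_vnni bitfield read above corresponds to CPUID.(EAX=7,ECX=0):ECX bit 11. A standalone check equivalent to the struct access (a sketch; the actual bitfield layout lives in vm_version_x86.hpp):

    #include <cstdint>

    // AVX512_VNNI is reported in CPUID leaf 7, subleaf 0, ECX bit 11.
    constexpr uint32_t AVX512_VNNI_MASK = 1u << 11;

    bool cpu_reports_vnni(uint32_t leaf7_ecx) {
      return (leaf7_ecx & AVX512_VNNI_MASK) != 0;
    }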

@@ -828,6 +831,7 @@ public:
   static bool supports_vpopcntdq()  { return (_features & CPU_AVX512_VPOPCNTDQ) != 0; }
   static bool supports_vpclmulqdq() { return (_features & CPU_VPCLMULQDQ) != 0; }
   static bool supports_vaes()       { return (_features & CPU_VAES) != 0; }
+  static bool supports_vnni()       { return (_features & CPU_VNNI) != 0; }
 
   // Intel features
   static bool is_intel_family_core() { return is_intel() &&

src/hotspot/cpu/x86/x86.ad

@@ -1446,6 +1446,10 @@ const bool Matcher::match_rule_supported(int opcode) {
       if (VM_Version::supports_on_spin_wait() == false)
         ret_value = false;
       break;
+    case Op_MulAddVS2VI:
+      if (UseSSE < 2)
+        ret_value = false;
+      break;
   }
 
   return ret_value;  // Per default match rules are supported.

@@ -9855,6 +9859,118 @@ instruct vfma16F_mem(vecZ a, memory b, vecZ c) %{
   ins_pipe( pipe_slow );
 %}
 
+// --------------------------------- Vector Multiply Add --------------------------------------
+
+instruct smuladd4S2I_reg(vecD dst, vecD src1) %{
+  predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 2);
+  match(Set dst (MulAddVS2VI dst src1));
+  format %{ "pmaddwd $dst,$dst,$src1\t! muladd packed4Sto2I" %}
+  ins_encode %{
+    __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmuladd4S2I_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 2);
+  match(Set dst (MulAddVS2VI src1 src2));
+  format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed4Sto2I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct smuladd8S4I_reg(vecX dst, vecX src1) %{
+  predicate(UseSSE >= 2 && UseAVX == 0 && n->as_Vector()->length() == 4);
+  match(Set dst (MulAddVS2VI dst src1));
+  format %{ "pmaddwd $dst,$dst,$src1\t! muladd packed8Sto4I" %}
+  ins_encode %{
+    __ pmaddwd($dst$$XMMRegister, $src1$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmuladd8S4I_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(UseAVX > 0 && n->as_Vector()->length() == 4);
+  match(Set dst (MulAddVS2VI src1 src2));
+  format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed8Sto4I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmuladd16S8I_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(UseAVX > 1 && n->as_Vector()->length() == 8);
+  match(Set dst (MulAddVS2VI src1 src2));
+  format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed16Sto8I" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmuladd32S16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(UseAVX > 2 && n->as_Vector()->length() == 16);
+  match(Set dst (MulAddVS2VI src1 src2));
+  format %{ "vpmaddwd $dst,$src1,$src2\t! muladd packed32Sto16I" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ vpmaddwd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// --------------------------------- Vector Multiply Add Add ----------------------------------
+
+instruct vmuladdadd4S2I_reg(vecD dst, vecD src1, vecD src2) %{
+  predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 2);
+  match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
+  format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed4Sto2I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmuladdadd8S4I_reg(vecX dst, vecX src1, vecX src2) %{
+  predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 4);
+  match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
+  format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed8Sto4I" %}
+  ins_encode %{
+    int vector_len = 0;
+    __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmuladdadd16S8I_reg(vecY dst, vecY src1, vecY src2) %{
+  predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 8);
+  match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
+  format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed16Sto8I" %}
+  ins_encode %{
+    int vector_len = 1;
+    __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+instruct vmuladdadd32S16I_reg(vecZ dst, vecZ src1, vecZ src2) %{
+  predicate(VM_Version::supports_vnni() && UseAVX > 2 && n->as_Vector()->length() == 16);
+  match(Set dst (AddVI (MulAddVS2VI src1 src2) dst));
+  format %{ "evpdpwssd $dst,$src1,$src2\t! muladdadd packed32Sto16I" %}
+  ins_encode %{
+    int vector_len = 2;
+    __ evpdpwssd($dst$$XMMRegister, $src1$$XMMRegister, $src2$$XMMRegister, vector_len);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // --------------------------------- PopCount --------------------------------------
 
 instruct vpopcount2I(vecD dst, vecD src) %{
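Note: in these rules, vector_len selects the encoded SIMD width and follows the Assembler's AvxVectorLen constants, which is why the vecD/vecX rules pass 0, vecY passes 1, and vecZ passes 2. The Multiply Add Add rules additionally require supports_vnni() and UseAVX > 2, since VPDPWSSD is EVEX-encoded. A reference sketch of the mapping assumed here:

    // vector_len values passed by the ins_encode blocks above
    // (mirrors Assembler::AvxVectorLen in assembler_x86.hpp).
    enum AvxVectorLen {
      AVX_128bit = 0x0,  // vecD and vecX rules
      AVX_256bit = 0x1,  // vecY rules
      AVX_512bit = 0x2   // vecZ rules
    };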

src/hotspot/cpu/x86/x86_32.ad

@@ -7755,6 +7755,16 @@ instruct mulI(rRegI dst, memory src, eFlagsReg cr) %{
   ins_pipe( ialu_reg_mem_alu0 );
 %}
 
+instruct mulAddS2I_rReg(rRegI dst, rRegI src1, rRegI src2, rRegI src3, eFlagsReg cr)
+%{
+  match(Set dst (MulAddS2I (Binary dst src1) (Binary src2 src3)));
+  effect(KILL cr, KILL src2);
+
+  expand %{ mulI_rReg(dst, src1, cr);
+           mulI_rReg(src2, src3, cr);
+           addI_rReg(dst, src2, cr); %}
+%}
+
 // Multiply Register Int to Long
 instruct mulI2L(eADXRegL dst, eAXRegI src, nadxRegI src1, eFlagsReg flags) %{
   // Basic Idea: long = (long)int * (long)int

src/hotspot/cpu/x86/x86_64.ad

@@ -8175,6 +8175,16 @@ instruct mulI_mem_imm(rRegI dst, memory src, immI imm, rFlagsReg cr)
   ins_pipe(ialu_reg_mem_alu0);
 %}
 
+instruct mulAddS2I_rReg(rRegI dst, rRegI src1, rRegI src2, rRegI src3, rFlagsReg cr)
+%{
+  match(Set dst (MulAddS2I (Binary dst src1) (Binary src2 src3)));
+  effect(KILL cr, KILL src2);
+
+  expand %{ mulI_rReg(dst, src1, cr);
+           mulI_rReg(src2, src3, cr);
+           addI_rReg(dst, src2, cr); %}
+%}
+
 instruct mulL_rReg(rRegL dst, rRegL src, rFlagsReg cr)
 %{
   match(Set dst (MulL dst src));

src/hotspot/share/adlc/formssel.cpp

@@ -4181,6 +4181,7 @@ bool MatchRule::is_vector() const {
     "AddReductionVF", "AddReductionVD",
     "MulReductionVI", "MulReductionVL",
     "MulReductionVF", "MulReductionVD",
+    "MulAddVS2VI",
     "LShiftCntV","RShiftCntV",
     "LShiftVB","LShiftVS","LShiftVI","LShiftVL",
     "RShiftVB","RShiftVS","RShiftVI","RShiftVL",

src/hotspot/share/opto/classes.hpp

@@ -201,6 +201,7 @@ macro(Loop)
 macro(LoopLimit)
 macro(Mach)
 macro(MachProj)
+macro(MulAddS2I)
 macro(MaxI)
 macro(MemBarAcquire)
 macro(LoadFence)

@@ -341,6 +342,7 @@ macro(MulVF)
 macro(MulReductionVF)
 macro(MulVD)
 macro(MulReductionVD)
+macro(MulAddVS2VI)
 macro(FmaVD)
 macro(FmaVF)
 macro(DivVF)

src/hotspot/share/opto/loopnode.hpp

@@ -1249,6 +1249,9 @@ public:
   // important (common) to do address expressions.
   Node *remix_address_expressions( Node *n );
 
+  // Convert add to muladd to generate MuladdS2I under certain criteria
+  Node * convert_add_to_muladd(Node * n);
+
   // Attempt to use a conditional move instead of a phi/branch
   Node *conditional_move( Node *n );

src/hotspot/share/opto/loopopts.cpp

@@ -493,6 +493,54 @@ Node *PhaseIdealLoop::remix_address_expressions( Node *n ) {
   return NULL;
 }
 
+// Optimize ((in1[2*i] * in2[2*i]) + (in1[2*i+1] * in2[2*i+1]))
+Node *PhaseIdealLoop::convert_add_to_muladd(Node* n) {
+  assert(n->Opcode() == Op_AddI, "sanity");
+  Node * nn = NULL;
+  Node * in1 = n->in(1);
+  Node * in2 = n->in(2);
+  if (in1->Opcode() == Op_MulI && in2->Opcode() == Op_MulI) {
+    IdealLoopTree* loop_n = get_loop(get_ctrl(n));
+    if (loop_n->_head->as_Loop()->is_valid_counted_loop() &&
+        Matcher::match_rule_supported(Op_MulAddS2I) &&
+        Matcher::match_rule_supported(Op_MulAddVS2VI)) {
+      Node* mul_in1 = in1->in(1);
+      Node* mul_in2 = in1->in(2);
+      Node* mul_in3 = in2->in(1);
+      Node* mul_in4 = in2->in(2);
+      if (mul_in1->Opcode() == Op_LoadS &&
+          mul_in2->Opcode() == Op_LoadS &&
+          mul_in3->Opcode() == Op_LoadS &&
+          mul_in4->Opcode() == Op_LoadS) {
+        IdealLoopTree* loop1 = get_loop(get_ctrl(mul_in1));
+        IdealLoopTree* loop2 = get_loop(get_ctrl(mul_in2));
+        IdealLoopTree* loop3 = get_loop(get_ctrl(mul_in3));
+        IdealLoopTree* loop4 = get_loop(get_ctrl(mul_in4));
+        IdealLoopTree* loop5 = get_loop(get_ctrl(in1));
+        IdealLoopTree* loop6 = get_loop(get_ctrl(in2));
+        // All nodes should be in the same counted loop.
+        if (loop_n == loop1 && loop_n == loop2 && loop_n == loop3 &&
+            loop_n == loop4 && loop_n == loop5 && loop_n == loop6) {
+          Node* adr1 = mul_in1->in(MemNode::Address);
+          Node* adr2 = mul_in2->in(MemNode::Address);
+          Node* adr3 = mul_in3->in(MemNode::Address);
+          Node* adr4 = mul_in4->in(MemNode::Address);
+          if (adr1->is_AddP() && adr2->is_AddP() && adr3->is_AddP() && adr4->is_AddP()) {
+            if ((adr1->in(AddPNode::Base) == adr3->in(AddPNode::Base)) &&
+                (adr2->in(AddPNode::Base) == adr4->in(AddPNode::Base))) {
+              nn = new MulAddS2INode(mul_in1, mul_in2, mul_in3, mul_in4);
+              register_new_node(nn, get_ctrl(n));
+              _igvn.replace_node(n, nn);
+              return nn;
+            }
+          }
+        }
+      }
+    }
+  }
+  return nn;
+}
+
 //------------------------------conditional_move-------------------------------
 // Attempt to replace a Phi with a conditional move. We have some pretty
 // strict profitability requirements.  All Phis at the merge point must
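Note: the shape convert_add_to_muladd() recognizes is the short dot-product idiom that the new jtreg test exercises from Java: four sign-extending short loads feeding two multiplies under one add, with the array bases paired across the two products. In C-like terms (a sketch mirroring the Java test loop):

    #include <cstdint>

    // Loop body matched as AddI(MulI(LoadS, LoadS), MulI(LoadS, LoadS));
    // a[2i]/a[2i+1] share one base and b[2i]/b[2i+1] the other, so the
    // AddP base-pairing check above succeeds and the AddI is replaced by
    // MulAddS2INode(a[2i], b[2i], a[2i+1], b[2i+1]).
    void dot_product_idiom(const int16_t* a, const int16_t* b, int32_t* out, int n) {
      for (int i = 0; i < n; i++) {
        out[i] += a[2*i] * b[2*i] + a[2*i+1] * b[2*i+1];
      }
    }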

@@ -927,6 +975,11 @@ Node *PhaseIdealLoop::split_if_with_blocks_pre( Node *n ) {
   Node *m = remix_address_expressions( n );
   if( m ) return m;
 
+  if (n_op == Op_AddI) {
+    Node *nn = convert_add_to_muladd( n );
+    if ( nn ) return nn;
+  }
+
   if (n->is_ConstraintCast()) {
     Node* dom_cast = n->as_ConstraintCast()->dominating_cast(&_igvn, this);
     // ConstraintCastNode::dominating_cast() uses node control input to determine domination.

src/hotspot/share/opto/matcher.cpp

@@ -2352,6 +2352,15 @@ void Matcher::find_shared_post_visit(Node* n, uint opcode) {
       n->del_req(3);
       break;
     }
+    case Op_MulAddS2I: {
+      Node* pair1 = new BinaryNode(n->in(1), n->in(2));
+      Node* pair2 = new BinaryNode(n->in(3), n->in(4));
+      n->set_req(1, pair1);
+      n->set_req(2, pair2);
+      n->del_req(4);
+      n->del_req(3);
+      break;
+    }
     default:
       break;
   }

src/hotspot/share/opto/mulnode.hpp

@@ -285,4 +285,15 @@ public:
   virtual const Type* Value(PhaseGVN* phase) const;
 };
 
+//------------------------------MulAddS2INode----------------------------------
+// Multiply shorts into integers and add them.
+// Semantics: I_OUT = S1 * S2 + S3 * S4
+class MulAddS2INode : public Node {
+public:
+  MulAddS2INode(Node* in1, Node *in2, Node *in3, Node* in4) : Node(0, in1, in2, in3, in4) {}
+  virtual int Opcode() const;
+  const Type *bottom_type() const { return TypeInt::INT; }
+  virtual uint ideal_reg() const { return Op_RegI; }
+};
+
 #endif // SHARE_VM_OPTO_MULNODE_HPP
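Note: a scalar reading of the node contract above, emphasizing that both inputs widen to int before the multiplies and the add, so no 16-bit intermediate overflow is possible (an illustrative sketch, not patch code):

    #include <cstdint>

    // I_OUT = S1 * S2 + S3 * S4 with sign-extended short inputs.
    int32_t mul_add_s2i(int16_t s1, int16_t s2, int16_t s3, int16_t s4) {
      return (int32_t)s1 * s2 + (int32_t)s3 * s4;
    }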

src/hotspot/share/opto/superword.cpp

@@ -645,6 +645,10 @@ void SuperWord::find_adjacent_refs() {
       // with a different alignment were created before.
       for (uint i = 0; i < align_to_refs.size(); i++) {
         MemNode* mr = align_to_refs.at(i)->as_Mem();
+        if (mr == mem_ref) {
+          // Skip when we are looking at same memory operation.
+          continue;
+        }
         if (same_velt_type(mr, mem_ref) &&
             memory_alignment(mr, iv_adjustment) != 0)
           create_pack = false;

@@ -846,6 +850,27 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops) {
   return NULL;
 }
 
+//------------------span_works_for_memory_size-----------------------------
+static bool span_works_for_memory_size(MemNode* mem, int span, int mem_size, int offset) {
+  bool span_matches_memory = false;
+  if ((mem_size == type2aelembytes(T_BYTE) || mem_size == type2aelembytes(T_SHORT))
+    && ABS(span) == type2aelembytes(T_INT)) {
+    // There is a mismatch on span size compared to memory.
+    for (DUIterator_Fast jmax, j = mem->fast_outs(jmax); j < jmax; j++) {
+      Node* use = mem->fast_out(j);
+      if (!VectorNode::is_type_transition_to_int(use)) {
+        return false;
+      }
+    }
+    // If all uses transition to integer, it means that we can successfully align even on mismatch.
+    return true;
+  }
+  else {
+    span_matches_memory = ABS(span) == mem_size;
+  }
+  return span_matches_memory && (ABS(offset) % mem_size) == 0;
+}
+
 //------------------------------ref_is_alignable---------------------------
 // Can the preloop align the reference to position zero in the vector?
 bool SuperWord::ref_is_alignable(SWPointer& p) {

@@ -862,7 +887,7 @@ bool SuperWord::ref_is_alignable(SWPointer& p) {
   int offset = p.offset_in_bytes();
   // Stride one accesses are alignable if offset is aligned to memory operation size.
   // Offset can be unaligned when UseUnalignedAccesses is used.
-  if (ABS(span) == mem_size && (ABS(offset) % mem_size) == 0) {
+  if (span_works_for_memory_size(p.mem(), span, mem_size, offset)) {
     return true;
   }
   // If the initial offset from start of the object is computable,

@@ -915,6 +940,28 @@ bool SuperWord::ref_is_alignable(SWPointer& p) {
   }
   return false;
 }
+//---------------------------get_vw_bytes_special------------------------
+int SuperWord::get_vw_bytes_special(MemNode* s) {
+  // Get the vector width in bytes.
+  int vw = vector_width_in_bytes(s);
+
+  // Check for special case where there is an MulAddS2I usage where short vectors are going to need combined.
+  BasicType btype = velt_basic_type(s);
+  if (type2aelembytes(btype) == 2) {
+    bool should_combine_adjacent = true;
+    for (DUIterator_Fast imax, i = s->fast_outs(imax); i < imax; i++) {
+      Node* user = s->fast_out(i);
+      if (!VectorNode::is_muladds2i(user)) {
+        should_combine_adjacent = false;
+      }
+    }
+    if (should_combine_adjacent) {
+      vw = MIN2(Matcher::max_vector_size(btype)*type2aelembytes(btype), vw * 2);
+    }
+  }
+
+  return vw;
+}
+
 //---------------------------get_iv_adjustment---------------------------
 // Calculate loop's iv adjustment for this memory ops.

@@ -923,7 +970,7 @@ int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
   int offset   = align_to_ref_p.offset_in_bytes();
   int scale    = align_to_ref_p.scale_in_bytes();
   int elt_size = align_to_ref_p.memory_size();
-  int vw       = vector_width_in_bytes(mem_ref);
+  int vw       = get_vw_bytes_special(mem_ref);
   assert(vw > 1, "sanity");
   int iv_adjustment;
   if (scale != 0) {

@@ -2303,6 +2350,12 @@ void SuperWord::output() {
         const TypePtr* atyp = n->adr_type();
         vn = StoreVectorNode::make(opc, ctl, mem, adr, atyp, val, vlen);
         vlen_in_bytes = vn->as_StoreVector()->memory_size();
+      } else if (VectorNode::is_muladds2i(n)) {
+        assert(n->req() == 5u, "MulAddS2I should have 4 operands.");
+        Node* in1 = vector_opd(p, 1);
+        Node* in2 = vector_opd(p, 2);
+        vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
+        vlen_in_bytes = vn->as_Vector()->length_in_bytes();
       } else if (n->req() == 3 && !is_cmov_pack(p)) {
         // Promote operands to vector
         Node* in1 = NULL;

@@ -2615,6 +2668,16 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) {
       }
       assert(opd_bt == in->bottom_type()->basic_type(), "all same type");
       pk->add_opd(in);
+      if (VectorNode::is_muladds2i(pi)) {
+        Node* in2 = pi->in(opd_idx + 2);
+        assert(my_pack(in2) == NULL, "Should already have been unpacked");
+        if (my_pack(in2) != NULL) {
+          NOT_PRODUCT(if (is_trace_loop_reverse() || TraceLoopOpts) { tty->print_cr("Should already have been unpacked"); })
+          return NULL;
+        }
+        assert(opd_bt == in2->bottom_type()->basic_type(), "all same type");
+        pk->add_opd(in2);
+      }
     }
     _igvn.register_new_node_with_optimizer(pk);
     _phase->set_ctrl(pk, _phase->get_ctrl(opd));

@@ -2692,6 +2755,21 @@ bool SuperWord::is_vector_use(Node* use, int u_idx) {
     }
     return true;
   }
+  if (VectorNode::is_muladds2i(use)) {
+    // MulAddS2I takes shorts and produces ints - hence the special checks
+    // on alignment and size.
+    if (u_pk->size() * 2 != d_pk->size()) {
+      return false;
+    }
+    for (uint i = 0; i < MIN2(d_pk->size(), u_pk->size()); i++) {
+      Node* ui = u_pk->at(i);
+      Node* di = d_pk->at(i);
+      if (alignment(ui) != alignment(di) * 2) {
+        return false;
+      }
+    }
+    return true;
+  }
   if (u_pk->size() != d_pk->size())
     return false;
   for (uint i = 0; i < u_pk->size(); i++) {
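Note: a worked instance of the size/alignment rule above, assuming 2-byte shorts and 4-byte ints: a def pack of 8 adjacent LoadS has member byte alignments 0, 2, ..., 14, while the MulAddS2I use pack of 4 int results has alignments 0, 4, 8, 12, so both checks pass. A self-contained check of that arithmetic (illustrative values, not SuperWord API):

    #include <cassert>

    int main() {
      const unsigned d_size = 8, u_size = 4;        // shorts in def pack, ints in use pack
      assert(u_size * 2 == d_size);                 // u_pk->size() * 2 == d_pk->size()
      for (unsigned i = 0; i < u_size; i++) {
        unsigned di_alignment = 2 * i;              // byte offset of d_pk->at(i)
        unsigned ui_alignment = 4 * i;              // byte offset of u_pk->at(i)
        assert(ui_alignment == di_alignment * 2);   // alignment(ui) == alignment(di) * 2
      }
      return 0;
    }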

@@ -3017,7 +3095,7 @@ int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
     NOT_PRODUCT(if(is_trace_alignment()) tty->print("SWPointer::memory_alignment: SWPointer p invalid, return bottom_align");)
     return bottom_align;
   }
-  int vw = vector_width_in_bytes(s);
+  int vw = get_vw_bytes_special(s);
   if (vw < 2) {
     NOT_PRODUCT(if(is_trace_alignment()) tty->print_cr("SWPointer::memory_alignment: vector_width_in_bytes < 2, return bottom_align");)
     return bottom_align; // No vectors for this type

src/hotspot/share/opto/superword.hpp

@@ -347,6 +347,7 @@ class SuperWord : public ResourceObj {
     BasicType bt = velt_basic_type(n);
     return vector_width(n)*type2aelembytes(bt);
   }
+  int get_vw_bytes_special(MemNode* s);
   MemNode* align_to_ref() { return _align_to_ref; }
   void set_align_to_ref(MemNode* m) { _align_to_ref = m; }

src/hotspot/share/opto/vectornode.cpp

@@ -196,6 +196,8 @@ int VectorNode::opcode(int sopc, BasicType bt) {
   case Op_StoreF:
   case Op_StoreD:
     return Op_StoreVector;
+  case Op_MulAddS2I:
+    return Op_MulAddVS2VI;
 
   default:
     return 0; // Unimplemented

@@ -214,6 +216,25 @@ bool VectorNode::implemented(int opc, uint vlen, BasicType bt) {
   return false;
 }
 
+bool VectorNode::is_type_transition_short_to_int(Node* n) {
+  switch (n->Opcode()) {
+  case Op_MulAddS2I:
+    return true;
+  }
+  return false;
+}
+
+bool VectorNode::is_type_transition_to_int(Node* n) {
+  return is_type_transition_short_to_int(n);
+}
+
+bool VectorNode::is_muladds2i(Node* n) {
+  if (n->Opcode() == Op_MulAddS2I) {
+    return true;
+  }
+  return false;
+}
+
 bool VectorNode::is_shift(Node* n) {
   switch (n->Opcode()) {
   case Op_LShiftI:

@@ -277,6 +298,7 @@ void VectorNode::vector_operands(Node* n, uint* start, uint* end) {
   case Op_AndI: case Op_AndL:
   case Op_OrI:  case Op_OrL:
   case Op_XorI: case Op_XorL:
+  case Op_MulAddS2I:
     *start = 1;
     *end   = 3; // 2 vector operands
     break;

@@ -354,6 +376,8 @@ VectorNode* VectorNode::make(int opc, Node* n1, Node* n2, uint vlen, BasicType b
   case Op_AndV: return new AndVNode(n1, n2, vt);
   case Op_OrV:  return new OrVNode (n1, n2, vt);
   case Op_XorV: return new XorVNode(n1, n2, vt);
+
+  case Op_MulAddVS2VI: return new MulAddVS2VINode(n1, n2, vt);
   default:
     fatal("Missed vector creation for '%s'", NodeClassNames[vopc]);
     return NULL;

src/hotspot/share/opto/vectornode.hpp

@@ -67,6 +67,9 @@ class VectorNode : public TypeNode {
   static int opcode(int opc, BasicType bt);
   static bool implemented(int opc, uint vlen, BasicType bt);
   static bool is_shift(Node* n);
+  static bool is_type_transition_short_to_int(Node* n);
+  static bool is_type_transition_to_int(Node* n);
+  static bool is_muladds2i(Node* n);
   static bool is_invariant_vector(Node* n);
   // [Start, end) half-open range defining which operands are vectors
   static void vector_operands(Node* n, uint* start, uint* end);

@@ -261,6 +264,14 @@ public:
   virtual int Opcode() const;
 };
 
+//------------------------------MulAddVS2VINode--------------------------------
+// Vector multiply shorts to int and add adjacent ints.
+class MulAddVS2VINode : public VectorNode {
+  public:
+    MulAddVS2VINode(Node* in1, Node* in2, const TypeVect* vt) : VectorNode(in1, in2, vt) {}
+    virtual int Opcode() const;
+};
+
 //------------------------------FmaVDNode--------------------------------------
 // Vector multiply double
 class FmaVDNode : public VectorNode {

test/hotspot/jtreg/compiler/loopopts/superword/Vec_MulAddS2I.java (new file)

@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * @test
+ * @bug 8214751
+ * @summary Add C2 x86 Superword support for VNNI VPDPWSSD Instruction
+ * @requires os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64"
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
+ *      -XX:CompileThresholdScaling=0.1
+ *      -XX:+SuperWord
+ *      -XX:LoopMaxUnroll=2
+ *      compiler.loopopts.superword.Vec_MulAddS2I
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
+ *      -XX:CompileThresholdScaling=0.1
+ *      -XX:-SuperWord
+ *      -XX:LoopMaxUnroll=2
+ *      compiler.loopopts.superword.Vec_MulAddS2I
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
+ *      -XX:CompileThresholdScaling=0.1
+ *      -XX:+SuperWord
+ *      -XX:LoopMaxUnroll=4
+ *      compiler.loopopts.superword.Vec_MulAddS2I
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
+ *      -XX:CompileThresholdScaling=0.1
+ *      -XX:-SuperWord
+ *      -XX:LoopMaxUnroll=4
+ *      compiler.loopopts.superword.Vec_MulAddS2I
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
+ *      -XX:CompileThresholdScaling=0.1
+ *      -XX:+SuperWord
+ *      -XX:LoopMaxUnroll=8
+ *      compiler.loopopts.superword.Vec_MulAddS2I
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
+ *      -XX:CompileThresholdScaling=0.1
+ *      -XX:-SuperWord
+ *      -XX:LoopMaxUnroll=8
+ *      compiler.loopopts.superword.Vec_MulAddS2I
+ *
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
+ *      -XX:CompileThresholdScaling=0.1
+ *      -XX:+SuperWord
+ *      -XX:LoopMaxUnroll=16
+ *      compiler.loopopts.superword.Vec_MulAddS2I
+ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:LoopUnrollLimit=250
+ *      -XX:CompileThresholdScaling=0.1
+ *      -XX:-SuperWord
+ *      -XX:LoopMaxUnroll=16
+ *      compiler.loopopts.superword.Vec_MulAddS2I
+ */
+
+package compiler.loopopts.superword;
+
+import java.util.Random;
+
+public class Vec_MulAddS2I {
+    static final int NUM = 1024;
+    static int[] out = new int[NUM];
+    static short[] in1 = new short[2*NUM];
+    static short[] in2 = new short[2*NUM];
+
+    public static void main(String[] args) throws Exception {
+        Vec_MulAddS2IInit(in1, in2);
+        int result = 0;
+        int valid = 204800000;
+        for (int j = 0; j < 10000*512; j++) {
+            result = Vec_MulAddS2IImplement(in1, in2, out);
+        }
+        if (result == valid) {
+            System.out.println("Success");
+        } else {
+            System.out.println("Invalid calculation of element variables in the out array: " + result);
+            System.out.println("Expected value for each element of out array = " + valid);
+            throw new Exception("Failed");
+        }
+    }
+
+    public static void Vec_MulAddS2IInit(
+            short[] in1,
+            short[] in2) {
+        for (int i=0; i<2*NUM; i++) {
+            in1[i] = (short)4;
+            in2[i] = (short)5;
+        }
+    }
+
+    public static int Vec_MulAddS2IImplement(
+            short[] in1,
+            short[] in2,
+            int[] out) {
+        for (int i = 0; i < NUM; i++) {
+            out[i] += ((in1[2*i] * in2[2*i]) + (in1[2*i+1] * in2[2*i+1]));
+        }
+        Random rand = new Random();
+        int n = rand.nextInt(NUM-1);
+        return out[n];
+    }
+}
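Note on the expected constant: every call adds in1[2i]*in2[2i] + in1[2i+1]*in2[2i+1] = 4*5 + 4*5 = 40 to each out element, and main makes 10000*512 = 5,120,000 calls, so each element reaches 40 * 5,120,000 = 204,800,000, which is the 'valid' value the test compares against. A quick standalone check of that arithmetic:

    #include <cassert>
    #include <cstdint>

    int main() {
      const int64_t per_call = 4 * 5 + 4 * 5;   // contribution per element per call
      const int64_t calls    = 10000LL * 512;   // iterations of the outer j loop
      assert(per_call * calls == 204800000);    // matches 'valid' in the test
      return 0;
    }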