8154122: Intrinsify fused mac operations

Added FMA intrinsics on x86

Reviewed-by: kvn, aph, darcy
This commit is contained in:
Vivek Deshpande 2016-08-26 12:17:50 -07:00
parent 474c035379
commit d58e3e0324
42 changed files with 365 additions and 13 deletions

View file

@ -1032,6 +1032,10 @@ void LIRGenerator::do_update_CRC32C(Intrinsic* x) {
Unimplemented();
}
// Placeholder for the FMA intrinsic: this platform has no implementation,
// so reaching this function is reported as a fatal error.
void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) {
fatal("FMA intrinsic is not implemented on this platform");
}
// Placeholder for the vectorizedMismatch intrinsic: this platform has no
// implementation, so reaching this function is reported as a fatal error.
void LIRGenerator::do_vectorizedMismatch(Intrinsic* x) {
fatal("vectorizedMismatch intrinsic is not implemented on this platform");
}

View file

@ -262,6 +262,11 @@ void VM_Version::get_processor_features() {
FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
}
if (UseFMA) {
warning("FMA instructions are not available on this CPU");
FLAG_SET_DEFAULT(UseFMA, false);
}
if (auxv & (HWCAP_SHA1 | HWCAP_SHA2)) {
if (FLAG_IS_DEFAULT(UseSHA)) {
FLAG_SET_DEFAULT(UseSHA, true);

View file

@ -1433,6 +1433,10 @@ void LIRGenerator::do_update_CRC32(Intrinsic* x) {
}
}
// Placeholder for the FMA intrinsic: this platform has no implementation,
// so reaching this function is reported as a fatal error.
void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) {
fatal("FMA intrinsic is not implemented on this platform");
}
// Placeholder for the vectorizedMismatch intrinsic: this platform has no
// implementation, so reaching this function is reported as a fatal error.
void LIRGenerator::do_vectorizedMismatch(Intrinsic* x) {
fatal("vectorizedMismatch intrinsic is not implemented on this platform");
}

View file

@ -230,6 +230,11 @@ void VM_Version::initialize() {
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
}
if (UseFMA) {
warning("FMA instructions are not available on this CPU");
FLAG_SET_DEFAULT(UseFMA, false);
}
if (UseSHA) {
warning("SHA instructions are not available on this CPU");
FLAG_SET_DEFAULT(UseSHA, false);

View file

@ -953,6 +953,10 @@ void LIRGenerator::do_update_CRC32C(Intrinsic* x) {
}
}
// Placeholder for the FMA intrinsic: this platform has no implementation,
// so reaching this function is reported as a fatal error.
void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) {
fatal("FMA intrinsic is not implemented on this platform");
}
// Placeholder for the vectorizedMismatch intrinsic: this platform has no
// implementation, so reaching this function is reported as a fatal error.
void LIRGenerator::do_vectorizedMismatch(Intrinsic* x) {
fatal("vectorizedMismatch intrinsic is not implemented on this platform");
}

View file

@ -266,6 +266,11 @@ void VM_Version::initialize() {
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
}
if (UseFMA) {
warning("FMA instructions are not available on this CPU");
FLAG_SET_DEFAULT(UseFMA, false);
}
// SHA1, SHA256, and SHA512 instructions were added to SPARC T-series at different times
if (has_sha1() || has_sha256() || has_sha512()) {
if (UseVIS > 0) { // SHA intrinsics use VIS1 instructions

View file

@ -172,7 +172,9 @@ bool AbstractInterpreter::can_be_compiled(methodHandle m) {
case Interpreter::java_lang_math_log10 : // fall thru
case Interpreter::java_lang_math_sqrt : // fall thru
case Interpreter::java_lang_math_pow : // fall thru
case Interpreter::java_lang_math_exp :
case Interpreter::java_lang_math_exp : // fall thru
case Interpreter::java_lang_math_fmaD : // fall thru
case Interpreter::java_lang_math_fmaF :
return false;
default:
return true;

View file

@ -4769,6 +4769,22 @@ void Assembler::vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
emit_int8((unsigned char)(0xC0 | encode));
}
// Emits the register-register form of the scalar double-precision FMA
// instruction: 0x66 SIMD prefix, 0F 38 opcode map, VEX.W set, opcode 0xB9.
// MacroAssembler::fmad uses it as dst = src1 * src2 + dst, i.e. dst is both
// the addend and the destination.
void Assembler::vfmadd231sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
// The instruction is only valid when the CPU reports FMA support.
assert(VM_Version::supports_fma(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int8((unsigned char)0xB9);
// 0xC0 selects the register-direct ModRM form; the remaining bits come from
// the value returned by vex_prefix_and_encode.
emit_int8((unsigned char)(0xC0 | encode));
}
// Emits the register-register form of the scalar single-precision FMA
// instruction: 0x66 SIMD prefix, 0F 38 opcode map, VEX.W clear, opcode 0xB9.
// It differs from vfmadd231sd only in the vex_w attribute.
// MacroAssembler::fmaf uses it as dst = src1 * src2 + dst, i.e. dst is both
// the addend and the destination.
void Assembler::vfmadd231ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
// The instruction is only valid when the CPU reports FMA support.
assert(VM_Version::supports_fma(), "");
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
emit_int8((unsigned char)0xB9);
// 0xC0 selects the register-direct ModRM form; the remaining bits come from
// the value returned by vex_prefix_and_encode.
emit_int8((unsigned char)(0xC0 | encode));
}
void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) {
assert(VM_Version::supports_avx(), "");
InstructionMark im(this);

View file

@ -1860,6 +1860,8 @@ private:
void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vdivss(XMMRegister dst, XMMRegister nds, Address src);
void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vfmadd231sd(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vfmadd231ss(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vmulsd(XMMRegister dst, XMMRegister nds, Address src);
void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
void vmulss(XMMRegister dst, XMMRegister nds, Address src);

View file

@ -1345,6 +1345,18 @@ void LIR_Assembler::emit_op3(LIR_Op3* op) {
op->result_opr(),
op->info());
break;
case lir_fmad:
__ fmad(op->result_opr()->as_xmm_double_reg(),
op->in_opr1()->as_xmm_double_reg(),
op->in_opr2()->as_xmm_double_reg(),
op->in_opr3()->as_xmm_double_reg());
break;
case lir_fmaf:
__ fmaf(op->result_opr()->as_xmm_float_reg(),
op->in_opr1()->as_xmm_float_reg(),
op->in_opr2()->as_xmm_float_reg(),
op->in_opr3()->as_xmm_float_reg());
break;
default: ShouldNotReachHere(); break;
}
}

View file

@ -806,6 +806,32 @@ void LIRGenerator::do_CompareAndSwap(Intrinsic* x, ValueType* type) {
}
}
// Lowers the three-argument fma intrinsic (_fmaD / _fmaF) to a single
// three-operand LIR instruction (lir_fmad / lir_fmaf).
void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) {
  assert(x->number_of_arguments() == 3, "wrong type");
  assert(UseFMA, "Needs FMA instructions support.");

  LIRItem multiplicand(x->argument_at(0), this);
  LIRItem multiplier(x->argument_at(1), this);
  LIRItem addend(x->argument_at(2), this);

  // The third operand is the one the emitted instruction overwrites, so it is
  // flagged as destroyed before any operand is loaded.
  addend.set_destroys_register();

  multiplicand.load_item();
  multiplier.load_item();
  addend.load_item();

  // Operand results are fetched (in argument order) before the result
  // register is reserved.
  LIR_Opr a_opr = multiplicand.result();
  LIR_Opr b_opr = multiplier.result();
  LIR_Opr c_opr = addend.result();
  LIR_Opr dst_opr = rlock_result(x);

  switch (x->id()) {
    case vmIntrinsics::_fmaD:
      __ fmad(a_opr, b_opr, c_opr, dst_opr);
      break;
    case vmIntrinsics::_fmaF:
      __ fmaf(a_opr, b_opr, c_opr, dst_opr);
      break;
    default:
      ShouldNotReachHere();
  }
}
void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
assert(x->number_of_arguments() == 1 || (x->number_of_arguments() == 2 && x->id() == vmIntrinsics::_dpow), "wrong type");

View file

@ -3147,6 +3147,24 @@ void MacroAssembler::fremr(Register tmp) {
fpop();
}
// Scalar double-precision fused multiply-add: c = a * b + c, then dst = c.
// The accumulator register c is always overwritten, even when dst names a
// different register.
void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
  Assembler::vfmadd231sd(c, a, b);
  if (dst == c) {
    return;  // result already lives in the requested register
  }
  movdbl(dst, c);
}
// Scalar single-precision fused multiply-add: c = a * b + c, then dst = c.
// The accumulator register c is always overwritten, even when dst names a
// different register.
void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
  Assembler::vfmadd231ss(c, a, b);
  if (dst == c) {
    return;  // result already lives in the requested register
  }
  movflt(dst, c);
}
void MacroAssembler::incrementl(AddressLiteral dst) {
if (reachable(dst)) {

View file

@ -449,6 +449,10 @@ class MacroAssembler: public Assembler {
// tmp is a temporary register, if none is available use noreg
void fremr(Register tmp);
// dst = c = a * b + c
void fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
void fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
// same as fcmp2int, but using SSE2
void cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);

View file

@ -341,6 +341,27 @@ address TemplateInterpreterGenerator::generate_math_entry(AbstractInterpreter::M
// [ lo(arg) ]
// [ hi(arg) ]
//
if (kind == Interpreter::java_lang_math_fmaD) {
__ movdbl(xmm2, Address(rsp, 5 * wordSize));
__ movdbl(xmm1, Address(rsp, 3 * wordSize));
__ movdbl(xmm0, Address(rsp, 1 * wordSize));
__ fmad(xmm0, xmm1, xmm2, xmm0);
__ pop(rdi); // get return address
__ mov(rsp, rsi); // set sp to sender sp
__ jmp(rdi);
return entry_point;
} else if (kind == Interpreter::java_lang_math_fmaF) {
__ movflt(xmm2, Address(rsp, 3 * wordSize));
__ movflt(xmm1, Address(rsp, 2 * wordSize));
__ movflt(xmm0, Address(rsp, 1 * wordSize));
__ fmaf(xmm0, xmm1, xmm2, xmm0);
__ pop(rdi); // get return address
__ mov(rsp, rsi); // set sp to sender sp
__ jmp(rdi);
return entry_point;
}
__ fld_d(Address(rsp, 1*wordSize));
switch (kind) {

View file

@ -369,8 +369,17 @@ address TemplateInterpreterGenerator::generate_math_entry(AbstractInterpreter::M
// [ hi(arg) ]
//
if (kind == Interpreter::java_lang_math_sqrt) {
if (kind == Interpreter::java_lang_math_fmaD) {
__ movdbl(xmm0, Address(rsp, wordSize));
__ movdbl(xmm1, Address(rsp, 3 * wordSize));
__ movdbl(xmm2, Address(rsp, 5 * wordSize));
__ fmad(xmm0, xmm1, xmm2, xmm0);
} else if (kind == Interpreter::java_lang_math_fmaF) {
__ movflt(xmm0, Address(rsp, wordSize));
__ movflt(xmm1, Address(rsp, 2 * wordSize));
__ movflt(xmm2, Address(rsp, 3 * wordSize));
__ fmaf(xmm0, xmm1, xmm2, xmm0);
} else if (kind == Interpreter::java_lang_math_sqrt) {
__ sqrtsd(xmm0, Address(rsp, wordSize));
} else if (kind == Interpreter::java_lang_math_exp) {
__ movdbl(xmm0, Address(rsp, wordSize));

View file

@ -73,6 +73,7 @@
#define VM_LONG_CONSTANTS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant) \
declare_preprocessor_constant("VM_Version::CPU_AVX512BW", CPU_AVX512BW) \
declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL) \
declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA)
declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA) \
declare_preprocessor_constant("VM_Version::CPU_FMA", CPU_FMA)
#endif // CPU_X86_VM_VMSTRUCTS_X86_HPP

View file

@ -578,7 +578,7 @@ void VM_Version::get_processor_features() {
}
char buf[256];
jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
cores_per_cpu(), threads_per_core(),
cpu_family(), _model, _stepping,
(supports_cmov() ? ", cmov" : ""),
@ -610,7 +610,8 @@ void VM_Version::get_processor_features() {
(supports_bmi2() ? ", bmi2" : ""),
(supports_adx() ? ", adx" : ""),
(supports_evex() ? ", evex" : ""),
(supports_sha() ? ", sha" : ""));
(supports_sha() ? ", sha" : ""),
(supports_fma() ? ", fma" : ""));
_features_string = os::strdup(buf);
// UseSSE is set to the smaller of what hardware supports and what
@ -732,6 +733,15 @@ void VM_Version::get_processor_features() {
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
}
if (supports_fma() && UseSSE >= 2) {
if (FLAG_IS_DEFAULT(UseFMA)) {
UseFMA = true;
}
} else if (UseFMA) {
warning("FMA instructions are not available on this CPU");
FLAG_SET_DEFAULT(UseFMA, false);
}
if (supports_sha() LP64_ONLY(|| supports_avx2() && supports_bmi2())) {
if (FLAG_IS_DEFAULT(UseSHA)) {
UseSHA = true;
@ -773,7 +783,6 @@ void VM_Version::get_processor_features() {
FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
}
// Adjust RTM (Restricted Transactional Memory) flags
if (!supports_rtm() && UseRTMLocking) {
// Can't continue because UseRTMLocking affects UseBiasedLocking flag
// setting during arguments processing. See use_biased_locking().

View file

@ -74,7 +74,8 @@ class VM_Version : public Abstract_VM_Version {
: 1,
ssse3 : 1,
cid : 1,
: 2,
: 1,
fma : 1,
cmpxchg16: 1,
: 4,
dca : 1,
@ -289,6 +290,7 @@ protected:
#define CPU_AVX512BW ((uint64_t)UCONST64(0x100000000)) // enums are limited to 31 bit
#define CPU_AVX512VL ((uint64_t)UCONST64(0x200000000)) // EVEX instructions with smaller vector length
#define CPU_SHA ((uint64_t)UCONST64(0x400000000)) // SHA instructions
#define CPU_FMA ((uint64_t)UCONST64(0x800000000)) // FMA instructions
enum Extended_Family {
// AMD
@ -522,6 +524,8 @@ protected:
result |= CPU_SHA;
if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
result |= CPU_LZCNT;
if (_cpuid_info.std_cpuid1_ecx.bits.fma != 0)
result |= CPU_FMA;
// for Intel, ecx.bits.misalignsse bit (bit 8) indicates support for prefetchw
if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
result |= CPU_3DNOW_PREFETCH;
@ -726,6 +730,7 @@ public:
// True when AVX2 is supported and EVEX is not.
static bool supports_avx256only() { return (supports_avx2() && !supports_evex()); }
// True when AVX or AVX2 is supported and EVEX is not.
static bool supports_avxonly() { return ((supports_avx2() || supports_avx()) && !supports_evex()); }
// True when the CPU_SHA bit is set in the detected feature mask.
static bool supports_sha() { return (_features & CPU_SHA) != 0; }
// True when the CPU_FMA bit is set in the detected feature mask.
static bool supports_fma() { return (_features & CPU_FMA) != 0; }
// Intel features
static bool is_intel_family_core() { return is_intel() &&
extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }

View file

@ -3113,6 +3113,30 @@ instruct onspinwait() %{
ins_pipe(pipe_slow);
%}
// Double-precision fused multiply-add: c = a * b + c.
// Selected only when UseFMA is set. The rule matches FmaD with the addend c as
// its first input and the multiplicands paired in a Binary node; the result
// overwrites c.
instruct fmaD_reg(regD a, regD b, regD c) %{
predicate(UseFMA);
match(Set c (FmaD c (Binary a b)));
format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
ins_cost(150);
ins_encode %{
__ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
// Single-precision fused multiply-add: c = a * b + c.
// Selected only when UseFMA is set. The rule matches FmaF with the addend c as
// its first input and the multiplicands paired in a Binary node; the result
// overwrites c.
instruct fmaF_reg(regF a, regF b, regF c) %{
predicate(UseFMA);
match(Set c (FmaF c (Binary a b)));
format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
ins_cost(150);
ins_encode %{
__ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
// ====================VECTOR INSTRUCTIONS=====================================
// Load vectors (4 bytes long)

View file

@ -205,7 +205,8 @@ public class AMD64 extends Architecture {
AVX512CD,
AVX512BW,
AVX512VL,
SHA
SHA,
FMA
}
private final EnumSet<CPUFeature> features;

View file

@ -124,6 +124,9 @@ public class AMD64HotSpotJVMCIBackendFactory implements HotSpotJVMCIBackendFacto
if ((config.vmVersionFeatures & config.amd64SHA) != 0) {
features.add(AMD64.CPUFeature.SHA);
}
if ((config.vmVersionFeatures & config.amd64FMA) != 0) {
features.add(AMD64.CPUFeature.FMA);
}
return features;
}

View file

@ -78,4 +78,5 @@ class AMD64HotSpotVMConfig extends HotSpotVMConfigAccess {
final long amd64AVX512BW = getConstant("VM_Version::CPU_AVX512BW", Long.class);
final long amd64AVX512VL = getConstant("VM_Version::CPU_AVX512VL", Long.class);
final long amd64SHA = getConstant("VM_Version::CPU_SHA", Long.class);
final long amd64FMA = getConstant("VM_Version::CPU_FMA", Long.class);
}

View file

@ -4038,6 +4038,8 @@ int MatchRule::is_expensive() const {
strcmp(opType,"EncodeP")==0 ||
strcmp(opType,"EncodePKlass")==0 ||
strcmp(opType,"DecodeNKlass")==0 ||
strcmp(opType,"FmaD") == 0 ||
strcmp(opType,"FmaF") == 0 ||
strcmp(opType,"RoundDouble")==0 ||
strcmp(opType,"RoundFloat")==0 ||
strcmp(opType,"ReverseBytesI")==0 ||

View file

@ -162,6 +162,8 @@ bool Compiler::is_intrinsic_supported(const methodHandle& method) {
case vmIntrinsics::_dlog10:
case vmIntrinsics::_dexp:
case vmIntrinsics::_dpow:
case vmIntrinsics::_fmaD:
case vmIntrinsics::_fmaF:
case vmIntrinsics::_getObject:
case vmIntrinsics::_getBoolean:
case vmIntrinsics::_getByte:

View file

@ -666,7 +666,9 @@ void LIR_OpVisitState::visit(LIR_Op* op) {
// LIR_Op3
case lir_idiv:
case lir_irem: {
case lir_irem:
case lir_fmad:
case lir_fmaf: {
assert(op->as_Op3() != NULL, "must be");
LIR_Op3* op3= (LIR_Op3*)op;
@ -1663,6 +1665,8 @@ const char * LIR_Op::name() const {
// LIR_Op3
case lir_idiv: s = "idiv"; break;
case lir_irem: s = "irem"; break;
case lir_fmad: s = "fmad"; break;
case lir_fmaf: s = "fmaf"; break;
// LIR_OpJavaCall
case lir_static_call: s = "static"; break;
case lir_optvirtual_call: s = "optvirtual"; break;

View file

@ -956,6 +956,8 @@ enum LIR_Code {
, begin_op3
, lir_idiv
, lir_irem
, lir_fmad
, lir_fmaf
, end_op3
, begin_opJavaCall
, lir_static_call
@ -2149,6 +2151,8 @@ class LIR_List: public CompilationResourceObj {
// Appends a lir_abs LIR_Op2 built from (from, tmp, to).
void abs (LIR_Opr from, LIR_Opr to, LIR_Opr tmp) { append(new LIR_Op2(lir_abs , from, tmp, to)); }
// Appends a lir_sqrt LIR_Op2 built from (from, tmp, to).
void sqrt(LIR_Opr from, LIR_Opr to, LIR_Opr tmp) { append(new LIR_Op2(lir_sqrt, from, tmp, to)); }
// Appends a lir_fmad LIR_Op3 with the three inputs (from, from1, from2) and result operand to.
void fmad(LIR_Opr from, LIR_Opr from1, LIR_Opr from2, LIR_Opr to) { append(new LIR_Op3(lir_fmad, from, from1, from2, to)); }
// Appends a lir_fmaf LIR_Op3 with the three inputs (from, from1, from2) and result operand to.
void fmaf(LIR_Opr from, LIR_Opr from1, LIR_Opr from2, LIR_Opr to) { append(new LIR_Op3(lir_fmaf, from, from1, from2, to)); }
// Appends a lir_log10 LIR_Op2 built from (from, illegalOpr, to, tmp); no second input operand is supplied.
void log10 (LIR_Opr from, LIR_Opr to, LIR_Opr tmp) { append(new LIR_Op2(lir_log10, from, LIR_OprFact::illegalOpr, to, tmp)); }
// Appends a lir_tan LIR_Op2 built from (from, tmp1, to, tmp2).
void tan (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_tan , from, tmp1, to, tmp2)); }

View file

@ -3147,6 +3147,9 @@ void LIRGenerator::do_Intrinsic(Intrinsic* x) {
case vmIntrinsics::_dpow : do_MathIntrinsic(x); break;
case vmIntrinsics::_arraycopy: do_ArrayCopy(x); break;
case vmIntrinsics::_fmaD: do_FmaIntrinsic(x); break;
case vmIntrinsics::_fmaF: do_FmaIntrinsic(x); break;
// java.nio.Buffer.checkIndex
case vmIntrinsics::_checkIndex: do_NIOCheckIndex(x); break;

View file

@ -245,6 +245,7 @@ class LIRGenerator: public InstructionVisitor, public BlockClosure {
void do_isPrimitive(Intrinsic* x);
void do_getClass(Intrinsic* x);
void do_currentThread(Intrinsic* x);
void do_FmaIntrinsic(Intrinsic* x);
void do_MathIntrinsic(Intrinsic* x);
void do_LibmIntrinsic(Intrinsic* x);
void do_ArrayCopy(Intrinsic* x);

View file

@ -355,6 +355,8 @@ bool vmIntrinsics::preserves_state(vmIntrinsics::ID id) {
case vmIntrinsics::_updateBytesCRC32:
case vmIntrinsics::_updateByteBufferCRC32:
case vmIntrinsics::_vectorizedMismatch:
case vmIntrinsics::_fmaD:
case vmIntrinsics::_fmaF:
return true;
default:
return false;
@ -387,6 +389,8 @@ bool vmIntrinsics::can_trap(vmIntrinsics::ID id) {
case vmIntrinsics::_updateBytesCRC32:
case vmIntrinsics::_updateByteBufferCRC32:
case vmIntrinsics::_vectorizedMismatch:
case vmIntrinsics::_fmaD:
case vmIntrinsics::_fmaF:
return false;
default:
return true;
@ -535,6 +539,10 @@ bool vmIntrinsics::is_disabled_by_flags(vmIntrinsics::ID id) {
case vmIntrinsics::_doubleToLongBits:
if (!InlineMathNatives) return true;
break;
case vmIntrinsics::_fmaD:
case vmIntrinsics::_fmaF:
if (!InlineMathNatives || !UseFMA) return true;
break;
case vmIntrinsics::_arraycopy:
if (!InlineArrayCopy) return true;
break;

View file

@ -755,8 +755,10 @@
do_class(java_lang_Math, "java/lang/Math") \
do_class(java_lang_StrictMath, "java/lang/StrictMath") \
do_signature(double2_double_signature, "(DD)D") \
do_signature(double3_double_signature, "(DDD)D") \
do_signature(float3_float_signature, "(FFF)F") \
do_signature(int2_int_signature, "(II)I") \
do_signature(long2_long_signature, "(JJ)J") \
do_signature(long2_long_signature, "(JJ)J") \
\
/* here are the math names, all together: */ \
do_name(abs_name,"abs") do_name(sin_name,"sin") do_name(cos_name,"cos") \
@ -770,6 +772,7 @@
do_name(multiplyExact_name,"multiplyExact") \
do_name(negateExact_name,"negateExact") \
do_name(subtractExact_name,"subtractExact") \
do_name(fma_name, "fma") \
\
do_intrinsic(_dabs, java_lang_Math, abs_name, double_double_signature, F_S) \
do_intrinsic(_dsin, java_lang_Math, sin_name, double_double_signature, F_S) \
@ -795,6 +798,8 @@
do_intrinsic(_negateExactL, java_lang_Math, negateExact_name, long_long_signature, F_S) \
do_intrinsic(_subtractExactI, java_lang_Math, subtractExact_name, int2_int_signature, F_S) \
do_intrinsic(_subtractExactL, java_lang_Math, subtractExact_name, long2_long_signature, F_S) \
do_intrinsic(_fmaD, java_lang_Math, fma_name, double3_double_signature, F_S) \
do_intrinsic(_fmaF, java_lang_Math, fma_name, float3_float_signature, F_S) \
\
do_intrinsic(_floatToRawIntBits, java_lang_Float, floatToRawIntBits_name, float_int_signature, F_S) \
do_name( floatToRawIntBits_name, "floatToRawIntBits") \

View file

@ -194,6 +194,13 @@ AbstractInterpreter::MethodKind AbstractInterpreter::method_kind(methodHandle m)
return java_lang_ref_reference_get;
}
if (UseFMA) {
switch (m->intrinsic_id()) {
case vmIntrinsics::_fmaD: return java_lang_math_fmaD;
case vmIntrinsics::_fmaF: return java_lang_math_fmaF;
}
}
// Accessor method?
if (m->is_getter()) {
// TODO: We should have used ::is_accessor above, but fast accessors in Zero expect only getters.
@ -281,6 +288,8 @@ void AbstractInterpreter::print_method_kind(MethodKind kind) {
case java_lang_math_sqrt : tty->print("java_lang_math_sqrt" ); break;
case java_lang_math_log : tty->print("java_lang_math_log" ); break;
case java_lang_math_log10 : tty->print("java_lang_math_log10" ); break;
case java_lang_math_fmaD : tty->print("java_lang_math_fmaD" ); break;
case java_lang_math_fmaF : tty->print("java_lang_math_fmaF" ); break;
case java_util_zip_CRC32_update : tty->print("java_util_zip_CRC32_update"); break;
case java_util_zip_CRC32_updateBytes : tty->print("java_util_zip_CRC32_updateBytes"); break;
case java_util_zip_CRC32_updateByteBuffer : tty->print("java_util_zip_CRC32_updateByteBuffer"); break;

View file

@ -76,6 +76,8 @@ class AbstractInterpreter: AllStatic {
java_lang_math_log10, // implementation of java.lang.Math.log10 (x)
java_lang_math_pow, // implementation of java.lang.Math.pow (x,y)
java_lang_math_exp, // implementation of java.lang.Math.exp (x)
java_lang_math_fmaF, // implementation of java.lang.Math.fma (x, y, z)
java_lang_math_fmaD, // implementation of java.lang.Math.fma (x, y, z)
java_lang_ref_reference_get, // implementation of java.lang.ref.Reference.get()
java_util_zip_CRC32_update, // implementation of java.util.zip.CRC32.update()
java_util_zip_CRC32_updateBytes, // implementation of java.util.zip.CRC32.updateBytes()

View file

@ -239,6 +239,10 @@ void TemplateInterpreterGenerator::generate_all() {
method_entry(java_lang_math_log10)
method_entry(java_lang_math_exp )
method_entry(java_lang_math_pow )
if (UseFMA) {
method_entry(java_lang_math_fmaF)
method_entry(java_lang_math_fmaD)
}
method_entry(java_lang_ref_reference_get)
AbstractInterpreter::initialize_method_handle_entries();
@ -445,7 +449,9 @@ address TemplateInterpreterGenerator::generate_method_entry(
case Interpreter::java_lang_math_log10 : // fall thru
case Interpreter::java_lang_math_sqrt : // fall thru
case Interpreter::java_lang_math_pow : // fall thru
case Interpreter::java_lang_math_exp : entry_point = generate_math_entry(kind); break;
case Interpreter::java_lang_math_exp : // fall thru
case Interpreter::java_lang_math_fmaD : // fall thru
case Interpreter::java_lang_math_fmaF : entry_point = generate_math_entry(kind); break;
case Interpreter::java_lang_ref_reference_get
: entry_point = generate_Reference_get_entry(); break;
case Interpreter::java_util_zip_CRC32_update

View file

@ -660,7 +660,8 @@
#define VM_LONG_CONSTANTS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant) \
declare_preprocessor_constant("VM_Version::CPU_AVX512BW", CPU_AVX512BW) \
declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL) \
declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA)
declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA) \
declare_preprocessor_constant("VM_Version::CPU_FMA", CPU_FMA)
#endif

View file

@ -416,6 +416,12 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method, bool is_virt
case vmIntrinsics::_onSpinWait:
if (!Matcher::match_rule_supported(Op_OnSpinWait)) return false;
break;
case vmIntrinsics::_fmaD:
if (!UseFMA || !Matcher::match_rule_supported(Op_FmaD)) return false;
break;
case vmIntrinsics::_fmaF:
if (!UseFMA || !Matcher::match_rule_supported(Op_FmaF)) return false;
break;
case vmIntrinsics::_hashCode:
case vmIntrinsics::_identityHashCode:
case vmIntrinsics::_getClass:

View file

@ -151,6 +151,8 @@ macro(EncodeP)
macro(EncodePKlass)
macro(FastLock)
macro(FastUnlock)
macro(FmaD)
macro(FmaF)
macro(Goto)
macro(Halt)
macro(HasNegatives)

View file

@ -317,6 +317,7 @@ class LibraryCallKit : public GraphKit {
bool inline_montgomeryMultiply();
bool inline_montgomerySquare();
bool inline_vectorizedMismatch();
bool inline_fma(vmIntrinsics::ID id);
bool inline_profileBoolean();
bool inline_isCompileConstant();
@ -825,6 +826,10 @@ bool LibraryCallKit::try_to_inline(int predicate) {
case vmIntrinsics::_hasNegatives:
return inline_hasNegatives();
case vmIntrinsics::_fmaD:
case vmIntrinsics::_fmaF:
return inline_fma(intrinsic_id());
default:
// If you get here, it may be that someone has added a new intrinsic
// to the list in vmSymbols.hpp without implementing it here.
@ -6657,6 +6662,35 @@ Node* LibraryCallKit::inline_digestBase_implCompressMB_predicate(int predicate)
return instof_false; // even if it is NULL
}
//-------------inline_fma-----------------------------------
// Replaces a call to the fma intrinsic (_fmaD or _fmaF) with a single FmaD /
// FmaF node whose inputs are the three call arguments in order, and publishes
// that node as the call's result. Always returns true.
bool LibraryCallKit::inline_fma(vmIntrinsics::ID id) {
  Node* fused = NULL;

  switch (id) {
  case vmIntrinsics::_fmaD: {
    assert(callee()->signature()->size() == 6, "fma has 3 parameters of size 2 each.");
    // Static method, so there is no receiver; each double occupies two
    // argument slots, hence indices 0, 2 and 4.
    Node* const first  = round_double_node(argument(0));
    Node* const second = round_double_node(argument(2));
    Node* const third  = round_double_node(argument(4));
    fused = _gvn.transform(new FmaDNode(control(), first, second, third));
    break;
  }
  case vmIntrinsics::_fmaF: {
    assert(callee()->signature()->size() == 3, "fma has 3 parameters of size 1 each.");
    Node* const first  = argument(0);
    Node* const second = argument(1);
    Node* const third  = argument(2);
    fused = _gvn.transform(new FmaFNode(control(), first, second, third));
    break;
  }
  default:
    fatal_unexpected_iid(id);
    break;
  }

  set_result(fused);
  return true;
}
bool LibraryCallKit::inline_profileBoolean() {
Node* counts = argument(1);
const TypeAryPtr* ary = NULL;

View file

@ -2117,6 +2117,8 @@ void Matcher::find_shared( Node *n ) {
case Op_StrInflatedCopy:
case Op_StrCompressedCopy:
case Op_EncodeISOArray:
case Op_FmaD:
case Op_FmaF:
set_shared(n); // Force result into register (it will be anyways)
break;
case Op_ConP: { // Convert pointers above the centerline to NUL
@ -2305,6 +2307,15 @@ void Matcher::find_shared( Node *n ) {
n->del_req(4);
break;
}
case Op_FmaD:
case Op_FmaF: {
// Restructure into a binary tree for Matching.
Node* pair = new BinaryNode(n->in(1), n->in(2));
n->set_req(2, pair);
n->set_req(1, n->in(3));
n->del_req(3);
break;
}
default:
break;
}

View file

@ -1343,3 +1343,47 @@ const Type* URShiftLNode::Value(PhaseGVN* phase) const {
return TypeLong::LONG; // Give up
}
//=============================================================================
//------------------------------Value------------------------------------------
// Computes the type of a double-precision fused multiply-add node.
// Inputs 1..3 are examined in order and the first one that is TOP or is not a
// double constant decides the answer (TOP or Type::DOUBLE respectively).
// When all three are double constants the node is folded with fma(), but only
// where __STDC_IEC_559__ is defined; otherwise the result stays Type::DOUBLE.
const Type* FmaDNode::Value(PhaseGVN* phase) const {
  const Type* operand_type[3];
  for (unsigned int slot = 0; slot < 3; slot++) {
    operand_type[slot] = phase->type(in(slot + 1));
    if (operand_type[slot] == Type::TOP) {
      return Type::TOP;
    }
    if (operand_type[slot]->base() != Type::DoubleCon) {
      return Type::DOUBLE;
    }
  }
#ifdef __STDC_IEC_559__
  const double multiplicand = operand_type[0]->getd();
  const double multiplier   = operand_type[1]->getd();
  const double addend       = operand_type[2]->getd();
  return TypeD::make(fma(multiplicand, multiplier, addend));
#else
  return Type::DOUBLE;
#endif
}
//=============================================================================
//------------------------------Value------------------------------------------
// Computes the type of a single-precision fused multiply-add node.
// Inputs 1..3 are examined in order: a TOP input yields TOP and a input that
// is not a float constant yields Type::FLOAT. When all three are float
// constants the node is folded, but only where __STDC_IEC_559__ is defined;
// otherwise the result stays Type::FLOAT.
const Type* FmaFNode::Value(PhaseGVN* phase) const {
  const Type *t1 = phase->type(in(1));
  if (t1 == Type::TOP) return Type::TOP;
  if (t1->base() != Type::FloatCon) return Type::FLOAT;
  const Type *t2 = phase->type(in(2));
  if (t2 == Type::TOP) return Type::TOP;
  if (t2->base() != Type::FloatCon) return Type::FLOAT;
  const Type *t3 = phase->type(in(3));
  if (t3 == Type::TOP) return Type::TOP;
  if (t3->base() != Type::FloatCon) return Type::FLOAT;
#ifndef __STDC_IEC_559__
  return Type::FLOAT;
#else
  float f1 = t1->getf();
  float f2 = t2->getf();
  float f3 = t3->getf();
  // Call the float entry point explicitly. An unqualified fma() binds to the
  // double version when no float overload is visible; that rounds the exact
  // result to double and then again to float, which can differ from a true
  // single-precision fused multiply-add.
  return TypeF::make(::fmaf(f1, f2, f3));
#endif
}

View file

@ -263,4 +263,26 @@ public:
virtual uint ideal_reg() const { return Op_RegL; }
};
//------------------------------FmaDNode--------------------------------------
// Fused multiply-add on doubles.
// Constructed with a control input followed by three data inputs (in1, in2, in3).
// Its bottom type is Type::DOUBLE and it is assigned a double register (Op_RegD).
class FmaDNode : public Node {
public:
FmaDNode(Node *c, Node *in1, Node *in2, Node *in3) : Node(c, in1, in2, in3) {}
virtual int Opcode() const;
const Type *bottom_type() const { return Type::DOUBLE; }
virtual uint ideal_reg() const { return Op_RegD; }
// Returns TOP, Type::DOUBLE, or a folded constant depending on the input types.
virtual const Type* Value(PhaseGVN* phase) const;
};
//------------------------------FmaFNode--------------------------------------
// Fused multiply-add on floats.
// Constructed with a control input followed by three data inputs (in1, in2, in3).
// Its bottom type is Type::FLOAT and it is assigned a float register (Op_RegF).
class FmaFNode : public Node {
public:
FmaFNode(Node *c, Node *in1, Node *in2, Node *in3) : Node(c, in1, in2, in3) {}
virtual int Opcode() const;
const Type *bottom_type() const { return Type::FLOAT; }
virtual uint ideal_reg() const { return Op_RegF; }
// Returns TOP, Type::FLOAT, or a folded constant depending on the input types.
virtual const Type* Value(PhaseGVN* phase) const;
};
#endif // SHARE_VM_OPTO_MULNODE_HPP

View file

@ -659,6 +659,9 @@ public:
product(bool, UseAES, false, \
"Control whether AES instructions can be used on x86/x64") \
\
product(bool, UseFMA, false, \
"Control whether FMA instructions can be used") \
\
product(bool, UseSHA, false, \
"Control whether SHA instructions can be used " \
"on SPARC, on ARM and on x86") \

View file

@ -2105,6 +2105,8 @@ typedef CompactHashtable<Symbol*, char> SymbolCompactHashTable;
declare_c2_type(OverflowAddLNode, OverflowLNode) \
declare_c2_type(OverflowSubLNode, OverflowLNode) \
declare_c2_type(OverflowMulLNode, OverflowLNode) \
declare_c2_type(FmaDNode, Node) \
declare_c2_type(FmaFNode, Node) \
\
/*********************/ \
/* Adapter Blob Entries */ \