mirror of
https://github.com/openjdk/jdk.git
synced 2025-08-27 14:54:52 +02:00
8154122: Intrinsify fused mac operations
Added FMA intrinsics on x86.
Reviewed-by: kvn, aph, darcy
This commit is contained in:
parent
474c035379
commit
d58e3e0324
42 changed files with 365 additions and 13 deletions
|
@ -1032,6 +1032,10 @@ void LIRGenerator::do_update_CRC32C(Intrinsic* x) {
|
|||
Unimplemented();
|
||||
}
|
||||
|
||||
void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) {
  // This port emits no LIR for the fused multiply-add intrinsic; being asked
  // to generate it is reported as a fatal VM error.
  fatal("FMA intrinsic is not implemented on this platform");
}
|
||||
|
||||
void LIRGenerator::do_vectorizedMismatch(Intrinsic* x) {
  // This port emits no LIR for the vectorizedMismatch intrinsic; being asked
  // to generate it is reported as a fatal VM error.
  fatal("vectorizedMismatch intrinsic is not implemented on this platform");
}
|
||||
|
|
|
@ -262,6 +262,11 @@ void VM_Version::get_processor_features() {
|
|||
FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
|
||||
}
|
||||
|
||||
if (UseFMA) {
|
||||
warning("FMA instructions are not available on this CPU");
|
||||
FLAG_SET_DEFAULT(UseFMA, false);
|
||||
}
|
||||
|
||||
if (auxv & (HWCAP_SHA1 | HWCAP_SHA2)) {
|
||||
if (FLAG_IS_DEFAULT(UseSHA)) {
|
||||
FLAG_SET_DEFAULT(UseSHA, true);
|
||||
|
|
|
@ -1433,6 +1433,10 @@ void LIRGenerator::do_update_CRC32(Intrinsic* x) {
|
|||
}
|
||||
}
|
||||
|
||||
void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) {
  // This port emits no LIR for the fused multiply-add intrinsic; being asked
  // to generate it is reported as a fatal VM error.
  fatal("FMA intrinsic is not implemented on this platform");
}
|
||||
|
||||
void LIRGenerator::do_vectorizedMismatch(Intrinsic* x) {
  // This port emits no LIR for the vectorizedMismatch intrinsic; being asked
  // to generate it is reported as a fatal VM error.
  fatal("vectorizedMismatch intrinsic is not implemented on this platform");
}
|
||||
|
|
|
@ -230,6 +230,11 @@ void VM_Version::initialize() {
|
|||
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
|
||||
}
|
||||
|
||||
if (UseFMA) {
|
||||
warning("FMA instructions are not available on this CPU");
|
||||
FLAG_SET_DEFAULT(UseFMA, false);
|
||||
}
|
||||
|
||||
if (UseSHA) {
|
||||
warning("SHA instructions are not available on this CPU");
|
||||
FLAG_SET_DEFAULT(UseSHA, false);
|
||||
|
|
|
@ -953,6 +953,10 @@ void LIRGenerator::do_update_CRC32C(Intrinsic* x) {
|
|||
}
|
||||
}
|
||||
|
||||
void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) {
  // This port emits no LIR for the fused multiply-add intrinsic; being asked
  // to generate it is reported as a fatal VM error.
  fatal("FMA intrinsic is not implemented on this platform");
}
|
||||
|
||||
void LIRGenerator::do_vectorizedMismatch(Intrinsic* x) {
  // This port emits no LIR for the vectorizedMismatch intrinsic; being asked
  // to generate it is reported as a fatal VM error.
  fatal("vectorizedMismatch intrinsic is not implemented on this platform");
}
|
||||
|
|
|
@ -266,6 +266,11 @@ void VM_Version::initialize() {
|
|||
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
|
||||
}
|
||||
|
||||
if (UseFMA) {
|
||||
warning("FMA instructions are not available on this CPU");
|
||||
FLAG_SET_DEFAULT(UseFMA, false);
|
||||
}
|
||||
|
||||
// SHA1, SHA256, and SHA512 instructions were added to SPARC T-series at different times
|
||||
if (has_sha1() || has_sha256() || has_sha512()) {
|
||||
if (UseVIS > 0) { // SHA intrinsics use VIS1 instructions
|
||||
|
|
|
@ -172,7 +172,9 @@ bool AbstractInterpreter::can_be_compiled(methodHandle m) {
|
|||
case Interpreter::java_lang_math_log10 : // fall thru
|
||||
case Interpreter::java_lang_math_sqrt : // fall thru
|
||||
case Interpreter::java_lang_math_pow : // fall thru
|
||||
case Interpreter::java_lang_math_exp :
|
||||
case Interpreter::java_lang_math_exp : // fall thru
|
||||
case Interpreter::java_lang_math_fmaD : // fall thru
|
||||
case Interpreter::java_lang_math_fmaF :
|
||||
return false;
|
||||
default:
|
||||
return true;
|
||||
|
|
|
@ -4769,6 +4769,22 @@ void Assembler::vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
|
|||
emit_int8((unsigned char)(0xC0 | encode));
|
||||
}
|
||||
|
||||
void Assembler::vfmadd231sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  // Register-register form only. Emits a VEX prefix (SIMD prefix 66, opcode
  // map 0F 38, vex_w set), the opcode byte 0xB9 and a trailing byte built from
  // 0xC0 and the register encoding returned by vex_prefix_and_encode().
  // Requires FMA support on the running CPU.
  assert(VM_Version::supports_fma(), "");
  InstructionAttr attributes(AVX_128bit,
                             /* vex_w */ true,
                             /* legacy_mode */ false,
                             /* no_mask_reg */ false,
                             /* uses_vl */ false);
  const int dst_enc  = dst->encoding();
  const int src1_enc = src1->encoding();
  const int src2_enc = src2->encoding();
  const int reg_bits = vex_prefix_and_encode(dst_enc, src1_enc, src2_enc,
                                             VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
  emit_int8((unsigned char)0xB9);
  emit_int8((unsigned char)(0xC0 | reg_bits));
}
|
||||
|
||||
void Assembler::vfmadd231ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  // Register-register form only. Emits a VEX prefix (SIMD prefix 66, opcode
  // map 0F 38, vex_w clear), the opcode byte 0xB9 and a trailing byte built
  // from 0xC0 and the register encoding returned by vex_prefix_and_encode().
  // Requires FMA support on the running CPU.
  assert(VM_Version::supports_fma(), "");
  InstructionAttr attributes(AVX_128bit,
                             /* vex_w */ false,
                             /* legacy_mode */ false,
                             /* no_mask_reg */ false,
                             /* uses_vl */ false);
  const int dst_enc  = dst->encoding();
  const int src1_enc = src1->encoding();
  const int src2_enc = src2->encoding();
  const int reg_bits = vex_prefix_and_encode(dst_enc, src1_enc, src2_enc,
                                             VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
  emit_int8((unsigned char)0xB9);
  emit_int8((unsigned char)(0xC0 | reg_bits));
}
|
||||
|
||||
void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) {
|
||||
assert(VM_Version::supports_avx(), "");
|
||||
InstructionMark im(this);
|
||||
|
|
|
@ -1860,6 +1860,8 @@ private:
|
|||
void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
|
||||
void vdivss(XMMRegister dst, XMMRegister nds, Address src);
|
||||
void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src);
|
||||
void vfmadd231sd(XMMRegister dst, XMMRegister nds, XMMRegister src);
|
||||
void vfmadd231ss(XMMRegister dst, XMMRegister nds, XMMRegister src);
|
||||
void vmulsd(XMMRegister dst, XMMRegister nds, Address src);
|
||||
void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
|
||||
void vmulss(XMMRegister dst, XMMRegister nds, Address src);
|
||||
|
|
|
@ -1345,6 +1345,18 @@ void LIR_Assembler::emit_op3(LIR_Op3* op) {
|
|||
op->result_opr(),
|
||||
op->info());
|
||||
break;
|
||||
case lir_fmad:
|
||||
__ fmad(op->result_opr()->as_xmm_double_reg(),
|
||||
op->in_opr1()->as_xmm_double_reg(),
|
||||
op->in_opr2()->as_xmm_double_reg(),
|
||||
op->in_opr3()->as_xmm_double_reg());
|
||||
break;
|
||||
case lir_fmaf:
|
||||
__ fmaf(op->result_opr()->as_xmm_float_reg(),
|
||||
op->in_opr1()->as_xmm_float_reg(),
|
||||
op->in_opr2()->as_xmm_float_reg(),
|
||||
op->in_opr3()->as_xmm_float_reg());
|
||||
break;
|
||||
default: ShouldNotReachHere(); break;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -806,6 +806,32 @@ void LIRGenerator::do_CompareAndSwap(Intrinsic* x, ValueType* type) {
|
|||
}
|
||||
}
|
||||
|
||||
void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) {
  // Lowers the three-argument FMA intrinsic (_fmaD or _fmaF) to one LIR
  // fmad/fmaf operation whose result goes into the register locked for x.
  assert(x->number_of_arguments() == 3, "wrong type");
  assert(UseFMA, "Needs FMA instructions support.");

  LIRItem first_arg(x->argument_at(0), this);
  LIRItem second_arg(x->argument_at(1), this);
  LIRItem third_arg(x->argument_at(2), this);

  // The third argument is flagged as destroyed before any operand is loaded.
  third_arg.set_destroys_register();

  first_arg.load_item();
  second_arg.load_item();
  third_arg.load_item();

  LIR_Opr in0 = first_arg.result();
  LIR_Opr in1 = second_arg.result();
  LIR_Opr in2 = third_arg.result();
  LIR_Opr out = rlock_result(x);

  if (x->id() == vmIntrinsics::_fmaD) {
    __ fmad(in0, in1, in2, out);
  } else if (x->id() == vmIntrinsics::_fmaF) {
    __ fmaf(in0, in1, in2, out);
  } else {
    ShouldNotReachHere();
  }
}
|
||||
|
||||
|
||||
void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
|
||||
assert(x->number_of_arguments() == 1 || (x->number_of_arguments() == 2 && x->id() == vmIntrinsics::_dpow), "wrong type");
|
||||
|
|
|
@ -3147,6 +3147,24 @@ void MacroAssembler::fremr(Register tmp) {
|
|||
fpop();
|
||||
}
|
||||
|
||||
// dst = c = a * b + c
|
||||
void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
|
||||
Assembler::vfmadd231sd(c, a, b);
|
||||
if (dst != c) {
|
||||
movdbl(dst, c);
|
||||
}
|
||||
}
|
||||
|
||||
// dst = c = a * b + c
|
||||
void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
|
||||
Assembler::vfmadd231ss(c, a, b);
|
||||
if (dst != c) {
|
||||
movflt(dst, c);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
void MacroAssembler::incrementl(AddressLiteral dst) {
|
||||
if (reachable(dst)) {
|
||||
|
|
|
@ -449,6 +449,10 @@ class MacroAssembler: public Assembler {
|
|||
// tmp is a temporary register, if none is available use noreg
|
||||
void fremr(Register tmp);
|
||||
|
||||
// dst = c = a * b + c
|
||||
void fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
|
||||
void fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
|
||||
|
||||
|
||||
// same as fcmp2int, but using SSE2
|
||||
void cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);
|
||||
|
|
|
@ -341,6 +341,27 @@ address TemplateInterpreterGenerator::generate_math_entry(AbstractInterpreter::M
|
|||
// [ lo(arg) ]
|
||||
// [ hi(arg) ]
|
||||
//
|
||||
if (kind == Interpreter::java_lang_math_fmaD) {
|
||||
__ movdbl(xmm2, Address(rsp, 5 * wordSize));
|
||||
__ movdbl(xmm1, Address(rsp, 3 * wordSize));
|
||||
__ movdbl(xmm0, Address(rsp, 1 * wordSize));
|
||||
__ fmad(xmm0, xmm1, xmm2, xmm0);
|
||||
__ pop(rdi); // get return address
|
||||
__ mov(rsp, rsi); // set sp to sender sp
|
||||
__ jmp(rdi);
|
||||
|
||||
return entry_point;
|
||||
} else if (kind == Interpreter::java_lang_math_fmaF) {
|
||||
__ movflt(xmm2, Address(rsp, 3 * wordSize));
|
||||
__ movflt(xmm1, Address(rsp, 2 * wordSize));
|
||||
__ movflt(xmm0, Address(rsp, 1 * wordSize));
|
||||
__ fmaf(xmm0, xmm1, xmm2, xmm0);
|
||||
__ pop(rdi); // get return address
|
||||
__ mov(rsp, rsi); // set sp to sender sp
|
||||
__ jmp(rdi);
|
||||
|
||||
return entry_point;
|
||||
}
|
||||
|
||||
__ fld_d(Address(rsp, 1*wordSize));
|
||||
switch (kind) {
|
||||
|
|
|
@ -369,8 +369,17 @@ address TemplateInterpreterGenerator::generate_math_entry(AbstractInterpreter::M
|
|||
// [ hi(arg) ]
|
||||
//
|
||||
|
||||
|
||||
if (kind == Interpreter::java_lang_math_sqrt) {
|
||||
if (kind == Interpreter::java_lang_math_fmaD) {
|
||||
__ movdbl(xmm0, Address(rsp, wordSize));
|
||||
__ movdbl(xmm1, Address(rsp, 3 * wordSize));
|
||||
__ movdbl(xmm2, Address(rsp, 5 * wordSize));
|
||||
__ fmad(xmm0, xmm1, xmm2, xmm0);
|
||||
} else if (kind == Interpreter::java_lang_math_fmaF) {
|
||||
__ movflt(xmm0, Address(rsp, wordSize));
|
||||
__ movflt(xmm1, Address(rsp, 2 * wordSize));
|
||||
__ movflt(xmm2, Address(rsp, 3 * wordSize));
|
||||
__ fmaf(xmm0, xmm1, xmm2, xmm0);
|
||||
} else if (kind == Interpreter::java_lang_math_sqrt) {
|
||||
__ sqrtsd(xmm0, Address(rsp, wordSize));
|
||||
} else if (kind == Interpreter::java_lang_math_exp) {
|
||||
__ movdbl(xmm0, Address(rsp, wordSize));
|
||||
|
|
|
@ -73,6 +73,7 @@
|
|||
#define VM_LONG_CONSTANTS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant) \
|
||||
declare_preprocessor_constant("VM_Version::CPU_AVX512BW", CPU_AVX512BW) \
|
||||
declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL) \
|
||||
declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA)
|
||||
declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA) \
|
||||
declare_preprocessor_constant("VM_Version::CPU_FMA", CPU_FMA)
|
||||
|
||||
#endif // CPU_X86_VM_VMSTRUCTS_X86_HPP
|
||||
|
|
|
@ -578,7 +578,7 @@ void VM_Version::get_processor_features() {
|
|||
}
|
||||
|
||||
char buf[256];
|
||||
jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
cores_per_cpu(), threads_per_core(),
|
||||
cpu_family(), _model, _stepping,
|
||||
(supports_cmov() ? ", cmov" : ""),
|
||||
|
@ -610,7 +610,8 @@ void VM_Version::get_processor_features() {
|
|||
(supports_bmi2() ? ", bmi2" : ""),
|
||||
(supports_adx() ? ", adx" : ""),
|
||||
(supports_evex() ? ", evex" : ""),
|
||||
(supports_sha() ? ", sha" : ""));
|
||||
(supports_sha() ? ", sha" : ""),
|
||||
(supports_fma() ? ", fma" : ""));
|
||||
_features_string = os::strdup(buf);
|
||||
|
||||
// UseSSE is set to the smaller of what hardware supports and what
|
||||
|
@ -732,6 +733,15 @@ void VM_Version::get_processor_features() {
|
|||
FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
|
||||
}
|
||||
|
||||
if (supports_fma() && UseSSE >= 2) {
|
||||
if (FLAG_IS_DEFAULT(UseFMA)) {
|
||||
UseFMA = true;
|
||||
}
|
||||
} else if (UseFMA) {
|
||||
warning("FMA instructions are not available on this CPU");
|
||||
FLAG_SET_DEFAULT(UseFMA, false);
|
||||
}
|
||||
|
||||
if (supports_sha() LP64_ONLY(|| supports_avx2() && supports_bmi2())) {
|
||||
if (FLAG_IS_DEFAULT(UseSHA)) {
|
||||
UseSHA = true;
|
||||
|
@ -773,7 +783,6 @@ void VM_Version::get_processor_features() {
|
|||
FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
|
||||
}
|
||||
|
||||
// Adjust RTM (Restricted Transactional Memory) flags
|
||||
if (!supports_rtm() && UseRTMLocking) {
|
||||
// Can't continue because UseRTMLocking affects UseBiasedLocking flag
|
||||
// setting during arguments processing. See use_biased_locking().
|
||||
|
|
|
@ -74,7 +74,8 @@ class VM_Version : public Abstract_VM_Version {
|
|||
: 1,
|
||||
ssse3 : 1,
|
||||
cid : 1,
|
||||
: 2,
|
||||
: 1,
|
||||
fma : 1,
|
||||
cmpxchg16: 1,
|
||||
: 4,
|
||||
dca : 1,
|
||||
|
@ -289,6 +290,7 @@ protected:
|
|||
#define CPU_AVX512BW ((uint64_t)UCONST64(0x100000000)) // enums are limited to 31 bit
|
||||
#define CPU_AVX512VL ((uint64_t)UCONST64(0x200000000)) // EVEX instructions with smaller vector length
|
||||
#define CPU_SHA ((uint64_t)UCONST64(0x400000000)) // SHA instructions
|
||||
#define CPU_FMA ((uint64_t)UCONST64(0x800000000)) // FMA instructions
|
||||
|
||||
enum Extended_Family {
|
||||
// AMD
|
||||
|
@ -522,6 +524,8 @@ protected:
|
|||
result |= CPU_SHA;
|
||||
if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
|
||||
result |= CPU_LZCNT;
|
||||
if (_cpuid_info.std_cpuid1_ecx.bits.fma != 0)
|
||||
result |= CPU_FMA;
|
||||
// for Intel, ecx.bits.misalignsse bit (bit 8) indicates support for prefetchw
|
||||
if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
|
||||
result |= CPU_3DNOW_PREFETCH;
|
||||
|
@ -726,6 +730,7 @@ public:
|
|||
// True when AVX2 is reported and EVEX is not.
static bool supports_avx256only() {
  if (!supports_avx2()) {
    return false;
  }
  return !supports_evex();
}
|
||||
// True when AVX2 or AVX is reported and EVEX is not.
static bool supports_avxonly() {
  if (!supports_avx2() && !supports_avx()) {
    return false;
  }
  return !supports_evex();
}
|
||||
// True when the CPU_SHA bit is set in the _features mask.
static bool supports_sha() {
  const bool sha_bit_set = (_features & CPU_SHA) != 0;
  return sha_bit_set;
}
|
||||
// True when the CPU_FMA bit is set in the _features mask.
static bool supports_fma() {
  const bool fma_bit_set = (_features & CPU_FMA) != 0;
  return fma_bit_set;
}
|
||||
// Intel features
|
||||
// True for an Intel CPU whose extended family equals CPU_FAMILY_INTEL_CORE.
static bool is_intel_family_core() {
  if (!is_intel()) {
    return false;
  }
  return extended_cpu_family() == CPU_FAMILY_INTEL_CORE;
}
|
||||
|
|
|
@ -3113,6 +3113,30 @@ instruct onspinwait() %{
|
|||
ins_pipe(pipe_slow);
|
||||
%}
|
||||
|
||||
// Double fused multiply-add: c = a * b + c.
// Matches an FmaD node whose first input is c and whose second input is the
// Binary(a, b) pair; c is both the addend and the result. Only available
// when UseFMA is set.
instruct fmaD_reg(regD a, regD b, regD c) %{
  predicate(UseFMA);
  match(Set c (FmaD c (Binary a b)));

  ins_cost(150);
  format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
  ins_encode %{
    __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// Float fused multiply-add: c = a * b + c.
// Matches an FmaF node whose first input is c and whose second input is the
// Binary(a, b) pair; c is both the addend and the result. Only available
// when UseFMA is set.
instruct fmaF_reg(regF a, regF b, regF c) %{
  predicate(UseFMA);
  match(Set c (FmaF c (Binary a b)));

  ins_cost(150);
  format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
  ins_encode %{
    __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
  %}
  ins_pipe( pipe_slow );
%}
|
||||
|
||||
// ====================VECTOR INSTRUCTIONS=====================================
|
||||
|
||||
// Load vectors (4 bytes long)
|
||||
|
|
|
@ -205,7 +205,8 @@ public class AMD64 extends Architecture {
|
|||
AVX512CD,
|
||||
AVX512BW,
|
||||
AVX512VL,
|
||||
SHA
|
||||
SHA,
|
||||
FMA
|
||||
}
|
||||
|
||||
private final EnumSet<CPUFeature> features;
|
||||
|
|
|
@ -124,6 +124,9 @@ public class AMD64HotSpotJVMCIBackendFactory implements HotSpotJVMCIBackendFacto
|
|||
if ((config.vmVersionFeatures & config.amd64SHA) != 0) {
|
||||
features.add(AMD64.CPUFeature.SHA);
|
||||
}
|
||||
if ((config.vmVersionFeatures & config.amd64FMA) != 0) {
|
||||
features.add(AMD64.CPUFeature.FMA);
|
||||
}
|
||||
return features;
|
||||
}
|
||||
|
||||
|
|
|
@ -78,4 +78,5 @@ class AMD64HotSpotVMConfig extends HotSpotVMConfigAccess {
|
|||
final long amd64AVX512BW = getConstant("VM_Version::CPU_AVX512BW", Long.class);
|
||||
final long amd64AVX512VL = getConstant("VM_Version::CPU_AVX512VL", Long.class);
|
||||
final long amd64SHA = getConstant("VM_Version::CPU_SHA", Long.class);
|
||||
final long amd64FMA = getConstant("VM_Version::CPU_FMA", Long.class);
|
||||
}
|
||||
|
|
|
@ -4038,6 +4038,8 @@ int MatchRule::is_expensive() const {
|
|||
strcmp(opType,"EncodeP")==0 ||
|
||||
strcmp(opType,"EncodePKlass")==0 ||
|
||||
strcmp(opType,"DecodeNKlass")==0 ||
|
||||
strcmp(opType,"FmaD") == 0 ||
|
||||
strcmp(opType,"FmaF") == 0 ||
|
||||
strcmp(opType,"RoundDouble")==0 ||
|
||||
strcmp(opType,"RoundFloat")==0 ||
|
||||
strcmp(opType,"ReverseBytesI")==0 ||
|
||||
|
|
|
@ -162,6 +162,8 @@ bool Compiler::is_intrinsic_supported(const methodHandle& method) {
|
|||
case vmIntrinsics::_dlog10:
|
||||
case vmIntrinsics::_dexp:
|
||||
case vmIntrinsics::_dpow:
|
||||
case vmIntrinsics::_fmaD:
|
||||
case vmIntrinsics::_fmaF:
|
||||
case vmIntrinsics::_getObject:
|
||||
case vmIntrinsics::_getBoolean:
|
||||
case vmIntrinsics::_getByte:
|
||||
|
|
|
@ -666,7 +666,9 @@ void LIR_OpVisitState::visit(LIR_Op* op) {
|
|||
|
||||
// LIR_Op3
|
||||
case lir_idiv:
|
||||
case lir_irem: {
|
||||
case lir_irem:
|
||||
case lir_fmad:
|
||||
case lir_fmaf: {
|
||||
assert(op->as_Op3() != NULL, "must be");
|
||||
LIR_Op3* op3= (LIR_Op3*)op;
|
||||
|
||||
|
@ -1663,6 +1665,8 @@ const char * LIR_Op::name() const {
|
|||
// LIR_Op3
|
||||
case lir_idiv: s = "idiv"; break;
|
||||
case lir_irem: s = "irem"; break;
|
||||
case lir_fmad: s = "fmad"; break;
|
||||
case lir_fmaf: s = "fmaf"; break;
|
||||
// LIR_OpJavaCall
|
||||
case lir_static_call: s = "static"; break;
|
||||
case lir_optvirtual_call: s = "optvirtual"; break;
|
||||
|
|
|
@ -956,6 +956,8 @@ enum LIR_Code {
|
|||
, begin_op3
|
||||
, lir_idiv
|
||||
, lir_irem
|
||||
, lir_fmad
|
||||
, lir_fmaf
|
||||
, end_op3
|
||||
, begin_opJavaCall
|
||||
, lir_static_call
|
||||
|
@ -2149,6 +2151,8 @@ class LIR_List: public CompilationResourceObj {
|
|||
|
||||
// Appends a lir_abs LIR_Op2 built from (from, tmp, to), in that operand order.
void abs (LIR_Opr from, LIR_Opr to, LIR_Opr tmp) {
  LIR_Op2* abs_op = new LIR_Op2(lir_abs , from, tmp, to);
  append(abs_op);
}
|
||||
// Appends a lir_sqrt LIR_Op2 built from (from, tmp, to), in that operand order.
void sqrt(LIR_Opr from, LIR_Opr to, LIR_Opr tmp) {
  LIR_Op2* sqrt_op = new LIR_Op2(lir_sqrt, from, tmp, to);
  append(sqrt_op);
}
|
||||
// Appends a lir_fmad LIR_Op3 with inputs (from, from1, from2) and result `to`.
void fmad(LIR_Opr from, LIR_Opr from1, LIR_Opr from2, LIR_Opr to) {
  LIR_Op3* fma_op = new LIR_Op3(lir_fmad, from, from1, from2, to);
  append(fma_op);
}
|
||||
// Appends a lir_fmaf LIR_Op3 with inputs (from, from1, from2) and result `to`.
void fmaf(LIR_Opr from, LIR_Opr from1, LIR_Opr from2, LIR_Opr to) {
  LIR_Op3* fma_op = new LIR_Op3(lir_fmaf, from, from1, from2, to);
  append(fma_op);
}
|
||||
// Appends a lir_log10 LIR_Op2 built from (from, illegalOpr, to, tmp).
void log10 (LIR_Opr from, LIR_Opr to, LIR_Opr tmp) {
  LIR_Op2* log10_op = new LIR_Op2(lir_log10, from, LIR_OprFact::illegalOpr, to, tmp);
  append(log10_op);
}
|
||||
// Appends a lir_tan LIR_Op2 built from (from, tmp1, to, tmp2).
void tan (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) {
  LIR_Op2* tan_op = new LIR_Op2(lir_tan , from, tmp1, to, tmp2);
  append(tan_op);
}
|
||||
|
||||
|
|
|
@ -3147,6 +3147,9 @@ void LIRGenerator::do_Intrinsic(Intrinsic* x) {
|
|||
case vmIntrinsics::_dpow : do_MathIntrinsic(x); break;
|
||||
case vmIntrinsics::_arraycopy: do_ArrayCopy(x); break;
|
||||
|
||||
case vmIntrinsics::_fmaD: do_FmaIntrinsic(x); break;
|
||||
case vmIntrinsics::_fmaF: do_FmaIntrinsic(x); break;
|
||||
|
||||
// java.nio.Buffer.checkIndex
|
||||
case vmIntrinsics::_checkIndex: do_NIOCheckIndex(x); break;
|
||||
|
||||
|
|
|
@ -245,6 +245,7 @@ class LIRGenerator: public InstructionVisitor, public BlockClosure {
|
|||
void do_isPrimitive(Intrinsic* x);
|
||||
void do_getClass(Intrinsic* x);
|
||||
void do_currentThread(Intrinsic* x);
|
||||
void do_FmaIntrinsic(Intrinsic* x);
|
||||
void do_MathIntrinsic(Intrinsic* x);
|
||||
void do_LibmIntrinsic(Intrinsic* x);
|
||||
void do_ArrayCopy(Intrinsic* x);
|
||||
|
|
|
@ -355,6 +355,8 @@ bool vmIntrinsics::preserves_state(vmIntrinsics::ID id) {
|
|||
case vmIntrinsics::_updateBytesCRC32:
|
||||
case vmIntrinsics::_updateByteBufferCRC32:
|
||||
case vmIntrinsics::_vectorizedMismatch:
|
||||
case vmIntrinsics::_fmaD:
|
||||
case vmIntrinsics::_fmaF:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
|
@ -387,6 +389,8 @@ bool vmIntrinsics::can_trap(vmIntrinsics::ID id) {
|
|||
case vmIntrinsics::_updateBytesCRC32:
|
||||
case vmIntrinsics::_updateByteBufferCRC32:
|
||||
case vmIntrinsics::_vectorizedMismatch:
|
||||
case vmIntrinsics::_fmaD:
|
||||
case vmIntrinsics::_fmaF:
|
||||
return false;
|
||||
default:
|
||||
return true;
|
||||
|
@ -535,6 +539,10 @@ bool vmIntrinsics::is_disabled_by_flags(vmIntrinsics::ID id) {
|
|||
case vmIntrinsics::_doubleToLongBits:
|
||||
if (!InlineMathNatives) return true;
|
||||
break;
|
||||
case vmIntrinsics::_fmaD:
|
||||
case vmIntrinsics::_fmaF:
|
||||
if (!InlineMathNatives || !UseFMA) return true;
|
||||
break;
|
||||
case vmIntrinsics::_arraycopy:
|
||||
if (!InlineArrayCopy) return true;
|
||||
break;
|
||||
|
|
|
@ -755,8 +755,10 @@
|
|||
do_class(java_lang_Math, "java/lang/Math") \
|
||||
do_class(java_lang_StrictMath, "java/lang/StrictMath") \
|
||||
do_signature(double2_double_signature, "(DD)D") \
|
||||
do_signature(double3_double_signature, "(DDD)D") \
|
||||
do_signature(float3_float_signature, "(FFF)F") \
|
||||
do_signature(int2_int_signature, "(II)I") \
|
||||
do_signature(long2_long_signature, "(JJ)J") \
|
||||
do_signature(long2_long_signature, "(JJ)J") \
|
||||
\
|
||||
/* here are the math names, all together: */ \
|
||||
do_name(abs_name,"abs") do_name(sin_name,"sin") do_name(cos_name,"cos") \
|
||||
|
@ -770,6 +772,7 @@
|
|||
do_name(multiplyExact_name,"multiplyExact") \
|
||||
do_name(negateExact_name,"negateExact") \
|
||||
do_name(subtractExact_name,"subtractExact") \
|
||||
do_name(fma_name, "fma") \
|
||||
\
|
||||
do_intrinsic(_dabs, java_lang_Math, abs_name, double_double_signature, F_S) \
|
||||
do_intrinsic(_dsin, java_lang_Math, sin_name, double_double_signature, F_S) \
|
||||
|
@ -795,6 +798,8 @@
|
|||
do_intrinsic(_negateExactL, java_lang_Math, negateExact_name, long_long_signature, F_S) \
|
||||
do_intrinsic(_subtractExactI, java_lang_Math, subtractExact_name, int2_int_signature, F_S) \
|
||||
do_intrinsic(_subtractExactL, java_lang_Math, subtractExact_name, long2_long_signature, F_S) \
|
||||
do_intrinsic(_fmaD, java_lang_Math, fma_name, double3_double_signature, F_S) \
|
||||
do_intrinsic(_fmaF, java_lang_Math, fma_name, float3_float_signature, F_S) \
|
||||
\
|
||||
do_intrinsic(_floatToRawIntBits, java_lang_Float, floatToRawIntBits_name, float_int_signature, F_S) \
|
||||
do_name( floatToRawIntBits_name, "floatToRawIntBits") \
|
||||
|
|
|
@ -194,6 +194,13 @@ AbstractInterpreter::MethodKind AbstractInterpreter::method_kind(methodHandle m)
|
|||
return java_lang_ref_reference_get;
|
||||
}
|
||||
|
||||
if (UseFMA) {
|
||||
switch (m->intrinsic_id()) {
|
||||
case vmIntrinsics::_fmaD: return java_lang_math_fmaD;
|
||||
case vmIntrinsics::_fmaF: return java_lang_math_fmaF;
|
||||
}
|
||||
}
|
||||
|
||||
// Accessor method?
|
||||
if (m->is_getter()) {
|
||||
// TODO: We should have used ::is_accessor above, but fast accessors in Zero expect only getters.
|
||||
|
@ -281,6 +288,8 @@ void AbstractInterpreter::print_method_kind(MethodKind kind) {
|
|||
case java_lang_math_sqrt : tty->print("java_lang_math_sqrt" ); break;
|
||||
case java_lang_math_log : tty->print("java_lang_math_log" ); break;
|
||||
case java_lang_math_log10 : tty->print("java_lang_math_log10" ); break;
|
||||
case java_lang_math_fmaD : tty->print("java_lang_math_fmaD" ); break;
|
||||
case java_lang_math_fmaF : tty->print("java_lang_math_fmaF" ); break;
|
||||
case java_util_zip_CRC32_update : tty->print("java_util_zip_CRC32_update"); break;
|
||||
case java_util_zip_CRC32_updateBytes : tty->print("java_util_zip_CRC32_updateBytes"); break;
|
||||
case java_util_zip_CRC32_updateByteBuffer : tty->print("java_util_zip_CRC32_updateByteBuffer"); break;
|
||||
|
|
|
@ -76,6 +76,8 @@ class AbstractInterpreter: AllStatic {
|
|||
java_lang_math_log10, // implementation of java.lang.Math.log10 (x)
|
||||
java_lang_math_pow, // implementation of java.lang.Math.pow (x,y)
|
||||
java_lang_math_exp, // implementation of java.lang.Math.exp (x)
|
||||
java_lang_math_fmaF, // implementation of java.lang.Math.fma (x, y, z)
|
||||
java_lang_math_fmaD, // implementation of java.lang.Math.fma (x, y, z)
|
||||
java_lang_ref_reference_get, // implementation of java.lang.ref.Reference.get()
|
||||
java_util_zip_CRC32_update, // implementation of java.util.zip.CRC32.update()
|
||||
java_util_zip_CRC32_updateBytes, // implementation of java.util.zip.CRC32.updateBytes()
|
||||
|
|
|
@ -239,6 +239,10 @@ void TemplateInterpreterGenerator::generate_all() {
|
|||
method_entry(java_lang_math_log10)
|
||||
method_entry(java_lang_math_exp )
|
||||
method_entry(java_lang_math_pow )
|
||||
if (UseFMA) {
|
||||
method_entry(java_lang_math_fmaF)
|
||||
method_entry(java_lang_math_fmaD)
|
||||
}
|
||||
method_entry(java_lang_ref_reference_get)
|
||||
|
||||
AbstractInterpreter::initialize_method_handle_entries();
|
||||
|
@ -445,7 +449,9 @@ address TemplateInterpreterGenerator::generate_method_entry(
|
|||
case Interpreter::java_lang_math_log10 : // fall thru
|
||||
case Interpreter::java_lang_math_sqrt : // fall thru
|
||||
case Interpreter::java_lang_math_pow : // fall thru
|
||||
case Interpreter::java_lang_math_exp : entry_point = generate_math_entry(kind); break;
|
||||
case Interpreter::java_lang_math_exp : // fall thru
|
||||
case Interpreter::java_lang_math_fmaD : // fall thru
|
||||
case Interpreter::java_lang_math_fmaF : entry_point = generate_math_entry(kind); break;
|
||||
case Interpreter::java_lang_ref_reference_get
|
||||
: entry_point = generate_Reference_get_entry(); break;
|
||||
case Interpreter::java_util_zip_CRC32_update
|
||||
|
|
|
@ -660,7 +660,8 @@
|
|||
#define VM_LONG_CONSTANTS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant) \
|
||||
declare_preprocessor_constant("VM_Version::CPU_AVX512BW", CPU_AVX512BW) \
|
||||
declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL) \
|
||||
declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA)
|
||||
declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA) \
|
||||
declare_preprocessor_constant("VM_Version::CPU_FMA", CPU_FMA)
|
||||
|
||||
#endif
|
||||
|
||||
|
|
|
@ -416,6 +416,12 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method, bool is_virt
|
|||
case vmIntrinsics::_onSpinWait:
|
||||
if (!Matcher::match_rule_supported(Op_OnSpinWait)) return false;
|
||||
break;
|
||||
case vmIntrinsics::_fmaD:
|
||||
if (!UseFMA || !Matcher::match_rule_supported(Op_FmaD)) return false;
|
||||
break;
|
||||
case vmIntrinsics::_fmaF:
|
||||
if (!UseFMA || !Matcher::match_rule_supported(Op_FmaF)) return false;
|
||||
break;
|
||||
case vmIntrinsics::_hashCode:
|
||||
case vmIntrinsics::_identityHashCode:
|
||||
case vmIntrinsics::_getClass:
|
||||
|
|
|
@ -151,6 +151,8 @@ macro(EncodeP)
|
|||
macro(EncodePKlass)
|
||||
macro(FastLock)
|
||||
macro(FastUnlock)
|
||||
macro(FmaD)
|
||||
macro(FmaF)
|
||||
macro(Goto)
|
||||
macro(Halt)
|
||||
macro(HasNegatives)
|
||||
|
|
|
@ -317,6 +317,7 @@ class LibraryCallKit : public GraphKit {
|
|||
bool inline_montgomeryMultiply();
|
||||
bool inline_montgomerySquare();
|
||||
bool inline_vectorizedMismatch();
|
||||
bool inline_fma(vmIntrinsics::ID id);
|
||||
|
||||
bool inline_profileBoolean();
|
||||
bool inline_isCompileConstant();
|
||||
|
@ -825,6 +826,10 @@ bool LibraryCallKit::try_to_inline(int predicate) {
|
|||
case vmIntrinsics::_hasNegatives:
|
||||
return inline_hasNegatives();
|
||||
|
||||
case vmIntrinsics::_fmaD:
|
||||
case vmIntrinsics::_fmaF:
|
||||
return inline_fma(intrinsic_id());
|
||||
|
||||
default:
|
||||
// If you get here, it may be that someone has added a new intrinsic
|
||||
// to the list in vmSymbols.hpp without implementing it here.
|
||||
|
@ -6657,6 +6662,35 @@ Node* LibraryCallKit::inline_digestBase_implCompressMB_predicate(int predicate)
|
|||
return instof_false; // even if it is NULL
|
||||
}
|
||||
|
||||
//-------------inline_fma-----------------------------------
|
||||
bool LibraryCallKit::inline_fma(vmIntrinsics::ID id) {
|
||||
Node *a = NULL;
|
||||
Node *b = NULL;
|
||||
Node *c = NULL;
|
||||
Node* result = NULL;
|
||||
switch (id) {
|
||||
case vmIntrinsics::_fmaD:
|
||||
assert(callee()->signature()->size() == 6, "fma has 3 parameters of size 2 each.");
|
||||
// no receiver since it is static method
|
||||
a = round_double_node(argument(0));
|
||||
b = round_double_node(argument(2));
|
||||
c = round_double_node(argument(4));
|
||||
result = _gvn.transform(new FmaDNode(control(), a, b, c));
|
||||
break;
|
||||
case vmIntrinsics::_fmaF:
|
||||
assert(callee()->signature()->size() == 3, "fma has 3 parameters of size 1 each.");
|
||||
a = argument(0);
|
||||
b = argument(1);
|
||||
c = argument(2);
|
||||
result = _gvn.transform(new FmaFNode(control(), a, b, c));
|
||||
break;
|
||||
default:
|
||||
fatal_unexpected_iid(id); break;
|
||||
}
|
||||
set_result(result);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool LibraryCallKit::inline_profileBoolean() {
|
||||
Node* counts = argument(1);
|
||||
const TypeAryPtr* ary = NULL;
|
||||
|
|
|
@ -2117,6 +2117,8 @@ void Matcher::find_shared( Node *n ) {
|
|||
case Op_StrInflatedCopy:
|
||||
case Op_StrCompressedCopy:
|
||||
case Op_EncodeISOArray:
|
||||
case Op_FmaD:
|
||||
case Op_FmaF:
|
||||
set_shared(n); // Force result into register (it will be anyways)
|
||||
break;
|
||||
case Op_ConP: { // Convert pointers above the centerline to NUL
|
||||
|
@ -2305,6 +2307,15 @@ void Matcher::find_shared( Node *n ) {
|
|||
n->del_req(4);
|
||||
break;
|
||||
}
|
||||
case Op_FmaD:
|
||||
case Op_FmaF: {
|
||||
// Restructure into a binary tree for Matching.
|
||||
Node* pair = new BinaryNode(n->in(1), n->in(2));
|
||||
n->set_req(2, pair);
|
||||
n->set_req(1, n->in(3));
|
||||
n->del_req(3);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -1343,3 +1343,47 @@ const Type* URShiftLNode::Value(PhaseGVN* phase) const {
|
|||
|
||||
return TypeLong::LONG; // Give up
|
||||
}
|
||||
|
||||
//=============================================================================
|
||||
//------------------------------Value------------------------------------------
|
||||
const Type* FmaDNode::Value(PhaseGVN* phase) const {
  // Inputs 1..3 are examined in order. The first one whose type is TOP makes
  // the result TOP; the first one that is not a double constant makes the
  // result the generic DOUBLE type. Only when all three are constants is the
  // value folded, and only if the host advertises IEC 559 arithmetic.
  for (uint idx = 1; idx <= 3; idx++) {
    const Type* operand = phase->type(in(idx));
    if (operand == Type::TOP) return Type::TOP;
    if (operand->base() != Type::DoubleCon) return Type::DOUBLE;
  }
#ifndef __STDC_IEC_559__
  return Type::DOUBLE;
#else
  const double factor1 = phase->type(in(1))->getd();
  const double factor2 = phase->type(in(2))->getd();
  const double addend  = phase->type(in(3))->getd();
  return TypeD::make(fma(factor1, factor2, addend));
#endif
}
|
||||
|
||||
//=============================================================================
|
||||
//------------------------------Value------------------------------------------
|
||||
const Type* FmaFNode::Value(PhaseGVN* phase) const {
  // Inputs 1..3 are examined in order. The first one whose type is TOP makes
  // the result TOP; the first one that is not a float constant makes the
  // result the generic FLOAT type. Only when all three are constants is the
  // value folded, and only if the host advertises IEC 559 arithmetic.
  const Type *t1 = phase->type(in(1));
  if (t1 == Type::TOP) return Type::TOP;
  if (t1->base() != Type::FloatCon) return Type::FLOAT;
  const Type *t2 = phase->type(in(2));
  if (t2 == Type::TOP) return Type::TOP;
  if (t2->base() != Type::FloatCon) return Type::FLOAT;
  const Type *t3 = phase->type(in(3));
  if (t3 == Type::TOP) return Type::TOP;
  if (t3->base() != Type::FloatCon) return Type::FLOAT;
#ifndef __STDC_IEC_559__
  return Type::FLOAT;
#else
  float f1 = t1->getf();
  float f2 = t2->getf();
  float f3 = t3->getf();
  // Call the single-precision routine explicitly. If only the C prototype
  // double fma(double, double, double) is visible, fma(f1, f2, f3) would
  // promote the operands, round the sum to double and then round again to
  // float, which can differ from a float FMA that rounds exactly once.
  return TypeF::make(fmaf(f1, f2, f3));
#endif
}
|
||||
|
|
|
@ -263,4 +263,26 @@ public:
|
|||
virtual uint ideal_reg() const { return Op_RegL; }
|
||||
};
|
||||
|
||||
//------------------------------FmaDNode--------------------------------------
|
||||
// fused-multiply-add double
|
||||
class FmaDNode : public Node {
|
||||
public:
|
||||
FmaDNode(Node *c, Node *in1, Node *in2, Node *in3) : Node(c, in1, in2, in3) {}
|
||||
virtual int Opcode() const;
|
||||
const Type *bottom_type() const { return Type::DOUBLE; }
|
||||
virtual uint ideal_reg() const { return Op_RegD; }
|
||||
virtual const Type* Value(PhaseGVN* phase) const;
|
||||
};
|
||||
|
||||
//------------------------------FmaFNode--------------------------------------
|
||||
// fused-multiply-add float
|
||||
class FmaFNode : public Node {
|
||||
public:
|
||||
FmaFNode(Node *c, Node *in1, Node *in2, Node *in3) : Node(c, in1, in2, in3) {}
|
||||
virtual int Opcode() const;
|
||||
const Type *bottom_type() const { return Type::FLOAT; }
|
||||
virtual uint ideal_reg() const { return Op_RegF; }
|
||||
virtual const Type* Value(PhaseGVN* phase) const;
|
||||
};
|
||||
|
||||
#endif // SHARE_VM_OPTO_MULNODE_HPP
|
||||
|
|
|
@ -659,6 +659,9 @@ public:
|
|||
product(bool, UseAES, false, \
|
||||
"Control whether AES instructions can be used on x86/x64") \
|
||||
\
|
||||
product(bool, UseFMA, false, \
|
||||
"Control whether FMA instructions can be used") \
|
||||
\
|
||||
product(bool, UseSHA, false, \
|
||||
"Control whether SHA instructions can be used " \
|
||||
"on SPARC, on ARM and on x86") \
|
||||
|
|
|
@ -2105,6 +2105,8 @@ typedef CompactHashtable<Symbol*, char> SymbolCompactHashTable;
|
|||
declare_c2_type(OverflowAddLNode, OverflowLNode) \
|
||||
declare_c2_type(OverflowSubLNode, OverflowLNode) \
|
||||
declare_c2_type(OverflowMulLNode, OverflowLNode) \
|
||||
declare_c2_type(FmaDNode, Node) \
|
||||
declare_c2_type(FmaFNode, Node) \
|
||||
\
|
||||
/*********************/ \
|
||||
/* Adapter Blob Entries */ \
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue