8154122: Intrinsify fused mac operations

Added FMA intrinsics on x86 Reviewed-by: kvn, aph, darcy
2025-08-27 14:54:52 +02:00 · 2016-08-26 12:17:50 -07:00 · 2016-08-26 12:17:50 -07:00 · d58e3e0324
commit d58e3e0324
parent 474c035379
42 changed files with 365 additions and 13 deletions
--- a/hotspot/src/cpu/aarch64/vm/c1_LIRGenerator_aarch64.cpp
+++ b/hotspot/src/cpu/aarch64/vm/c1_LIRGenerator_aarch64.cpp
@ -1032,6 +1032,10 @@ void LIRGenerator::do_update_CRC32C(Intrinsic* x) {
  Unimplemented();
 }

+void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) {
+  fatal("FMA intrinsic is not implemented on this platform");
+}
+
 void LIRGenerator::do_vectorizedMismatch(Intrinsic* x) {
  fatal("vectorizedMismatch intrinsic is not implemented on this platform");
 }
--- a/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp
+++ b/hotspot/src/cpu/aarch64/vm/vm_version_aarch64.cpp
@ -262,6 +262,11 @@ void VM_Version::get_processor_features() {
    FLAG_SET_DEFAULT(UseCRC32CIntrinsics, false);
  }

+  if (UseFMA) {
+    warning("FMA instructions are not available on this CPU");
+    FLAG_SET_DEFAULT(UseFMA, false);
+  }
+
  if (auxv & (HWCAP_SHA1 | HWCAP_SHA2)) {
    if (FLAG_IS_DEFAULT(UseSHA)) {
      FLAG_SET_DEFAULT(UseSHA, true);
--- a/hotspot/src/cpu/ppc/vm/c1_LIRGenerator_ppc.cpp
+++ b/hotspot/src/cpu/ppc/vm/c1_LIRGenerator_ppc.cpp
@ -1433,6 +1433,10 @@ void LIRGenerator::do_update_CRC32(Intrinsic* x) {
  }
 }

+void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) {
+  fatal("FMA intrinsic is not implemented on this platform");
+}
+
 void LIRGenerator::do_vectorizedMismatch(Intrinsic* x) {
  fatal("vectorizedMismatch intrinsic is not implemented on this platform");
 }
--- a/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp
+++ b/hotspot/src/cpu/ppc/vm/vm_version_ppc.cpp
@ -230,6 +230,11 @@ void VM_Version::initialize() {
    FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
  }

+  if (UseFMA) {
+    warning("FMA instructions are not available on this CPU");
+    FLAG_SET_DEFAULT(UseFMA, false);
+  }
+
  if (UseSHA) {
    warning("SHA instructions are not available on this CPU");
    FLAG_SET_DEFAULT(UseSHA, false);
--- a/hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp
@ -953,6 +953,10 @@ void LIRGenerator::do_update_CRC32C(Intrinsic* x) {
  }
 }

+void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) {
+  fatal("FMA intrinsic is not implemented on this platform");
+}
+
 void LIRGenerator::do_vectorizedMismatch(Intrinsic* x) {
  fatal("vectorizedMismatch intrinsic is not implemented on this platform");
 }
--- a/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp
+++ b/hotspot/src/cpu/sparc/vm/vm_version_sparc.cpp
@ -266,6 +266,11 @@ void VM_Version::initialize() {
    FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
  }

+  if (UseFMA) {
+    warning("FMA instructions are not available on this CPU");
+    FLAG_SET_DEFAULT(UseFMA, false);
+  }
+
  // SHA1, SHA256, and SHA512 instructions were added to SPARC T-series at different times
  if (has_sha1() || has_sha256() || has_sha512()) {
    if (UseVIS > 0) { // SHA intrinsics use VIS1 instructions
--- a/hotspot/src/cpu/x86/vm/abstractInterpreter_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/abstractInterpreter_x86.cpp
@ -172,7 +172,9 @@ bool AbstractInterpreter::can_be_compiled(methodHandle m) {
    case Interpreter::java_lang_math_log10   : // fall thru
    case Interpreter::java_lang_math_sqrt    : // fall thru
    case Interpreter::java_lang_math_pow     : // fall thru
-    case Interpreter::java_lang_math_exp     :
+    case Interpreter::java_lang_math_exp     : // fall thru
+    case Interpreter::java_lang_math_fmaD    : // fall thru
+    case Interpreter::java_lang_math_fmaF    :
      return false;
    default:
      return true;
--- a/hotspot/src/cpu/x86/vm/assembler_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.cpp
@ -4769,6 +4769,22 @@ void Assembler::vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src) {
  emit_int8((unsigned char)(0xC0 | encode));
 }

+void Assembler::vfmadd231sd(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
+  assert(VM_Version::supports_fma(), "");
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int8((unsigned char)0xB9);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
+void Assembler::vfmadd231ss(XMMRegister dst, XMMRegister src1, XMMRegister src2) {
+  assert(VM_Version::supports_fma(), "");
+  InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ false);
+  int encode = vex_prefix_and_encode(dst->encoding(), src1->encoding(), src2->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
+  emit_int8((unsigned char)0xB9);
+  emit_int8((unsigned char)(0xC0 | encode));
+}
+
 void Assembler::vmulsd(XMMRegister dst, XMMRegister nds, Address src) {
  assert(VM_Version::supports_avx(), "");
  InstructionMark im(this);
--- a/hotspot/src/cpu/x86/vm/assembler_x86.hpp
+++ b/hotspot/src/cpu/x86/vm/assembler_x86.hpp
@ -1860,6 +1860,8 @@ private:
  void vdivsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
  void vdivss(XMMRegister dst, XMMRegister nds, Address src);
  void vdivss(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vfmadd231sd(XMMRegister dst, XMMRegister nds, XMMRegister src);
+  void vfmadd231ss(XMMRegister dst, XMMRegister nds, XMMRegister src);
  void vmulsd(XMMRegister dst, XMMRegister nds, Address src);
  void vmulsd(XMMRegister dst, XMMRegister nds, XMMRegister src);
  void vmulss(XMMRegister dst, XMMRegister nds, Address src);
--- a/hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp
@ -1345,6 +1345,18 @@ void LIR_Assembler::emit_op3(LIR_Op3* op) {
                      op->result_opr(),
                      op->info());
      break;
+    case lir_fmad:
+      __ fmad(op->result_opr()->as_xmm_double_reg(),
+              op->in_opr1()->as_xmm_double_reg(),
+              op->in_opr2()->as_xmm_double_reg(),
+              op->in_opr3()->as_xmm_double_reg());
+      break;
+    case lir_fmaf:
+      __ fmaf(op->result_opr()->as_xmm_float_reg(),
+              op->in_opr1()->as_xmm_float_reg(),
+              op->in_opr2()->as_xmm_float_reg(),
+              op->in_opr3()->as_xmm_float_reg());
+      break;
    default:      ShouldNotReachHere(); break;
  }
 }
--- a/hotspot/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp
@ -806,6 +806,32 @@ void LIRGenerator::do_CompareAndSwap(Intrinsic* x, ValueType* type) {
  }
 }

+void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) {
+  assert(x->number_of_arguments() == 3, "wrong type");
+  assert(UseFMA, "Needs FMA instructions support.");
+  LIRItem value(x->argument_at(0), this);
+  LIRItem value1(x->argument_at(1), this);
+  LIRItem value2(x->argument_at(2), this);
+
+  value2.set_destroys_register();
+
+  value.load_item();
+  value1.load_item();
+  value2.load_item();
+
+  LIR_Opr calc_input = value.result();
+  LIR_Opr calc_input1 = value1.result();
+  LIR_Opr calc_input2 = value2.result();
+  LIR_Opr calc_result = rlock_result(x);
+
+  switch (x->id()) {
+  case vmIntrinsics::_fmaD:   __ fmad(calc_input, calc_input1, calc_input2, calc_result); break;
+  case vmIntrinsics::_fmaF:   __ fmaf(calc_input, calc_input1, calc_input2, calc_result); break;
+  default:                    ShouldNotReachHere();
+  }
+
+}
+

 void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
  assert(x->number_of_arguments() == 1 || (x->number_of_arguments() == 2 && x->id() == vmIntrinsics::_dpow), "wrong type");
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.cpp
@ -3147,6 +3147,24 @@ void MacroAssembler::fremr(Register tmp) {
  fpop();
 }

+// dst = c = a * b + c
+void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
+  Assembler::vfmadd231sd(c, a, b);
+  if (dst != c) {
+    movdbl(dst, c);
+  }
+}
+
+// dst = c = a * b + c
+void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) {
+  Assembler::vfmadd231ss(c, a, b);
+  if (dst != c) {
+    movflt(dst, c);
+  }
+}
+
+
+

 void MacroAssembler::incrementl(AddressLiteral dst) {
  if (reachable(dst)) {
--- a/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp
+++ b/hotspot/src/cpu/x86/vm/macroAssembler_x86.hpp
@ -449,6 +449,10 @@ class MacroAssembler: public Assembler {
  // tmp is a temporary register, if none is available use noreg
  void fremr(Register tmp);

+  // dst = c = a * b + c
+  void fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
+  void fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c);
+

  // same as fcmp2int, but using SSE2
  void cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less);
--- a/hotspot/src/cpu/x86/vm/templateInterpreterGenerator_x86_32.cpp
+++ b/hotspot/src/cpu/x86/vm/templateInterpreterGenerator_x86_32.cpp
@ -341,6 +341,27 @@ address TemplateInterpreterGenerator::generate_math_entry(AbstractInterpreter::M
  //        [ lo(arg) ]
  //        [ hi(arg) ]
  //
+  if (kind == Interpreter::java_lang_math_fmaD) {
+    __ movdbl(xmm2, Address(rsp, 5 * wordSize));
+    __ movdbl(xmm1, Address(rsp, 3 * wordSize));
+    __ movdbl(xmm0, Address(rsp, 1 * wordSize));
+    __ fmad(xmm0, xmm1, xmm2, xmm0);
+    __ pop(rdi);                               // get return address
+    __ mov(rsp, rsi);                          // set sp to sender sp
+    __ jmp(rdi);
+
+    return entry_point;
+  } else if (kind == Interpreter::java_lang_math_fmaF) {
+    __ movflt(xmm2, Address(rsp, 3 * wordSize));
+    __ movflt(xmm1, Address(rsp, 2 * wordSize));
+    __ movflt(xmm0, Address(rsp, 1 * wordSize));
+    __ fmaf(xmm0, xmm1, xmm2, xmm0);
+    __ pop(rdi);                               // get return address
+    __ mov(rsp, rsi);                          // set sp to sender sp
+    __ jmp(rdi);
+
+    return entry_point;
+ }

  __ fld_d(Address(rsp, 1*wordSize));
  switch (kind) {
--- a/hotspot/src/cpu/x86/vm/templateInterpreterGenerator_x86_64.cpp
+++ b/hotspot/src/cpu/x86/vm/templateInterpreterGenerator_x86_64.cpp
@ -369,8 +369,17 @@ address TemplateInterpreterGenerator::generate_math_entry(AbstractInterpreter::M
  //        [ hi(arg) ]
  //

-
-  if (kind == Interpreter::java_lang_math_sqrt) {
+  if (kind == Interpreter::java_lang_math_fmaD) {
+    __ movdbl(xmm0, Address(rsp, wordSize));
+    __ movdbl(xmm1, Address(rsp, 3 * wordSize));
+    __ movdbl(xmm2, Address(rsp, 5 * wordSize));
+    __ fmad(xmm0, xmm1, xmm2, xmm0);
+  } else if (kind == Interpreter::java_lang_math_fmaF) {
+    __ movflt(xmm0, Address(rsp, wordSize));
+    __ movflt(xmm1, Address(rsp, 2 * wordSize));
+    __ movflt(xmm2, Address(rsp, 3 * wordSize));
+    __ fmaf(xmm0, xmm1, xmm2, xmm0);
+  } else if (kind == Interpreter::java_lang_math_sqrt) {
    __ sqrtsd(xmm0, Address(rsp, wordSize));
  } else if (kind == Interpreter::java_lang_math_exp) {
    __ movdbl(xmm0, Address(rsp, wordSize));
--- a/hotspot/src/cpu/x86/vm/vmStructs_x86.hpp
+++ b/hotspot/src/cpu/x86/vm/vmStructs_x86.hpp
@ -73,6 +73,7 @@
 #define VM_LONG_CONSTANTS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant) \
  declare_preprocessor_constant("VM_Version::CPU_AVX512BW", CPU_AVX512BW) \
  declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL) \
-  declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA)
+  declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA)           \
+  declare_preprocessor_constant("VM_Version::CPU_FMA", CPU_FMA)

 #endif // CPU_X86_VM_VMSTRUCTS_X86_HPP
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.cpp
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.cpp
@ -578,7 +578,7 @@ void VM_Version::get_processor_features() {
  }

  char buf[256];
-  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
+  jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
               cores_per_cpu(), threads_per_core(),
               cpu_family(), _model, _stepping,
               (supports_cmov() ? ", cmov" : ""),
@ -610,7 +610,8 @@ void VM_Version::get_processor_features() {
               (supports_bmi2() ? ", bmi2" : ""),
               (supports_adx() ? ", adx" : ""),
               (supports_evex() ? ", evex" : ""),
-               (supports_sha() ? ", sha" : ""));
+               (supports_sha() ? ", sha" : ""),
+               (supports_fma() ? ", fma" : ""));
  _features_string = os::strdup(buf);

  // UseSSE is set to the smaller of what hardware supports and what
@ -732,6 +733,15 @@ void VM_Version::get_processor_features() {
    FLAG_SET_DEFAULT(UseGHASHIntrinsics, false);
  }

+  if (supports_fma() && UseSSE >= 2) {
+    if (FLAG_IS_DEFAULT(UseFMA)) {
+      UseFMA = true;
+    }
+  } else if (UseFMA) {
+    warning("FMA instructions are not available on this CPU");
+    FLAG_SET_DEFAULT(UseFMA, false);
+  }
+
  if (supports_sha() LP64_ONLY(|| supports_avx2() && supports_bmi2())) {
    if (FLAG_IS_DEFAULT(UseSHA)) {
      UseSHA = true;
@ -773,7 +783,6 @@ void VM_Version::get_processor_features() {
    FLAG_SET_DEFAULT(UseAdler32Intrinsics, false);
  }

-  // Adjust RTM (Restricted Transactional Memory) flags
  if (!supports_rtm() && UseRTMLocking) {
    // Can't continue because UseRTMLocking affects UseBiasedLocking flag
    // setting during arguments processing. See use_biased_locking().
--- a/hotspot/src/cpu/x86/vm/vm_version_x86.hpp
+++ b/hotspot/src/cpu/x86/vm/vm_version_x86.hpp
@ -74,7 +74,8 @@ class VM_Version : public Abstract_VM_Version {
                        : 1,
               ssse3    : 1,
               cid      : 1,
-                        : 2,
+                        : 1,
+               fma      : 1,
               cmpxchg16: 1,
                        : 4,
               dca      : 1,
@ -289,6 +290,7 @@ protected:
 #define CPU_AVX512BW ((uint64_t)UCONST64(0x100000000)) // enums are limited to 31 bit
 #define CPU_AVX512VL ((uint64_t)UCONST64(0x200000000)) // EVEX instructions with smaller vector length
 #define CPU_SHA ((uint64_t)UCONST64(0x400000000))      // SHA instructions
+#define CPU_FMA ((uint64_t)UCONST64(0x800000000))      // FMA instructions

  enum Extended_Family {
    // AMD
@ -522,6 +524,8 @@ protected:
        result |= CPU_SHA;
      if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
        result |= CPU_LZCNT;
+      if (_cpuid_info.std_cpuid1_ecx.bits.fma != 0)
+        result |= CPU_FMA;
      // for Intel, ecx.bits.misalignsse bit (bit 8) indicates support for prefetchw
      if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
        result |= CPU_3DNOW_PREFETCH;
@ -726,6 +730,7 @@ public:
  static bool supports_avx256only() { return (supports_avx2() && !supports_evex()); }
  static bool supports_avxonly()    { return ((supports_avx2() || supports_avx()) && !supports_evex()); }
  static bool supports_sha()        { return (_features & CPU_SHA) != 0; }
+  static bool supports_fma()        { return (_features & CPU_FMA) != 0; }
  // Intel features
  static bool is_intel_family_core() { return is_intel() &&
                                       extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }
--- a/hotspot/src/cpu/x86/vm/x86.ad
+++ b/hotspot/src/cpu/x86/vm/x86.ad
@ -3113,6 +3113,30 @@ instruct onspinwait() %{
  ins_pipe(pipe_slow);
 %}

+// a * b + c
+instruct fmaD_reg(regD a, regD b, regD c) %{
+  predicate(UseFMA);
+  match(Set c (FmaD  c (Binary a b)));
+  format %{ "fmasd $a,$b,$c\t# $c = $a * $b + $c" %}
+  ins_cost(150);
+  ins_encode %{
+    __ fmad($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
+// a * b + c
+instruct fmaF_reg(regF a, regF b, regF c) %{
+  predicate(UseFMA);
+  match(Set c (FmaF  c (Binary a b)));
+  format %{ "fmass $a,$b,$c\t# $c = $a * $b + $c" %}
+  ins_cost(150);
+  ins_encode %{
+    __ fmaf($c$$XMMRegister, $a$$XMMRegister, $b$$XMMRegister, $c$$XMMRegister);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 // ====================VECTOR INSTRUCTIONS=====================================

 // Load vectors (4 bytes long)
--- a/hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java
+++ b/hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.amd64/src/jdk/vm/ci/amd64/AMD64.java
@ -205,7 +205,8 @@ public class AMD64 extends Architecture {
        AVX512CD,
        AVX512BW,
        AVX512VL,
-        SHA
+        SHA,
+        FMA
    }

    private final EnumSet<CPUFeature> features;
--- a/hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot.amd64/src/jdk/vm/ci/hotspot/amd64/AMD64HotSpotJVMCIBackendFactory.java
+++ b/hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot.amd64/src/jdk/vm/ci/hotspot/amd64/AMD64HotSpotJVMCIBackendFactory.java
@ -124,6 +124,9 @@ public class AMD64HotSpotJVMCIBackendFactory implements HotSpotJVMCIBackendFacto
        if ((config.vmVersionFeatures & config.amd64SHA) != 0) {
            features.add(AMD64.CPUFeature.SHA);
        }
+        if ((config.vmVersionFeatures & config.amd64FMA) != 0) {
+            features.add(AMD64.CPUFeature.FMA);
+        }
        return features;
    }

--- a/hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot.amd64/src/jdk/vm/ci/hotspot/amd64/AMD64HotSpotVMConfig.java
+++ b/hotspot/src/jdk.vm.ci/share/classes/jdk.vm.ci.hotspot.amd64/src/jdk/vm/ci/hotspot/amd64/AMD64HotSpotVMConfig.java
@ -78,4 +78,5 @@ class AMD64HotSpotVMConfig extends HotSpotVMConfigAccess {
    final long amd64AVX512BW = getConstant("VM_Version::CPU_AVX512BW", Long.class);
    final long amd64AVX512VL = getConstant("VM_Version::CPU_AVX512VL", Long.class);
    final long amd64SHA = getConstant("VM_Version::CPU_SHA", Long.class);
+    final long amd64FMA = getConstant("VM_Version::CPU_FMA", Long.class);
 }
--- a/hotspot/src/share/vm/adlc/formssel.cpp
+++ b/hotspot/src/share/vm/adlc/formssel.cpp
@ -4038,6 +4038,8 @@ int MatchRule::is_expensive() const {
        strcmp(opType,"EncodeP")==0 ||
        strcmp(opType,"EncodePKlass")==0 ||
        strcmp(opType,"DecodeNKlass")==0 ||
+        strcmp(opType,"FmaD") == 0 ||
+        strcmp(opType,"FmaF") == 0 ||
        strcmp(opType,"RoundDouble")==0 ||
        strcmp(opType,"RoundFloat")==0 ||
        strcmp(opType,"ReverseBytesI")==0 ||
--- a/hotspot/src/share/vm/c1/c1_Compiler.cpp
+++ b/hotspot/src/share/vm/c1/c1_Compiler.cpp
@ -162,6 +162,8 @@ bool Compiler::is_intrinsic_supported(const methodHandle& method) {
  case vmIntrinsics::_dlog10:
  case vmIntrinsics::_dexp:
  case vmIntrinsics::_dpow:
+  case vmIntrinsics::_fmaD:
+  case vmIntrinsics::_fmaF:
  case vmIntrinsics::_getObject:
  case vmIntrinsics::_getBoolean:
  case vmIntrinsics::_getByte:
--- a/hotspot/src/share/vm/c1/c1_LIR.cpp
+++ b/hotspot/src/share/vm/c1/c1_LIR.cpp
@ -666,7 +666,9 @@ void LIR_OpVisitState::visit(LIR_Op* op) {

 // LIR_Op3
    case lir_idiv:
-    case lir_irem: {
+    case lir_irem:
+    case lir_fmad:
+    case lir_fmaf: {
      assert(op->as_Op3() != NULL, "must be");
      LIR_Op3* op3= (LIR_Op3*)op;

@ -1663,6 +1665,8 @@ const char * LIR_Op::name() const {
     // LIR_Op3
     case lir_idiv:                  s = "idiv";          break;
     case lir_irem:                  s = "irem";          break;
+     case lir_fmad:                  s = "fmad";          break;
+     case lir_fmaf:                  s = "fmaf";          break;
     // LIR_OpJavaCall
     case lir_static_call:           s = "static";        break;
     case lir_optvirtual_call:       s = "optvirtual";    break;
--- a/hotspot/src/share/vm/c1/c1_LIR.hpp
+++ b/hotspot/src/share/vm/c1/c1_LIR.hpp
@ -956,6 +956,8 @@ enum LIR_Code {
  , begin_op3
      , lir_idiv
      , lir_irem
+      , lir_fmad
+      , lir_fmaf
  , end_op3
  , begin_opJavaCall
      , lir_static_call
@ -2149,6 +2151,8 @@ class LIR_List: public CompilationResourceObj {

  void abs (LIR_Opr from, LIR_Opr to, LIR_Opr tmp)                { append(new LIR_Op2(lir_abs , from, tmp, to)); }
  void sqrt(LIR_Opr from, LIR_Opr to, LIR_Opr tmp)                { append(new LIR_Op2(lir_sqrt, from, tmp, to)); }
+  void fmad(LIR_Opr from, LIR_Opr from1, LIR_Opr from2, LIR_Opr to) { append(new LIR_Op3(lir_fmad, from, from1, from2, to)); }
+  void fmaf(LIR_Opr from, LIR_Opr from1, LIR_Opr from2, LIR_Opr to) { append(new LIR_Op3(lir_fmaf, from, from1, from2, to)); }
  void log10 (LIR_Opr from, LIR_Opr to, LIR_Opr tmp)              { append(new LIR_Op2(lir_log10, from, LIR_OprFact::illegalOpr, to, tmp)); }
  void tan (LIR_Opr from, LIR_Opr to, LIR_Opr tmp1, LIR_Opr tmp2) { append(new LIR_Op2(lir_tan , from, tmp1, to, tmp2)); }

--- a/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp
+++ b/hotspot/src/share/vm/c1/c1_LIRGenerator.cpp
@ -3147,6 +3147,9 @@ void LIRGenerator::do_Intrinsic(Intrinsic* x) {
  case vmIntrinsics::_dpow :          do_MathIntrinsic(x); break;
  case vmIntrinsics::_arraycopy:      do_ArrayCopy(x);     break;

+  case vmIntrinsics::_fmaD:           do_FmaIntrinsic(x); break;
+  case vmIntrinsics::_fmaF:           do_FmaIntrinsic(x); break;
+
  // java.nio.Buffer.checkIndex
  case vmIntrinsics::_checkIndex:     do_NIOCheckIndex(x); break;

--- a/hotspot/src/share/vm/c1/c1_LIRGenerator.hpp
+++ b/hotspot/src/share/vm/c1/c1_LIRGenerator.hpp
@ -245,6 +245,7 @@ class LIRGenerator: public InstructionVisitor, public BlockClosure {
  void do_isPrimitive(Intrinsic* x);
  void do_getClass(Intrinsic* x);
  void do_currentThread(Intrinsic* x);
+  void do_FmaIntrinsic(Intrinsic* x);
  void do_MathIntrinsic(Intrinsic* x);
  void do_LibmIntrinsic(Intrinsic* x);
  void do_ArrayCopy(Intrinsic* x);
--- a/hotspot/src/share/vm/classfile/vmSymbols.cpp
+++ b/hotspot/src/share/vm/classfile/vmSymbols.cpp
@ -355,6 +355,8 @@ bool vmIntrinsics::preserves_state(vmIntrinsics::ID id) {
  case vmIntrinsics::_updateBytesCRC32:
  case vmIntrinsics::_updateByteBufferCRC32:
  case vmIntrinsics::_vectorizedMismatch:
+  case vmIntrinsics::_fmaD:
+  case vmIntrinsics::_fmaF:
    return true;
  default:
    return false;
@ -387,6 +389,8 @@ bool vmIntrinsics::can_trap(vmIntrinsics::ID id) {
  case vmIntrinsics::_updateBytesCRC32:
  case vmIntrinsics::_updateByteBufferCRC32:
  case vmIntrinsics::_vectorizedMismatch:
+  case vmIntrinsics::_fmaD:
+  case vmIntrinsics::_fmaF:
    return false;
  default:
    return true;
@ -535,6 +539,10 @@ bool vmIntrinsics::is_disabled_by_flags(vmIntrinsics::ID id) {
  case vmIntrinsics::_doubleToLongBits:
    if (!InlineMathNatives) return true;
    break;
+  case vmIntrinsics::_fmaD:
+  case vmIntrinsics::_fmaF:
+    if (!InlineMathNatives || !UseFMA) return true;
+    break;
  case vmIntrinsics::_arraycopy:
    if (!InlineArrayCopy) return true;
    break;
--- a/hotspot/src/share/vm/classfile/vmSymbols.hpp
+++ b/hotspot/src/share/vm/classfile/vmSymbols.hpp
@ -755,8 +755,10 @@
  do_class(java_lang_Math,                "java/lang/Math")                                                             \
  do_class(java_lang_StrictMath,          "java/lang/StrictMath")                                                       \
  do_signature(double2_double_signature,  "(DD)D")                                                                      \
+  do_signature(double3_double_signature,  "(DDD)D")                                                                     \
+  do_signature(float3_float_signature,    "(FFF)F")                                                                     \
  do_signature(int2_int_signature,        "(II)I")                                                                      \
-  do_signature(long2_long_signature,      "(JJ)J")                                                                         \
+  do_signature(long2_long_signature,      "(JJ)J")                                                                      \
                                                                                                                        \
  /* here are the math names, all together: */                                                                          \
  do_name(abs_name,"abs")       do_name(sin_name,"sin")         do_name(cos_name,"cos")                                 \
@ -770,6 +772,7 @@
  do_name(multiplyExact_name,"multiplyExact")                                                                           \
  do_name(negateExact_name,"negateExact")                                                                               \
  do_name(subtractExact_name,"subtractExact")                                                                           \
+  do_name(fma_name, "fma")                                                                                              \
                                                                                                                        \
  do_intrinsic(_dabs,                     java_lang_Math,         abs_name,   double_double_signature,           F_S)   \
  do_intrinsic(_dsin,                     java_lang_Math,         sin_name,   double_double_signature,           F_S)   \
@ -795,6 +798,8 @@
  do_intrinsic(_negateExactL,             java_lang_Math,         negateExact_name, long_long_signature,         F_S)   \
  do_intrinsic(_subtractExactI,           java_lang_Math,         subtractExact_name, int2_int_signature,        F_S)   \
  do_intrinsic(_subtractExactL,           java_lang_Math,         subtractExact_name, long2_long_signature,      F_S)   \
+  do_intrinsic(_fmaD,                     java_lang_Math,         fma_name,           double3_double_signature,  F_S)   \
+  do_intrinsic(_fmaF,                     java_lang_Math,         fma_name,           float3_float_signature,    F_S)   \
                                                                                                                        \
  do_intrinsic(_floatToRawIntBits,        java_lang_Float,        floatToRawIntBits_name,   float_int_signature, F_S)   \
   do_name(     floatToRawIntBits_name,                          "floatToRawIntBits")                                   \
--- a/hotspot/src/share/vm/interpreter/abstractInterpreter.cpp
+++ b/hotspot/src/share/vm/interpreter/abstractInterpreter.cpp
@ -194,6 +194,13 @@ AbstractInterpreter::MethodKind AbstractInterpreter::method_kind(methodHandle m)
                                return java_lang_ref_reference_get;
  }

+  if (UseFMA) {
+    switch (m->intrinsic_id()) {
+      case vmIntrinsics::_fmaD: return java_lang_math_fmaD;
+      case vmIntrinsics::_fmaF: return java_lang_math_fmaF;
+    }
+  }
+
  // Accessor method?
  if (m->is_getter()) {
    // TODO: We should have used ::is_accessor above, but fast accessors in Zero expect only getters.
@ -281,6 +288,8 @@ void AbstractInterpreter::print_method_kind(MethodKind kind) {
    case java_lang_math_sqrt    : tty->print("java_lang_math_sqrt"    ); break;
    case java_lang_math_log     : tty->print("java_lang_math_log"     ); break;
    case java_lang_math_log10   : tty->print("java_lang_math_log10"   ); break;
+    case java_lang_math_fmaD    : tty->print("java_lang_math_fmaD"    ); break;
+    case java_lang_math_fmaF    : tty->print("java_lang_math_fmaF"    ); break;
    case java_util_zip_CRC32_update           : tty->print("java_util_zip_CRC32_update"); break;
    case java_util_zip_CRC32_updateBytes      : tty->print("java_util_zip_CRC32_updateBytes"); break;
    case java_util_zip_CRC32_updateByteBuffer : tty->print("java_util_zip_CRC32_updateByteBuffer"); break;
--- a/hotspot/src/share/vm/interpreter/abstractInterpreter.hpp
+++ b/hotspot/src/share/vm/interpreter/abstractInterpreter.hpp
@ -76,6 +76,8 @@ class AbstractInterpreter: AllStatic {
    java_lang_math_log10,                                       // implementation of java.lang.Math.log10 (x)
    java_lang_math_pow,                                         // implementation of java.lang.Math.pow   (x,y)
    java_lang_math_exp,                                         // implementation of java.lang.Math.exp   (x)
+    java_lang_math_fmaF,                                        // implementation of java.lang.Math.fma   (x, y, z)
+    java_lang_math_fmaD,                                        // implementation of java.lang.Math.fma   (x, y, z)
    java_lang_ref_reference_get,                                // implementation of java.lang.ref.Reference.get()
    java_util_zip_CRC32_update,                                 // implementation of java.util.zip.CRC32.update()
    java_util_zip_CRC32_updateBytes,                            // implementation of java.util.zip.CRC32.updateBytes()
--- a/hotspot/src/share/vm/interpreter/templateInterpreterGenerator.cpp
+++ b/hotspot/src/share/vm/interpreter/templateInterpreterGenerator.cpp
@ -239,6 +239,10 @@ void TemplateInterpreterGenerator::generate_all() {
      method_entry(java_lang_math_log10)
      method_entry(java_lang_math_exp  )
      method_entry(java_lang_math_pow  )
+      if (UseFMA) {
+        method_entry(java_lang_math_fmaF)
+        method_entry(java_lang_math_fmaD)
+      }
      method_entry(java_lang_ref_reference_get)

      AbstractInterpreter::initialize_method_handle_entries();
@ -445,7 +449,9 @@ address TemplateInterpreterGenerator::generate_method_entry(
  case Interpreter::java_lang_math_log10   : // fall thru
  case Interpreter::java_lang_math_sqrt    : // fall thru
  case Interpreter::java_lang_math_pow     : // fall thru
-  case Interpreter::java_lang_math_exp     : entry_point = generate_math_entry(kind);      break;
+  case Interpreter::java_lang_math_exp     : // fall thru
+  case Interpreter::java_lang_math_fmaD    : // fall thru
+  case Interpreter::java_lang_math_fmaF     : entry_point = generate_math_entry(kind);      break;
  case Interpreter::java_lang_ref_reference_get
                                           : entry_point = generate_Reference_get_entry(); break;
  case Interpreter::java_util_zip_CRC32_update
--- a/hotspot/src/share/vm/jvmci/vmStructs_jvmci.cpp
+++ b/hotspot/src/share/vm/jvmci/vmStructs_jvmci.cpp
@ -660,7 +660,8 @@
 #define VM_LONG_CONSTANTS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant) \
  declare_preprocessor_constant("VM_Version::CPU_AVX512BW", CPU_AVX512BW) \
  declare_preprocessor_constant("VM_Version::CPU_AVX512VL", CPU_AVX512VL) \
-  declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA)
+  declare_preprocessor_constant("VM_Version::CPU_SHA", CPU_SHA)           \
+  declare_preprocessor_constant("VM_Version::CPU_FMA", CPU_FMA)

 #endif

--- a/hotspot/src/share/vm/opto/c2compiler.cpp
+++ b/hotspot/src/share/vm/opto/c2compiler.cpp
@ -416,6 +416,12 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method, bool is_virt
  case vmIntrinsics::_onSpinWait:
    if (!Matcher::match_rule_supported(Op_OnSpinWait)) return false;
    break;
+  case vmIntrinsics::_fmaD:
+    if (!UseFMA || !Matcher::match_rule_supported(Op_FmaD)) return false;
+    break;
+  case vmIntrinsics::_fmaF:
+    if (!UseFMA || !Matcher::match_rule_supported(Op_FmaF)) return false;
+    break;
  case vmIntrinsics::_hashCode:
  case vmIntrinsics::_identityHashCode:
  case vmIntrinsics::_getClass:
--- a/hotspot/src/share/vm/opto/classes.hpp
+++ b/hotspot/src/share/vm/opto/classes.hpp
@ -151,6 +151,8 @@ macro(EncodeP)
 macro(EncodePKlass)
 macro(FastLock)
 macro(FastUnlock)
+macro(FmaD)
+macro(FmaF)
 macro(Goto)
 macro(Halt)
 macro(HasNegatives)
--- a/hotspot/src/share/vm/opto/library_call.cpp
+++ b/hotspot/src/share/vm/opto/library_call.cpp
@ -317,6 +317,7 @@ class LibraryCallKit : public GraphKit {
  bool inline_montgomeryMultiply();
  bool inline_montgomerySquare();
  bool inline_vectorizedMismatch();
+  bool inline_fma(vmIntrinsics::ID id);

  bool inline_profileBoolean();
  bool inline_isCompileConstant();
@ -825,6 +826,10 @@ bool LibraryCallKit::try_to_inline(int predicate) {
  case vmIntrinsics::_hasNegatives:
    return inline_hasNegatives();

+  case vmIntrinsics::_fmaD:
+  case vmIntrinsics::_fmaF:
+    return inline_fma(intrinsic_id());
+
  default:
    // If you get here, it may be that someone has added a new intrinsic
    // to the list in vmSymbols.hpp without implementing it here.
@ -6657,6 +6662,35 @@ Node* LibraryCallKit::inline_digestBase_implCompressMB_predicate(int predicate)
  return instof_false;  // even if it is NULL
 }

+//-------------inline_fma-----------------------------------
+bool LibraryCallKit::inline_fma(vmIntrinsics::ID id) {
+  Node *a = NULL;
+  Node *b = NULL;
+  Node *c = NULL;
+  Node* result = NULL;
+  switch (id) {
+  case vmIntrinsics::_fmaD:
+    assert(callee()->signature()->size() == 6, "fma has 3 parameters of size 2 each.");
+    // no receiver since it is static method
+    a = round_double_node(argument(0));
+    b = round_double_node(argument(2));
+    c = round_double_node(argument(4));
+    result = _gvn.transform(new FmaDNode(control(), a, b, c));
+    break;
+  case vmIntrinsics::_fmaF:
+    assert(callee()->signature()->size() == 3, "fma has 3 parameters of size 1 each.");
+    a = argument(0);
+    b = argument(1);
+    c = argument(2);
+    result = _gvn.transform(new FmaFNode(control(), a, b, c));
+    break;
+  default:
+    fatal_unexpected_iid(id);  break;
+  }
+  set_result(result);
+  return true;
+}
+
 bool LibraryCallKit::inline_profileBoolean() {
  Node* counts = argument(1);
  const TypeAryPtr* ary = NULL;
--- a/hotspot/src/share/vm/opto/matcher.cpp
+++ b/hotspot/src/share/vm/opto/matcher.cpp
@ -2117,6 +2117,8 @@ void Matcher::find_shared( Node *n ) {
      case Op_StrInflatedCopy:
      case Op_StrCompressedCopy:
      case Op_EncodeISOArray:
+      case Op_FmaD:
+      case Op_FmaF:
        set_shared(n); // Force result into register (it will be anyways)
        break;
      case Op_ConP: {  // Convert pointers above the centerline to NUL
@ -2305,6 +2307,15 @@ void Matcher::find_shared( Node *n ) {
        n->del_req(4);
        break;
      }
+      case Op_FmaD:
+      case Op_FmaF: {
+        // Restructure into a binary tree for Matching.
+        Node* pair = new BinaryNode(n->in(1), n->in(2));
+        n->set_req(2, pair);
+        n->set_req(1, n->in(3));
+        n->del_req(3);
+        break;
+      }
      default:
        break;
      }
--- a/hotspot/src/share/vm/opto/mulnode.cpp
+++ b/hotspot/src/share/vm/opto/mulnode.cpp
@ -1343,3 +1343,47 @@ const Type* URShiftLNode::Value(PhaseGVN* phase) const {

  return TypeLong::LONG;                // Give up
 }
+
+//=============================================================================
+//------------------------------Value------------------------------------------
+const Type* FmaDNode::Value(PhaseGVN* phase) const {
+  const Type *t1 = phase->type(in(1));
+  if (t1 == Type::TOP) return Type::TOP;
+  if (t1->base() != Type::DoubleCon) return Type::DOUBLE;
+  const Type *t2 = phase->type(in(2));
+  if (t2 == Type::TOP) return Type::TOP;
+  if (t2->base() != Type::DoubleCon) return Type::DOUBLE;
+  const Type *t3 = phase->type(in(3));
+  if (t3 == Type::TOP) return Type::TOP;
+  if (t3->base() != Type::DoubleCon) return Type::DOUBLE;
+#ifndef __STDC_IEC_559__
+  return Type::DOUBLE;
+#else
+  double d1 = t1->getd();
+  double d2 = t2->getd();
+  double d3 = t3->getd();
+  return TypeD::make(fma(d1, d2, d3));
+#endif
+}
+
+//=============================================================================
+//------------------------------Value------------------------------------------
+const Type* FmaFNode::Value(PhaseGVN* phase) const {
+  const Type *t1 = phase->type(in(1));
+  if (t1 == Type::TOP) return Type::TOP;
+  if (t1->base() != Type::FloatCon) return Type::FLOAT;
+  const Type *t2 = phase->type(in(2));
+  if (t2 == Type::TOP) return Type::TOP;
+  if (t2->base() != Type::FloatCon) return Type::FLOAT;
+  const Type *t3 = phase->type(in(3));
+  if (t3 == Type::TOP) return Type::TOP;
+  if (t3->base() != Type::FloatCon) return Type::FLOAT;
+#ifndef __STDC_IEC_559__
+  return Type::FLOAT;
+#else
+  float f1 = t1->getf();
+  float f2 = t2->getf();
+  float f3 = t3->getf();
+  return TypeF::make(fma(f1, f2, f3));
+#endif
+}
--- a/hotspot/src/share/vm/opto/mulnode.hpp
+++ b/hotspot/src/share/vm/opto/mulnode.hpp
@ -263,4 +263,26 @@ public:
  virtual uint ideal_reg() const { return Op_RegL; }
 };

+//------------------------------FmaDNode--------------------------------------
+// fused-multiply-add double
+class FmaDNode : public Node {
+public:
+  FmaDNode(Node *c, Node *in1, Node *in2, Node *in3) : Node(c, in1, in2, in3) {}
+  virtual int Opcode() const;
+  const Type *bottom_type() const { return Type::DOUBLE; }
+  virtual uint ideal_reg() const { return Op_RegD; }
+  virtual const Type* Value(PhaseGVN* phase) const;
+};
+
+//------------------------------FmaFNode--------------------------------------
+// fused-multiply-add float
+class FmaFNode : public Node {
+public:
+  FmaFNode(Node *c, Node *in1, Node *in2, Node *in3) : Node(c, in1, in2, in3) {}
+  virtual int Opcode() const;
+  const Type *bottom_type() const { return Type::FLOAT; }
+  virtual uint ideal_reg() const { return Op_RegF; }
+  virtual const Type* Value(PhaseGVN* phase) const;
+};
+
 #endif // SHARE_VM_OPTO_MULNODE_HPP
--- a/hotspot/src/share/vm/runtime/globals.hpp
+++ b/hotspot/src/share/vm/runtime/globals.hpp
@ -659,6 +659,9 @@ public:
  product(bool, UseAES, false,                                              \
          "Control whether AES instructions can be used on x86/x64")        \
                                                                            \
+  product(bool, UseFMA, false,                                              \
+          "Control whether FMA instructions can be used")                   \
+                                                                            \
  product(bool, UseSHA, false,                                              \
          "Control whether SHA instructions can be used "                   \
          "on SPARC, on ARM and on x86")                                    \
--- a/hotspot/src/share/vm/runtime/vmStructs.cpp
+++ b/hotspot/src/share/vm/runtime/vmStructs.cpp
@ -2105,6 +2105,8 @@ typedef CompactHashtable<Symbol*, char>       SymbolCompactHashTable;
  declare_c2_type(OverflowAddLNode, OverflowLNode)                        \
  declare_c2_type(OverflowSubLNode, OverflowLNode)                        \
  declare_c2_type(OverflowMulLNode, OverflowLNode)                        \
+  declare_c2_type(FmaDNode, Node)                                         \
+  declare_c2_type(FmaFNode, Node)                                         \
                                                                          \
  /*********************/                                                 \
  /* Adapter Blob Entries */                                              \