7133857: exp() and pow() should use the x87 ISA on x86

Use x87 instructions to implement exp() and pow() in interpreter/c1/c2.

Reviewed-by: kvn, never, twisti
Roland Westrelin 2012-05-15 10:10:23 +02:00
parent eb4a860bc3
commit b305cf722e
26 changed files with 783 additions and 279 deletions


@@ -3578,6 +3578,21 @@ void Assembler::fyl2x() {
emit_byte(0xF1);
}
void Assembler::frndint() {
// FRNDINT: round ST(0) to an integer using the current rounding mode
emit_byte(0xD9);
emit_byte(0xFC);
}
void Assembler::f2xm1() {
// F2XM1: compute 2^ST(0) - 1, valid only for -1 <= ST(0) <= 1
emit_byte(0xD9);
emit_byte(0xF0);
}
void Assembler::fldl2e() {
// FLDL2E: push the constant log2(e) onto the FPU stack
emit_byte(0xD9);
emit_byte(0xEA);
}
// SSE SIMD prefix byte values corresponding to VexSimdPrefix encoding.
static int simd_pre[4] = { 0, 0x66, 0xF3, 0xF2 };
// SSE opcode second byte values (first is 0x0F) corresponding to VexOpcode encoding.
@@ -6868,6 +6883,242 @@ void MacroAssembler::fldcw(AddressLiteral src) {
Assembler::fldcw(as_Address(src));
}
void MacroAssembler::pow_exp_core_encoding() {
// kills rax, rcx, rdx
subptr(rsp,sizeof(jdouble));
// computes 2^X. Stack: X ...
// f2xm1 computes 2^X-1 but only operates on -1<=X<=1. Get int(X) and
// keep it on the thread's stack to compute 2^int(X) later
// then compute 2^(X-int(X)) as ((2^(X-int(X))-1)+1).
// final result is obtained with: 2^X = 2^int(X) * 2^(X-int(X))
fld_s(0); // Stack: X X ...
frndint(); // Stack: int(X) X ...
fsuba(1); // Stack: int(X) X-int(X) ...
fistp_s(Address(rsp,0)); // move int(X) as integer to thread's stack. Stack: X-int(X) ...
f2xm1(); // Stack: 2^(X-int(X))-1 ...
fld1(); // Stack: 1 2^(X-int(X))-1 ...
faddp(1); // Stack: 2^(X-int(X))
// computes 2^(int(X)): add the exponent bias (1023) to int(X), then
// shift int(X)+1023 into the exponent position.
// The exponent is limited to 11 bits: if int(X)+1023 does not fit in
// 11 bits, set the result to NaN. 0x000 and 0x7FF are reserved
// exponent values, so detect them and set the result to NaN as well.
movl(rax,Address(rsp,0));
movl(rcx, -2048); // 11 bit mask and valid NaN binary encoding
addl(rax, 1023);
movl(rdx,rax);
shll(rax,20);
// Check that 0 < int(X)+1023 < 2047. Otherwise set rax to NaN.
addl(rdx,1);
// Check that 1 < int(X)+1023+1 < 2048
// in 3 steps:
// 1- (int(X)+1023+1)&-2048 == 0 => 0 <= int(X)+1023+1 < 2048
// 2- (int(X)+1023+1)&-2048 != 0
// 3- (int(X)+1023+1)&-2048 != 1
// Do 2- first because addl just updated the flags.
cmov32(Assembler::equal,rax,rcx);
cmpl(rdx,1);
cmov32(Assembler::equal,rax,rcx);
testl(rdx,rcx);
cmov32(Assembler::notEqual,rax,rcx);
movl(Address(rsp,4),rax);
movl(Address(rsp,0),0);
fmul_d(Address(rsp,0)); // Stack: 2^X ...
addptr(rsp,sizeof(jdouble));
}
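
To make the encoding trick concrete, here is a minimal C++ sketch of the same decomposition (our illustration, not part of the commit; exp2_by_parts is a hypothetical name). The fractional part stays inside f2xm1's domain, and 2^int(X) is built by writing the biased exponent int(X)+1023 directly into the exponent field of an IEEE-754 double, which is what the shll(rax, 20) on the high 32-bit word achieves above; the sketch branches to NaN where the assembly uses cmov32.

#include <cmath>
#include <cstdint>
#include <cstring>

// Hypothetical model of pow_exp_core_encoding():
//   2^X = 2^int(X) * 2^(X - int(X))
double exp2_by_parts(double x) {
  double i = std::nearbyint(x);          // frndint: round in the current mode
  double frac = std::exp2(x - i);        // f2xm1/fld1/faddp: x-i is in [-1,1]
  if (!(std::fabs(i) <= 2047.0)) {       // huge or NaN: exponent can't fit
    return std::nan("");                 // caller must fall back to the runtime
  }
  int64_t e = (int64_t)i + 1023;         // add the IEEE-754 exponent bias
  if (e <= 0 || e >= 0x7FF) {            // 0x000 and 0x7FF are reserved
    return std::nan("");
  }
  uint64_t bits = (uint64_t)e << 52;     // exponent field of a double; the
                                         // shll(rax, 20) above does the same
                                         // placement in the high 32-bit word
  double pow2i;
  std::memcpy(&pow2i, &bits, sizeof(pow2i));
  return pow2i * frac;                   // fmul_d(Address(rsp, 0))
}
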
void MacroAssembler::fast_pow() {
// computes X^Y = 2^(Y * log2(X))
// if fast computation is not possible, result is NaN. Requires
// fallback from user of this macro.
fyl2x(); // Stack: (Y*log2(X)) ...
pow_exp_core_encoding(); // Stack: 2^(Y*log2(X)) = X^Y ...
}
void MacroAssembler::fast_exp() {
// computes exp(X) = 2^(X * log2(e))
// if fast computation is not possible, result is NaN. Requires
// fallback from user of this macro.
fldl2e(); // Stack: log2(e) X ...
fmulp(1); // Stack: (X*log2(e)) ...
pow_exp_core_encoding(); // Stack: exp(X) ...
}
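
Both helpers funnel into the same core through base-2 identities: exp(X) = 2^(X*log2(e)) and X^Y = 2^(Y*log2(X)), the latter meaningful only for X > 0 (fyl2x on a negative operand produces a NaN, which is exactly what routes callers to the fallback). A sketch in the same hypothetical style as exp2_by_parts() above:

#include <cmath>

// Hypothetical counterparts of fast_exp()/fast_pow(), reusing the
// exp2_by_parts() sketch from above.
double fast_exp_sketch(double x) {
  return exp2_by_parts(x * 1.4426950408889634);  // fldl2e; fmulp: x * log2(e)
}
double fast_pow_sketch(double x, double y) {
  return exp2_by_parts(y * std::log2(x));        // fyl2x: NaN when x < 0
}
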
void MacroAssembler::pow_or_exp(bool is_exp, int num_fpu_regs_in_use) {
// kills rax, rcx, rdx
// pow and exp need 2 extra registers on the FPU stack.
Label slow_case, done;
Register tmp = noreg;
if (!VM_Version::supports_cmov()) {
// fcmp needs a temporary register, so preserve rdx.
tmp = rdx;
}
Register tmp2 = rax;
NOT_LP64(Register tmp3 = rcx;)
if (is_exp) {
// Stack: X
fld_s(0); // duplicate argument for runtime call. Stack: X X
fast_exp(); // Stack: exp(X) X
fcmp(tmp, 0, false, false); // Stack: exp(X) X
// exp(X) not equal to itself: exp(X) is NaN, go to the slow case.
jcc(Assembler::parity, slow_case);
// get rid of duplicate argument. Stack: exp(X)
if (num_fpu_regs_in_use > 0) {
fxch();
fpop();
} else {
ffree(1);
}
jmp(done);
} else {
// Stack: X Y
Label x_negative, y_odd;
fldz(); // Stack: 0 X Y
fcmp(tmp, 1, true, false); // Stack: X Y
jcc(Assembler::above, x_negative);
// X >= 0
fld_s(1); // duplicate arguments for runtime call. Stack: Y X Y
fld_s(1); // Stack: X Y X Y
fast_pow(); // Stack: X^Y X Y
fcmp(tmp, 0, false, false); // Stack: X^Y X Y
// X^Y not equal to itself: X^Y is NaN, go to the slow case.
jcc(Assembler::parity, slow_case);
// get rid of duplicate arguments. Stack: X^Y
if (num_fpu_regs_in_use > 0) {
fxch(); fpop();
fxch(); fpop();
} else {
ffree(2);
ffree(1);
}
jmp(done);
// X < 0
bind(x_negative);
fld_s(1); // Stack: Y X Y
frndint(); // Stack: int(Y) X Y
fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
jcc(Assembler::notEqual, slow_case);
subptr(rsp, 8);
// For X^Y, when X < 0, Y has to be an integer and the final
// result depends on whether it's odd or even. We just checked
// that int(Y) == Y. We move int(Y) to the gp registers as a 64 bit
// integer to test its parity. If int(Y) is huge and doesn't fit in
// the 64 bit integer range, the integer indefinite value will end
// up in the gp registers. Huge numbers are all even, and the
// integer indefinite value is even too, so that's fine.
#ifdef ASSERT
// Let's check we don't end up with an integer indefinite number
// when not expected. First test for huge numbers: check whether
// int(Y)+1 == int(Y) which is true for very large numbers and
// those are all even. A 64 bit integer is guaranteed not to
// overflow for numbers where y+1 != y (when the precision is set
// to double precision).
Label y_not_huge;
fld1(); // Stack: 1 int(Y) X Y
fadd(1); // Stack: 1+int(Y) int(Y) X Y
#ifdef _LP64
// trip to memory to force the precision down from double extended
// precision
fstp_d(Address(rsp, 0));
fld_d(Address(rsp, 0));
#endif
fcmp(tmp, 1, true, false); // Stack: int(Y) X Y
#endif
// move int(Y) as 64 bit integer to thread's stack
fistp_d(Address(rsp,0)); // Stack: X Y
#ifdef ASSERT
jcc(Assembler::notEqual, y_not_huge);
// Y is huge so we know it's even. It may not fit in a 64 bit
// integer and we don't want the debug code below to see the
// integer indefinite value so overwrite int(Y) on the thread's
// stack with 0.
movl(Address(rsp, 0), 0);
movl(Address(rsp, 4), 0);
bind(y_not_huge);
#endif
fld_s(1); // duplicate arguments for runtime call. Stack: Y X Y
fld_s(1); // Stack: X Y X Y
fabs(); // Stack: abs(X) Y X Y
fast_pow(); // Stack: abs(X)^Y X Y
fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
// abs(X)^Y not equal to itself: abs(X)^Y is NaN, go to the slow case.
pop(tmp2);
NOT_LP64(pop(tmp3));
jcc(Assembler::parity, slow_case);
#ifdef ASSERT
// Check that int(Y) is not the integer indefinite value (int
// overflow). Shouldn't happen because for values that would
// overflow, 1+int(Y)==int(Y), which was tested earlier.
#ifndef _LP64
{
Label integer;
testl(tmp2, tmp2);
jcc(Assembler::notZero, integer);
cmpl(tmp3, 0x80000000);
jcc(Assembler::notZero, integer);
stop("integer indefinite value shouldn't be seen here");
bind(integer);
}
#else
{
Label integer;
shlq(tmp2, 1);
jcc(Assembler::carryClear, integer);
jcc(Assembler::notZero, integer);
stop("integer indefinite value shouldn't be seen here");
bind(integer);
}
#endif
#endif
// get rid of duplicate arguments. Stack: X^Y
if (num_fpu_regs_in_use > 0) {
fxch(); fpop();
fxch(); fpop();
} else {
ffree(2);
ffree(1);
}
testl(tmp2, 1);
jcc(Assembler::zero, done); // X < 0, Y even: X^Y = abs(X)^Y
// X < 0, Y odd: X^Y = -abs(X)^Y
fchs(); // Stack: -abs(X)^Y ...
jmp(done);
}
// slow case: runtime call
bind(slow_case);
fpop(); // pop incorrect result or int(Y)
fp_runtime_fallback(is_exp ? CAST_FROM_FN_PTR(address, SharedRuntime::dexp) : CAST_FROM_FN_PTR(address, SharedRuntime::dpow),
is_exp ? 1 : 2, num_fpu_regs_in_use);
// Come here with result in F-TOS
bind(done);
}
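
Putting it together, the pow policy implemented above can be modeled in plain C++ roughly as follows (pow_sketch is a hypothetical name; the real code keeps values on the FPU stack and reaches the runtime through fp_runtime_fallback):

#include <cmath>
#include <cstdint>

// Hypothetical model of the pow path of pow_or_exp(): for x < 0, y
// must be an integer, and the sign of the result follows its parity.
double pow_sketch(double x, double y) {
  if (!(x < 0.0)) {                            // x >= 0, or x is NaN
    double r = fast_pow_sketch(x, y);
    return std::isnan(r) ? std::pow(x, y) : r; // NaN result: slow_case
  }
  if (std::nearbyint(y) != y) {
    return std::pow(x, y);                     // y not an integer: slow_case
  }
  // Every double at or beyond 2^53 is an even integer, so only smaller
  // magnitudes can be odd; this mirrors the "huge int(Y) is even"
  // reasoning in the assembly.
  bool y_odd = std::fabs(y) < 9007199254740992.0 && ((int64_t)y & 1) != 0;
  double r = fast_pow_sketch(-x, y);           // abs(x)^y
  if (std::isnan(r)) return std::pow(x, y);    // slow_case
  return y_odd ? -r : r;                       // fchs() when y is odd
}
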
void MacroAssembler::fpop() {
ffree();
fincstp();
@@ -8045,6 +8296,144 @@ void MacroAssembler::incr_allocated_bytes(Register thread,
#endif
}
void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
pusha();
// if we are coming from c1, xmm registers may be live
if (UseSSE >= 1) {
subptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
}
int off = 0;
if (UseSSE == 1) {
movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
} else if (UseSSE >= 2) {
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm0);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm1);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm2);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm3);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm4);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm5);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm6);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm7);
#ifdef _LP64
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm8);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm9);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm10);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm11);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm12);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm13);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm14);
movdbl(Address(rsp,off++*sizeof(jdouble)),xmm15);
#endif
}
// Preserve registers across runtime call
int incoming_argument_and_return_value_offset = -1;
if (num_fpu_regs_in_use > 1) {
// Must preserve all other FPU regs (could alternatively convert
// SharedRuntime::dsin, dcos etc. into assembly routines known not to
// trash FPU state, but we cannot trust the C compiler)
NEEDS_CLEANUP;
// NOTE that in this case we also push the incoming argument(s) to
// the stack and restore it later; we also use this stack slot to
// hold the return value from dsin, dcos etc.
for (int i = 0; i < num_fpu_regs_in_use; i++) {
subptr(rsp, sizeof(jdouble));
fstp_d(Address(rsp, 0));
}
incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
for (int i = nb_args-1; i >= 0; i--) {
fld_d(Address(rsp, incoming_argument_and_return_value_offset-i*sizeof(jdouble)));
}
}
subptr(rsp, nb_args*sizeof(jdouble));
for (int i = 0; i < nb_args; i++) {
fstp_d(Address(rsp, i*sizeof(jdouble)));
}
#ifdef _LP64
if (nb_args > 0) {
movdbl(xmm0, Address(rsp, 0));
}
if (nb_args > 1) {
movdbl(xmm1, Address(rsp, sizeof(jdouble)));
}
assert(nb_args <= 2, "unsupported number of args");
#endif // _LP64
// NOTE: we must not use call_VM_leaf here because that requires a
// complete interpreter frame in debug mode -- same bug as 4387334
// MacroAssembler::call_VM_leaf_base is perfectly safe and will
// do proper 64bit abi
NEEDS_CLEANUP;
// Need to add stack banging before this runtime call if it needs to
// be taken; however, there is no generic stack banging routine at
// the MacroAssembler level
MacroAssembler::call_VM_leaf_base(runtime_entry, 0);
#ifdef _LP64
movsd(Address(rsp, 0), xmm0);
fld_d(Address(rsp, 0));
#endif // _LP64
addptr(rsp, sizeof(jdouble) * nb_args);
if (num_fpu_regs_in_use > 1) {
// Must save return value to stack and then restore entire FPU
// stack except incoming arguments
fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) {
fld_d(Address(rsp, 0));
addptr(rsp, sizeof(jdouble));
}
fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
addptr(rsp, sizeof(jdouble) * nb_args);
}
off = 0;
if (UseSSE == 1) {
movflt(xmm0, Address(rsp,off++*sizeof(jdouble)));
movflt(xmm1, Address(rsp,off++*sizeof(jdouble)));
movflt(xmm2, Address(rsp,off++*sizeof(jdouble)));
movflt(xmm3, Address(rsp,off++*sizeof(jdouble)));
movflt(xmm4, Address(rsp,off++*sizeof(jdouble)));
movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
} else if (UseSSE >= 2) {
movdbl(xmm0, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm1, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm2, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm3, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm4, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm5, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm6, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm7, Address(rsp,off++*sizeof(jdouble)));
#ifdef _LP64
movdbl(xmm8, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm9, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm10, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm11, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm12, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm13, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm14, Address(rsp,off++*sizeof(jdouble)));
movdbl(xmm15, Address(rsp,off++*sizeof(jdouble)));
#endif
}
if (UseSSE >= 1) {
addptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
}
popa();
}
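
To make the bookkeeping easier to follow, here is a worked walk-through (our annotation, not from the source) for num_fpu_regs_in_use == 2 and nb_args == 1 on LP64 with UseSSE >= 2, i.e. the dexp fallback taken from pow_or_exp() with one extra FPU register live:

// pusha();  subptr(rsp, 16*sizeof(jdouble));  movdbl(...) x16
//                                  -> GPRs and xmm0..xmm15 saved
// two fstp_d in the spill loop:
//   [rsp+8] = old ST(0) = X        -> offset 8 is remembered as
//   [rsp+0] = old ST(1)               incoming_argument_and_return_value_offset
// fld_d([rsp+8])                   -> X reloaded onto the FPU stack
// subptr(rsp, 8);  fstp_d([rsp+0]) -> outgoing argument slot
// movdbl(xmm0, [rsp+0])            -> the 64-bit ABI passes X in xmm0
// call_VM_leaf_base(SharedRuntime::dexp, 0)
// movsd([rsp+0], xmm0);  fld_d([rsp+0]);  addptr(rsp, 8)
//                                  -> result moved from xmm0 to the FPU TOS
// fstp_d([rsp+8])                  -> result parked in X's old slot
// fld_d([rsp+0]);  addptr(rsp, 8)  -> old ST(1) restored
// fld_d([rsp+0]);  addptr(rsp, 8)  -> result reloaded on top
// movdbl(...) x16;  addptr(rsp, 16*sizeof(jdouble));  popa()
//                                  -> xmm registers and GPRs restored
// FPU stack on exit: exp(X) on top, the caller's other live register below
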
static const double pi_4 = 0.7853981633974483;
void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
@@ -8092,73 +8481,27 @@ void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
// slow case: runtime call
bind(slow_case);
-// Preserve registers across runtime call
-pusha();
-int incoming_argument_and_return_value_offset = -1;
-if (num_fpu_regs_in_use > 1) {
-// Must preserve all other FPU regs (could alternatively convert
-// SharedRuntime::dsin and dcos into assembly routines known not to trash
-// FPU state, but can not trust C compiler)
-NEEDS_CLEANUP;
-// NOTE that in this case we also push the incoming argument to
-// the stack and restore it later; we also use this stack slot to
-// hold the return value from dsin or dcos.
-for (int i = 0; i < num_fpu_regs_in_use; i++) {
-subptr(rsp, sizeof(jdouble));
-fstp_d(Address(rsp, 0));
-}
-incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
-fld_d(Address(rsp, incoming_argument_and_return_value_offset));
-}
-subptr(rsp, sizeof(jdouble));
-fstp_d(Address(rsp, 0));
-#ifdef _LP64
-movdbl(xmm0, Address(rsp, 0));
-#endif // _LP64
-// NOTE: we must not use call_VM_leaf here because that requires a
-// complete interpreter frame in debug mode -- same bug as 4387334
-// MacroAssembler::call_VM_leaf_base is perfectly safe and will
-// do proper 64bit abi
-NEEDS_CLEANUP;
-// Need to add stack banging before this runtime call if it needs to
-// be taken; however, there is no generic stack banging routine at
-// the MacroAssembler level
switch(trig) {
case 's':
{
-MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 0);
+fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 1, num_fpu_regs_in_use);
}
break;
case 'c':
{
-MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 0);
+fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 1, num_fpu_regs_in_use);
}
break;
case 't':
{
-MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 0);
+fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 1, num_fpu_regs_in_use);
}
break;
default:
assert(false, "bad intrinsic");
break;
}
-#ifdef _LP64
-movsd(Address(rsp, 0), xmm0);
-fld_d(Address(rsp, 0));
-#endif // _LP64
-addptr(rsp, sizeof(jdouble));
-if (num_fpu_regs_in_use > 1) {
-// Must save return value to stack and then restore entire FPU stack
-fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
-for (int i = 0; i < num_fpu_regs_in_use; i++) {
-fld_d(Address(rsp, 0));
-addptr(rsp, sizeof(jdouble));
-}
-}
-popa();
// Come here with result in F-TOS
bind(done);