mirror of
https://github.com/openjdk/jdk.git
synced 2025-08-25 13:54:38 +02:00
8055494: Add C2 x86 intrinsic for BigInteger::multiplyToLen() method
Add new C2 intrinsic for BigInteger::multiplyToLen() on x86 in 64-bit VM. Reviewed-by: roland
This commit is contained in:
parent
8b93fb04cd
commit
b9e949183d
19 changed files with 1062 additions and 22 deletions
|
@ -4937,6 +4937,26 @@ void Assembler::addq(Register dst, Register src) {
|
|||
emit_arith(0x03, 0xC0, dst, src);
|
||||
}
|
||||
|
||||
// ADCX, 64-bit register-register form (unsigned add using the carry flag).
// Emits: 0x66, whatever prefix prefixq_and_encode() produces for the two
// registers, the opcode bytes 0x0F 0x38 0xF6, and a register-direct ModRM byte.
void Assembler::adcxq(Register dst, Register src) {
  // NOTE(review): the ADX feature assert below is disabled, so callers have to
  // check VM_Version::supports_adx() themselves before emitting this.
  //assert(VM_Version::supports_adx(), "adx instructions not supported");
  emit_int8((unsigned char)0x66);
  const int reg_bits = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_int8(0x0F);
  emit_int8(0x38);
  emit_int8((unsigned char)0xF6);
  emit_int8((unsigned char)(0xC0 | reg_bits));
}
|
||||
|
||||
// ADOX, 64-bit register-register form (unsigned add using the overflow flag).
// Same byte layout as adcxq except that the leading prefix byte is 0xF3.
void Assembler::adoxq(Register dst, Register src) {
  // NOTE(review): the ADX feature assert below is disabled, so callers have to
  // check VM_Version::supports_adx() themselves before emitting this.
  //assert(VM_Version::supports_adx(), "adx instructions not supported");
  emit_int8((unsigned char)0xF3);
  const int reg_bits = prefixq_and_encode(dst->encoding(), src->encoding());
  emit_int8(0x0F);
  emit_int8(0x38);
  emit_int8((unsigned char)0xF6);
  emit_int8((unsigned char)(0xC0 | reg_bits));
}
|
||||
|
||||
void Assembler::andq(Address dst, int32_t imm32) {
|
||||
InstructionMark im(this);
|
||||
prefixq(dst);
|
||||
|
@ -5444,6 +5464,26 @@ void Assembler::movzwq(Register dst, Register src) {
|
|||
emit_int8((unsigned char)(0xC0 | encode));
|
||||
}
|
||||
|
||||
// Unsigned 64-bit multiply by a memory operand.
// Emits the 64-bit prefix for the address, opcode 0xF7, then the operand.
void Assembler::mulq(Address src) {
|
||||
InstructionMark im(this);
|
||||
prefixq(src);
|
||||
emit_int8((unsigned char)0xF7);
|
||||
// rsp is not an operand here: it is passed only for its register number,
// which fills the opcode-extension field (F7 /4 is MUL).
emit_operand(rsp, src);
|
||||
}
|
||||
|
||||
// Unsigned 64-bit multiply by a register: opcode 0xF7 followed by a
// register-direct ModRM byte built from 0xE0 and the register bits.
void Assembler::mulq(Register src) {
  const int rm_bits = prefixq_and_encode(src->encoding());
  emit_int8((unsigned char)0xF7);
  emit_int8((unsigned char)(0xE0 | rm_bits));
}
|
||||
|
||||
// MULX (BMI2), 64-bit: multiplies src by rdx and writes the result to the
// register pair dst1:dst2 (callers in this code base use dst1 for the high
// half and dst2 for the low half). VEX-encoded, opcode byte 0xF6.
void Assembler::mulxq(Register dst1, Register dst2, Register src) {
  assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported");
  const int reg_bits = vex_prefix_and_encode(dst1->encoding(), dst2->encoding(), src->encoding(),
                                             VEX_SIMD_F2, VEX_OPCODE_0F_38, true, false);
  emit_int8((unsigned char)0xF6);
  emit_int8((unsigned char)(0xC0 | reg_bits));
}
|
||||
|
||||
void Assembler::negq(Register dst) {
|
||||
int encode = prefixq_and_encode(dst->encoding());
|
||||
emit_int8((unsigned char)0xF7);
|
||||
|
@ -5572,6 +5612,28 @@ void Assembler::rclq(Register dst, int imm8) {
|
|||
emit_int8(imm8);
|
||||
}
|
||||
}
|
||||
|
||||
// 64-bit rotate right of dst by an immediate count.
// A count of exactly 1 uses the short form (opcode 0xD1, no immediate byte);
// every other count uses opcode 0xC1 followed by the count as an 8-bit immediate.
void Assembler::rorq(Register dst, int imm8) {
  assert(isShiftCount(imm8 >> 1), "illegal shift count");
  const int rm_bits = prefixq_and_encode(dst->encoding());
  const bool rotate_by_one = (imm8 == 1);
  emit_int8((unsigned char)(rotate_by_one ? 0xD1 : 0xC1));
  emit_int8((unsigned char)(0xC8 | rm_bits));
  if (!rotate_by_one) {
    emit_int8(imm8);
  }
}
|
||||
|
||||
// RORX (BMI2), 64-bit: rotate of src by imm8 with the result written to dst.
// VEX-encoded, opcode byte 0xF0, followed by the 8-bit rotate count.
void Assembler::rorxq(Register dst, Register src, int imm8) {
  assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported");
  const int reg_bits = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(),
                                             VEX_SIMD_F2, VEX_OPCODE_0F_3A, true, false);
  emit_int8((unsigned char)0xF0);
  emit_int8((unsigned char)(0xC0 | reg_bits));
  emit_int8(imm8);
}
|
||||
|
||||
void Assembler::sarq(Register dst, int imm8) {
|
||||
assert(isShiftCount(imm8 >> 1), "illegal shift count");
|
||||
int encode = prefixq_and_encode(dst->encoding());
|
||||
|
|
|
@ -888,6 +888,14 @@ private:
|
|||
void addq(Register dst, Address src);
|
||||
void addq(Register dst, Register src);
|
||||
|
||||
#ifdef _LP64
|
||||
//Add Unsigned Integers with Carry Flag
|
||||
void adcxq(Register dst, Register src);
|
||||
|
||||
//Add Unsigned Integers with Overflow Flag
|
||||
void adoxq(Register dst, Register src);
|
||||
#endif
|
||||
|
||||
void addr_nop_4();
|
||||
void addr_nop_5();
|
||||
void addr_nop_7();
|
||||
|
@ -1204,19 +1212,20 @@ private:
|
|||
void idivl(Register src);
|
||||
void divl(Register src); // Unsigned division
|
||||
|
||||
#ifdef _LP64
|
||||
void idivq(Register src);
|
||||
#endif
|
||||
|
||||
void imull(Register dst, Register src);
|
||||
void imull(Register dst, Register src, int value);
|
||||
void imull(Register dst, Address src);
|
||||
|
||||
#ifdef _LP64
|
||||
void imulq(Register dst, Register src);
|
||||
void imulq(Register dst, Register src, int value);
|
||||
#ifdef _LP64
|
||||
void imulq(Register dst, Address src);
|
||||
#endif
|
||||
|
||||
|
||||
// jcc is the generic conditional branch generator to run-
|
||||
// time routines, jcc is used for branches to labels. jcc
|
||||
// takes a branch opcode (cc) and a label (L) and generates
|
||||
|
@ -1408,9 +1417,16 @@ private:
|
|||
void movzwq(Register dst, Register src);
|
||||
#endif
|
||||
|
||||
// Unsigned multiply with RAX destination register
|
||||
void mull(Address src);
|
||||
void mull(Register src);
|
||||
|
||||
#ifdef _LP64
|
||||
void mulq(Address src);
|
||||
void mulq(Register src);
|
||||
void mulxq(Register dst1, Register dst2, Register src);
|
||||
#endif
|
||||
|
||||
// Multiply Scalar Double-Precision Floating-Point Values
|
||||
void mulsd(XMMRegister dst, Address src);
|
||||
void mulsd(XMMRegister dst, XMMRegister src);
|
||||
|
@ -1541,6 +1557,11 @@ private:
|
|||
|
||||
void ret(int imm16);
|
||||
|
||||
#ifdef _LP64
|
||||
void rorq(Register dst, int imm8);
|
||||
void rorxq(Register dst, Register src, int imm8);
|
||||
#endif
|
||||
|
||||
void sahf();
|
||||
|
||||
void sarl(Register dst, int imm8);
|
||||
|
|
|
@ -176,6 +176,8 @@ define_pd_global(uintx, TypeProfileLevel, 111);
|
|||
"Use count trailing zeros instruction") \
|
||||
\
|
||||
product(bool, UseBMI1Instructions, false, \
|
||||
"Use BMI instructions")
|
||||
|
||||
"Use BMI1 instructions") \
|
||||
\
|
||||
product(bool, UseBMI2Instructions, false, \
|
||||
"Use BMI2 instructions")
|
||||
#endif // CPU_X86_VM_GLOBALS_X86_HPP
|
||||
|
|
|
@ -7293,6 +7293,467 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
|
|||
bind(L_done);
|
||||
}
|
||||
|
||||
#ifdef _LP64
|
||||
/**
|
||||
* Helper for multiply_to_len().
|
||||
*
* Emits a 128-bit accumulate into the register pair dest_hi:dest_lo:
* first src1 and then src2 is added to dest_lo, and after each add the
* carry out of the low word is folded into dest_hi with an add-with-carry
* of zero. The order of the four instructions matters because each adcq
* consumes the carry flag produced by the addq directly before it.
*/
|
||||
void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
|
||||
addq(dest_lo, src1);
|
||||
adcq(dest_hi, 0);
|
||||
addq(dest_lo, src2);
|
||||
adcq(dest_hi, 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Multiply 64 bit by 64 bit first loop.
|
||||
*
* Emits the first pass of the multiplication. The last chunk of x (two
* 32-bit words, or only x[0] when a single word is left) is multiplied by
* y, walking y from its end towards index 0 two 32-bit words at a time.
* For every step the low 64 bits of the result are stored into z and the
* high 64 bits are kept in 'carry' for the next step.
*
* All three arrays are addressed with a times_4 scale, i.e. as 32-bit
* words. 'product' serves as the implicit rax operand of mulq (see the
* "product(rax)" remark in the body), so callers are expected to pass rax
* for it; rdx is overwritten by the multiply.
*/
|
||||
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
|
||||
Register y, Register y_idx, Register z,
|
||||
Register carry, Register product,
|
||||
Register idx, Register kdx) {
|
||||
//
|
||||
// jlong carry, x[], y[], z[];
|
||||
// for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
|
||||
// huge_128 product = y[idx] * x[xstart] + carry;
|
||||
// z[kdx] = (jlong)product;
|
||||
// carry = (jlong)(product >>> 64);
|
||||
// }
|
||||
// z[xstart] = carry;
|
||||
//
|
||||
|
|
||||
Label L_first_loop, L_first_loop_exit;
|
||||
Label L_one_x, L_one_y, L_multiply;
|
||||
|
|
||||
// Step xstart back by one more 32-bit word so that a full 64-bit chunk of x
// can be loaded; if that goes negative only x[0] is left (L_one_x).
decrementl(xstart);
|
||||
jcc(Assembler::negative, L_one_x);
|
||||
|
|
||||
movq(x_xstart, Address(x, xstart, Address::times_4, 0));
|
||||
rorq(x_xstart, 32); // convert big-endian to little-endian
|
||||
|
|
||||
bind(L_first_loop);
|
||||
// idx is stepped down twice per iteration: the first decrement detects that
// y is exhausted, the second detects that only the single word y[0] is left.
decrementl(idx);
|
||||
jcc(Assembler::negative, L_first_loop_exit);
|
||||
decrementl(idx);
|
||||
jcc(Assembler::negative, L_one_y);
|
||||
movq(y_idx, Address(y, idx, Address::times_4, 0));
|
||||
rorq(y_idx, 32); // convert big-endian to little-endian
|
||||
bind(L_multiply);
|
||||
movq(product, x_xstart);
|
||||
mulq(y_idx); // product(rax) * y_idx -> rdx:rax
|
||||
addq(product, carry);
|
||||
adcq(rdx, 0);
|
||||
subl(kdx, 2);
|
||||
// Store the low 64 bits as two 32-bit words, least significant word at the
// higher index (z[kdx+1]) and the upper word at z[kdx].
movl(Address(z, kdx, Address::times_4, 4), product);
|
||||
shrq(product, 32);
|
||||
movl(Address(z, kdx, Address::times_4, 0), product);
|
||||
movq(carry, rdx);
|
||||
jmp(L_first_loop);
|
||||
|
|
||||
// Only one 32-bit word of y remains: load y[0] with a 32-bit move.
bind(L_one_y);
|
||||
movl(y_idx, Address(y, 0));
|
||||
jmp(L_multiply);
|
||||
|
|
||||
// Only one 32-bit word of x remains: load x[0] with a 32-bit move.
bind(L_one_x);
|
||||
movl(x_xstart, Address(x, 0));
|
||||
jmp(L_first_loop);
|
||||
|
|
||||
bind(L_first_loop_exit);
|
||||
}
|
||||
|
||||
/**
|
||||
* Multiply 64 bit by 64 bit and add 128 bit.
|
||||
*
* Emits one multiply-accumulate step: the 64-bit chunk of y at byte
* 'offset' from y[idx] is multiplied by x_xstart, then 'carry' and the
* 64-bit chunk of z at the same position are added to the 128-bit result.
* The low 64 bits are written back to z; the high 64 bits are left in rdx
* for the caller to pick up. yz_idx is a scratch register that first holds
* the y chunk and then the z chunk. 'product' is the implicit rax operand
* of mulq (see the "product(rax)" remark below).
*/
|
||||
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
|
||||
Register yz_idx, Register idx,
|
||||
Register carry, Register product, int offset) {
|
||||
// huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
|
||||
// z[kdx] = (jlong)product;
|
||||
|
|
||||
movq(yz_idx, Address(y, idx, Address::times_4, offset));
|
||||
rorq(yz_idx, 32); // convert big-endian to little-endian
|
||||
movq(product, x_xstart);
|
||||
mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
|
||||
movq(yz_idx, Address(z, idx, Address::times_4, offset));
|
||||
rorq(yz_idx, 32); // convert big-endian to little-endian
|
||||
|
|
||||
add2_with_carry(rdx, product, carry, yz_idx);
|
||||
|
|
||||
// Write the low 64 bits back as two 32-bit words: low word at offset+4,
// high word at offset.
movl(Address(z, idx, Address::times_4, offset+4), product);
|
||||
shrq(product, 32);
|
||||
movl(Address(z, idx, Address::times_4, offset), product);
|
||||
|
|
||||
}
|
||||
|
||||
/**
|
||||
* Multiply 128 bit by 128 bit. Unrolled inner loop.
|
||||
*
* Emits the inner (third) loop for one chunk of x held in x_xstart.
* On entry idx holds a count of 32-bit words; the main loop consumes four
* words (two 64-bit multiply-accumulate steps) per iteration, and the
* tail handles a remaining 64-bit chunk and/or a remaining single 32-bit
* word. jdx is used as the iteration counter, carry2 as the intermediate
* carry between the two steps of one iteration. The final carry is left
* in 'carry'. rdx is overwritten.
*/
|
||||
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
|
||||
Register yz_idx, Register idx, Register jdx,
|
||||
Register carry, Register product,
|
||||
Register carry2) {
|
||||
// jlong carry, x[], y[], z[];
|
||||
// int kdx = ystart+1;
|
||||
// for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
|
||||
// huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
|
||||
// z[kdx+idx+1] = (jlong)product;
|
||||
// jlong carry2 = (jlong)(product >>> 64);
|
||||
// product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
|
||||
// z[kdx+idx] = (jlong)product;
|
||||
// carry = (jlong)(product >>> 64);
|
||||
// }
|
||||
// idx += 2;
|
||||
// if (idx > 0) {
|
||||
// product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
|
||||
// z[kdx+idx] = (jlong)product;
|
||||
// carry = (jlong)(product >>> 64);
|
||||
// }
|
||||
//
|
||||
|
|
||||
Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
|
||||
|
|
||||
// jdx = idx / 4: number of full four-word iterations.
movl(jdx, idx);
|
||||
andl(jdx, 0xFFFFFFFC);
|
||||
shrl(jdx, 2);
|
||||
|
|
||||
bind(L_third_loop);
|
||||
subl(jdx, 1);
|
||||
jcc(Assembler::negative, L_third_loop_exit);
|
||||
subl(idx, 4);
|
||||
|
|
||||
// Chunk at byte offset 8 first, then the chunk at offset 0, chaining the
// high half of each result (rdx) into the next step as its carry.
multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
|
||||
movq(carry2, rdx);
|
||||
|
|
||||
multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
|
||||
movq(carry, rdx);
|
||||
jmp(L_third_loop);
|
||||
|
|
||||
bind (L_third_loop_exit);
|
||||
|
|
||||
// Tail: idx & 3 words are left over.
andl (idx, 0x3);
|
||||
jcc(Assembler::zero, L_post_third_loop_done);
|
||||
|
|
||||
Label L_check_1;
|
||||
// Two or three words left: handle one more 64-bit chunk.
subl(idx, 2);
|
||||
jcc(Assembler::negative, L_check_1);
|
||||
|
|
||||
multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
|
||||
movq(carry, rdx);
|
||||
|
|
||||
bind (L_check_1);
|
||||
// Undo the subtraction above and test the lowest bit: an odd remainder means
// one single 32-bit word is still to be processed (idx becomes 0).
addl (idx, 0x2);
|
||||
andl (idx, 0x1);
|
||||
subl(idx, 1);
|
||||
jcc(Assembler::negative, L_post_third_loop_done);
|
||||
|
|
||||
movl(yz_idx, Address(y, idx, Address::times_4, 0));
|
||||
movq(product, x_xstart);
|
||||
mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
|
||||
movl(yz_idx, Address(z, idx, Address::times_4, 0));
|
||||
|
|
||||
add2_with_carry(rdx, product, yz_idx, carry);
|
||||
|
|
||||
// Only 32 bits were consumed, so only the low 32 bits are stored; the new
// carry is the 128-bit result shifted right by 32 (low 64 bits of that).
movl(Address(z, idx, Address::times_4, 0), product);
|
||||
shrq(product, 32);
|
||||
|
|
||||
shlq(rdx, 32);
|
||||
orq(product, rdx);
|
||||
movq(carry, product);
|
||||
|
|
||||
bind(L_post_third_loop_done);
|
||||
}
|
||||
|
||||
/**
|
||||
* Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
|
||||
*
|
||||
* Same job as multiply_128_x_128_loop(), but the multiplier is taken from
* rdx (implicit operand of mulxq) and the rotations use rorxq. When
* VM_Version::supports_adx() is true the accumulation uses adcxq/adoxq,
* otherwise add2_with_carry(). On entry idx holds a count of 32-bit words;
* the final carry is left in 'carry'. jdx, carry2, yz_idx1, yz_idx2, tmp,
* tmp3 and tmp4 are scratch registers.
*/
|
||||
void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
|
||||
Register carry, Register carry2,
|
||||
Register idx, Register jdx,
|
||||
Register yz_idx1, Register yz_idx2,
|
||||
Register tmp, Register tmp3, Register tmp4) {
|
||||
assert(UseBMI2Instructions, "should be used only when BMI2 is available");
|
||||
|
|
||||
// jlong carry, x[], y[], z[];
|
||||
// int kdx = ystart+1;
|
||||
// for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
|
||||
// huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
|
||||
// jlong carry2 = (jlong)(tmp3 >>> 64);
|
||||
// huge_128 tmp4 = (y[idx] * rdx) + z[kdx+idx] + carry2;
|
||||
// carry = (jlong)(tmp4 >>> 64);
|
||||
// z[kdx+idx+1] = (jlong)tmp3;
|
||||
// z[kdx+idx] = (jlong)tmp4;
|
||||
// }
|
||||
// idx += 2;
|
||||
// if (idx > 0) {
|
||||
// yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
|
||||
// z[kdx+idx] = (jlong)yz_idx1;
|
||||
// carry = (jlong)(yz_idx1 >>> 64);
|
||||
// }
|
||||
//
|
||||
|
|
||||
Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
|
||||
|
|
||||
// jdx = idx / 4: number of full four-word iterations.
movl(jdx, idx);
|
||||
andl(jdx, 0xFFFFFFFC);
|
||||
shrl(jdx, 2);
|
||||
|
|
||||
bind(L_third_loop);
|
||||
subl(jdx, 1);
|
||||
jcc(Assembler::negative, L_third_loop_exit);
|
||||
subl(idx, 4);
|
||||
|
|
||||
// Load both 64-bit chunks of y for this iteration (offset 8 = less
// significant chunk, offset 0 = more significant chunk).
movq(yz_idx1, Address(y, idx, Address::times_4, 8));
|
||||
rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
|
||||
movq(yz_idx2, Address(y, idx, Address::times_4, 0));
|
||||
rorxq(yz_idx2, yz_idx2, 32);
|
||||
|
|
||||
mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
|
||||
mulxq(carry2, tmp, yz_idx2); // yz_idx2 * rdx -> carry2:tmp
|
||||
|
|
||||
// yz_idx1/yz_idx2 are now reused for the matching chunks of z.
movq(yz_idx1, Address(z, idx, Address::times_4, 8));
|
||||
rorxq(yz_idx1, yz_idx1, 32);
|
||||
movq(yz_idx2, Address(z, idx, Address::times_4, 0));
|
||||
rorxq(yz_idx2, yz_idx2, 32);
|
||||
|
|
||||
if (VM_Version::supports_adx()) {
|
||||
// Two independent carry chains: adcxq uses the carry flag, adoxq the
// overflow flag.
// NOTE(review): both flags are consumed by the first adcxq/adoxq, so they
// must be clear here. The last flag-setting instruction emitted before this
// point appears to be subl(idx, 4) above; this presumably relies on the
// moves, rorxq and mulxq in between leaving the flags alone - confirm.
adcxq(tmp3, carry);
|
||||
adoxq(tmp3, yz_idx1);
|
||||
|
|
||||
adcxq(tmp4, tmp);
|
||||
adoxq(tmp4, yz_idx2);
|
||||
|
|
||||
// Fold the two outstanding flag carries into carry2 by adding zero.
movl(carry, 0); // does not affect flags
|
||||
adcxq(carry2, carry);
|
||||
adoxq(carry2, carry);
|
||||
} else {
|
||||
add2_with_carry(tmp4, tmp3, carry, yz_idx1);
|
||||
add2_with_carry(carry2, tmp4, tmp, yz_idx2);
|
||||
}
|
||||
movq(carry, carry2);
|
||||
|
|
||||
// Write back tmp3 (words at offsets 12 and 8) and tmp4 (offsets 4 and 0),
// low 32 bits at the higher offset in each pair.
movl(Address(z, idx, Address::times_4, 12), tmp3);
|
||||
shrq(tmp3, 32);
|
||||
movl(Address(z, idx, Address::times_4, 8), tmp3);
|
||||
|
|
||||
movl(Address(z, idx, Address::times_4, 4), tmp4);
|
||||
shrq(tmp4, 32);
|
||||
movl(Address(z, idx, Address::times_4, 0), tmp4);
|
||||
|
|
||||
jmp(L_third_loop);
|
||||
|
|
||||
bind (L_third_loop_exit);
|
||||
|
|
||||
// Tail: idx & 3 words are left over.
andl (idx, 0x3);
|
||||
jcc(Assembler::zero, L_post_third_loop_done);
|
||||
|
|
||||
Label L_check_1;
|
||||
// Two or three words left: handle one more 64-bit chunk.
subl(idx, 2);
|
||||
jcc(Assembler::negative, L_check_1);
|
||||
|
|
||||
movq(yz_idx1, Address(y, idx, Address::times_4, 0));
|
||||
rorxq(yz_idx1, yz_idx1, 32);
|
||||
mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3
|
||||
movq(yz_idx2, Address(z, idx, Address::times_4, 0));
|
||||
rorxq(yz_idx2, yz_idx2, 32);
|
||||
|
|
||||
add2_with_carry(tmp4, tmp3, carry, yz_idx2);
|
||||
|
|
||||
movl(Address(z, idx, Address::times_4, 4), tmp3);
|
||||
shrq(tmp3, 32);
|
||||
movl(Address(z, idx, Address::times_4, 0), tmp3);
|
||||
movq(carry, tmp4);
|
||||
|
|
||||
bind (L_check_1);
|
||||
// Undo the subtraction above and test the lowest bit: an odd remainder means
// one single 32-bit word is still to be processed (idx becomes 0).
addl (idx, 0x2);
|
||||
andl (idx, 0x1);
|
||||
subl(idx, 1);
|
||||
jcc(Assembler::negative, L_post_third_loop_done);
|
||||
movl(tmp4, Address(y, idx, Address::times_4, 0));
|
||||
mulxq(carry2, tmp3, tmp4); // tmp4 * rdx -> carry2:tmp3
|
||||
movl(tmp4, Address(z, idx, Address::times_4, 0));
|
||||
|
|
||||
add2_with_carry(carry2, tmp3, tmp4, carry);
|
||||
|
|
||||
// Only 32 bits were consumed, so only the low 32 bits are stored; the new
// carry is the 128-bit result shifted right by 32 (low 64 bits of that).
movl(Address(z, idx, Address::times_4, 0), tmp3);
|
||||
shrq(tmp3, 32);
|
||||
|
|
||||
shlq(carry2, 32);
|
||||
orq(tmp3, carry2);
|
||||
movq(carry, tmp3);
|
||||
|
|
||||
bind(L_post_third_loop_done);
|
||||
}
|
||||
|
||||
/**
|
||||
* Code for BigInteger::multiplyToLen() intrinsic.
|
||||
*
|
||||
* rdi: x
|
||||
* rax: xlen
|
||||
* rsi: y
|
||||
* rcx: ylen
|
||||
* r8: z
|
||||
* r11: zlen
|
||||
* r12: tmp1
|
||||
* r13: tmp2
|
||||
* r14: tmp3
|
||||
* r15: tmp4
|
||||
* rbx: tmp5
|
||||
*
|
||||
* The five tmp registers, xlen and zlen are pushed on entry and popped
* again at L_done, so they hold their incoming values on return. rdx must
* be distinct from every argument register; it is used as the implicit
* multiply operand by the helper loops. The arrays are addressed with a
* times_4 scale, i.e. as 32-bit words.
*/
|
||||
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
|
||||
Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
|
||||
ShortBranchVerifier sbv(this);
|
||||
assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);
|
||||
|
|
||||
push(tmp1);
|
||||
push(tmp2);
|
||||
push(tmp3);
|
||||
push(tmp4);
|
||||
push(tmp5);
|
||||
|
|
||||
push(xlen);
|
||||
push(zlen);
|
||||
|
|
||||
// Symbolic names for the registers; several physical registers are reused
// for more than one role (xlen doubles as 'product', zlen as 'x_xstart').
const Register idx = tmp1;
|
||||
const Register kdx = tmp2;
|
||||
const Register xstart = tmp3;
|
||||
|
|
||||
const Register y_idx = tmp4;
|
||||
const Register carry = tmp5;
|
||||
const Register product = xlen;
|
||||
const Register x_xstart = zlen; // reuse register
|
||||
|
|
||||
// First Loop.
|
||||
//
|
||||
// final static long LONG_MASK = 0xffffffffL;
|
||||
// int xstart = xlen - 1;
|
||||
// int ystart = ylen - 1;
|
||||
// long carry = 0;
|
||||
// for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
|
||||
// long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
|
||||
// z[kdx] = (int)product;
|
||||
// carry = product >>> 32;
|
||||
// }
|
||||
// z[xstart] = (int)carry;
|
||||
//
|
||||
|
|
||||
movl(idx, ylen); // idx = ylen;
|
||||
movl(kdx, zlen); // kdx = xlen+ylen;
|
||||
xorq(carry, carry); // carry = 0;
|
||||
|
|
||||
Label L_done;
|
||||
|
|
||||
// xstart = xlen - 1; nothing to do when x is empty.
movl(xstart, xlen);
|
||||
decrementl(xstart);
|
||||
jcc(Assembler::negative, L_done);
|
||||
|
|
||||
multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
|
||||
|
|
||||
// Store the carry left by the first loop just below the words it wrote:
// nothing if kdx is already 0, one 32-bit word if a single slot is left,
// otherwise both halves of the 64-bit carry.
Label L_second_loop;
|
||||
testl(kdx, kdx);
|
||||
jcc(Assembler::zero, L_second_loop);
|
||||
|
|
||||
Label L_carry;
|
||||
subl(kdx, 1);
|
||||
jcc(Assembler::zero, L_carry);
|
||||
|
|
||||
movl(Address(z, kdx, Address::times_4, 0), carry);
|
||||
shrq(carry, 32);
|
||||
subl(kdx, 1);
|
||||
|
|
||||
bind(L_carry);
|
||||
movl(Address(z, kdx, Address::times_4, 0), carry);
|
||||
|
|
||||
// Second and third (nested) loops.
|
||||
//
|
||||
// for (int i = xstart-1; i >= 0; i--) { // Second loop
|
||||
// carry = 0;
|
||||
// for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
|
||||
// long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
|
||||
// (z[k] & LONG_MASK) + carry;
|
||||
// z[k] = (int)product;
|
||||
// carry = product >>> 32;
|
||||
// }
|
||||
// z[i] = (int)carry;
|
||||
// }
|
||||
//
|
||||
// i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx
|
||||
|
|
||||
// jdx shares tmp1 with idx, which is no longer needed after the first loop.
const Register jdx = tmp1;
|
||||
|
|
||||
bind(L_second_loop);
|
||||
xorl(carry, carry); // carry = 0;
|
||||
movl(jdx, ylen); // j = ystart+1
|
||||
|
|
||||
subl(xstart, 1); // i = xstart-1;
|
||||
jcc(Assembler::negative, L_done);
|
||||
|
|
||||
// z is temporarily advanced for the inner loop; the original pointer is
// restored by pop(z) after the loop.
push (z);
|
||||
|
|
||||
Label L_last_x;
|
||||
lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j
|
||||
subl(xstart, 1); // i = xstart-1;
|
||||
jcc(Assembler::negative, L_last_x);
|
||||
|
|
||||
// Load the next 64-bit chunk of x: into rdx for the BMI2 loop (implicit
// mulxq operand), into x_xstart otherwise.
if (UseBMI2Instructions) {
|
||||
movq(rdx, Address(x, xstart, Address::times_4, 0));
|
||||
rorxq(rdx, rdx, 32); // convert big-endian to little-endian
|
||||
} else {
|
||||
movq(x_xstart, Address(x, xstart, Address::times_4, 0));
|
||||
rorq(x_xstart, 32); // convert big-endian to little-endian
|
||||
}
|
||||
|
|
||||
Label L_third_loop_prologue;
|
||||
bind(L_third_loop_prologue);
|
||||
|
|
||||
// x, xstart and ylen are saved because the inner loops receive x and ylen
// as scratch registers (and the BMI2 variant also tmp3, i.e. xstart).
push (x);
|
||||
push (xstart);
|
||||
push (ylen);
|
||||
|
|
||||
|
|
||||
if (UseBMI2Instructions) {
|
||||
multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4);
|
||||
} else { // !UseBMI2Instructions
|
||||
multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x);
|
||||
}
|
||||
|
|
||||
// Pops mirror the pushes above in reverse order. The saved xstart value is
// deliberately popped into xlen (a scratch register at this point) and then
// copied to tmp3 below.
pop(ylen);
|
||||
pop(xlen);
|
||||
pop(x);
|
||||
pop(z);
|
||||
|
|
||||
// Store the carry of this pass: low 32 bits at z[xstart+1] and, if that
// index is still non-negative, the high 32 bits at z[xstart].
movl(tmp3, xlen);
|
||||
addl(tmp3, 1);
|
||||
movl(Address(z, tmp3, Address::times_4, 0), carry);
|
||||
subl(tmp3, 1);
|
||||
jccb(Assembler::negative, L_done);
|
||||
|
|
||||
shrq(carry, 32);
|
||||
movl(Address(z, tmp3, Address::times_4, 0), carry);
|
||||
jmp(L_second_loop);
|
||||
|
|
||||
// Next infrequent code is moved outside loops.
|
||||
// Only one 32-bit word of x is left: load x[0] with a 32-bit move.
bind(L_last_x);
|
||||
if (UseBMI2Instructions) {
|
||||
movl(rdx, Address(x, 0));
|
||||
} else {
|
||||
movl(x_xstart, Address(x, 0));
|
||||
}
|
||||
jmp(L_third_loop_prologue);
|
||||
|
|
||||
bind(L_done);
|
||||
|
|
||||
// Restore the registers saved on entry, in reverse push order.
pop(zlen);
|
||||
pop(xlen);
|
||||
|
|
||||
pop(tmp5);
|
||||
pop(tmp4);
|
||||
pop(tmp3);
|
||||
pop(tmp2);
|
||||
pop(tmp1);
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Emits code to update CRC-32 with a byte value according to constants in table
|
||||
*
|
||||
|
|
|
@ -1221,6 +1221,28 @@ public:
|
|||
XMMRegister tmp1, XMMRegister tmp2, XMMRegister tmp3,
|
||||
XMMRegister tmp4, Register tmp5, Register result);
|
||||
|
||||
#ifdef _LP64
|
||||
void add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2);
|
||||
void multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
|
||||
Register y, Register y_idx, Register z,
|
||||
Register carry, Register product,
|
||||
Register idx, Register kdx);
|
||||
void multiply_add_128_x_128(Register x_xstart, Register y, Register z,
|
||||
Register yz_idx, Register idx,
|
||||
Register carry, Register product, int offset);
|
||||
void multiply_128_x_128_bmi2_loop(Register y, Register z,
|
||||
Register carry, Register carry2,
|
||||
Register idx, Register jdx,
|
||||
Register yz_idx1, Register yz_idx2,
|
||||
Register tmp, Register tmp3, Register tmp4);
|
||||
void multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
|
||||
Register yz_idx, Register idx, Register jdx,
|
||||
Register carry, Register product,
|
||||
Register carry2);
|
||||
void multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
|
||||
Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5);
|
||||
#endif
|
||||
|
||||
// CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
|
||||
void update_byte_crc32(Register crc, Register val, Register table);
|
||||
void kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp);
|
||||
|
|
|
@ -3677,6 +3677,70 @@ class StubGenerator: public StubCodeGenerator {
|
|||
return start;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Generates the "multiplyToLen" stub and returns its entry address.
*
* Arguments:
|
||||
*
|
||||
* Input:
|
||||
* c_rarg0 - x address
|
||||
* c_rarg1 - x length
|
||||
* c_rarg2 - y address
|
||||
* c_rarg3 - y length
|
||||
* not Win64
|
||||
* c_rarg4 - z address
|
||||
* c_rarg5 - z length
|
||||
* Win64
|
||||
* rsp+40 - z address
|
||||
* rsp+48 - z length
|
||||
*/
|
||||
address generate_multiplyToLen() {
|
||||
__ align(CodeEntryAlignment);
|
||||
StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
|
||||
|
|
||||
address start = __ pc();
|
||||
// Win64: rcx, rdx, r8, r9 (c_rarg0, c_rarg1, ...)
|
||||
// Unix: rdi, rsi, rdx, rcx, r8, r9 (c_rarg0, c_rarg1, ...)
|
||||
// Registers in which multiply_to_len() receives its operands.
const Register x = rdi;
|
||||
const Register xlen = rax;
|
||||
const Register y = rsi;
|
||||
const Register ylen = rcx;
|
||||
const Register z = r8;
|
||||
const Register zlen = r11;
|
||||
|
|
||||
// Next registers will be saved on stack in multiply_to_len().
|
||||
const Register tmp1 = r12;
|
||||
const Register tmp2 = r13;
|
||||
const Register tmp3 = r14;
|
||||
const Register tmp4 = r15;
|
||||
const Register tmp5 = rbx;
|
||||
|
|
||||
BLOCK_COMMENT("Entry:");
|
||||
__ enter(); // required for proper stackwalking of RuntimeStub frame
|
||||
|
|
||||
#ifndef _WIN64
|
||||
// Copy zlen out of r9 first: r9 may be overwritten by setup_arg_regs()
// (see the note on r9/r10 below).
__ movptr(zlen, r9); // Save r9 in r11 - zlen
|
||||
#endif
|
||||
setup_arg_regs(4); // x => rdi, xlen => rsi, y => rdx
|
||||
// ylen => rcx, z => r8, zlen => r11
|
||||
// r9 and r10 may be used to save non-volatile registers
|
||||
#ifdef _WIN64
|
||||
// last 2 arguments (#4, #5) are on stack on Win64
|
||||
// NOTE(review): these offsets are one word larger than the rsp+40 / rsp+48
// given in the header comment, presumably to account for the word pushed by
// enter() - confirm.
__ movptr(z, Address(rsp, 6 * wordSize));
|
||||
__ movptr(zlen, Address(rsp, 7 * wordSize));
|
||||
#endif
|
||||
|
|
||||
// Order matters: xlen must be copied out of rsi before rsi is overwritten
// with y (which arrives in rdx).
__ movptr(xlen, rsi);
|
||||
__ movptr(y, rdx);
|
||||
__ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5);
|
||||
|
|
||||
restore_arg_regs();
|
||||
|
|
||||
__ leave(); // required for proper stackwalking of RuntimeStub frame
|
||||
__ ret(0);
|
||||
|
|
||||
return start;
|
||||
}
|
||||
|
||||
#undef __
|
||||
#define __ masm->
|
||||
|
||||
|
@ -3917,6 +3981,11 @@ class StubGenerator: public StubCodeGenerator {
|
|||
generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
|
||||
&StubRoutines::_safefetchN_fault_pc,
|
||||
&StubRoutines::_safefetchN_continuation_pc);
|
||||
#ifdef COMPILER2
|
||||
if (UseMultiplyToLenIntrinsic) {
|
||||
StubRoutines::_multiplyToLen = generate_multiplyToLen();
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
public:
|
||||
|
|
|
@ -485,7 +485,7 @@ void VM_Version::get_processor_features() {
|
|||
}
|
||||
|
||||
char buf[256];
|
||||
jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
cores_per_cpu(), threads_per_core(),
|
||||
cpu_family(), _model, _stepping,
|
||||
(supports_cmov() ? ", cmov" : ""),
|
||||
|
@ -514,7 +514,8 @@ void VM_Version::get_processor_features() {
|
|||
(supports_tscinv_bit() ? ", tscinvbit": ""),
|
||||
(supports_tscinv() ? ", tscinv": ""),
|
||||
(supports_bmi1() ? ", bmi1" : ""),
|
||||
(supports_bmi2() ? ", bmi2" : ""));
|
||||
(supports_bmi2() ? ", bmi2" : ""),
|
||||
(supports_adx() ? ", adx" : ""));
|
||||
_features_str = os::strdup(buf);
|
||||
|
||||
// UseSSE is set to the smaller of what hardware supports and what
|
||||
|
@ -566,7 +567,7 @@ void VM_Version::get_processor_features() {
|
|||
}
|
||||
} else if (UseCRC32Intrinsics) {
|
||||
if (!FLAG_IS_DEFAULT(UseCRC32Intrinsics))
|
||||
warning("CRC32 Intrinsics requires AVX and CLMUL instructions (not available on this CPU)");
|
||||
warning("CRC32 Intrinsics requires CLMUL instructions (not available on this CPU)");
|
||||
FLAG_SET_DEFAULT(UseCRC32Intrinsics, false);
|
||||
}
|
||||
|
||||
|
@ -689,7 +690,20 @@ void VM_Version::get_processor_features() {
|
|||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef _LP64
|
||||
if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
|
||||
UseMultiplyToLenIntrinsic = true;
|
||||
}
|
||||
#else
|
||||
if (UseMultiplyToLenIntrinsic) {
|
||||
if (!FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) {
|
||||
warning("multiplyToLen intrinsic is not available in 32-bit VM");
|
||||
}
|
||||
FLAG_SET_DEFAULT(UseMultiplyToLenIntrinsic, false);
|
||||
}
|
||||
#endif
|
||||
#endif // COMPILER2
|
||||
|
||||
// On new cpus instructions which update whole XMM register should be used
|
||||
// to prevent partial register stall due to dependencies on high half.
|
||||
|
@ -832,6 +846,9 @@ void VM_Version::get_processor_features() {
|
|||
}
|
||||
}
|
||||
}
|
||||
if(FLAG_IS_DEFAULT(AllocatePrefetchInstr) && supports_3dnow_prefetch()) {
|
||||
AllocatePrefetchInstr = 3;
|
||||
}
|
||||
}
|
||||
|
||||
// Use count leading zeros count instruction if available.
|
||||
|
@ -844,25 +861,37 @@ void VM_Version::get_processor_features() {
|
|||
FLAG_SET_DEFAULT(UseCountLeadingZerosInstruction, false);
|
||||
}
|
||||
|
||||
if (supports_bmi1()) {
|
||||
if (FLAG_IS_DEFAULT(UseBMI1Instructions)) {
|
||||
UseBMI1Instructions = true;
|
||||
}
|
||||
} else if (UseBMI1Instructions) {
|
||||
warning("BMI1 instructions are not available on this CPU");
|
||||
FLAG_SET_DEFAULT(UseBMI1Instructions, false);
|
||||
}
|
||||
|
||||
// Use count trailing zeros instruction if available
|
||||
if (supports_bmi1()) {
|
||||
// tzcnt does not require VEX prefix
|
||||
if (FLAG_IS_DEFAULT(UseCountTrailingZerosInstruction)) {
|
||||
UseCountTrailingZerosInstruction = UseBMI1Instructions;
|
||||
UseCountTrailingZerosInstruction = true;
|
||||
}
|
||||
} else if (UseCountTrailingZerosInstruction) {
|
||||
warning("tzcnt instruction is not available on this CPU");
|
||||
FLAG_SET_DEFAULT(UseCountTrailingZerosInstruction, false);
|
||||
}
|
||||
|
||||
// BMI instructions use an encoding with VEX prefix.
|
||||
// VEX prefix is generated only when AVX > 0.
|
||||
if (supports_bmi1() && supports_avx()) {
|
||||
if (FLAG_IS_DEFAULT(UseBMI1Instructions)) {
|
||||
UseBMI1Instructions = true;
|
||||
}
|
||||
} else if (UseBMI1Instructions) {
|
||||
warning("BMI1 instructions are not available on this CPU (AVX is also required)");
|
||||
FLAG_SET_DEFAULT(UseBMI1Instructions, false);
|
||||
}
|
||||
|
||||
if (supports_bmi2() && supports_avx()) {
|
||||
if (FLAG_IS_DEFAULT(UseBMI2Instructions)) {
|
||||
UseBMI2Instructions = true;
|
||||
}
|
||||
} else if (UseBMI2Instructions) {
|
||||
warning("BMI2 instructions are not available on this CPU (AVX is also required)");
|
||||
FLAG_SET_DEFAULT(UseBMI2Instructions, false);
|
||||
}
|
||||
|
||||
// Use population count instruction if available.
|
||||
if (supports_popcnt()) {
|
||||
if (FLAG_IS_DEFAULT(UsePopCountInstruction)) {
|
||||
|
|
|
@ -209,7 +209,9 @@ public:
|
|||
erms : 1,
|
||||
: 1,
|
||||
rtm : 1,
|
||||
: 20;
|
||||
: 7,
|
||||
adx : 1,
|
||||
: 12;
|
||||
} bits;
|
||||
};
|
||||
|
||||
|
@ -260,7 +262,8 @@ protected:
|
|||
CPU_CLMUL = (1 << 21), // carryless multiply for CRC
|
||||
CPU_BMI1 = (1 << 22),
|
||||
CPU_BMI2 = (1 << 23),
|
||||
CPU_RTM = (1 << 24) // Restricted Transactional Memory instructions
|
||||
CPU_RTM = (1 << 24), // Restricted Transactional Memory instructions
|
||||
CPU_ADX = (1 << 25)
|
||||
} cpuFeatureFlags;
|
||||
|
||||
enum {
|
||||
|
@ -465,10 +468,16 @@ protected:
|
|||
}
|
||||
// Intel features.
|
||||
if(is_intel()) {
|
||||
if(_cpuid_info.sef_cpuid7_ebx.bits.adx != 0)
|
||||
result |= CPU_ADX;
|
||||
if(_cpuid_info.sef_cpuid7_ebx.bits.bmi2 != 0)
|
||||
result |= CPU_BMI2;
|
||||
if(_cpuid_info.ext_cpuid1_ecx.bits.lzcnt_intel != 0)
|
||||
result |= CPU_LZCNT;
|
||||
// for Intel, ecx.bits.misalignsse bit (bit 8) indicates support for prefetchw
|
||||
if (_cpuid_info.ext_cpuid1_ecx.bits.misalignsse != 0) {
|
||||
result |= CPU_3DNOW_PREFETCH;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
|
@ -625,6 +634,7 @@ public:
|
|||
static bool supports_rtm() { return (_cpuFeatures & CPU_RTM) != 0; }
|
||||
static bool supports_bmi1() { return (_cpuFeatures & CPU_BMI1) != 0; }
|
||||
static bool supports_bmi2() { return (_cpuFeatures & CPU_BMI2) != 0; }
|
||||
static bool supports_adx() { return (_cpuFeatures & CPU_ADX) != 0; }
|
||||
// Intel features
|
||||
static bool is_intel_family_core() { return is_intel() &&
|
||||
extended_cpu_family() == CPU_FAMILY_INTEL_CORE; }
|
||||
|
|
|
@ -275,4 +275,101 @@ inline void assert_different_registers(
|
|||
);
|
||||
}
|
||||
|
||||
// Debug-build check that the ten registers passed in are pairwise distinct.
// On failure, the assertion message lists every argument formatted via
// p2i()/INTPTR_FORMAT so the clash can be identified.
inline void assert_different_registers(
  AbstractRegister a,
  AbstractRegister b,
  AbstractRegister c,
  AbstractRegister d,
  AbstractRegister e,
  AbstractRegister f,
  AbstractRegister g,
  AbstractRegister h,
  AbstractRegister i,
  AbstractRegister j
) {
  // NOTE(review): ASSERT is assumed to be the macro that enables assert() in
  // this code base, so the scan below costs nothing in product builds — confirm.
#ifdef ASSERT
  const AbstractRegister regs[] = { a, b, c, d, e, f, g, h, i, j };
  const int count = (int)(sizeof(regs) / sizeof(regs[0]));

  // Compare every unordered pair once; stop at the first clash.
  bool all_different = true;
  for (int lhs = 0; lhs < count && all_different; lhs++) {
    for (int rhs = lhs + 1; rhs < count; rhs++) {
      if (regs[lhs] == regs[rhs]) {
        all_different = false;
        break;
      }
    }
  }

  assert(
    all_different,
    err_msg_res("registers must be different: a=" INTPTR_FORMAT ", b=" INTPTR_FORMAT
                ", c=" INTPTR_FORMAT ", d=" INTPTR_FORMAT ", e=" INTPTR_FORMAT
                ", f=" INTPTR_FORMAT ", g=" INTPTR_FORMAT ", h=" INTPTR_FORMAT
                ", i=" INTPTR_FORMAT ", j=" INTPTR_FORMAT "",
                p2i(a), p2i(b), p2i(c), p2i(d), p2i(e), p2i(f), p2i(g), p2i(h), p2i(i), p2i(j))
  );
#endif // ASSERT
}
|
||||
|
||||
// Debug-build check that the eleven registers passed in are pairwise distinct.
// On failure, the assertion message lists every argument formatted via
// p2i()/INTPTR_FORMAT so the clash can be identified.
inline void assert_different_registers(
  AbstractRegister a,
  AbstractRegister b,
  AbstractRegister c,
  AbstractRegister d,
  AbstractRegister e,
  AbstractRegister f,
  AbstractRegister g,
  AbstractRegister h,
  AbstractRegister i,
  AbstractRegister j,
  AbstractRegister k
) {
  // NOTE(review): ASSERT is assumed to be the macro that enables assert() in
  // this code base, so the scan below costs nothing in product builds — confirm.
#ifdef ASSERT
  const AbstractRegister regs[] = { a, b, c, d, e, f, g, h, i, j, k };
  const int count = (int)(sizeof(regs) / sizeof(regs[0]));

  // Compare every unordered pair once; stop at the first clash.
  bool all_different = true;
  for (int lhs = 0; lhs < count && all_different; lhs++) {
    for (int rhs = lhs + 1; rhs < count; rhs++) {
      if (regs[lhs] == regs[rhs]) {
        all_different = false;
        break;
      }
    }
  }

  assert(
    all_different,
    err_msg_res("registers must be different: a=" INTPTR_FORMAT ", b=" INTPTR_FORMAT
                ", c=" INTPTR_FORMAT ", d=" INTPTR_FORMAT ", e=" INTPTR_FORMAT
                ", f=" INTPTR_FORMAT ", g=" INTPTR_FORMAT ", h=" INTPTR_FORMAT
                ", i=" INTPTR_FORMAT ", j=" INTPTR_FORMAT ", k=" INTPTR_FORMAT "",
                p2i(a), p2i(b), p2i(c), p2i(d), p2i(e), p2i(f), p2i(g), p2i(h), p2i(i), p2i(j), p2i(k))
  );
#endif // ASSERT
}
|
||||
|
||||
// Debug-build check that the twelve registers passed in are pairwise distinct.
// On failure, the assertion message lists every argument formatted via
// p2i()/INTPTR_FORMAT so the clash can be identified.
inline void assert_different_registers(
  AbstractRegister a,
  AbstractRegister b,
  AbstractRegister c,
  AbstractRegister d,
  AbstractRegister e,
  AbstractRegister f,
  AbstractRegister g,
  AbstractRegister h,
  AbstractRegister i,
  AbstractRegister j,
  AbstractRegister k,
  AbstractRegister l
) {
  // NOTE(review): ASSERT is assumed to be the macro that enables assert() in
  // this code base, so the scan below costs nothing in product builds — confirm.
#ifdef ASSERT
  const AbstractRegister regs[] = { a, b, c, d, e, f, g, h, i, j, k, l };
  const int count = (int)(sizeof(regs) / sizeof(regs[0]));

  // Compare every unordered pair once; stop at the first clash.
  bool all_different = true;
  for (int lhs = 0; lhs < count && all_different; lhs++) {
    for (int rhs = lhs + 1; rhs < count; rhs++) {
      if (regs[lhs] == regs[rhs]) {
        all_different = false;
        break;
      }
    }
  }

  assert(
    all_different,
    err_msg_res("registers must be different: a=" INTPTR_FORMAT ", b=" INTPTR_FORMAT
                ", c=" INTPTR_FORMAT ", d=" INTPTR_FORMAT ", e=" INTPTR_FORMAT
                ", f=" INTPTR_FORMAT ", g=" INTPTR_FORMAT ", h=" INTPTR_FORMAT
                ", i=" INTPTR_FORMAT ", j=" INTPTR_FORMAT ", k=" INTPTR_FORMAT
                ", l=" INTPTR_FORMAT "",
                p2i(a), p2i(b), p2i(c), p2i(d), p2i(e), p2i(f), p2i(g), p2i(h), p2i(i), p2i(j), p2i(k), p2i(l))
  );
#endif // ASSERT
}
|
||||
|
||||
#endif // SHARE_VM_ASM_REGISTER_HPP
|
||||
|
|
|
@ -788,6 +788,11 @@
|
|||
do_name( encodeISOArray_name, "encodeISOArray") \
|
||||
do_signature(encodeISOArray_signature, "([CI[BII)I") \
|
||||
\
|
||||
do_class(java_math_BigInteger, "java/math/BigInteger") \
|
||||
do_intrinsic(_multiplyToLen, java_math_BigInteger, multiplyToLen_name, multiplyToLen_signature, F_R) \
|
||||
do_name( multiplyToLen_name, "multiplyToLen") \
|
||||
do_signature(multiplyToLen_signature, "([II[II[I)[I") \
|
||||
\
|
||||
/* java/lang/ref/Reference */ \
|
||||
do_intrinsic(_Reference_get, java_lang_ref_Reference, get_name, void_object_signature, F_R) \
|
||||
\
|
||||
|
|
|
@ -650,6 +650,9 @@
|
|||
product(bool, UseMathExactIntrinsics, true, \
|
||||
"Enables intrinsification of various java.lang.Math functions") \
|
||||
\
|
||||
product(bool, UseMultiplyToLenIntrinsic, false, \
|
||||
"Enables intrinsification of BigInteger.multiplyToLen()") \
|
||||
\
|
||||
product(bool, UseTypeSpeculation, true, \
|
||||
"Speculatively propagate types from profiles") \
|
||||
\
|
||||
|
|
|
@ -945,7 +945,8 @@ void ConnectionGraph::process_call_arguments(CallNode *call) {
|
|||
strcmp(call->as_CallLeaf()->_name, "sha256_implCompress") == 0 ||
|
||||
strcmp(call->as_CallLeaf()->_name, "sha256_implCompressMB") == 0 ||
|
||||
strcmp(call->as_CallLeaf()->_name, "sha512_implCompress") == 0 ||
|
||||
strcmp(call->as_CallLeaf()->_name, "sha512_implCompressMB") == 0)
|
||||
strcmp(call->as_CallLeaf()->_name, "sha512_implCompressMB") == 0 ||
|
||||
strcmp(call->as_CallLeaf()->_name, "multiplyToLen") == 0)
|
||||
))) {
|
||||
call->dump();
|
||||
fatal(err_msg_res("EA unexpected CallLeaf %s", call->as_CallLeaf()->_name));
|
||||
|
|
|
@ -285,6 +285,7 @@ class LibraryCallKit : public GraphKit {
|
|||
bool inline_updateCRC32();
|
||||
bool inline_updateBytesCRC32();
|
||||
bool inline_updateByteBufferCRC32();
|
||||
bool inline_multiplyToLen();
|
||||
};
|
||||
|
||||
|
||||
|
@ -293,8 +294,12 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) {
|
|||
vmIntrinsics::ID id = m->intrinsic_id();
|
||||
assert(id != vmIntrinsics::_none, "must be a VM intrinsic");
|
||||
|
||||
if (DisableIntrinsic[0] != '\0'
|
||||
&& strstr(DisableIntrinsic, vmIntrinsics::name_at(id)) != NULL) {
|
||||
ccstr disable_intr = NULL;
|
||||
|
||||
if ((DisableIntrinsic[0] != '\0'
|
||||
&& strstr(DisableIntrinsic, vmIntrinsics::name_at(id)) != NULL) ||
|
||||
(method_has_option_value("DisableIntrinsic", disable_intr)
|
||||
&& strstr(disable_intr, vmIntrinsics::name_at(id)) != NULL)) {
|
||||
// disabled by a user request on the command line:
|
||||
// example: -XX:DisableIntrinsic=_hashCode,_getClass
|
||||
return NULL;
|
||||
|
@ -477,6 +482,10 @@ CallGenerator* Compile::make_vm_intrinsic(ciMethod* m, bool is_virtual) {
|
|||
if (!UseAESIntrinsics) return NULL;
|
||||
break;
|
||||
|
||||
case vmIntrinsics::_multiplyToLen:
|
||||
if (!UseMultiplyToLenIntrinsic) return NULL;
|
||||
break;
|
||||
|
||||
case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt:
|
||||
case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt:
|
||||
if (!UseAESIntrinsics) return NULL;
|
||||
|
@ -876,6 +885,9 @@ bool LibraryCallKit::try_to_inline(int predicate) {
|
|||
case vmIntrinsics::_digestBase_implCompressMB:
|
||||
return inline_digestBase_implCompressMB(predicate);
|
||||
|
||||
case vmIntrinsics::_multiplyToLen:
|
||||
return inline_multiplyToLen();
|
||||
|
||||
case vmIntrinsics::_encodeISOArray:
|
||||
return inline_encodeISOArray();
|
||||
|
||||
|
@ -4924,6 +4936,106 @@ bool LibraryCallKit::inline_encodeISOArray() {
|
|||
return true;
|
||||
}
|
||||
|
||||
//-------------inline_multiplyToLen-----------------------------------
|
||||
bool LibraryCallKit::inline_multiplyToLen() {
|
||||
assert(UseMultiplyToLenIntrinsic, "not implementated on this platform");
|
||||
|
||||
address stubAddr = StubRoutines::multiplyToLen();
|
||||
if (stubAddr == NULL) {
|
||||
return false; // Intrinsic's stub is not implemented on this platform
|
||||
}
|
||||
const char* stubName = "multiplyToLen";
|
||||
|
||||
assert(callee()->signature()->size() == 5, "multiplyToLen has 5 parameters");
|
||||
|
||||
Node* x = argument(1);
|
||||
Node* xlen = argument(2);
|
||||
Node* y = argument(3);
|
||||
Node* ylen = argument(4);
|
||||
Node* z = argument(5);
|
||||
|
||||
const Type* x_type = x->Value(&_gvn);
|
||||
const Type* y_type = y->Value(&_gvn);
|
||||
const TypeAryPtr* top_x = x_type->isa_aryptr();
|
||||
const TypeAryPtr* top_y = y_type->isa_aryptr();
|
||||
if (top_x == NULL || top_x->klass() == NULL ||
|
||||
top_y == NULL || top_y->klass() == NULL) {
|
||||
// failed array check
|
||||
return false;
|
||||
}
|
||||
|
||||
BasicType x_elem = x_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
|
||||
BasicType y_elem = y_type->isa_aryptr()->klass()->as_array_klass()->element_type()->basic_type();
|
||||
if (x_elem != T_INT || y_elem != T_INT) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Set the original stack and the reexecute bit for the interpreter to reexecute
|
||||
// the bytecode that invokes BigInteger.multiplyToLen() if deoptimization happens
|
||||
// on the return from z array allocation in runtime.
|
||||
{ PreserveReexecuteState preexecs(this);
|
||||
jvms()->set_should_reexecute(true);
|
||||
|
||||
Node* x_start = array_element_address(x, intcon(0), x_elem);
|
||||
Node* y_start = array_element_address(y, intcon(0), y_elem);
|
||||
// 'x_start' points to x array + scaled xlen
|
||||
// 'y_start' points to y array + scaled ylen
|
||||
|
||||
// Allocate the result array
|
||||
Node* zlen = _gvn.transform(new AddINode(xlen, ylen));
|
||||
Node* klass_node = makecon(TypeKlassPtr::make(ciTypeArrayKlass::make(T_INT)));
|
||||
|
||||
IdealKit ideal(this);
|
||||
|
||||
#define __ ideal.
|
||||
Node* one = __ ConI(1);
|
||||
Node* zero = __ ConI(0);
|
||||
IdealVariable need_alloc(ideal), z_alloc(ideal); __ declarations_done();
|
||||
__ set(need_alloc, zero);
|
||||
__ set(z_alloc, z);
|
||||
__ if_then(z, BoolTest::eq, null()); {
|
||||
__ increment (need_alloc, one);
|
||||
} __ else_(); {
|
||||
// Update graphKit memory and control from IdealKit.
|
||||
sync_kit(ideal);
|
||||
Node* zlen_arg = load_array_length(z);
|
||||
// Update IdealKit memory and control from graphKit.
|
||||
__ sync_kit(this);
|
||||
__ if_then(zlen_arg, BoolTest::lt, zlen); {
|
||||
__ increment (need_alloc, one);
|
||||
} __ end_if();
|
||||
} __ end_if();
|
||||
|
||||
__ if_then(__ value(need_alloc), BoolTest::ne, zero); {
|
||||
// Update graphKit memory and control from IdealKit.
|
||||
sync_kit(ideal);
|
||||
Node * narr = new_array(klass_node, zlen, 1);
|
||||
// Update IdealKit memory and control from graphKit.
|
||||
__ sync_kit(this);
|
||||
__ set(z_alloc, narr);
|
||||
} __ end_if();
|
||||
|
||||
sync_kit(ideal);
|
||||
z = __ value(z_alloc);
|
||||
_gvn.set_type(z, TypeAryPtr::INTS);
|
||||
// Final sync IdealKit and GraphKit.
|
||||
final_sync(ideal);
|
||||
#undef __
|
||||
|
||||
Node* z_start = array_element_address(z, intcon(0), T_INT);
|
||||
|
||||
Node* call = make_runtime_call(RC_LEAF|RC_NO_FP,
|
||||
OptoRuntime::multiplyToLen_Type(),
|
||||
stubAddr, stubName, TypePtr::BOTTOM,
|
||||
x_start, xlen, y_start, ylen, z_start, zlen);
|
||||
} // original reexecute is set back here
|
||||
|
||||
C->set_has_split_ifs(true); // Has chance for split-if optimization
|
||||
set_result(z);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Calculate CRC32 for byte.
|
||||
* int java.util.zip.CRC32.update(int crc, int b)
|
||||
|
|
|
@ -922,6 +922,30 @@ const TypeFunc* OptoRuntime::digestBase_implCompressMB_Type() {
|
|||
return TypeFunc::make(domain, range);
|
||||
}
|
||||
|
||||
// Builds the TypeFunc describing the multiplyToLen stub call:
// six inputs (x, xlen, y, ylen, z, zlen) and no result value.
const TypeFunc* OptoRuntime::multiplyToLen_Type() {
  // Domain: three (non-null pointer, int length) pairs.
  const int argcnt = 6;
  const Type** domain_fields = TypeTuple::fields(argcnt);
  int argp = TypeFunc::Parms;
  domain_fields[argp++] = TypePtr::NOTNULL;    // x
  domain_fields[argp++] = TypeInt::INT;        // xlen
  domain_fields[argp++] = TypePtr::NOTNULL;    // y
  domain_fields[argp++] = TypeInt::INT;        // ylen
  domain_fields[argp++] = TypePtr::NOTNULL;    // z
  domain_fields[argp++] = TypeInt::INT;        // zlen
  assert(argp == TypeFunc::Parms+argcnt, "correct decoding");
  const TypeTuple* domain = TypeTuple::make(TypeFunc::Parms + argcnt, domain_fields);

  // Range: no result type needed, so the single slot is left NULL.
  const Type** range_fields = TypeTuple::fields(1);
  range_fields[TypeFunc::Parms + 0] = NULL;
  const TypeTuple* range = TypeTuple::make(TypeFunc::Parms, range_fields);

  return TypeFunc::make(domain, range);
}
|
||||
|
||||
|
||||
|
||||
//------------- Interpreter state access for on stack replacement
|
||||
const TypeFunc* OptoRuntime::osr_end_Type() {
|
||||
// create input type (domain)
|
||||
|
|
|
@ -310,6 +310,8 @@ private:
|
|||
static const TypeFunc* sha_implCompress_Type();
|
||||
static const TypeFunc* digestBase_implCompressMB_Type();
|
||||
|
||||
static const TypeFunc* multiplyToLen_Type();
|
||||
|
||||
static const TypeFunc* updateBytesCRC32_Type();
|
||||
|
||||
// leaf on stack replacement interpreter accessor types
|
||||
|
|
|
@ -135,6 +135,8 @@ address StubRoutines::_sha512_implCompressMB = NULL;
|
|||
address StubRoutines::_updateBytesCRC32 = NULL;
|
||||
address StubRoutines::_crc_table_adr = NULL;
|
||||
|
||||
address StubRoutines::_multiplyToLen = NULL;
|
||||
|
||||
double (* StubRoutines::_intrinsic_log )(double) = NULL;
|
||||
double (* StubRoutines::_intrinsic_log10 )(double) = NULL;
|
||||
double (* StubRoutines::_intrinsic_exp )(double) = NULL;
|
||||
|
|
|
@ -202,6 +202,8 @@ class StubRoutines: AllStatic {
|
|||
static address _updateBytesCRC32;
|
||||
static address _crc_table_adr;
|
||||
|
||||
static address _multiplyToLen;
|
||||
|
||||
// These are versions of the java.lang.Math methods which perform
|
||||
// the same operations as the intrinsic version. They are used for
|
||||
// constant folding in the compiler to ensure equivalence. If the
|
||||
|
@ -358,6 +360,8 @@ class StubRoutines: AllStatic {
|
|||
static address updateBytesCRC32() { return _updateBytesCRC32; }
|
||||
static address crc_table_addr() { return _crc_table_adr; }
|
||||
|
||||
static address multiplyToLen() {return _multiplyToLen; }
|
||||
|
||||
static address select_fill_function(BasicType t, bool aligned, const char* &name);
|
||||
|
||||
static address zero_aligned_words() { return _zero_aligned_words; }
|
||||
|
|
|
@ -811,6 +811,7 @@ typedef TwoOopHashtable<Symbol*, mtClass> SymbolTwoOopHashtable;
|
|||
static_field(StubRoutines, _cipherBlockChaining_decryptAESCrypt, address) \
|
||||
static_field(StubRoutines, _updateBytesCRC32, address) \
|
||||
static_field(StubRoutines, _crc_table_adr, address) \
|
||||
static_field(StubRoutines, _multiplyToLen, address) \
|
||||
\
|
||||
/*****************/ \
|
||||
/* SharedRuntime */ \
|
||||
|
|
|
@ -0,0 +1,113 @@
|
|||
/*
|
||||
* Copyright (c) 2014, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*
|
||||
*/
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @bug 8055494
|
||||
* @summary Add C2 x86 intrinsic for BigInteger::multiplyToLen() method
|
||||
*
|
||||
* @run main/othervm/timeout=600 -XX:-TieredCompilation -Xbatch
|
||||
* -XX:CompileCommand=exclude,TestMultiplyToLen::main
|
||||
* -XX:CompileCommand=option,TestMultiplyToLen::base_multiply,ccstr,DisableIntrinsic,_multiplyToLen
|
||||
* -XX:CompileCommand=option,java.math.BigInteger::multiply,ccstr,DisableIntrinsic,_multiplyToLen
|
||||
* -XX:CompileCommand=inline,java.math.BigInteger::multiply TestMultiplyToLen
|
||||
*/
|
||||
|
||||
import java.util.Random;
|
||||
import java.math.*;
|
||||
|
||||
public class TestMultiplyToLen {
|
||||
|
||||
// Avoid intrinsic by preventing inlining multiply() and multiplyToLen().
|
||||
public static BigInteger base_multiply(BigInteger op1, BigInteger op2) {
|
||||
return op1.multiply(op2);
|
||||
}
|
||||
|
||||
// Generate multiplyToLen() intrinsic by inlining multiply().
|
||||
public static BigInteger new_multiply(BigInteger op1, BigInteger op2) {
|
||||
return op1.multiply(op2);
|
||||
}
|
||||
|
||||
public static boolean bytecompare(BigInteger b1, BigInteger b2) {
|
||||
byte[] data1 = b1.toByteArray();
|
||||
byte[] data2 = b2.toByteArray();
|
||||
if (data1.length != data2.length)
|
||||
return false;
|
||||
for (int i = 0; i < data1.length; i++) {
|
||||
if (data1[i] != data2[i])
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
public static String stringify(BigInteger b) {
|
||||
String strout= "";
|
||||
byte [] data = b.toByteArray();
|
||||
for (int i = 0; i < data.length; i++) {
|
||||
strout += (String.format("%02x",data[i]) + " ");
|
||||
}
|
||||
return strout;
|
||||
}
|
||||
|
||||
public static void main(String args[]) throws Exception {
|
||||
|
||||
BigInteger oldsum = new BigInteger("0");
|
||||
BigInteger newsum = new BigInteger("0");
|
||||
|
||||
BigInteger b1, b2, oldres, newres;
|
||||
|
||||
Random rand = new Random();
|
||||
long seed = System.nanoTime();
|
||||
Random rand1 = new Random();
|
||||
long seed1 = System.nanoTime();
|
||||
rand.setSeed(seed);
|
||||
rand1.setSeed(seed1);
|
||||
|
||||
for (int j = 0; j < 1000000; j++) {
|
||||
int rand_int = rand1.nextInt(3136)+32;
|
||||
int rand_int1 = rand1.nextInt(3136)+32;
|
||||
b1 = new BigInteger(rand_int, rand);
|
||||
b2 = new BigInteger(rand_int1, rand);
|
||||
|
||||
oldres = base_multiply(b1,b2);
|
||||
newres = new_multiply(b1,b2);
|
||||
|
||||
oldsum = oldsum.add(oldres);
|
||||
newsum = newsum.add(newres);
|
||||
|
||||
if (!bytecompare(oldres,newres)) {
|
||||
System.out.print("mismatch for:b1:" + stringify(b1) + " :b2:" + stringify(b2) + " :oldres:" + stringify(oldres) + " :newres:" + stringify(newres));
|
||||
System.out.println(b1);
|
||||
System.out.println(b2);
|
||||
throw new Exception("Failed");
|
||||
}
|
||||
}
|
||||
if (!bytecompare(oldsum,newsum)) {
|
||||
System.out.println("Failure: oldsum:" + stringify(oldsum) + " newsum:" + stringify(newsum));
|
||||
throw new Exception("Failed");
|
||||
} else {
|
||||
System.out.println("Success");
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue