8005544: Use 256bit YMM registers in arraycopy stubs on x86

Use YMM registers in arraycopy and array_fill stubs.

Reviewed-by: roland, twisti
This commit is contained in:
Vladimir Kozlov 2013-01-03 16:30:47 -08:00
parent cfcd28fd9d
commit 7b9133b99c
5 changed files with 194 additions and 88 deletions

View file

@ -3656,6 +3656,15 @@ void Assembler::vextracti128h(Address dst, XMMRegister src) {
emit_int8(0x01);
}
// duplicate 4-bytes integer data from src into 8 locations in dest
void Assembler::vpbroadcastd(XMMRegister dst, XMMRegister src) {
assert(VM_Version::supports_avx2(), "");
bool vector256 = true;
int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38);
emit_int8(0x58);
emit_int8((unsigned char)(0xC0 | encode));
}
void Assembler::vzeroupper() {
assert(VM_Version::supports_avx(), "");
(void)vex_prefix_and_encode(xmm0, xmm0, xmm0, VEX_SIMD_NONE);

View file

@ -1754,6 +1754,9 @@ private:
void vextractf128h(Address dst, XMMRegister src);
void vextracti128h(Address dst, XMMRegister src);
// duplicate 4-bytes integer data from src into 8 locations in dest
void vpbroadcastd(XMMRegister dst, XMMRegister src);
// AVX instruction which is used to clear upper 128 bits of YMM registers and
// to avoid transaction penalty between AVX and SSE states. There is no
// penalty if legacy SSE instructions are encoded using VEX prefix because

View file

@ -6011,8 +6011,31 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
{
assert( UseSSE >= 2, "supported cpu only" );
Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
// Fill 32-byte chunks
movdl(xtmp, value);
if (UseAVX >= 2 && UseUnalignedLoadStores) {
// Fill 64-byte chunks
Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
vpbroadcastd(xtmp, xtmp);
subl(count, 16 << shift);
jcc(Assembler::less, L_check_fill_32_bytes);
align(16);
BIND(L_fill_64_bytes_loop);
vmovdqu(Address(to, 0), xtmp);
vmovdqu(Address(to, 32), xtmp);
addptr(to, 64);
subl(count, 16 << shift);
jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);
BIND(L_check_fill_32_bytes);
addl(count, 8 << shift);
jccb(Assembler::less, L_check_fill_8_bytes);
vmovdqu(Address(to, 0), xtmp);
addptr(to, 32);
subl(count, 8 << shift);
} else {
// Fill 32-byte chunks
pshufd(xtmp, xtmp, 0);
subl(count, 8 << shift);
@ -6034,6 +6057,7 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
addptr(to, 32);
subl(count, 8 << shift);
jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
}
BIND(L_check_fill_8_bytes);
addl(count, 8 << shift);
jccb(Assembler::zero, L_exit);

View file

@ -797,6 +797,12 @@ class StubGenerator: public StubCodeGenerator {
__ BIND(L_copy_64_bytes_loop);
if (UseUnalignedLoadStores) {
if (UseAVX >= 2) {
__ vmovdqu(xmm0, Address(from, 0));
__ vmovdqu(Address(from, to_from, Address::times_1, 0), xmm0);
__ vmovdqu(xmm1, Address(from, 32));
__ vmovdqu(Address(from, to_from, Address::times_1, 32), xmm1);
} else {
__ movdqu(xmm0, Address(from, 0));
__ movdqu(Address(from, to_from, Address::times_1, 0), xmm0);
__ movdqu(xmm1, Address(from, 16));
@ -805,7 +811,7 @@ class StubGenerator: public StubCodeGenerator {
__ movdqu(Address(from, to_from, Address::times_1, 32), xmm2);
__ movdqu(xmm3, Address(from, 48));
__ movdqu(Address(from, to_from, Address::times_1, 48), xmm3);
}
} else {
__ movq(xmm0, Address(from, 0));
__ movq(Address(from, to_from, Address::times_1, 0), xmm0);

View file

@ -1286,23 +1286,54 @@ class StubGenerator: public StubCodeGenerator {
// end_to - destination array end address
// qword_count - 64-bits element count, negative
// to - scratch
// L_copy_32_bytes - entry label
// L_copy_bytes - entry label
// L_copy_8_bytes - exit label
//
void copy_32_bytes_forward(Register end_from, Register end_to,
void copy_bytes_forward(Register end_from, Register end_to,
Register qword_count, Register to,
Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
Label& L_copy_bytes, Label& L_copy_8_bytes) {
DEBUG_ONLY(__ stop("enter at entry label, not here"));
Label L_loop;
__ align(OptoLoopAlignment);
__ BIND(L_loop);
if (UseUnalignedLoadStores) {
Label L_end;
// Copy 64-bytes per iteration
__ BIND(L_loop);
if (UseAVX >= 2) {
__ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
__ vmovdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
__ vmovdqu(xmm1, Address(end_from, qword_count, Address::times_8, -24));
__ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm1);
} else {
__ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -56));
__ movdqu(Address(end_to, qword_count, Address::times_8, -56), xmm0);
__ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, -40));
__ movdqu(Address(end_to, qword_count, Address::times_8, -40), xmm1);
__ movdqu(xmm2, Address(end_from, qword_count, Address::times_8, -24));
__ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm2);
__ movdqu(xmm3, Address(end_from, qword_count, Address::times_8, - 8));
__ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm3);
}
__ BIND(L_copy_bytes);
__ addptr(qword_count, 8);
__ jcc(Assembler::lessEqual, L_loop);
__ subptr(qword_count, 4); // sub(8) and add(4)
__ jccb(Assembler::greater, L_end);
// Copy trailing 32 bytes
if (UseAVX >= 2) {
__ vmovdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
__ vmovdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
} else {
__ movdqu(xmm0, Address(end_from, qword_count, Address::times_8, -24));
__ movdqu(Address(end_to, qword_count, Address::times_8, -24), xmm0);
__ movdqu(xmm1, Address(end_from, qword_count, Address::times_8, - 8));
__ movdqu(Address(end_to, qword_count, Address::times_8, - 8), xmm1);
}
__ addptr(qword_count, 4);
__ BIND(L_end);
} else {
// Copy 32-bytes per iteration
__ BIND(L_loop);
__ movq(to, Address(end_from, qword_count, Address::times_8, -24));
__ movq(Address(end_to, qword_count, Address::times_8, -24), to);
__ movq(to, Address(end_from, qword_count, Address::times_8, -16));
@ -1311,15 +1342,15 @@ class StubGenerator: public StubCodeGenerator {
__ movq(Address(end_to, qword_count, Address::times_8, - 8), to);
__ movq(to, Address(end_from, qword_count, Address::times_8, - 0));
__ movq(Address(end_to, qword_count, Address::times_8, - 0), to);
}
__ BIND(L_copy_32_bytes);
__ BIND(L_copy_bytes);
__ addptr(qword_count, 4);
__ jcc(Assembler::lessEqual, L_loop);
}
__ subptr(qword_count, 4);
__ jcc(Assembler::less, L_copy_8_bytes); // Copy trailing qwords
}
// Copy big chunks backward
//
// Inputs:
@ -1327,23 +1358,55 @@ class StubGenerator: public StubCodeGenerator {
// dest - destination array address
// qword_count - 64-bits element count
// to - scratch
// L_copy_32_bytes - entry label
// L_copy_bytes - entry label
// L_copy_8_bytes - exit label
//
void copy_32_bytes_backward(Register from, Register dest,
void copy_bytes_backward(Register from, Register dest,
Register qword_count, Register to,
Label& L_copy_32_bytes, Label& L_copy_8_bytes) {
Label& L_copy_bytes, Label& L_copy_8_bytes) {
DEBUG_ONLY(__ stop("enter at entry label, not here"));
Label L_loop;
__ align(OptoLoopAlignment);
__ BIND(L_loop);
if (UseUnalignedLoadStores) {
Label L_end;
// Copy 64-bytes per iteration
__ BIND(L_loop);
if (UseAVX >= 2) {
__ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 32));
__ vmovdqu(Address(dest, qword_count, Address::times_8, 32), xmm0);
__ vmovdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
__ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
} else {
__ movdqu(xmm0, Address(from, qword_count, Address::times_8, 48));
__ movdqu(Address(dest, qword_count, Address::times_8, 48), xmm0);
__ movdqu(xmm1, Address(from, qword_count, Address::times_8, 32));
__ movdqu(Address(dest, qword_count, Address::times_8, 32), xmm1);
__ movdqu(xmm2, Address(from, qword_count, Address::times_8, 16));
__ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm2);
__ movdqu(xmm3, Address(from, qword_count, Address::times_8, 0));
__ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm3);
}
__ BIND(L_copy_bytes);
__ subptr(qword_count, 8);
__ jcc(Assembler::greaterEqual, L_loop);
__ addptr(qword_count, 4); // add(8) and sub(4)
__ jccb(Assembler::less, L_end);
// Copy trailing 32 bytes
if (UseAVX >= 2) {
__ vmovdqu(xmm0, Address(from, qword_count, Address::times_8, 0));
__ vmovdqu(Address(dest, qword_count, Address::times_8, 0), xmm0);
} else {
__ movdqu(xmm0, Address(from, qword_count, Address::times_8, 16));
__ movdqu(Address(dest, qword_count, Address::times_8, 16), xmm0);
__ movdqu(xmm1, Address(from, qword_count, Address::times_8, 0));
__ movdqu(Address(dest, qword_count, Address::times_8, 0), xmm1);
}
__ subptr(qword_count, 4);
__ BIND(L_end);
} else {
// Copy 32-bytes per iteration
__ BIND(L_loop);
__ movq(to, Address(from, qword_count, Address::times_8, 24));
__ movq(Address(dest, qword_count, Address::times_8, 24), to);
__ movq(to, Address(from, qword_count, Address::times_8, 16));
@ -1352,10 +1415,11 @@ class StubGenerator: public StubCodeGenerator {
__ movq(Address(dest, qword_count, Address::times_8, 8), to);
__ movq(to, Address(from, qword_count, Address::times_8, 0));
__ movq(Address(dest, qword_count, Address::times_8, 0), to);
}
__ BIND(L_copy_32_bytes);
__ BIND(L_copy_bytes);
__ subptr(qword_count, 4);
__ jcc(Assembler::greaterEqual, L_loop);
}
__ addptr(qword_count, 4);
__ jcc(Assembler::greater, L_copy_8_bytes); // Copy trailing qwords
}
@ -1385,7 +1449,7 @@ class StubGenerator: public StubCodeGenerator {
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
Label L_copy_byte, L_exit;
const Register from = rdi; // source array address
const Register to = rsi; // destination array address
@ -1417,7 +1481,7 @@ class StubGenerator: public StubCodeGenerator {
__ lea(end_from, Address(from, qword_count, Address::times_8, -8));
__ lea(end_to, Address(to, qword_count, Address::times_8, -8));
__ negptr(qword_count); // make the count negative
__ jmp(L_copy_32_bytes);
__ jmp(L_copy_bytes);
// Copy trailing qwords
__ BIND(L_copy_8_bytes);
@ -1460,8 +1524,8 @@ class StubGenerator: public StubCodeGenerator {
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
// Copy in 32-bytes chunks
copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
// Copy in multi-bytes chunks
copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
__ jmp(L_copy_4_bytes);
return start;
@ -1488,7 +1552,7 @@ class StubGenerator: public StubCodeGenerator {
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_copy_2_bytes;
const Register from = rdi; // source array address
const Register to = rsi; // destination array address
const Register count = rdx; // elements count
@ -1531,10 +1595,10 @@ class StubGenerator: public StubCodeGenerator {
// Check for and copy trailing dword
__ BIND(L_copy_4_bytes);
__ testl(byte_count, 4);
__ jcc(Assembler::zero, L_copy_32_bytes);
__ jcc(Assembler::zero, L_copy_bytes);
__ movl(rax, Address(from, qword_count, Address::times_8));
__ movl(Address(to, qword_count, Address::times_8), rax);
__ jmp(L_copy_32_bytes);
__ jmp(L_copy_bytes);
// Copy trailing qwords
__ BIND(L_copy_8_bytes);
@ -1549,8 +1613,8 @@ class StubGenerator: public StubCodeGenerator {
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
// Copy in 32-bytes chunks
copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
// Copy in multi-bytes chunks
copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
restore_arg_regs();
inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr); // Update counter after rscratch1 is free
@ -1585,7 +1649,7 @@ class StubGenerator: public StubCodeGenerator {
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit;
Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes,L_copy_2_bytes,L_exit;
const Register from = rdi; // source array address
const Register to = rsi; // destination array address
const Register count = rdx; // elements count
@ -1616,7 +1680,7 @@ class StubGenerator: public StubCodeGenerator {
__ lea(end_from, Address(from, qword_count, Address::times_8, -8));
__ lea(end_to, Address(to, qword_count, Address::times_8, -8));
__ negptr(qword_count);
__ jmp(L_copy_32_bytes);
__ jmp(L_copy_bytes);
// Copy trailing qwords
__ BIND(L_copy_8_bytes);
@ -1652,8 +1716,8 @@ class StubGenerator: public StubCodeGenerator {
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
// Copy in 32-bytes chunks
copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
// Copy in multi-bytes chunks
copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
__ jmp(L_copy_4_bytes);
return start;
@ -1700,7 +1764,7 @@ class StubGenerator: public StubCodeGenerator {
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes;
Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes;
const Register from = rdi; // source array address
const Register to = rsi; // destination array address
const Register count = rdx; // elements count
@ -1735,10 +1799,10 @@ class StubGenerator: public StubCodeGenerator {
// Check for and copy trailing dword
__ BIND(L_copy_4_bytes);
__ testl(word_count, 2);
__ jcc(Assembler::zero, L_copy_32_bytes);
__ jcc(Assembler::zero, L_copy_bytes);
__ movl(rax, Address(from, qword_count, Address::times_8));
__ movl(Address(to, qword_count, Address::times_8), rax);
__ jmp(L_copy_32_bytes);
__ jmp(L_copy_bytes);
// Copy trailing qwords
__ BIND(L_copy_8_bytes);
@ -1753,8 +1817,8 @@ class StubGenerator: public StubCodeGenerator {
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
// Copy in 32-bytes chunks
copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
// Copy in multi-bytes chunks
copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
restore_arg_regs();
inc_counter_np(SharedRuntime::_jshort_array_copy_ctr); // Update counter after rscratch1 is free
@ -1790,7 +1854,7 @@ class StubGenerator: public StubCodeGenerator {
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
Label L_copy_32_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
Label L_copy_bytes, L_copy_8_bytes, L_copy_4_bytes, L_exit;
const Register from = rdi; // source array address
const Register to = rsi; // destination array address
const Register count = rdx; // elements count
@ -1826,7 +1890,7 @@ class StubGenerator: public StubCodeGenerator {
__ lea(end_from, Address(from, qword_count, Address::times_8, -8));
__ lea(end_to, Address(to, qword_count, Address::times_8, -8));
__ negptr(qword_count);
__ jmp(L_copy_32_bytes);
__ jmp(L_copy_bytes);
// Copy trailing qwords
__ BIND(L_copy_8_bytes);
@ -1853,8 +1917,8 @@ class StubGenerator: public StubCodeGenerator {
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
// Copy 32-bytes chunks
copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
// Copy in multi-bytes chunks
copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
__ jmp(L_copy_4_bytes);
return start;
@ -1882,7 +1946,7 @@ class StubGenerator: public StubCodeGenerator {
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
Label L_copy_32_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit;
Label L_copy_bytes, L_copy_8_bytes, L_copy_2_bytes, L_exit;
const Register from = rdi; // source array address
const Register to = rsi; // destination array address
const Register count = rdx; // elements count
@ -1916,10 +1980,10 @@ class StubGenerator: public StubCodeGenerator {
// Check for and copy trailing dword
__ testl(dword_count, 1);
__ jcc(Assembler::zero, L_copy_32_bytes);
__ jcc(Assembler::zero, L_copy_bytes);
__ movl(rax, Address(from, dword_count, Address::times_4, -4));
__ movl(Address(to, dword_count, Address::times_4, -4), rax);
__ jmp(L_copy_32_bytes);
__ jmp(L_copy_bytes);
// Copy trailing qwords
__ BIND(L_copy_8_bytes);
@ -1937,8 +2001,8 @@ class StubGenerator: public StubCodeGenerator {
__ leave(); // required for proper stackwalking of RuntimeStub frame
__ ret(0);
// Copy in 32-bytes chunks
copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
// Copy in multi-bytes chunks
copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
__ bind(L_exit);
if (is_oop) {
@ -1976,7 +2040,7 @@ class StubGenerator: public StubCodeGenerator {
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
Label L_copy_32_bytes, L_copy_8_bytes, L_exit;
Label L_copy_bytes, L_copy_8_bytes, L_exit;
const Register from = rdi; // source array address
const Register to = rsi; // destination array address
const Register qword_count = rdx; // elements count
@ -2008,7 +2072,7 @@ class StubGenerator: public StubCodeGenerator {
__ lea(end_from, Address(from, qword_count, Address::times_8, -8));
__ lea(end_to, Address(to, qword_count, Address::times_8, -8));
__ negptr(qword_count);
__ jmp(L_copy_32_bytes);
__ jmp(L_copy_bytes);
// Copy trailing qwords
__ BIND(L_copy_8_bytes);
@ -2027,8 +2091,8 @@ class StubGenerator: public StubCodeGenerator {
__ ret(0);
}
// Copy 64-byte chunks
copy_32_bytes_forward(end_from, end_to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
// Copy in multi-bytes chunks
copy_bytes_forward(end_from, end_to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
if (is_oop) {
__ BIND(L_exit);
@ -2065,7 +2129,7 @@ class StubGenerator: public StubCodeGenerator {
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
Label L_copy_32_bytes, L_copy_8_bytes, L_exit;
Label L_copy_bytes, L_copy_8_bytes, L_exit;
const Register from = rdi; // source array address
const Register to = rsi; // destination array address
const Register qword_count = rdx; // elements count
@ -2091,7 +2155,7 @@ class StubGenerator: public StubCodeGenerator {
gen_write_ref_array_pre_barrier(to, saved_count, dest_uninitialized);
}
__ jmp(L_copy_32_bytes);
__ jmp(L_copy_bytes);
// Copy trailing qwords
__ BIND(L_copy_8_bytes);
@ -2110,8 +2174,8 @@ class StubGenerator: public StubCodeGenerator {
__ ret(0);
}
// Copy in 32-bytes chunks
copy_32_bytes_backward(from, to, qword_count, rax, L_copy_32_bytes, L_copy_8_bytes);
// Copy in multi-bytes chunks
copy_bytes_backward(from, to, qword_count, rax, L_copy_bytes, L_copy_8_bytes);
if (is_oop) {
__ BIND(L_exit);