Vladimir Ivanov 2016-05-06 18:20:50 +03:00
commit 622a5ae6f0
112 changed files with 2044 additions and 803 deletions


@@ -8259,10 +8259,19 @@ void MacroAssembler::string_compare(Register str1, Register str2,
// Search for Non-ASCII character (Negative byte value) in a byte array,
// return true if it has any and false otherwise.
// ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
// @HotSpotIntrinsicCandidate
// private static boolean hasNegatives(byte[] ba, int off, int len) {
// for (int i = off; i < off + len; i++) {
// if (ba[i] < 0) {
// return true;
// }
// }
// return false;
// }
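Editor's note on the strategy below (a sketch, not part of the patch): a byte is negative exactly when its sign bit 0x80 is set, which is why the SSE/AVX2 paths vptest the data against a broadcast 0x80808080 mask and the AVX-512 path compares the bytes against a zero vector. A minimal Java model of that test:

// Editor's sketch: scalar model of the sign-bit test used by the vector paths.
public class HasNegativesDemo {
    static boolean hasNegatives(byte[] ba, int off, int len) {
        for (int i = off; i < off + len; i++) {
            if ((ba[i] & 0x80) != 0) {  // equivalent to ba[i] < 0
                return true;
            }
        }
        return false;
    }

    public static void main(String[] args) {
        System.out.println(hasNegatives(new byte[]{65, 66, 67}, 0, 3));           // false
        System.out.println(hasNegatives(new byte[]{65, (byte) 0xC3, 67}, 0, 3));  // true
    }
}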
void MacroAssembler::has_negatives(Register ary1, Register len,
Register result, Register tmp1,
XMMRegister vec1, XMMRegister vec2) {
// rsi: byte array
// rcx: len
// rax: result
@@ -8275,79 +8284,161 @@ void MacroAssembler::has_negatives(Register ary1, Register len,
testl(len, len);
jcc(Assembler::zero, FALSE_LABEL);
if ((UseAVX > 2) && // AVX512
VM_Version::supports_avx512vlbw() &&
VM_Version::supports_bmi2()) {
set_vector_masking(); // opening of the stub context for programming mask registers
Label test_64_loop, test_tail;
Register tmp3_aliased = len;
movl(tmp1, len);
vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);
andl(tmp1, 64 - 1); // tail count (in chars) 0x3F
andl(len, ~(64 - 1)); // vector count (in chars)
jccb(Assembler::zero, test_tail);
lea(ary1, Address(ary1, len, Address::times_1));
negptr(len);
bind(test_64_loop);
// Check whether any of our 64 byte-sized elements is negative
evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
kortestql(k2, k2);
jcc(Assembler::notZero, TRUE_LABEL);
addptr(len, 64);
jccb(Assembler::notZero, test_64_loop);
bind(test_tail);
// bail out when there is nothing to be done
testl(tmp1, -1);
jcc(Assembler::zero, FALSE_LABEL);
// Save k1
kmovql(k3, k1);
// ~(~0 << len) applied up to two times (for 32-bit scenario)
#ifdef _LP64
mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
shlxq(tmp3_aliased, tmp3_aliased, tmp1);
notq(tmp3_aliased);
kmovql(k1, tmp3_aliased);
#else
Label k_init;
jmp(k_init);
// We cannot read 64 bits from a general-purpose register, so we move the
// data required to compose 64 1's into the instruction stream.
// We emit a 64-byte-wide series of elements 0..63, which is later used as
// a compare target against the tail count held in the tmp1 register.
// The result is the k1 register holding tmp1 consecutive 1's, counting
// from the least significant bit.
address tmp = pc();
emit_int64(0x0706050403020100);
emit_int64(0x0F0E0D0C0B0A0908);
emit_int64(0x1716151413121110);
emit_int64(0x1F1E1D1C1B1A1918);
emit_int64(0x2726252423222120);
emit_int64(0x2F2E2D2C2B2A2928);
emit_int64(0x3736353433323130);
emit_int64(0x3F3E3D3C3B3A3938);
bind(k_init);
lea(len, InternalAddress(tmp));
// create mask to test for negative byte inside a vector
evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
evpcmpgtb(k1, vec1, Address(len, 0), Assembler::AVX_512bit);
#endif
evpcmpgtb(k2, k1, vec2, Address(ary1, 0), Assembler::AVX_512bit);
ktestq(k2, k1);
// Restore k1
kmovql(k1, k3);
jcc(Assembler::notZero, TRUE_LABEL);
jmp(FALSE_LABEL);
clear_vector_masking(); // closing of the stub context for programming mask registers
}
else {
movl(result, len); // copy
if (UseAVX == 2 && UseSSE >= 2) {
// With AVX2, use 32-byte vector compare
Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
// Compare 32-byte vectors
andl(result, 0x0000001f); // tail count (in bytes)
andl(len, 0xffffffe0); // vector count (in bytes)
jccb(Assembler::zero, COMPARE_TAIL);
lea(ary1, Address(ary1, len, Address::times_1));
negptr(len);
movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector
movdl(vec2, tmp1);
vpbroadcastd(vec2, vec2);
bind(COMPARE_WIDE_VECTORS);
vmovdqu(vec1, Address(ary1, len, Address::times_1));
vptest(vec1, vec2);
jccb(Assembler::notZero, TRUE_LABEL);
addptr(len, 32);
jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
testl(result, result);
jccb(Assembler::zero, FALSE_LABEL);
vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
vptest(vec1, vec2);
jccb(Assembler::notZero, TRUE_LABEL);
jmpb(FALSE_LABEL);
bind(COMPARE_TAIL); // len is zero
movl(len, result);
// Fallthru to tail compare
}
else if (UseSSE42Intrinsics) {
assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
// With SSE4.2, use double quad vector compare
Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
// Compare 16-byte vectors
andl(result, 0x0000000f); // tail count (in bytes)
andl(len, 0xfffffff0); // vector count (in bytes)
jccb(Assembler::zero, COMPARE_TAIL);
lea(ary1, Address(ary1, len, Address::times_1));
negptr(len);
movl(tmp1, 0x80808080);
movdl(vec2, tmp1);
pshufd(vec2, vec2, 0);
bind(COMPARE_WIDE_VECTORS);
movdqu(vec1, Address(ary1, len, Address::times_1));
ptest(vec1, vec2);
jccb(Assembler::notZero, TRUE_LABEL);
addptr(len, 16);
jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
testl(result, result);
jccb(Assembler::zero, FALSE_LABEL);
movdqu(vec1, Address(ary1, result, Address::times_1, -16));
ptest(vec1, vec2);
jccb(Assembler::notZero, TRUE_LABEL);
jmpb(FALSE_LABEL);
bind(COMPARE_TAIL); // len is zero
movl(len, result);
// Fallthru to tail compare
}
}
// Compare 4-byte vectors
andl(len, 0xfffffffc); // vector count (in bytes)
jccb(Assembler::zero, COMPARE_CHAR);
@@ -8395,7 +8486,6 @@ void MacroAssembler::has_negatives(Register ary1, Register len,
vpxor(vec2, vec2);
}
}
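The AVX-512 tail handling above builds its k-register mask as ~(~0 << n), giving n low-order 1's so the masked compare touches only the n remaining bytes. A minimal Java sketch of the shlxq/notq arithmetic (the method name is illustrative):

// Editor's sketch of the ~(~0 << n) tail-mask computation.
public class TailMaskDemo {
    // n low-order 1's, used as the k-register compare mask for the tail.
    static long tailMask(int n) {
        if (n <= 0 || n >= 64) throw new IllegalArgumentException("expect 1..63");
        return ~(~0L << n);
    }

    public static void main(String[] args) {
        System.out.println(Long.toBinaryString(tailMask(3)));   // 111
        System.out.println(Long.toBinaryString(tailMask(63)));  // 63 ones
    }
}

Note that both the Java shift and x86 SHLX reduce the shift count mod 64, so n = 64 would yield 0 rather than all-ones; the stub never hits that case, because a zero tail count bails out earlier and full 64-byte chunks stay in the vector loop.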
// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
void MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
Register limit, Register result, Register chr,
@@ -8841,10 +8931,23 @@ void MacroAssembler::generate_fill(BasicType t, bool aligned,
}
// encode char[] to byte[] in ISO_8859_1
// @HotSpotIntrinsicCandidate
// private static int implEncodeISOArray(byte[] sa, int sp,
//                                       byte[] da, int dp, int len) {
//   int i = 0;
//   for (; i < len; i++) {
//     char c = StringUTF16.getChar(sa, sp++);
//     if (c > '\u00FF')
//       break;
//     da[dp++] = (byte)c;
//   }
//   return i;
// }
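For reference, the quoted fallback stops at the first char above '\u00FF' and reports how many chars it managed to encode. A self-contained Java model (editor's sketch; a char[] stands in for the byte[]-backed UTF-16 string of the real code):

public class EncodeISODemo {
    // Returns the number of chars written before the first char that
    // does not fit in a single ISO-8859-1 byte.
    static int encodeISO(char[] sa, int sp, byte[] da, int dp, int len) {
        int i = 0;
        for (; i < len; i++) {
            char c = sa[sp++];
            if (c > '\u00FF')
                break;          // caller handles the remaining chars
            da[dp++] = (byte) c;
        }
        return i;
    }

    public static void main(String[] args) {
        byte[] out = new byte[3];
        System.out.println(encodeISO("ab\u0100".toCharArray(), 0, out, 0, 3)); // 2
    }
}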
void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
XMMRegister tmp1Reg, XMMRegister tmp2Reg,
XMMRegister tmp3Reg, XMMRegister tmp4Reg,
Register tmp5, Register result) {
// rsi: src
// rdi: dst
// rdx: len
@@ -8859,6 +8962,7 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
// check for zero length
testl(len, len);
jcc(Assembler::zero, L_done);
movl(result, len);
// Setup pointers
@@ -8967,6 +9071,7 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
bind(L_copy_1_char_exit);
addptr(result, len); // len is negative count of not processed elements
bind(L_done);
}
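The closing addptr(result, len) works because the loops in this file run their index from -count up to zero, so a single add both advances the index and produces the loop-exit condition; on an early exit, len still holds the negative number of unprocessed elements. A Java sketch of the idiom:

// Editor's sketch of the negative-index loop idiom used throughout this file.
public class NegativeIndexDemo {
    static int countLeadingNonNegatives(byte[] a) {
        int result = a.length;        // movl(result, len)
        int i = -a.length;            // lea(...); negptr(len)
        for (; i != 0; i++) {         // addptr(len, ...); jcc(notZero, loop)
            if (a[a.length + i] < 0)
                break;                // i is now the negative count of leftovers
        }
        return result + i;            // addptr(result, len): elements processed
    }

    public static void main(String[] args) {
        System.out.println(countLeadingNonNegatives(new byte[]{1, 2, -3, 4})); // 2
    }
}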
@@ -9478,8 +9583,8 @@ void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register
notq(tmp2);
kmovql(k1, tmp2);
evmovdqub(rymm0, k1, Address(obja, result), Assembler::AVX_512bit);
evpcmpeqb(k7, k1, rymm0, Address(objb, result), Assembler::AVX_512bit);
ktestql(k7, k1);
// Restore k1
@@ -10838,13 +10943,24 @@ void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Registe
#undef BIND
#undef BLOCK_COMMENT
// Compress char[] array to byte[].
// ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
// @HotSpotIntrinsicCandidate
// private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
// for (int i = 0; i < len; i++) {
// int c = src[srcOff++];
// if (c >>> 8 != 0) {
// return 0;
// }
// dst[dstOff++] = (byte)c;
// }
// return len;
// }
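The quoted code, the SSE path's 0xff00ff00 mask, and the AVX-512 path's unsigned compare against a broadcast 0x00FF are three spellings of the same c > 0xFF test. A quick Java check of the equivalence (editor's sketch):

public class HighByteTestDemo {
    public static void main(String[] args) {
        for (int v = 0; v <= 0xFFFF; v++) {
            char c = (char) v;
            boolean shifted  = (c >>> 8) != 0;     // the quoted Java fallback
            boolean masked   = (c & 0xFF00) != 0;  // SSE 0xff00ff00 vector mask
            boolean compared = c > '\u00FF';       // AVX-512 evpcmpuw vs 0x00FF
            if (shifted != masked || masked != compared)
                throw new AssertionError("mismatch at " + v);
        }
        System.out.println("all 65536 char values agree");
    }
}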
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
XMMRegister tmp1Reg, XMMRegister tmp2Reg,
XMMRegister tmp3Reg, XMMRegister tmp4Reg,
Register tmp5, Register result) {
Label copy_chars_loop, return_length, return_zero, done, below_threshold;
// rsi: src
// rdi: dst
@@ -10861,11 +10977,141 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
// save length for return
push(len);
if ((UseAVX > 2) && // AVX512
VM_Version::supports_avx512vlbw() &&
VM_Version::supports_bmi2()) {
set_vector_masking(); // opening of the stub context for programming mask registers
Label copy_32_loop, copy_loop_tail, copy_just_portion_of_candidates;
// alignment
Label post_alignement;
// if the length of the string is less than 32, handle it the
// old-fashioned way
testl(len, -32);
jcc(Assembler::zero, below_threshold);
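// (editor's note) testl(len, -32) tests len & 0xFFFFFFE0, which for a
// non-negative length is zero exactly when len < 32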
// First check whether a character is compressible (<= 0xFF).
// Create mask to test for Unicode chars inside zmm vector
movl(result, 0x00FF);
evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);
testl(len, -64);
jcc(Assembler::zero, post_alignement);
// Save k1
kmovql(k3, k1);
movl(tmp5, dst);
andl(tmp5, (64 - 1));
negl(tmp5);
andl(tmp5, (64 - 1));
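// (editor's note) the movl/andl/negl/andl sequence above computes
// tmp5 = (-dst) & 63, the number of bytes from dst to the next 64-byte
// boundary (0 if dst is already aligned), so the main loop stores to a
// cache-line-aligned dst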
// bail out when there is nothing to be done
testl(tmp5, 0xFFFFFFFF);
jcc(Assembler::zero, post_alignement);
// ~(~0 << tmp5), where tmp5 is the # of elements to process in this step
movl(result, 0xFFFFFFFF);
shlxl(result, result, tmp5);
notl(result);
kmovdl(k1, result);
evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
ktestd(k2, k1);
jcc(Assembler::carryClear, copy_just_portion_of_candidates);
evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
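// (editor's note) the next two adds advance src by tmp5 chars = 2 * tmp5 bytes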
addptr(src, tmp5);
addptr(src, tmp5);
addptr(dst, tmp5);
subl(len, tmp5);
bind(post_alignement);
// end of alignment
movl(tmp5, len);
andl(tmp5, (32 - 1)); // tail count (in chars)
andl(len, ~(32 - 1)); // vector count (in chars)
jcc(Assembler::zero, copy_loop_tail);
lea(src, Address(src, len, Address::times_2));
lea(dst, Address(dst, len, Address::times_1));
negptr(len);
bind(copy_32_loop);
evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
evpcmpuw(k2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
kortestdl(k2, k2);
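// (editor's note) kortest sets CF iff the mask is all ones, i.e. all 32
// chars compared <= 0xFF; carryClear means only a prefix is compressible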
jcc(Assembler::carryClear, copy_just_portion_of_candidates);
// All elements in current processed chunk are valid candidates for
// compression. Write the truncated byte elements to memory.
evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
addptr(len, 32);
jcc(Assembler::notZero, copy_32_loop);
bind(copy_loop_tail);
// bail out when there is nothing to be done
testl(tmp5, 0xFFFFFFFF);
jcc(Assembler::zero, return_length);
// Save k1
kmovql(k3, k1);
movl(len, tmp5);
// ~(~0 << len), where len is the # of remaining elements to process
movl(result, 0xFFFFFFFF);
shlxl(result, result, len);
notl(result);
kmovdl(k1, result);
evmovdquw(tmp1Reg, k1, Address(src, 0), Assembler::AVX_512bit);
evpcmpuw(k2, k1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
ktestd(k2, k1);
jcc(Assembler::carryClear, copy_just_portion_of_candidates);
evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
// Restore k1
kmovql(k1, k3);
jmp(return_length);
bind(copy_just_portion_of_candidates);
kmovdl(tmp5, k2);
tzcntl(tmp5, tmp5);
// ~(~0 << tmp5), where tmp5 is the number of elements from the start of
// the chunk up to the first element larger than 0xFF
movl(result, 0xFFFFFFFF);
shlxl(result, result, tmp5);
notl(result);
kmovdl(k1, result);
evpmovwb(Address(dst, 0), k1, tmp1Reg, Assembler::AVX_512bit);
// Restore k1
kmovql(k1, k3);
jmp(return_zero);
clear_vector_masking(); // closing of the stub context for programming mask registers
}
if (UseSSE42Intrinsics) {
assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
Label copy_32_loop, copy_16, copy_tail;
bind(below_threshold);
movl(result, len);
movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors
// vectored compression
@@ -10947,10 +11193,16 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
}
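When the masked compare fails partway, the code above turns the k2 mask into the length of the still-compressible prefix with kmovdl + tzcntl, then stores just that prefix under a ~(~0 << n) mask before returning zero. The same arithmetic in Java (editor's sketch; the bit polarity of the example mask, bit i set when char i fails the <= 0xFF test, is an assumption for illustration):

public class CompressPrefixDemo {
    public static void main(String[] args) {
        int failMask = 0b100000;                          // chars 0..4 fit, char 5 does not
        int n = Integer.numberOfTrailingZeros(failMask);  // tzcntl: 5 leading good chars
        int storeMask = ~(~0 << n);                       // mask for the masked store
        System.out.println(n + " " + Integer.toBinaryString(storeMask)); // 5 11111
    }
}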
// Inflate byte[] array to char[].
// ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
// @HotSpotIntrinsicCandidate
// private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
// for (int i = 0; i < len; i++) {
// dst[dstOff++] = (char)(src[srcOff++] & 0xff);
// }
// }
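One detail worth noting in the quoted fallback: Java bytes are signed, so the & 0xff forces zero extension, which is exactly what the pmovzxbw/vpmovzxbw lanes below do in hardware. Without it the cast would sign-extend (editor's sketch):

public class InflateDemo {
    public static void main(String[] args) {
        byte b = (byte) 0xE9;            // 'é' in ISO-8859-1
        char wrong = (char) b;           // sign-extends: '\uFFE9'
        char right = (char) (b & 0xFF);  // zero-extends: '\u00E9'
        System.out.println(Integer.toHexString(wrong) + " " + Integer.toHexString(right));
    }
}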
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
XMMRegister tmp1, Register tmp2) {
Label copy_chars_loop, done, below_threshold;
// rsi: src
// rdi: dst
// rdx: len
@@ -10961,20 +11213,109 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len
// rdx holds length
assert_different_registers(src, dst, len, tmp2);
if ((UseAVX > 2) && // AVX512
VM_Version::supports_avx512vlbw() &&
VM_Version::supports_bmi2()) {
set_vector_masking(); // opening of the stub context for programming mask registers
Label copy_32_loop, copy_tail;
Register tmp3_aliased = len;
// if the length of the string is less than 16, handle it the
// old-fashioned way
testl(len, -16);
jcc(Assembler::zero, below_threshold);
// In order to use only one arithmetic operation for the main loop we use
// this pre-calculation
movl(tmp2, len);
andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
andl(len, -32); // vector count
jccb(Assembler::zero, copy_tail);
lea(src, Address(src, len, Address::times_1));
lea(dst, Address(dst, len, Address::times_2));
negptr(len);
// inflate 32 chars per iter
bind(copy_32_loop);
vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
addptr(len, 32);
jcc(Assembler::notZero, copy_32_loop);
bind(copy_tail);
// bail out when there is nothing to be done
testl(tmp2, -1); // we don't destroy the contents of tmp2 here
jcc(Assembler::zero, done);
// Save k1
kmovql(k2, k1);
// ~(~0 << length), where length is the # of remaining elements to process
movl(tmp3_aliased, -1);
shlxl(tmp3_aliased, tmp3_aliased, tmp2);
notl(tmp3_aliased);
kmovdl(k1, tmp3_aliased);
evpmovzxbw(tmp1, k1, Address(src, 0), Assembler::AVX_512bit);
evmovdquw(Address(dst, 0), k1, tmp1, Assembler::AVX_512bit);
// Restore k1
kmovql(k1, k2);
jmp(done);
clear_vector_masking(); // closing of the stub context for programming mask registers
}
if (UseSSE42Intrinsics) {
assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;
movl(tmp2, len);
if (UseAVX > 1) {
andl(tmp2, (16 - 1));
andl(len, -16);
jccb(Assembler::zero, copy_new_tail);
} else {
andl(tmp2, 0x00000007); // tail count (in chars)
andl(len, 0xfffffff8); // vector count (in chars)
jccb(Assembler::zero, copy_tail);
}
// vectored inflation
lea(src, Address(src, len, Address::times_1));
lea(dst, Address(dst, len, Address::times_2));
negptr(len);
if (UseAVX > 1) {
bind(copy_16_loop);
vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
vmovdqu(Address(dst, len, Address::times_2), tmp1);
addptr(len, 16);
jcc(Assembler::notZero, copy_16_loop);
bind(below_threshold);
bind(copy_new_tail);
if (UseAVX > 2) {
movl(tmp2, len);
} else {
movl(len, tmp2);
}
andl(tmp2, 0x00000007);
andl(len, 0xFFFFFFF8);
jccb(Assembler::zero, copy_tail);
pmovzxbw(tmp1, Address(src, 0));
movdqu(Address(dst, 0), tmp1);
addptr(src, 8);
addptr(dst, 2 * 8);
jmp(copy_tail, true);
}
// inflate 8 chars per iter
bind(copy_8_loop);
pmovzxbw(tmp1, Address(src, len, Address::times_1)); // unpack to 8 words
@@ -11013,7 +11354,6 @@ void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len
bind(done);
}
Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
switch (cond) {
// Note some conditions are synonyms for others