8144771: Use AVX3 instructions for string compare

Co-authored-by: Michael C Berg <michael.c.berg@intel.com>
Reviewed-by: kvn, thartmann
This commit is contained in:
Jan Civlin 2015-12-14 14:48:30 -08:00 committed by Vladimir Kozlov
parent 6f27a97d77
commit a08d3805f0
6 changed files with 234 additions and 103 deletions

View file

@ -7999,9 +7999,15 @@ void MacroAssembler::string_compare(Register str1, Register str2,
XMMRegister vec1, int ae) {
ShortBranchVerifier sbv(this);
Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3
int stride, stride2, adr_stride, adr_stride1, adr_stride2;
int stride2x2 = 0x40;
Address::ScaleFactor scale, scale1, scale2;
if (ae != StrIntrinsicNode::LL) {
stride2x2 = 0x20;
}
if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
shrl(cnt2, 1);
}
@ -8011,15 +8017,15 @@ void MacroAssembler::string_compare(Register str1, Register str2,
movl(result, cnt1);
subl(cnt1, cnt2);
push(cnt1);
cmov32(Assembler::lessEqual, cnt2, result);
cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)
// Is the minimum length zero?
testl(cnt2, cnt2);
jcc(Assembler::zero, LENGTH_DIFF_LABEL);
if (ae == StrIntrinsicNode::LL) {
// Load first bytes
load_unsigned_byte(result, Address(str1, 0));
load_unsigned_byte(cnt1, Address(str2, 0));
load_unsigned_byte(result, Address(str1, 0)); // result = str1[0]
load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0]
} else if (ae == StrIntrinsicNode::UU) {
// Load first characters
load_unsigned_short(result, Address(str1, 0));
@ -8060,7 +8066,10 @@ void MacroAssembler::string_compare(Register str1, Register str2,
assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
Label COMPARE_TAIL_LONG;
Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3
int pcmpmask = 0x19;
if (ae == StrIntrinsicNode::LL) {
pcmpmask &= ~0x01;
@ -8123,11 +8132,40 @@ void MacroAssembler::string_compare(Register str1, Register str2,
}
subl(result, stride2);
subl(cnt2, stride2);
jccb(Assembler::zero, COMPARE_WIDE_TAIL);
jcc(Assembler::zero, COMPARE_WIDE_TAIL);
negptr(result);
// In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
bind(COMPARE_WIDE_VECTORS_LOOP);
#ifdef _LP64
if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
cmpl(cnt2, stride2x2);
jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
testl(cnt2, stride2x2-1); // cnt2 holds the vector count
jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40
bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
} else {
vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
}
kortestql(k7, k7);
jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare
addptr(result, stride2x2); // update since we already compared at this addr
subl(cnt2, stride2x2); // and sub the size too
jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
vpxor(vec1, vec1);
jmpb(COMPARE_WIDE_TAIL);
}//if (VM_Version::supports_avx512vlbw())
#endif // _LP64
bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
vmovdqu(vec1, Address(str1, result, scale));
vpxor(vec1, Address(str2, result, scale));
@ -8136,7 +8174,7 @@ void MacroAssembler::string_compare(Register str1, Register str2,
vpxor(vec1, Address(str2, result, scale2));
}
vptest(vec1, vec1);
jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
addptr(result, stride2);
subl(cnt2, stride2);
jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
@ -8151,7 +8189,7 @@ void MacroAssembler::string_compare(Register str1, Register str2,
movl(result, stride2);
movl(cnt2, result);
negptr(result);
jmpb(COMPARE_WIDE_VECTORS_LOOP);
jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
// Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
bind(VECTOR_NOT_EQUAL);
@ -8295,6 +8333,34 @@ void MacroAssembler::string_compare(Register str1, Register str2,
}
jmpb(DONE_LABEL);
#ifdef _LP64
if (VM_Version::supports_avx512vlbw()) {
bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
kmovql(cnt1, k7);
notq(cnt1);
bsfq(cnt2, cnt1);
if (ae != StrIntrinsicNode::LL) {
// Divide diff by 2 to get number of chars
sarl(cnt2, 1);
}
addq(result, cnt2);
if (ae == StrIntrinsicNode::LL) {
load_unsigned_byte(cnt1, Address(str2, result));
load_unsigned_byte(result, Address(str1, result));
} else if (ae == StrIntrinsicNode::UU) {
load_unsigned_short(cnt1, Address(str2, result, scale));
load_unsigned_short(result, Address(str1, result, scale));
} else {
load_unsigned_short(cnt1, Address(str2, result, scale2));
load_unsigned_byte(result, Address(str1, result, scale1));
}
subl(result, cnt1);
jmpb(POP_LABEL);
}//if (VM_Version::supports_avx512vlbw())
#endif // _LP64
// Discard the stored length difference
bind(POP_LABEL);
pop(cnt1);
@ -8304,6 +8370,7 @@ void MacroAssembler::string_compare(Register str1, Register str2,
if(ae == StrIntrinsicNode::UL) {
negl(result);
}
}
// Search for Non-ASCII character (Negative byte value) in a byte array,