mirror of
https://github.com/openjdk/jdk.git
synced 2025-09-20 02:54:35 +02:00
8144771: Use AVX3 instructions for string compare
Co-authored-by: Michael C Berg <michael.c.berg@intel.com>
Reviewed-by: kvn, thartmann
This commit is contained in:
parent
6f27a97d77
commit
a08d3805f0
6 changed files with 234 additions and 103 deletions
|
@ -7999,9 +7999,15 @@ void MacroAssembler::string_compare(Register str1, Register str2,
|
|||
XMMRegister vec1, int ae) {
|
||||
ShortBranchVerifier sbv(this);
|
||||
Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
|
||||
Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3
|
||||
int stride, stride2, adr_stride, adr_stride1, adr_stride2;
|
||||
int stride2x2 = 0x40;
|
||||
Address::ScaleFactor scale, scale1, scale2;
|
||||
|
||||
if (ae != StrIntrinsicNode::LL) {
|
||||
stride2x2 = 0x20;
|
||||
}
|
||||
|
||||
if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
|
||||
shrl(cnt2, 1);
|
||||
}
|
||||
|
@ -8011,15 +8017,15 @@ void MacroAssembler::string_compare(Register str1, Register str2,
|
|||
movl(result, cnt1);
|
||||
subl(cnt1, cnt2);
|
||||
push(cnt1);
|
||||
cmov32(Assembler::lessEqual, cnt2, result);
|
||||
cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)
|
||||
|
||||
// Is the minimum length zero?
|
||||
testl(cnt2, cnt2);
|
||||
jcc(Assembler::zero, LENGTH_DIFF_LABEL);
|
||||
if (ae == StrIntrinsicNode::LL) {
|
||||
// Load first bytes
|
||||
load_unsigned_byte(result, Address(str1, 0));
|
||||
load_unsigned_byte(cnt1, Address(str2, 0));
|
||||
load_unsigned_byte(result, Address(str1, 0)); // result = str1[0]
|
||||
load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0]
|
||||
} else if (ae == StrIntrinsicNode::UU) {
|
||||
// Load first characters
|
||||
load_unsigned_short(result, Address(str1, 0));
|
||||
|
@ -8060,7 +8066,10 @@ void MacroAssembler::string_compare(Register str1, Register str2,
|
|||
assert(UseSSE >= 4, "SSE4 must be enabled for SSE4.2 intrinsics to be available");
|
||||
Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
|
||||
Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
|
||||
Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
|
||||
Label COMPARE_TAIL_LONG;
|
||||
Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3
|
||||
|
||||
int pcmpmask = 0x19;
|
||||
if (ae == StrIntrinsicNode::LL) {
|
||||
pcmpmask &= ~0x01;
|
||||
|
@ -8123,11 +8132,40 @@ void MacroAssembler::string_compare(Register str1, Register str2,
|
|||
}
|
||||
subl(result, stride2);
|
||||
subl(cnt2, stride2);
|
||||
jccb(Assembler::zero, COMPARE_WIDE_TAIL);
|
||||
jcc(Assembler::zero, COMPARE_WIDE_TAIL);
|
||||
negptr(result);
|
||||
|
||||
// In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
|
||||
bind(COMPARE_WIDE_VECTORS_LOOP);
|
||||
|
||||
#ifdef _LP64
|
||||
if (VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
|
||||
cmpl(cnt2, stride2x2);
|
||||
jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
|
||||
testl(cnt2, stride2x2-1); // cnt2 holds the vector count
|
||||
jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40
|
||||
|
||||
bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
|
||||
if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
|
||||
evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
|
||||
evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
|
||||
} else {
|
||||
vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
|
||||
evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
|
||||
}
|
||||
kortestql(k7, k7);
|
||||
jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare
|
||||
addptr(result, stride2x2); // update since we already compared at this addr
|
||||
subl(cnt2, stride2x2); // and sub the size too
|
||||
jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);
|
||||
|
||||
vpxor(vec1, vec1);
|
||||
jmpb(COMPARE_WIDE_TAIL);
|
||||
}//if (VM_Version::supports_avx512vlbw())
|
||||
#endif // _LP64
|
||||
|
||||
|
||||
bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
|
||||
if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
|
||||
vmovdqu(vec1, Address(str1, result, scale));
|
||||
vpxor(vec1, Address(str2, result, scale));
|
||||
|
@ -8136,7 +8174,7 @@ void MacroAssembler::string_compare(Register str1, Register str2,
|
|||
vpxor(vec1, Address(str2, result, scale2));
|
||||
}
|
||||
vptest(vec1, vec1);
|
||||
jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
|
||||
jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
|
||||
addptr(result, stride2);
|
||||
subl(cnt2, stride2);
|
||||
jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
|
||||
|
@ -8151,7 +8189,7 @@ void MacroAssembler::string_compare(Register str1, Register str2,
|
|||
movl(result, stride2);
|
||||
movl(cnt2, result);
|
||||
negptr(result);
|
||||
jmpb(COMPARE_WIDE_VECTORS_LOOP);
|
||||
jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);
|
||||
|
||||
// Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
|
||||
bind(VECTOR_NOT_EQUAL);
|
||||
|
@ -8295,6 +8333,34 @@ void MacroAssembler::string_compare(Register str1, Register str2,
|
|||
}
|
||||
jmpb(DONE_LABEL);
|
||||
|
||||
#ifdef _LP64
|
||||
if (VM_Version::supports_avx512vlbw()) {
|
||||
|
||||
bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);
|
||||
|
||||
kmovql(cnt1, k7);
|
||||
notq(cnt1);
|
||||
bsfq(cnt2, cnt1);
|
||||
if (ae != StrIntrinsicNode::LL) {
|
||||
// Divide diff by 2 to get number of chars
|
||||
sarl(cnt2, 1);
|
||||
}
|
||||
addq(result, cnt2);
|
||||
if (ae == StrIntrinsicNode::LL) {
|
||||
load_unsigned_byte(cnt1, Address(str2, result));
|
||||
load_unsigned_byte(result, Address(str1, result));
|
||||
} else if (ae == StrIntrinsicNode::UU) {
|
||||
load_unsigned_short(cnt1, Address(str2, result, scale));
|
||||
load_unsigned_short(result, Address(str1, result, scale));
|
||||
} else {
|
||||
load_unsigned_short(cnt1, Address(str2, result, scale2));
|
||||
load_unsigned_byte(result, Address(str1, result, scale1));
|
||||
}
|
||||
subl(result, cnt1);
|
||||
jmpb(POP_LABEL);
|
||||
}//if (VM_Version::supports_avx512vlbw())
|
||||
#endif // _LP64
|
||||
|
||||
// Discard the stored length difference
|
||||
bind(POP_LABEL);
|
||||
pop(cnt1);
|
||||
|
@ -8304,6 +8370,7 @@ void MacroAssembler::string_compare(Register str1, Register str2,
|
|||
if(ae == StrIntrinsicNode::UL) {
|
||||
negl(result);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// Search for Non-ASCII character (Negative byte value) in a byte array,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue