mirror of
https://github.com/openjdk/jdk.git
synced 2025-08-26 22:34:27 +02:00
8005419: Improve intrinsics code performance on x86 by using AVX2
Use 256bit vpxor,vptest instructions in String.compareTo() and equals() intrinsics. Reviewed-by: twisti
This commit is contained in:
parent
b3fe91a803
commit
38c81fb411
5 changed files with 290 additions and 35 deletions
|
@ -2468,6 +2468,26 @@ void Assembler::ptest(XMMRegister dst, XMMRegister src) {
|
|||
emit_int8((unsigned char)(0xC0 | encode));
|
||||
}
|
||||
|
||||
void Assembler::vptest(XMMRegister dst, Address src) {
|
||||
assert(VM_Version::supports_avx(), "");
|
||||
InstructionMark im(this);
|
||||
bool vector256 = true;
|
||||
assert(dst != xnoreg, "sanity");
|
||||
int dst_enc = dst->encoding();
|
||||
// swap src<->dst for encoding
|
||||
vex_prefix(src, dst_enc, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, false, vector256);
|
||||
emit_int8(0x17);
|
||||
emit_operand(dst, src);
|
||||
}
|
||||
|
||||
void Assembler::vptest(XMMRegister dst, XMMRegister src) {
|
||||
assert(VM_Version::supports_avx(), "");
|
||||
bool vector256 = true;
|
||||
int encode = vex_prefix_and_encode(dst, xnoreg, src, VEX_SIMD_66, vector256, VEX_OPCODE_0F_38);
|
||||
emit_int8(0x17);
|
||||
emit_int8((unsigned char)(0xC0 | encode));
|
||||
}
|
||||
|
||||
void Assembler::punpcklbw(XMMRegister dst, Address src) {
|
||||
NOT_LP64(assert(VM_Version::supports_sse2(), ""));
|
||||
assert((UseAVX > 0), "SSE mode requires address alignment 16 bytes");
|
||||
|
|
|
@ -1444,9 +1444,12 @@ private:
|
|||
// Shift Right by bytes Logical DoubleQuadword Immediate
|
||||
void psrldq(XMMRegister dst, int shift);
|
||||
|
||||
// Logical Compare Double Quadword
|
||||
// Logical Compare 128bit
|
||||
void ptest(XMMRegister dst, XMMRegister src);
|
||||
void ptest(XMMRegister dst, Address src);
|
||||
// Logical Compare 256bit
|
||||
void vptest(XMMRegister dst, XMMRegister src);
|
||||
void vptest(XMMRegister dst, Address src);
|
||||
|
||||
// Interleave Low Bytes
|
||||
void punpcklbw(XMMRegister dst, XMMRegister src);
|
||||
|
|
|
@ -5675,42 +5675,114 @@ void MacroAssembler::string_compare(Register str1, Register str2,
|
|||
testl(cnt2, cnt2);
|
||||
jcc(Assembler::zero, LENGTH_DIFF_LABEL);
|
||||
|
||||
// Load first characters
|
||||
// Compare first characters
|
||||
load_unsigned_short(result, Address(str1, 0));
|
||||
load_unsigned_short(cnt1, Address(str2, 0));
|
||||
|
||||
// Compare first characters
|
||||
subl(result, cnt1);
|
||||
jcc(Assembler::notZero, POP_LABEL);
|
||||
decrementl(cnt2);
|
||||
jcc(Assembler::zero, LENGTH_DIFF_LABEL);
|
||||
|
||||
{
|
||||
// Check after comparing first character to see if strings are equivalent
|
||||
Label LSkip2;
|
||||
// Check if the strings start at same location
|
||||
cmpptr(str1, str2);
|
||||
jccb(Assembler::notEqual, LSkip2);
|
||||
|
||||
// Check if the length difference is zero (from stack)
|
||||
cmpl(Address(rsp, 0), 0x0);
|
||||
cmpl(cnt2, 1);
|
||||
jcc(Assembler::equal, LENGTH_DIFF_LABEL);
|
||||
|
||||
// Strings might not be equivalent
|
||||
bind(LSkip2);
|
||||
}
|
||||
// Check if the strings start at the same location.
|
||||
cmpptr(str1, str2);
|
||||
jcc(Assembler::equal, LENGTH_DIFF_LABEL);
|
||||
|
||||
Address::ScaleFactor scale = Address::times_2;
|
||||
int stride = 8;
|
||||
|
||||
// Advance to next element
|
||||
addptr(str1, 16/stride);
|
||||
addptr(str2, 16/stride);
|
||||
if (UseAVX >= 2) {
|
||||
Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
|
||||
Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
|
||||
Label COMPARE_TAIL_LONG;
|
||||
int pcmpmask = 0x19;
|
||||
|
||||
if (UseSSE42Intrinsics) {
|
||||
// Setup to compare 16-chars (32-bytes) vectors,
|
||||
// start from first character again because it has aligned address.
|
||||
int stride2 = 16;
|
||||
int adr_stride = stride << scale;
|
||||
int adr_stride2 = stride2 << scale;
|
||||
|
||||
assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
|
||||
// rax and rdx are used by pcmpestri as elements counters
|
||||
movl(result, cnt2);
|
||||
andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
|
||||
jcc(Assembler::zero, COMPARE_TAIL_LONG);
|
||||
|
||||
// fast path : compare first 2 8-char vectors.
|
||||
bind(COMPARE_16_CHARS);
|
||||
movdqu(vec1, Address(str1, 0));
|
||||
pcmpestri(vec1, Address(str2, 0), pcmpmask);
|
||||
jccb(Assembler::below, COMPARE_INDEX_CHAR);
|
||||
|
||||
movdqu(vec1, Address(str1, adr_stride));
|
||||
pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
|
||||
jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
|
||||
addl(cnt1, stride);
|
||||
|
||||
// Compare the characters at index in cnt1
|
||||
bind(COMPARE_INDEX_CHAR); //cnt1 has the offset of the mismatching character
|
||||
load_unsigned_short(result, Address(str1, cnt1, scale));
|
||||
load_unsigned_short(cnt2, Address(str2, cnt1, scale));
|
||||
subl(result, cnt2);
|
||||
jmp(POP_LABEL);
|
||||
|
||||
// Setup the registers to start vector comparison loop
|
||||
bind(COMPARE_WIDE_VECTORS);
|
||||
lea(str1, Address(str1, result, scale));
|
||||
lea(str2, Address(str2, result, scale));
|
||||
subl(result, stride2);
|
||||
subl(cnt2, stride2);
|
||||
jccb(Assembler::zero, COMPARE_WIDE_TAIL);
|
||||
negptr(result);
|
||||
|
||||
// In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
|
||||
bind(COMPARE_WIDE_VECTORS_LOOP);
|
||||
vmovdqu(vec1, Address(str1, result, scale));
|
||||
vpxor(vec1, Address(str2, result, scale));
|
||||
vptest(vec1, vec1);
|
||||
jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
|
||||
addptr(result, stride2);
|
||||
subl(cnt2, stride2);
|
||||
jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
|
||||
|
||||
// compare wide vectors tail
|
||||
bind(COMPARE_WIDE_TAIL);
|
||||
testptr(result, result);
|
||||
jccb(Assembler::zero, LENGTH_DIFF_LABEL);
|
||||
|
||||
movl(result, stride2);
|
||||
movl(cnt2, result);
|
||||
negptr(result);
|
||||
jmpb(COMPARE_WIDE_VECTORS_LOOP);
|
||||
|
||||
// Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
|
||||
bind(VECTOR_NOT_EQUAL);
|
||||
lea(str1, Address(str1, result, scale));
|
||||
lea(str2, Address(str2, result, scale));
|
||||
jmp(COMPARE_16_CHARS);
|
||||
|
||||
// Compare tail chars, length between 1 to 15 chars
|
||||
bind(COMPARE_TAIL_LONG);
|
||||
movl(cnt2, result);
|
||||
cmpl(cnt2, stride);
|
||||
jccb(Assembler::less, COMPARE_SMALL_STR);
|
||||
|
||||
movdqu(vec1, Address(str1, 0));
|
||||
pcmpestri(vec1, Address(str2, 0), pcmpmask);
|
||||
jcc(Assembler::below, COMPARE_INDEX_CHAR);
|
||||
subptr(cnt2, stride);
|
||||
jccb(Assembler::zero, LENGTH_DIFF_LABEL);
|
||||
lea(str1, Address(str1, result, scale));
|
||||
lea(str2, Address(str2, result, scale));
|
||||
negptr(cnt2);
|
||||
jmpb(WHILE_HEAD_LABEL);
|
||||
|
||||
bind(COMPARE_SMALL_STR);
|
||||
} else if (UseSSE42Intrinsics) {
|
||||
Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
|
||||
int pcmpmask = 0x19;
|
||||
// Setup to compare 16-byte vectors
|
||||
// Setup to compare 8-char (16-byte) vectors,
|
||||
// start from first character again because it has aligned address.
|
||||
movl(result, cnt2);
|
||||
andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
|
||||
jccb(Assembler::zero, COMPARE_TAIL);
|
||||
|
@ -5742,7 +5814,7 @@ void MacroAssembler::string_compare(Register str1, Register str2,
|
|||
jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);
|
||||
|
||||
// compare wide vectors tail
|
||||
testl(result, result);
|
||||
testptr(result, result);
|
||||
jccb(Assembler::zero, LENGTH_DIFF_LABEL);
|
||||
|
||||
movl(cnt2, stride);
|
||||
|
@ -5754,21 +5826,20 @@ void MacroAssembler::string_compare(Register str1, Register str2,
|
|||
|
||||
// Mismatched characters in the vectors
|
||||
bind(VECTOR_NOT_EQUAL);
|
||||
addptr(result, cnt1);
|
||||
movptr(cnt2, result);
|
||||
load_unsigned_short(result, Address(str1, cnt2, scale));
|
||||
load_unsigned_short(cnt1, Address(str2, cnt2, scale));
|
||||
subl(result, cnt1);
|
||||
addptr(cnt1, result);
|
||||
load_unsigned_short(result, Address(str1, cnt1, scale));
|
||||
load_unsigned_short(cnt2, Address(str2, cnt1, scale));
|
||||
subl(result, cnt2);
|
||||
jmpb(POP_LABEL);
|
||||
|
||||
bind(COMPARE_TAIL); // limit is zero
|
||||
movl(cnt2, result);
|
||||
// Fallthru to tail compare
|
||||
}
|
||||
|
||||
// Shift str2 and str1 to the end of the arrays, negate min
|
||||
lea(str1, Address(str1, cnt2, scale, 0));
|
||||
lea(str2, Address(str2, cnt2, scale, 0));
|
||||
lea(str1, Address(str1, cnt2, scale));
|
||||
lea(str2, Address(str2, cnt2, scale));
|
||||
decrementl(cnt2); // first character was compared already
|
||||
negptr(cnt2);
|
||||
|
||||
// Compare the rest of the elements
|
||||
|
@ -5833,7 +5904,44 @@ void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Regist
|
|||
shll(limit, 1); // byte count != 0
|
||||
movl(result, limit); // copy
|
||||
|
||||
if (UseSSE42Intrinsics) {
|
||||
if (UseAVX >= 2) {
|
||||
// With AVX2, use 32-byte vector compare
|
||||
Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
|
||||
|
||||
// Compare 32-byte vectors
|
||||
andl(result, 0x0000001e); // tail count (in bytes)
|
||||
andl(limit, 0xffffffe0); // vector count (in bytes)
|
||||
jccb(Assembler::zero, COMPARE_TAIL);
|
||||
|
||||
lea(ary1, Address(ary1, limit, Address::times_1));
|
||||
lea(ary2, Address(ary2, limit, Address::times_1));
|
||||
negptr(limit);
|
||||
|
||||
bind(COMPARE_WIDE_VECTORS);
|
||||
vmovdqu(vec1, Address(ary1, limit, Address::times_1));
|
||||
vmovdqu(vec2, Address(ary2, limit, Address::times_1));
|
||||
vpxor(vec1, vec2);
|
||||
|
||||
vptest(vec1, vec1);
|
||||
jccb(Assembler::notZero, FALSE_LABEL);
|
||||
addptr(limit, 32);
|
||||
jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);
|
||||
|
||||
testl(result, result);
|
||||
jccb(Assembler::zero, TRUE_LABEL);
|
||||
|
||||
vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
|
||||
vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
|
||||
vpxor(vec1, vec2);
|
||||
|
||||
vptest(vec1, vec1);
|
||||
jccb(Assembler::notZero, FALSE_LABEL);
|
||||
jmpb(TRUE_LABEL);
|
||||
|
||||
bind(COMPARE_TAIL); // limit is zero
|
||||
movl(limit, result);
|
||||
// Fallthru to tail compare
|
||||
} else if (UseSSE42Intrinsics) {
|
||||
// With SSE4.2, use double quad vector compare
|
||||
Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;
|
||||
|
||||
|
|
|
@ -1011,6 +1011,10 @@ public:
|
|||
Assembler::vxorpd(dst, nds, src, vector256);
|
||||
}
|
||||
|
||||
// Simple version for AVX2 256bit vectors
|
||||
void vpxor(XMMRegister dst, XMMRegister src) { Assembler::vpxor(dst, dst, src, true); }
|
||||
void vpxor(XMMRegister dst, Address src) { Assembler::vpxor(dst, dst, src, true); }
|
||||
|
||||
// Move packed integer values from low 128 bit to hign 128 bit in 256 bit vector.
|
||||
void vinserti128h(XMMRegister dst, XMMRegister nds, XMMRegister src) {
|
||||
if (UseAVX > 1) // vinserti128h is available only in AVX2
|
||||
|
|
120
hotspot/test/compiler/8005419/Test8005419.java
Normal file
120
hotspot/test/compiler/8005419/Test8005419.java
Normal file
|
@ -0,0 +1,120 @@
|
|||
/*
|
||||
* Copyright (c) 2012, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/*
|
||||
* @test
|
||||
* @bug 8005419
|
||||
* @summary Improve intrinsics code performance on x86 by using AVX2
|
||||
* @run main/othervm -Xbatch -Xmx64m Test8005419
|
||||
*
|
||||
*/
|
||||
|
||||
public class Test8005419 {
|
||||
public static int SIZE = 64;
|
||||
|
||||
public static void main(String[] args) {
|
||||
char[] a = new char[SIZE];
|
||||
char[] b = new char[SIZE];
|
||||
|
||||
for (int i = 16; i < SIZE; i++) {
|
||||
a[i] = (char)i;
|
||||
b[i] = (char)i;
|
||||
}
|
||||
String s1 = new String(a);
|
||||
String s2 = new String(b);
|
||||
|
||||
// Warm up
|
||||
boolean failed = false;
|
||||
int result = 0;
|
||||
for (int i = 0; i < 10000; i++) {
|
||||
result += test(s1, s2);
|
||||
}
|
||||
for (int i = 0; i < 10000; i++) {
|
||||
result += test(s1, s2);
|
||||
}
|
||||
for (int i = 0; i < 10000; i++) {
|
||||
result += test(s1, s2);
|
||||
}
|
||||
if (result != 0) failed = true;
|
||||
|
||||
System.out.println("Start testing");
|
||||
// Compare same string
|
||||
result = test(s1, s1);
|
||||
if (result != 0) {
|
||||
failed = true;
|
||||
System.out.println("Failed same: result = " + result + ", expected 0");
|
||||
}
|
||||
// Compare equal strings
|
||||
for (int i = 1; i <= SIZE; i++) {
|
||||
s1 = new String(a, 0, i);
|
||||
s2 = new String(b, 0, i);
|
||||
result = test(s1, s2);
|
||||
if (result != 0) {
|
||||
failed = true;
|
||||
System.out.println("Failed equals s1[" + i + "], s2[" + i + "]: result = " + result + ", expected 0");
|
||||
}
|
||||
}
|
||||
// Compare equal strings but different sizes
|
||||
for (int i = 1; i <= SIZE; i++) {
|
||||
s1 = new String(a, 0, i);
|
||||
for (int j = 1; j <= SIZE; j++) {
|
||||
s2 = new String(b, 0, j);
|
||||
result = test(s1, s2);
|
||||
if (result != (i-j)) {
|
||||
failed = true;
|
||||
System.out.println("Failed diff size s1[" + i + "], s2[" + j + "]: result = " + result + ", expected " + (i-j));
|
||||
}
|
||||
}
|
||||
}
|
||||
// Compare strings with one char different and different sizes
|
||||
for (int i = 1; i <= SIZE; i++) {
|
||||
s1 = new String(a, 0, i);
|
||||
for (int j = 0; j < i; j++) {
|
||||
b[j] -= 3; // change char
|
||||
s2 = new String(b, 0, i);
|
||||
result = test(s1, s2);
|
||||
int chdiff = a[j] - b[j];
|
||||
if (result != chdiff) {
|
||||
failed = true;
|
||||
System.out.println("Failed diff char s1[" + i + "], s2[" + i + "]: result = " + result + ", expected " + chdiff);
|
||||
}
|
||||
result = test(s2, s1);
|
||||
chdiff = b[j] - a[j];
|
||||
if (result != chdiff) {
|
||||
failed = true;
|
||||
System.out.println("Failed diff char s2[" + i + "], s1[" + i + "]: result = " + result + ", expected " + chdiff);
|
||||
}
|
||||
b[j] += 3; // restore
|
||||
}
|
||||
}
|
||||
if (failed) {
|
||||
System.out.println("FAILED");
|
||||
System.exit(97);
|
||||
}
|
||||
System.out.println("PASSED");
|
||||
}
|
||||
|
||||
private static int test(String str1, String str2) {
|
||||
return str1.compareTo(str2);
|
||||
}
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue