mirror of
https://github.com/openjdk/jdk.git
synced 2025-08-26 22:34:27 +02:00
8005522: use fast-string instructions on x86 for zeroing
Use 'rep stosb' instead of 'rep stosq' when fast-string operations are available.
Reviewed-by: twisti, roland
This commit is contained in:
parent
73d6d417be
commit
cfcd28fd9d
10 changed files with 95 additions and 22 deletions
|
@ -2544,12 +2544,18 @@ void Assembler::rep_mov() {
|
||||||
emit_int8((unsigned char)0xA5);
|
emit_int8((unsigned char)0xA5);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Emits 'rep stosb': stores al (the low byte of rax) into rcx consecutive bytes starting at [edi].
|
||||||
|
void Assembler::rep_stosb() {
|
||||||
|
emit_int8((unsigned char)0xF3); // REP
|
||||||
|
LP64_ONLY(prefix(REX_W)); // LP64 builds only: emit the REX.W prefix
|
||||||
|
emit_int8((unsigned char)0xAA); // STOSB
|
||||||
|
}
|
||||||
|
|
||||||
// sets rcx pointer sized words with rax, value at [edi]
|
// sets rcx pointer sized words with rax, value at [edi]
|
||||||
// generic
|
// generic
|
||||||
void Assembler::rep_set() { // rep_set
|
void Assembler::rep_stos() {
|
||||||
emit_int8((unsigned char)0xF3);
|
emit_int8((unsigned char)0xF3); // REP
|
||||||
// STOSQ
|
LP64_ONLY(prefix(REX_W)); // LP64:STOSQ, LP32:STOSD
|
||||||
LP64_ONLY(prefix(REX_W));
|
|
||||||
emit_int8((unsigned char)0xAB);
|
emit_int8((unsigned char)0xAB);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -832,7 +832,8 @@ private:
|
||||||
|
|
||||||
// These do register sized moves/scans
|
// These do register sized moves/scans
|
||||||
void rep_mov();
|
void rep_mov();
|
||||||
void rep_set();
|
void rep_stos();
|
||||||
|
void rep_stosb();
|
||||||
void repne_scan();
|
void repne_scan();
|
||||||
#ifdef _LP64
|
#ifdef _LP64
|
||||||
void repne_scanl();
|
void repne_scanl();
|
||||||
|
|
|
@ -120,6 +120,9 @@ define_pd_global(intx, CMSYoungGenPerWorker, 64*M); // default max size of CMS
|
||||||
product(bool, UseUnalignedLoadStores, false, \
|
product(bool, UseUnalignedLoadStores, false, \
|
||||||
"Use SSE2 MOVDQU instruction for Arraycopy") \
|
"Use SSE2 MOVDQU instruction for Arraycopy") \
|
||||||
\
|
\
|
||||||
|
product(bool, UseFastStosb, false, \
|
||||||
|
"Use fast-string operation for zeroing: rep stosb") \
|
||||||
|
\
|
||||||
/* assembler */ \
|
/* assembler */ \
|
||||||
product(bool, Use486InstrsOnly, false, \
|
product(bool, Use486InstrsOnly, false, \
|
||||||
"Use 80486 Compliant instruction subset") \
|
"Use 80486 Compliant instruction subset") \
|
||||||
|
|
|
@ -5224,6 +5224,22 @@ void MacroAssembler::verified_entry(int framesize, bool stack_bang, bool fp_mode
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Emits code that zero-fills 'cnt' qwords of memory starting at 'base'.
// 'tmp' is cleared and then serves as the value being stored. The asserts
// below pin base/tmp/cnt to rdi/rax/rcx, the registers named in their
// messages as required by rep stos. 'cnt' is shifted in place on the
// UseFastStosb path and on 32-bit VMs.
void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp) {
|
||||||
|
// cnt - number of qwords (8-byte words).
|
||||||
|
// base - start address, qword aligned.
|
||||||
|
assert(base==rdi, "base register must be edi for rep stos");
|
||||||
|
assert(tmp==rax, "tmp register must be eax for rep stos");
|
||||||
|
assert(cnt==rcx, "cnt register must be ecx for rep stos");
|
||||||
|
|
||||||
|
xorptr(tmp, tmp); // tmp = 0: the fill value
|
||||||
|
if (UseFastStosb) {
|
||||||
|
shlptr(cnt,3); // convert to number of bytes
|
||||||
|
rep_stosb(); // byte-granular fill (fast-string path)
|
||||||
|
} else {
|
||||||
|
NOT_LP64(shlptr(cnt,1);) // convert to number of dwords for 32-bit VM
|
||||||
|
rep_stos(); // word-granular fill: STOSQ on LP64, STOSD on 32-bit
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// IndexOf for constant substrings with size >= 8 chars
|
// IndexOf for constant substrings with size >= 8 chars
|
||||||
// which don't need to be loaded through stack.
|
// which don't need to be loaded through stack.
|
||||||
|
|
|
@ -1096,6 +1096,9 @@ public:
|
||||||
// C2 compiled method's prolog code.
|
// C2 compiled method's prolog code.
|
||||||
void verified_entry(int framesize, bool stack_bang, bool fp_mode_24b);
|
void verified_entry(int framesize, bool stack_bang, bool fp_mode_24b);
|
||||||
|
|
||||||
|
// clear memory of size 'cnt' qwords, starting at 'base'.
|
||||||
|
void clear_mem(Register base, Register cnt, Register rtmp);
|
||||||
|
|
||||||
// IndexOf strings.
|
// IndexOf strings.
|
||||||
// Small strings are loaded through stack if they cross page boundary.
|
// Small strings are loaded through stack if they cross page boundary.
|
||||||
void string_indexof(Register str1, Register str2,
|
void string_indexof(Register str1, Register str2,
|
||||||
|
|
|
@ -429,7 +429,7 @@ void VM_Version::get_processor_features() {
|
||||||
}
|
}
|
||||||
|
|
||||||
char buf[256];
|
char buf[256];
|
||||||
jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
jio_snprintf(buf, sizeof(buf), "(%u cores per cpu, %u threads per core) family %d model %d stepping %d%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||||
cores_per_cpu(), threads_per_core(),
|
cores_per_cpu(), threads_per_core(),
|
||||||
cpu_family(), _model, _stepping,
|
cpu_family(), _model, _stepping,
|
||||||
(supports_cmov() ? ", cmov" : ""),
|
(supports_cmov() ? ", cmov" : ""),
|
||||||
|
@ -446,6 +446,7 @@ void VM_Version::get_processor_features() {
|
||||||
(supports_avx() ? ", avx" : ""),
|
(supports_avx() ? ", avx" : ""),
|
||||||
(supports_avx2() ? ", avx2" : ""),
|
(supports_avx2() ? ", avx2" : ""),
|
||||||
(supports_aes() ? ", aes" : ""),
|
(supports_aes() ? ", aes" : ""),
|
||||||
|
(supports_erms() ? ", erms" : ""),
|
||||||
(supports_mmx_ext() ? ", mmxext" : ""),
|
(supports_mmx_ext() ? ", mmxext" : ""),
|
||||||
(supports_3dnow_prefetch() ? ", 3dnowpref" : ""),
|
(supports_3dnow_prefetch() ? ", 3dnowpref" : ""),
|
||||||
(supports_lzcnt() ? ", lzcnt": ""),
|
(supports_lzcnt() ? ", lzcnt": ""),
|
||||||
|
@ -671,6 +672,16 @@ void VM_Version::get_processor_features() {
|
||||||
FLAG_SET_DEFAULT(UsePopCountInstruction, false);
|
FLAG_SET_DEFAULT(UsePopCountInstruction, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Use fast-string operations if available.
|
||||||
|
if (supports_erms()) {
|
||||||
|
if (FLAG_IS_DEFAULT(UseFastStosb)) {
|
||||||
|
UseFastStosb = true;
|
||||||
|
}
|
||||||
|
} else if (UseFastStosb) {
|
||||||
|
warning("fast-string operations are not available on this CPU");
|
||||||
|
FLAG_SET_DEFAULT(UseFastStosb, false);
|
||||||
|
}
|
||||||
|
|
||||||
#ifdef COMPILER2
|
#ifdef COMPILER2
|
||||||
if (FLAG_IS_DEFAULT(AlignVector)) {
|
if (FLAG_IS_DEFAULT(AlignVector)) {
|
||||||
// Modern processors allow misaligned memory operations for vectors.
|
// Modern processors allow misaligned memory operations for vectors.
|
||||||
|
|
|
@ -204,7 +204,8 @@ public:
|
||||||
avx2 : 1,
|
avx2 : 1,
|
||||||
: 2,
|
: 2,
|
||||||
bmi2 : 1,
|
bmi2 : 1,
|
||||||
: 23;
|
erms : 1,
|
||||||
|
: 22;
|
||||||
} bits;
|
} bits;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -247,7 +248,8 @@ protected:
|
||||||
CPU_TSCINV = (1 << 16),
|
CPU_TSCINV = (1 << 16),
|
||||||
CPU_AVX = (1 << 17),
|
CPU_AVX = (1 << 17),
|
||||||
CPU_AVX2 = (1 << 18),
|
CPU_AVX2 = (1 << 18),
|
||||||
CPU_AES = (1 << 19)
|
CPU_AES = (1 << 19),
|
||||||
|
CPU_ERMS = (1 << 20) // enhanced 'rep movsb/stosb' instructions
|
||||||
} cpuFeatureFlags;
|
} cpuFeatureFlags;
|
||||||
|
|
||||||
enum {
|
enum {
|
||||||
|
@ -425,6 +427,8 @@ protected:
|
||||||
result |= CPU_TSCINV;
|
result |= CPU_TSCINV;
|
||||||
if (_cpuid_info.std_cpuid1_ecx.bits.aes != 0)
|
if (_cpuid_info.std_cpuid1_ecx.bits.aes != 0)
|
||||||
result |= CPU_AES;
|
result |= CPU_AES;
|
||||||
|
if (_cpuid_info.sef_cpuid7_ebx.bits.erms != 0)
|
||||||
|
result |= CPU_ERMS;
|
||||||
|
|
||||||
// AMD features.
|
// AMD features.
|
||||||
if (is_amd()) {
|
if (is_amd()) {
|
||||||
|
@ -489,7 +493,7 @@ public:
|
||||||
return (_cpuid_info.std_max_function >= 0xB) &&
|
return (_cpuid_info.std_max_function >= 0xB) &&
|
||||||
// eax[4:0] | ebx[0:15] == 0 indicates invalid topology level.
|
// eax[4:0] | ebx[0:15] == 0 indicates invalid topology level.
|
||||||
// Some cpus have max cpuid >= 0xB but do not support processor topology.
|
// Some cpus have max cpuid >= 0xB but do not support processor topology.
|
||||||
((_cpuid_info.tpl_cpuidB0_eax & 0x1f | _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus) != 0);
|
(((_cpuid_info.tpl_cpuidB0_eax & 0x1f) | _cpuid_info.tpl_cpuidB0_ebx.bits.logical_cpus) != 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
static uint cores_per_cpu() {
|
static uint cores_per_cpu() {
|
||||||
|
@ -550,6 +554,7 @@ public:
|
||||||
static bool supports_avx2() { return (_cpuFeatures & CPU_AVX2) != 0; }
|
static bool supports_avx2() { return (_cpuFeatures & CPU_AVX2) != 0; }
|
||||||
static bool supports_tsc() { return (_cpuFeatures & CPU_TSC) != 0; }
|
static bool supports_tsc() { return (_cpuFeatures & CPU_TSC) != 0; }
|
||||||
static bool supports_aes() { return (_cpuFeatures & CPU_AES) != 0; }
|
static bool supports_aes() { return (_cpuFeatures & CPU_AES) != 0; }
|
||||||
|
static bool supports_erms() { return (_cpuFeatures & CPU_ERMS) != 0; }
|
||||||
|
|
||||||
// Intel features
|
// Intel features
|
||||||
static bool is_intel_family_core() { return is_intel() &&
|
static bool is_intel_family_core() { return is_intel() &&
|
||||||
|
|
|
@ -11572,15 +11572,28 @@ instruct MoveL2D_reg_reg_sse(regD dst, eRegL src, regD tmp) %{
|
||||||
// =======================================================================
|
// =======================================================================
|
||||||
// fast clearing of an array
|
// fast clearing of an array
|
||||||
instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
|
instruct rep_stos(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
|
||||||
|
predicate(!UseFastStosb);
|
||||||
match(Set dummy (ClearArray cnt base));
|
match(Set dummy (ClearArray cnt base));
|
||||||
effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
|
effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
|
||||||
format %{ "SHL ECX,1\t# Convert doublewords to words\n\t"
|
format %{ "XOR EAX,EAX\t# ClearArray:\n\t"
|
||||||
"XOR EAX,EAX\n\t"
|
"SHL ECX,1\t# Convert doublewords to words\n\t"
|
||||||
"REP STOS\t# store EAX into [EDI++] while ECX--" %}
|
"REP STOS\t# store EAX into [EDI++] while ECX--" %}
|
||||||
opcode(0,0x4);
|
ins_encode %{
|
||||||
ins_encode( Opcode(0xD1), RegOpc(ECX),
|
__ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
|
||||||
OpcRegReg(0x33,EAX,EAX),
|
%}
|
||||||
Opcode(0xF3), Opcode(0xAB) );
|
ins_pipe( pipe_slow );
|
||||||
|
%}
|
||||||
|
|
||||||
|
instruct rep_fast_stosb(eCXRegI cnt, eDIRegP base, eAXRegI zero, Universe dummy, eFlagsReg cr) %{
|
||||||
|
predicate(UseFastStosb);
|
||||||
|
match(Set dummy (ClearArray cnt base));
|
||||||
|
effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
|
||||||
|
format %{ "XOR EAX,EAX\t# ClearArray:\n\t"
|
||||||
|
"SHL ECX,3\t# Convert doublewords to bytes\n\t"
|
||||||
|
"REP STOSB\t# store EAX into [EDI++] while ECX--" %}
|
||||||
|
ins_encode %{
|
||||||
|
__ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
|
||||||
|
%}
|
||||||
ins_pipe( pipe_slow );
|
ins_pipe( pipe_slow );
|
||||||
%}
|
%}
|
||||||
|
|
||||||
|
|
|
@ -10374,13 +10374,30 @@ instruct MoveL2D_reg_reg(regD dst, rRegL src) %{
|
||||||
instruct rep_stos(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
|
instruct rep_stos(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
|
||||||
rFlagsReg cr)
|
rFlagsReg cr)
|
||||||
%{
|
%{
|
||||||
|
predicate(!UseFastStosb);
|
||||||
match(Set dummy (ClearArray cnt base));
|
match(Set dummy (ClearArray cnt base));
|
||||||
effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
|
effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
|
||||||
|
|
||||||
format %{ "xorl rax, rax\t# ClearArray:\n\t"
|
format %{ "xorq rax, rax\t# ClearArray:\n\t"
|
||||||
"rep stosq\t# Store rax to *rdi++ while rcx--" %}
|
"rep stosq\t# Store rax to *rdi++ while rcx--" %}
|
||||||
ins_encode(opc_reg_reg(0x33, RAX, RAX), // xorl %eax, %eax
|
ins_encode %{
|
||||||
Opcode(0xF3), Opcode(0x48), Opcode(0xAB)); // rep REX_W stos
|
__ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
|
||||||
|
%}
|
||||||
|
ins_pipe(pipe_slow);
|
||||||
|
%}
|
||||||
|
|
||||||
|
instruct rep_fast_stosb(rcx_RegL cnt, rdi_RegP base, rax_RegI zero, Universe dummy,
|
||||||
|
rFlagsReg cr)
|
||||||
|
%{
|
||||||
|
predicate(UseFastStosb);
|
||||||
|
match(Set dummy (ClearArray cnt base));
|
||||||
|
effect(USE_KILL cnt, USE_KILL base, KILL zero, KILL cr);
|
||||||
|
format %{ "xorq rax, rax\t# ClearArray:\n\t"
|
||||||
|
"shlq rcx,3\t# Convert doublewords to bytes\n\t"
|
||||||
|
"rep stosb\t# Store rax to *rdi++ while rcx--" %}
|
||||||
|
ins_encode %{
|
||||||
|
__ clear_mem($base$$Register, $cnt$$Register, $zero$$Register);
|
||||||
|
%}
|
||||||
ins_pipe( pipe_slow );
|
ins_pipe( pipe_slow );
|
||||||
%}
|
%}
|
||||||
|
|
||||||
|
|
|
@ -2725,10 +2725,8 @@ Node* ClearArrayNode::clear_memory(Node* ctl, Node* mem, Node* dest,
|
||||||
zend = phase->transform( new(C) URShiftXNode(zend, shift) );
|
zend = phase->transform( new(C) URShiftXNode(zend, shift) );
|
||||||
}
|
}
|
||||||
|
|
||||||
Node* zsize = phase->transform( new(C) SubXNode(zend, zbase) );
|
|
||||||
Node* zinit = phase->zerocon((unit == BytesPerLong) ? T_LONG : T_INT);
|
|
||||||
|
|
||||||
// Bulk clear double-words
|
// Bulk clear double-words
|
||||||
|
Node* zsize = phase->transform( new(C) SubXNode(zend, zbase) );
|
||||||
Node* adr = phase->transform( new(C) AddPNode(dest, dest, start_offset) );
|
Node* adr = phase->transform( new(C) AddPNode(dest, dest, start_offset) );
|
||||||
mem = new (C) ClearArrayNode(ctl, mem, zsize, adr);
|
mem = new (C) ClearArrayNode(ctl, mem, zsize, adr);
|
||||||
return phase->transform(mem);
|
return phase->transform(mem);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue