mirror of
https://github.com/openjdk/jdk.git
synced 2025-09-18 10:04:42 +02:00
6987135: Performance regression on Intel platform with 32-bits edition between 6u13 and 6u14
Use hardware DIV instruction for long division by constant when it is faster than code with multiply. Reviewed-by: never
This commit is contained in:
parent
ce0125e7f2
commit
249b1f6c4f
10 changed files with 180 additions and 23 deletions
|
@ -1843,6 +1843,12 @@ bool Matcher::is_spillable_arg( int reg ) {
|
|||
return can_be_java_arg(reg);
|
||||
}
|
||||
|
||||
bool Matcher::use_asm_for_ldiv_by_con( jlong divisor ) {
|
||||
// Use hardware SDIVX instruction when it is
|
||||
// faster than code which uses multiply.
|
||||
return VM_Version::has_fast_idiv();
|
||||
}
|
||||
|
||||
// Register for DIVI projection of divmodI
|
||||
RegMask Matcher::divI_proj_mask() {
|
||||
ShouldNotReachHere();
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 1997, 2009, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -80,7 +80,8 @@ void VM_Version::initialize() {
|
|||
FLAG_SET_DEFAULT(InteriorEntryAlignment, 4);
|
||||
}
|
||||
if (is_niagara1_plus()) {
|
||||
if (AllocatePrefetchStyle > 0 && FLAG_IS_DEFAULT(AllocatePrefetchStyle)) {
|
||||
if (has_blk_init() && AllocatePrefetchStyle > 0 &&
|
||||
FLAG_IS_DEFAULT(AllocatePrefetchStyle)) {
|
||||
// Use BIS instruction for allocation prefetch.
|
||||
FLAG_SET_DEFAULT(AllocatePrefetchStyle, 3);
|
||||
if (FLAG_IS_DEFAULT(AllocatePrefetchDistance)) {
|
||||
|
@ -118,16 +119,18 @@ void VM_Version::initialize() {
|
|||
#endif
|
||||
|
||||
char buf[512];
|
||||
jio_snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
jio_snprintf(buf, sizeof(buf), "%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
|
||||
(has_v8() ? ", has_v8" : ""),
|
||||
(has_v9() ? ", has_v9" : ""),
|
||||
(has_hardware_popc() ? ", popc" : ""),
|
||||
(has_vis1() ? ", has_vis1" : ""),
|
||||
(has_vis2() ? ", has_vis2" : ""),
|
||||
(has_blk_init() ? ", has_blk_init" : ""),
|
||||
(is_ultra3() ? ", is_ultra3" : ""),
|
||||
(is_sun4v() ? ", is_sun4v" : ""),
|
||||
(is_niagara1() ? ", is_niagara1" : ""),
|
||||
(is_niagara1_plus() ? ", is_niagara1_plus" : ""),
|
||||
(is_sparc64() ? ", is_sparc64" : ""),
|
||||
(!has_hardware_mul32() ? ", no-mul32" : ""),
|
||||
(!has_hardware_div32() ? ", no-div32" : ""),
|
||||
(!has_hardware_fsmuld() ? ", no-fsmuld" : ""));
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 1997, 2009, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -33,7 +33,9 @@ protected:
|
|||
v9_instructions = 5,
|
||||
vis1_instructions = 6,
|
||||
vis2_instructions = 7,
|
||||
sun4v_instructions = 8
|
||||
sun4v_instructions = 8,
|
||||
blk_init_instructions = 9,
|
||||
fmaf_instructions = 10
|
||||
};
|
||||
|
||||
enum Feature_Flag_Set {
|
||||
|
@ -49,6 +51,8 @@ protected:
|
|||
vis1_instructions_m = 1 << vis1_instructions,
|
||||
vis2_instructions_m = 1 << vis2_instructions,
|
||||
sun4v_m = 1 << sun4v_instructions,
|
||||
blk_init_instructions_m = 1 << blk_init_instructions,
|
||||
fmaf_instructions_m = 1 << fmaf_instructions,
|
||||
|
||||
generic_v8_m = v8_instructions_m | hardware_mul32_m | hardware_div32_m | hardware_fsmuld_m,
|
||||
generic_v9_m = generic_v8_m | v9_instructions_m,
|
||||
|
@ -67,6 +71,7 @@ protected:
|
|||
static int platform_features(int features);
|
||||
|
||||
static bool is_niagara1(int features) { return (features & sun4v_m) != 0; }
|
||||
static bool is_sparc64(int features) { return (features & fmaf_instructions_m) != 0; }
|
||||
|
||||
static int maximum_niagara1_processor_count() { return 32; }
|
||||
// Returns true if the platform is in the niagara line and
|
||||
|
@ -86,6 +91,7 @@ public:
|
|||
static bool has_hardware_popc() { return (_features & hardware_popc_m) != 0; }
|
||||
static bool has_vis1() { return (_features & vis1_instructions_m) != 0; }
|
||||
static bool has_vis2() { return (_features & vis2_instructions_m) != 0; }
|
||||
static bool has_blk_init() { return (_features & blk_init_instructions_m) != 0; }
|
||||
|
||||
static bool supports_compare_and_exchange()
|
||||
{ return has_v9(); }
|
||||
|
@ -93,8 +99,10 @@ public:
|
|||
static bool is_ultra3() { return (_features & ultra3_m) == ultra3_m; }
|
||||
static bool is_sun4v() { return (_features & sun4v_m) != 0; }
|
||||
static bool is_niagara1() { return is_niagara1(_features); }
|
||||
static bool is_sparc64() { return is_sparc64(_features); }
|
||||
|
||||
static bool has_fast_fxtof() { return has_v9() && !is_ultra3(); }
|
||||
static bool has_fast_idiv() { return is_niagara1_plus() || is_sparc64(); }
|
||||
|
||||
static const char* cpu_features() { return _features_str; }
|
||||
|
||||
|
|
|
@ -1288,7 +1288,7 @@ void Assembler::imull(Register dst, Register src, int value) {
|
|||
if (is8bit(value)) {
|
||||
emit_byte(0x6B);
|
||||
emit_byte(0xC0 | encode);
|
||||
emit_byte(value);
|
||||
emit_byte(value & 0xFF);
|
||||
} else {
|
||||
emit_byte(0x69);
|
||||
emit_byte(0xC0 | encode);
|
||||
|
@ -3903,7 +3903,7 @@ void Assembler::imulq(Register dst, Register src, int value) {
|
|||
if (is8bit(value)) {
|
||||
emit_byte(0x6B);
|
||||
emit_byte(0xC0 | encode);
|
||||
emit_byte(value);
|
||||
emit_byte(value & 0xFF);
|
||||
} else {
|
||||
emit_byte(0x69);
|
||||
emit_byte(0xC0 | encode);
|
||||
|
|
|
@ -446,6 +446,10 @@ public:
|
|||
static bool supports_lzcnt() { return (_cpuFeatures & CPU_LZCNT) != 0; }
|
||||
static bool supports_sse4a() { return (_cpuFeatures & CPU_SSE4A) != 0; }
|
||||
|
||||
// Intel Core and newer cpus have fast IDIV instruction (excluding Atom).
|
||||
static bool has_fast_idiv() { return is_intel() && cpu_family() == 6 &&
|
||||
supports_sse3() && _model != 0x1C; }
|
||||
|
||||
static bool supports_compare_and_exchange() { return true; }
|
||||
|
||||
static const char* cpu_features() { return _features_str; }
|
||||
|
|
|
@ -1508,6 +1508,16 @@ bool Matcher::is_spillable_arg( int reg ) {
|
|||
return can_be_java_arg(reg);
|
||||
}
|
||||
|
||||
bool Matcher::use_asm_for_ldiv_by_con( jlong divisor ) {
|
||||
// Use hardware integer DIV instruction when
|
||||
// it is faster than code which uses multiply.
|
||||
// Only when constant divisor fits into 32 bit
|
||||
// (min_jint is excluded to get only correct
|
||||
// positive 32 bit values from negative).
|
||||
return VM_Version::has_fast_idiv() &&
|
||||
(divisor == (int)divisor && divisor != min_jint);
|
||||
}
|
||||
|
||||
// Register for DIVI projection of divmodI
|
||||
RegMask Matcher::divI_proj_mask() {
|
||||
return EAX_REG_mask;
|
||||
|
@ -1546,6 +1556,9 @@ bool is_operand_hi32_zero(Node* n) {
|
|||
return true;
|
||||
}
|
||||
}
|
||||
if (opc == Op_ConL && (n->get_long() & 0xFFFFFFFF00000000LL) == 0LL) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -2309,9 +2322,11 @@ encode %{
|
|||
enc_class move_long_big_shift_sign( eRegL dst, immI_32_63 cnt ) %{
|
||||
emit_opcode( cbuf, 0x8B ); // Move
|
||||
emit_rm(cbuf, 0x3, $dst$$reg, HIGH_FROM_LOW($dst$$reg));
|
||||
if( $cnt$$constant > 32 ) { // Shift, if not by zero
|
||||
emit_d8(cbuf,$primary);
|
||||
emit_rm(cbuf, 0x3, $secondary, $dst$$reg);
|
||||
emit_d8(cbuf,$cnt$$constant-32);
|
||||
}
|
||||
emit_d8(cbuf,$primary);
|
||||
emit_rm(cbuf, 0x3, $secondary, HIGH_FROM_LOW($dst$$reg));
|
||||
emit_d8(cbuf,31);
|
||||
|
@ -8842,6 +8857,103 @@ instruct modL_eReg( eADXRegL dst, eRegL src1, eRegL src2, eFlagsReg cr, eCXRegI
|
|||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
// Divide Register Long (no special case since divisor != -1)
|
||||
instruct divL_eReg_imm32( eADXRegL dst, immL32 imm, eRegI tmp, eRegI tmp2, eFlagsReg cr ) %{
|
||||
match(Set dst (DivL dst imm));
|
||||
effect( TEMP tmp, TEMP tmp2, KILL cr );
|
||||
ins_cost(1000);
|
||||
format %{ "MOV $tmp,abs($imm) # ldiv EDX:EAX,$imm\n\t"
|
||||
"CMP $tmp,EDX\n\t"
|
||||
"JA,s fast\n\t"
|
||||
"MOV $tmp2,EAX\n\t"
|
||||
"MOV EAX,EDX\n\t"
|
||||
"SAR EDX,31\n\t"
|
||||
"IDIV $tmp\n\t"
|
||||
"XCHG EAX,$tmp2 \n\t"
|
||||
"IDIV $tmp\n\t"
|
||||
"CDQ\n\t"
|
||||
"ADD EDX,$tmp2\n\t"
|
||||
"JMP,s done\n"
|
||||
"fast:\n\t"
|
||||
"IDIV $tmp\n\t"
|
||||
"XOR EDX,EDX\n"
|
||||
"done:\n\t"
|
||||
"NEG EDX:EAX # if $imm < 0" %}
|
||||
ins_encode %{
|
||||
int con = (int)$imm$$constant;
|
||||
assert(con != 0 && con != -1 && con != min_jint, "wrong divisor");
|
||||
int pcon = (con > 0) ? con : -con;
|
||||
Label Lfast, Ldone;
|
||||
|
||||
__ movl($tmp$$Register, pcon);
|
||||
__ cmpl($tmp$$Register, HIGH_FROM_LOW($dst$$Register));
|
||||
__ jccb(Assembler::above, Lfast);
|
||||
|
||||
__ movl($tmp2$$Register, $dst$$Register); // save
|
||||
__ movl($dst$$Register, HIGH_FROM_LOW($dst$$Register));
|
||||
__ sarl(HIGH_FROM_LOW($dst$$Register), 31); // src sign
|
||||
__ idivl($tmp$$Register);
|
||||
__ xchgl($dst$$Register, $tmp2$$Register);
|
||||
__ idivl($tmp$$Register);
|
||||
__ cdql();
|
||||
__ addl(HIGH_FROM_LOW($dst$$Register),$tmp2$$Register);
|
||||
__ jmpb(Ldone);
|
||||
|
||||
__ bind(Lfast);
|
||||
// fast path: src is positive and result fits into 32 bit
|
||||
__ idivl($tmp$$Register);
|
||||
__ xorl(HIGH_FROM_LOW($dst$$Register),HIGH_FROM_LOW($dst$$Register));
|
||||
|
||||
__ bind(Ldone);
|
||||
if (con < 0) {
|
||||
__ lneg(HIGH_FROM_LOW($dst$$Register), $dst$$Register);
|
||||
}
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
// Remainder Register Long (remainder fit into 32 bits)
|
||||
instruct modL_eReg_imm32( eADXRegL dst, immL32 imm, eRegI tmp, eRegI tmp2, eFlagsReg cr ) %{
|
||||
match(Set dst (ModL dst imm));
|
||||
effect( TEMP tmp, TEMP tmp2, KILL cr );
|
||||
ins_cost(1000);
|
||||
format %{ "MOV $tmp,abs($imm) # lrem EDX:EAX,$imm\n\t"
|
||||
"CMP $tmp,EDX\n\t"
|
||||
"JA,s fast\n\t"
|
||||
"MOV $tmp2,EAX\n\t"
|
||||
"MOV EAX,EDX\n\t"
|
||||
"SAR EDX,31\n\t"
|
||||
"IDIV $tmp\n\t"
|
||||
"MOV EAX,$tmp2\n"
|
||||
"fast:\n\t"
|
||||
"IDIV $tmp\n\t"
|
||||
"MOV EAX,EDX\n\t"
|
||||
"SAR EDX,31\n\t" %}
|
||||
ins_encode %{
|
||||
int con = (int)$imm$$constant;
|
||||
assert(con != 0 && con != -1 && con != min_jint, "wrong divisor");
|
||||
int pcon = (con > 0) ? con : -con;
|
||||
Label Lfast;
|
||||
|
||||
__ movl($tmp$$Register, pcon);
|
||||
__ cmpl($tmp$$Register, HIGH_FROM_LOW($dst$$Register));
|
||||
__ jccb(Assembler::above, Lfast); // src is positive and result fits into 32 bit
|
||||
|
||||
__ movl($tmp2$$Register, $dst$$Register); // save
|
||||
__ movl($dst$$Register, HIGH_FROM_LOW($dst$$Register));
|
||||
__ sarl(HIGH_FROM_LOW($dst$$Register), 31); // src sign
|
||||
__ idivl($tmp$$Register);
|
||||
__ movl($dst$$Register, $tmp2$$Register);
|
||||
|
||||
__ bind(Lfast);
|
||||
__ idivl($tmp$$Register);
|
||||
__ movl($dst$$Register, HIGH_FROM_LOW($dst$$Register));
|
||||
__ sarl(HIGH_FROM_LOW($dst$$Register), 31); // result sign
|
||||
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
// Integer Shift Instructions
|
||||
// Shift Left by one
|
||||
instruct shlI_eReg_1(eRegI dst, immI1 shift, eFlagsReg cr) %{
|
||||
|
|
|
@ -2065,6 +2065,13 @@ bool Matcher::is_spillable_arg(int reg)
|
|||
return can_be_java_arg(reg);
|
||||
}
|
||||
|
||||
bool Matcher::use_asm_for_ldiv_by_con( jlong divisor ) {
|
||||
// In 64 bit mode code which uses multiply when
|
||||
// divisor is constant is faster than hardware
|
||||
// DIV instruction (it uses MulHiL).
|
||||
return false;
|
||||
}
|
||||
|
||||
// Register for DIVI projection of divmodI
|
||||
RegMask Matcher::divI_proj_mask() {
|
||||
return INT_RAX_REG_mask;
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2006, 2009, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -65,10 +65,6 @@ int VM_Version::platform_features(int features) {
|
|||
// getisax(2), SI_ARCHITECTURE_32, and SI_ARCHITECTURE_64 are
|
||||
// supported on Solaris 10 and later.
|
||||
if (os::Solaris::supports_getisax()) {
|
||||
#ifndef PRODUCT
|
||||
if (PrintMiscellaneous && Verbose)
|
||||
tty->print_cr("getisax(2) supported.");
|
||||
#endif
|
||||
|
||||
// Check 32-bit architecture.
|
||||
do_sysinfo(SI_ARCHITECTURE_32, "sparc", &features, v8_instructions_m);
|
||||
|
@ -81,6 +77,11 @@ int VM_Version::platform_features(int features) {
|
|||
uint_t avn = os::Solaris::getisax(&av, 1);
|
||||
assert(avn == 1, "should only return one av");
|
||||
|
||||
#ifndef PRODUCT
|
||||
if (PrintMiscellaneous && Verbose)
|
||||
tty->print_cr("getisax(2) returned: " PTR32_FORMAT, av);
|
||||
#endif
|
||||
|
||||
if (av & AV_SPARC_MUL32) features |= hardware_mul32_m;
|
||||
if (av & AV_SPARC_DIV32) features |= hardware_div32_m;
|
||||
if (av & AV_SPARC_FSMULD) features |= hardware_fsmuld_m;
|
||||
|
@ -88,11 +89,22 @@ int VM_Version::platform_features(int features) {
|
|||
if (av & AV_SPARC_POPC) features |= hardware_popc_m;
|
||||
if (av & AV_SPARC_VIS) features |= vis1_instructions_m;
|
||||
if (av & AV_SPARC_VIS2) features |= vis2_instructions_m;
|
||||
|
||||
// Next values are not defined before Solaris 10
|
||||
// but Solaris 8 is used for jdk6 update builds.
|
||||
#ifndef AV_SPARC_ASI_BLK_INIT
|
||||
#define AV_SPARC_ASI_BLK_INIT 0x0080 /* ASI_BLK_INIT_xxx ASI */
|
||||
#endif
|
||||
#ifndef AV_SPARC_FMAF
|
||||
#define AV_SPARC_FMAF 0x0100 /* Sparc64 Fused Multiply-Add */
|
||||
#endif
|
||||
if (av & AV_SPARC_ASI_BLK_INIT) features |= blk_init_instructions_m;
|
||||
if (av & AV_SPARC_FMAF) features |= fmaf_instructions_m;
|
||||
} else {
|
||||
// getisax(2) failed, use the old legacy code.
|
||||
#ifndef PRODUCT
|
||||
if (PrintMiscellaneous && Verbose)
|
||||
tty->print_cr("getisax(2) not supported.");
|
||||
tty->print_cr("getisax(2) is not supported.");
|
||||
#endif
|
||||
|
||||
char tmp;
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 1997, 2009, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -388,7 +388,8 @@ static Node *transform_long_divide( PhaseGVN *phase, Node *dividend, jlong divis
|
|||
if (!d_pos) {
|
||||
q = new (phase->C, 3) SubLNode(phase->longcon(0), phase->transform(q));
|
||||
}
|
||||
} else {
|
||||
} else if ( !Matcher::use_asm_for_ldiv_by_con(d) ) { // Use hardware DIV instruction when
|
||||
// it is faster than code generated below.
|
||||
// Attempt the jlong constant divide -> multiply transform found in
|
||||
// "Division by Invariant Integers using Multiplication"
|
||||
// by Granlund and Montgomery
|
||||
|
@ -558,7 +559,7 @@ Node *DivLNode::Ideal( PhaseGVN *phase, bool can_reshape) {
|
|||
|
||||
set_req(0,NULL); // Dividing by a not-zero constant; no faulting
|
||||
|
||||
// Dividing by MININT does not optimize as a power-of-2 shift.
|
||||
// Dividing by MINLONG does not optimize as a power-of-2 shift.
|
||||
if( l == min_jlong ) return NULL;
|
||||
|
||||
return transform_long_divide( phase, in(1), l );
|
||||
|
@ -1062,7 +1063,7 @@ Node *ModLNode::Ideal(PhaseGVN *phase, bool can_reshape) {
|
|||
// Fell thru, the unroll case is not appropriate. Transform the modulo
|
||||
// into a long multiply/int multiply/subtract case
|
||||
|
||||
// Cannot handle mod 0, and min_jint isn't handled by the transform
|
||||
// Cannot handle mod 0, and min_jlong isn't handled by the transform
|
||||
if( con == 0 || con == min_jlong ) return NULL;
|
||||
|
||||
// Get the absolute value of the constant; at this point, we can use this
|
||||
|
@ -1075,7 +1076,7 @@ Node *ModLNode::Ideal(PhaseGVN *phase, bool can_reshape) {
|
|||
|
||||
// If this is a power of two, then maybe we can mask it
|
||||
if( is_power_of_2_long(pos_con) ) {
|
||||
log2_con = log2_long(pos_con);
|
||||
log2_con = exact_log2_long(pos_con);
|
||||
|
||||
const Type *dt = phase->type(in(1));
|
||||
const TypeLong *dtl = dt->isa_long();
|
||||
|
@ -1088,7 +1089,7 @@ Node *ModLNode::Ideal(PhaseGVN *phase, bool can_reshape) {
|
|||
// Save in(1) so that it cannot be changed or deleted
|
||||
hook->init_req(0, in(1));
|
||||
|
||||
// Divide using the transform from DivI to MulL
|
||||
// Divide using the transform from DivL to MulL
|
||||
Node *result = transform_long_divide( phase, in(1), pos_con );
|
||||
if (result != NULL) {
|
||||
Node *divide = phase->transform(result);
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 1997, 2009, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -298,6 +298,10 @@ public:
|
|||
// Register for MODL projection of divmodL
|
||||
static RegMask modL_proj_mask();
|
||||
|
||||
// Use hardware DIV instruction when it is faster than
|
||||
// code which uses multiply for division by constant.
|
||||
static bool use_asm_for_ldiv_by_con( jlong divisor );
|
||||
|
||||
static const RegMask method_handle_invoke_SP_save_mask();
|
||||
|
||||
// Java-Interpreter calling convention
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue