/*
 * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/opcodes.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"

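// Map a vector length in bytes to the assembler's AVX length encoding.
// Lengths of 4 and 8 bytes still use the 128-bit encoding: sub-128-bit
// vectors are carried in the low lanes of an XMM register.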
inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

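// setvectmask computes (1 << src) - 1 into k1; e.g. src == 3 yields 0b111,
// enabling the low three lanes for masked post-loop iterations.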
void C2_MacroAssembler::setvectmask(Register dst, Register src) {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::movl(dst, 1);
  Assembler::shlxl(dst, dst, src);
  Assembler::decl(dst);
  Assembler::kmovdl(k1, dst);
  Assembler::movl(dst, src);
}

void C2_MacroAssembler::restorevectmask() {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::knotwl(k1, k0);
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

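// The low bits of the time-stamp counter serve as a cheap pseudo-random
// source here: for a power-of-two count the fall-through probability is
// 1/count, which is how the counter updates below are sampled.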
// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

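// The test below reduces to: abort_count * 100 >= total_count *
// RTMTotalCountIncrRate * RTMAbortRatio, i.e. the sampled abort
// percentage has reached RTMAbortRatio.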
// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count * RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != NULL) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != NULL) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation
// input: abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

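// Note the incrementl at doneRetry below: the count is zero once retries
// are exhausted, so bumping it to one clears ZF and signals failure to
// the caller's fast path.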
// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

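// The transactional fast path below elides the stack lock entirely: inside
// the xbegin/xend region an unlocked markword means we are done without a
// single store, and any real conflict aborts back to L_on_abort.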
// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // fetch markword
  andptr(tmpReg, markWord::biased_lock_mask_in_place);               // look at 3 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);                          // bits = 001 unlocked
  jcc(Assembler::equal, DONE_LABEL);                                 // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2);    // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}

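// The inflated path likewise elides the monitor: inside the transaction a
// NULL _owner field is enough to proceed. Only after the transactional
// attempts are spent do we fall back to a real CAS on _owner.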
// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif // INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path. If the fast path fails then we pass
//    control to the slow path, typically in C. In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock. In the case of success, control
//    will drop through the node. ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.

// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg,
                                  BiasedLockingCounters* counters,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  if (counters != NULL) {
    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * biased
  //    -- by Self
  //    -- by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL;

  if (DiagnoseSyncOnPrimitiveWrappers != 0) {
    load_klass(tmpReg, objReg, cx1Reg);
    movl(tmpReg, Address(tmpReg, Klass::access_flags_offset()));
    testl(tmpReg, JVM_ACC_IS_BOX_CLASS);
    jcc(Assembler::notZero, DONE_LABEL);
  }

  // it's stack-locked, biased or neutral
  // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
  // order to reduce the number of conditional branches in the most common cases.
  // Beware -- there's a subtle invariant that fetch of the markword
  // at [FETCH], below, will never observe a biased encoding (*101b).
  // If this invariant is not held we risk exclusion (safety) failure.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));  // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
  jccb(Assembler::notZero, IsInflated);

  // Attempt stack-locking ...
  orptr (tmpReg, markWord::unlocked_value);
  movptr(Address(boxReg, 0), tmpReg);       // Anticipate successful CAS
  lock();
  cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jcc(Assembler::equal, DONE_LABEL);        // Success

  // Recursive locking.
  // The object is stack-locked: markword contains stack pointer to BasicLock.
  // Locked by current thread if difference with current SP is less than one page.
  subptr(tmpReg, rsp);
  // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
  andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
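  // With the default 4K page (an illustration; the constant scales with
  // os::vm_page_size()), the LP64 mask is 7 - 4096 = 0xFFFFF007: the high
  // bits require the BasicLock to lie within one page of rsp, and the low
  // three bits require the mark's lock bits to be clear. A zero result
  // therefore means "recursively stack-locked by this thread", and that
  // zero is stored as the displaced header below.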
  movptr(Address(boxReg, 0), tmpReg);
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3);            // box->_displaced_header = 3
  // If we weren't able to swing _owner from NULL to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, DONE_LABEL);
  // update _owner from BasicLock to thread
  get_thread (scrReg);                      // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);                   // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  // Intentional fall-through into DONE_LABEL ...
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind(DONE_LABEL);

  // At DONE_LABEL the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, CheckSucc;

  // Critically, the biased locking test must have precedence over
  // and appear before the (box->dhw == 0) recursive stack-lock test.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_exit(objReg, tmpReg, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
  jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
  testptr(tmpReg, markWord::monitor_value);                         // Inflated?
  jccb  (Assembler::zero, Stacked);

  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmpb(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // I'd like to add more cases in fast_lock() and fast_unlock() --
  // such as recursive enter and exit -- but we have to be wary of
  // I$ bloat, T$ effects and BP$ effects.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  get_thread (boxReg);

  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, CheckSucc);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  bind (Stacked);
  // It's not inflated and it's not recursively stack-locked and it's not biased.
  // It must be stack-locked.
  // Try to reset the header to displaced header.
  // The "box" value on the stack is stable, so we can reload
  // and be assured we observe the same value as above.
  movptr(tmpReg, Address(boxReg, 0));
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-through into DONE_LABEL

  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind (CheckSucc);
#else // _LP64
  // It's inflated
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  Label LSuccess, LGoSlowPath;
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generated more
  // coherence traffic on the lock *and* artificially extended the critical section
  // length by passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

  bind  (Stacked);
  movptr(tmpReg, Address (boxReg, 0));    // re-fetch
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box

#endif
  bind(DONE_LABEL);
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

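// Float/double abs and neg are implemented bitwise: AND with a sign mask
// (0x7FF... in every lane) clears the sign bit for abs, while XOR with a
// sign flip (0x800... in every lane) toggles it for neg. The constant
// vectors live in StubRoutines and are addressed via the scratch register.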
void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
  }
}

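// SSE/AVX provide packed signed min/max only up to dword lanes; there is
// no pminsq/pmaxsq before AVX-512. The T_LONG cases therefore emulate
// min/max with a signed compare (pcmpgtq) whose result steers blendvpd.
// The non-VEX blendvpd implicitly uses xmm0 as the mask, hence the
// assert(tmp == xmm0) below.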
void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

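// The FP sequences below implement Java's Math.min/max semantics rather
// than raw minps/maxps: -0.0 must order below +0.0 and a NaN operand must
// win. The initial blends (keyed on one operand's sign bits) order the
// inputs so the hardware min/max picks the right signed zero, and the
// final UNORD_Q compare routes NaNs into the result.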
// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  if (!is_double_word && is_min) {
    vblendvps(atmp, a, b, a, vlen_enc);
    vblendvps(btmp, b, a, a, vlen_enc);
    vminps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (!is_double_word && !is_min) {
    vblendvps(btmp, b, a, b, vlen_enc);
    vblendvps(atmp, a, b, b, vlen_enc);
    vmaxps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (is_double_word && is_min) {
    vblendvpd(atmp, a, b, a, vlen_enc);
    vblendvpd(btmp, b, a, a, vlen_enc);
    vminpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    vblendvpd(btmp, b, a, b, vlen_enc);
    vblendvpd(atmp, a, b, b, vlen_enc);
    vmaxpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  }
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vprotate_imm(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     int shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprold(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprord(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vprotate_var(int opcode, BasicType etype, XMMRegister dst, XMMRegister src,
                                     XMMRegister shift, int vector_len) {
  if (opcode == Op_RotateLeftV) {
    if (etype == T_INT) {
      evprolvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprolvq(dst, src, shift, vector_len);
    }
  } else {
    assert(opcode == Op_RotateRightV, "opcode should be Op_RotateRightV");
    if (etype == T_INT) {
      evprorvd(dst, src, shift, vector_len);
    } else {
      assert(etype == T_LONG, "expected type T_LONG");
      evprorvq(dst, src, shift, vector_len);
    }
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, shift);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, shift);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, shift);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, shift);  // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, shift);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, shift);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

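// AVX2 has no variable arithmetic right shift for 64-bit lanes (vpsravq is
// AVX-512 only). Op_RShiftVL below falls back to the identity
//   (x >>s n) == ((x >>u n) ^ m) - m,  where m = 0x8000000000000000 >>u n,
// which turns a logical shift into an arithmetic one by re-propagating the
// sign via the borrow in the subtract.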
void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

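// x86 has no byte-granular shift instructions, so the byte variants below
// widen bytes to wider lanes, shift there, mask the results back into byte
// range, and narrow again with a pack.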
// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);
  }
}

void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
|
|
switch(typ) {
|
|
case T_BYTE:
|
|
pinsrb(dst, val, idx);
|
|
break;
|
|
case T_SHORT:
|
|
pinsrw(dst, val, idx);
|
|
break;
|
|
case T_INT:
|
|
pinsrd(dst, val, idx);
|
|
break;
|
|
case T_LONG:
|
|
pinsrq(dst, val, idx);
|
|
break;
|
|
default:
|
|
assert(false,"Should not reach here.");
|
|
break;
|
|
}
|
|
}
|
|
|
|
void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
|
|
switch(typ) {
|
|
case T_BYTE:
|
|
vpinsrb(dst, src, val, idx);
|
|
break;
|
|
case T_SHORT:
|
|
vpinsrw(dst, src, val, idx);
|
|
break;
|
|
case T_INT:
|
|
vpinsrd(dst, src, val, idx);
|
|
break;
|
|
case T_LONG:
|
|
vpinsrq(dst, src, val, idx);
|
|
break;
|
|
default:
|
|
assert(false,"Should not reach here.");
|
|
break;
|
|
}
|
|
}

void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
  switch(typ) {
    case T_INT:
      vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_FLOAT:
      vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_LONG:
      vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    case T_DOUBLE:
      vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}
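
// A note on the addressing above: the "d"-suffixed gathers (vpgatherdd,
// vpgatherdq, vgatherdps, vgatherdpd) consume 32-bit indices regardless of
// element size, which is why the 64-bit element forms still index with dword
// lanes but scale by times_8. The same holds for the AVX-512 gather/scatter
// variants below.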

void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
  switch(typ) {
    case T_INT:
      evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_FLOAT:
      evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_LONG:
      evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    case T_DOUBLE:
      evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
  switch(typ) {
    case T_INT:
      evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_FLOAT:
      evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_LONG:
      evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    case T_DOUBLE:
      evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt) {
  if (vlen_in_bytes <= 16) {
    pxor (dst, dst);
    psubb(dst, src);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  pmovsxbw(dst, dst); break;
      case T_INT:    pmovsxbd(dst, dst); break;
      case T_FLOAT:  pmovsxbd(dst, dst); break;
      case T_LONG:   pmovsxbq(dst, dst); break;
      case T_DOUBLE: pmovsxbq(dst, dst); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  } else {
    int vlen_enc = vector_length_encoding(vlen_in_bytes);

    vpxor (dst, dst, dst, vlen_enc);
    vpsubb(dst, dst, src, vlen_enc);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
      case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
      case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
      case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
      case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  }
}
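
// The pxor/psubb pair above relies on the mask arriving as 0/1 bytes:
// 0 - 1 == -1 (0xFF), so the negation turns a boolean byte into an all-ones
// byte, and the following sign extension widens 0xFF into an all-ones element
// of the target width. In scalar terms: mask_elem = -(int)bool_byte.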

void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
  ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
  if (vlen_in_bytes <= 16) {
    movdqu(dst, addr, scratch);
  } else if (vlen_in_bytes == 32) {
    vmovdqu(dst, addr, scratch);
  } else {
    assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
    evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
  }
}

// Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
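
// The reducers below all follow the same log-time scheme: repeatedly fold the
// upper half of the vector onto the lower half with the element-wise operation
// (extracting the high 256/128 bits, or shuffling within 128 bits), until one
// element remains, then combine it with the scalar input src1. A 16-int
// reduction, for instance, folds 512 -> 256 -> 128 -> 64 -> 32 bits in four
// steps before the scalar combine.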

void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  int vector_len = Assembler::AVX_128bit;

  switch (opcode) {
    case Op_AndReductionV:  pand(dst, src); break;
    case Op_OrReductionV:   por (dst, src); break;
    case Op_XorReductionV:  pxor(dst, src); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:  pminsb(dst, src); break;
        case T_SHORT: pminsw(dst, src); break;
        case T_INT:   pminsd(dst, src); break;
        case T_LONG:  assert(UseAVX > 2, "required");
                      vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:  pmaxsb(dst, src); break;
        case T_SHORT: pmaxsw(dst, src); break;
        case T_INT:   pmaxsd(dst, src); break;
        case T_LONG:  assert(UseAVX > 2, "required");
                      vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVF: addss(dst, src); break;
    case Op_AddReductionVD: addsd(dst, src); break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:  paddb(dst, src); break;
        case T_SHORT: paddw(dst, src); break;
        case T_INT:   paddd(dst, src); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: paddq(dst, src); break;
    case Op_MulReductionVF: mulss(dst, src); break;
    case Op_MulReductionVD: mulsd(dst, src); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT: pmullw(dst, src); break;
        case T_INT:   pmulld(dst, src); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: assert(UseAVX > 2, "required");
                            vpmullq(dst, dst, src, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:  vpminsb(dst, src1, src2, vector_len); break;
        case T_SHORT: vpminsw(dst, src1, src2, vector_len); break;
        case T_INT:   vpminsd(dst, src1, src2, vector_len); break;
        case T_LONG:  assert(UseAVX > 2, "required");
                      vpminsq(dst, src1, src2, vector_len); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:  vpmaxsb(dst, src1, src2, vector_len); break;
        case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break;
        case T_INT:   vpmaxsd(dst, src1, src2, vector_len); break;
        case T_LONG:  assert(UseAVX > 2, "required");
                      vpmaxsq(dst, src1, src2, vector_len); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:  vpaddb(dst, src1, src2, vector_len); break;
        case T_SHORT: vpaddw(dst, src1, src2, vector_len); break;
        case T_INT:   vpaddd(dst, src1, src2, vector_len); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT: vpmullw(dst, src1, src2, vector_len); break;
        case T_INT:   vpmulld(dst, src1, src2, vector_len); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
                                  XMMRegister dst, XMMRegister src,
                                  XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (opcode) {
    case Op_AddReductionVF:
    case Op_MulReductionVF:
      reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
      break;

    case Op_AddReductionVD:
    case Op_MulReductionVD:
      reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
      break;

    default: assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::reduceB(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
                                   Register dst, Register src1, XMMRegister src2,
                                   XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::reduceS(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::reduceI(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

#ifdef _LP64
void C2_MacroAssembler::reduceL(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}
#endif // _LP64

void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp2 == xnoreg, "");
      reduce2F(opcode, dst, src, vtmp1);
      break;
    case 4:
      assert(vtmp2 == xnoreg, "");
      reduce4F(opcode, dst, src, vtmp1);
      break;
    case 8:
      reduce8F(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 16:
      reduce16F(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp2 == xnoreg, "");
      reduce2D(opcode, dst, src, vtmp1);
      break;
    case 4:
      reduce4D(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 8:
      reduce8D(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, vtmp1);
  } else {
    pshufd(vtmp1, src2, 0x1);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
  }
  movdl(vtmp2, src1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  movdl(dst, vtmp1);
}
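
// reduce2I takes two routes: Op_AddReductionVI can use the horizontal add
// phaddd (which sums adjacent dword pairs in one instruction), while min, max
// and the logical/multiply reductions have no horizontal forms, so the second
// element is brought down with pshufd(..., 0x1) and combined element-wise.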

void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, src2);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    pshufd(vtmp2, src2, 0xE);
    reduce_operation_128(T_INT, opcode, vtmp2, src2);
    reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
}

void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
    vextracti128_high(vtmp2, vtmp1);
    vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    vextracti128_high(vtmp1, src2);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
    reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
}

void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
  reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0x1);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  movdqu(vtmp1, vtmp2);
  psrldq(vtmp1, 2);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdqu(vtmp2, vtmp1);
  psrldq(vtmp2, 1);
  reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2);
  movdl(vtmp2, src1);
  pmovsxbd(vtmp1, vtmp1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrb(dst, vtmp1, 0x0);
  movsbl(dst, dst);
}

void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp1, src2, 0xE);
  reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
  reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp2, src2);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
  reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pmovsxbw(vtmp2, src2);
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 1) {
    int vector_len = Assembler::AVX_256bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    pmovsxbw(vtmp2, src2);
    reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
    pshufd(vtmp2, src2, 0xE);
    pmovsxbw(vtmp2, vtmp2);
    reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}

void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
    int vector_len = Assembler::AVX_512bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    assert(UseAVX >= 2,"Should not reach here.");
    mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
    vextracti128_high(vtmp2, src2);
    mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}

void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
  vextracti64x4_high(vtmp2, src2);
  mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, vtmp1);
    phaddw(vtmp1, vtmp1);
  } else {
    pshufd(vtmp2, src2, 0x1);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
    movdqu(vtmp1, vtmp2);
    psrldq(vtmp1, 2);
    reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
  }
  movdl(vtmp2, src1);
  pmovsxwd(vtmp1, vtmp1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrw(dst, vtmp1, 0x0);
  movswl(dst, dst);
}

void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, src2);
  } else {
    pshufd(vtmp1, src2, 0xE);
    reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
  }
  reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    int vector_len = Assembler::AVX_256bit;
    vphaddw(vtmp2, src2, src2, vector_len);
    vpermq(vtmp2, vtmp2, 0xD8, vector_len);
  } else {
    vextracti128_high(vtmp2, src2);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
  }
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  int vector_len = Assembler::AVX_256bit;
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
  reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

#ifdef _LP64
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0xE);
  reduce_operation_128(T_LONG, opcode, vtmp2, src2);
  movdq(vtmp1, src1);
  reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
  movdq(dst, vtmp1);
}

void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp1, src2);
  reduce_operation_128(T_LONG, opcode, vtmp1, src2);
  reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
  reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
#endif // _LP64

void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_FLOAT, opcode, dst, src);
  pshufd(vtmp, src, 0x1);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}

void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce2F(opcode, dst, src, vtmp);
  pshufd(vtmp, src, 0x2);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
  pshufd(vtmp, src, 0x3);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}

void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4F(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce4F(opcode, dst, vtmp2, vtmp1);
}

void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce8F(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
  pshufd(vtmp, src, 0xE);
  reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
}

void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce2D(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce2D(opcode, dst, vtmp2, vtmp1);
}

void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4D(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
                                          XMMRegister dst, XMMRegister src,
                                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                          XMMRegister xmm_0, XMMRegister xmm_1) {
  int permconst[] = {1, 14};
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;

  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 16) {
    vlen_enc = Assembler::AVX_256bit;
  }

  for (int i = log2(vlen) - 1; i >= 0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst;
    }
    if (i == 3) {
      vextracti64x4_high(wtmp, wsrc);
    } else if (i == 2) {
      vextracti128_high(wtmp, wsrc);
    } else { // i = [0,1]
      vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
    }
    vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit;
  }
  if (is_dst_valid) {
    vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
  }
}
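
// How the float min/max fold above picks its shuffle: permconst[1] == 14
// (binary 1110) swaps the two 64-bit halves of a 128-bit lane, permconst[0]
// == 1 swaps the last adjacent pair of floats, and the i == 3 / i == 2 steps
// peel off the upper 256/128 bits first. The combine goes through vminmax_fp
// rather than a plain min/max so that the -0.0 and NaN handling Java requires
// is preserved at every step.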

void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                           XMMRegister xmm_0, XMMRegister xmm_1) {
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1;
  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 8) {
    vlen_enc = Assembler::AVX_256bit;
  }
  for (int i = log2(vlen) - 1; i >= 0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst;
    }
    if (i == 1) {
      vextracti128_high(wtmp, wsrc);
    } else if (i == 2) {
      vextracti64x4_high(wtmp, wsrc);
    } else {
      assert(i == 0, "%d", i);
      vpermilpd(wtmp, wsrc, 1, vlen_enc);
    }
    vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit;
  }
  if (is_dst_valid) {
    vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
  }
}

void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
  switch (bt) {
    case T_BYTE:  pextrb(dst, src, idx); break;
    case T_SHORT: pextrw(dst, src, idx); break;
    case T_INT:   pextrd(dst, src, idx); break;
    case T_LONG:  pextrq(dst, src, idx); break;

    default:
      assert(false,"Should not reach here.");
      break;
  }
}

XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int lane = elemindex / elem_per_lane;
  int eindex = elemindex % elem_per_lane;

  if (lane >= 2) {
    assert(UseAVX > 2, "required");
    vextractf32x4(dst, src, lane & 3);
    return dst;
  } else if (lane > 0) {
    assert(UseAVX > 0, "required");
    vextractf128(dst, src, lane);
    return dst;
  } else {
    return src;
  }
}
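
// get_lane only narrows the source down to the right 128-bit lane; selecting
// the element inside that lane (eindex) is left to the get_elem overloads
// below.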

void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert(is_integral_type(typ),"required");

  if (eindex == 0) {
    if (typ == T_LONG) {
      movq(dst, src);
    } else {
      movdl(dst, src);
      if (typ == T_BYTE)
        movsbl(dst, dst);
      else if (typ == T_SHORT)
        movswl(dst, dst);
    }
  } else {
    extract(typ, dst, src, eindex);
  }
}

void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16/esize;
  int eindex = elemindex % elem_per_lane;
  assert((typ == T_FLOAT || typ == T_DOUBLE),"required");

  if (eindex == 0) {
    movq(dst, src);
  } else {
    if (typ == T_FLOAT) {
      if (UseAVX == 0) {
        movdqu(dst, src);
        pshufps(dst, dst, eindex);
      } else {
        vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
      }
    } else {
      if (UseAVX == 0) {
        movdqu(dst, src);
        psrldq(dst, eindex*esize);
      } else {
        vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
      }
      movq(dst, dst);
    }
  }
  // Zero upper bits
  if (typ == T_FLOAT) {
    if (UseAVX == 0) {
      assert((vtmp != xnoreg) && (tmp != noreg), "required.");
      movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);
      pand(dst, vtmp);
    } else {
      assert((tmp != noreg), "required.");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);
    }
  }
}

void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
  switch(typ) {
    case T_BYTE:
      evpcmpb(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
      break;
    case T_SHORT:
      evpcmpw(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
      break;
    case T_INT:
    case T_FLOAT:
      evpcmpd(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
      break;
    case T_LONG:
    case T_DOUBLE:
      evpcmpq(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
  switch(typ) {
    case T_BYTE:
      evpblendmb(dst, kmask, src1, src2, merge, vector_len);
      break;
    case T_SHORT:
      evpblendmw(dst, kmask, src1, src2, merge, vector_len);
      break;
    case T_INT:
    case T_FLOAT:
      evpblendmd(dst, kmask, src1, src2, merge, vector_len);
      break;
    case T_LONG:
    case T_DOUBLE:
      evpblendmq(dst, kmask, src1, src2, merge, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

//-------------------------------------------------------------------------------------------

// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
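
// A pcmpestri refresher for the two indexof routines below (a summary, not a
// normative description): in the "equal ordered" mode used here, CF is set
// when the needle matches somewhere in the 16-byte window (with rcx holding
// the first match offset) and OF mirrors a match at offset 0, which is why
// the code branches on the below/carry and overflow conditions.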
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2,  Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring remaining elements and
    // cnt1 is number of string remaining elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only string if it does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    negptr(cnt2);
    addptr(cnt2, stride);

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload string pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(EXIT);

} // string_indexofC8

// Small strings are loaded through stack if they cross page boundary.
void C2_MacroAssembler::string_indexof(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       int int_cnt2,  Register result,
                                       XMMRegister vec, Register tmp,
                                       int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  //
  // int_cnt2 is length of small (< 8 chars) constant substring
  // or (-1) for non constant substring in which case its length
  // is in cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
        FOUND_CANDIDATE;

  { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through stack.
    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;

    movptr(tmp, rsp); // save old SP

    if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
      if (int_cnt2 == (1>>scale2)) { // One byte
        assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
        load_unsigned_byte(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
        // Not enough header space in 32-bit VM: 12+3 = 15.
        movl(result, Address(str2, -1));
        shrl(result, 8);
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
        load_unsigned_short(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
        movdl(vec, Address(str2, 0)); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
        movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7})
        // Array header size is 12 bytes in 32-bit VM
        // + 6 bytes for 3 chars == 18 bytes,
        // enough space to load vec and shift.
        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
        if (ae == StrIntrinsicNode::UL) {
          int tail_off = int_cnt2-8;
          pmovzxbw(vec, Address(str2, tail_off));
          psrldq(vec, -2*tail_off);
        }
        else {
          int tail_off = int_cnt2*(1<<scale2);
          movdqu(vec, Address(str2, tail_off-16));
          psrldq(vec, 16-tail_off);
        }
      }
    } else { // not constant substring
      cmpl(cnt2, stride);
      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough

      // We can read beyond string if str+16 does not cross page boundary
      // since heaps are aligned and mapped by pages.
      assert(os::vm_page_size() < (int)G, "default page should be small");
      movl(result, str2); // We need only low 32 bits
      andl(result, (os::vm_page_size()-1));
      cmpl(result, (os::vm_page_size()-16));
      jccb(Assembler::belowEqual, CHECK_STR);

      // Move small strings to stack to allow load 16 bytes into vec.
      subptr(rsp, 16);
      int stk_offset = wordSize-(1<<scale2);
      push(cnt2);

      bind(COPY_SUBSTR);
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
        load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
        movb(Address(rsp, cnt2, scale2, stk_offset), result);
      } else if (ae == StrIntrinsicNode::UU) {
        load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
        movw(Address(rsp, cnt2, scale2, stk_offset), result);
      }
      decrement(cnt2);
      jccb(Assembler::notZero, COPY_SUBSTR);

      pop(cnt2);
      movptr(str2, rsp);  // New substring address
    } // non constant

    bind(CHECK_STR);
    cmpl(cnt1, stride);
    jccb(Assembler::aboveEqual, BIG_STRINGS);

    // Check cross page boundary.
    movl(result, str1); // We need only low 32 bits
    andl(result, (os::vm_page_size()-1));
    cmpl(result, (os::vm_page_size()-16));
    jccb(Assembler::belowEqual, BIG_STRINGS);

    subptr(rsp, 16);
    int stk_offset = -(1<<scale1);
    if (int_cnt2 < 0) { // not constant
      push(cnt2);
      stk_offset += wordSize;
    }
    movl(cnt2, cnt1);

    bind(COPY_STR);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
      movb(Address(rsp, cnt2, scale1, stk_offset), result);
    } else {
      load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
      movw(Address(rsp, cnt2, scale1, stk_offset), result);
    }
    decrement(cnt2);
    jccb(Assembler::notZero, COPY_STR);

    if (int_cnt2 < 0) { // not constant
      pop(cnt2);
    }
    movptr(str1, rsp);  // New string address

    bind(BIG_STRINGS);
    // Load substring.
    if (int_cnt2 < 0) { // -1
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, 0));
      } else {
        movdqu(vec, Address(str2, 0));
      }
      push(cnt2);       // substr count
      push(str2);       // substr addr
      push(str1);       // string addr
    } else {
      // Small (< 8 chars) constant substrings are loaded already.
      movl(cnt2, int_cnt2);
    }
    push(tmp);  // original SP

  } // Finished loading

  //========================================================
  // Start search
  //

  movptr(result, str1); // string addr

  if (int_cnt2  < 0) {  // Only for non constant substring
    jmpb(SCAN_TO_SUBSTR);

    // SP saved at sp+0
    // String saved at sp+1*wordSize
    // Substr saved at sp+2*wordSize
    // Substr count saved at sp+3*wordSize

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movptr(str2, Address(rsp, 2*wordSize));
    movl(cnt2, Address(rsp, 3*wordSize));
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.
    subptr(str1, result); // Restore counter
    if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
      shrl(str1, 1);
    }
    addl(cnt1, str1);
    decrementl(cnt1);   // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, (1<<scale1));
  } // non constant

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);

  bind(ADJUST_STR);
  cmpl(cnt1, stride); // Do not read beyond string
  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back-up string to avoid reading beyond string.
  lea(result, Address(result, cnt1, scale1, -16));
  movl(cnt1, stride);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));
  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, stride-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be != 0");

    addl(tmp, cnt2);
    // Found result if we matched whole substring.
    cmpl(tmp, stride);
    jcc(Assembler::lessEqual, RET_FOUND);

    // Repeat search for small substring (<= 8 chars)
    // from new point 'str1' without reloading substring.
    cmpl(cnt2, stride);
    // Have to check that we don't read beyond string.
    jccb(Assembler::lessEqual, ADJUST_STR);

    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
    // Compare the rest of substring (> 8 chars).
    movptr(str1, result);

    cmpl(tmp, cnt2);
    // First 8 chars are already matched.
    jccb(Assembler::equal, CHECK_NEXT);

    bind(SCAN_SUBSTR);
    pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload string pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

    bind(CHECK_NEXT);
    subl(cnt2, stride);
    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
    addptr(str1, 16);
    if (ae == StrIntrinsicNode::UL) {
      addptr(str2, 8);
    } else {
      addptr(str2, 16);
    }
    subl(cnt1, stride);
    cmpl(cnt2, stride); // Do not read beyond substring
    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring.

    if (ae == StrIntrinsicNode::UL) {
      lea(str2, Address(str2, cnt2, scale2, -8));
      lea(str1, Address(str1, cnt2, scale1, -16));
    } else {
      lea(str2, Address(str2, cnt2, scale2, -16));
      lea(str1, Address(str1, cnt2, scale1, -16));
    }
    subl(cnt1, cnt2);
    movl(cnt2, stride);
    addl(cnt1, stride);
    bind(CONT_SCAN_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    jmp(SCAN_SUBSTR);

    bind(RET_FOUND_LONG);
    movptr(str1, Address(rsp, wordSize));
  } // non constant

  bind(RET_FOUND);
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(CLEANUP);
  pop(rsp); // restore SP

} // string_indexof

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 8;

  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR);
    cmpl(cnt1, 2*stride);
    jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
    movdl(vec1, ch);
    vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFF0);  //vector count (in chars)
    andl(cnt1,0x0000000F);  //tail count (in chars)

    bind(SCAN_TO_16_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqw(vec3, vec3, vec1, 1);
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, 2*stride);
    jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
    jmp(SCAN_TO_8_CHAR);
    bind(SCAN_TO_8_CHAR_INIT);
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  bind(SCAN_TO_8_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR);
  if (UseAVX < 2) {
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF8);  //vector count (in chars)
  andl(cnt1,0x00000007);  //tail count (in chars)

  bind(SCAN_TO_8_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqw(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
  bind(SCAN_TO_CHAR);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_short(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 2);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addl(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);
  shrl(result, 1);

  bind(DONE_LABEL);
} // string_indexof_char
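
// Note on the vptest(vec2, vec3) idiom in the two scanners above and below:
// vec2 is kept all-zero, so CF ends up set exactly when vec3 (the pcmpeq
// result) is all zero; branching on carryClear therefore means "some lane
// compared equal".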

void C2_MacroAssembler::stringL_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                             XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 16;

  Label FOUND_CHAR, SCAN_TO_CHAR_INIT, SCAN_TO_CHAR_LOOP,
        SCAN_TO_16_CHAR, SCAN_TO_16_CHAR_LOOP, SCAN_TO_32_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_16_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR_INIT);
    cmpl(cnt1, stride*2);
    jcc(Assembler::less, SCAN_TO_16_CHAR_INIT);
    movdl(vec1, ch);
    vpbroadcastb(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFE0);  //vector count (in chars)
    andl(cnt1,0x0000001F);  //tail count (in chars)

    bind(SCAN_TO_32_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqb(vec3, vec3, vec1, Assembler::AVX_256bit);
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, stride*2);
    jcc(Assembler::notZero, SCAN_TO_32_CHAR_LOOP);
    jmp(SCAN_TO_16_CHAR);

    bind(SCAN_TO_16_CHAR_INIT);
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }

  bind(SCAN_TO_16_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR_INIT); // less than 16 entries left
  if (UseAVX < 2) {
    movdl(vec1, ch);
    pxor(vec2, vec2);
    pshufb(vec1, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF0);  //vector count (in bytes)
  andl(cnt1,0x0000000F);  //tail count (in bytes)

  bind(SCAN_TO_16_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqb(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); // last 16 items...

  bind(SCAN_TO_CHAR_INIT);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_byte(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 1);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addl(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);

  bind(DONE_LABEL);
} // stringL_indexof_char

// helper function for string_compare
void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
                                           Address::ScaleFactor scale, Address::ScaleFactor scale1,
                                           Address::ScaleFactor scale2, Register index, int ae) {
  if (ae == StrIntrinsicNode::LL) {
    load_unsigned_byte(elem1, Address(str1, index, scale, 0));
    load_unsigned_byte(elem2, Address(str2, index, scale, 0));
  } else if (ae == StrIntrinsicNode::UU) {
    load_unsigned_short(elem1, Address(str1, index, scale, 0));
    load_unsigned_short(elem2, Address(str2, index, scale, 0));
  } else {
    load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
    load_unsigned_short(elem2, Address(str2, index, scale2, 0));
  }
}
|
|
|
|
// Compare strings, used for char[] and byte[].
|
|
void C2_MacroAssembler::string_compare(Register str1, Register str2,
|
|
Register cnt1, Register cnt2, Register result,
|
|
XMMRegister vec1, int ae) {
|
|
ShortBranchVerifier sbv(this);
|
|
Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
|
|
Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3
|
|
int stride, stride2, adr_stride, adr_stride1, adr_stride2;
|
|
int stride2x2 = 0x40;
|
|
Address::ScaleFactor scale = Address::no_scale;
|
|
Address::ScaleFactor scale1 = Address::no_scale;
|
|
Address::ScaleFactor scale2 = Address::no_scale;
|
|
|
|
if (ae != StrIntrinsicNode::LL) {
|
|
stride2x2 = 0x20;
|
|
}
|
|
|
|
if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
|
|
shrl(cnt2, 1);
|
|
}
|
|
// Compute the minimum of the string lengths and the
|
|
// difference of the string lengths (stack).
|
|
// Do the conditional move stuff
|
|
movl(result, cnt1);
|
|
subl(cnt1, cnt2);
|
|
push(cnt1);
|
|
cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)
|
|
|
|
// Is the minimum length zero?
|
|
testl(cnt2, cnt2);
|
|
jcc(Assembler::zero, LENGTH_DIFF_LABEL);
|
|
if (ae == StrIntrinsicNode::LL) {
|
|
// Load first bytes
|
|
load_unsigned_byte(result, Address(str1, 0)); // result = str1[0]
|
|
load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0]
|
|
} else if (ae == StrIntrinsicNode::UU) {
|
|
// Load first characters
|
|
load_unsigned_short(result, Address(str1, 0));
|
|
load_unsigned_short(cnt1, Address(str2, 0));
|
|
} else {
|
|
load_unsigned_byte(result, Address(str1, 0));
|
|
load_unsigned_short(cnt1, Address(str2, 0));
|
|
}
|
|
subl(result, cnt1);
|
|
jcc(Assembler::notZero, POP_LABEL);
|
|
|
|
if (ae == StrIntrinsicNode::UU) {
|
|
// Divide length by 2 to get number of chars
|
|
shrl(cnt2, 1);
|
|
}
|
|
cmpl(cnt2, 1);
|
|
jcc(Assembler::equal, LENGTH_DIFF_LABEL);
|
|
|
|
// Check if the strings start at the same location and setup scale and stride
|
|
if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
|
|
cmpptr(str1, str2);
|
|
jcc(Assembler::equal, LENGTH_DIFF_LABEL);
|
|
if (ae == StrIntrinsicNode::LL) {
|
|
scale = Address::times_1;
|
|
stride = 16;
|
|
} else {
|
|
scale = Address::times_2;
|
|
stride = 8;
|
|
}
|
|
} else {
|
|
scale1 = Address::times_1;
|
|
scale2 = Address::times_2;
|
|
// scale not used
|
|
stride = 8;
|
|
}
|
|
|
|
if (UseAVX >= 2 && UseSSE42Intrinsics) {
|
|
Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
|
|
Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
|
|
Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
|
|
Label COMPARE_TAIL_LONG;
|
|
Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3
|
|
|
|
int pcmpmask = 0x19;
|
|
if (ae == StrIntrinsicNode::LL) {
|
|
pcmpmask &= ~0x01;
|
|
}
|
|
|
|
// Setup to compare 16-chars (32-bytes) vectors,
|
|
// start from first character again because it has aligned address.
|
|
if (ae == StrIntrinsicNode::LL) {
|
|
stride2 = 32;
|
|
} else {
|
|
stride2 = 16;
|
|
}
|
|
if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
|
|
adr_stride = stride << scale;
|
|
} else {
|
|
adr_stride1 = 8; //stride << scale1;
|
|
adr_stride2 = 16; //stride << scale2;
|
|
}
|
|
|
|
assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
|
|
// rax and rdx are used by pcmpestri as elements counters
|
|
movl(result, cnt2);
|
|
andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
|
|
jcc(Assembler::zero, COMPARE_TAIL_LONG);
|
|
|
|
// fast path : compare first 2 8-char vectors.
|
|
bind(COMPARE_16_CHARS);
|
|
if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
|
|
movdqu(vec1, Address(str1, 0));
|
|
} else {
|
|
pmovzxbw(vec1, Address(str1, 0));
|
|
}
|
|
pcmpestri(vec1, Address(str2, 0), pcmpmask);
|
|
jccb(Assembler::below, COMPARE_INDEX_CHAR);
|
|
|
|
if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
|
|
movdqu(vec1, Address(str1, adr_stride));
|
|
pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
|
|
} else {
|
|
pmovzxbw(vec1, Address(str1, adr_stride1));
|
|
pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
|
|
}
|
|
jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
|
|
addl(cnt1, stride);
|
|
|
|
// Compare the characters at index in cnt1
|
|
bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
|
|
load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
|
|
subl(result, cnt2);
|
|
jmp(POP_LABEL);
|
|
|
|
// Setup the registers to start vector comparison loop
|
|
bind(COMPARE_WIDE_VECTORS);
|
|
if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
|
|
lea(str1, Address(str1, result, scale));
|
|
lea(str2, Address(str2, result, scale));
|
|
} else {
|
|
lea(str1, Address(str1, result, scale1));
|
|
lea(str2, Address(str2, result, scale2));
|
|
}
|
|
subl(result, stride2);
|
|
subl(cnt2, stride2);
|
|
jcc(Assembler::zero, COMPARE_WIDE_TAIL);
|
|
negptr(result);
|
|
|
|
// In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
|
|
bind(COMPARE_WIDE_VECTORS_LOOP);
|
|
|
|
#ifdef _LP64
|
|
if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
|
|
cmpl(cnt2, stride2x2);
|
|
jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
|
|
testl(cnt2, stride2x2-1); // cnt2 holds the vector count
|
|
jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40
|
|
|
|
bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
|
|
if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
|
|
evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
|
|
evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
|
|
} else {
|
|
vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
|
|
evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
|
|
}
|
|
kortestql(k7, k7);
|
|
jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare
      addptr(result, stride2x2); // update since we already compared at this addr
      subl(cnt2, stride2x2);     // and sub the size too
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      vpxor(vec1, vec1);
      jmpb(COMPARE_WIDE_TAIL);
    }//if (VM_Version::supports_avx512vlbw())
#endif // _LP64

    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      vmovdqu(vec1, Address(str1, result, scale));
      vpxor(vec1, Address(str2, result, scale));
    } else {
      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
      vpxor(vec1, Address(str2, result, scale2));
    }
    vptest(vec1, vec1);
    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
    addptr(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);

    // Compare tail chars, length between 1 and 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
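    // Illustrative decoding of imm8 0x19: unsigned-word elements, equal-each
    // aggregation, negative polarity, least-significant-index result; the LL
    // case below clears bit 0 to select unsigned bytes instead.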
    // Setup to compare 8-char (16-byte) vectors,
    // start from the first character again because it has an aligned address.
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }
    jcc(Assembler::zero, COMPARE_TAIL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1 - substring
    //     rax  - negative string length (elements count)
    //     mem  - scanned string
    //     rdx  - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //       + 00 (unsigned bytes) or + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
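    // Illustrative note: each pcmpestri step scans one vector's worth of
    // elements, bounded by the element counts in rax and rdx (absolute
    // values, saturated at the vector width). It leaves the first
    // mismatching index in rcx and sets CF only when a mismatch exists,
    // which is what the Assembler::below branches below test.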

    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
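    // Re-check the final stride from the end of the data; elements already
    // found equal by the loop above are harmlessly compared again.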
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(cnt1, result);
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }
  // Shift str2 and str1 to the end of the arrays, negate min
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    lea(str1, Address(str1, cnt2, scale));
    lea(str2, Address(str2, cnt2, scale));
  } else {
    lea(str1, Address(str1, cnt2, scale1));
    lea(str2, Address(str2, cnt2, scale2));
  }
  decrementl(cnt2);  // first character was compared already
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length. Return the length difference.
  bind(LENGTH_DIFF_LABEL);
  pop(result);
  if (ae == StrIntrinsicNode::UU) {
    // Divide diff by 2 to get number of chars
    sarl(result, 1);
  }
  jmpb(DONE_LABEL);

#ifdef _LP64
  if (VM_Version::supports_avx512vlbw()) {

    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);

    kmovql(cnt1, k7);
    notq(cnt1);
    bsfq(cnt2, cnt1);
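    // k7 holds a 0 bit for every mismatching byte lane, so inverting it and
    // taking bsf yields the byte offset of the first mismatch in the chunk.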
    if (ae != StrIntrinsicNode::LL) {
      // Divide diff by 2 to get number of chars
      sarl(cnt2, 1);
    }
    addq(result, cnt2);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(cnt1, Address(str2, result));
      load_unsigned_byte(result, Address(str1, result));
    } else if (ae == StrIntrinsicNode::UU) {
      load_unsigned_short(cnt1, Address(str2, result, scale));
      load_unsigned_short(result, Address(str1, result, scale));
    } else {
      load_unsigned_short(cnt1, Address(str2, result, scale2));
      load_unsigned_byte(result, Address(str1, result, scale1));
    }
    subl(result, cnt1);
    jmpb(POP_LABEL);
  }//if (VM_Version::supports_avx512vlbw())
#endif // _LP64

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
  if (ae == StrIntrinsicNode::UL) {
    negl(result);
  }
}

// Search for a non-ASCII character (negative byte value) in a byte array,
// return true if it has any and false otherwise.
// ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
//   @IntrinsicCandidate
//   private static boolean hasNegatives(byte[] ba, int off, int len) {
//     for (int i = off; i < off + len; i++) {
//       if (ba[i] < 0) {
//         return true;
//       }
//     }
//     return false;
//   }
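// The vector paths below reduce this loop to a sign-bit test: a byte is
// negative exactly when its top bit is set, so testing a whole chunk
// against a mask of 0x80 in every byte answers the question for the chunk
// at once.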
void C2_MacroAssembler::has_negatives(Register ary1, Register len,
                                      Register result, Register tmp1,
                                      XMMRegister vec1, XMMRegister vec2) {
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  // len == 0
  testl(len, len);
  jcc(Assembler::zero, FALSE_LABEL);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
      VM_Version::supports_avx512vlbw() &&
      VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail;
    Register tmp3_aliased = len;

    movl(tmp1, len);
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);

    andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
    andl(len, ~(64 - 1)); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 byte-sized elements contain negatives
    evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(k2, k2);
    jcc(Assembler::notZero, TRUE_LABEL);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    testl(tmp1, -1);
    jcc(Assembler::zero, FALSE_LABEL);

    // ~(~0 << len) applied up to two times (for 32-bit scenario)
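    // e.g. a tail count of 3 gives ~(~0 << 3) == 0b111, enabling exactly the
    // low three byte lanes of the masked 64-byte compare that follows.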
#ifdef _LP64
    mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
    shlxq(tmp3_aliased, tmp3_aliased, tmp1);
    notq(tmp3_aliased);
    kmovql(k3, tmp3_aliased);
#else
    Label k_init;
    jmp(k_init);

    // We cannot load 64 bits into a general purpose register on 32-bit, so
    // the data required to compose the 64-bit mask is placed in the
    // instruction stream instead.
    // We emit a 64-byte-wide series of elements from 0..63, which is later
    // used as the compare target against the tail count held in the tmp1
    // register. The result is a k register holding tmp1 consecutive 1 bits,
    // counting from the least significant bit.
    address tmp = pc();
    emit_int64(0x0706050403020100);
    emit_int64(0x0F0E0D0C0B0A0908);
    emit_int64(0x1716151413121110);
    emit_int64(0x1F1E1D1C1B1A1918);
    emit_int64(0x2726252423222120);
    emit_int64(0x2F2E2D2C2B2A2928);
    emit_int64(0x3736353433323130);
    emit_int64(0x3F3E3D3C3B3A3938);

    bind(k_init);
    lea(len, InternalAddress(tmp));
    // create mask to test for negative byte inside a vector
    evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
    evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit);

#endif
    evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(k2, k3);
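    // Only lanes enabled in k3 can set bits in k2, so ktestq(k2, k3) is
    // non-zero exactly when a negative byte occurs inside the tail.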
    jcc(Assembler::notZero, TRUE_LABEL);

    jmp(FALSE_LABEL);
  } else {
    movl(result, len); // copy

    if (UseAVX >= 2 && UseSSE >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

      // Compare 32-byte vectors
      andl(result, 0x0000001f); // tail count (in bytes)
      andl(len, 0xffffffe0);    // vector count (in bytes)
      jccb(Assembler::zero, COMPARE_TAIL);

      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080); // create mask to test for negative bytes in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      addptr(len, 32);
      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, result);
      jccb(Assembler::zero, FALSE_LABEL);
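      // Handle the tail with one overlapping 32-byte load that ends exactly
      // at the end of the data; bytes already checked by the loop are
      // harmlessly re-tested.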

      vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      jmpb(FALSE_LABEL);

      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

      // Compare 16-byte vectors
      andl(result, 0x0000000f); // tail count (in bytes)
      andl(len, 0xfffffff0);    // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL);

      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jcc(Assembler::notZero, TRUE_LABEL);
      addptr(len, 16);
      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, result);
      jcc(Assembler::zero, FALSE_LABEL);

      movdqu(vec1, Address(ary1, result, Address::times_1, -16));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      jmpb(FALSE_LABEL);

      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    }
  }
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);
  jccb(Assembler::notZero, TRUE_LABEL);
  addptr(len, 4);
  jcc(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2); // tail char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);
  jccb(Assembler::notZero, TRUE_LABEL);
  subptr(result, 2);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1); // tail byte
  jccb(Assembler::zero, FALSE_LABEL);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00000080);
  jccb(Assembler::notEqual, TRUE_LABEL);
  jmpb(FALSE_LABEL);

  bind(TRUE_LABEL);
  movl(result, 1); // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2 && UseSSE >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}

// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
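// A minimal scalar sketch of the semantics the generated code implements
// (illustrative only; the flat byte view and the helper name are
// assumptions, not part of this file):
//
//   // returns true when equal, mirroring the TRUE_LABEL/FALSE_LABEL
//   // paths below that set the result register to 1 or 0
//   static bool arrays_equals_sketch(const uint8_t* a, const uint8_t* b,
//                                    size_t byte_count) {
//     for (size_t i = 0; i < byte_count; i++) {
//       if (a[i] != b[i]) return false;
//     }
//     return true;
//   }
//
// The vector paths below compute the same answer 64/32/16 bytes at a time
// and fall through to 4-/2-/1-byte tail compares.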
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char) {
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;

  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset   = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);

  if (is_array_equ) {
    // Check the input args
    cmpoop(ary1, ary2);
    jcc(Assembler::equal, TRUE_LABEL);

    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  if (is_array_equ && is_char) {
    // arrays_equals when used for char[].
    shll(limit, 1);      // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 32-byte vectors
    andl(result, 0x0000001f); // tail count (in bytes)
    andl(limit, 0xffffffe0);  // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

#ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      kortestql(k7, k7);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare
      addptr(limit, 64);  // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL); // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
      //
      addptr(result, -64);   // it is safe, because we just came from this area
      evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
      kortestql(k7, k7);
      jcc(Assembler::aboveEqual, FALSE_LABEL);     // miscompare

      jmp(TRUE_LABEL);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    }//if (VM_Version::supports_avx512vlbw())
#endif //_LP64
    bind(COMPARE_WIDE_VECTORS);
    vmovdqu(vec1, Address(ary1, limit, Address::times_1));
    vmovdqu(vec2, Address(ary2, limit, Address::times_1));
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 32);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
    vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  } else if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000f); // tail count (in bytes)
    andl(limit, 0xfffffff0);  // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  andl(limit, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, limit, Address::times_1));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  movl(chr, Address(ary1, limit, Address::times_1));
  cmpl(chr, Address(ary2, limit, Address::times_1));
  jccb(Assembler::notEqual, FALSE_LABEL);
  addptr(limit, 4);
  jcc(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2); // tail char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  if (is_array_equ && is_char) {
    bind(COMPARE_BYTE);
  } else {
    lea(ary1, Address(ary1, 2));
    lea(ary2, Address(ary2, 2));

    bind(COMPARE_BYTE);
    testl(result, 0x1); // tail byte
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1);   // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}