mirror of
https://github.com/openjdk/jdk.git
synced 2025-09-18 10:04:42 +02:00
8261553: Efficient mask generation using BMI2 BZHI instruction
Reviewed-by: redestad, neliasso
This commit is contained in:
parent
a0658795cf
commit
cb84539d56
6 changed files with 28 additions and 27 deletions
|
@ -9173,6 +9173,13 @@ void Assembler::evpblendmq (XMMRegister dst, KRegister mask, XMMRegister nds, XM
|
||||||
emit_int16(0x64, (0xC0 | encode));
|
emit_int16(0x64, (0xC0 | encode));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Emits the 64-bit, register-to-register form of the BMI2 BZHI instruction.
//
// dst  - register that receives the result.
// src1 - register holding the value to be masked.
// src2 - register holding the bit index.
//
// Callers in this change load -1 into src1 and pass a length in src2 to
// obtain a mask, replacing the earlier shlxq/decq and shrxq sequences.
// NOTE(review): the exact BZHI semantics (bits at and above the index are
// cleared) come from the ISA manual, not from this file - confirm there.
void Assembler::bzhiq(Register dst, Register src1, Register src2) {
|
||||||
|
// BZHI is only available with BMI2; callers must have checked the CPU feature.
assert(VM_Version::supports_bmi2(), "bit manipulation instructions not supported");
|
||||||
|
// vex_w is set here, unlike the 32-bit shlxl emitter which passes false.
InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ false);
|
||||||
|
// Operand order is deliberate: src2 is passed before src1.
// NOTE(review): presumably the middle argument is encoded in VEX.vvvv and the
// last one in ModRM.rm - confirm against vex_prefix_and_encode.
int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_NONE, VEX_OPCODE_0F_38, &attributes);
|
||||||
|
// Opcode byte 0xF5 followed by a ModRM byte built from 0xC0 and the encoded registers.
emit_int16((unsigned char)0xF5, (0xC0 | encode));
|
||||||
|
}
|
||||||
|
|
||||||
void Assembler::shlxl(Register dst, Register src1, Register src2) {
|
void Assembler::shlxl(Register dst, Register src1, Register src2) {
|
||||||
assert(VM_Version::supports_bmi2(), "");
|
assert(VM_Version::supports_bmi2(), "");
|
||||||
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true);
|
InstructionAttr attributes(AVX_128bit, /* vex_w */ false, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true);
|
||||||
|
|
|
@ -2092,6 +2092,7 @@ private:
|
||||||
void shlxq(Register dst, Register src1, Register src2);
|
void shlxq(Register dst, Register src1, Register src2);
|
||||||
void shrxq(Register dst, Register src1, Register src2);
|
void shrxq(Register dst, Register src1, Register src2);
|
||||||
|
|
||||||
|
void bzhiq(Register dst, Register src1, Register src2);
|
||||||
|
|
||||||
//====================VECTOR ARITHMETIC=====================================
|
//====================VECTOR ARITHMETIC=====================================
|
||||||
void evpmovd2m(KRegister kdst, XMMRegister src, int vector_len);
|
void evpmovd2m(KRegister kdst, XMMRegister src, int vector_len);
|
||||||
|
|
|
@ -1894,17 +1894,9 @@ void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMReg
|
||||||
}
|
}
|
||||||
|
|
||||||
void C2_MacroAssembler::genmask(Register dst, Register len, Register temp) {
|
void C2_MacroAssembler::genmask(Register dst, Register len, Register temp) {
|
||||||
if (ArrayCopyPartialInlineSize <= 32) {
|
assert(ArrayCopyPartialInlineSize <= 64,"");
|
||||||
mov64(dst, 1);
|
mov64(dst, -1L);
|
||||||
shlxq(dst, dst, len);
|
bzhiq(dst, dst, len);
|
||||||
decq(dst);
|
|
||||||
} else {
|
|
||||||
mov64(dst, -1);
|
|
||||||
movq(temp, len);
|
|
||||||
negptr(temp);
|
|
||||||
addptr(temp, 64);
|
|
||||||
shrxq(dst, dst, temp);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
#endif // _LP64
|
#endif // _LP64
|
||||||
|
|
||||||
|
|
|
@ -196,10 +196,8 @@ void MacroAssembler::copy64_masked_avx(Register dst, Register src, XMMRegister x
|
||||||
} else {
|
} else {
|
||||||
Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
|
Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
|
||||||
assert(MaxVectorSize == 64, "vector length != 64");
|
assert(MaxVectorSize == 64, "vector length != 64");
|
||||||
negptr(length);
|
mov64(temp, -1L);
|
||||||
addq(length, 64);
|
bzhiq(temp, temp, length);
|
||||||
mov64(temp, -1);
|
|
||||||
shrxq(temp, temp, length);
|
|
||||||
kmovql(mask, temp);
|
kmovql(mask, temp);
|
||||||
evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), Assembler::AVX_512bit);
|
evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), Assembler::AVX_512bit);
|
||||||
evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit);
|
evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit);
|
||||||
|
@ -213,9 +211,8 @@ void MacroAssembler::copy32_masked_avx(Register dst, Register src, XMMRegister x
|
||||||
assert(MaxVectorSize >= 32, "vector length should be >= 32");
|
assert(MaxVectorSize >= 32, "vector length should be >= 32");
|
||||||
BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
|
BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG};
|
||||||
Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
|
Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
|
||||||
mov64(temp, 1);
|
mov64(temp, -1L);
|
||||||
shlxq(temp, temp, length);
|
bzhiq(temp, temp, length);
|
||||||
decq(temp);
|
|
||||||
kmovql(mask, temp);
|
kmovql(mask, temp);
|
||||||
evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), Assembler::AVX_256bit);
|
evmovdqu(type[shift], mask, xmm, Address(src, index, scale, offset), Assembler::AVX_256bit);
|
||||||
evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, Assembler::AVX_256bit);
|
evmovdqu(type[shift], mask, Address(dst, index, scale, offset), xmm, Assembler::AVX_256bit);
|
||||||
|
|
|
@ -1471,6 +1471,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||||
__ subq(temp1, loop_size[shift]);
|
__ subq(temp1, loop_size[shift]);
|
||||||
|
|
||||||
// Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
|
// Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
|
||||||
|
__ align(32);
|
||||||
__ BIND(L_main_loop);
|
__ BIND(L_main_loop);
|
||||||
__ copy64_avx(to, from, temp4, xmm1, false, shift, 0);
|
__ copy64_avx(to, from, temp4, xmm1, false, shift, 0);
|
||||||
__ copy64_avx(to, from, temp4, xmm1, false, shift, 64);
|
__ copy64_avx(to, from, temp4, xmm1, false, shift, 64);
|
||||||
|
@ -1537,6 +1538,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||||
|
|
||||||
// Main loop with aligned copy block size of 192 bytes at
|
// Main loop with aligned copy block size of 192 bytes at
|
||||||
// 64 byte copy granularity.
|
// 64 byte copy granularity.
|
||||||
|
__ align(32);
|
||||||
__ BIND(L_main_loop_64bytes);
|
__ BIND(L_main_loop_64bytes);
|
||||||
__ copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true);
|
__ copy64_avx(to, from, temp4, xmm1, false, shift, 0 , true);
|
||||||
__ copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
|
__ copy64_avx(to, from, temp4, xmm1, false, shift, 64, true);
|
||||||
|
@ -1676,6 +1678,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||||
__ BIND(L_main_pre_loop);
|
__ BIND(L_main_pre_loop);
|
||||||
|
|
||||||
// Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
|
// Main loop with aligned copy block size of 192 bytes at 32 byte granularity.
|
||||||
|
__ align(32);
|
||||||
__ BIND(L_main_loop);
|
__ BIND(L_main_loop);
|
||||||
__ copy64_avx(to, from, temp1, xmm1, true, shift, -64);
|
__ copy64_avx(to, from, temp1, xmm1, true, shift, -64);
|
||||||
__ copy64_avx(to, from, temp1, xmm1, true, shift, -128);
|
__ copy64_avx(to, from, temp1, xmm1, true, shift, -128);
|
||||||
|
@ -1708,6 +1711,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||||
|
|
||||||
// Main loop with aligned copy block size of 192 bytes at
|
// Main loop with aligned copy block size of 192 bytes at
|
||||||
// 64 byte copy granularity.
|
// 64 byte copy granularity.
|
||||||
|
__ align(32);
|
||||||
__ BIND(L_main_loop_64bytes);
|
__ BIND(L_main_loop_64bytes);
|
||||||
__ copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true);
|
__ copy64_avx(to, from, temp1, xmm1, true, shift, -64 , true);
|
||||||
__ copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
|
__ copy64_avx(to, from, temp1, xmm1, true, shift, -128, true);
|
||||||
|
@ -1770,7 +1774,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||||
//
|
//
|
||||||
address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
|
address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
|
||||||
#if COMPILER2_OR_JVMCI
|
#if COMPILER2_OR_JVMCI
|
||||||
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
|
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
|
||||||
return generate_disjoint_copy_avx3_masked(entry, "jbyte_disjoint_arraycopy_avx3", 0,
|
return generate_disjoint_copy_avx3_masked(entry, "jbyte_disjoint_arraycopy_avx3", 0,
|
||||||
aligned, false, false);
|
aligned, false, false);
|
||||||
}
|
}
|
||||||
|
@ -1886,7 +1890,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||||
address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
|
address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
|
||||||
address* entry, const char *name) {
|
address* entry, const char *name) {
|
||||||
#if COMPILER2_OR_JVMCI
|
#if COMPILER2_OR_JVMCI
|
||||||
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
|
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
|
||||||
return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0,
|
return generate_conjoint_copy_avx3_masked(entry, "jbyte_conjoint_arraycopy_avx3", 0,
|
||||||
nooverlap_target, aligned, false, false);
|
nooverlap_target, aligned, false, false);
|
||||||
}
|
}
|
||||||
|
@ -1997,7 +2001,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||||
//
|
//
|
||||||
address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
|
address generate_disjoint_short_copy(bool aligned, address *entry, const char *name) {
|
||||||
#if COMPILER2_OR_JVMCI
|
#if COMPILER2_OR_JVMCI
|
||||||
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
|
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
|
||||||
return generate_disjoint_copy_avx3_masked(entry, "jshort_disjoint_arraycopy_avx3", 1,
|
return generate_disjoint_copy_avx3_masked(entry, "jshort_disjoint_arraycopy_avx3", 1,
|
||||||
aligned, false, false);
|
aligned, false, false);
|
||||||
}
|
}
|
||||||
|
@ -2128,7 +2132,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||||
address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
|
address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
|
||||||
address *entry, const char *name) {
|
address *entry, const char *name) {
|
||||||
#if COMPILER2_OR_JVMCI
|
#if COMPILER2_OR_JVMCI
|
||||||
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
|
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
|
||||||
return generate_conjoint_copy_avx3_masked(entry, "jshort_conjoint_arraycopy_avx3", 1,
|
return generate_conjoint_copy_avx3_masked(entry, "jshort_conjoint_arraycopy_avx3", 1,
|
||||||
nooverlap_target, aligned, false, false);
|
nooverlap_target, aligned, false, false);
|
||||||
}
|
}
|
||||||
|
@ -2232,7 +2236,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||||
address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
|
address generate_disjoint_int_oop_copy(bool aligned, bool is_oop, address* entry,
|
||||||
const char *name, bool dest_uninitialized = false) {
|
const char *name, bool dest_uninitialized = false) {
|
||||||
#if COMPILER2_OR_JVMCI
|
#if COMPILER2_OR_JVMCI
|
||||||
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
|
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
|
||||||
return generate_disjoint_copy_avx3_masked(entry, "jint_disjoint_arraycopy_avx3", 2,
|
return generate_disjoint_copy_avx3_masked(entry, "jint_disjoint_arraycopy_avx3", 2,
|
||||||
aligned, is_oop, dest_uninitialized);
|
aligned, is_oop, dest_uninitialized);
|
||||||
}
|
}
|
||||||
|
@ -2343,7 +2347,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||||
address *entry, const char *name,
|
address *entry, const char *name,
|
||||||
bool dest_uninitialized = false) {
|
bool dest_uninitialized = false) {
|
||||||
#if COMPILER2_OR_JVMCI
|
#if COMPILER2_OR_JVMCI
|
||||||
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
|
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
|
||||||
return generate_conjoint_copy_avx3_masked(entry, "jint_conjoint_arraycopy_avx3", 2,
|
return generate_conjoint_copy_avx3_masked(entry, "jint_conjoint_arraycopy_avx3", 2,
|
||||||
nooverlap_target, aligned, is_oop, dest_uninitialized);
|
nooverlap_target, aligned, is_oop, dest_uninitialized);
|
||||||
}
|
}
|
||||||
|
@ -2456,7 +2460,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||||
address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
|
address generate_disjoint_long_oop_copy(bool aligned, bool is_oop, address *entry,
|
||||||
const char *name, bool dest_uninitialized = false) {
|
const char *name, bool dest_uninitialized = false) {
|
||||||
#if COMPILER2_OR_JVMCI
|
#if COMPILER2_OR_JVMCI
|
||||||
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
|
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
|
||||||
return generate_disjoint_copy_avx3_masked(entry, "jlong_disjoint_arraycopy_avx3", 3,
|
return generate_disjoint_copy_avx3_masked(entry, "jlong_disjoint_arraycopy_avx3", 3,
|
||||||
aligned, is_oop, dest_uninitialized);
|
aligned, is_oop, dest_uninitialized);
|
||||||
}
|
}
|
||||||
|
@ -2566,7 +2570,7 @@ class StubGenerator: public StubCodeGenerator {
|
||||||
address nooverlap_target, address *entry,
|
address nooverlap_target, address *entry,
|
||||||
const char *name, bool dest_uninitialized = false) {
|
const char *name, bool dest_uninitialized = false) {
|
||||||
#if COMPILER2_OR_JVMCI
|
#if COMPILER2_OR_JVMCI
|
||||||
if (VM_Version::supports_avx512vlbw() && MaxVectorSize >= 32) {
|
if (VM_Version::supports_avx512vlbw() && VM_Version::supports_bmi2() && MaxVectorSize >= 32) {
|
||||||
return generate_conjoint_copy_avx3_masked(entry, "jlong_conjoint_arraycopy_avx3", 3,
|
return generate_conjoint_copy_avx3_masked(entry, "jlong_conjoint_arraycopy_avx3", 3,
|
||||||
nooverlap_target, aligned, is_oop, dest_uninitialized);
|
nooverlap_target, aligned, is_oop, dest_uninitialized);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1526,7 +1526,7 @@ const bool Matcher::match_rule_supported(int opcode) {
|
||||||
case Op_VectorMaskGen:
|
case Op_VectorMaskGen:
|
||||||
case Op_LoadVectorMasked:
|
case Op_LoadVectorMasked:
|
||||||
case Op_StoreVectorMasked:
|
case Op_StoreVectorMasked:
|
||||||
if (UseAVX < 3) {
|
if (UseAVX < 3 || !VM_Version::supports_bmi2()) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue