Mirror of https://github.com/openjdk/jdk.git
8261542: X86 slice and unslice intrinsics for 256-bit byte/short vectors
Reviewed-by: kvn, neliasso
parent 8b4fd77f60
commit c53acc2a89

7 changed files with 114 additions and 19 deletions
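At a glance: slice and unslice are compiled down through the vector rearrange machinery, so the bulk of this change is a pair of new AVX2 match rules (rearrangeB_avx, rearrangeS_avx) that emulate a cross-lane byte shuffle on 256-bit vectors, together with the vector_byte_shuffle_mask stub constant and the vpaddb(AddressLiteral) MacroAssembler overload those rules need. Hedged sketches of the two key tricks appear after the relevant hunks below.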
@@ -3005,6 +3005,16 @@ void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src
   }
 }
 
+void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
+  assert(UseAVX > 0, "requires some form of AVX");
+  if (reachable(src)) {
+    Assembler::vpaddb(dst, nds, as_Address(src), vector_len);
+  } else {
+    lea(rscratch, src);
+    Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len);
+  }
+}
+
 void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
   assert(UseAVX > 0, "requires some form of AVX");
   if (reachable(src)) {
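The new vpaddb overload follows the standard MacroAssembler pattern for AddressLiteral operands: if the literal is RIP-reachable it is encoded directly, otherwise it is first materialized into the caller-supplied rscratch register with lea. The rearrange rules in x86.ad below pass a TEMP scratch register for exactly this fallback.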
@@ -1245,6 +1245,7 @@ public:
 
   void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
+  void vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch);
 
   void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
   void vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999, 2020, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2021, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -610,6 +610,21 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }
 
+  address generate_vector_byte_shuffle_mask(const char *stub_name) {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", stub_name);
+    address start = __ pc();
+    __ emit_data(0x70707070, relocInfo::none, 0);
+    __ emit_data(0x70707070, relocInfo::none, 0);
+    __ emit_data(0x70707070, relocInfo::none, 0);
+    __ emit_data(0x70707070, relocInfo::none, 0);
+    __ emit_data(0xF0F0F0F0, relocInfo::none, 0);
+    __ emit_data(0xF0F0F0F0, relocInfo::none, 0);
+    __ emit_data(0xF0F0F0F0, relocInfo::none, 0);
+    __ emit_data(0xF0F0F0F0, relocInfo::none, 0);
+    return start;
+  }
+
   address generate_vector_mask_long_double(const char *stub_name, int32_t maskhi, int32_t masklo) {
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", stub_name);
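The constant this stub emits is the bias used by the new 256-bit byte rearrange further down. A hedged restatement as plain data (the identifier is invented here for illustration):

static const unsigned char vector_byte_shuffle_mask[32] = {
  // low 128-bit lane: +0x70 keeps bit 7 clear for self-lane indices 0..15
  // and overflows past 0x7F (setting bit 7) for other-lane indices 16..31
  0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
  0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
  // high 128-bit lane: +0xF0 sets bit 7 for indices 0..15 (other lane) and
  // wraps indices 16..31 around to 0x00..0x0F (bit 7 clear, self lane)
  0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
  0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
};

After vpaddb with this bias, bit 7 of each mask byte is set exactly when the shuffle index points into the other 128-bit lane, which is what vpblendvb keys on.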
@@ -3981,6 +3996,7 @@ class StubGenerator: public StubCodeGenerator {
     StubRoutines::x86::_vector_64_bit_mask = generate_vector_custom_i32("vector_64_bit_mask", Assembler::AVX_512bit,
                                                                         0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
     StubRoutines::x86::_vector_int_shuffle_mask = generate_vector_mask("vector_int_shuffle_mask", 0x03020100);
+    StubRoutines::x86::_vector_byte_shuffle_mask = generate_vector_byte_shuffle_mask("vector_byte_shuffle_mask");
     StubRoutines::x86::_vector_short_shuffle_mask = generate_vector_mask("vector_short_shuffle_mask", 0x01000100);
     StubRoutines::x86::_vector_long_shuffle_mask = generate_vector_mask_long_double("vector_long_shuffle_mask", 0x00000001, 0x0);
     StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask");
@@ -808,6 +808,17 @@ class StubGenerator: public StubCodeGenerator {
     return start;
   }
 
+  address generate_vector_byte_shuffle_mask(const char *stub_name) {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", stub_name);
+    address start = __ pc();
+    __ emit_data64(0x7070707070707070, relocInfo::none);
+    __ emit_data64(0x7070707070707070, relocInfo::none);
+    __ emit_data64(0xF0F0F0F0F0F0F0F0, relocInfo::none);
+    __ emit_data64(0xF0F0F0F0F0F0F0F0, relocInfo::none);
+    return start;
+  }
+
   address generate_fp_mask(const char *stub_name, int64_t mask) {
     __ align(CodeEntryAlignment);
     StubCodeMark mark(this, "StubRoutines", stub_name);
@@ -6832,6 +6843,7 @@ address generate_avx_ghash_processBlocks()
     StubRoutines::x86::_vector_64_bit_mask = generate_vector_custom_i32("vector_64_bit_mask", Assembler::AVX_512bit,
                                                                         0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
     StubRoutines::x86::_vector_int_shuffle_mask = generate_vector_mask("vector_int_shuffle_mask", 0x0302010003020100);
+    StubRoutines::x86::_vector_byte_shuffle_mask = generate_vector_byte_shuffle_mask("vector_byte_shuffle_mask");
     StubRoutines::x86::_vector_short_shuffle_mask = generate_vector_mask("vector_short_shuffle_mask", 0x0100010001000100);
     StubRoutines::x86::_vector_long_shuffle_mask = generate_vector_mask("vector_long_shuffle_mask", 0x0000000100000000);
     StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask("vector_long_sign_mask", 0x8000000000000000);
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2021, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -47,6 +47,7 @@ address StubRoutines::x86::_vector_short_to_byte_mask = NULL;
 address StubRoutines::x86::_vector_int_to_byte_mask = NULL;
 address StubRoutines::x86::_vector_int_to_short_mask = NULL;
 address StubRoutines::x86::_vector_all_bits_set = NULL;
+address StubRoutines::x86::_vector_byte_shuffle_mask = NULL;
 address StubRoutines::x86::_vector_short_shuffle_mask = NULL;
 address StubRoutines::x86::_vector_int_shuffle_mask = NULL;
 address StubRoutines::x86::_vector_long_shuffle_mask = NULL;
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2013, 2020, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2021, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
  *
  * This code is free software; you can redistribute it and/or modify it
@@ -149,6 +149,7 @@ class x86 {
   static address _vector_32_bit_mask;
   static address _vector_64_bit_mask;
   static address _vector_int_shuffle_mask;
+  static address _vector_byte_shuffle_mask;
   static address _vector_short_shuffle_mask;
   static address _vector_long_shuffle_mask;
   static address _vector_iota_indices;
@@ -280,6 +281,10 @@ class x86 {
     return _vector_int_shuffle_mask;
   }
 
+  static address vector_byte_shuffle_mask() {
+    return _vector_byte_shuffle_mask;
+  }
+
   static address vector_short_shuffle_mask() {
     return _vector_short_shuffle_mask;
   }
@@ -1356,6 +1356,7 @@ Assembler::Width widthForType(BasicType bt) {
 static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
 static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
 static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
+static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
 static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
 static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
 static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
@@ -1693,9 +1694,9 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
         return false; // Implementation limitation due to how shuffle is loaded
       } else if (size_in_bits == 256 && UseAVX < 2) {
         return false; // Implementation limitation
-      } else if (bt == T_BYTE && size_in_bits >= 256 && !VM_Version::supports_avx512_vbmi()) {
+      } else if (bt == T_BYTE && size_in_bits > 256 && !VM_Version::supports_avx512_vbmi()) {
         return false; // Implementation limitation
-      } else if (bt == T_SHORT && size_in_bits >= 256 && !VM_Version::supports_avx512bw()) {
+      } else if (bt == T_SHORT && size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
         return false; // Implementation limitation
       }
       break;
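The change from >= 256 to > 256 in both predicates is the functional core of the patch: 256-bit byte rearranges no longer require AVX-512 VBMI, and 256-bit short rearranges no longer require AVX-512 BW, because the AVX2 fallback rules below now cover them. 512-bit vectors still need the AVX-512 extensions.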
@@ -7500,13 +7501,24 @@ instruct rearrangeB(vec dst, vec shuffle) %{
   ins_pipe( pipe_slow );
 %}
 
-instruct rearrangeB_avx(vec dst, vec src, vec shuffle) %{
+instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
   predicate(vector_element_basic_type(n) == T_BYTE &&
             vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
   match(Set dst (VectorRearrange src shuffle));
-  format %{ "vector_rearrange $dst, $shuffle, $src" %}
+  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
+  format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
   ins_encode %{
-    __ vpshufb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, Assembler::AVX_256bit);
+    assert(UseAVX >= 2, "required");
+    // Swap src into vtmp1
+    __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
+    // Shuffle swapped src to get entries from other 128 bit lane
+    __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
+    // Shuffle original src to get entries from self 128 bit lane
+    __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
+    // Create a blend mask by setting high bits for entries coming from other lane in shuffle
+    __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
+    // Perform the blend
+    __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
   %}
   ins_pipe( pipe_slow );
 %}
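For readers without the surrounding context: vpshufb on 256-bit vectors shuffles each 128-bit lane independently, so a general 32-byte rearrange has to be stitched together. A hedged sketch of the sequence rearrangeB_avx emits, rewritten with compiler intrinsics (function and variable names are invented for illustration):

#include <immintrin.h>

// AVX2 emulation of a full 32-byte cross-lane rearrange. 'shuffle' holds
// byte indices 0..31; 'bias' is the vector_byte_shuffle_mask constant from
// the stubs above.
static __m256i rearrange_bytes_avx2(__m256i src, __m256i shuffle) {
  const __m256i bias = _mm256_set_m128i(_mm_set1_epi8((char)0xF0),  // high lane
                                        _mm_set1_epi8(0x70));       // low lane
  // Swap the two 128-bit lanes of src (vperm2i128 src, src, 1).
  __m256i swapped = _mm256_permute2x128_si256(src, src, 1);
  // Per-lane shuffle of the swapped copy: yields the entries whose index
  // points into the other lane (vpshufb only reads the low 4 index bits).
  __m256i other = _mm256_shuffle_epi8(swapped, shuffle);
  // Per-lane shuffle of the original: entries from the same lane.
  __m256i self = _mm256_shuffle_epi8(src, shuffle);
  // vpaddb with the bias sets bit 7 of a mask byte exactly when the index
  // refers to the other lane.
  __m256i blend = _mm256_add_epi8(shuffle, bias);
  // vpblendvb picks 'other' where the mask byte's sign bit is set.
  return _mm256_blendv_epi8(self, other, blend);
}

The five intrinsics map one-for-one onto the five instructions in the ins_encode block.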
@@ -7527,14 +7539,16 @@ instruct rearrangeB_evex(vec dst, vec src, vec shuffle) %{
 
 instruct loadShuffleS(vec dst, vec src, vec vtmp, rRegP scratch) %{
   predicate(vector_element_basic_type(n) == T_SHORT &&
-            vector_length(n) <= 8 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
+            vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
   match(Set dst (VectorLoadShuffle src));
   effect(TEMP dst, TEMP vtmp, TEMP scratch);
   format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
   ins_encode %{
     // Create a byte shuffle mask from short shuffle mask
     // only byte shuffle instruction available on these platforms
-
+    int vlen_in_bytes = vector_length_in_bytes(this);
+    if (UseAVX == 0) {
+      assert(vlen_in_bytes <= 16, "required");
       // Multiply each shuffle by two to get byte index
       __ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
       __ psllw($vtmp$$XMMRegister, 1);
@@ -7547,6 +7561,20 @@ instruct loadShuffleS(vec dst, vec src, vec vtmp, rRegP scratch) %{
       // Add one to get alternate byte index
       __ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), $scratch$$Register);
       __ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
+    } else {
+      assert(UseAVX > 1 || vlen_in_bytes <= 16, "required");
+      int vlen_enc = vector_length_encoding(this);
+      // Multiply each shuffle by two to get byte index
+      __ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
+      __ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);
+
+      // Duplicate to create 2 copies of byte index
+      __ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister, 8, vlen_enc);
+      __ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
+
+      // Add one to get alternate byte index
+      __ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, $scratch$$Register);
+    }
   %}
   ins_pipe( pipe_slow );
 %}
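The UseAVX == 0 branch above turns a vector of short lane indices (one byte per index, as produced by VectorLoadShuffle) into a byte shuffle usable with pshufb. A hedged intrinsics rendering of that branch (names invented for illustration):

#include <immintrin.h>

// SSE4.1 widening of short indices to a byte shuffle: short index i becomes
// the byte pair (2i, 2i+1) in the corresponding 16-bit slot.
static __m128i short_shuffle_to_byte_shuffle(__m128i idx) {
  // pmovzxbw + psllw 1: zero-extend each byte index to 16 bits and double it.
  __m128i twice = _mm_slli_epi16(_mm_cvtepu8_epi16(idx), 1);
  // psllw 8 + por: duplicate the doubled index into both bytes of its slot.
  __m128i both = _mm_or_si128(_mm_slli_epi16(twice, 8), twice);
  // paddb with the 0x0100 pattern (vector_short_shuffle_mask): bump the high
  // byte of each slot so it selects the short's high byte.
  return _mm_add_epi8(both, _mm_set1_epi16(0x0100));
}

The else branch is the same computation with VEX-encoded instructions so it can also run at 256 bits.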
@@ -7563,6 +7591,28 @@ instruct rearrangeS(vec dst, vec shuffle) %{
   ins_pipe( pipe_slow );
 %}
 
+instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
+  predicate(vector_element_basic_type(n) == T_SHORT &&
+            vector_length(n) == 16 && !VM_Version::supports_avx512bw());
+  match(Set dst (VectorRearrange src shuffle));
+  effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
+  format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
+  ins_encode %{
+    assert(UseAVX >= 2, "required");
+    // Swap src into vtmp1
+    __ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
+    // Shuffle swapped src to get entries from other 128 bit lane
+    __ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
+    // Shuffle original src to get entries from self 128 bit lane
+    __ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
+    // Create a blend mask by setting high bits for entries coming from other lane in shuffle
+    __ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
+    // Perform the blend
+    __ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
+  %}
+  ins_pipe( pipe_slow );
+%}
+
 instruct loadShuffleS_evex(vec dst, vec src) %{
   predicate(vector_element_basic_type(n) == T_SHORT &&
             VM_Version::supports_avx512bw());
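Note that rearrangeS_avx emits exactly the same five-instruction sequence as rearrangeB_avx: by this point loadShuffleS has already expanded every short index into a pair of byte indices, so the short rearrange can reuse the byte-level shuffle-and-blend unchanged.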