mirror of
https://github.com/openjdk/jdk.git
synced 2025-08-28 15:24:43 +02:00
8199421: Add support for vector popcount
Reviewed-by: kvn
This commit is contained in:
parent
147488cbce
commit
343cf9910d
13 changed files with 209 additions and 6 deletions
|
@ -8709,6 +8709,15 @@ void Assembler::popcntq(Register dst, Register src) {
|
|||
emit_int8((unsigned char)(0xC0 | encode));
|
||||
}
|
||||
|
||||
// Emits the register-to-register form of VPOPCNTD: dst receives the result of
// the vector popcount of src, with the vector length selected by vector_len.
//
// dst        - destination XMM register
// src        - source XMM register
// vector_len - vector length selector forwarded to InstructionAttr
//
// Asserts that VM_Version::supports_vpopcntdq() holds.
// NOTE(review): the Intel reference lists the 128-bit and 256-bit EVEX forms of
// VPOPCNTD as additionally requiring AVX512VL; only vpopcntdq is asserted
// here -- confirm callers never pass a sub-512-bit vector_len on hardware
// without AVX512VL.
void Assembler::vpopcntd(XMMRegister dst, XMMRegister src, int vector_len) {
|
||||
assert(VM_Version::supports_vpopcntdq(), "must support vpopcntdq feature");
|
||||
InstructionAttr attributes(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
|
||||
// Force the EVEX encoding for this instruction.
attributes.set_is_evex_instruction();
|
||||
// Prefix uses the 66 SIMD prefix and the 0F 38 opcode map; the second argument
// is 0 -- presumably the unused extra-source (nds) slot, verify against
// vex_prefix_and_encode.
int encode = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(), VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);
|
||||
// Opcode byte.
emit_int8(0x55);
|
||||
// Final byte: 0xC0 combined with the register encoding returned above
// (register-direct operand form).
emit_int8((unsigned char)(0xC0 | encode));
|
||||
}
|
||||
|
||||
void Assembler::popq(Address dst) {
|
||||
InstructionMark im(this);
|
||||
prefixq(dst);
|
||||
|
|
|
@ -1638,6 +1638,8 @@ private:
|
|||
void popcntq(Register dst, Register src);
|
||||
#endif
|
||||
|
||||
void vpopcntd(XMMRegister dst, XMMRegister src, int vector_len);
|
||||
|
||||
// Prefetches (SSE, SSE2, 3DNOW only)
|
||||
|
||||
void prefetchnta(Address src);
|
||||
|
|
|
@ -257,6 +257,8 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
|
|||
__ lea(rsi, Address(rbp, in_bytes(VM_Version::sef_cpuid7_offset())));
|
||||
__ movl(Address(rsi, 0), rax);
|
||||
__ movl(Address(rsi, 4), rbx);
|
||||
__ movl(Address(rsi, 8), rcx);
|
||||
__ movl(Address(rsi, 12), rdx);
|
||||
|
||||
//
|
||||
// Extended cpuid(0x80000000)
|
||||
|
@ -662,6 +664,7 @@ void VM_Version::get_processor_features() {
|
|||
_features &= ~CPU_AVX512CD;
|
||||
_features &= ~CPU_AVX512BW;
|
||||
_features &= ~CPU_AVX512VL;
|
||||
_features &= ~CPU_AVX512_VPOPCNTDQ;
|
||||
}
|
||||
|
||||
if (UseAVX < 2)
|
||||
|
|
|
@ -228,6 +228,38 @@ class VM_Version : public Abstract_VM_Version {
|
|||
} bits;
|
||||
};
|
||||
|
||||
// Overlay for the ECX word of cpuid function 7 (structured extended features):
// `value` is the raw 32-bit register and `bits` names individual one-bit flags
// in declaration order. Unnamed fields are placeholders for bits that are not
// given a name here. Widths sum to 32 (15 single-bit fields + 17 trailing bits).
// The stub generator stores rcx at offset 8 from sef_cpuid7_offset(), and
// bits.avx512_vpopcntdq is what feature detection tests to set
// CPU_AVX512_VPOPCNTDQ.
// NOTE(review): bit numbering relies on the compiler allocating bit-fields
// starting from the least significant bit -- under that assumption
// avx512_vpopcntdq is bit 14.
union SefCpuid7Ecx {
|
||||
uint32_t value;
|
||||
struct {
|
||||
uint32_t prefetchwt1 : 1,
|
||||
avx512_vbmi : 1,
|
||||
umip : 1,
|
||||
pku : 1,
|
||||
ospke : 1,
|
||||
: 1,
|
||||
avx512_vbmi2 : 1,
|
||||
: 1,
|
||||
gfni : 1,
|
||||
vaes : 1,
|
||||
vpclmulqdq : 1,
|
||||
avx512_vnni : 1,
|
||||
avx512_bitalg : 1,
|
||||
: 1,
|
||||
avx512_vpopcntdq : 1,
|
||||
: 17;
|
||||
} bits;
|
||||
};
|
||||
|
||||
// Overlay for the EDX word of cpuid function 7 (structured extended features):
// `value` is the raw 32-bit register and `bits` names two one-bit flags that
// follow two unnamed leading bits; the remaining 28 bits are unnamed.
// Widths sum to 32 (2 + 1 + 1 + 28).
// The stub generator stores rdx at offset 12 from sef_cpuid7_offset(); no
// reader of these flags is visible in this view.
union SefCpuid7Edx {
|
||||
uint32_t value;
|
||||
struct {
|
||||
uint32_t : 2,
|
||||
avx512_4vnniw : 1,
|
||||
avx512_4fmaps : 1,
|
||||
: 28;
|
||||
} bits;
|
||||
};
|
||||
|
||||
union ExtCpuid1EEbx {
|
||||
uint32_t value;
|
||||
struct {
|
||||
|
@ -301,6 +333,7 @@ protected:
|
|||
// CPU feature masks. Each value below has exactly one bit set (bits 34..37)
// and is tested against _features by the matching supports_*() accessor.
#define CPU_SHA ((uint64_t)UCONST64(0x400000000)) // SHA instructions
|
||||
#define CPU_FMA ((uint64_t)UCONST64(0x800000000)) // FMA instructions
|
||||
#define CPU_VZEROUPPER ((uint64_t)UCONST64(0x1000000000)) // Vzeroupper instruction
|
||||
#define CPU_AVX512_VPOPCNTDQ ((uint64_t)UCONST64(0x2000000000)) // Vector popcount
|
||||
|
||||
enum Extended_Family {
|
||||
// AMD
|
||||
|
@ -353,8 +386,8 @@ protected:
|
|||
// cpuid function 7 (structured extended features)
|
||||
SefCpuid7Eax sef_cpuid7_eax;
|
||||
SefCpuid7Ebx sef_cpuid7_ebx;
|
||||
uint32_t sef_cpuid7_ecx; // unused currently
|
||||
uint32_t sef_cpuid7_edx; // unused currently
|
||||
SefCpuid7Ecx sef_cpuid7_ecx;
|
||||
SefCpuid7Edx sef_cpuid7_edx;
|
||||
|
||||
// cpuid function 0xB (processor topology)
|
||||
// ecx = 0
|
||||
|
@ -507,6 +540,8 @@ protected:
|
|||
result |= CPU_AVX512BW;
|
||||
if (_cpuid_info.sef_cpuid7_ebx.bits.avx512vl != 0)
|
||||
result |= CPU_AVX512VL;
|
||||
if (_cpuid_info.sef_cpuid7_ecx.bits.avx512_vpopcntdq != 0)
|
||||
result |= CPU_AVX512_VPOPCNTDQ;
|
||||
}
|
||||
}
|
||||
if(_cpuid_info.sef_cpuid7_ebx.bits.bmi1 != 0)
|
||||
|
@ -783,6 +818,7 @@ public:
|
|||
// True when the CPU_SHA bit is set in _features.
static bool supports_sha() { return (_features & CPU_SHA) != 0; }
|
||||
// True when the CPU_FMA bit is set in _features and supports_avx() also holds.
static bool supports_fma() { return (_features & CPU_FMA) != 0 && supports_avx(); }
|
||||
// True when the CPU_VZEROUPPER bit is set in _features.
static bool supports_vzeroupper() { return (_features & CPU_VZEROUPPER) != 0; }
|
||||
// True when the CPU_AVX512_VPOPCNTDQ bit is set in _features.
static bool supports_vpopcntdq() { return (_features & CPU_AVX512_VPOPCNTDQ) != 0; }
|
||||
|
||||
// Intel features
|
||||
static bool is_intel_family_core() { return is_intel() &&
|
||||
|
|
|
@ -1223,6 +1223,10 @@ const bool Matcher::match_rule_supported(int opcode) {
|
|||
if (!UsePopCountInstruction)
|
||||
ret_value = false;
|
||||
break;
|
||||
case Op_PopCountVI:
|
||||
if (!UsePopCountInstruction || !VM_Version::supports_vpopcntdq())
|
||||
ret_value = false;
|
||||
break;
|
||||
case Op_MulVI:
|
||||
if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
|
||||
ret_value = false;
|
||||
|
@ -10788,3 +10792,49 @@ instruct vfma16F_mem(vecZ a, memory b, vecZ c) %{
|
|||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
// --------------------------------- PopCount --------------------------------------
|
||||
|
||||
// Matches PopCountVI on vecD operands when the vector has 2 elements, the CPU
// reports vpopcntdq and UsePopCountInstruction is set; emits vpopcntd dst,src.
// NOTE(review): vector_len 0 is presumably the 128-bit encoding, whose EVEX
// form is documented as also needing AVX512VL -- confirm.
instruct vpopcount2I(vecD dst, vecD src) %{
|
||||
predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 2);
|
||||
match(Set dst (PopCountVI src));
|
||||
format %{ "vpopcntd $dst,$src\t! vector popcount packed2I" %}
|
||||
ins_encode %{
|
||||
int vector_len = 0;
|
||||
__ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
// Matches PopCountVI on vecX operands when the vector has 4 elements, the CPU
// reports vpopcntdq and UsePopCountInstruction is set; emits vpopcntd dst,src.
// NOTE(review): vector_len 0 is presumably the 128-bit encoding, whose EVEX
// form is documented as also needing AVX512VL -- confirm.
instruct vpopcount4I(vecX dst, vecX src) %{
|
||||
predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 4);
|
||||
match(Set dst (PopCountVI src));
|
||||
format %{ "vpopcntd $dst,$src\t! vector popcount packed4I" %}
|
||||
ins_encode %{
|
||||
int vector_len = 0;
|
||||
__ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
// Matches PopCountVI on vecY operands when the vector has 8 elements, the CPU
// reports vpopcntdq and UsePopCountInstruction is set; emits vpopcntd dst,src.
// NOTE(review): vector_len 1 is presumably the 256-bit encoding, whose EVEX
// form is documented as also needing AVX512VL -- confirm.
instruct vpopcount8I(vecY dst, vecY src) %{
|
||||
predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 8);
|
||||
match(Set dst (PopCountVI src));
|
||||
format %{ "vpopcntd $dst,$src\t! vector popcount packed8I" %}
|
||||
ins_encode %{
|
||||
int vector_len = 1;
|
||||
__ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
||||
// Matches PopCountVI on vecZ operands when the vector has 16 elements, the CPU
// reports vpopcntdq and UsePopCountInstruction is set; emits vpopcntd dst,src
// with vector_len 2 (presumably the 512-bit encoding -- confirm).
instruct vpopcount16I(vecZ dst, vecZ src) %{
|
||||
predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 16);
|
||||
match(Set dst (PopCountVI src));
|
||||
format %{ "vpopcntd $dst,$src\t! vector popcount packed16I" %}
|
||||
ins_encode %{
|
||||
int vector_len = 2;
|
||||
__ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
|
||||
%}
|
||||
ins_pipe( pipe_slow );
|
||||
%}
|
||||
|
|
|
@ -4180,7 +4180,7 @@ bool MatchRule::is_vector() const {
|
|||
"URShiftVB","URShiftVS","URShiftVI","URShiftVL",
|
||||
"ReplicateB","ReplicateS","ReplicateI","ReplicateL","ReplicateF","ReplicateD",
|
||||
"LoadVector","StoreVector",
|
||||
"FmaVD", "FmaVF",
|
||||
"FmaVD", "FmaVF","PopCountVI",
|
||||
// Next are not supported currently.
|
||||
"PackB","PackS","PackI","PackL","PackF","PackD","Pack2L","Pack2D",
|
||||
"ExtractB","ExtractUB","ExtractC","ExtractS","ExtractI","ExtractL","ExtractF","ExtractD"
|
||||
|
|
|
@ -241,6 +241,7 @@ macro(PartialSubtypeCheck)
|
|||
macro(Phi)
|
||||
macro(PopCountI)
|
||||
macro(PopCountL)
|
||||
macro(PopCountVI)
|
||||
macro(PrefetchAllocation)
|
||||
macro(Proj)
|
||||
macro(RShiftI)
|
||||
|
|
|
@ -2325,8 +2325,11 @@ void SuperWord::output() {
|
|||
vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
|
||||
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
|
||||
}
|
||||
} else if (opc == Op_SqrtF || opc == Op_SqrtD || opc == Op_AbsF || opc == Op_AbsD || opc == Op_NegF || opc == Op_NegD) {
|
||||
// Promote operand to vector (Sqrt/Abs/Neg are 2 address instructions)
|
||||
} else if (opc == Op_SqrtF || opc == Op_SqrtD ||
|
||||
opc == Op_AbsF || opc == Op_AbsD ||
|
||||
opc == Op_NegF || opc == Op_NegD ||
|
||||
opc == Op_PopCountI) {
|
||||
assert(n->req() == 2, "only one input expected");
|
||||
Node* in = vector_opd(p, 1);
|
||||
vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n));
|
||||
vlen_in_bytes = vn->as_Vector()->length_in_bytes();
|
||||
|
|
|
@ -122,6 +122,13 @@ int VectorNode::opcode(int sopc, BasicType bt) {
|
|||
case Op_SqrtD:
|
||||
assert(bt == T_DOUBLE, "must be");
|
||||
return Op_SqrtVD;
|
||||
case Op_PopCountI:
|
||||
if (bt == T_INT) {
|
||||
return Op_PopCountVI;
|
||||
}
|
||||
// Unimplemented for subword types since bit count changes
|
||||
// depending on size of lane (and sign bit).
|
||||
return 0;
|
||||
case Op_LShiftI:
|
||||
switch (bt) {
|
||||
case T_BOOLEAN:
|
||||
|
@ -325,6 +332,8 @@ VectorNode* VectorNode::make(int opc, Node* n1, Node* n2, uint vlen, BasicType b
|
|||
case Op_SqrtVF: return new SqrtVFNode(n1, vt);
|
||||
case Op_SqrtVD: return new SqrtVDNode(n1, vt);
|
||||
|
||||
case Op_PopCountVI: return new PopCountVINode(n1, vt);
|
||||
|
||||
case Op_LShiftVB: return new LShiftVBNode(n1, n2, vt);
|
||||
case Op_LShiftVS: return new LShiftVSNode(n1, n2, vt);
|
||||
case Op_LShiftVI: return new LShiftVINode(n1, n2, vt);
|
||||
|
|
|
@ -381,6 +381,14 @@ class NegVDNode : public VectorNode {
|
|||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------PopCountVINode---------------------------------
|
||||
// Vector popcount of int elements. Single-input vector node: the constructor
// forwards the one input and the vector type to VectorNode. VectorNode::make
// creates it for Op_PopCountVI, which VectorNode::opcode selects for
// Op_PopCountI only when the element type is T_INT.
|
||||
class PopCountVINode : public VectorNode {
|
||||
public:
|
||||
PopCountVINode(Node* in, const TypeVect* vt) : VectorNode(in,vt) {}
|
||||
virtual int Opcode() const;
|
||||
};
|
||||
|
||||
//------------------------------SqrtVFNode--------------------------------------
|
||||
// Vector Sqrt float
|
||||
class SqrtVFNode : public VectorNode {
|
||||
|
|
|
@ -1996,6 +1996,7 @@ typedef PaddedEnd<ObjectMonitor> PaddedObjectMonitor;
|
|||
declare_c2_type(MulReductionVDNode, ReductionNode) \
|
||||
declare_c2_type(DivVFNode, VectorNode) \
|
||||
declare_c2_type(DivVDNode, VectorNode) \
|
||||
declare_c2_type(PopCountVINode, VectorNode) \
|
||||
declare_c2_type(LShiftVBNode, VectorNode) \
|
||||
declare_c2_type(LShiftVSNode, VectorNode) \
|
||||
declare_c2_type(LShiftVINode, VectorNode) \
|
||||
|
|
|
@ -111,6 +111,7 @@ tier1_compiler_3 = \
|
|||
compiler/types/ \
|
||||
compiler/uncommontrap/ \
|
||||
compiler/unsafe/ \
|
||||
compiler/vectorization/ \
|
||||
-compiler/intrinsics/bmi \
|
||||
-compiler/intrinsics/mathexact \
|
||||
-compiler/intrinsics/sha \
|
||||
|
|
|
@ -0,0 +1,80 @@
|
|||
/*
|
||||
* Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @test
|
||||
* @bug 8199421
|
||||
* @summary Test vectorization of popcount
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UsePopCountInstruction
|
||||
* compiler.vectorization.TestPopCountVector
|
||||
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UsePopCountInstruction
|
||||
* -XX:MaxVectorSize=8 compiler.vectorization.TestPopCountVector
|
||||
*/
|
||||
|
||||
package compiler.vectorization;
|
||||
|
||||
/**
 * Fills an int array, applies Integer.bitCount to every element in a simple
 * counted loop, and verifies the stored results against a recomputation.
 * Per the test summary this targets vectorization of popcount; the loop in
 * vectorizeBitCount() is the code under test, so its shape is kept minimal.
 */
public class TestPopCountVector {
|
||||
// Source values for the bit counts; populated once in the constructor.
private int[] input;
|
||||
// Destination for the bit counts written by vectorizeBitCount().
private int[] output;
|
||||
// Number of elements in both arrays.
private static final int LEN = 1024;
|
||||
|
||||
/**
 * Runs vectorizeBitCount() 10_000 times and checks the result, then repeats
 * the same batch and check a second time.
 * The repetition presumably lets the JIT compile the loop before the
 * checks -- confirm against the intended compilation thresholds.
 */
public static void main(String args[]) {
|
||||
TestPopCountVector test = new TestPopCountVector();
|
||||
|
||||
for (int i = 0; i < 10_000; ++i) {
|
||||
test.vectorizeBitCount();
|
||||
}
|
||||
System.out.println("Checking popcount result");
|
||||
test.checkResult();
|
||||
|
||||
for (int i = 0; i < 10_000; ++i) {
|
||||
test.vectorizeBitCount();
|
||||
}
|
||||
System.out.println("Checking popcount result");
|
||||
test.checkResult();
|
||||
}
|
||||
|
||||
/**
 * Allocates both arrays with LEN elements and sets input[i] to i for even
 * indices and to -i for odd indices, so both non-negative and negative
 * values are covered.
 */
public TestPopCountVector() {
|
||||
input = new int[LEN];
|
||||
output = new int[LEN];
|
||||
for (int i = 0; i < LEN; ++i) {
|
||||
input[i] = i % 2 == 0 ? i : -1 * i;
|
||||
}
|
||||
}
|
||||
|
||||
/** Stores Integer.bitCount(input[i]) into output[i] for every index. */
public void vectorizeBitCount() {
|
||||
for (int i = 0; i < LEN; ++i) {
|
||||
output[i] = Integer.bitCount(input[i]);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Recomputes Integer.bitCount(input[i]) for every index and compares it with
 * output[i].
 *
 * @throws RuntimeException on the first index whose stored value differs
 */
public void checkResult() {
|
||||
for (int i = 0; i < LEN; ++i) {
|
||||
int expected = Integer.bitCount(input[i]);
|
||||
if (output[i] != expected) {
|
||||
throw new RuntimeException("Invalid result: output[" + i + "] = " + output[i] + " != " + expected);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Add table
Add a link
Reference in a new issue