8199421: Add support for vector popcount

Reviewed-by: kvn
This commit is contained in:
Razvan Lupusoru 2018-03-13 10:22:15 -07:00
parent 147488cbce
commit 343cf9910d
13 changed files with 209 additions and 6 deletions

View file

@ -8709,6 +8709,15 @@ void Assembler::popcntq(Register dst, Register src) {
emit_int8((unsigned char)(0xC0 | encode)); emit_int8((unsigned char)(0xC0 | encode));
} }
// Emits the register-to-register form of the vector popcount instruction
// "vpopcntd": dst receives the result computed from src.
//   dst        - destination XMM register
//   src        - source XMM register
//   vector_len - vector length selector, forwarded unchanged to InstructionAttr
// Asserts that VM_Version::supports_vpopcntdq() holds and always marks the
// instruction as EVEX-encoded.
void Assembler::vpopcntd(XMMRegister dst, XMMRegister src, int vector_len) {
  assert(VM_Version::supports_vpopcntdq(), "must support vpopcntdq feature");

  InstructionAttr attributes(vector_len,
                             /* vex_w */ false,
                             /* legacy_mode */ false,
                             /* no_mask_reg */ false,
                             /* uses_vl */ true);
  attributes.set_is_evex_instruction();

  // Emit the prefix (66 SIMD prefix, 0F 38 opcode map). The middle register
  // encoding argument is 0 for this instruction.
  const int reg_bits = vex_prefix_and_encode(dst->encoding(), 0, src->encoding(),
                                             VEX_SIMD_66, VEX_OPCODE_0F_38, &attributes);

  // Opcode byte.
  emit_int8(0x55);
  // Final byte: 0xC0 sets the two top bits (register-direct operand form),
  // the low bits come from the prefix encoder.
  emit_int8((unsigned char)(0xC0 | reg_bits));
}
void Assembler::popq(Address dst) { void Assembler::popq(Address dst) {
InstructionMark im(this); InstructionMark im(this);
prefixq(dst); prefixq(dst);

View file

@ -1638,6 +1638,8 @@ private:
void popcntq(Register dst, Register src); void popcntq(Register dst, Register src);
#endif #endif
void vpopcntd(XMMRegister dst, XMMRegister src, int vector_len);
// Prefetches (SSE, SSE2, 3DNOW only) // Prefetches (SSE, SSE2, 3DNOW only)
void prefetchnta(Address src); void prefetchnta(Address src);

View file

@ -257,6 +257,8 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
__ lea(rsi, Address(rbp, in_bytes(VM_Version::sef_cpuid7_offset()))); __ lea(rsi, Address(rbp, in_bytes(VM_Version::sef_cpuid7_offset())));
__ movl(Address(rsi, 0), rax); __ movl(Address(rsi, 0), rax);
__ movl(Address(rsi, 4), rbx); __ movl(Address(rsi, 4), rbx);
__ movl(Address(rsi, 8), rcx);
__ movl(Address(rsi, 12), rdx);
// //
// Extended cpuid(0x80000000) // Extended cpuid(0x80000000)
@ -662,6 +664,7 @@ void VM_Version::get_processor_features() {
_features &= ~CPU_AVX512CD; _features &= ~CPU_AVX512CD;
_features &= ~CPU_AVX512BW; _features &= ~CPU_AVX512BW;
_features &= ~CPU_AVX512VL; _features &= ~CPU_AVX512VL;
_features &= ~CPU_AVX512_VPOPCNTDQ;
} }
if (UseAVX < 2) if (UseAVX < 2)

View file

@ -228,6 +228,38 @@ class VM_Version : public Abstract_VM_Version {
} bits; } bits;
}; };
// Decoded view of the ECX word stored for cpuid function 7 (structured
// extended features). 'value' is the raw register contents; 'bits' names the
// individual feature flags. Bit numbers in the comments below assume the
// compiler allocates bit-fields starting at the least significant bit.
union SefCpuid7Ecx {
  uint32_t value;
  struct {
    uint32_t prefetchwt1      : 1;   // bit 0
    uint32_t avx512_vbmi      : 1;   // bit 1
    uint32_t umip             : 1;   // bit 2
    uint32_t pku              : 1;   // bit 3
    uint32_t ospke            : 1;   // bit 4
    uint32_t                  : 1;   // bit 5 (not decoded)
    uint32_t avx512_vbmi2     : 1;   // bit 6
    uint32_t                  : 1;   // bit 7 (not decoded)
    uint32_t gfni             : 1;   // bit 8
    uint32_t vaes             : 1;   // bit 9
    uint32_t vpclmulqdq       : 1;   // bit 10
    uint32_t avx512_vnni      : 1;   // bit 11
    uint32_t avx512_bitalg    : 1;   // bit 12
    uint32_t                  : 1;   // bit 13 (not decoded)
    uint32_t avx512_vpopcntdq : 1;   // bit 14
    uint32_t                  : 17;  // bits 15-31 (not decoded)
  } bits;
};
// Decoded view of the EDX word stored for cpuid function 7 (structured
// extended features). 'value' is the raw register contents; 'bits' names the
// individual feature flags. Bit numbers in the comments below assume the
// compiler allocates bit-fields starting at the least significant bit.
union SefCpuid7Edx {
  uint32_t value;
  struct {
    uint32_t               : 2;   // bits 0-1 (not decoded)
    uint32_t avx512_4vnniw : 1;   // bit 2
    uint32_t avx512_4fmaps : 1;   // bit 3
    uint32_t               : 28;  // bits 4-31 (not decoded)
  } bits;
};
union ExtCpuid1EEbx { union ExtCpuid1EEbx {
uint32_t value; uint32_t value;
struct { struct {
@ -300,7 +332,8 @@ protected:
#define CPU_AVX512VL ((uint64_t)UCONST64(0x200000000)) // EVEX instructions with smaller vector length #define CPU_AVX512VL ((uint64_t)UCONST64(0x200000000)) // EVEX instructions with smaller vector length
#define CPU_SHA ((uint64_t)UCONST64(0x400000000)) // SHA instructions #define CPU_SHA ((uint64_t)UCONST64(0x400000000)) // SHA instructions
#define CPU_FMA ((uint64_t)UCONST64(0x800000000)) // FMA instructions #define CPU_FMA ((uint64_t)UCONST64(0x800000000)) // FMA instructions
#define CPU_VZEROUPPER ((uint64_t)UCONST64(0x1000000000)) // Vzeroupper instruction #define CPU_VZEROUPPER ((uint64_t)UCONST64(0x1000000000)) // Vzeroupper instruction
#define CPU_AVX512_VPOPCNTDQ ((uint64_t)UCONST64(0x2000000000)) // Vector popcount
enum Extended_Family { enum Extended_Family {
// AMD // AMD
@ -353,8 +386,8 @@ protected:
// cpuid function 7 (structured extended features) // cpuid function 7 (structured extended features)
SefCpuid7Eax sef_cpuid7_eax; SefCpuid7Eax sef_cpuid7_eax;
SefCpuid7Ebx sef_cpuid7_ebx; SefCpuid7Ebx sef_cpuid7_ebx;
uint32_t sef_cpuid7_ecx; // unused currently SefCpuid7Ecx sef_cpuid7_ecx;
uint32_t sef_cpuid7_edx; // unused currently SefCpuid7Edx sef_cpuid7_edx;
// cpuid function 0xB (processor topology) // cpuid function 0xB (processor topology)
// ecx = 0 // ecx = 0
@ -507,6 +540,8 @@ protected:
result |= CPU_AVX512BW; result |= CPU_AVX512BW;
if (_cpuid_info.sef_cpuid7_ebx.bits.avx512vl != 0) if (_cpuid_info.sef_cpuid7_ebx.bits.avx512vl != 0)
result |= CPU_AVX512VL; result |= CPU_AVX512VL;
if (_cpuid_info.sef_cpuid7_ecx.bits.avx512_vpopcntdq != 0)
result |= CPU_AVX512_VPOPCNTDQ;
} }
} }
if(_cpuid_info.sef_cpuid7_ebx.bits.bmi1 != 0) if(_cpuid_info.sef_cpuid7_ebx.bits.bmi1 != 0)
@ -783,6 +818,7 @@ public:
static bool supports_sha() { return (_features & CPU_SHA) != 0; } static bool supports_sha() { return (_features & CPU_SHA) != 0; }
static bool supports_fma() { return (_features & CPU_FMA) != 0 && supports_avx(); } static bool supports_fma() { return (_features & CPU_FMA) != 0 && supports_avx(); }
static bool supports_vzeroupper() { return (_features & CPU_VZEROUPPER) != 0; } static bool supports_vzeroupper() { return (_features & CPU_VZEROUPPER) != 0; }
// True when the CPU_AVX512_VPOPCNTDQ bit is set in _features (vector popcount).
static bool supports_vpopcntdq() { return (_features & CPU_AVX512_VPOPCNTDQ) != 0; }
// Intel features // Intel features
static bool is_intel_family_core() { return is_intel() && static bool is_intel_family_core() { return is_intel() &&

View file

@ -1223,6 +1223,10 @@ const bool Matcher::match_rule_supported(int opcode) {
if (!UsePopCountInstruction) if (!UsePopCountInstruction)
ret_value = false; ret_value = false;
break; break;
case Op_PopCountVI:
if (!UsePopCountInstruction || !VM_Version::supports_vpopcntdq())
ret_value = false;
break;
case Op_MulVI: case Op_MulVI:
if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
ret_value = false; ret_value = false;
@ -10788,3 +10792,49 @@ instruct vfma16F_mem(vecZ a, memory b, vecZ c) %{
%} %}
ins_pipe( pipe_slow ); ins_pipe( pipe_slow );
%} %}
// --------------------------------- PopCount --------------------------------------
// Vector popcount of 2 packed ints: matches PopCountVI on vecD operands.
// Selected only when VM_Version::supports_vpopcntdq() holds,
// UsePopCountInstruction is set and the matched vector node has 2 elements.
// NOTE(review): the predicate does not test for AVX512VL; presumably the
// narrower-than-512-bit EVEX form needs it -- confirm how CPUs that report
// vpopcntdq without VL are handled.
instruct vpopcount2I(vecD dst, vecD src) %{
predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 2);
match(Set dst (PopCountVI src));
format %{ "vpopcntd $dst,$src\t! vector popcount packed2I" %}
ins_encode %{
// Same vector_len value as the 4-element rule; presumably the 128-bit
// encoding -- confirm against the assembler's vector length constants.
int vector_len = 0;
__ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
%}
ins_pipe( pipe_slow );
%}
// Vector popcount of 4 packed ints: matches PopCountVI on vecX operands.
// Selected only when VM_Version::supports_vpopcntdq() holds,
// UsePopCountInstruction is set and the matched vector node has 4 elements.
// NOTE(review): the predicate does not test for AVX512VL; presumably the
// narrower-than-512-bit EVEX form needs it -- confirm how CPUs that report
// vpopcntdq without VL are handled.
instruct vpopcount4I(vecX dst, vecX src) %{
predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 4);
match(Set dst (PopCountVI src));
format %{ "vpopcntd $dst,$src\t! vector popcount packed4I" %}
ins_encode %{
// Same vector_len value as the 2-element rule; presumably the 128-bit
// encoding -- confirm against the assembler's vector length constants.
int vector_len = 0;
__ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
%}
ins_pipe( pipe_slow );
%}
// Vector popcount of 8 packed ints: matches PopCountVI on vecY operands.
// Selected only when VM_Version::supports_vpopcntdq() holds,
// UsePopCountInstruction is set and the matched vector node has 8 elements.
// NOTE(review): the predicate does not test for AVX512VL; presumably the
// narrower-than-512-bit EVEX form needs it -- confirm how CPUs that report
// vpopcntdq without VL are handled.
instruct vpopcount8I(vecY dst, vecY src) %{
predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 8);
match(Set dst (PopCountVI src));
format %{ "vpopcntd $dst,$src\t! vector popcount packed8I" %}
ins_encode %{
// vector_len 1: presumably the 256-bit encoding -- confirm against the
// assembler's vector length constants.
int vector_len = 1;
__ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
%}
ins_pipe( pipe_slow );
%}
// Vector popcount of 16 packed ints: matches PopCountVI on vecZ operands.
// Selected only when VM_Version::supports_vpopcntdq() holds,
// UsePopCountInstruction is set and the matched vector node has 16 elements.
instruct vpopcount16I(vecZ dst, vecZ src) %{
predicate(VM_Version::supports_vpopcntdq() && UsePopCountInstruction && n->as_Vector()->length() == 16);
match(Set dst (PopCountVI src));
format %{ "vpopcntd $dst,$src\t! vector popcount packed16I" %}
ins_encode %{
// vector_len 2: presumably the 512-bit encoding -- confirm against the
// assembler's vector length constants.
int vector_len = 2;
__ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vector_len);
%}
ins_pipe( pipe_slow );
%}

View file

@ -4180,7 +4180,7 @@ bool MatchRule::is_vector() const {
"URShiftVB","URShiftVS","URShiftVI","URShiftVL", "URShiftVB","URShiftVS","URShiftVI","URShiftVL",
"ReplicateB","ReplicateS","ReplicateI","ReplicateL","ReplicateF","ReplicateD", "ReplicateB","ReplicateS","ReplicateI","ReplicateL","ReplicateF","ReplicateD",
"LoadVector","StoreVector", "LoadVector","StoreVector",
"FmaVD", "FmaVF", "FmaVD", "FmaVF","PopCountVI",
// Next are not supported currently. // Next are not supported currently.
"PackB","PackS","PackI","PackL","PackF","PackD","Pack2L","Pack2D", "PackB","PackS","PackI","PackL","PackF","PackD","Pack2L","Pack2D",
"ExtractB","ExtractUB","ExtractC","ExtractS","ExtractI","ExtractL","ExtractF","ExtractD" "ExtractB","ExtractUB","ExtractC","ExtractS","ExtractI","ExtractL","ExtractF","ExtractD"

View file

@ -241,6 +241,7 @@ macro(PartialSubtypeCheck)
macro(Phi) macro(Phi)
macro(PopCountI) macro(PopCountI)
macro(PopCountL) macro(PopCountL)
macro(PopCountVI)
macro(PrefetchAllocation) macro(PrefetchAllocation)
macro(Proj) macro(Proj)
macro(RShiftI) macro(RShiftI)

View file

@ -2325,8 +2325,11 @@ void SuperWord::output() {
vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n)); vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
vlen_in_bytes = vn->as_Vector()->length_in_bytes(); vlen_in_bytes = vn->as_Vector()->length_in_bytes();
} }
} else if (opc == Op_SqrtF || opc == Op_SqrtD || opc == Op_AbsF || opc == Op_AbsD || opc == Op_NegF || opc == Op_NegD) { } else if (opc == Op_SqrtF || opc == Op_SqrtD ||
// Promote operand to vector (Sqrt/Abs/Neg are 2 address instructions) opc == Op_AbsF || opc == Op_AbsD ||
opc == Op_NegF || opc == Op_NegD ||
opc == Op_PopCountI) {
assert(n->req() == 2, "only one input expected");
Node* in = vector_opd(p, 1); Node* in = vector_opd(p, 1);
vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n)); vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n));
vlen_in_bytes = vn->as_Vector()->length_in_bytes(); vlen_in_bytes = vn->as_Vector()->length_in_bytes();

View file

@ -122,6 +122,13 @@ int VectorNode::opcode(int sopc, BasicType bt) {
case Op_SqrtD: case Op_SqrtD:
assert(bt == T_DOUBLE, "must be"); assert(bt == T_DOUBLE, "must be");
return Op_SqrtVD; return Op_SqrtVD;
case Op_PopCountI:
if (bt == T_INT) {
return Op_PopCountVI;
}
// Unimplemented for subword types since bit count changes
// depending on size of lane (and sign bit).
return 0;
case Op_LShiftI: case Op_LShiftI:
switch (bt) { switch (bt) {
case T_BOOLEAN: case T_BOOLEAN:
@ -325,6 +332,8 @@ VectorNode* VectorNode::make(int opc, Node* n1, Node* n2, uint vlen, BasicType b
case Op_SqrtVF: return new SqrtVFNode(n1, vt); case Op_SqrtVF: return new SqrtVFNode(n1, vt);
case Op_SqrtVD: return new SqrtVDNode(n1, vt); case Op_SqrtVD: return new SqrtVDNode(n1, vt);
case Op_PopCountVI: return new PopCountVINode(n1, vt);
case Op_LShiftVB: return new LShiftVBNode(n1, n2, vt); case Op_LShiftVB: return new LShiftVBNode(n1, n2, vt);
case Op_LShiftVS: return new LShiftVSNode(n1, n2, vt); case Op_LShiftVS: return new LShiftVSNode(n1, n2, vt);
case Op_LShiftVI: return new LShiftVINode(n1, n2, vt); case Op_LShiftVI: return new LShiftVINode(n1, n2, vt);

View file

@ -381,6 +381,14 @@ class NegVDNode : public VectorNode {
virtual int Opcode() const; virtual int Opcode() const;
}; };
//------------------------------PopCountVINode---------------------------------
// Vector popcount integer bits.
// Vector counterpart of the scalar PopCountI operation for int (T_INT)
// elements; instances are created by VectorNode::make() for Op_PopCountVI.
class PopCountVINode : public VectorNode {
public:
// in: the single vector operand; vt: the TypeVect for this node.
// Both are forwarded unchanged to the VectorNode base constructor.
PopCountVINode(Node* in, const TypeVect* vt) : VectorNode(in,vt) {}
// Declared here; the definition is not part of this class body.
virtual int Opcode() const;
};
//------------------------------SqrtVFNode-------------------------------------- //------------------------------SqrtVFNode--------------------------------------
// Vector Sqrt float // Vector Sqrt float
class SqrtVFNode : public VectorNode { class SqrtVFNode : public VectorNode {

View file

@ -1996,6 +1996,7 @@ typedef PaddedEnd<ObjectMonitor> PaddedObjectMonitor;
declare_c2_type(MulReductionVDNode, ReductionNode) \ declare_c2_type(MulReductionVDNode, ReductionNode) \
declare_c2_type(DivVFNode, VectorNode) \ declare_c2_type(DivVFNode, VectorNode) \
declare_c2_type(DivVDNode, VectorNode) \ declare_c2_type(DivVDNode, VectorNode) \
declare_c2_type(PopCountVINode, VectorNode) \
declare_c2_type(LShiftVBNode, VectorNode) \ declare_c2_type(LShiftVBNode, VectorNode) \
declare_c2_type(LShiftVSNode, VectorNode) \ declare_c2_type(LShiftVSNode, VectorNode) \
declare_c2_type(LShiftVINode, VectorNode) \ declare_c2_type(LShiftVINode, VectorNode) \

View file

@ -111,6 +111,7 @@ tier1_compiler_3 = \
compiler/types/ \ compiler/types/ \
compiler/uncommontrap/ \ compiler/uncommontrap/ \
compiler/unsafe/ \ compiler/unsafe/ \
compiler/vectorization/ \
-compiler/intrinsics/bmi \ -compiler/intrinsics/bmi \
-compiler/intrinsics/mathexact \ -compiler/intrinsics/mathexact \
-compiler/intrinsics/sha \ -compiler/intrinsics/sha \

View file

@ -0,0 +1,80 @@
/*
* Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
* @test
* @bug 8199421
* @summary Test vectorization of popcount
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UsePopCountInstruction
* compiler.vectorization.TestPopCountVector
* @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UsePopCountInstruction
* -XX:MaxVectorSize=8 compiler.vectorization.TestPopCountVector
*/
package compiler.vectorization;
/**
 * Runs Integer.bitCount over every element of an int array inside a simple
 * counted loop and verifies the stored results against a fresh
 * Integer.bitCount call per element.
 */
public class TestPopCountVector {
    private static final int LEN = 1024;

    private int[] input;
    private int[] output;

    /**
     * Performs two identical rounds: 10_000 invocations of the bit-count loop
     * (presumably enough for the JIT to compile it) followed by a result check.
     */
    public static void main(String args[]) {
        TestPopCountVector test = new TestPopCountVector();

        for (int round = 0; round < 2; ++round) {
            for (int iter = 0; iter < 10_000; ++iter) {
                test.vectorizeBitCount();
            }

            System.out.println("Checking popcount result");
            test.checkResult();
        }
    }

    /**
     * Allocates both arrays and fills the input with the index itself at even
     * positions and the negated index at odd positions.
     */
    public TestPopCountVector() {
        input = new int[LEN];
        output = new int[LEN];

        for (int idx = 0; idx < LEN; ++idx) {
            input[idx] = (idx & 1) == 0 ? idx : -idx;
        }
    }

    /** The loop under test: output[i] = Integer.bitCount(input[i]) for all i. */
    public void vectorizeBitCount() {
        for (int idx = 0; idx < LEN; ++idx) {
            output[idx] = Integer.bitCount(input[idx]);
        }
    }

    /**
     * Recomputes the bit count of every input element and compares it with the
     * stored output.
     *
     * @throws RuntimeException on the first mismatching element
     */
    public void checkResult() {
        for (int idx = 0; idx < LEN; ++idx) {
            int expected = Integer.bitCount(input[idx]);
            if (output[idx] == expected) {
                continue;
            }
            throw new RuntimeException("Invalid result: output[" + idx + "] = " + output[idx] + " != " + expected);
        }
    }
}