8199421: Add support for vector popcount

Reviewed-by: kvn
2025-08-28 15:24:43 +02:00 · 2018-03-13 10:22:15 -07:00 · 2018-03-13 10:22:15 -07:00 · 343cf9910d
commit 343cf9910d
parent 147488cbce
13 changed files with 209 additions and 6 deletions
--- a/src/hotspot/cpu/x86/assembler_x86.cpp
+++ b/src/hotspot/cpu/x86/assembler_x86.cpp
@ -8709,6 +8709,15 @@ void Assembler::popcntq(Register dst, Register src) {
  emit_int8((unsigned char)(0xC0 | encode));
 }
 // Emits the register-to-register form of vpopcntd: dst is the destination
 // XMM register, src the source XMM register, and vector_len is handed
 // unchanged to the instruction attributes. Asserts that the CPU reports the
 // vpopcntdq feature.
 void Assembler::vpopcntd(XMMRegister dst, XMMRegister src, int vector_len) {
  assert(VM_Version::supports_vpopcntdq(), "must support vpopcntdq feature");
  InstructionAttr attrs(vector_len,
                        /* vex_w */ false,
                        /* legacy_mode */ false,
                        /* no_mask_reg */ false,
                        /* uses_vl */ true);
  // This instruction is always marked as an EVEX instruction.
  attrs.set_is_evex_instruction();
  const int dst_enc = dst->encoding();
  const int src_enc = src->encoding();
  // Prefix goes out first; the returned bits are folded into the last byte.
  const int reg_bits = vex_prefix_and_encode(dst_enc, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, &attrs);
  emit_int8(0x55);
  // presumably the register-direct ModRM byte — confirm against the other emitters
  emit_int8((unsigned char)(0xC0 | reg_bits));
 }
 void Assembler::popq(Address dst) {
  InstructionMark im(this);
  prefixq(dst);
--- a/src/hotspot/cpu/x86/assembler_x86.hpp
+++ b/src/hotspot/cpu/x86/assembler_x86.hpp
@ -1638,6 +1638,8 @@ private:
  void popcntq(Register dst, Register src);
 #endif
  // Vector popcount (vpopcntd), XMM register to XMM register. vector_len is
  // passed through to the instruction attributes by the implementation, which
  // also asserts VM_Version::supports_vpopcntdq().
  void vpopcntd(XMMRegister dst, XMMRegister src, int vector_len);
  // Prefetches (SSE, SSE2, 3DNOW only)
  void prefetchnta(Address src);
--- a/src/hotspot/cpu/x86/vm_version_x86.cpp
+++ b/src/hotspot/cpu/x86/vm_version_x86.cpp
@ -257,6 +257,8 @@ class VM_Version_StubGenerator: public StubCodeGenerator {
    __ lea(rsi, Address(rbp, in_bytes(VM_Version::sef_cpuid7_offset())));
    __ movl(Address(rsi, 0), rax);
    __ movl(Address(rsi, 4), rbx);
    __ movl(Address(rsi, 8), rcx);
    __ movl(Address(rsi, 12), rdx);
    //
    // Extended cpuid(0x80000000)
@ -662,6 +664,7 @@ void VM_Version::get_processor_features() {
    _features &= ~CPU_AVX512CD;
    _features &= ~CPU_AVX512BW;
    _features &= ~CPU_AVX512VL;
    _features &= ~CPU_AVX512_VPOPCNTDQ;
  }
  if (UseAVX < 2)
--- a/src/hotspot/cpu/x86/vm_version_x86.hpp
+++ b/src/hotspot/cpu/x86/vm_version_x86.hpp
@ -228,6 +228,38 @@ class VM_Version : public Abstract_VM_Version {
    } bits;
  };
  // View of a cpuid function 7 ECX word, either as the raw 32-bit value or as
  // individual one-bit feature flags. The index in each trailing comment is
  // the field's position in declaration order; how those positions map onto
  // `value` is decided by the compiler's bit-field layout.
  union SefCpuid7Ecx {
    uint32_t value;
    struct {
      uint32_t prefetchwt1      : 1;   //  0
      uint32_t avx512_vbmi      : 1;   //  1
      uint32_t umip             : 1;   //  2
      uint32_t pku              : 1;   //  3
      uint32_t ospke            : 1;   //  4
      uint32_t                  : 1;   //  5 (unnamed)
      uint32_t avx512_vbmi2     : 1;   //  6
      uint32_t                  : 1;   //  7 (unnamed)
      uint32_t gfni             : 1;   //  8
      uint32_t vaes             : 1;   //  9
      uint32_t vpclmulqdq       : 1;   // 10
      uint32_t avx512_vnni      : 1;   // 11
      uint32_t avx512_bitalg    : 1;   // 12
      uint32_t                  : 1;   // 13 (unnamed)
      uint32_t avx512_vpopcntdq : 1;   // 14
      uint32_t                  : 17;  // 15..31 (unnamed)
    } bits;
  };
  // View of a cpuid function 7 EDX word, either as the raw 32-bit value or as
  // individual feature flags. Only two flags are named; the remaining 30 bits
  // are unnamed padding. Indices in the trailing comments follow declaration
  // order; the mapping onto `value` is the compiler's bit-field layout.
  union SefCpuid7Edx {
    uint32_t value;
    struct {
      uint32_t               : 2;   // 0..1 (unnamed)
      uint32_t avx512_4vnniw : 1;   // 2
      uint32_t avx512_4fmaps : 1;   // 3
      uint32_t               : 28;  // 4..31 (unnamed)
    } bits;
  };
  union ExtCpuid1EEbx {
    uint32_t value;
    struct {
@ -300,7 +332,8 @@ protected:
 #define CPU_AVX512VL ((uint64_t)UCONST64(0x200000000)) // EVEX instructions with smaller vector length
 #define CPU_SHA ((uint64_t)UCONST64(0x400000000))      // SHA instructions
 #define CPU_FMA ((uint64_t)UCONST64(0x800000000))      // FMA instructions
-#define CPU_VZEROUPPER ((uint64_t)UCONST64(0x1000000000))      // Vzeroupper instruction
+#define CPU_VZEROUPPER ((uint64_t)UCONST64(0x1000000000))       // Vzeroupper instruction
 #define CPU_AVX512_VPOPCNTDQ ((uint64_t)UCONST64(0x2000000000)) // Vector popcount
  enum Extended_Family {
    // AMD
@ -353,8 +386,8 @@ protected:
    // cpuid function 7 (structured extended features)
    SefCpuid7Eax sef_cpuid7_eax;
    SefCpuid7Ebx sef_cpuid7_ebx;
-    uint32_t     sef_cpuid7_ecx; // unused currently
+    SefCpuid7Ecx sef_cpuid7_ecx;
-    uint32_t     sef_cpuid7_edx; // unused currently
+    SefCpuid7Edx sef_cpuid7_edx;
    // cpuid function 0xB (processor topology)
    // ecx = 0
@ -507,6 +540,8 @@ protected:
          result |= CPU_AVX512BW;
        if (_cpuid_info.sef_cpuid7_ebx.bits.avx512vl != 0)
          result |= CPU_AVX512VL;
        if (_cpuid_info.sef_cpuid7_ecx.bits.avx512_vpopcntdq != 0)
          result |= CPU_AVX512_VPOPCNTDQ;
      }
    }
    if(_cpuid_info.sef_cpuid7_ebx.bits.bmi1 != 0)
@ -783,6 +818,7 @@ public:
  // Feature queries: each one tests its CPU_* bit in _features.
  // supports_fma() additionally requires supports_avx().
  static bool supports_sha()        { return (_features & CPU_SHA) != 0; }
  static bool supports_fma()        { return (_features & CPU_FMA) != 0 && supports_avx(); }
  static bool supports_vzeroupper() { return (_features & CPU_VZEROUPPER) != 0; }
  // True when the CPU_AVX512_VPOPCNTDQ (vector popcount) bit is set in _features.
  static bool supports_vpopcntdq()  { return (_features & CPU_AVX512_VPOPCNTDQ) != 0; }
  // Intel features
  static bool is_intel_family_core() { return is_intel() &&
--- a/src/hotspot/cpu/x86/x86.ad
+++ b/src/hotspot/cpu/x86/x86.ad
@ -1223,6 +1223,10 @@ const bool Matcher::match_rule_supported(int opcode) {
      if (!UsePopCountInstruction)
        ret_value = false;
      break;
    case Op_PopCountVI:
      if (!UsePopCountInstruction || !VM_Version::supports_vpopcntdq())
        ret_value = false;
      break;
    case Op_MulVI:
      if ((UseSSE < 4) && (UseAVX < 1)) // only with SSE4_1 or AVX
        ret_value = false;
@ -10788,3 +10792,49 @@ instruct vfma16F_mem(vecZ a, memory b, vecZ c) %{
  %}
  ins_pipe( pipe_slow );
 %}
 // --------------------------------- PopCount --------------------------------------
 // PopCountVI for a 2-element vector held in vecD operands. The rule applies
 // only when UsePopCountInstruction is on and the CPU reports vpopcntdq.
 // NOTE(review): per the Intel SDM the 128/256-bit EVEX forms of VPOPCNTD also
 // need AVX512VL; this predicate tests only supports_vpopcntdq() — confirm
 // that VL is guaranteed elsewhere.
 instruct vpopcount2I(vecD dst, vecD src) %{
  predicate(UsePopCountInstruction && VM_Version::supports_vpopcntdq() && n->as_Vector()->length() == 2);
  match(Set dst (PopCountVI src));
  format %{ "vpopcntd  $dst,$src\t! vector popcount packed2I" %}
  ins_encode %{
    // 0 is presumably the 128-bit vector-length encoding — confirm against Assembler.
    const int vlen_enc = 0;
    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
 %}
 // PopCountVI for a 4-element vector held in vecX operands. The rule applies
 // only when UsePopCountInstruction is on and the CPU reports vpopcntdq.
 // NOTE(review): per the Intel SDM the 128/256-bit EVEX forms of VPOPCNTD also
 // need AVX512VL; this predicate tests only supports_vpopcntdq() — confirm
 // that VL is guaranteed elsewhere.
 instruct vpopcount4I(vecX dst, vecX src) %{
  predicate(UsePopCountInstruction && VM_Version::supports_vpopcntdq() && n->as_Vector()->length() == 4);
  match(Set dst (PopCountVI src));
  format %{ "vpopcntd  $dst,$src\t! vector popcount packed4I" %}
  ins_encode %{
    // 0 is presumably the 128-bit vector-length encoding — confirm against Assembler.
    const int vlen_enc = 0;
    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
 %}
 // PopCountVI for an 8-element vector held in vecY operands. The rule applies
 // only when UsePopCountInstruction is on and the CPU reports vpopcntdq.
 // NOTE(review): per the Intel SDM the 128/256-bit EVEX forms of VPOPCNTD also
 // need AVX512VL; this predicate tests only supports_vpopcntdq() — confirm
 // that VL is guaranteed elsewhere.
 instruct vpopcount8I(vecY dst, vecY src) %{
  predicate(UsePopCountInstruction && VM_Version::supports_vpopcntdq() && n->as_Vector()->length() == 8);
  match(Set dst (PopCountVI src));
  format %{ "vpopcntd  $dst,$src\t! vector popcount packed8I" %}
  ins_encode %{
    // 1 is presumably the 256-bit vector-length encoding — confirm against Assembler.
    const int vlen_enc = 1;
    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
 %}
 // PopCountVI for a 16-element vector held in vecZ operands. The rule applies
 // only when UsePopCountInstruction is on and the CPU reports vpopcntdq.
 instruct vpopcount16I(vecZ dst, vecZ src) %{
  predicate(UsePopCountInstruction && VM_Version::supports_vpopcntdq() && n->as_Vector()->length() == 16);
  match(Set dst (PopCountVI src));
  format %{ "vpopcntd  $dst,$src\t! vector popcount packed16I" %}
  ins_encode %{
    // 2 is presumably the 512-bit vector-length encoding — confirm against Assembler.
    const int vlen_enc = 2;
    __ vpopcntd($dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
  %}
  ins_pipe( pipe_slow );
 %}
--- a/src/hotspot/share/adlc/formssel.cpp
+++ b/src/hotspot/share/adlc/formssel.cpp
@ -4180,7 +4180,7 @@ bool MatchRule::is_vector() const {
    "URShiftVB","URShiftVS","URShiftVI","URShiftVL",
    "ReplicateB","ReplicateS","ReplicateI","ReplicateL","ReplicateF","ReplicateD",
    "LoadVector","StoreVector",
-    "FmaVD", "FmaVF",
+    "FmaVD", "FmaVF","PopCountVI",
    // Next are not supported currently.
    "PackB","PackS","PackI","PackL","PackF","PackD","Pack2L","Pack2D",
    "ExtractB","ExtractUB","ExtractC","ExtractS","ExtractI","ExtractL","ExtractF","ExtractD"
--- a/src/hotspot/share/opto/classes.hpp
+++ b/src/hotspot/share/opto/classes.hpp
@ -241,6 +241,7 @@ macro(PartialSubtypeCheck)
 macro(Phi)
 macro(PopCountI)
 macro(PopCountL)
 macro(PopCountVI)
 macro(PrefetchAllocation)
 macro(Proj)
 macro(RShiftI)
--- a/src/hotspot/share/opto/superword.cpp
+++ b/src/hotspot/share/opto/superword.cpp
@ -2325,8 +2325,11 @@ void SuperWord::output() {
          vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
          vlen_in_bytes = vn->as_Vector()->length_in_bytes();
        }
-      } else if (opc == Op_SqrtF || opc == Op_SqrtD || opc == Op_AbsF || opc == Op_AbsD || opc == Op_NegF || opc == Op_NegD) {
+      } else if (opc == Op_SqrtF || opc == Op_SqrtD ||
-        // Promote operand to vector (Sqrt/Abs/Neg are 2 address instructions)
+                 opc == Op_AbsF || opc == Op_AbsD ||
                 opc == Op_NegF || opc == Op_NegD ||
                 opc == Op_PopCountI) {
        assert(n->req() == 2, "only one input expected");
        Node* in = vector_opd(p, 1);
        vn = VectorNode::make(opc, in, NULL, vlen, velt_basic_type(n));
        vlen_in_bytes = vn->as_Vector()->length_in_bytes();
--- a/src/hotspot/share/opto/vectornode.cpp
+++ b/src/hotspot/share/opto/vectornode.cpp
@ -122,6 +122,13 @@ int VectorNode::opcode(int sopc, BasicType bt) {
  case Op_SqrtD:
    assert(bt == T_DOUBLE, "must be");
    return Op_SqrtVD;
  case Op_PopCountI:
    if (bt == T_INT) {
      return Op_PopCountVI;
    }
    // Unimplemented for subword types since bit count changes
    // depending on size of lane (and sign bit).
    return 0;
  case Op_LShiftI:
    switch (bt) {
    case T_BOOLEAN:
@ -325,6 +332,8 @@ VectorNode* VectorNode::make(int opc, Node* n1, Node* n2, uint vlen, BasicType b
  case Op_SqrtVF: return new SqrtVFNode(n1, vt);
  case Op_SqrtVD: return new SqrtVDNode(n1, vt);
  case Op_PopCountVI: return new PopCountVINode(n1, vt);
  case Op_LShiftVB: return new LShiftVBNode(n1, n2, vt);
  case Op_LShiftVS: return new LShiftVSNode(n1, n2, vt);
  case Op_LShiftVI: return new LShiftVINode(n1, n2, vt);
--- a/src/hotspot/share/opto/vectornode.hpp
+++ b/src/hotspot/share/opto/vectornode.hpp
@ -381,6 +381,14 @@ class NegVDNode : public VectorNode {
  virtual int Opcode() const;
 };
 //------------------------------PopCountVINode---------------------------------
 // Vector counterpart of the scalar integer popcount: a VectorNode built from
 // a single input and a vector type.
 class PopCountVINode : public VectorNode {
  public:
  // Passes the operand and the vector type straight through to VectorNode.
  PopCountVINode(Node* in, const TypeVect* vt)
    : VectorNode(in, vt) {}
  // Only declared here; the definition is not part of this class body.
  virtual int Opcode() const;
 };
 //------------------------------SqrtVFNode--------------------------------------
 // Vector Sqrt float
 class SqrtVFNode : public VectorNode {
--- a/src/hotspot/share/runtime/vmStructs.cpp
+++ b/src/hotspot/share/runtime/vmStructs.cpp
@ -1996,6 +1996,7 @@ typedef PaddedEnd<ObjectMonitor>              PaddedObjectMonitor;
  declare_c2_type(MulReductionVDNode, ReductionNode)                      \
  declare_c2_type(DivVFNode, VectorNode)                                  \
  declare_c2_type(DivVDNode, VectorNode)                                  \
  declare_c2_type(PopCountVINode, VectorNode)                             \
  declare_c2_type(LShiftVBNode, VectorNode)                               \
  declare_c2_type(LShiftVSNode, VectorNode)                               \
  declare_c2_type(LShiftVINode, VectorNode)                               \
--- a/test/hotspot/jtreg/TEST.groups
+++ b/test/hotspot/jtreg/TEST.groups
@ -111,6 +111,7 @@ tier1_compiler_3 = \
  compiler/types/ \
  compiler/uncommontrap/ \
  compiler/unsafe/ \
  compiler/vectorization/ \
  -compiler/intrinsics/bmi \
  -compiler/intrinsics/mathexact \
  -compiler/intrinsics/sha \
--- a/test/hotspot/jtreg/compiler/vectorization/TestPopCountVector.java
+++ b/test/hotspot/jtreg/compiler/vectorization/TestPopCountVector.java
@ -0,0 +1,80 @@
 /*
 * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
 /**
 * @test
 * @bug 8199421
 * @summary Test vectorization of popcount
 * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UsePopCountInstruction
 *      compiler.vectorization.TestPopCountVector
 * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UsePopCountInstruction
 *      -XX:MaxVectorSize=8 compiler.vectorization.TestPopCountVector
 */
 package compiler.vectorization;
 /**
  * Applies {@code Integer.bitCount} element-wise from one int array into
  * another inside a plain counted loop and verifies every element afterwards.
  * The loop in {@link #vectorizeBitCount()} is the code under test; keep its
  * shape simple so the JIT compiler has a chance to vectorize it.
  */
 public class TestPopCountVector {
    // Source values, filled once by the constructor.
    private int[] input;
    // Receives the bit count of the corresponding input element.
    private int[] output;
    // Number of elements in both arrays.
    private static final int LEN = 1024;
    /**
     * Runs the popcount loop 10,000 times and checks the result, then does the
     * same a second time. Presumably the repetition is there so the loop gets
     * JIT-compiled before the later checks — confirm if the counts are tuned.
     */
    public static void main(String args[]) {
        TestPopCountVector test = new TestPopCountVector();
        for (int i = 0; i < 10_000; ++i) {
          test.vectorizeBitCount();
        }
        System.out.println("Checking popcount result");
        test.checkResult();
        for (int i = 0; i < 10_000; ++i) {
          test.vectorizeBitCount();
        }
        System.out.println("Checking popcount result");
        test.checkResult();
    }
    /**
     * Allocates both arrays and fills the input with i at even indices and -i
     * at odd indices, so positive and negative values are both covered.
     */
    public TestPopCountVector() {
        input = new int[LEN];
        output = new int[LEN];
        for (int i = 0; i < LEN; ++i) {
            input[i] = i % 2 == 0 ? i : -1 * i;
        }
    }
    /** Stores {@code Integer.bitCount(input[i])} into {@code output[i]} for every index. */
    public void vectorizeBitCount() {
        for (int i = 0; i < LEN; ++i) {
            output[i] = Integer.bitCount(input[i]);
        }
    }
    /**
     * Recomputes each expected bit count and compares it with the stored output.
     *
     * @throws RuntimeException on the first mismatching element
     */
    public void checkResult() {
        for (int i = 0; i < LEN; ++i) {
            int expected = Integer.bitCount(input[i]);
            if (output[i] != expected) {
                throw new RuntimeException("Invalid result: output[" + i + "] = " + output[i] + " != " + expected);
            }
        }
    }
 }