From de1c12ed636a43cc74b81c48cc987332fe341d7a Mon Sep 17 00:00:00 2001 From: Bhavana Kilambi Date: Mon, 27 Mar 2023 08:50:05 +0000 Subject: [PATCH] 8301012: [vectorapi]: Intrinsify CompressBitsV/ExpandBitsV and add the AArch64 SVE backend implementation Co-authored-by: Xiaohong Gong Co-authored-by: Jatin Bhateja Reviewed-by: ngasson, eliu, thartmann --- src/hotspot/cpu/aarch64/aarch64_vector.ad | 36 ++++ src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 | 30 ++++ src/hotspot/share/adlc/formssel.cpp | 2 +- src/hotspot/share/opto/classes.hpp | 2 + src/hotspot/share/opto/vectornode.cpp | 8 +- src/hotspot/share/opto/vectornode.hpp | 14 ++ src/hotspot/share/runtime/vmStructs.cpp | 2 + .../compiler/lib/ir_framework/IRNode.java | 10 ++ .../TestVectorCompressExpandBits.java | 158 ++++++++++++++++++ 9 files changed, 257 insertions(+), 5 deletions(-) create mode 100644 test/hotspot/jtreg/compiler/vectorapi/TestVectorCompressExpandBits.java diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad index 6fd60b33b52..445e47822ef 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector.ad +++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad @@ -220,6 +220,12 @@ source %{ return false; } break; + case Op_CompressBitsV: + case Op_ExpandBitsV: + if (UseSVE < 2 || !VM_Version::supports_svebitperm()) { + return false; + } + break; default: break; } @@ -240,6 +246,8 @@ source %{ case Op_MulReductionVF: case Op_MulReductionVI: case Op_MulReductionVL: + case Op_CompressBitsV: + case Op_ExpandBitsV: return false; // We use Op_LoadVectorMasked to implement the predicated Op_LoadVector. // Hence we turn to check whether Op_LoadVectorMasked is supported. The @@ -6619,3 +6627,31 @@ instruct vsignum_gt128b(vReg dst, vReg src, vReg zero, vReg one, vReg tmp, pRegG %} ins_pipe(pipe_slow); %} + +// ---------------------------------- CompressBitsV -------------------------------- + +instruct vcompressBits(vReg dst, vReg src1, vReg src2) %{ + match(Set dst (CompressBitsV src1 src2)); + format %{ "vcompressBits $dst, $src1, $src2\t# vector (sve)" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt); + __ sve_bext($dst$$FloatRegister, size, + $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + +// ----------------------------------- ExpandBitsV --------------------------------- + +instruct vexpandBits(vReg dst, vReg src1, vReg src2) %{ + match(Set dst (ExpandBitsV src1 src2)); + format %{ "vexpandBits $dst, $src1, $src2\t# vector (sve)" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt); + __ sve_bdep($dst$$FloatRegister, size, + $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 index e48c8a7c03e..f372854dd2c 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 +++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 @@ -210,6 +210,12 @@ source %{ return false; } break; + case Op_CompressBitsV: + case Op_ExpandBitsV: + if (UseSVE < 2 || !VM_Version::supports_svebitperm()) { + return false; + } + break; default: break; } @@ -230,6 +236,8 @@ source %{ case Op_MulReductionVF: case Op_MulReductionVI: case Op_MulReductionVL: + case Op_CompressBitsV: + case Op_ExpandBitsV: return false; // We use Op_LoadVectorMasked to implement the predicated Op_LoadVector. 
// Hence we turn to check whether Op_LoadVectorMasked is supported. The @@ -4950,3 +4958,25 @@ instruct vsignum_gt128b(vReg dst, vReg src, vReg zero, vReg one, vReg tmp, pRegG %} ins_pipe(pipe_slow); %} + +dnl +dnl BITPERM($1, $2, $3 ) +dnl BITPERM(insn_name, op_name, insn) +define(`BITPERM', ` +instruct $1(vReg dst, vReg src1, vReg src2) %{ + match(Set dst ($2 src1 src2)); + format %{ "$1 $dst, $src1, $src2\t# vector (sve)" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt); + __ $3($dst$$FloatRegister, size, + $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// ---------------------------------- CompressBitsV -------------------------------- +BITPERM(vcompressBits, CompressBitsV, sve_bext) + +// ----------------------------------- ExpandBitsV --------------------------------- +BITPERM(vexpandBits, ExpandBitsV, sve_bdep) diff --git a/src/hotspot/share/adlc/formssel.cpp b/src/hotspot/share/adlc/formssel.cpp index ce72c138bc6..316fb06b2db 100644 --- a/src/hotspot/share/adlc/formssel.cpp +++ b/src/hotspot/share/adlc/formssel.cpp @@ -4210,7 +4210,7 @@ bool MatchRule::is_vector() const { "SqrtVD","SqrtVF", "AndV" ,"XorV" ,"OrV", "MaxV", "MinV", - "CompressV", "ExpandV", "CompressM", + "CompressV", "ExpandV", "CompressM", "CompressBitsV", "ExpandBitsV", "AddReductionVI", "AddReductionVL", "AddReductionVF", "AddReductionVD", "MulReductionVI", "MulReductionVL", diff --git a/src/hotspot/share/opto/classes.hpp b/src/hotspot/share/opto/classes.hpp index 8a9f4146b9e..4531b0ecd8f 100644 --- a/src/hotspot/share/opto/classes.hpp +++ b/src/hotspot/share/opto/classes.hpp @@ -77,6 +77,8 @@ macro(CheckCastPP) macro(ClearArray) macro(CompressBits) macro(ExpandBits) +macro(CompressBitsV) +macro(ExpandBitsV) macro(ConstraintCast) macro(CMoveD) macro(CMoveVD) diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index 66878384836..c51958be06b 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -182,11 +182,9 @@ int VectorNode::opcode(int sopc, BasicType bt) { case Op_ReverseBytesL: return (bt == T_LONG ? Op_ReverseBytesV : 0); case Op_CompressBits: - // Not implemented. Returning 0 temporarily - return 0; + return (bt == T_INT || bt == T_LONG ? Op_CompressBitsV : 0); case Op_ExpandBits: - // Not implemented. Returning 0 temporarily - return 0; + return (bt == T_INT || bt == T_LONG ? 
Op_ExpandBitsV : 0); case Op_LShiftI: switch (bt) { case T_BOOLEAN: @@ -703,6 +701,8 @@ VectorNode* VectorNode::make(int vopc, Node* n1, Node* n2, const TypeVect* vt, b case Op_ExpandV: return new ExpandVNode(n1, n2, vt); case Op_CompressV: return new CompressVNode(n1, n2, vt); case Op_CompressM: assert(n1 == nullptr, ""); return new CompressMNode(n2, vt); + case Op_CompressBitsV: return new CompressBitsVNode(n1, n2, vt); + case Op_ExpandBitsV: return new ExpandBitsVNode(n1, n2, vt); case Op_CountLeadingZerosV: return new CountLeadingZerosVNode(n1, vt); case Op_CountTrailingZerosV: return new CountTrailingZerosVNode(n1, vt); default: diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp index b67ec59f3a0..13e6fc2a232 100644 --- a/src/hotspot/share/opto/vectornode.hpp +++ b/src/hotspot/share/opto/vectornode.hpp @@ -1804,4 +1804,18 @@ public: virtual int Opcode() const; }; +class CompressBitsVNode : public VectorNode { +public: + CompressBitsVNode(Node* in, Node* mask, const TypeVect* vt) + : VectorNode(in, mask, vt) {} + virtual int Opcode() const; +}; + +class ExpandBitsVNode : public VectorNode { +public: + ExpandBitsVNode(Node* in, Node* mask, const TypeVect* vt) + : VectorNode(in, mask, vt) {} + virtual int Opcode() const; +}; + #endif // SHARE_OPTO_VECTORNODE_HPP diff --git a/src/hotspot/share/runtime/vmStructs.cpp b/src/hotspot/share/runtime/vmStructs.cpp index 28928a2120f..9ff307c0a6e 100644 --- a/src/hotspot/share/runtime/vmStructs.cpp +++ b/src/hotspot/share/runtime/vmStructs.cpp @@ -1764,6 +1764,8 @@ declare_c2_type(CompressVNode, VectorNode) \ declare_c2_type(CompressMNode, VectorNode) \ declare_c2_type(ExpandVNode, VectorNode) \ + declare_c2_type(CompressBitsVNode, VectorNode) \ + declare_c2_type(ExpandBitsVNode, VectorNode) \ declare_c2_type(MulReductionVDNode, ReductionNode) \ declare_c2_type(DivVFNode, VectorNode) \ declare_c2_type(DivVDNode, VectorNode) \ diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java index 6e326672282..6eb5ae996b1 100644 --- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java +++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java @@ -1408,6 +1408,16 @@ public class IRNode { machOnlyNameRegex(XOR3_SVE, "veor3_sve"); } + public static final String COMPRESS_BITSV = PREFIX + "COMPRESS_BITSV" + POSTFIX; + static { + beforeMatchingNameRegex(COMPRESS_BITSV, "CompressBitsV"); + } + + public static final String EXPAND_BITSV = PREFIX + "EXPAND_BITSV" + POSTFIX; + static { + beforeMatchingNameRegex(EXPAND_BITSV, "ExpandBitsV"); + } + /* * Utility methods to set up IR_NODE_MAPPINGS. */ diff --git a/test/hotspot/jtreg/compiler/vectorapi/TestVectorCompressExpandBits.java b/test/hotspot/jtreg/compiler/vectorapi/TestVectorCompressExpandBits.java new file mode 100644 index 00000000000..3b92960c181 --- /dev/null +++ b/test/hotspot/jtreg/compiler/vectorapi/TestVectorCompressExpandBits.java @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2023, Arm Limited. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package compiler.vectorapi;
+
+import compiler.lib.ir_framework.*;
+
+import java.util.Random;
+
+import jdk.incubator.vector.IntVector;
+import jdk.incubator.vector.LongVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+
+import jdk.test.lib.Asserts;
+import jdk.test.lib.Utils;
+
+/**
+ * @test
+ * @bug 8301012
+ * @library /test/lib /
+ * @requires os.arch == "aarch64" & vm.cpu.features ~= ".*sve2.*" & vm.cpu.features ~= ".*svebitperm.*"
+ * @summary [vectorapi]: Intrinsify CompressBitsV/ExpandBitsV and add the AArch64 SVE backend implementation
+ * @modules jdk.incubator.vector
+ * @run driver compiler.vectorapi.TestVectorCompressExpandBits
+ */
+
+public class TestVectorCompressExpandBits {
+    private static final VectorSpecies<Integer> I_SPECIES = IntVector.SPECIES_PREFERRED;
+    private static final VectorSpecies<Long> L_SPECIES = LongVector.SPECIES_PREFERRED;
+
+    private static int LENGTH = 1024;
+    private static final Random RD = Utils.getRandomInstance();
+
+    private static int[] ia;
+    private static int[] ib;
+    private static int[] ir;
+    private static long[] la;
+    private static long[] lb;
+    private static long[] lr;
+
+    static {
+        ia = new int[LENGTH];
+        ib = new int[LENGTH];
+        ir = new int[LENGTH];
+        la = new long[LENGTH];
+        lb = new long[LENGTH];
+        lr = new long[LENGTH];
+
+        for (int i = 0; i < LENGTH; i++) {
+            ia[i] = RD.nextInt(25);
+            ib[i] = RD.nextInt(25);
+            la[i] = RD.nextLong(25);
+            lb[i] = RD.nextLong(25);
+        }
+    }
+
+    // Test for vectorized Integer.compress operation in SVE2
+    @Test
+    @IR(counts = {IRNode.COMPRESS_BITSV, "> 0"})
+    public static void testIntCompress() {
+        for (int i = 0; i < LENGTH; i += I_SPECIES.length()) {
+            IntVector av = IntVector.fromArray(I_SPECIES, ia, i);
+            IntVector bv = IntVector.fromArray(I_SPECIES, ib, i);
+            av.lanewise(VectorOperators.COMPRESS_BITS, bv).intoArray(ir, i);
+        }
+    }
+
+    @Run(test = "testIntCompress")
+    public static void testIntCompress_runner() {
+        testIntCompress();
+        for (int i = 0; i < LENGTH; i++) {
+            Asserts.assertEquals(Integer.compress(ia[i], ib[i]), ir[i]);
+        }
+    }
+
+    // Test for vectorized Integer.expand operation in SVE2
+    @Test
+    @IR(counts = {IRNode.EXPAND_BITSV, "> 0"})
+    public static void testIntExpand() {
+        for (int i = 0; i < LENGTH; i += I_SPECIES.length()) {
+            IntVector av = IntVector.fromArray(I_SPECIES, ia, i);
+            IntVector bv = IntVector.fromArray(I_SPECIES, ib, i);
+            av.lanewise(VectorOperators.EXPAND_BITS, bv).intoArray(ir, i);
+        }
+    }
+
+    @Run(test = "testIntExpand")
+    public static void testIntExpand_runner() {
+        testIntExpand();
+        for (int i = 0; i < LENGTH; i++) {
+            Asserts.assertEquals(Integer.expand(ia[i], ib[i]), ir[i]);
+        }
+    }
+
+    // Test for vectorized Long.compress operation in SVE2
+    @Test
+    @IR(counts = {IRNode.COMPRESS_BITSV, "> 0"})
+    public static void testLongCompress() {
+        for (int i = 0; i < LENGTH; i += L_SPECIES.length()) {
+            LongVector av = LongVector.fromArray(L_SPECIES, la, i);
+            LongVector bv =
LongVector.fromArray(L_SPECIES, lb, i); + av.lanewise(VectorOperators.COMPRESS_BITS, bv).intoArray(lr, i); + } + } + + @Run(test = "testLongCompress") + public static void testLongCompress_runner() { + testLongCompress(); + for (int i = 0; i < LENGTH; i++) { + Asserts.assertEquals(Long.compress(la[i], lb[i]), lr[i]); + } + } + + // Test for vectorized Long.expand operation in SVE2 + @Test + @IR(counts = {IRNode.EXPAND_BITSV, "> 0"}) + public static void testLongExpand() { + for (int i = 0; i < LENGTH; i += L_SPECIES.length()) { + LongVector av = LongVector.fromArray(L_SPECIES, la, i); + LongVector bv = LongVector.fromArray(L_SPECIES, lb, i); + av.lanewise(VectorOperators.EXPAND_BITS, bv).intoArray(lr, i); + } + } + + @Run(test = "testLongExpand") + public static void testLongExpand_runner() { + testLongExpand(); + for (int i = 0; i < LENGTH; i++) { + Asserts.assertEquals(Long.expand(la[i], lb[i]), lr[i]); + } + } + + public static void main(String[] args) { + TestFramework.runWithFlags("--add-modules=jdk.incubator.vector", + "-XX:UseSVE=2"); + } +}
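
For reference, the per-lane semantics that the new CompressBitsV/ExpandBitsV
nodes vectorize are exactly those of Integer.compress/expand and
Long.compress/expand: SVE2 BEXT gathers the bits of src1 selected by src2 into
the low end of each lane, and BDEP scatters the low-order bits of src1 to the
bit positions set in src2. The scalar sketch below is illustrative only and is
not part of the patch; the class name CompressExpandBitsDemo and the sample
values are made up for the example.

public class CompressExpandBitsDemo {
    public static void main(String[] args) {
        int x    = 0b1011_1010;
        int mask = 0b0110_1100;

        // compress: gather the bits of x at mask's set positions into the low end
        int c = Integer.compress(x, mask);
        // expand: scatter the low-order bits of c back to mask's set positions
        int e = Integer.expand(c, mask);

        // Round-trip identity: expand(compress(x, m), m) == (x & m)
        System.out.println(e == (x & mask)); // prints "true"
    }
}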