From de1c12ed636a43cc74b81c48cc987332fe341d7a Mon Sep 17 00:00:00 2001 From: Bhavana Kilambi Date: Mon, 27 Mar 2023 08:50:05 +0000 Subject: [PATCH] 8301012: [vectorapi]: Intrinsify CompressBitsV/ExpandBitsV and add the AArch64 SVE backend implementation Co-authored-by: Xiaohong Gong Co-authored-by: Jatin Bhateja Reviewed-by: ngasson, eliu, thartmann --- src/hotspot/cpu/aarch64/aarch64_vector.ad | 36 ++++ src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 | 30 ++++ src/hotspot/share/adlc/formssel.cpp | 2 +- src/hotspot/share/opto/classes.hpp | 2 + src/hotspot/share/opto/vectornode.cpp | 8 +- src/hotspot/share/opto/vectornode.hpp | 14 ++ src/hotspot/share/runtime/vmStructs.cpp | 2 + .../compiler/lib/ir_framework/IRNode.java | 10 ++ .../TestVectorCompressExpandBits.java | 158 ++++++++++++++++++ 9 files changed, 257 insertions(+), 5 deletions(-) create mode 100644 test/hotspot/jtreg/compiler/vectorapi/TestVectorCompressExpandBits.java diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad index 6fd60b33b52..445e47822ef 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector.ad +++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad @@ -220,6 +220,12 @@ source %{ return false; } break; + case Op_CompressBitsV: + case Op_ExpandBitsV: + if (UseSVE < 2 || !VM_Version::supports_svebitperm()) { + return false; + } + break; default: break; } @@ -240,6 +246,8 @@ source %{ case Op_MulReductionVF: case Op_MulReductionVI: case Op_MulReductionVL: + case Op_CompressBitsV: + case Op_ExpandBitsV: return false; // We use Op_LoadVectorMasked to implement the predicated Op_LoadVector. // Hence we turn to check whether Op_LoadVectorMasked is supported. The @@ -6619,3 +6627,31 @@ instruct vsignum_gt128b(vReg dst, vReg src, vReg zero, vReg one, vReg tmp, pRegG %} ins_pipe(pipe_slow); %} + +// ---------------------------------- CompressBitsV -------------------------------- + +instruct vcompressBits(vReg dst, vReg src1, vReg src2) %{ + match(Set dst (CompressBitsV src1 src2)); + format %{ "vcompressBits $dst, $src1, $src2\t# vector (sve)" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt); + __ sve_bext($dst$$FloatRegister, size, + $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} + +// ----------------------------------- ExpandBitsV --------------------------------- + +instruct vexpandBits(vReg dst, vReg src1, vReg src2) %{ + match(Set dst (ExpandBitsV src1 src2)); + format %{ "vexpandBits $dst, $src1, $src2\t# vector (sve)" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt); + __ sve_bdep($dst$$FloatRegister, size, + $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe(pipe_slow); +%} diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 index e48c8a7c03e..f372854dd2c 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 +++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 @@ -210,6 +210,12 @@ source %{ return false; } break; + case Op_CompressBitsV: + case Op_ExpandBitsV: + if (UseSVE < 2 || !VM_Version::supports_svebitperm()) { + return false; + } + break; default: break; } @@ -230,6 +236,8 @@ source %{ case Op_MulReductionVF: case Op_MulReductionVI: case Op_MulReductionVL: + case Op_CompressBitsV: + case Op_ExpandBitsV: return false; // We use Op_LoadVectorMasked to implement the predicated Op_LoadVector. 
// Hence we turn to check whether Op_LoadVectorMasked is supported. The @@ -4950,3 +4958,25 @@ instruct vsignum_gt128b(vReg dst, vReg src, vReg zero, vReg one, vReg tmp, pRegG %} ins_pipe(pipe_slow); %} + +dnl +dnl BITPERM($1, $2, $3 ) +dnl BITPERM(insn_name, op_name, insn) +define(`BITPERM', ` +instruct $1(vReg dst, vReg src1, vReg src2) %{ + match(Set dst ($2 src1 src2)); + format %{ "$1 $dst, $src1, $src2\t# vector (sve)" %} + ins_encode %{ + BasicType bt = Matcher::vector_element_basic_type(this); + Assembler::SIMD_RegVariant size = __ elemType_to_regVariant(bt); + __ $3($dst$$FloatRegister, size, + $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe(pipe_slow); +%}')dnl +dnl +// ---------------------------------- CompressBitsV -------------------------------- +BITPERM(vcompressBits, CompressBitsV, sve_bext) + +// ----------------------------------- ExpandBitsV --------------------------------- +BITPERM(vexpandBits, ExpandBitsV, sve_bdep) diff --git a/src/hotspot/share/adlc/formssel.cpp b/src/hotspot/share/adlc/formssel.cpp index ce72c138bc6..316fb06b2db 100644 --- a/src/hotspot/share/adlc/formssel.cpp +++ b/src/hotspot/share/adlc/formssel.cpp @@ -4210,7 +4210,7 @@ bool MatchRule::is_vector() const { "SqrtVD","SqrtVF", "AndV" ,"XorV" ,"OrV", "MaxV", "MinV", - "CompressV", "ExpandV", "CompressM", + "CompressV", "ExpandV", "CompressM", "CompressBitsV", "ExpandBitsV", "AddReductionVI", "AddReductionVL", "AddReductionVF", "AddReductionVD", "MulReductionVI", "MulReductionVL", diff --git a/src/hotspot/share/opto/classes.hpp b/src/hotspot/share/opto/classes.hpp index 8a9f4146b9e..4531b0ecd8f 100644 --- a/src/hotspot/share/opto/classes.hpp +++ b/src/hotspot/share/opto/classes.hpp @@ -77,6 +77,8 @@ macro(CheckCastPP) macro(ClearArray) macro(CompressBits) macro(ExpandBits) +macro(CompressBitsV) +macro(ExpandBitsV) macro(ConstraintCast) macro(CMoveD) macro(CMoveVD) diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index 66878384836..c51958be06b 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -182,11 +182,9 @@ int VectorNode::opcode(int sopc, BasicType bt) { case Op_ReverseBytesL: return (bt == T_LONG ? Op_ReverseBytesV : 0); case Op_CompressBits: - // Not implemented. Returning 0 temporarily - return 0; + return (bt == T_INT || bt == T_LONG ? Op_CompressBitsV : 0); case Op_ExpandBits: - // Not implemented. Returning 0 temporarily - return 0; + return (bt == T_INT || bt == T_LONG ? 
Op_ExpandBitsV : 0); case Op_LShiftI: switch (bt) { case T_BOOLEAN: @@ -703,6 +701,8 @@ VectorNode* VectorNode::make(int vopc, Node* n1, Node* n2, const TypeVect* vt, b case Op_ExpandV: return new ExpandVNode(n1, n2, vt); case Op_CompressV: return new CompressVNode(n1, n2, vt); case Op_CompressM: assert(n1 == nullptr, ""); return new CompressMNode(n2, vt); + case Op_CompressBitsV: return new CompressBitsVNode(n1, n2, vt); + case Op_ExpandBitsV: return new ExpandBitsVNode(n1, n2, vt); case Op_CountLeadingZerosV: return new CountLeadingZerosVNode(n1, vt); case Op_CountTrailingZerosV: return new CountTrailingZerosVNode(n1, vt); default: diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp index b67ec59f3a0..13e6fc2a232 100644 --- a/src/hotspot/share/opto/vectornode.hpp +++ b/src/hotspot/share/opto/vectornode.hpp @@ -1804,4 +1804,18 @@ public: virtual int Opcode() const; }; +class CompressBitsVNode : public VectorNode { +public: + CompressBitsVNode(Node* in, Node* mask, const TypeVect* vt) + : VectorNode(in, mask, vt) {} + virtual int Opcode() const; +}; + +class ExpandBitsVNode : public VectorNode { +public: + ExpandBitsVNode(Node* in, Node* mask, const TypeVect* vt) + : VectorNode(in, mask, vt) {} + virtual int Opcode() const; +}; + #endif // SHARE_OPTO_VECTORNODE_HPP diff --git a/src/hotspot/share/runtime/vmStructs.cpp b/src/hotspot/share/runtime/vmStructs.cpp index 28928a2120f..9ff307c0a6e 100644 --- a/src/hotspot/share/runtime/vmStructs.cpp +++ b/src/hotspot/share/runtime/vmStructs.cpp @@ -1764,6 +1764,8 @@ declare_c2_type(CompressVNode, VectorNode) \ declare_c2_type(CompressMNode, VectorNode) \ declare_c2_type(ExpandVNode, VectorNode) \ + declare_c2_type(CompressBitsVNode, VectorNode) \ + declare_c2_type(ExpandBitsVNode, VectorNode) \ declare_c2_type(MulReductionVDNode, ReductionNode) \ declare_c2_type(DivVFNode, VectorNode) \ declare_c2_type(DivVDNode, VectorNode) \ diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java index 6e326672282..6eb5ae996b1 100644 --- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java +++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java @@ -1408,6 +1408,16 @@ public class IRNode { machOnlyNameRegex(XOR3_SVE, "veor3_sve"); } + public static final String COMPRESS_BITSV = PREFIX + "COMPRESS_BITSV" + POSTFIX; + static { + beforeMatchingNameRegex(COMPRESS_BITSV, "CompressBitsV"); + } + + public static final String EXPAND_BITSV = PREFIX + "EXPAND_BITSV" + POSTFIX; + static { + beforeMatchingNameRegex(EXPAND_BITSV, "ExpandBitsV"); + } + /* * Utility methods to set up IR_NODE_MAPPINGS. */ diff --git a/test/hotspot/jtreg/compiler/vectorapi/TestVectorCompressExpandBits.java b/test/hotspot/jtreg/compiler/vectorapi/TestVectorCompressExpandBits.java new file mode 100644 index 00000000000..3b92960c181 --- /dev/null +++ b/test/hotspot/jtreg/compiler/vectorapi/TestVectorCompressExpandBits.java @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2023, Arm Limited. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package compiler.vectorapi;
+
+import compiler.lib.ir_framework.*;
+
+import java.util.Random;
+
+import jdk.incubator.vector.IntVector;
+import jdk.incubator.vector.LongVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+
+import jdk.test.lib.Asserts;
+import jdk.test.lib.Utils;
+
+/**
+ * @test
+ * @bug 8301012
+ * @library /test/lib /
+ * @requires os.arch == "aarch64" & vm.cpu.features ~= ".*sve2.*" & vm.cpu.features ~= ".*svebitperm.*"
+ * @summary [vectorapi]: Intrinsify CompressBitsV/ExpandBitsV and add the AArch64 SVE backend implementation
+ * @modules jdk.incubator.vector
+ * @run driver compiler.vectorapi.TestVectorCompressExpandBits
+ */
+
+public class TestVectorCompressExpandBits {
+    private static final VectorSpecies<Integer> I_SPECIES = IntVector.SPECIES_PREFERRED;
+    private static final VectorSpecies<Long> L_SPECIES = LongVector.SPECIES_PREFERRED;
+
+    private static int LENGTH = 1024;
+    private static final Random RD = Utils.getRandomInstance();
+
+    private static int[] ia;
+    private static int[] ib;
+    private static int[] ir;
+    private static long[] la;
+    private static long[] lb;
+    private static long[] lr;
+
+    static {
+        ia = new int[LENGTH];
+        ib = new int[LENGTH];
+        ir = new int[LENGTH];
+        la = new long[LENGTH];
+        lb = new long[LENGTH];
+        lr = new long[LENGTH];
+
+        for (int i = 0; i < LENGTH; i++) {
+            ia[i] = RD.nextInt(25);
+            ib[i] = RD.nextInt(25);
+            la[i] = RD.nextLong(25);
+            lb[i] = RD.nextLong(25);
+        }
+    }
+
+    // Test for vectorized Integer.compress operation in SVE2
+    @Test
+    @IR(counts = {IRNode.COMPRESS_BITSV, "> 0"})
+    public static void testIntCompress() {
+        for (int i = 0; i < LENGTH; i += I_SPECIES.length()) {
+            IntVector av = IntVector.fromArray(I_SPECIES, ia, i);
+            IntVector bv = IntVector.fromArray(I_SPECIES, ib, i);
+            av.lanewise(VectorOperators.COMPRESS_BITS, bv).intoArray(ir, i);
+        }
+    }
+
+    @Run(test = "testIntCompress")
+    public static void testIntCompress_runner() {
+        testIntCompress();
+        for (int i = 0; i < LENGTH; i++) {
+            Asserts.assertEquals(Integer.compress(ia[i], ib[i]), ir[i]);
+        }
+    }
+
+    // Test for vectorized Integer.expand operation in SVE2
+    @Test
+    @IR(counts = {IRNode.EXPAND_BITSV, "> 0"})
+    public static void testIntExpand() {
+        for (int i = 0; i < LENGTH; i += I_SPECIES.length()) {
+            IntVector av = IntVector.fromArray(I_SPECIES, ia, i);
+            IntVector bv = IntVector.fromArray(I_SPECIES, ib, i);
+            av.lanewise(VectorOperators.EXPAND_BITS, bv).intoArray(ir, i);
+        }
+    }
+
+    @Run(test = "testIntExpand")
+    public static void testIntExpand_runner() {
+        testIntExpand();
+        for (int i = 0; i < LENGTH; i++) {
+            Asserts.assertEquals(Integer.expand(ia[i], ib[i]), ir[i]);
+        }
+    }
+
+    // Test for vectorized Long.compress operation in SVE2
+    @Test
+    @IR(counts = {IRNode.COMPRESS_BITSV, "> 0"})
+    public static void testLongCompress() {
+        for (int i = 0; i < LENGTH; i += L_SPECIES.length()) {
+            LongVector av = LongVector.fromArray(L_SPECIES, la, i);
+            LongVector bv =
LongVector.fromArray(L_SPECIES, lb, i); + av.lanewise(VectorOperators.COMPRESS_BITS, bv).intoArray(lr, i); + } + } + + @Run(test = "testLongCompress") + public static void testLongCompress_runner() { + testLongCompress(); + for (int i = 0; i < LENGTH; i++) { + Asserts.assertEquals(Long.compress(la[i], lb[i]), lr[i]); + } + } + + // Test for vectorized Long.expand operation in SVE2 + @Test + @IR(counts = {IRNode.EXPAND_BITSV, "> 0"}) + public static void testLongExpand() { + for (int i = 0; i < LENGTH; i += L_SPECIES.length()) { + LongVector av = LongVector.fromArray(L_SPECIES, la, i); + LongVector bv = LongVector.fromArray(L_SPECIES, lb, i); + av.lanewise(VectorOperators.EXPAND_BITS, bv).intoArray(lr, i); + } + } + + @Run(test = "testLongExpand") + public static void testLongExpand_runner() { + testLongExpand(); + for (int i = 0; i < LENGTH; i++) { + Asserts.assertEquals(Long.expand(la[i], lb[i]), lr[i]); + } + } + + public static void main(String[] args) { + TestFramework.runWithFlags("--add-modules=jdk.incubator.vector", + "-XX:UseSVE=2"); + } +}
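
For reference, the per-lane semantics that the new CompressBitsV/ExpandBitsV
nodes vectorize are exactly those of Integer.compress/expand and
Long.compress/expand: SVE2 BEXT gathers the bits of src1 selected by src2 into
the low end of each lane, and BDEP scatters the low-order bits of src1 to the
bit positions set in src2. The scalar sketch below is illustrative only and is
not part of the patch; the class name CompressExpandBitsDemo and the sample
values are made up for the example.

public class CompressExpandBitsDemo {
    public static void main(String[] args) {
        int x    = 0b1011_1010;
        int mask = 0b0110_1100;

        // compress: gather the bits of x at mask's set positions into the low end
        int c = Integer.compress(x, mask);
        // expand: scatter the low-order bits of c back to mask's set positions
        int e = Integer.expand(c, mask);

        // Round-trip identity: expand(compress(x, m), m) == (x & m)
        System.out.println(e == (x & mask)); // prints "true"
    }
}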