8282711: Accelerate Math.signum function for AVX and AVX512 target.

Reviewed-by: sviswanathan, thartmann
2025-09-17 17:44:40 +02:00 · 2022-04-29 06:34:09 +00:00 · 2022-04-29 06:34:09 +00:00 · e4066628ad
commit e4066628ad
parent 0a4a6403bb
13 changed files with 337 additions and 2 deletions
--- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
+++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
@ -4420,6 +4420,48 @@ void C2_MacroAssembler::vector_mask_operation(int opc, Register dst, XMMRegister
 }
 #endif
 void C2_MacroAssembler::vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                           KRegister ktmp1, int vec_enc) {
  // Opmask-based (EVEX) code sequence for a packed signum.
  //   opcode   Op_SignumVD for double lanes, otherwise it must be Op_SignumVF.
  //   dst      result vector; it is written before src is last read.
  //   src      input vector.
  //   zero/one constant vectors, presumably 0.0 and 1.0 in every lane
  //            (inferred from the names and from the -1/1 selection below).
  //   ktmp1    opmask temporary, overwritten by both compares.
  //   vec_enc  vector length encoding forwarded to every instruction.
  if (opcode != Op_SignumVD) {
    assert(opcode == Op_SignumVF, "");
    // Single precision lanes: dst starts out as (zero - one).
    vsubps(dst, zero, one, vec_enc);
    // Lanes with src < 0 keep -1, all remaining lanes take 1.
    evcmpps(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
    evblendmps(dst, ktmp1, one, dst, true, vec_enc);
    // Lanes where src is NaN, -0.0 or 0.0 hand src back unchanged.
    evcmpps(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
    evblendmps(dst, ktmp1, dst, src, true, vec_enc);
    return;
  }

  // Double precision lanes: dst starts out as (zero - one).
  vsubpd(dst, zero, one, vec_enc);
  // Lanes with src < 0 keep -1, all remaining lanes take 1.
  evcmppd(ktmp1, k0, src, zero, Assembler::LT_OQ, vec_enc);
  evblendmpd(dst, ktmp1, one, dst, true, vec_enc);
  // Lanes where src is NaN, -0.0 or 0.0 hand src back unchanged.
  evcmppd(ktmp1, k0, src, zero, Assembler::EQ_UQ, vec_enc);
  evblendmpd(dst, ktmp1, dst, src, true, vec_enc);
 }
 void C2_MacroAssembler::vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                                          XMMRegister xtmp1, int vec_enc) {
  // Vector-register-mask (AVX) code sequence for a packed signum.
  //   opcode   Op_SignumVD for double lanes, otherwise it must be Op_SignumVF.
  //   dst      result vector; it is written before src is last read.
  //   src      input vector, also passed as the selector of the first blend.
  //   zero/one constant vectors, presumably 0.0 and 1.0 in every lane
  //            (inferred from the names and from the -1/1 selection below).
  //   xtmp1    vector temporary that receives the compare result.
  //   vec_enc  vector length encoding forwarded to every instruction.
  if (opcode != Op_SignumVD) {
    assert(opcode == Op_SignumVF, "");
    // Single precision lanes: dst starts out as (zero - one).
    vsubps(dst, zero, one, vec_enc);
    // src < 0 ? -1 : 1, selected directly by src.
    vblendvps(dst, one, dst, src, vec_enc);
    // Lanes where src is NaN, -0.0 or 0.0 hand src back unchanged.
    vcmpps(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
    vblendvps(dst, dst, src, xtmp1, vec_enc);
    return;
  }

  // Double precision lanes: dst starts out as (zero - one).
  vsubpd(dst, zero, one, vec_enc);
  // src < 0 ? -1 : 1, selected directly by src.
  vblendvpd(dst, one, dst, src, vec_enc);
  // Lanes where src is NaN, -0.0 or 0.0 hand src back unchanged.
  vcmppd(xtmp1, src, zero, Assembler::EQ_UQ, vec_enc);
  vblendvpd(dst, dst, src, xtmp1, vec_enc);
 }
 void C2_MacroAssembler::vector_maskall_operation(KRegister dst, Register src, int mask_len) {
  if (VM_Version::supports_avx512bw()) {
    if (mask_len > 32) {
--- a/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
+++ b/src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
@ -340,6 +340,12 @@ public:
  void evpternlog(XMMRegister dst, int func, KRegister mask, XMMRegister src2, Address src3,
                  bool merge, BasicType bt, int vlen_enc);
  // Emits the AVX code sequence for a vector signum. opcode selects between
  // Op_SignumVD and Op_SignumVF; xtmp1 is a vector temporary that receives the
  // result of the compare against zero.
  void vector_signum_avx(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                         XMMRegister xtmp1, int vec_enc);
  // Emits the EVEX code sequence for a vector signum. opcode selects between
  // Op_SignumVD and Op_SignumVF; ktmp1 is an opmask temporary that receives the
  // results of the compares against zero.
  void vector_signum_evex(int opcode, XMMRegister dst, XMMRegister src, XMMRegister zero, XMMRegister one,
                          KRegister ktmp1, int vec_enc);
  void udivI(Register rax, Register divisor, Register rdx);
  void umodI(Register rax, Register divisor, Register rdx);
  void udivmodI(Register rax, Register divisor, Register rdx, Register tmp);
@ -349,6 +355,7 @@ public:
  void umodL(Register rax, Register divisor, Register rdx);
  void udivmodL(Register rax, Register divisor, Register rdx, Register tmp);
  #endif
  void vector_popcount_int(XMMRegister dst, XMMRegister src, XMMRegister xtmp1,
                           XMMRegister xtmp2, XMMRegister xtmp3, Register rtmp,
                           int vec_enc);
--- a/src/hotspot/cpu/x86/x86.ad
+++ b/src/hotspot/cpu/x86/x86.ad
@ -1886,6 +1886,12 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
        return false;
      }
      break;
    case Op_SignumVD:
    case Op_SignumVF:
      if (UseAVX < 1) {
        return false;
      }
      break;
    case Op_PopCountVI:
      if (!VM_Version::supports_avx512_vpopcntdq() &&
          (vlen == 16) && !VM_Version::supports_avx512bw()) {
@ -6089,6 +6095,36 @@ instruct signumD_reg(regD dst, regD zero, regD one, rRegP scratch, rFlagsReg cr)
  ins_pipe( pipe_slow );
 %}
 // Vector signum for targets without AVX512VL: the predicate restricts this rule
 // to vectors of at most 32 bytes. It matches both SignumVF and SignumVD and
 // forwards the ideal opcode to vector_signum_avx, which picks the float or
 // double instruction forms. dst and the vector temporary xtmp1 are TEMP.
 instruct signumV_reg_avx(vec dst, vec src, vec zero, vec one, vec xtmp1) %{
  predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
  match(Set dst (SignumVF src (Binary zero one)));
  match(Set dst (SignumVD src (Binary zero one)));
  effect(TEMP dst, TEMP xtmp1);
  format %{ "vector_signum_avx $dst, $src\t! using $xtmp1 as TEMP" %}
  ins_encode %{
    // One rule serves two match patterns, so the element type is recovered
    // from the ideal opcode of the matched node.
    int opcode = this->ideal_Opcode();
    int vec_enc = vector_length_encoding(this);
    __ vector_signum_avx(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
                         $xtmp1$$XMMRegister, vec_enc);
  %}
  ins_pipe( pipe_slow );
 %}
 // Vector signum using opmask registers: the predicate selects this rule when
 // AVX512VL is available or the vector is 64 bytes long. It matches both
 // SignumVF and SignumVD and forwards the ideal opcode to vector_signum_evex,
 // which picks the float or double instruction forms. dst and the opmask
 // temporary ktmp1 are TEMP.
 instruct signumV_reg_evex(vec dst, vec src, vec zero, vec one, kReg ktmp1) %{
  predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
  match(Set dst (SignumVF src (Binary zero one)));
  match(Set dst (SignumVD src (Binary zero one)));
  effect(TEMP dst, TEMP ktmp1);
  format %{ "vector_signum_evex $dst, $src\t! using $ktmp1 as TEMP" %}
  ins_encode %{
    // One rule serves two match patterns, so the element type is recovered
    // from the ideal opcode of the matched node.
    int opcode = this->ideal_Opcode();
    int vec_enc = vector_length_encoding(this);
    __ vector_signum_evex(opcode, $dst$$XMMRegister, $src$$XMMRegister, $zero$$XMMRegister, $one$$XMMRegister,
                          $ktmp1$$KRegister, vec_enc);
  %}
  ins_pipe( pipe_slow );
 %}
 // ---------------------------------------
 // For copySign use 0xE4 as writemask for vpternlog
 // Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
--- a/src/hotspot/share/adlc/formssel.cpp
+++ b/src/hotspot/share/adlc/formssel.cpp
@ -4237,7 +4237,7 @@ bool MatchRule::is_vector() const {
    "VectorCastL2X", "VectorCastF2X", "VectorCastD2X",
    "VectorUCastB2X", "VectorUCastS2X", "VectorUCastI2X",
    "VectorMaskWrapper","VectorMaskCmp","VectorReinterpret","LoadVectorMasked","StoreVectorMasked",
-    "FmaVD","FmaVF","PopCountVI", "PopCountVL", "VectorLongToMask",
+    "FmaVD","FmaVF","PopCountVI", "PopCountVL", "SignumVF", "SignumVD", "VectorLongToMask",
    // Next are vector mask ops.
    "MaskAll", "AndVMask", "OrVMask", "XorVMask", "VectorMaskCast",
    "RoundVF", "RoundVD",
--- a/src/hotspot/share/opto/classes.hpp
+++ b/src/hotspot/share/opto/classes.hpp
@ -321,6 +321,8 @@ macro(CopySignD)
 macro(CopySignF)
 macro(SignumD)
 macro(SignumF)
 macro(SignumVF)
 macro(SignumVD)
 macro(SqrtD)
 macro(SqrtF)
 macro(RoundF)
--- a/src/hotspot/share/opto/matcher.cpp
+++ b/src/hotspot/share/opto/matcher.cpp
@ -2456,6 +2456,8 @@ void Matcher::find_shared_post_visit(Node* n, uint opcode) {
      break;
    }
    case Op_CopySignD:
    case Op_SignumVF:
    case Op_SignumVD:
    case Op_SignumF:
    case Op_SignumD: {
      Node* pair = new BinaryNode(n->in(2), n->in(3));
--- a/src/hotspot/share/opto/superword.cpp
+++ b/src/hotspot/share/opto/superword.cpp
@ -2525,6 +2525,13 @@ bool SuperWord::output() {
        Node* in2 = vector_opd(p, 2);
        vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
        vlen_in_bytes = vn->as_Vector()->length_in_bytes();
      } else if (opc == Op_SignumF || opc == Op_SignumD) {
        assert(n->req() == 4, "four inputs expected");
        Node* in = vector_opd(p, 1);
        Node* zero = vector_opd(p, 2);
        Node* one = vector_opd(p, 3);
        vn = VectorNode::make(opc, in, zero, one, vlen, velt_basic_type(n));
        vlen_in_bytes = vn->as_Vector()->length_in_bytes();
      } else if (n->req() == 3 && !is_cmov_pack(p)) {
        // Promote operands to vector
        Node* in1 = NULL;
--- a/src/hotspot/share/opto/vectornode.cpp
+++ b/src/hotspot/share/opto/vectornode.cpp
@ -245,6 +245,10 @@ int VectorNode::opcode(int sopc, BasicType bt) {
    return Op_VectorCastF2X;
  case Op_ConvD2L:
    return Op_VectorCastD2X;
  case Op_SignumF:
    return Op_SignumVF;
  case Op_SignumD:
    return Op_SignumVD;
  default:
    return 0; // Unimplemented
@ -646,6 +650,8 @@ VectorNode* VectorNode::make(int vopc, Node* n1, Node* n2, Node* n3, const TypeV
  switch (vopc) {
  case Op_FmaVD: return new FmaVDNode(n1, n2, n3, vt);
  case Op_FmaVF: return new FmaVFNode(n1, n2, n3, vt);
  case Op_SignumVD: return new SignumVDNode(n1, n2, n3, vt);
  case Op_SignumVF: return new SignumVFNode(n1, n2, n3, vt);
  default:
    fatal("Missed vector creation for '%s'", NodeClassNames[vopc]);
    return NULL;
--- a/src/hotspot/share/opto/vectornode.hpp
+++ b/src/hotspot/share/opto/vectornode.hpp
@ -1674,4 +1674,20 @@ public:
  virtual int Opcode() const;
  Node* Ideal(PhaseGVN* phase, bool can_reshape);
 };
 // Vector node for the float signum operation. Besides the value operand it
 // carries the zero and one operands as its second and third inputs.
 class SignumVFNode : public VectorNode {
  public:
  // All four arguments are passed straight through to the VectorNode constructor.
  SignumVFNode(Node* src, Node* zero, Node* one, const TypeVect* vect_type)
    : VectorNode(src, zero, one, vect_type)
  {
  }

  virtual int Opcode() const;
 };
 // Vector node for the double signum operation. Besides the value operand it
 // carries the zero and one operands as its second and third inputs.
 class SignumVDNode : public VectorNode {
  public:
  // All four arguments are passed straight through to the VectorNode constructor.
  SignumVDNode(Node* src, Node* zero, Node* one, const TypeVect* vect_type)
    : VectorNode(src, zero, one, vect_type)
  {
  }

  virtual int Opcode() const;
 };
 #endif // SHARE_OPTO_VECTORNODE_HPP
--- a/test/hotspot/jtreg/compiler/c2/cr6340864/TestDoubleVect.java
+++ b/test/hotspot/jtreg/compiler/c2/cr6340864/TestDoubleVect.java
@ -89,6 +89,7 @@ public class TestDoubleVect {
      test_divv(a0, a1, -VALUE);
      test_diva(a0, a1, a3);
      test_negc(a0, a1);
      test_signum(a0, a1);
      test_rint(a0, a1);
      test_ceil(a0, a1);
      test_floor(a0, a1);
@ -426,6 +427,19 @@ public class TestDoubleVect {
        errn += verify("test_sqrt: ", i, a0[i], Math.sqrt((double)(ADD_INIT+i)));
      }
      test_signum(a0, a1);
      errn += verify("test_signum: ", 0, a0[0], (Double.NaN));
      errn += verify("test_signum: ", 1, a0[1],  1.0);
      errn += verify("test_signum: ", 2, a0[2], -1.0);
      errn += verify("test_signum: ", 3, a0[3],  1.0);
      errn += verify("test_signum: ", 4, a0[4],  1.0);
      errn += verify("test_signum: ", 5, a0[5],  1.0);
      errn += verify("test_signum: ", 6, a0[6],  0.0);
      errn += verify("test_signum: ", 7, a0[7], -0.0);
      for (int i=8; i<ARRLEN; i++) {
        errn += verify("test_signum: ", i, a0[i], (double)(((double)(ADD_INIT+i)) > 0.0 ? 1.0 : -1.0));
      }
      a1[6] = +0x1.fffffffffffffp-2;
      a1[7] = +0x1.0p-1;
      a1[8] = +0x1.0000000000001p-1;
@ -590,6 +604,13 @@ public class TestDoubleVect {
    end = System.currentTimeMillis();
    System.out.println("test_negc_n: " + (end - start));
    start = System.currentTimeMillis();
    for (int i=0; i<ITERS; i++) {
      test_signum(a0, a1);
    }
    end = System.currentTimeMillis();
    System.out.println("test_signum_n: " + (end - start));
    start = System.currentTimeMillis();
    for (int i=0; i<ITERS; i++) {
      test_sqrt(a0, a1);
@ -693,6 +714,12 @@ public class TestDoubleVect {
    }
  }
  // Kernel under test: stores Math.signum(a1[i]) into a0[i] for every index of a0.
  // The loop is bounded by a0.length only, so a1 must be at least as long as a0.
  static void test_signum(double[] a0, double[] a1) {
    for (int i = 0; i < a0.length; i+=1) {
      a0[i] = Math.signum(a1[i]);
    }
  }
  static void test_rint(double[] a0, double[] a1) {
    for (int i = 0; i < a0.length; i+=1) {
      a0[i] = Math.rint(a1[i] + ((double)(i))/1000);
--- a/test/hotspot/jtreg/compiler/c2/cr6340864/TestFloatVect.java
+++ b/test/hotspot/jtreg/compiler/c2/cr6340864/TestFloatVect.java
@ -88,6 +88,7 @@ public class TestFloatVect {
      test_divv(a0, a1, -VALUE);
      test_diva(a0, a1, a3);
      test_negc(a0, a1);
      test_signum(a0, a1);
      test_sqrt(a0, a1);
      test_round(i0, a1);
    }
@ -345,6 +346,7 @@ public class TestFloatVect {
        errn += verify("test_diva_n: ", i, a0[i], ((ADD_INIT+i)/(-VALUE)));
      }
      test_negc(a0, a1);
      errn += verify("test_negc: ", 0, a0[0], (Float.NaN));
      errn += verify("test_negc: ", 1, a0[1], (Float.NEGATIVE_INFINITY));
@ -372,6 +374,19 @@ public class TestFloatVect {
        errn += verify("test_sqrt: ", i, a0[i], (float)(Math.sqrt((double)(ADD_INIT+i))));
      }
      test_signum(a0, a1);
      errn += verify("test_signum: ", 0, a0[0], (Float.NaN));
      errn += verify("test_signum: ", 1, a0[1],  1.0f);
      errn += verify("test_signum: ", 2, a0[2], -1.0f);
      errn += verify("test_signum: ", 3, a0[3],  1.0f);
      errn += verify("test_signum: ", 4, a0[4],  1.0f);
      errn += verify("test_signum: ", 5, a0[5],  1.0f);
      errn += verify("test_signum: ", 6, a0[6],  0.0f);
      errn += verify("test_signum: ", 7, a0[7], -0.0f);
      for (int i=8; i<ARRLEN; i++) {
        errn += verify("test_signum: ", i, a0[i], (((float)(ADD_INIT+i)) > 0.0f ? 1.0f : -1.0f));
      }
      a1[6] = +0x1.fffffep-2f;
      a1[7] = +0x1.0p-1f;
      a1[8] = +0x1.000002p-1f;
@ -400,7 +415,6 @@ public class TestFloatVect {
      for (int i=14; i<ARRLEN; i++) {
        errn += verify("test_round: ", i, i0[i], Math.round(((float)(ADD_INIT+i))));
      }
    }
    if (errn > 0)
@ -537,6 +551,13 @@ public class TestFloatVect {
    end = System.currentTimeMillis();
    System.out.println("test_negc_n: " + (end - start));
    start = System.currentTimeMillis();
    for (int i=0; i<ITERS; i++) {
      test_signum(a0, a1);
    }
    end = System.currentTimeMillis();
    System.out.println("test_signum_n: " + (end - start));
    start = System.currentTimeMillis();
    for (int i=0; i<ITERS; i++) {
      test_sqrt(a0, a1);
@ -635,6 +656,12 @@ public class TestFloatVect {
    }
  }
  // Kernel under test: stores Math.signum(a1[i]) into a0[i] for every index of a0.
  // The loop is bounded by a0.length only, so a1 must be at least as long as a0.
  static void test_signum(float[] a0, float[] a1) {
    for (int i = 0; i < a0.length; i+=1) {
      a0[i] = Math.signum(a1[i]);
    }
  }
  static void test_negc(float[] a0, float[] a1) {
    for (int i = 0; i < a0.length; i+=1) {
      a0[i] = (float)(-((float)a1[i]));
--- a/test/hotspot/jtreg/compiler/vectorization/TestSignumVector.java
+++ b/test/hotspot/jtreg/compiler/vectorization/TestSignumVector.java
@ -0,0 +1,93 @@
 /*
 * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
 /**
 * @test
 * @bug 8282711
 * @summary Accelerate Math.signum function for AVX and AVX512.
 * @requires vm.compiler2.enabled
 * @requires vm.cpu.features ~= ".*avx.*"
 * @requires os.simpleArch == "x64"
 * @library /test/lib /
 * @run driver compiler.vectorization.TestSignumVector
 */
 package compiler.vectorization;
 import compiler.lib.ir_framework.*;
 /**
  * Checks that loops over Math.signum are compiled to SignumVD / SignumVF nodes
  * and that the values produced by those loops are correct.
  *
  * Each input array starts with a set of special values (NaN, both zeros,
  * infinities, largest and smallest magnitudes) and continues with values of
  * alternating sign, so the -1, +1 and "return the argument" outcomes are all
  * exercised. After the warm-up iterations the output of the last call is
  * compared lane by lane against a plain scalar expectation.
  */
 public class TestSignumVector {
  private static final int ARRLEN = 1024;
  private static final int ITERS  = 11000;

  // Special inputs placed at the front of the double input array.
  private static final double[] DSPECIAL = {
      Double.NaN, 0.0, -0.0,
      Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY,
      Double.MAX_VALUE, -Double.MAX_VALUE,
      Double.MIN_VALUE, -Double.MIN_VALUE
  };

  // Special inputs placed at the front of the float input array.
  private static final float[] FSPECIAL = {
      Float.NaN, 0.0f, -0.0f,
      Float.POSITIVE_INFINITY, Float.NEGATIVE_INFINITY,
      Float.MAX_VALUE, -Float.MAX_VALUE,
      Float.MIN_VALUE, -Float.MIN_VALUE
  };

  private static double [] dinp;
  private static double [] dout;
  private static float  [] finp;
  private static float  [] fout;

  public static void main(String args[]) {
      TestFramework.runWithFlags("-XX:-TieredCompilation",
                                  "-XX:CompileThresholdScaling=0.3");
      System.out.println("PASSED");
  }

  // Scalar reference written without Math.signum: 1 for positive, -1 for
  // negative, and the argument itself for NaN, 0.0 and -0.0.
  private static double expectedSignumD(double x) {
      return (x > 0.0) ? 1.0 : ((x < 0.0) ? -1.0 : x);
  }

  // Float counterpart of expectedSignumD.
  private static float expectedSignumF(float x) {
      return (x > 0.0f) ? 1.0f : ((x < 0.0f) ? -1.0f : x);
  }

  @Test
  @IR(counts = {"SignumVD" , " > 0 "})
  public void test_signum_double(double[] dout, double[] dinp) {
      for (int i = 0; i < dout.length; i+=1) {
          dout[i] = Math.signum(dinp[i]);
      }
  }

  @Run(test = {"test_signum_double"}, mode = RunMode.STANDALONE)
  public void kernel_test_signum_double() {
      dinp = new double[ARRLEN];
      dout = new double[ARRLEN];
      for(int i = 0 ; i < ARRLEN; i++) {
          if (i < DSPECIAL.length) {
              dinp[i] = DSPECIAL[i];
          } else {
              // Alternate the sign so both the -1 and the +1 results occur.
              double magnitude = (double)i*1.4;
              dinp[i] = ((i & 1) == 0) ? magnitude : -magnitude;
          }
      }
      for (int i = 0; i < ITERS; i++) {
          test_signum_double(dout , dinp);
      }
      for (int i = 0; i < ARRLEN; i++) {
          double expected = expectedSignumD(dinp[i]);
          // Double.compare treats NaN as equal to NaN and tells 0.0 from -0.0.
          if (Double.compare(dout[i], expected) != 0) {
              throw new RuntimeException("test_signum_double: [" + i + "] signum(" + dinp[i] +
                                         ") = " + dout[i] + ", expected " + expected);
          }
      }
  }

  @Test
  @IR(counts = {"SignumVF" , " > 0 "})
  public void test_signum_float(float[] fout, float[] finp) {
      for (int i = 0; i < finp.length; i+=1) {
          fout[i] = Math.signum(finp[i]);
      }
  }

  @Run(test = {"test_signum_float"}, mode = RunMode.STANDALONE)
  public void kernel_test_signum_float() {
      finp = new float[ARRLEN];
      fout = new float[ARRLEN];
      for(int i = 0 ; i < ARRLEN; i++) {
          if (i < FSPECIAL.length) {
              finp[i] = FSPECIAL[i];
          } else {
              // Alternate the sign so both the -1 and the +1 results occur.
              float magnitude = (float)i*1.4f;
              finp[i] = ((i & 1) == 0) ? magnitude : -magnitude;
          }
      }
      for (int i = 0; i < ITERS; i++) {
          test_signum_float(fout , finp);
      }
      for (int i = 0; i < ARRLEN; i++) {
          float expected = expectedSignumF(finp[i]);
          // Float.compare treats NaN as equal to NaN and tells 0.0f from -0.0f.
          if (Float.compare(fout[i], expected) != 0) {
              throw new RuntimeException("test_signum_float: [" + i + "] signum(" + finp[i] +
                                         ") = " + fout[i] + ", expected " + expected);
          }
      }
  }
 }
--- a/test/micro/org/openjdk/bench/java/math/VectorSignum.java
+++ b/test/micro/org/openjdk/bench/java/math/VectorSignum.java
@ -0,0 +1,70 @@
 /*
 * Copyright (c) 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 */
 package org.openjdk.bench.vm.compiler;
 import org.openjdk.jmh.annotations.*;
 import org.openjdk.jmh.infra.*;
 import java.util.concurrent.TimeUnit;
 import java.util.Random;
/**
 * JMH benchmark that applies Math.signum element-wise to a float array and to a
 * double array, reporting average time in nanoseconds.
 */
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@State(Scope.Thread)
 public class VectorSignum {
    // Array length; the candidate values come from the @Param list.
    @Param({"256", "512", "1024", "2048"})
    private static int SIZE;
    // NOTE(review): the four field initializers below size their arrays from
    // whatever SIZE holds at construction time; init() replaces all four arrays
    // anyway, so these allocations look redundant - confirm and consider removing.
    private double[] res_doubles = new double[SIZE];
    private double[] doubles = new double[SIZE];
    private float[] res_floats = new float[SIZE];
    private float[] floats = new float[SIZE];
    // Fixed seed, so every run sees the same input sequence.
    private Random r = new Random(1024);
    // Allocates the input and result arrays and fills the inputs with random values.
    // Random.nextFloat()/nextDouble() return values in [0.0, 1.0), so no input
    // is negative or NaN.
    @Setup
    public void init() {
        doubles = new double[SIZE];
        floats = new float[SIZE];
        res_doubles = new double[SIZE];
        res_floats = new float[SIZE];
        for (int i=0; i<SIZE; i++) {
            floats[i] = r.nextFloat();
            doubles[i] = r.nextDouble();
        }
    }
    // Measured kernel: float signum over the whole array.
    @Benchmark
    public void floatSignum() {
        for(int i = 0; i < SIZE; i++) {
            res_floats[i] = Math.signum(floats[i]);
        }
    }
    // Measured kernel: double signum over the whole array.
    @Benchmark
    public void doubleSignum() {
        for(int i = 0; i < SIZE; i++) {
            res_doubles[i] = Math.signum(doubles[i]);
        }
    }
 }