8289551: Conversions between bit representations of half precision values and floats

Reviewed-by: psandoz, jrose
2025-08-27 14:54:52 +02:00 · 2022-07-26 16:54:32 +00:00 · 2022-07-26 16:54:32 +00:00 · 7318b22209
commit 7318b22209
parent 2ae8e31183
3 changed files with 703 additions and 0 deletions
--- a/src/java.base/share/classes/java/lang/Float.java
+++ b/src/java.base/share/classes/java/lang/Float.java
@ -30,6 +30,7 @@ import java.lang.constant.Constable;
 import java.lang.constant.ConstantDesc;
 import java.util.Optional;

+import jdk.internal.math.FloatConsts;
 import jdk.internal.math.FloatingDecimal;
 import jdk.internal.math.FloatToDecimal;
 import jdk.internal.vm.annotation.IntrinsicCandidate;
@ -975,6 +976,198 @@ public final class Float extends Number
    @IntrinsicCandidate
    public static native float intBitsToFloat(int bits);

+    /**
+     * {@return the {@code float} value closest to the numerical value
+     * of the argument, a floating-point binary16 value encoded in a
+     * {@code short}} The conversion is exact; all binary16 values can
+     * be exactly represented in {@code float}.
+     *
+     * Special cases:
+     * <ul>
+     * <li> If the argument is zero, the result is a zero with the
+     * same sign as the argument.
+     * <li> If the argument is infinite, the result is an infinity
+     * with the same sign as the argument.
+     * <li> If the argument is a NaN, the result is a NaN.
+     * </ul>
+     *
+     * <h4><a id=binary16Format>IEEE 754 binary16 format</a></h4>
+     * The IEEE 754 standard defines binary16 as a 16-bit format, along
+     * with the 32-bit binary32 format (corresponding to the {@code
+     * float} type) and the 64-bit binary64 format (corresponding to
+     * the {@code double} type). The binary16 format is similar to the
+     * other IEEE 754 formats, except smaller, having all the usual
+     * IEEE 754 values such as NaN, signed infinities, signed zeros,
+     * and subnormals. The parameters (JLS {@jls 4.2.3}) for the
+     * binary16 format are N = 11 precision bits, K = 5 exponent bits,
+     * <i>E</i><sub><i>max</i></sub> = 15, and
+     * <i>E</i><sub><i>min</i></sub> = -14.
+     *
+     * @apiNote
+     * This method corresponds to the convertFormat operation defined
+     * in IEEE 754 from the binary16 format to the binary32 format.
+     * The operation of this method is analogous to a primitive
+     * widening conversion (JLS {@jls 5.1.2}).
+     *
+     * @param floatBinary16 the binary16 value to convert to {@code float}
+     * @since 20
+     */
+    // @IntrinsicCandidate
+    public static float float16ToFloat(short floatBinary16) {
+        // binary16 layout, most to least significant: 1 sign bit,
+        // 5 exponent bits (bias 15), 10 explicit significand bits.
+        final int raw = floatBinary16;
+        final boolean negative = (raw & 0x8000) != 0;
+        final int unbiasedExp = ((raw & 0x7c00) >> 10) - 15;
+        final int fraction = raw & 0x03FF;
+
+        if (unbiasedExp == -15) {
+            // Zero or subnormal: there is no implicit leading bit, so
+            // the magnitude is the significand field, read as an
+            // integer, times 2^-24. Negating (rather than rebuilding
+            // bits) keeps the sign of a zero result.
+            final float magnitude = 0x1p-24f * fraction;
+            return negative ? -magnitude : magnitude;
+        }
+
+        // The sign moves from bit 15 of the binary16 encoding to bit
+        // 31 of the float encoding.
+        final int floatSignBits = negative ? 0x8000_0000 : 0;
+
+        // Shift the binary16 significand field left by the difference
+        // in significand width between float and binary16 (11 bits,
+        // counting the implicit bit) so its leading bit lines up with
+        // the leading bit of the float significand field.
+        final int floatSignifBits =
+            fraction << (FloatConsts.SIGNIFICAND_WIDTH - 11);
+
+        if (unbiasedExp == 16) {
+            // All-ones exponent field: infinity or NaN.
+            if (fraction == 0) {
+                return negative ? Float.NEGATIVE_INFINITY
+                                : Float.POSITIVE_INFINITY;
+            }
+            // NaN: carry the sign and the significand bits over
+            // under an all-ones float exponent field.
+            return Float.intBitsToFloat(floatSignBits |
+                                        0x7f80_0000 |
+                                        floatSignifBits);
+        }
+
+        assert -15 < unbiasedExp && unbiasedExp < 16;
+
+        // Normal value: re-bias the exponent for float and move it
+        // just above the significand field (the significand width
+        // counts the implicit bit, hence the shift by one less).
+        final int floatExpBits = (unbiasedExp + FloatConsts.EXP_BIAS)
+            << (FloatConsts.SIGNIFICAND_WIDTH - 1);
+
+        return Float.intBitsToFloat(floatSignBits |
+                                    floatExpBits |
+                                    floatSignifBits);
+    }
+
+    /**
+     * {@return the floating-point binary16 value, encoded in a {@code
+     * short}, closest in value to the argument}
+     * The conversion is computed under the {@linkplain
+     * java.math.RoundingMode#HALF_EVEN round to nearest even rounding
+     * mode}.
+     *
+     * Special cases:
+     * <ul>
+     * <li> If the argument is zero, the result is a zero with the
+     * same sign as the argument.
+     * <li> If the argument is infinite, the result is an infinity
+     * with the same sign as the argument.
+     * <li> If the argument is a NaN, the result is a NaN.
+     * </ul>
+     *
+     * The <a href="#binary16Format">binary16 format</a> is discussed in
+     * more detail in the {@link #float16ToFloat} method.
+     *
+     * @apiNote
+     * This method corresponds to the convertFormat operation defined
+     * in IEEE 754 from the binary32 format to the binary16 format.
+     * The operation of this method is analogous to a primitive
+     * narrowing conversion (JLS {@jls 5.1.3}).
+     *
+     * @param f the {@code float} value to convert to binary16
+     * @since 20
+     */
+    // @IntrinsicCandidate
+    public static short floatToFloat16(float f) {
+        final int rawBits = Float.floatToRawIntBits(f);
+        // Float sign (bit 31) relocated to the binary16 sign position
+        // (bit 15); either 0x8000 or 0.
+        final int signField = (rawBits >>> 16) & 0x8000;
+
+        if (Float.isNaN(f)) {
+            // Keep the sign and fold all 23 float significand bits
+            // into the 10-bit binary16 significand: the top 10 bits
+            // map directly (so the high-order bit is preserved), and
+            // the remaining lower bits are OR-ed into the low-order
+            // positions so a nonzero payload never collapses to zero.
+            final int payload =
+                  ((rawBits & 0x007f_e000) >> 13)  // 10 bits
+                | ((rawBits & 0x0000_1ff0) >> 4)   //  9 bits
+                |  (rawBits & 0x0000_000f);        //  4 bits
+            return (short)(signField | 0x7c00 /* max exponent + 1 */ | payload);
+        }
+
+        final float magnitude = Math.abs(f);
+
+        // Overflow threshold: binary16 MAX_VALUE plus half an ulp.
+        // Anything at or above it rounds to infinity.
+        final float overflowThreshold = 0x1.ffcp15f + 0x0.002p15f;
+        if (magnitude >= overflowThreshold) {
+            return (short)(signField | 0x7c00); // signed infinity
+        }
+
+        // The smallest nonzero binary16 magnitude is 0x1.0p-24; half
+        // of that or less rounds to zero. This also catches float
+        // zeros and float subnormals.
+        final float underflowThreshold = 0x1.0p-24f * 0.5f;
+        if (magnitude <= underflowThreshold) {
+            return (short)signField; // signed zero
+        }
+
+        // What remains is finite and within (or just below) the
+        // binary16 exponent range; rounding may still carry upward.
+        final int floatExp = Math.getExponent(f);
+        assert -25 <= floatExp && floatExp <= 15;
+
+        // Results below the binary16 normal range (exponent < -14)
+        // are encoded as subnormals: the exponent is pinned to -15,
+        // the float's hidden leading 1 becomes an explicit bit, and
+        // the significand is shifted right by the extra distance
+        // (E_min - exponent) on top of the usual 13.
+        final boolean subnormalResult = floatExp < -14;
+        final int extraShift = subnormalResult ? (-14 - floatExp) : 0;
+        final int resultExp = subnormalResult ? -15 : floatExp;
+        final int hiddenBit = subnormalResult ? 0x0080_0000 : 0;
+
+        final int wideSignif = (rawBits & 0x007f_ffff) | hiddenBit;
+        final int discarded = 13 + extraShift; // low-order bits dropped
+
+        // Start from the truncated (round toward zero) significand.
+        int signif = wideSignif >> discarded;
+
+        // Round to nearest even. With LSB the lowest retained bit,
+        // Round the first discarded bit and Sticky the OR of every
+        // bit below Round, the magnitude is incremented exactly when
+        // Round is set and at least one of LSB or Sticky is set:
+        //
+        // LSB  Round Sticky
+        // 0    1     1
+        // 1    1     0
+        // 1    1     1
+        // See "Computer Arithmetic Algorithms," Koren, Table 4.9
+        final int roundMask = 1 << (discarded - 1);
+        final boolean lsbSet    = (wideSignif & (roundMask << 1)) != 0;
+        final boolean roundSet  = (wideSignif & roundMask) != 0;
+        final boolean stickySet = (wideSignif & (roundMask - 1)) != 0;
+
+        if (roundSet && (lsbSet || stickySet)) {
+            signif++;
+        }
+
+        // After rounding, the significand may occupy at most one bit
+        // beyond its 10-bit field, i.e. the lowest exponent bit.
+        // Adding (rather than OR-ing) it to the exponent field lets
+        // that bit act as the carry out of the significand.
+        assert (0xf800 & signif) == 0x0;
+
+        return (short)(signField | (((resultExp + 15) << 10) + signif));
+    }
+
    /**
     * Compares two {@code Float} objects numerically.
     *