8296246: Update Unicode Data Files to Version 15.1.0

Reviewed-by: erikj, joehw, srl, rriggs
2025-08-28 15:24:43 +02:00 · 2023-09-20 17:39:57 +00:00 · 2023-09-20 17:39:57 +00:00 · 7c991cc567
commit 7c991cc567
parent a021dbcb9e
22 changed files with 1511 additions and 224 deletions
--- a/src/java.base/share/classes/java/lang/Character.java
+++ b/src/java.base/share/classes/java/lang/Character.java
@ -63,7 +63,7 @@ import static java.lang.constant.ConstantDescs.DEFAULT_NAME;
 * from the Unicode Consortium at
 * <a href="http://www.unicode.org">http://www.unicode.org</a>.
 * <p>
- * Character information is based on the Unicode Standard, version 15.0.
+ * Character information is based on the Unicode Standard, version 15.1.
 * <p>
 * The Java platform has supported different versions of the Unicode
 * Standard over time. Upgrades to newer versions of the Unicode Standard
@ -75,6 +75,8 @@ import static java.lang.constant.ConstantDescs.DEFAULT_NAME;
 *     <th scope="col">Unicode version</th></tr>
 * </thead>
 * <tbody>
+ * <tr><th scope="row" style="text-align:left">Java SE 22</th>
+ *     <td>Unicode 15.1</td></tr>
 * <tr><th scope="row" style="text-align:left">Java SE 20</th>
 *     <td>Unicode 15.0</td></tr>
 * <tr><th scope="row" style="text-align:left">Java SE 19</th>
@ -744,7 +746,7 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
         * It should be adjusted whenever the Unicode Character Database
         * is upgraded.
         */
-        private static final int NUM_ENTITIES = 756;
+        private static final int NUM_ENTITIES = 759;
        private static Map<String, UnicodeBlock> map = HashMap.newHashMap(NUM_ENTITIES);

        /**
@ -3611,6 +3613,16 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
                             "CJK UNIFIED IDEOGRAPHS EXTENSION H",
                             "CJKUNIFIEDIDEOGRAPHSEXTENSIONH");

+        /**
+         * Constant for the "CJK Unified Ideographs Extension I" Unicode
+         * character block.
+         * @since 22
+         */
+        public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I =
+            new UnicodeBlock("CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I",
+                             "CJK UNIFIED IDEOGRAPHS EXTENSION I",
+                             "CJKUNIFIEDIDEOGRAPHSEXTENSIONI");
+
        private static final int[] blockStarts = {
            0x0000,   // 0000..007F; Basic Latin
            0x0080,   // 0080..00FF; Latin-1 Supplement
@ -3978,7 +3990,8 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
            0x2B740,  // 2B740..2B81F; CJK Unified Ideographs Extension D
            0x2B820,  // 2B820..2CEAF; CJK Unified Ideographs Extension E
            0x2CEB0,  // 2CEB0..2EBEF; CJK Unified Ideographs Extension F
-            0x2EBF0,  //               unassigned
+            0x2EBF0,  // 2EBF0..2EE5F; CJK Unified Ideographs Extension I
+            0x2EE60,  //               unassigned
            0x2F800,  // 2F800..2FA1F; CJK Compatibility Ideographs Supplement
            0x2FA20,  //               unassigned
            0x30000,  // 30000..3134F; CJK Unified Ideographs Extension G
@ -4359,6 +4372,7 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
            CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D,
            CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E,
            CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F,
+            CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I,
            null,
            CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
            null,
@ -6057,9 +6071,7 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
            0x2EF4,   // 2EF4..2EFF; UNKNOWN
            0x2F00,   // 2F00..2FD5; HAN
            0x2FD6,   // 2FD6..2FEF; UNKNOWN
-            0x2FF0,   // 2FF0..2FFB; COMMON
-            0x2FFC,   // 2FFC..2FFF; UNKNOWN
-            0x3000,   // 3000..3004; COMMON
+            0x2FF0,   // 2FF0..3004; COMMON
            0x3005,   // 3005      ; HAN
            0x3006,   // 3006      ; COMMON
            0x3007,   // 3007      ; HAN
@ -6088,7 +6100,8 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
            0x3190,   // 3190..319F; COMMON
            0x31A0,   // 31A0..31BF; BOPOMOFO
            0x31C0,   // 31C0..31E3; COMMON
-            0x31E4,   // 31E4..31EF; UNKNOWN
+            0x31E4,   // 31E4..31EE; UNKNOWN
+            0x31EF,   // 31EF      ; COMMON
            0x31F0,   // 31F0..31FF; KATAKANA
            0x3200,   // 3200..321E; HANGUL
            0x321F,   // 321F      ; UNKNOWN
@ -7028,7 +7041,9 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
            0x2B820,  // 2B820..2CEA1; HAN
            0x2CEA2,  // 2CEA2..2CEAF; UNKNOWN
            0x2CEB0,  // 2CEB0..2EBE0; HAN
-            0x2EBE1,  // 2EBE1..2F7FF; UNKNOWN
+            0x2EBE1,  // 2EBE1..2EBEF; UNKNOWN
+            0x2EBF0,  // 2EBF0..2EE5D; HAN
+            0x2EE5E,  // 2EE5E..2F7FF; UNKNOWN
            0x2F800,  // 2F800..2FA1D; HAN
            0x2FA1E,  // 2FA1E..2FFFF; UNKNOWN
            0x30000,  // 30000..3134A; HAN
@ -7717,9 +7732,7 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
            UNKNOWN,                  // 2EF4..2EFF
            HAN,                      // 2F00..2FD5
            UNKNOWN,                  // 2FD6..2FEF
-            COMMON,                   // 2FF0..2FFB
-            UNKNOWN,                  // 2FFC..2FFF
-            COMMON,                   // 3000..3004
+            COMMON,                   // 2FF0..3004
            HAN,                      // 3005
            COMMON,                   // 3006
            HAN,                      // 3007
@ -7748,7 +7761,8 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
            COMMON,                   // 3190..319F
            BOPOMOFO,                 // 31A0..31BF
            COMMON,                   // 31C0..31E3
-            UNKNOWN,                  // 31E4..31EF
+            UNKNOWN,                  // 31E4..31EE
+            COMMON,                   // 31EF
            KATAKANA,                 // 31F0..31FF
            HANGUL,                   // 3200..321E
            UNKNOWN,                  // 321F
@ -8688,7 +8702,9 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
            HAN,                      // 2B820..2CEA1
            UNKNOWN,                  // 2CEA2..2CEAF
            HAN,                      // 2CEB0..2EBE0
-            UNKNOWN,                  // 2EBE1..2F7FF
+            UNKNOWN,                  // 2EBE1..2EBEF
+            HAN,                      // 2EBF0..2EE5D
+            UNKNOWN,                  // 2EE5E..2F7FF
            HAN,                      // 2F800..2FA1D
            UNKNOWN,                  // 2FA1E..2FFFF
            HAN,                      // 30000..3134A
--- a/src/java.base/share/classes/jdk/internal/util/regex/Grapheme.java
+++ b/src/java.base/share/classes/jdk/internal/util/regex/Grapheme.java
@ -35,9 +35,9 @@ public final class Grapheme {
     * <p>
     * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
     * for the extended grapheme cluster boundary rules. The following implementation
-     * is based on the annex for Unicode version 15.0.
-     * (http://www.unicode.org/reports/tr29/tr29-40.html)
+     * is based on the annex for Unicode version 15.1.
     *
+     * @spec http://www.unicode.org/reports/tr29/tr29-43.html
     * @param src the {@code CharSequence} to be scanned
     * @param off offset to start looking for the next boundary in the src
     * @param limit limit offset in the src (exclusive)
@ -56,6 +56,15 @@ public final class Grapheme {
            int ch1 = Character.codePointAt(src, ret);
            int t1 = getType(ch1);

+            // GB9c
+            if (IndicConjunctBreak.isConsonant(ch0)) {
+                var advance = checkIndicConjunctBreak(src, ret, limit);
+                if (advance >= 0) {
+                    ret += advance;
+                    continue;
+                }
+            }
+
            if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
                // continue for gb11
            } else if (riCount % 2 == 1 && t0 == RI && t1 == RI) {
@ -70,6 +79,7 @@ public final class Grapheme {
            }

            riCount += (t1 == RI) ? 1 : 0;
+            ch0 = ch1;
            t0 = t1;

            ret += Character.charCount(ch1);
@ -283,4 +293,40 @@ public final class Grapheme {
        }
        return OTHER;
    }
+
+    /**
+     * Checks for a possible GB9c Indic Conjunct Break sequence. If it is
+     * repetitive, e.g., Consonant1/Linker1/Consonant2/Linker2/Consonant3, only
+     * the first part of the sequence (Consonant1/Linker1/Consonant2) is
+     * recognized. The rest is analyzed in the next iteration of the grapheme
+     * cluster boundary search.
+     *
+     * @param src the source char sequence
+     * @param index the index that points to the starting Linking Consonant
+     * @param limit limit to the char sequence
+     * @return the advance in index if the indic conjunct break sequence
+     *      is found, it will be negative if the sequence is not found
+     */
+    private static int checkIndicConjunctBreak(CharSequence src, int index, int limit) {
+        boolean linkerFound = false;
+        int advance = 0;
+
+        while (index + advance < limit) {
+            int ch1 = Character.codePointAt(src, index + advance);
+            advance += Character.charCount(ch1);
+
+            if (IndicConjunctBreak.isLinker(ch1)) {
+                linkerFound = true;
+            } else if (IndicConjunctBreak.isConsonant(ch1)) {
+                if (linkerFound) {
+                    return advance;
+                } else {
+                    break;
+                }
+            } else if (!IndicConjunctBreak.isExtend(ch1)) {
+                break;
+            }
+        }
+        return -1;
+    }
 }
--- a/src/java.base/share/classes/jdk/internal/util/regex/IndicConjunctBreak.java.template
+++ b/src/java.base/share/classes/jdk/internal/util/regex/IndicConjunctBreak.java.template
@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.  Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+package jdk.internal.util.regex;
+
+/**
+ * Helper class for supporting the GB9c rule in Unicode Text Segmentation TR29
+ *
+ * <blockquote>
+ * GB9c Do not break within certain combinations with Indic_Conjunct_Break (InCB)=Linker.
+ *
+ * \p{InCB=Consonant} [ \p{InCB=Extend} \p{InCB=Linker} ]* \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]* x \p{InCB=Consonant}*
+ * </blockquote>
+ *
+ * Code point conditions included in this class are derived from the "Derived Property: Indic_Conjunct_Break"
+ * section in DerivedCoreProperties.txt of the Unicode Character Database.
+ */
+final class IndicConjunctBreak {
+    static boolean isLinker(int cp) {
+        return
+%%%InCB=Linker%%%
+    }
+
+    static boolean isExtend(int cp) {
+        return
+%%%InCB=Extend%%%
+    }
+
+    static boolean isConsonant(int cp) {
+        // fast check - Devanagari to Malayalam
+        if (cp < 0x0900 || cp > 0x0D7F) {
+            return false;
+        }
+
+        return
+%%%InCB=Consonant%%%
+    }
+}