8296246: Update Unicode Data Files to Version 15.1.0

Reviewed-by: erikj, joehw, srl, rriggs
This commit is contained in:
Naoto Sato 2023-09-20 17:39:57 +00:00
parent a021dbcb9e
commit 7c991cc567
22 changed files with 1511 additions and 224 deletions

View file

@ -63,7 +63,7 @@ import static java.lang.constant.ConstantDescs.DEFAULT_NAME;
* from the Unicode Consortium at
* <a href="http://www.unicode.org">http://www.unicode.org</a>.
* <p>
* Character information is based on the Unicode Standard, version 15.0.
* Character information is based on the Unicode Standard, version 15.1.
* <p>
* The Java platform has supported different versions of the Unicode
* Standard over time. Upgrades to newer versions of the Unicode Standard
@ -75,6 +75,8 @@ import static java.lang.constant.ConstantDescs.DEFAULT_NAME;
* <th scope="col">Unicode version</th></tr>
* </thead>
* <tbody>
* <tr><th scope="row" style="text-align:left">Java SE 22</th>
* <td>Unicode 15.1</td></tr>
* <tr><th scope="row" style="text-align:left">Java SE 20</th>
* <td>Unicode 15.0</td></tr>
* <tr><th scope="row" style="text-align:left">Java SE 19</th>
@ -744,7 +746,7 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
* It should be adjusted whenever the Unicode Character Database
* is upgraded.
*/
private static final int NUM_ENTITIES = 756;
private static final int NUM_ENTITIES = 759;
private static Map<String, UnicodeBlock> map = HashMap.newHashMap(NUM_ENTITIES);
/**
@ -3611,6 +3613,16 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
"CJK UNIFIED IDEOGRAPHS EXTENSION H",
"CJKUNIFIEDIDEOGRAPHSEXTENSIONH");
// Implementation note: the blockStarts table lists this block as
// 2EBF0..2EE5F, followed by an unassigned gap starting at 2EE60.
/**
* Constant for the "CJK Unified Ideographs Extension I" Unicode
* character block.
* @since 22
*/
public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I =
new UnicodeBlock("CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I",
"CJK UNIFIED IDEOGRAPHS EXTENSION I",
"CJKUNIFIEDIDEOGRAPHSEXTENSIONI");
private static final int[] blockStarts = {
0x0000, // 0000..007F; Basic Latin
0x0080, // 0080..00FF; Latin-1 Supplement
@ -3978,7 +3990,8 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
0x2B740, // 2B740..2B81F; CJK Unified Ideographs Extension D
0x2B820, // 2B820..2CEAF; CJK Unified Ideographs Extension E
0x2CEB0, // 2CEB0..2EBEF; CJK Unified Ideographs Extension F
0x2EBF0, // unassigned
0x2EBF0, // 2EBF0..2EE5F; CJK Unified Ideographs Extension I
0x2EE60, // unassigned
0x2F800, // 2F800..2FA1F; CJK Compatibility Ideographs Supplement
0x2FA20, // unassigned
0x30000, // 30000..3134F; CJK Unified Ideographs Extension G
@ -4359,6 +4372,7 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F,
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I,
null,
CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
null,
@ -6057,9 +6071,7 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
0x2EF4, // 2EF4..2EFF; UNKNOWN
0x2F00, // 2F00..2FD5; HAN
0x2FD6, // 2FD6..2FEF; UNKNOWN
0x2FF0, // 2FF0..2FFB; COMMON
0x2FFC, // 2FFC..2FFF; UNKNOWN
0x3000, // 3000..3004; COMMON
0x2FF0, // 2FF0..3004; COMMON
0x3005, // 3005 ; HAN
0x3006, // 3006 ; COMMON
0x3007, // 3007 ; HAN
@ -6088,7 +6100,8 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
0x3190, // 3190..319F; COMMON
0x31A0, // 31A0..31BF; BOPOMOFO
0x31C0, // 31C0..31E3; COMMON
0x31E4, // 31E4..31EF; UNKNOWN
0x31E4, // 31E4..31EE; UNKNOWN
0x31EF, // 31EF ; COMMON
0x31F0, // 31F0..31FF; KATAKANA
0x3200, // 3200..321E; HANGUL
0x321F, // 321F ; UNKNOWN
@ -7028,7 +7041,9 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
0x2B820, // 2B820..2CEA1; HAN
0x2CEA2, // 2CEA2..2CEAF; UNKNOWN
0x2CEB0, // 2CEB0..2EBE0; HAN
0x2EBE1, // 2EBE1..2F7FF; UNKNOWN
0x2EBE1, // 2EBE1..2EBEF; UNKNOWN
0x2EBF0, // 2EBF0..2EE5D; HAN
0x2EE5E, // 2EE5E..2F7FF; UNKNOWN
0x2F800, // 2F800..2FA1D; HAN
0x2FA1E, // 2FA1E..2FFFF; UNKNOWN
0x30000, // 30000..3134A; HAN
@ -7717,9 +7732,7 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
UNKNOWN, // 2EF4..2EFF
HAN, // 2F00..2FD5
UNKNOWN, // 2FD6..2FEF
COMMON, // 2FF0..2FFB
UNKNOWN, // 2FFC..2FFF
COMMON, // 3000..3004
COMMON, // 2FF0..3004
HAN, // 3005
COMMON, // 3006
HAN, // 3007
@ -7748,7 +7761,8 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
COMMON, // 3190..319F
BOPOMOFO, // 31A0..31BF
COMMON, // 31C0..31E3
UNKNOWN, // 31E4..31EF
UNKNOWN, // 31E4..31EE
COMMON, // 31EF
KATAKANA, // 31F0..31FF
HANGUL, // 3200..321E
UNKNOWN, // 321F
@ -8688,7 +8702,9 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
HAN, // 2B820..2CEA1
UNKNOWN, // 2CEA2..2CEAF
HAN, // 2CEB0..2EBE0
UNKNOWN, // 2EBE1..2F7FF
UNKNOWN, // 2EBE1..2EBEF
HAN, // 2EBF0..2EE5D
UNKNOWN, // 2EE5E..2F7FF
HAN, // 2F800..2FA1D
UNKNOWN, // 2FA1E..2FFFF
HAN, // 30000..3134A

View file

@ -35,9 +35,9 @@ public final class Grapheme {
* <p>
* See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
* for the extended grapheme cluster boundary rules. The following implementation
* is based on the annex for Unicode version 15.0.
* (http://www.unicode.org/reports/tr29/tr29-40.html)
* is based on the annex for Unicode version 15.1.
*
* @spec http://www.unicode.org/reports/tr29/tr29-43.html
* @param src the {@code CharSequence} to be scanned
* @param off offset to start looking for the next boundary in the src
* @param limit limit offset in the src (exclusive)
@ -56,6 +56,15 @@ public final class Grapheme {
int ch1 = Character.codePointAt(src, ret);
int t1 = getType(ch1);
// GB9c
if (IndicConjunctBreak.isConsonant(ch0)) {
var advance = checkIndicConjunctBreak(src, ret, limit);
if (advance >= 0) {
ret += advance;
continue;
}
}
if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
// continue for gb11
} else if (riCount % 2 == 1 && t0 == RI && t1 == RI) {
@ -70,6 +79,7 @@ public final class Grapheme {
}
riCount += (t1 == RI) ? 1 : 0;
ch0 = ch1;
t0 = t1;
ret += Character.charCount(ch1);
@ -283,4 +293,40 @@ public final class Grapheme {
}
return OTHER;
}
/**
 * Looks ahead for a GB9c (Indic Conjunct Break) sequence and reports how
 * far the caller may skip without placing a grapheme cluster boundary.
 * <p>
 * Starting at {@code index}, code points are consumed one at a time:
 * <ul>
 * <li>a Linker is remembered and scanning goes on;</li>
 * <li>an Extend is skipped over;</li>
 * <li>a Consonant ends the scan: it completes the sequence if at least one
 *     Linker was seen before it, otherwise the look-ahead fails;</li>
 * <li>any other code point makes the look-ahead fail.</li>
 * </ul>
 * Only the first Consonant/Linker/Consonant group is matched. For a
 * repeated sequence such as
 * Consonant1/Linker1/Consonant2/Linker2/Consonant3, the remainder is left
 * for the next iteration of the grapheme cluster boundary search.
 *
 * @param src the source char sequence
 * @param index the index at which scanning starts; NOTE(review): the
 *        visible call site passes the offset of the code point that follows
 *        an already recognized Consonant - confirm against the caller
 * @param limit limit to the char sequence (scanning stops once the
 *        current position reaches it)
 * @return the number of chars from {@code index} up to and including the
 *         closing Consonant if the sequence is found, otherwise {@code -1}
 */
private static int checkIndicConjunctBreak(CharSequence src, int index, int limit) {
    boolean sawLinker = false;
    for (int pos = index; pos < limit; ) {
        final int cp = Character.codePointAt(src, pos);
        // Step past the code point first so that "pos - index" already
        // includes it when the closing Consonant is reached.
        pos += Character.charCount(cp);

        if (IndicConjunctBreak.isLinker(cp)) {
            sawLinker = true;
            continue;
        }
        if (IndicConjunctBreak.isConsonant(cp)) {
            // A Consonant only closes the sequence when a Linker precedes it.
            return sawLinker ? pos - index : -1;
        }
        if (!IndicConjunctBreak.isExtend(cp)) {
            // Neither Linker, Consonant nor Extend: not a GB9c sequence.
            return -1;
        }
    }
    // Ran into the limit before a closing Consonant showed up.
    return -1;
}
}

View file

@ -0,0 +1,60 @@
/*
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package jdk.internal.util.regex;
/**
* Helper class for supporting the GB9c rule in Unicode Text Segmentation TR29
*
* <blockquote>
* GB9c Do not break within certain combinations with Indic_Conjunct_Break (InCB)=Linker.
*
* \p{InCB=Consonant} [ \p{InCB=Extend} \p{InCB=Linker} ]* \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]* x \p{InCB=Consonant}
* </blockquote>
*
* Code point conditions included in this class are derived from the "Derived Property: Indic_Conjunct_Break"
* section in DerivedCoreProperties.txt of the Unicode Character Database.
*
* NOTE(review): the %%%InCB=...%%% markers in the method bodies look like
* template placeholders that are expected to be replaced with generated
* boolean expressions (including the terminating semicolon) before this
* file is compiled - confirm against the build tooling.
*/
final class IndicConjunctBreak {
/**
* Tests a code point against the InCB=Linker condition.
*
* @param cp the code point to test
* @return the value of the expression substituted for the placeholder below
*/
static boolean isLinker(int cp) {
return
%%%InCB=Linker%%%
}
/**
* Tests a code point against the InCB=Extend condition.
*
* @param cp the code point to test
* @return the value of the expression substituted for the placeholder below
*/
static boolean isExtend(int cp) {
return
%%%InCB=Extend%%%
}
/**
* Tests a code point against the InCB=Consonant condition.
* Code points outside 0x0900..0x0D7F (inclusive) are rejected up front,
* before the substituted condition is evaluated.
*
* @param cp the code point to test
* @return {@code false} for any code point outside 0x0900..0x0D7F, otherwise
*         the value of the expression substituted for the placeholder below
*/
static boolean isConsonant(int cp) {
// fast check - Devanagari to Malayalam
if (cp < 0x0900 || cp > 0x0D7F) {
return false;
}
return
%%%InCB=Consonant%%%
}
}