mirror of
https://github.com/openjdk/jdk.git
synced 2025-08-28 15:24:43 +02:00
8296246: Update Unicode Data Files to Version 15.1.0
Reviewed-by: erikj, joehw, srl, rriggs
This commit is contained in:
parent
a021dbcb9e
commit
7c991cc567
22 changed files with 1511 additions and 224 deletions
|
@ -63,7 +63,7 @@ import static java.lang.constant.ConstantDescs.DEFAULT_NAME;
|
|||
* from the Unicode Consortium at
|
||||
* <a href="http://www.unicode.org">http://www.unicode.org</a>.
|
||||
* <p>
|
||||
* Character information is based on the Unicode Standard, version 15.0.
|
||||
* Character information is based on the Unicode Standard, version 15.1.
|
||||
* <p>
|
||||
* The Java platform has supported different versions of the Unicode
|
||||
* Standard over time. Upgrades to newer versions of the Unicode Standard
|
||||
|
@ -75,6 +75,8 @@ import static java.lang.constant.ConstantDescs.DEFAULT_NAME;
|
|||
* <th scope="col">Unicode version</th></tr>
|
||||
* </thead>
|
||||
* <tbody>
|
||||
* <tr><th scope="row" style="text-align:left">Java SE 22</th>
|
||||
* <td>Unicode 15.1</td></tr>
|
||||
* <tr><th scope="row" style="text-align:left">Java SE 20</th>
|
||||
* <td>Unicode 15.0</td></tr>
|
||||
* <tr><th scope="row" style="text-align:left">Java SE 19</th>
|
||||
|
@ -744,7 +746,7 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
|
|||
* It should be adjusted whenever the Unicode Character Database
|
||||
* is upgraded.
|
||||
*/
|
||||
private static final int NUM_ENTITIES = 756;
|
||||
private static final int NUM_ENTITIES = 759;
|
||||
private static Map<String, UnicodeBlock> map = HashMap.newHashMap(NUM_ENTITIES);
|
||||
|
||||
/**
|
||||
|
@ -3611,6 +3613,16 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
|
|||
"CJK UNIFIED IDEOGRAPHS EXTENSION H",
|
||||
"CJKUNIFIEDIDEOGRAPHSEXTENSIONH");
|
||||
|
||||
/**
|
||||
* Constant for the "CJK Unified Ideographs Extension I" Unicode
|
||||
* character block.
|
||||
* @since 22
|
||||
*/
|
||||
public static final UnicodeBlock CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I =
|
||||
new UnicodeBlock("CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I",
|
||||
"CJK UNIFIED IDEOGRAPHS EXTENSION I",
|
||||
"CJKUNIFIEDIDEOGRAPHSEXTENSIONI");
|
||||
|
||||
private static final int[] blockStarts = {
|
||||
0x0000, // 0000..007F; Basic Latin
|
||||
0x0080, // 0080..00FF; Latin-1 Supplement
|
||||
|
@ -3978,7 +3990,8 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
|
|||
0x2B740, // 2B740..2B81F; CJK Unified Ideographs Extension D
|
||||
0x2B820, // 2B820..2CEAF; CJK Unified Ideographs Extension E
|
||||
0x2CEB0, // 2CEB0..2EBEF; CJK Unified Ideographs Extension F
|
||||
0x2EBF0, // unassigned
|
||||
0x2EBF0, // 2EBF0..2EE5F; CJK Unified Ideographs Extension I
|
||||
0x2EE60, // unassigned
|
||||
0x2F800, // 2F800..2FA1F; CJK Compatibility Ideographs Supplement
|
||||
0x2FA20, // unassigned
|
||||
0x30000, // 30000..3134F; CJK Unified Ideographs Extension G
|
||||
|
@ -4359,6 +4372,7 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
|
|||
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_D,
|
||||
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_E,
|
||||
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_F,
|
||||
CJK_UNIFIED_IDEOGRAPHS_EXTENSION_I,
|
||||
null,
|
||||
CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT,
|
||||
null,
|
||||
|
@ -6057,9 +6071,7 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
|
|||
0x2EF4, // 2EF4..2EFF; UNKNOWN
|
||||
0x2F00, // 2F00..2FD5; HAN
|
||||
0x2FD6, // 2FD6..2FEF; UNKNOWN
|
||||
0x2FF0, // 2FF0..2FFB; COMMON
|
||||
0x2FFC, // 2FFC..2FFF; UNKNOWN
|
||||
0x3000, // 3000..3004; COMMON
|
||||
0x2FF0, // 2FF0..3004; COMMON
|
||||
0x3005, // 3005 ; HAN
|
||||
0x3006, // 3006 ; COMMON
|
||||
0x3007, // 3007 ; HAN
|
||||
|
@ -6088,7 +6100,8 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
|
|||
0x3190, // 3190..319F; COMMON
|
||||
0x31A0, // 31A0..31BF; BOPOMOFO
|
||||
0x31C0, // 31C0..31E3; COMMON
|
||||
0x31E4, // 31E4..31EF; UNKNOWN
|
||||
0x31E4, // 31E4..31EE; UNKNOWN
|
||||
0x31EF, // 31EF ; COMMON
|
||||
0x31F0, // 31F0..31FF; KATAKANA
|
||||
0x3200, // 3200..321E; HANGUL
|
||||
0x321F, // 321F ; UNKNOWN
|
||||
|
@ -7028,7 +7041,9 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
|
|||
0x2B820, // 2B820..2CEA1; HAN
|
||||
0x2CEA2, // 2CEA2..2CEAF; UNKNOWN
|
||||
0x2CEB0, // 2CEB0..2EBE0; HAN
|
||||
0x2EBE1, // 2EBE1..2F7FF; UNKNOWN
|
||||
0x2EBE1, // 2EBE1..2EBEF; UNKNOWN
|
||||
0x2EBF0, // 2EBF0..2EE5D; HAN
|
||||
0x2EE5E, // 2EE5E..2F7FF; UNKNOWN
|
||||
0x2F800, // 2F800..2FA1D; HAN
|
||||
0x2FA1E, // 2FA1E..2FFFF; UNKNOWN
|
||||
0x30000, // 30000..3134A; HAN
|
||||
|
@ -7717,9 +7732,7 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
|
|||
UNKNOWN, // 2EF4..2EFF
|
||||
HAN, // 2F00..2FD5
|
||||
UNKNOWN, // 2FD6..2FEF
|
||||
COMMON, // 2FF0..2FFB
|
||||
UNKNOWN, // 2FFC..2FFF
|
||||
COMMON, // 3000..3004
|
||||
COMMON, // 2FF0..3004
|
||||
HAN, // 3005
|
||||
COMMON, // 3006
|
||||
HAN, // 3007
|
||||
|
@ -7748,7 +7761,8 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
|
|||
COMMON, // 3190..319F
|
||||
BOPOMOFO, // 31A0..31BF
|
||||
COMMON, // 31C0..31E3
|
||||
UNKNOWN, // 31E4..31EF
|
||||
UNKNOWN, // 31E4..31EE
|
||||
COMMON, // 31EF
|
||||
KATAKANA, // 31F0..31FF
|
||||
HANGUL, // 3200..321E
|
||||
UNKNOWN, // 321F
|
||||
|
@ -8688,7 +8702,9 @@ class Character implements java.io.Serializable, Comparable<Character>, Constabl
|
|||
HAN, // 2B820..2CEA1
|
||||
UNKNOWN, // 2CEA2..2CEAF
|
||||
HAN, // 2CEB0..2EBE0
|
||||
UNKNOWN, // 2EBE1..2F7FF
|
||||
UNKNOWN, // 2EBE1..2EBEF
|
||||
HAN, // 2EBF0..2EE5D
|
||||
UNKNOWN, // 2EE5E..2F7FF
|
||||
HAN, // 2F800..2FA1D
|
||||
UNKNOWN, // 2FA1E..2FFFF
|
||||
HAN, // 30000..3134A
|
||||
|
|
|
@ -35,9 +35,9 @@ public final class Grapheme {
|
|||
* <p>
|
||||
* See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
|
||||
* for the extended grapheme cluster boundary rules. The following implementation
|
||||
* is based on the annex for Unicode version 15.0.
|
||||
* (http://www.unicode.org/reports/tr29/tr29-40.html)
|
||||
* is based on the annex for Unicode version 15.1.
|
||||
*
|
||||
* @spec http://www.unicode.org/reports/tr29/tr29-43.html
|
||||
* @param src the {@code CharSequence} to be scanned
|
||||
* @param off offset to start looking for the next boundary in the src
|
||||
* @param limit limit offset in the src (exclusive)
|
||||
|
@ -56,6 +56,15 @@ public final class Grapheme {
|
|||
int ch1 = Character.codePointAt(src, ret);
|
||||
int t1 = getType(ch1);
|
||||
|
||||
// GB9c
|
||||
if (IndicConjunctBreak.isConsonant(ch0)) {
|
||||
var advance = checkIndicConjunctBreak(src, ret, limit);
|
||||
if (advance >= 0) {
|
||||
ret += advance;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
|
||||
// continue for gb11
|
||||
} else if (riCount % 2 == 1 && t0 == RI && t1 == RI) {
|
||||
|
@ -70,6 +79,7 @@ public final class Grapheme {
|
|||
}
|
||||
|
||||
riCount += (t1 == RI) ? 1 : 0;
|
||||
ch0 = ch1;
|
||||
t0 = t1;
|
||||
|
||||
ret += Character.charCount(ch1);
|
||||
|
@ -283,4 +293,40 @@ public final class Grapheme {
|
|||
}
|
||||
return OTHER;
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks for a possible GB9c Indic Conjunct Break sequence. If it is
|
||||
* repetitive, e.g., Consonant1/Linker1/Consonant2/Linker2/Consonant3, only
|
||||
* the first part of the sequence (Consonant1/Linker1/Consonant2) is
|
||||
* recognized. The rest is analyzed in the next iteration of the grapheme
|
||||
* cluster boundary search.
|
||||
*
|
||||
* @param src the source char sequence
|
||||
* @param index the index that points to the starting Linking Consonant
|
||||
* @param limit limit to the char sequence
|
||||
* @return the advance in index if the indic conjunct break sequence
|
||||
* is found, it will be negative if the sequence is not found
|
||||
*/
|
||||
private static int checkIndicConjunctBreak(CharSequence src, int index, int limit) {
|
||||
boolean linkerFound = false;
|
||||
int advance = 0;
|
||||
|
||||
while (index + advance < limit) {
|
||||
int ch1 = Character.codePointAt(src, index + advance);
|
||||
advance += Character.charCount(ch1);
|
||||
|
||||
if (IndicConjunctBreak.isLinker(ch1)) {
|
||||
linkerFound = true;
|
||||
} else if (IndicConjunctBreak.isConsonant(ch1)) {
|
||||
if (linkerFound) {
|
||||
return advance;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} else if (!IndicConjunctBreak.isExtend(ch1)) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,60 @@
|
|||
/*
|
||||
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
package jdk.internal.util.regex;
|
||||
|
||||
/**
|
||||
* Helper class for supporting the GB9c rule in Unicode Text Segmentation TR29
|
||||
*
|
||||
* <blockquote>
|
||||
* GB9c Do not break within certain combinations with Indic_Conjunct_Break (InCB)=Linker.
|
||||
*
|
||||
* \p{InCB=Consonant} [ \p{InCB=Extend} \p{InCB=Linker} ]* \p{InCB=Linker} [ \p{InCB=Extend} \p{InCB=Linker} ]* x \p{InCB=Consonant}
|
||||
* </blockquote>
|
||||
*
|
||||
* Code point conditions included in this class are derived from the "Derived Property: Indic_Conjunct_Break"
|
||||
* section in DerivedCoreProperties.txt of the Unicode Character Database.
|
||||
*/
|
||||
// NOTE(review): this is a source template, not compilable Java as shown.
// Each %%%InCB=...%%% marker is presumably replaced at build time by a
// boolean expression over cp (plus its terminating ';') derived from the
// Indic_Conjunct_Break data in DerivedCoreProperties.txt - confirm against
// the generator before editing the surrounding return statements.
final class IndicConjunctBreak {
|
||||
// Returns whether cp satisfies the generated InCB=Linker condition.
static boolean isLinker(int cp) {
|
||||
return
|
||||
%%%InCB=Linker%%%
|
||||
}
|
||||
|
||||
// Returns whether cp satisfies the generated InCB=Extend condition.
static boolean isExtend(int cp) {
|
||||
return
|
||||
%%%InCB=Extend%%%
|
||||
}
|
||||
|
||||
// Returns whether cp satisfies the generated InCB=Consonant condition.
// Code points outside 0x0900..0x0D7F are rejected up front, so the
// generated condition is only evaluated for code points inside that range.
static boolean isConsonant(int cp) {
|
||||
// fast check - Devanagari to Malayalam
|
||||
if (cp < 0x0900 || cp > 0x0D7F) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
return
|
||||
%%%InCB=Consonant%%%
|
||||
}
|
||||
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue