mirror of
https://github.com/openjdk/jdk.git
synced 2025-08-27 23:04:50 +02:00
8268081: Upgrade Unicode Data Files to 14.0.0
Reviewed-by: joehw, iris, lancea
This commit is contained in:
parent
ddddec7d74
commit
0a094d7c28
38 changed files with 3333 additions and 1081 deletions
File diff suppressed because it is too large
Load diff
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2016, 2021, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2016, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -35,8 +35,8 @@ final class Grapheme {
|
|||
* <p>
|
||||
* See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
|
||||
* for the extended grapheme cluster boundary rules. The following implementation
|
||||
* is based on version 12.0 of the annex.
|
||||
* (http://www.unicode.org/reports/tr29/tr29-35.html)
|
||||
* is based on the annex for Unicode version 14.0.
|
||||
* (http://www.unicode.org/reports/tr29/tr29-38.html)
|
||||
*
|
||||
* @param src the {@code CharSequence} to be scanned
|
||||
* @param off offset to start looking for the next boundary in the src
|
||||
|
@ -97,7 +97,7 @@ final class Grapheme {
|
|||
private static final int FIRST_TYPE = 0;
|
||||
private static final int LAST_TYPE = 14;
|
||||
|
||||
private static boolean[][] rules;
|
||||
private static final boolean[][] rules;
|
||||
static {
|
||||
rules = new boolean[LAST_TYPE + 1][LAST_TYPE + 1];
|
||||
// GB 999 Any + Any -> default
|
||||
|
@ -201,8 +201,9 @@ final class Grapheme {
|
|||
if (cp == 0x200D)
|
||||
return ZWJ;
|
||||
if (cp >= 0x0600 && cp <= 0x0605 ||
|
||||
cp == 0x06DD || cp == 0x070F || cp == 0x08E2 ||
|
||||
cp == 0x110BD || cp == 0x110CD)
|
||||
cp == 0x06DD || cp == 0x070F ||
|
||||
cp == 0x0890 || cp == 0x0891 ||
|
||||
cp == 0x08E2 || cp == 0x110BD || cp == 0x110CD)
|
||||
return PREPEND;
|
||||
return CONTROL;
|
||||
case Character.NON_SPACING_MARK:
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2005, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -76,9 +76,6 @@ public final class Punycode {
|
|||
// TODO: eliminate the 256 limitation
|
||||
private static final int MAX_CP_COUNT = 256;
|
||||
|
||||
private static final int UINT_MAGIC = 0x80000000;
|
||||
private static final long ULONG_MAGIC = 0x8000000000000000L;
|
||||
|
||||
private static int adaptBias(int delta, int length, boolean firstTime){
|
||||
if(firstTime){
|
||||
delta /=DAMP;
|
||||
|
@ -96,34 +93,25 @@ public final class Punycode {
|
|||
}
|
||||
|
||||
/**
|
||||
* basicToDigit[] contains the numeric value of a basic code
|
||||
* point (for use in representing integers) in the range 0 to
|
||||
* BASE-1, or -1 if b is does not represent a value.
|
||||
* @return the numeric value of a basic code point (for use in representing integers)
|
||||
* in the range 0 to BASE-1, or a negative value if cp is invalid.
|
||||
*/
|
||||
static final int[] basicToDigit= new int[]{
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
|
||||
|
||||
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
|
||||
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
|
||||
|
||||
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
|
||||
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
|
||||
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
|
||||
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
|
||||
private static final int decodeDigit(int cp) {
|
||||
if(cp<='Z') {
|
||||
if(cp<='9') {
|
||||
if(cp<'0') {
|
||||
return -1;
|
||||
} else {
|
||||
return cp-'0'+26; // 0..9 -> 26..35
|
||||
}
|
||||
} else {
|
||||
return cp-'A'; // A-Z -> 0..25
|
||||
}
|
||||
} else if(cp<='z') {
|
||||
return cp-'a'; // a..z -> 0..25
|
||||
} else {
|
||||
return -1;
|
||||
}
|
||||
};
|
||||
|
||||
private static char asciiCaseMap(char b, boolean uppercase) {
|
||||
|
@ -158,6 +146,12 @@ public final class Punycode {
|
|||
return (char)((ZERO-26)+digit);
|
||||
}
|
||||
}
|
||||
|
||||
// ICU-13727: Limit input length for n^2 algorithm
|
||||
// where well-formed strings are at most 59 characters long.
|
||||
private static final int ENCODE_MAX_CODE_UNITS = 1000;
|
||||
private static final int DECODE_MAX_CHARS = 2000;
|
||||
|
||||
/**
|
||||
* Converts Unicode to Punycode.
|
||||
* The input string must not contain single, unpaired surrogates.
|
||||
|
@ -174,6 +168,10 @@ public final class Punycode {
|
|||
int n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
|
||||
char c, c2;
|
||||
int srcLength = src.length();
|
||||
if (srcLength > ENCODE_MAX_CODE_UNITS) {
|
||||
throw new RuntimeException(
|
||||
"input too long: " + srcLength + " UTF-16 code units");
|
||||
}
|
||||
int destCapacity = MAX_CP_COUNT;
|
||||
char[] dest = new char[destCapacity];
|
||||
StringBuffer result = new StringBuffer();
|
||||
|
@ -251,7 +249,7 @@ public final class Punycode {
|
|||
* Increase delta enough to advance the decoder's
|
||||
* <n,i> state to <m,0>, but guard against overflow:
|
||||
*/
|
||||
if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
|
||||
if(m-n>(0x7fffffff-handledCPCount-delta)/(handledCPCount+1)) {
|
||||
throw new RuntimeException("Internal program error");
|
||||
}
|
||||
delta+=(m-n)*(handledCPCount+1);
|
||||
|
@ -332,6 +330,9 @@ public final class Punycode {
|
|||
public static StringBuffer decode(StringBuffer src, boolean[] caseFlags)
|
||||
throws ParseException{
|
||||
int srcLength = src.length();
|
||||
if (srcLength > DECODE_MAX_CHARS) {
|
||||
throw new RuntimeException("input too long: " + srcLength + " characters");
|
||||
}
|
||||
StringBuffer result = new StringBuffer();
|
||||
int n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
|
||||
destCPCount, firstSupplementaryIndex, cpLength;
|
||||
|
@ -395,7 +396,7 @@ public final class Punycode {
|
|||
throw new ParseException("Illegal char found", -1);
|
||||
}
|
||||
|
||||
digit=basicToDigit[(byte)src.charAt(in++)];
|
||||
digit=decodeDigit(src.charAt(in++));
|
||||
if(digit<0) {
|
||||
throw new ParseException("Invalid char found", -1);
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2015, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -134,9 +134,15 @@ public class UnicodeSetStringSpan {
|
|||
|
||||
int i, spanLength;
|
||||
someRelevant = false;
|
||||
for (i = 0; i < stringsLength; ++i) {
|
||||
for (i = 0; i < stringsLength;) {
|
||||
String string = strings.get(i);
|
||||
int length16 = string.length();
|
||||
if (length16 == 0) {
|
||||
// Remove the empty string.
|
||||
strings.remove(i);
|
||||
--stringsLength;
|
||||
continue;
|
||||
}
|
||||
spanLength = spanSet.span(string, SpanCondition.CONTAINED);
|
||||
if (spanLength < length16) { // Relevant string.
|
||||
someRelevant = true;
|
||||
|
@ -144,6 +150,7 @@ public class UnicodeSetStringSpan {
|
|||
if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) {
|
||||
maxLength16 = length16;
|
||||
}
|
||||
++i;
|
||||
}
|
||||
if (!someRelevant && (which & WITH_COUNT) == 0) {
|
||||
return;
|
||||
|
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2009, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2009, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -84,7 +84,7 @@ import jdk.internal.icu.util.VersionInfo;
|
|||
* <p>
|
||||
* Further detail on differences can be determined using the program
|
||||
* <a href=
|
||||
* "http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java">
|
||||
* "https://github.com/unicode-org/icu/blob/main/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java">
|
||||
* com.ibm.icu.dev.test.lang.UCharacterCompare</a>
|
||||
* </p>
|
||||
* <p>
|
||||
|
@ -101,9 +101,9 @@ import jdk.internal.icu.util.VersionInfo;
|
|||
* For more information see
|
||||
* <a href="http://www.unicode.org/ucd/">"About the Unicode Character Database"</a>
|
||||
* (http://www.unicode.org/ucd/)
|
||||
* and the <a href="http://www.icu-project.org/userguide/properties.html">ICU
|
||||
* and the <a href="https://unicode-org.github.io/icu/userguide/strings/properties">ICU
|
||||
* User Guide chapter on Properties</a>
|
||||
* (http://www.icu-project.org/userguide/properties.html).
|
||||
* (https://unicode-org.github.io/icu/userguide/strings/properties).
|
||||
* </p>
|
||||
* <p>
|
||||
* There are also functions that provide easy migration from C/POSIX functions
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2005, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -53,7 +53,7 @@ public final class UCharacterDirection implements UCharacterEnums.ECharacterDire
|
|||
// private constructor =========================================
|
||||
///CLOVER:OFF
|
||||
/**
|
||||
* Private constructor to prevent initialisation
|
||||
* Private constructor to prevent initialization
|
||||
*/
|
||||
private UCharacterDirection()
|
||||
{
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2005, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -62,7 +62,7 @@ package jdk.internal.icu.lang;
|
|||
@Deprecated
|
||||
class UCharacterEnums {
|
||||
|
||||
/** This is just a namespace, it is not instantiatable. */
|
||||
/** This is just a namespace, it is not instantiable. */
|
||||
private UCharacterEnums() {};
|
||||
|
||||
/**
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2009, 2021, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2009, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -63,7 +63,7 @@ import jdk.internal.icu.impl.UBiDiProps;
|
|||
*
|
||||
* This is an implementation of the Unicode Bidirectional Algorithm. The
|
||||
* algorithm is defined in the
|
||||
* <a href="http://www.unicode.org/reports/tr9/">Unicode Standard Annex #9:
|
||||
* <a href="https://www.unicode.org/reports/tr9/">Unicode Standard Annex #9:
|
||||
* Unicode Bidirectional Algorithm</a>.
|
||||
* <p>
|
||||
*
|
||||
|
@ -985,7 +985,7 @@ public class BidiBase {
|
|||
/**
|
||||
* Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3).
|
||||
* Used in
|
||||
* <a href="http://www.unicode.org/reports/tr9/">Unicode Standard Annex #9:
|
||||
* <a href="https://www.unicode.org/reports/tr9/">Unicode Standard Annex #9:
|
||||
* Unicode Bidirectional Algorithm</a>.
|
||||
* Returns UCharacter.BidiPairedBracketType values.
|
||||
* @stable ICU 52
|
||||
|
@ -3365,7 +3365,7 @@ public class BidiBase {
|
|||
|
||||
/**
|
||||
* Perform the Unicode Bidi algorithm. It is defined in the
|
||||
* <a href="http://www.unicode.org/reports/tr9/">Unicode Standard Annex #9:
|
||||
* <a href="https://www.unicode.org/reports/tr9/">Unicode Standard Annex #9:
|
||||
* Unicode Bidirectional Algorithm</a>, version 13,
|
||||
* also described in The Unicode Standard, Version 4.0 .<p>
|
||||
*
|
||||
|
@ -3450,7 +3450,7 @@ public class BidiBase {
|
|||
|
||||
/**
|
||||
* Perform the Unicode Bidi algorithm. It is defined in the
|
||||
* <a href="http://www.unicode.org/reports/tr9/">Unicode Standard Annex #9:
|
||||
* <a href="https://www.unicode.org/reports/tr9/">Unicode Standard Annex #9:
|
||||
* Unicode Bidirectional Algorithm</a>, version 13,
|
||||
* also described in The Unicode Standard, Version 4.0 .<p>
|
||||
*
|
||||
|
@ -3786,7 +3786,7 @@ public class BidiBase {
|
|||
|
||||
/**
|
||||
* Perform the Unicode Bidi algorithm on a given paragraph, as defined in the
|
||||
* <a href="http://www.unicode.org/reports/tr9/">Unicode Standard Annex #9:
|
||||
* <a href="https://www.unicode.org/reports/tr9/">Unicode Standard Annex #9:
|
||||
* Unicode Bidirectional Algorithm</a>, version 13,
|
||||
* also described in The Unicode Standard, Version 4.0 .<p>
|
||||
*
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2009, 2021, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2009, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -47,7 +47,7 @@ final class BidiLine {
|
|||
* text in a single paragraph or in a line of a single paragraph
|
||||
* which has already been processed according to
|
||||
* the Unicode 3.0 Bidi algorithm as defined in
|
||||
* <a href="http://www.unicode.org/reports/tr9/">Unicode Standard Annex #9:
|
||||
* <a href="https://www.unicode.org/reports/tr9/">Unicode Standard Annex #9:
|
||||
* Unicode Bidirectional Algorithm</a>, version 13,
|
||||
* also described in The Unicode Standard, Version 4.0.1 .
|
||||
*
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2015, 2021, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2015, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -43,7 +43,7 @@ import jdk.internal.icu.impl.Norm2AllModes;
|
|||
* The primary functions are to produce a normalized string and to detect whether
|
||||
* a string is already normalized.
|
||||
* The most commonly used normalization forms are those defined in
|
||||
* <a href="http://www.unicode.org/reports/tr15/">Unicode Standard Annex #15:
|
||||
* <a href="https://www.unicode.org/reports/tr15/">Unicode Standard Annex #15:
|
||||
* Unicode Normalization Forms</a>.
|
||||
* However, this API supports additional normalization forms for specialized purposes.
|
||||
* For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2005, 2021, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2005, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -44,7 +44,7 @@ import java.text.Normalizer;
|
|||
* <code>normalize</code> transforms Unicode text into an equivalent composed or
|
||||
* decomposed form, allowing for easier sorting and searching of text.
|
||||
* <code>normalize</code> supports the standard normalization forms described in
|
||||
* <a href="http://www.unicode.org/reports/tr15/" target="unicode">
|
||||
* <a href="https://www.unicode.org/reports/tr15/" target="unicode">
|
||||
* Unicode Standard Annex #15 — Unicode Normalization Forms</a>.
|
||||
*
|
||||
* Characters with accents or other adornments can be encoded in
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2005, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -67,9 +67,9 @@ import jdk.internal.icu.util.VersionInfo;
|
|||
* <li> Unassigned Table: Contains code points that are unassigned
|
||||
* in the Unicode Version supported by StringPrep. Currently
|
||||
* RFC 3454 supports Unicode 3.2. </li>
|
||||
* <li> Prohibited Table: Contains code points that are prohibted from
|
||||
* <li> Prohibited Table: Contains code points that are prohibited from
|
||||
* the output of the StringPrep processing function. </li>
|
||||
* <li> Mapping Table: Contains code ponts that are deleted from the output or case mapped. </li>
|
||||
* <li> Mapping Table: Contains code points that are deleted from the output or case mapped. </li>
|
||||
* </ul>
|
||||
*
|
||||
* The procedure for preparing Unicode strings:
|
||||
|
@ -226,8 +226,8 @@ public final class StringPrep {
|
|||
sprepUniVer = getVersionInfo(reader.getUnicodeVersion());
|
||||
normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
|
||||
VersionInfo normUniVer = UCharacter.getUnicodeVersion();
|
||||
if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */
|
||||
normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */
|
||||
if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Version of the normalization data */
|
||||
normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Version of the normalization data */
|
||||
((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/
|
||||
){
|
||||
throw new IOException("Normalization Correction version not supported");
|
||||
|
@ -325,7 +325,7 @@ public final class StringPrep {
|
|||
ch -= val.value;
|
||||
}
|
||||
}else if(val.type == DELETE){
|
||||
// just consume the codepoint and contine
|
||||
// just consume the codepoint and continue
|
||||
continue;
|
||||
}
|
||||
//copy the source into destination
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -147,9 +147,9 @@ public abstract class UCharacterIterator
|
|||
*/
|
||||
public int nextCodePoint(){
|
||||
int ch1 = next();
|
||||
if(UTF16.isLeadSurrogate((char)ch1)){
|
||||
if(UTF16.isLeadSurrogate(ch1)){
|
||||
int ch2 = next();
|
||||
if(UTF16.isTrailSurrogate((char)ch2)){
|
||||
if(UTF16.isTrailSurrogate(ch2)){
|
||||
return UCharacterProperty.getRawSupplementary((char)ch1,
|
||||
(char)ch2);
|
||||
}else if (ch2 != DONE) {
|
||||
|
@ -175,7 +175,7 @@ public abstract class UCharacterIterator
|
|||
/**
|
||||
* Retreat to the start of the previous code point in the text,
|
||||
* and return it (pre-decrement semantics). If the index is not
|
||||
* preceeded by a valid surrogate pair, the behavior is the same
|
||||
* preceded by a valid surrogate pair, the behavior is the same
|
||||
* as <code>previous()</code>. Otherwise the iterator is
|
||||
* decremented to the start of the surrogate pair, and the code
|
||||
* point represented by the pair is returned.
|
||||
|
@ -185,9 +185,9 @@ public abstract class UCharacterIterator
|
|||
*/
|
||||
public int previousCodePoint(){
|
||||
int ch1 = previous();
|
||||
if(UTF16.isTrailSurrogate((char)ch1)){
|
||||
if(UTF16.isTrailSurrogate(ch1)){
|
||||
int ch2 = previous();
|
||||
if(UTF16.isLeadSurrogate((char)ch2)){
|
||||
if(UTF16.isLeadSurrogate(ch2)){
|
||||
return UCharacterProperty.getRawSupplementary((char)ch2,
|
||||
(char)ch1);
|
||||
}else if (ch2 != DONE) {
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2005, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -382,36 +382,39 @@ public final class UTF16
|
|||
}
|
||||
|
||||
/**
|
||||
* Determines whether the code value is a surrogate.
|
||||
* @param char16 the input character.
|
||||
* @return true if the input character is a surrogate.
|
||||
* @stable ICU 2.1
|
||||
* Determines whether the code point is a surrogate.
|
||||
*
|
||||
* @param codePoint The input character.
|
||||
* (In ICU 2.1-69 the type of this parameter was <code>char</code>.)
|
||||
* @return true If the input code point is a surrogate.
|
||||
* @stable ICU 70
|
||||
*/
|
||||
public static boolean isSurrogate(char char16)
|
||||
{
|
||||
return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS;
|
||||
public static boolean isSurrogate(int codePoint) {
|
||||
return (codePoint & SURROGATE_BITMASK) == SURROGATE_BITS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether the character is a trail surrogate.
|
||||
* @param char16 the input character.
|
||||
* @return true if the input character is a trail surrogate.
|
||||
* @stable ICU 2.1
|
||||
* Determines whether the code point is a trail surrogate.
|
||||
*
|
||||
* @param codePoint The input character.
|
||||
* (In ICU 2.1-69 the type of this parameter was <code>char</code>.)
|
||||
* @return true If the input code point is a trail surrogate.
|
||||
* @stable ICU 70
|
||||
*/
|
||||
public static boolean isTrailSurrogate(char char16)
|
||||
{
|
||||
return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
|
||||
public static boolean isTrailSurrogate(int codePoint) {
|
||||
return (codePoint & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
|
||||
}
|
||||
|
||||
/**
|
||||
* Determines whether the character is a lead surrogate.
|
||||
* @param char16 the input character.
|
||||
* @return true if the input character is a lead surrogate
|
||||
* @stable ICU 2.1
|
||||
* Determines whether the code point is a lead surrogate.
|
||||
*
|
||||
* @param codePoint The input character.
|
||||
* (In ICU 2.1-69 the type of this parameter was <code>char</code>.)
|
||||
* @return true If the input code point is a lead surrogate
|
||||
* @stable ICU 70
|
||||
*/
|
||||
public static boolean isLeadSurrogate(char char16)
|
||||
{
|
||||
return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
|
||||
public static boolean isLeadSurrogate(int codePoint) {
|
||||
return (codePoint & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2005, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -135,8 +135,8 @@ import jdk.internal.icu.util.VersionInfo;
|
|||
* "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a
|
||||
* complete list of supported property patterns, see the User's Guide
|
||||
* for UnicodeSet at
|
||||
* <a href="http://www.icu-project.org/userguide/unicodeSet.html">
|
||||
* http://www.icu-project.org/userguide/unicodeSet.html</a>.
|
||||
* <a href="https://unicode-org.github.io/icu/userguide/strings/unicodeset">
|
||||
* https://unicode-org.github.io/icu/userguide/strings/unicodeset</a>.
|
||||
* Actual determination of property data is defined by the underlying
|
||||
* Unicode database as implemented by UCharacter.
|
||||
*
|
||||
|
@ -147,6 +147,13 @@ import jdk.internal.icu.util.VersionInfo;
|
|||
* their delimiters; "[:^foo]" and "\P{foo}". In any other location,
|
||||
* '^' has no special meaning.
|
||||
*
|
||||
* <p>Since ICU 70, "[^...]", "[:^foo]", "\P{foo}", and "[:binaryProperty=No:]"
|
||||
* perform a "code point complement" (all code points minus the original set),
|
||||
* removing all multicharacter strings,
|
||||
* equivalent to .{@link #complement()}.{@link #removeAllStrings()} .
|
||||
* The {@link #complement()} API function continues to perform a
|
||||
* symmetric difference with all code points and thus retains all multicharacter strings.
|
||||
*
|
||||
* <p>Ranges are indicated by placing a '-' between two
|
||||
* characters, as in "a-z". This specifies the range of all
|
||||
* characters from the left to the right, in Unicode order. If the
|
||||
|
@ -189,8 +196,6 @@ import jdk.internal.icu.util.VersionInfo;
|
|||
* Unicode property
|
||||
* </table>
|
||||
*
|
||||
* <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
|
||||
*
|
||||
* <p><b>Formal syntax</b></p>
|
||||
*
|
||||
* <blockquote>
|
||||
|
@ -230,9 +235,8 @@ import jdk.internal.icu.util.VersionInfo;
|
|||
* </tr>
|
||||
* <tr align="top">
|
||||
* <td nowrap valign="top" align="right"><code>hex := </code></td>
|
||||
* <td valign="top"><em>any character for which
|
||||
* </em><code>Character.digit(c, 16)</code><em>
|
||||
* returns a non-negative result</em></td>
|
||||
* <td style="vertical-align: top;"><code>'0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' |<br>
|
||||
* 'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f'</code></td>
|
||||
* </tr>
|
||||
* <tr>
|
||||
* <td nowrap valign="top" align="right"><code>property := </code></td>
|
||||
|
@ -487,7 +491,7 @@ public class UnicodeSet {
|
|||
else if (i > 0 && c == list[i-1]) {
|
||||
// c is after end of prior range
|
||||
list[i-1]++;
|
||||
// no need to chcek for collapse here
|
||||
// no need to check for collapse here
|
||||
}
|
||||
|
||||
else {
|
||||
|
@ -528,7 +532,6 @@ public class UnicodeSet {
|
|||
* present. If this set already contains the multicharacter,
|
||||
* the call leaves this set unchanged.
|
||||
* Thus {@code "ch" => {"ch"}}
|
||||
* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
|
||||
* @param s the source string
|
||||
* @return this object, for chaining
|
||||
* @stable ICU 2.0
|
||||
|
@ -546,22 +549,19 @@ public class UnicodeSet {
|
|||
|
||||
/**
|
||||
* Utility for getting code point from single code point CharSequence.
|
||||
* See the public UTF16.getSingleCodePoint()
|
||||
* See the public UTF16.getSingleCodePoint() (which returns -1 for null rather than throwing NPE).
|
||||
*
|
||||
* @return a code point IF the string consists of a single one.
|
||||
* otherwise returns -1.
|
||||
* @param s to test
|
||||
*/
|
||||
private static int getSingleCP(CharSequence s) {
|
||||
if (s.length() < 1) {
|
||||
throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
|
||||
}
|
||||
if (s.length() > 2) return -1;
|
||||
if (s.length() == 1) return s.charAt(0);
|
||||
|
||||
// at this point, len = 2
|
||||
int cp = UTF16.charAt(s, 0);
|
||||
if (cp > 0xFFFF) { // is surrogate pair
|
||||
return cp;
|
||||
if (s.length() == 2) {
|
||||
int cp = Character.codePointAt(s, 0);
|
||||
if (cp > 0xFFFF) { // is surrogate pair
|
||||
return cp;
|
||||
}
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
@ -569,13 +569,11 @@ public class UnicodeSet {
|
|||
/**
|
||||
* Complements the specified range in this set. Any character in
|
||||
* the range will be removed if it is in this set, or will be
|
||||
* added if it is not in this set. If {@code end > start}
|
||||
* added if it is not in this set. If <code>start > end</code>
|
||||
* then an empty range is complemented, leaving the set unchanged.
|
||||
*
|
||||
* @param start first character, inclusive, of range to be removed
|
||||
* from this set.
|
||||
* @param end last character, inclusive, of range to be removed
|
||||
* from this set.
|
||||
* @param start first character, inclusive, of range
|
||||
* @param end last character, inclusive, of range
|
||||
* @stable ICU 2.0
|
||||
*/
|
||||
public UnicodeSet complement(int start, int end) {
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2019, 2021, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2019, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -43,7 +43,7 @@ import static jdk.internal.icu.impl.NormalizerImpl.UTF16Plus;
|
|||
/**
|
||||
* Immutable Unicode code point trie.
|
||||
* Fast, reasonably compact, map from Unicode code points (U+0000..U+10FFFF) to integer values.
|
||||
* For details see http://site.icu-project.org/design/struct/utrie
|
||||
* For details see https://icu.unicode.org/design/struct/utrie
|
||||
*
|
||||
* <p>This class is not intended for public subclassing.
|
||||
*
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2005, 2022, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -54,7 +54,7 @@ public final class VersionInfo
|
|||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
@Deprecated
|
||||
public static final String ICU_DATA_VERSION_PATH = "67b";
|
||||
public static final String ICU_DATA_VERSION_PATH = "70b";
|
||||
|
||||
// public methods ------------------------------------------------------
|
||||
|
||||
|
@ -148,7 +148,15 @@ public final class VersionInfo
|
|||
*/
|
||||
public int compareTo(VersionInfo other)
|
||||
{
|
||||
return m_version_ - other.m_version_;
|
||||
// m_version_ is an int, a signed 32-bit integer.
|
||||
// When the major version is >=128, then the version int is negative.
|
||||
// Compare it in two steps to simulate an unsigned-int comparison.
|
||||
// (Alternatively we could turn each int into a long and reset the upper 32 bits.)
|
||||
// Compare the upper bits first, using logical shift right (unsigned).
|
||||
int diff = (m_version_ >>> 1) - (other.m_version_ >>> 1);
|
||||
if (diff != 0) { return diff; }
|
||||
// Compare the remaining bits.
|
||||
return (m_version_ & 1) - (other.m_version_ & 1);
|
||||
}
|
||||
|
||||
// private data members ----------------------------------------------
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue