8268081: Upgrade Unicode Data Files to 14.0.0

Reviewed-by: joehw, iris, lancea
This commit is contained in:
Naoto Sato 2022-01-12 19:17:18 +00:00
parent ddddec7d74
commit 0a094d7c28
38 changed files with 3333 additions and 1081 deletions

File diff suppressed because it is too large Load diff

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2016, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -35,8 +35,8 @@ final class Grapheme {
* <p>
* See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
* for the extended grapheme cluster boundary rules. The following implementation
* is based on version 12.0 of the annex.
* (http://www.unicode.org/reports/tr29/tr29-35.html)
* is based on the annex for Unicode version 14.0.
* (http://www.unicode.org/reports/tr29/tr29-38.html)
*
* @param src the {@code CharSequence} to be scanned
* @param off offset to start looking for the next boundary in the src
@ -97,7 +97,7 @@ final class Grapheme {
private static final int FIRST_TYPE = 0;
private static final int LAST_TYPE = 14;
private static boolean[][] rules;
private static final boolean[][] rules;
static {
rules = new boolean[LAST_TYPE + 1][LAST_TYPE + 1];
// GB 999 Any + Any -> default
@ -201,8 +201,9 @@ final class Grapheme {
if (cp == 0x200D)
return ZWJ;
if (cp >= 0x0600 && cp <= 0x0605 ||
cp == 0x06DD || cp == 0x070F || cp == 0x08E2 ||
cp == 0x110BD || cp == 0x110CD)
cp == 0x06DD || cp == 0x070F ||
cp == 0x0890 || cp == 0x0891 ||
cp == 0x08E2 || cp == 0x110BD || cp == 0x110CD)
return PREPEND;
return CONTROL;
case Character.NON_SPACING_MARK:

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2005, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -76,9 +76,6 @@ public final class Punycode {
// TODO: eliminate the 256 limitation
private static final int MAX_CP_COUNT = 256;
private static final int UINT_MAGIC = 0x80000000;
private static final long ULONG_MAGIC = 0x8000000000000000L;
private static int adaptBias(int delta, int length, boolean firstTime){
if(firstTime){
delta /=DAMP;
@ -96,34 +93,25 @@ public final class Punycode {
}
/**
* basicToDigit[] contains the numeric value of a basic code
* point (for use in representing integers) in the range 0 to
* BASE-1, or -1 if b is does not represent a value.
* @return the numeric value of a basic code point (for use in representing integers)
* in the range 0 to BASE-1, or a negative value if cp is invalid.
*/
static final int[] basicToDigit= new int[]{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
private static final int decodeDigit(int cp) {
if(cp<='Z') {
if(cp<='9') {
if(cp<'0') {
return -1;
} else {
return cp-'0'+26; // 0..9 -> 26..35
}
} else {
return cp-'A'; // A-Z -> 0..25
}
} else if(cp<='z') {
return cp-'a'; // a..z -> 0..25
} else {
return -1;
}
};
private static char asciiCaseMap(char b, boolean uppercase) {
@ -158,6 +146,12 @@ public final class Punycode {
return (char)((ZERO-26)+digit);
}
}
// ICU-13727: Limit input length for n^2 algorithm
// where well-formed strings are at most 59 characters long.
private static final int ENCODE_MAX_CODE_UNITS = 1000;
private static final int DECODE_MAX_CHARS = 2000;
/**
* Converts Unicode to Punycode.
* The input string must not contain single, unpaired surrogates.
@ -174,6 +168,10 @@ public final class Punycode {
int n, delta, handledCPCount, basicLength, destLength, bias, j, m, q, k, t, srcCPCount;
char c, c2;
int srcLength = src.length();
if (srcLength > ENCODE_MAX_CODE_UNITS) {
throw new RuntimeException(
"input too long: " + srcLength + " UTF-16 code units");
}
int destCapacity = MAX_CP_COUNT;
char[] dest = new char[destCapacity];
StringBuffer result = new StringBuffer();
@ -251,7 +249,7 @@ public final class Punycode {
* Increase delta enough to advance the decoder's
* <n,i> state to <m,0>, but guard against overflow:
*/
if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
if(m-n>(0x7fffffff-handledCPCount-delta)/(handledCPCount+1)) {
throw new RuntimeException("Internal program error");
}
delta+=(m-n)*(handledCPCount+1);
@ -332,6 +330,9 @@ public final class Punycode {
public static StringBuffer decode(StringBuffer src, boolean[] caseFlags)
throws ParseException{
int srcLength = src.length();
if (srcLength > DECODE_MAX_CHARS) {
throw new RuntimeException("input too long: " + srcLength + " characters");
}
StringBuffer result = new StringBuffer();
int n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
destCPCount, firstSupplementaryIndex, cpLength;
@ -395,7 +396,7 @@ public final class Punycode {
throw new ParseException("Illegal char found", -1);
}
digit=basicToDigit[(byte)src.charAt(in++)];
digit=decodeDigit(src.charAt(in++));
if(digit<0) {
throw new ParseException("Invalid char found", -1);
}

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -134,9 +134,15 @@ public class UnicodeSetStringSpan {
int i, spanLength;
someRelevant = false;
for (i = 0; i < stringsLength; ++i) {
for (i = 0; i < stringsLength;) {
String string = strings.get(i);
int length16 = string.length();
if (length16 == 0) {
// Remove the empty string.
strings.remove(i);
--stringsLength;
continue;
}
spanLength = spanSet.span(string, SpanCondition.CONTAINED);
if (spanLength < length16) { // Relevant string.
someRelevant = true;
@ -144,6 +150,7 @@ public class UnicodeSetStringSpan {
if (/* (0 != (which & UTF16)) && */ length16 > maxLength16) {
maxLength16 = length16;
}
++i;
}
if (!someRelevant && (which & WITH_COUNT) == 0) {
return;

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2009, 2020, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2009, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -84,7 +84,7 @@ import jdk.internal.icu.util.VersionInfo;
* <p>
* Further detail on differences can be determined using the program
* <a href=
* "http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java">
* "https://github.com/unicode-org/icu/blob/main/icu4j/main/tests/core/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java">
* com.ibm.icu.dev.test.lang.UCharacterCompare</a>
* </p>
* <p>
@ -101,9 +101,9 @@ import jdk.internal.icu.util.VersionInfo;
* For more information see
* <a href="http://www.unicode.org/ucd/">"About the Unicode Character Database"</a>
* (http://www.unicode.org/ucd/)
* and the <a href="http://www.icu-project.org/userguide/properties.html">ICU
* and the <a href="https://unicode-org.github.io/icu/userguide/strings/properties">ICU
* User Guide chapter on Properties</a>
* (http://www.icu-project.org/userguide/properties.html).
* (https://unicode-org.github.io/icu/userguide/strings/properties).
* </p>
* <p>
* There are also functions that provide easy migration from C/POSIX functions

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2005, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -53,7 +53,7 @@ public final class UCharacterDirection implements UCharacterEnums.ECharacterDire
// private constructor =========================================
///CLOVER:OFF
/**
* Private constructor to prevent initialisation
* Private constructor to prevent initialization
*/
private UCharacterDirection()
{

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2005, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -62,7 +62,7 @@ package jdk.internal.icu.lang;
@Deprecated
class UCharacterEnums {
/** This is just a namespace, it is not instantiatable. */
/** This is just a namespace, it is not instantiable. */
private UCharacterEnums() {};
/**

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2009, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2009, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -63,7 +63,7 @@ import jdk.internal.icu.impl.UBiDiProps;
*
* This is an implementation of the Unicode Bidirectional Algorithm. The
* algorithm is defined in the
* <a href="http://www.unicode.org/reports/tr9/">Unicode Standard Annex #9:
* <a href="https://www.unicode.org/reports/tr9/">Unicode Standard Annex #9:
* Unicode Bidirectional Algorithm</a>.
* <p>
*
@ -985,7 +985,7 @@ public class BidiBase {
/**
* Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3).
* Used in
* <a href="http://www.unicode.org/reports/tr9/">Unicode Standard Annex #9:
* <a href="https://www.unicode.org/reports/tr9/">Unicode Standard Annex #9:
* Unicode Bidirectional Algorithm</a>.
* Returns UCharacter.BidiPairedBracketType values.
* @stable ICU 52
@ -3365,7 +3365,7 @@ public class BidiBase {
/**
* Perform the Unicode Bidi algorithm. It is defined in the
* <a href="http://www.unicode.org/reports/tr9/">Unicode Standard Annex #9:
* <a href="https://www.unicode.org/reports/tr9/">Unicode Standard Annex #9:
* Unicode Bidirectional Algorithm</a>, version 13,
* also described in The Unicode Standard, Version 4.0 .<p>
*
@ -3450,7 +3450,7 @@ public class BidiBase {
/**
* Perform the Unicode Bidi algorithm. It is defined in the
* <a href="http://www.unicode.org/reports/tr9/">Unicode Standard Annex #9:
* <a href="https://www.unicode.org/reports/tr9/">Unicode Standard Annex #9:
* Unicode Bidirectional Algorithm</a>, version 13,
* also described in The Unicode Standard, Version 4.0 .<p>
*
@ -3786,7 +3786,7 @@ public class BidiBase {
/**
* Perform the Unicode Bidi algorithm on a given paragraph, as defined in the
* <a href="http://www.unicode.org/reports/tr9/">Unicode Standard Annex #9:
* <a href="https://www.unicode.org/reports/tr9/">Unicode Standard Annex #9:
* Unicode Bidirectional Algorithm</a>, version 13,
* also described in The Unicode Standard, Version 4.0 .<p>
*

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2009, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2009, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -47,7 +47,7 @@ final class BidiLine {
* text in a single paragraph or in a line of a single paragraph
* which has already been processed according to
* the Unicode 3.0 Bidi algorithm as defined in
* <a href="http://www.unicode.org/reports/tr9/">Unicode Standard Annex #9:
* <a href="https://www.unicode.org/reports/tr9/">Unicode Standard Annex #9:
* Unicode Bidirectional Algorithm</a>, version 13,
* also described in The Unicode Standard, Version 4.0.1 .
*

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -43,7 +43,7 @@ import jdk.internal.icu.impl.Norm2AllModes;
* The primary functions are to produce a normalized string and to detect whether
* a string is already normalized.
* The most commonly used normalization forms are those defined in
* <a href="http://www.unicode.org/reports/tr15/">Unicode Standard Annex #15:
* <a href="https://www.unicode.org/reports/tr15/">Unicode Standard Annex #15:
* Unicode Normalization Forms</a>.
* However, this API supports additional normalization forms for specialized purposes.
* For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2005, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2005, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -44,7 +44,7 @@ import java.text.Normalizer;
* <code>normalize</code> transforms Unicode text into an equivalent composed or
* decomposed form, allowing for easier sorting and searching of text.
* <code>normalize</code> supports the standard normalization forms described in
* <a href="http://www.unicode.org/reports/tr15/" target="unicode">
* <a href="https://www.unicode.org/reports/tr15/" target="unicode">
* Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
*
* Characters with accents or other adornments can be encoded in

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2005, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -67,9 +67,9 @@ import jdk.internal.icu.util.VersionInfo;
* <li> Unassigned Table: Contains code points that are unassigned
* in the Unicode Version supported by StringPrep. Currently
* RFC 3454 supports Unicode 3.2. </li>
* <li> Prohibited Table: Contains code points that are prohibted from
* <li> Prohibited Table: Contains code points that are prohibited from
* the output of the StringPrep processing function. </li>
* <li> Mapping Table: Contains code ponts that are deleted from the output or case mapped. </li>
* <li> Mapping Table: Contains code points that are deleted from the output or case mapped. </li>
* </ul>
*
* The procedure for preparing Unicode strings:
@ -226,8 +226,8 @@ public final class StringPrep {
sprepUniVer = getVersionInfo(reader.getUnicodeVersion());
normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
VersionInfo normUniVer = UCharacter.getUnicodeVersion();
if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Vesion of the normalization data */
normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Vesion of the normalization data */
if(normUniVer.compareTo(sprepUniVer) < 0 && /* the Unicode version of SPREP file must be less than the Unicode Version of the normalization data */
normUniVer.compareTo(normCorrVer) < 0 && /* the Unicode version of the NormalizationCorrections.txt file should be less than the Unicode Version of the normalization data */
((indexes[OPTIONS] & NORMALIZATION_ON) > 0) /* normalization turned on*/
){
throw new IOException("Normalization Correction version not supported");
@ -325,7 +325,7 @@ public final class StringPrep {
ch -= val.value;
}
}else if(val.type == DELETE){
// just consume the codepoint and contine
// just consume the codepoint and continue
continue;
}
//copy the source into destination

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -147,9 +147,9 @@ public abstract class UCharacterIterator
*/
public int nextCodePoint(){
int ch1 = next();
if(UTF16.isLeadSurrogate((char)ch1)){
if(UTF16.isLeadSurrogate(ch1)){
int ch2 = next();
if(UTF16.isTrailSurrogate((char)ch2)){
if(UTF16.isTrailSurrogate(ch2)){
return UCharacterProperty.getRawSupplementary((char)ch1,
(char)ch2);
}else if (ch2 != DONE) {
@ -175,7 +175,7 @@ public abstract class UCharacterIterator
/**
* Retreat to the start of the previous code point in the text,
* and return it (pre-decrement semantics). If the index is not
* preceeded by a valid surrogate pair, the behavior is the same
* preceded by a valid surrogate pair, the behavior is the same
* as <code>previous()</code>. Otherwise the iterator is
* decremented to the start of the surrogate pair, and the code
* point represented by the pair is returned.
@ -185,9 +185,9 @@ public abstract class UCharacterIterator
*/
public int previousCodePoint(){
int ch1 = previous();
if(UTF16.isTrailSurrogate((char)ch1)){
if(UTF16.isTrailSurrogate(ch1)){
int ch2 = previous();
if(UTF16.isLeadSurrogate((char)ch2)){
if(UTF16.isLeadSurrogate(ch2)){
return UCharacterProperty.getRawSupplementary((char)ch2,
(char)ch1);
}else if (ch2 != DONE) {

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2005, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -382,36 +382,39 @@ public final class UTF16
}
/**
* Determines whether the code value is a surrogate.
* @param char16 the input character.
* @return true if the input character is a surrogate.
* @stable ICU 2.1
* Determines whether the code point is a surrogate.
*
* @param codePoint The input character.
* (In ICU 2.1-69 the type of this parameter was <code>char</code>.)
* @return true If the input code point is a surrogate.
* @stable ICU 70
*/
public static boolean isSurrogate(char char16)
{
return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS;
public static boolean isSurrogate(int codePoint) {
    // Mask off the low bits and compare what remains with the surrogate bit pattern.
    boolean matchesSurrogatePattern = SURROGATE_BITS == (codePoint & SURROGATE_BITMASK);
    return matchesSurrogatePattern;
}
/**
* Determines whether the character is a trail surrogate.
* @param char16 the input character.
* @return true if the input character is a trail surrogate.
* @stable ICU 2.1
* Determines whether the code point is a trail surrogate.
*
* @param codePoint The input character.
* (In ICU 2.1-69 the type of this parameter was <code>char</code>.)
* @return true If the input code point is a trail surrogate.
* @stable ICU 70
*/
public static boolean isTrailSurrogate(char char16)
{
return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
public static boolean isTrailSurrogate(int codePoint) {
    // Mask off the low bits and compare what remains with the trail-surrogate bit pattern.
    boolean matchesTrailPattern = TRAIL_SURROGATE_BITS == (codePoint & TRAIL_SURROGATE_BITMASK);
    return matchesTrailPattern;
}
/**
* Determines whether the character is a lead surrogate.
* @param char16 the input character.
* @return true if the input character is a lead surrogate
* @stable ICU 2.1
* Determines whether the code point is a lead surrogate.
*
* @param codePoint The input character.
* (In ICU 2.1-69 the type of this parameter was <code>char</code>.)
* @return true If the input code point is a lead surrogate
* @stable ICU 70
*/
public static boolean isLeadSurrogate(char char16)
{
return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
public static boolean isLeadSurrogate(int codePoint) {
    // Mask off the low bits and compare what remains with the lead-surrogate bit pattern.
    boolean matchesLeadPattern = LEAD_SURROGATE_BITS == (codePoint & LEAD_SURROGATE_BITMASK);
    return matchesLeadPattern;
}
/**

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2005, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -135,8 +135,8 @@ import jdk.internal.icu.util.VersionInfo;
* "[:Lu:]" and the Perl-like syntax "\p{Lu}" are recognized. For a
* complete list of supported property patterns, see the User's Guide
* for UnicodeSet at
* <a href="http://www.icu-project.org/userguide/unicodeSet.html">
* http://www.icu-project.org/userguide/unicodeSet.html</a>.
* <a href="https://unicode-org.github.io/icu/userguide/strings/unicodeset">
* https://unicode-org.github.io/icu/userguide/strings/unicodeset</a>.
* Actual determination of property data is defined by the underlying
* Unicode database as implemented by UCharacter.
*
@ -147,6 +147,13 @@ import jdk.internal.icu.util.VersionInfo;
* their delimiters; "[:^foo]" and "\P{foo}". In any other location,
* '^' has no special meaning.
*
* <p>Since ICU 70, "[^...]", "[:^foo]", "\P{foo}", and "[:binaryProperty=No:]"
* perform a "code point complement" (all code points minus the original set),
* removing all multicharacter strings,
* equivalent to .{@link #complement()}.{@link #removeAllStrings()} .
* The {@link #complement()} API function continues to perform a
* symmetric difference with all code points and thus retains all multicharacter strings.
*
* <p>Ranges are indicated by placing a '-' between two
* characters, as in "a-z". This specifies the range of all
* characters from the left to the right, in Unicode order. If the
@ -189,8 +196,6 @@ import jdk.internal.icu.util.VersionInfo;
* Unicode property
* </table>
*
* <p><b>Warning</b>: you cannot add an empty string ("") to a UnicodeSet.</p>
*
* <p><b>Formal syntax</b></p>
*
* <blockquote>
@ -230,9 +235,8 @@ import jdk.internal.icu.util.VersionInfo;
* </tr>
* <tr align="top">
* <td nowrap valign="top" align="right"><code>hex :=&nbsp; </code></td>
* <td valign="top"><em>any character for which
* </em><code>Character.digit(c, 16)</code><em>
* returns a non-negative result</em></td>
* <td style="vertical-align: top;"><code>'0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' |<br>
* &nbsp;&nbsp;&nbsp;&nbsp;'A' | 'B' | 'C' | 'D' | 'E' | 'F' | 'a' | 'b' | 'c' | 'd' | 'e' | 'f'</code></td>
* </tr>
* <tr>
* <td nowrap valign="top" align="right"><code>property :=&nbsp; </code></td>
@ -487,7 +491,7 @@ public class UnicodeSet {
else if (i > 0 && c == list[i-1]) {
// c is after end of prior range
list[i-1]++;
// no need to chcek for collapse here
// no need to check for collapse here
}
else {
@ -528,7 +532,6 @@ public class UnicodeSet {
* present. If this set already contains the multicharacter,
* the call leaves this set unchanged.
* Thus {@code "ch" => {"ch"}}
* <br><b>Warning: you cannot add an empty string ("") to a UnicodeSet.</b>
* @param s the source string
* @return this object, for chaining
* @stable ICU 2.0
@ -546,22 +549,19 @@ public class UnicodeSet {
/**
* Utility for getting code point from single code point CharSequence.
* See the public UTF16.getSingleCodePoint()
* See the public UTF16.getSingleCodePoint() (which returns -1 for null rather than throwing NPE).
*
* @return a code point IF the string consists of a single one.
* otherwise returns -1.
* @param s to test
*/
private static int getSingleCP(CharSequence s) {
if (s.length() < 1) {
throw new IllegalArgumentException("Can't use zero-length strings in UnicodeSet");
}
if (s.length() > 2) return -1;
// A sequence of exactly one UTF-16 code unit is returned as that unit's value.
if (s.length() == 1) return s.charAt(0);
// at this point, len = 2
int cp = UTF16.charAt(s, 0);
if (cp > 0xFFFF) { // is surrogate pair
return cp;
// Two code units count as a single code point only when they decode to a
// supplementary code point (a value above 0xFFFF).
if (s.length() == 2) {
int cp = Character.codePointAt(s, 0);
if (cp > 0xFFFF) { // is surrogate pair
return cp;
}
}
// Every other input is not a single code point.
return -1;
}
@ -569,13 +569,11 @@ public class UnicodeSet {
/**
* Complements the specified range in this set. Any character in
* the range will be removed if it is in this set, or will be
* added if it is not in this set. If {@code end > start}
* added if it is not in this set. If <code>start &gt; end</code>
* then an empty range is complemented, leaving the set unchanged.
*
* @param start first character, inclusive, of range to be removed
* from this set.
* @param end last character, inclusive, of range to be removed
* from this set.
* @param start first character, inclusive, of range
* @param end last character, inclusive, of range
* @stable ICU 2.0
*/
public UnicodeSet complement(int start, int end) {

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2019, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2019, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -43,7 +43,7 @@ import static jdk.internal.icu.impl.NormalizerImpl.UTF16Plus;
/**
* Immutable Unicode code point trie.
* Fast, reasonably compact, map from Unicode code points (U+0000..U+10FFFF) to integer values.
* For details see http://site.icu-project.org/design/struct/utrie
* For details see https://icu.unicode.org/design/struct/utrie
*
* <p>This class is not intended for public subclassing.
*

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2005, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -54,7 +54,7 @@ public final class VersionInfo
* @deprecated This API is ICU internal only.
*/
@Deprecated
public static final String ICU_DATA_VERSION_PATH = "67b";
public static final String ICU_DATA_VERSION_PATH = "70b";
// public methods ------------------------------------------------------
@ -148,7 +148,15 @@ public final class VersionInfo
*/
public int compareTo(VersionInfo other)
{
return m_version_ - other.m_version_;
// m_version_ is an int, a signed 32-bit integer.
// When the major version is >=128, then the version int is negative.
// Compare it in two steps to simulate an unsigned-int comparison.
// (Alternatively we could turn each int into a long and reset the upper 32 bits.)
// Compare the upper bits first, using logical shift right (unsigned).
// After ">>> 1" both operands lie in 0..0x7FFFFFFF, so the subtraction
// below cannot overflow and its sign reliably reflects the ordering.
int diff = (m_version_ >>> 1) - (other.m_version_ >>> 1);
if (diff != 0) { return diff; }
// Compare the remaining bits.
// Only the lowest bit is left, so the result here is -1, 0, or 1.
return (m_version_ & 1) - (other.m_version_ & 1);
}
// private data members ----------------------------------------------