mirror of
https://github.com/openjdk/jdk.git
synced 2025-08-28 15:24:43 +02:00
8221431: Support for Unicode 12.1
Reviewed-by: erikj, rriggs
This commit is contained in:
parent
e4f31b1cd7
commit
93fabcdc5a
66 changed files with 60279 additions and 38178 deletions
|
@ -0,0 +1,501 @@
|
|||
/*
|
||||
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
// (c) 2018 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html#License
|
||||
|
||||
// created: 2018may10 Markus W. Scherer
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/**
 * Read-only abstract mapping from each Unicode code point (U+0000..U+10FFFF)
 * to an {@code int} value. This type is unrelated to {@code java.util.Map}.
 *
 * <p>Iterating over a map yields its same-value code point ranges.
 *
 * @draft ICU 63
 * @provisional This API might change or be removed in a future release.
 */
public abstract class CodePointMap implements Iterable<CodePointMap.Range> {
    /**
     * Chooses how {@code getRange()} reports value ranges that overlap with surrogates.
     * {@link #NORMAL} is the right choice for most callers.
     *
     * @see #getRange
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public enum RangeOption {
        /**
         * Ranges are reported exactly as stored in the map.
         * Most users should use this option.
         *
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        NORMAL,
        /**
         * Ranges are reported as stored in the map, except that the lead surrogates
         * (U+D800..U+DBFF) are reported as if they mapped to the surrogateValue
         * that is passed to getRange() as a separate parameter.
         * The surrogateValue is not passed through the value filter.
         * See {@link Character#isHighSurrogate}.
         *
         * <p>Most users should use NORMAL instead.
         *
         * <p>Intended for maps that store special values for surrogate code *units*
         * (for optimized UTF-16 string processing, or for special error behavior
         * on unpaired surrogates) where those values must not be attributed
         * to the lead surrogate code *points*.
         *
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        FIXED_LEAD_SURROGATES,
        /**
         * Ranges are reported as stored in the map, except that all surrogates
         * (U+D800..U+DFFF) are reported as if they mapped to the surrogateValue
         * that is passed to getRange() as a separate parameter.
         * The surrogateValue is not passed through the value filter.
         * See {@link Character#isSurrogate}.
         *
         * <p>Most users should use NORMAL instead.
         *
         * <p>Intended for maps that store special values for surrogate code *units*
         * (for optimized UTF-16 string processing, or for special error behavior
         * on unpaired surrogates) where those values must not be attributed
         * to the surrogate code *points*.
         *
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        FIXED_ALL_SURROGATES
    }

    /**
     * Callback that transforms a map value; optionally applied by getRange(),
     * which then reports the transformed value.
     *
     * <p>Typical uses: masking off irrelevant value bits, testing for one of
     * several values, or deriving a value index from the map value.
     *
     * @see #getRange
     * @see #iterator
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public interface ValueFilter {
        /**
         * Transforms one map value.
         *
         * @param value map value
         * @return modified value
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        public int apply(int value);
    }

    /**
     * Mutable result holder for range iteration: all code points from start
     * through end (inclusive) share one value. That value may be the output of
     * {@link ValueFilter#apply(int)}, or the surrogateValue when a
     * {@link RangeOption} other than NORMAL was requested.
     *
     * @see #getRange
     * @see #iterator
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public static final class Range {
        private int start = -1;
        private int end = -1;
        private int value;  // defaults to 0

        /**
         * Creates a range with start and end equal to -1 and value equal to 0.
         *
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        public Range() {
        }

        /**
         * @return the start code point
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        public int getStart() {
            return start;
        }

        /**
         * @return the (inclusive) end code point
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        public int getEnd() {
            return end;
        }

        /**
         * @return the range value
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        public int getValue() {
            return value;
        }

        /**
         * Overwrites all three fields. The iterator returned by
         * {@link CodePointMap#iterator()} continues after whatever end is set here.
         *
         * @param start new start code point
         * @param end new end code point
         * @param value new value
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        public void set(int start, int end, int value) {
            this.value = value;
            this.end = end;
            this.start = start;
        }
    }

    /**
     * Iterator over the unfiltered same-value ranges of the enclosing map.
     * One Range instance is reused for every element.
     */
    private final class RangeIterator implements Iterator<Range> {
        private Range current = new Range();

        @Override
        public boolean hasNext() {
            // end == -1 means "nothing fetched yet"; 0x10ffff means "all consumed".
            final int lastEnd = current.end;
            return lastEnd >= -1 && lastEnd < 0x10ffff;
        }

        @Override
        public Range next() {
            final int nextStart = current.end + 1;
            if (!getRange(nextStart, null, current)) {
                throw new NoSuchElementException();
            }
            return current;
        }

        @Override
        public final void remove() {
            throw new UnsupportedOperationException();
        }
    }

    /**
     * Walks the code points of a string forward or backward and looks up the
     * map value of each one. This is not a java.util.Iterator.
     *
     * <pre>
     * void onString(CodePointMap map, CharSequence s, int start) {
     *     CodePointMap.StringIterator iter = map.stringIterator(s, start);
     *     while (iter.next()) {
     *         int end = iter.getIndex();  // code point from between start and end
     *         useValue(s, start, end, iter.getCodePoint(), iter.getValue());
     *         start = end;
     *     }
     * }
     * </pre>
     *
     * <p>This class is not intended for public subclassing.
     *
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public class StringIterator {
        /**
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        protected CharSequence s;
        /**
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        protected int sIndex;
        /**
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        protected int c;
        /**
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        protected int value;

        /**
         * @internal
         * @deprecated This API is ICU internal only.
         */
        @Deprecated
        protected StringIterator(CharSequence s, int sIndex) {
            this.sIndex = sIndex;
            this.s = s;
            this.value = 0;
            this.c = -1;
        }

        /**
         * Points the iterator at a new string and/or a new string index,
         * and clears the current code point (-1) and value (0).
         *
         * @param s string to iterate over
         * @param sIndex string index where the iteration will start
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        public void reset(CharSequence s, int sIndex) {
            this.sIndex = sIndex;
            this.s = s;
            this.value = 0;
            this.c = -1;
        }

        /**
         * Reads the code point at the string index, moves the index past it,
         * and fetches its value from the map.
         * For an unpaired surrogate the value is whatever the map's get() returns for it.
         *
         * @return true if the string index was not yet at the end of the string;
         *         otherwise the iterator did not advance
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        public boolean next() {
            if (sIndex < s.length()) {
                final int codePoint = Character.codePointAt(s, sIndex);
                c = codePoint;
                sIndex += Character.charCount(codePoint);
                value = get(codePoint);
                return true;
            }
            return false;
        }

        /**
         * Reads the code point before the string index, moves the index back
         * to its start, and fetches its value from the map.
         * For an unpaired surrogate the value is whatever the map's get() returns for it.
         *
         * @return true if the string index was not yet at the start of the string;
         *         otherwise the iterator did not advance
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        public boolean previous() {
            if (sIndex > 0) {
                final int codePoint = Character.codePointBefore(s, sIndex);
                c = codePoint;
                sIndex -= Character.charCount(codePoint);
                value = get(codePoint);
                return true;
            }
            return false;
        }

        /**
         * @return the string index
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        public final int getIndex() {
            return sIndex;
        }

        /**
         * @return the code point
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        public final int getCodePoint() {
            return c;
        }

        /**
         * @return the map value,
         *         or an implementation-defined error value if
         *         the code point is an unpaired surrogate
         * @draft ICU 63
         * @provisional This API might change or be removed in a future release.
         */
        public final int getValue() {
            return value;
        }
    }

    /**
     * Protected no-args constructor.
     *
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    protected CodePointMap() {
    }

    /**
     * Looks up the stored value for a code point, with range checking.
     *
     * @param c the code point
     * @return the map value,
     *         or an implementation-defined error value if
     *         the code point is not in the range 0..U+10FFFF
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public abstract int get(int c);

    /**
     * Fills in the range object with the same-value range that begins at start.
     * The reported start always equals the start argument, even when earlier
     * code points carry the same value; the reported end is the last code point
     * such that everything from start through end has one value.
     * Returns false if start is not 0..U+10FFFF.
     *
     * <p>This allows efficient enumeration of all same-value ranges of a map.
     * (Normally faster than calling get() per code point, but possibly much
     * slower than a data structure that stores ranges directly.)
     *
     * <p>When the {@link ValueFilter} is not null, every value is passed through it,
     * and the range extends as far as the filtered values stay equal.
     * With a null filter the stored values are used as is.
     *
     * <p>Example:
     * <pre>
     * int start = 0;
     * CodePointMap.Range range = new CodePointMap.Range();
     * while (map.getRange(start, null, range)) {
     *     int end = range.getEnd();
     *     int value = range.getValue();
     *     // Work with the range start..end and its value.
     *     start = end + 1;
     * }
     * </pre>
     *
     * @param start range start
     * @param filter an object that may modify the map data value,
     *     or null if the values from the map are to be used unmodified
     * @param range the range object that will be set to the code point range and value
     * @return true if start is 0..U+10FFFF; otherwise no new range is fetched
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public abstract boolean getRange(int start, ValueFilter filter, Range range);

    /**
     * Same as {@link #getRange(int, ValueFilter, Range)}, except that, depending on
     * the option, surrogate code points are reported as mapping to surrogateValue
     * and the fetched range is split or merged accordingly.
     * Returns false if start is not 0..U+10FFFF.
     *
     * @param start range start
     * @param option defines whether surrogates are treated normally,
     *         or as having the surrogateValue; usually {@link RangeOption#NORMAL}
     * @param surrogateValue value for surrogates; ignored if option=={@link RangeOption#NORMAL}
     * @param filter an object that may modify the map data value,
     *     or null if the values from the map are to be used unmodified
     * @param range the range object that will be set to the code point range and value
     * @return true if start is 0..U+10FFFF; otherwise no new range is fetched
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public boolean getRange(int start, RangeOption option, int surrogateValue,
            ValueFilter filter, Range range) {
        assert option != null;
        if (!getRange(start, filter, range)) {
            return false;
        }
        if (option == RangeOption.NORMAL) {
            return true;
        }

        final int beforeSurrogates = 0xd7ff;
        final int lastFixed = (option == RangeOption.FIXED_ALL_SURROGATES) ? 0xdfff : 0xdbff;
        final int fetchedEnd = range.end;
        if (start > lastFixed || fetchedEnd < beforeSurrogates) {
            // Entirely above the fixed surrogates, or ends before U+D7FF: nothing to adjust.
            return true;
        }

        // The range overlaps with surrogates, or ends just before the first one.
        if (range.value != surrogateValue) {
            if (start <= beforeSurrogates) {
                // Non-surrValue range ends before surrValue surrogates.
                range.end = beforeSurrogates;
                return true;
            }
            // Start is a surrogate with a non-surrValue code *unit* value.
            // Report a surrValue code *point* range instead.
            range.value = surrogateValue;
            if (fetchedEnd > lastFixed) {
                // Surrogate range ends before non-surrValue rest of range.
                range.end = lastFixed;
                return true;
            }
        } else if (fetchedEnd >= lastFixed) {
            // Surrogates followed by a non-surrValue range,
            // or surrogates are part of a larger surrValue range.
            return true;
        }

        // Try to merge the surrValue surrogate range with the range that
        // immediately follows the fixed surrogates.
        final boolean merged =
                getRange(lastFixed + 1, filter, range) && range.value == surrogateValue;
        range.start = start;
        if (!merged) {
            range.end = lastFixed;
            range.value = surrogateValue;
        }
        return true;
    }

    /**
     * Convenience iterator over the same-value code point ranges of this map;
     * equivalent to looping with {@link #getRange(int, ValueFilter, Range)}
     * and no filter. Adjacent ranges have different map values.
     *
     * <p>The iterator always returns the same Range object.
     *
     * @return a Range iterator
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    @Override
    public Iterator<Range> iterator() {
        return new RangeIterator();
    }

    /**
     * Creates an iterator (not a java.util.Iterator) over the code points of a
     * string that fetches the map value of each code point.
     *
     * @param s string to iterate over
     * @param sIndex string index where the iteration will start
     * @return the iterator
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public StringIterator stringIterator(CharSequence s, int sIndex) {
        return new StringIterator(s, sIndex);
    }
}
|
1310
src/java.base/share/classes/sun/text/normalizer/CodePointTrie.java
Normal file
1310
src/java.base/share/classes/sun/text/normalizer/CodePointTrie.java
Normal file
File diff suppressed because it is too large
Load diff
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -119,10 +119,7 @@ public final class ICUBinary {
|
|||
} else if (capacity < 0x4000) {
|
||||
capacity *= 2; // Grow faster until we reach 16kB.
|
||||
}
|
||||
// TODO Java 6 replace new byte[] and arraycopy(): bytes = Arrays.copyOf(bytes, capacity);
|
||||
byte[] newBytes = new byte[capacity];
|
||||
System.arraycopy(bytes, 0, newBytes, 0, length);
|
||||
bytes = newBytes;
|
||||
bytes = Arrays.copyOf(bytes, capacity);
|
||||
bytes[length++] = (byte) nextByte;
|
||||
}
|
||||
}
|
||||
|
@ -264,6 +261,36 @@ public final class ICUBinary {
|
|||
}
|
||||
}
|
||||
|
||||
public static byte[] getBytes(ByteBuffer bytes, int length, int additionalSkipLength) {
|
||||
byte[] dest = new byte[length];
|
||||
bytes.get(dest);
|
||||
if (additionalSkipLength > 0) {
|
||||
skipBytes(bytes, additionalSkipLength);
|
||||
}
|
||||
return dest;
|
||||
}
|
||||
|
||||
public static String getString(ByteBuffer bytes, int length, int additionalSkipLength) {
|
||||
CharSequence cs = bytes.asCharBuffer();
|
||||
String s = cs.subSequence(0, length).toString();
|
||||
skipBytes(bytes, length * 2 + additionalSkipLength);
|
||||
return s;
|
||||
}
|
||||
|
||||
public static char[] getChars(ByteBuffer bytes, int length, int additionalSkipLength) {
|
||||
char[] dest = new char[length];
|
||||
bytes.asCharBuffer().get(dest);
|
||||
skipBytes(bytes, length * 2 + additionalSkipLength);
|
||||
return dest;
|
||||
}
|
||||
|
||||
public static int[] getInts(ByteBuffer bytes, int length, int additionalSkipLength) {
|
||||
int[] dest = new int[length];
|
||||
bytes.asIntBuffer().get(dest);
|
||||
skipBytes(bytes, length * 4 + additionalSkipLength);
|
||||
return dest;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a VersionInfo for the bytes in the compact version integer.
|
||||
*/
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2015, 2019, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -265,7 +265,7 @@ final class Norm2AllModes {
|
|||
private static final class Norm2AllModesSingleton {
|
||||
private Norm2AllModesSingleton(String name) {
|
||||
try {
|
||||
String DATA_FILE_NAME = "/sun/text/resources/" + name + ".icu";
|
||||
String DATA_FILE_NAME = "/sun/text/resources/" + name + ".nrm";
|
||||
NormalizerImpl impl=new NormalizerImpl().load(DATA_FILE_NAME);
|
||||
allModes=new Norm2AllModes(impl);
|
||||
} catch (RuntimeException e) {
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -61,7 +61,7 @@ public final class NormalizerImpl {
|
|||
return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
|
||||
}
|
||||
|
||||
/**
|
||||
/**
|
||||
* Decomposes c, which must be a Hangul syllable, into buffer
|
||||
* and returns the length of the decomposition (2 or 3).
|
||||
*/
|
||||
|
@ -145,8 +145,7 @@ public final class NormalizerImpl {
|
|||
insert(c, cc);
|
||||
}
|
||||
}
|
||||
// s must be in NFD, otherwise change the implementation.
|
||||
public void append(CharSequence s, int start, int limit,
|
||||
public void append(CharSequence s, int start, int limit, boolean isNFD,
|
||||
int leadCC, int trailCC) {
|
||||
if(start==limit) {
|
||||
return;
|
||||
|
@ -167,8 +166,11 @@ public final class NormalizerImpl {
|
|||
c=Character.codePointAt(s, start);
|
||||
start+=Character.charCount(c);
|
||||
if(start<limit) {
|
||||
// s must be in NFD, otherwise we need to use getCC().
|
||||
leadCC=getCCFromYesOrMaybe(impl.getNorm16(c));
|
||||
if (isNFD) {
|
||||
leadCC = getCCFromYesOrMaybe(impl.getNorm16(c));
|
||||
} else {
|
||||
leadCC = impl.getCC(impl.getNorm16(c));
|
||||
}
|
||||
} else {
|
||||
leadCC=trailCC;
|
||||
}
|
||||
|
@ -310,6 +312,12 @@ public final class NormalizerImpl {
|
|||
// TODO: Propose widening UTF16 methods that take char to take int.
|
||||
// TODO: Propose widening UTF16 methods that take String to take CharSequence.
|
||||
public static final class UTF16Plus {
|
||||
/**
|
||||
* Is this code point a lead surrogate (U+d800..U+dbff)?
|
||||
* @param c code unit or code point
|
||||
* @return true or false
|
||||
*/
|
||||
public static boolean isLeadSurrogate(int c) { return (c & 0xfffffc00) == 0xd800; }
|
||||
/**
|
||||
* Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
|
||||
* is it a lead surrogate?
|
||||
|
@ -350,7 +358,7 @@ public final class NormalizerImpl {
|
|||
|
||||
private static final class IsAcceptable implements ICUBinary.Authenticate {
|
||||
public boolean isDataVersionAcceptable(byte version[]) {
|
||||
return version[0]==3;
|
||||
return version[0]==4;
|
||||
}
|
||||
}
|
||||
private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
|
||||
|
@ -387,8 +395,9 @@ public final class NormalizerImpl {
|
|||
// Read the normTrie.
|
||||
int offset=inIndexes[IX_NORM_TRIE_OFFSET];
|
||||
int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
|
||||
normTrie=Trie2_16.createFromSerialized(bytes);
|
||||
int trieLength=normTrie.getSerializedLength();
|
||||
int triePosition = bytes.position();
|
||||
normTrie = CodePointTrie.Fast16.fromBinary(bytes);
|
||||
int trieLength = bytes.position() - triePosition;
|
||||
if(trieLength>(nextOffset-offset)) {
|
||||
throw new InternalError("Normalizer2 data: not enough bytes for normTrie");
|
||||
}
|
||||
|
@ -398,13 +407,8 @@ public final class NormalizerImpl {
|
|||
offset=nextOffset;
|
||||
nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
|
||||
int numChars=(nextOffset-offset)/2;
|
||||
char[] chars;
|
||||
if(numChars!=0) {
|
||||
chars=new char[numChars];
|
||||
for(int i=0; i<numChars; ++i) {
|
||||
chars[i]=bytes.getChar();
|
||||
}
|
||||
maybeYesCompositions=new String(chars);
|
||||
maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0);
|
||||
extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
|
||||
}
|
||||
|
||||
|
@ -422,8 +426,12 @@ public final class NormalizerImpl {
|
|||
return load(ICUBinary.getRequiredData(name));
|
||||
}
|
||||
|
||||
|
||||
public int getNorm16(int c) { return normTrie.get(c); }
|
||||
// The trie stores values for lead surrogate code *units*.
|
||||
// Surrogate code *points* are inert.
|
||||
public int getNorm16(int c) {
|
||||
return UTF16Plus.isLeadSurrogate(c) ? INERT : normTrie.get(c);
|
||||
}
|
||||
// Returns the value stored in normTrie for c as is,
// with no special handling of lead surrogates.
public int getRawNorm16(int c) {
    return normTrie.get(c);
}
|
||||
public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }
|
||||
public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
|
||||
public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
|
||||
|
@ -486,7 +494,7 @@ public final class NormalizerImpl {
|
|||
}
|
||||
// Maps to an isCompYesAndZeroCC.
|
||||
c=mapAlgorithmic(c, norm16);
|
||||
norm16=getNorm16(c);
|
||||
norm16=getRawNorm16(c);
|
||||
}
|
||||
}
|
||||
if(norm16<=minYesNo || isHangulLVT(norm16)) {
|
||||
|
@ -519,7 +527,7 @@ public final class NormalizerImpl {
|
|||
// Maps to an isCompYesAndZeroCC.
|
||||
decomp=c=mapAlgorithmic(c, norm16);
|
||||
// The mapping might decompose further.
|
||||
norm16 = getNorm16(c);
|
||||
norm16 = getRawNorm16(c);
|
||||
}
|
||||
if (norm16 < minYesNo) {
|
||||
if(decomp<0) {
|
||||
|
@ -641,27 +649,23 @@ public final class NormalizerImpl {
|
|||
// count code units below the minimum or with irrelevant data for the quick check
|
||||
for(prevSrc=src; src!=limit;) {
|
||||
if( (c=s.charAt(src))<minNoCP ||
|
||||
isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
|
||||
isMostDecompYesAndZeroCC(norm16=normTrie.bmpGet(c))
|
||||
) {
|
||||
++src;
|
||||
} else if(!UTF16.isSurrogate((char)c)) {
|
||||
} else if(!UTF16Plus.isLeadSurrogate(c)) {
|
||||
break;
|
||||
} else {
|
||||
char c2;
|
||||
if(UTF16Plus.isSurrogateLead(c)) {
|
||||
if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
|
||||
c=Character.toCodePoint((char)c, c2);
|
||||
if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
|
||||
c = Character.toCodePoint((char)c, c2);
|
||||
norm16 = normTrie.suppGet(c);
|
||||
if (isMostDecompYesAndZeroCC(norm16)) {
|
||||
src += 2;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} else /* trail surrogate */ {
|
||||
if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
|
||||
--src;
|
||||
c=Character.toCodePoint(c2, (char)c);
|
||||
}
|
||||
}
|
||||
if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
|
||||
src+=Character.charCount(c);
|
||||
} else {
|
||||
break;
|
||||
++src; // unpaired lead surrogate: inert
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -721,7 +725,7 @@ public final class NormalizerImpl {
|
|||
c=Character.codePointAt(s, src);
|
||||
cc=getCC(getNorm16(c));
|
||||
};
|
||||
buffer.append(s, 0, src, firstCC, prevCC);
|
||||
buffer.append(s, 0, src, false, firstCC, prevCC);
|
||||
buffer.append(s, src, limit);
|
||||
}
|
||||
|
||||
|
@ -749,28 +753,22 @@ public final class NormalizerImpl {
|
|||
return true;
|
||||
}
|
||||
if( (c=s.charAt(src))<minNoMaybeCP ||
|
||||
isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
|
||||
isCompYesAndZeroCC(norm16=normTrie.bmpGet(c))
|
||||
) {
|
||||
++src;
|
||||
} else {
|
||||
prevSrc = src++;
|
||||
if(!UTF16.isSurrogate((char)c)) {
|
||||
if (!UTF16Plus.isLeadSurrogate(c)) {
|
||||
break;
|
||||
} else {
|
||||
char c2;
|
||||
if(UTF16Plus.isSurrogateLead(c)) {
|
||||
if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
|
||||
++src;
|
||||
c=Character.toCodePoint((char)c, c2);
|
||||
if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) {
|
||||
++src;
|
||||
c = Character.toCodePoint((char)c, c2);
|
||||
norm16 = normTrie.suppGet(c);
|
||||
if (!isCompYesAndZeroCC(norm16)) {
|
||||
break;
|
||||
}
|
||||
} else /* trail surrogate */ {
|
||||
if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
|
||||
--prevSrc;
|
||||
c=Character.toCodePoint(c2, (char)c);
|
||||
}
|
||||
}
|
||||
if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -991,28 +989,22 @@ public final class NormalizerImpl {
|
|||
return (src<<1)|qcResult; // "yes" or "maybe"
|
||||
}
|
||||
if( (c=s.charAt(src))<minNoMaybeCP ||
|
||||
isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
|
||||
isCompYesAndZeroCC(norm16=normTrie.bmpGet(c))
|
||||
) {
|
||||
++src;
|
||||
} else {
|
||||
prevSrc = src++;
|
||||
if(!UTF16.isSurrogate((char)c)) {
|
||||
if (!UTF16Plus.isLeadSurrogate(c)) {
|
||||
break;
|
||||
} else {
|
||||
char c2;
|
||||
if(UTF16Plus.isSurrogateLead(c)) {
|
||||
if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
|
||||
++src;
|
||||
c=Character.toCodePoint((char)c, c2);
|
||||
if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) {
|
||||
++src;
|
||||
c = Character.toCodePoint((char)c, c2);
|
||||
norm16 = normTrie.suppGet(c);
|
||||
if (!isCompYesAndZeroCC(norm16)) {
|
||||
break;
|
||||
}
|
||||
} else /* trail surrogate */ {
|
||||
if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
|
||||
--prevSrc;
|
||||
c=Character.toCodePoint(c2, (char)c);
|
||||
}
|
||||
}
|
||||
if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1134,17 +1126,10 @@ public final class NormalizerImpl {
|
|||
prevFCD16=0;
|
||||
++src;
|
||||
} else {
|
||||
if(UTF16.isSurrogate((char)c)) {
|
||||
if (UTF16Plus.isLeadSurrogate(c)) {
|
||||
char c2;
|
||||
if(UTF16Plus.isSurrogateLead(c)) {
|
||||
if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
|
||||
c=Character.toCodePoint((char)c, c2);
|
||||
}
|
||||
} else /* trail surrogate */ {
|
||||
if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
|
||||
--src;
|
||||
c=Character.toCodePoint(c2, (char)c);
|
||||
}
|
||||
if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
|
||||
c = Character.toCodePoint((char)c, c2);
|
||||
}
|
||||
}
|
||||
if((fcd16=getFCD16FromNormData(c))<=0xff) {
|
||||
|
@ -1430,7 +1415,7 @@ public final class NormalizerImpl {
|
|||
}
|
||||
// Maps to an isCompYesAndZeroCC.
|
||||
c=mapAlgorithmic(c, norm16);
|
||||
norm16=getNorm16(c);
|
||||
norm16=getRawNorm16(c);
|
||||
}
|
||||
if (norm16 < minYesNo) {
|
||||
// c does not decompose
|
||||
|
@ -1451,7 +1436,7 @@ public final class NormalizerImpl {
|
|||
leadCC=0;
|
||||
}
|
||||
++mapping; // skip over the firstUnit
|
||||
buffer.append(extraData, mapping, mapping+length, leadCC, trailCC);
|
||||
buffer.append(extraData, mapping, mapping+length, true, leadCC, trailCC);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1643,7 +1628,7 @@ public final class NormalizerImpl {
|
|||
// Is the composite a starter that combines forward?
|
||||
if((compositeAndFwd&1)!=0) {
|
||||
compositionsList=
|
||||
getCompositionsListForComposite(getNorm16(composite));
|
||||
getCompositionsListForComposite(getRawNorm16(composite));
|
||||
} else {
|
||||
compositionsList=-1;
|
||||
}
|
||||
|
@ -2196,9 +2181,8 @@ public final class NormalizerImpl {
|
|||
private int centerNoNoDelta;
|
||||
private int minMaybeYes;
|
||||
|
||||
private Trie2_16 normTrie;
|
||||
private CodePointTrie.Fast16 normTrie;
|
||||
private String maybeYesCompositions;
|
||||
private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters
|
||||
private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0
|
||||
|
||||
}
|
||||
}
|
||||
|
|
Binary file not shown.
Binary file not shown.
BIN
src/java.base/share/classes/sun/text/resources/nfkc.nrm
Normal file
BIN
src/java.base/share/classes/sun/text/resources/nfkc.nrm
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Add table
Add a link
Reference in a new issue