mirror of
https://github.com/openjdk/jdk.git
synced 2025-09-20 19:14:38 +02:00
8221431: Support for Unicode 12.1
Reviewed-by: erikj, rriggs
This commit is contained in:
parent
e4f31b1cd7
commit
93fabcdc5a
66 changed files with 60279 additions and 38178 deletions
File diff suppressed because it is too large
Load diff
|
@ -0,0 +1,46 @@
|
|||
/*
|
||||
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
|
||||
package java.util.regex;
|
||||
|
||||
/**
|
||||
* Holds data contained in the Unicode Technical Standard #51: Unicode
|
||||
* Emoji.
|
||||
*
|
||||
* Currently it is only used for the rule "GB11" in UAX #29 Unicode Text
|
||||
* Segmentation.
|
||||
*/
|
||||
final class EmojiData {
|
||||
/**
|
||||
* Returns whether the code point is an extended pictographic or not.
|
||||
*
|
||||
* @param cp code point to examine
|
||||
* @return true if {@code cp} is an extended pictographic
|
||||
*/
|
||||
static boolean isExtendedPictographic(int cp) {
|
||||
return
|
||||
%%%EXTPICT%%%
|
||||
}
|
||||
}
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -25,17 +25,56 @@
|
|||
|
||||
package java.util.regex;
|
||||
|
||||
import java.util.Objects;
|
||||
|
||||
final class Grapheme {
|
||||
|
||||
/**
|
||||
* Determines if there is an extended grapheme cluster boundary between two
|
||||
* continuing characters {@code cp1} and {@code cp2}.
|
||||
* Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
|
||||
* the start of the char sequence is a boundary.
|
||||
* <p>
|
||||
* See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
|
||||
* for the extended grapheme cluster boundary rules
|
||||
* for the extended grapheme cluster boundary rules. The following implementation
|
||||
* is based on version 12.0 of the annex.
|
||||
* (http://www.unicode.org/reports/tr29/tr29-35.html)
|
||||
*
|
||||
* @param src the {@code CharSequence} to be scanned
|
||||
* @param off offset to start looking for the next boundary in the src
|
||||
* @param limit limit offset in the src (exclusive)
|
||||
* @return the next possible boundary
|
||||
*/
|
||||
static boolean isBoundary(int cp1, int cp2) {
|
||||
return rules[getType(cp1)][getType(cp2)];
|
||||
static int nextBoundary(CharSequence src, int off, int limit) {
|
||||
Objects.checkFromToIndex(off, limit, src.length());
|
||||
|
||||
int ch0 = Character.codePointAt(src, 0);
|
||||
int ret = Character.charCount(ch0);
|
||||
int ch1;
|
||||
// indicates whether gb11 or gb12 is underway
|
||||
boolean gb11 = EmojiData.isExtendedPictographic(ch0);
|
||||
int riCount = getType(ch0) == RI ? 1 : 0;
|
||||
while (ret < limit) {
|
||||
ch1 = Character.codePointAt(src, ret);
|
||||
int t0 = getType(ch0);
|
||||
int t1 = getType(ch1);
|
||||
|
||||
if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
|
||||
gb11 = false;
|
||||
} else if (riCount % 2 == 1 && t0 == RI && t1 == RI) {
|
||||
// continue for gb12
|
||||
} else if (rules[t0][t1]) {
|
||||
if (ret > off) {
|
||||
break;
|
||||
} else {
|
||||
gb11 = EmojiData.isExtendedPictographic(ch1);
|
||||
riCount = 0;
|
||||
}
|
||||
}
|
||||
|
||||
riCount += getType(ch1) == RI ? 1 : 0;
|
||||
ch0 = ch1;
|
||||
ret += Character.charCount(ch1);
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
// types
|
||||
|
@ -44,22 +83,24 @@ final class Grapheme {
|
|||
private static final int LF = 2;
|
||||
private static final int CONTROL = 3;
|
||||
private static final int EXTEND = 4;
|
||||
private static final int RI = 5;
|
||||
private static final int PREPEND = 6;
|
||||
private static final int SPACINGMARK = 7;
|
||||
private static final int L = 8;
|
||||
private static final int V = 9;
|
||||
private static final int T = 10;
|
||||
private static final int LV = 11;
|
||||
private static final int LVT = 12;
|
||||
private static final int ZWJ = 5;
|
||||
private static final int RI = 6;
|
||||
private static final int PREPEND = 7;
|
||||
private static final int SPACINGMARK = 8;
|
||||
private static final int L = 9;
|
||||
private static final int V = 10;
|
||||
private static final int T = 11;
|
||||
private static final int LV = 12;
|
||||
private static final int LVT = 13;
|
||||
private static final int EXTENDED_PICTOGRAPHIC = 14;
|
||||
|
||||
private static final int FIRST_TYPE = 0;
|
||||
private static final int LAST_TYPE = 12;
|
||||
private static final int LAST_TYPE = 14;
|
||||
|
||||
private static boolean[][] rules;
|
||||
static {
|
||||
rules = new boolean[LAST_TYPE + 1][LAST_TYPE + 1];
|
||||
// default, any + any
|
||||
// GB 999 Any + Any -> default
|
||||
for (int i = FIRST_TYPE; i <= LAST_TYPE; i++)
|
||||
for (int j = FIRST_TYPE; j <= LAST_TYPE; j++)
|
||||
rules[i][j] = true;
|
||||
|
@ -76,13 +117,12 @@ final class Grapheme {
|
|||
// GB 8 (LVT | T) x T
|
||||
rules[LVT][T] = false;
|
||||
rules[T][T] = false;
|
||||
// GB 8a RI x RI
|
||||
rules[RI][RI] = false;
|
||||
// GB 9 x Extend
|
||||
// GB 9 x (Extend|ZWJ)
|
||||
// GB 9a x Spacing Mark
|
||||
// GB 9b Prepend x
|
||||
for (int i = FIRST_TYPE; i <= LAST_TYPE; i++) {
|
||||
rules[i][EXTEND] = false;
|
||||
rules[i][ZWJ] = false;
|
||||
rules[i][SPACINGMARK] = false;
|
||||
rules[PREPEND][i] = false;
|
||||
}
|
||||
|
@ -95,7 +135,9 @@ final class Grapheme {
|
|||
}
|
||||
// GB 3 CR x LF
|
||||
rules[CR][LF] = false;
|
||||
// GB 10 Any + Any -> default
|
||||
// GB 11 Exended_Pictographic x (Extend|ZWJ)
|
||||
rules[EXTENDED_PICTOGRAPHIC][EXTEND] = false;
|
||||
rules[EXTENDED_PICTOGRAPHIC][ZWJ] = false;
|
||||
}
|
||||
|
||||
// Hangul syllables
|
||||
|
@ -123,6 +165,10 @@ final class Grapheme {
|
|||
|
||||
@SuppressWarnings("fallthrough")
|
||||
private static int getType(int cp) {
|
||||
if (EmojiData.isExtendedPictographic(cp)) {
|
||||
return EXTENDED_PICTOGRAPHIC;
|
||||
}
|
||||
|
||||
int type = Character.getType(cp);
|
||||
switch(type) {
|
||||
case Character.CONTROL:
|
||||
|
@ -131,29 +177,36 @@ final class Grapheme {
|
|||
if (cp == 0x000A)
|
||||
return LF;
|
||||
return CONTROL;
|
||||
case Character.UNASSIGNED:
|
||||
case Character.UNASSIGNED:
|
||||
// NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control
|
||||
// but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other"
|
||||
// so type it as "Other" to make the test happy
|
||||
if (cp == 0x0378)
|
||||
return OTHER;
|
||||
if (cp == 0x0378)
|
||||
return OTHER;
|
||||
|
||||
case Character.LINE_SEPARATOR:
|
||||
case Character.PARAGRAPH_SEPARATOR:
|
||||
case Character.SURROGATE:
|
||||
return CONTROL;
|
||||
case Character.FORMAT:
|
||||
if (cp == 0x200C || cp == 0x200D)
|
||||
if (cp == 0x200C ||
|
||||
cp >= 0xE0020 && cp <= 0xE007F)
|
||||
return EXTEND;
|
||||
if (cp == 0x200D)
|
||||
return ZWJ;
|
||||
if (cp >= 0x0600 && cp <= 0x0605 ||
|
||||
cp == 0x06DD || cp == 0x070F || cp == 0x08E2 ||
|
||||
cp == 0x110BD || cp == 0x110CD)
|
||||
return PREPEND;
|
||||
return CONTROL;
|
||||
case Character.NON_SPACING_MARK:
|
||||
case Character.ENCLOSING_MARK:
|
||||
// NOTE:
|
||||
// #tr29 "plus a few General_Category = Spacing_Mark needed for
|
||||
// canonical equivalence."
|
||||
// but for "extended grapheme clusters" support, there is no
|
||||
// need actually to diff "extend" and "spackmark" given GB9, GB9a
|
||||
return EXTEND;
|
||||
// NOTE:
|
||||
// #tr29 "plus a few General_Category = Spacing_Mark needed for
|
||||
// canonical equivalence."
|
||||
// but for "extended grapheme clusters" support, there is no
|
||||
// need actually to diff "extend" and "spackmark" given GB9, GB9a
|
||||
return EXTEND;
|
||||
case Character.COMBINING_SPACING_MARK:
|
||||
if (isExcludedSpacingMark(cp))
|
||||
return OTHER;
|
||||
|
@ -167,9 +220,11 @@ final class Grapheme {
|
|||
return RI;
|
||||
return OTHER;
|
||||
case Character.MODIFIER_LETTER:
|
||||
case Character.MODIFIER_SYMBOL:
|
||||
// WARNING:
|
||||
// not mentioned in #tr29 but listed in GraphemeBreakProperty.txt
|
||||
if (cp == 0xFF9E || cp == 0xFF9F)
|
||||
if (cp == 0xFF9E || cp == 0xFF9F ||
|
||||
cp >= 0x1F3FB && cp <= 0x1F3FF)
|
||||
return EXTEND;
|
||||
return OTHER;
|
||||
case Character.OTHER_LETTER:
|
||||
|
@ -199,6 +254,22 @@ final class Grapheme {
|
|||
return V;
|
||||
if (cp >= 0xD7CB && cp <= 0xD7FB)
|
||||
return T;
|
||||
|
||||
// Prepend
|
||||
switch (cp) {
|
||||
case 0x0D4E:
|
||||
case 0x111C2:
|
||||
case 0x111C3:
|
||||
case 0x11A3A:
|
||||
case 0x11A84:
|
||||
case 0x11A85:
|
||||
case 0x11A86:
|
||||
case 0x11A87:
|
||||
case 0x11A88:
|
||||
case 0x11A89:
|
||||
case 0x11D46:
|
||||
return PREPEND;
|
||||
}
|
||||
}
|
||||
return OTHER;
|
||||
}
|
||||
|
|
|
@ -540,7 +540,7 @@ import jdk.internal.util.ArraysSupport;
|
|||
* <p> This class is in conformance with Level 1 of <a
|
||||
* href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
|
||||
* Standard #18: Unicode Regular Expression</i></a>, plus RL2.1
|
||||
* Canonical Equivalents.
|
||||
* Canonical Equivalents and RL2.2 Extended Grapheme Clusters.
|
||||
* <p>
|
||||
* <b>Unicode escape sequences</b> such as <code>\u2014</code> in Java source code
|
||||
* are processed as described in section 3.3 of
|
||||
|
@ -1501,15 +1501,8 @@ public final class Pattern
|
|||
off++;
|
||||
continue;
|
||||
}
|
||||
int j = off + Character.charCount(ch0);
|
||||
int j = Grapheme.nextBoundary(src, off, limit);
|
||||
int ch1;
|
||||
while (j < limit) {
|
||||
ch1 = src.codePointAt(j);
|
||||
if (Grapheme.isBoundary(ch0, ch1))
|
||||
break;
|
||||
ch0 = ch1;
|
||||
j += Character.charCount(ch1);
|
||||
}
|
||||
String seq = src.substring(off, j);
|
||||
String nfd = Normalizer.normalize(seq, Normalizer.Form.NFD);
|
||||
off = j;
|
||||
|
@ -3975,14 +3968,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
|||
if (i < matcher.to) {
|
||||
int ch0 = Character.codePointAt(seq, i);
|
||||
int n = Character.charCount(ch0);
|
||||
int j = i + n;
|
||||
while (j < matcher.to) {
|
||||
int ch1 = Character.codePointAt(seq, j);
|
||||
if (Grapheme.isBoundary(ch0, ch1))
|
||||
break;
|
||||
ch0 = ch1;
|
||||
j += Character.charCount(ch1);
|
||||
}
|
||||
int j = Grapheme.nextBoundary(seq, i, matcher.to);
|
||||
if (i + n == j) { // single, assume nfc cp
|
||||
if (predicate.is(ch0))
|
||||
return next.match(matcher, j, seq);
|
||||
|
@ -4021,15 +4007,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
|||
static class XGrapheme extends Node {
|
||||
boolean match(Matcher matcher, int i, CharSequence seq) {
|
||||
if (i < matcher.to) {
|
||||
int ch0 = Character.codePointAt(seq, i);
|
||||
i += Character.charCount(ch0);
|
||||
while (i < matcher.to) {
|
||||
int ch1 = Character.codePointAt(seq, i);
|
||||
if (Grapheme.isBoundary(ch0, ch1))
|
||||
break;
|
||||
ch0 = ch1;
|
||||
i += Character.charCount(ch1);
|
||||
}
|
||||
i = Grapheme.nextBoundary(seq, i, matcher.to);
|
||||
return next.match(matcher, i, seq);
|
||||
}
|
||||
matcher.hitEnd = true;
|
||||
|
@ -4059,8 +4037,9 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
|||
}
|
||||
if (i < endIndex) {
|
||||
if (Character.isSurrogatePair(seq.charAt(i-1), seq.charAt(i)) ||
|
||||
!Grapheme.isBoundary(Character.codePointBefore(seq, i),
|
||||
Character.codePointAt(seq, i))) {
|
||||
Grapheme.nextBoundary(seq,
|
||||
i - Character.charCount(Character.codePointBefore(seq, i)),
|
||||
i + Character.charCount(Character.codePointAt(seq, i))) > i) {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
|
|
|
@ -0,0 +1,501 @@
|
|||
/*
|
||||
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
* under the terms of the GNU General Public License version 2 only, as
|
||||
* published by the Free Software Foundation. Oracle designates this
|
||||
* particular file as subject to the "Classpath" exception as provided
|
||||
* by Oracle in the LICENSE file that accompanied this code.
|
||||
*
|
||||
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||
* version 2 for more details (a copy is included in the LICENSE file that
|
||||
* accompanied this code).
|
||||
*
|
||||
* You should have received a copy of the GNU General Public License version
|
||||
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
*
|
||||
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||
* or visit www.oracle.com if you need additional information or have any
|
||||
* questions.
|
||||
*/
|
||||
// (c) 2018 and later: Unicode, Inc. and others.
|
||||
// License & terms of use: http://www.unicode.org/copyright.html#License
|
||||
|
||||
// created: 2018may10 Markus W. Scherer
|
||||
|
||||
package sun.text.normalizer;
|
||||
|
||||
import java.util.Iterator;
|
||||
import java.util.NoSuchElementException;
|
||||
|
||||
/**
|
||||
* Abstract map from Unicode code points (U+0000..U+10FFFF) to integer values.
|
||||
* This does not implement java.util.Map.
|
||||
*
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public abstract class CodePointMap implements Iterable<CodePointMap.Range> {
|
||||
/**
|
||||
* Selectors for how getRange() should report value ranges overlapping with surrogates.
|
||||
* Most users should use NORMAL.
|
||||
*
|
||||
* @see #getRange
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public enum RangeOption {
|
||||
/**
|
||||
* getRange() enumerates all same-value ranges as stored in the map.
|
||||
* Most users should use this option.
|
||||
*
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
NORMAL,
|
||||
/**
|
||||
* getRange() enumerates all same-value ranges as stored in the map,
|
||||
* except that lead surrogates (U+D800..U+DBFF) are treated as having the
|
||||
* surrogateValue, which is passed to getRange() as a separate parameter.
|
||||
* The surrogateValue is not transformed via filter().
|
||||
* See {@link Character#isHighSurrogate}.
|
||||
*
|
||||
* <p>Most users should use NORMAL instead.
|
||||
*
|
||||
* <p>This option is useful for maps that map surrogate code *units* to
|
||||
* special values optimized for UTF-16 string processing
|
||||
* or for special error behavior for unpaired surrogates,
|
||||
* but those values are not to be associated with the lead surrogate code *points*.
|
||||
*
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
FIXED_LEAD_SURROGATES,
|
||||
/**
|
||||
* getRange() enumerates all same-value ranges as stored in the map,
|
||||
* except that all surrogates (U+D800..U+DFFF) are treated as having the
|
||||
* surrogateValue, which is passed to getRange() as a separate parameter.
|
||||
* The surrogateValue is not transformed via filter().
|
||||
* See {@link Character#isSurrogate}.
|
||||
*
|
||||
* <p>Most users should use NORMAL instead.
|
||||
*
|
||||
* <p>This option is useful for maps that map surrogate code *units* to
|
||||
* special values optimized for UTF-16 string processing
|
||||
* or for special error behavior for unpaired surrogates,
|
||||
* but those values are not to be associated with the lead surrogate code *points*.
|
||||
*
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
FIXED_ALL_SURROGATES
|
||||
}
|
||||
|
||||
/**
|
||||
* Callback function interface: Modifies a map value.
|
||||
* Optionally called by getRange().
|
||||
* The modified value will be returned by the getRange() function.
|
||||
*
|
||||
* <p>Can be used to ignore some of the value bits,
|
||||
* make a filter for one of several values,
|
||||
* return a value index computed from the map value, etc.
|
||||
*
|
||||
* @see #getRange
|
||||
* @see #iterator
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public interface ValueFilter {
|
||||
/**
|
||||
* Modifies the map value.
|
||||
*
|
||||
* @param value map value
|
||||
* @return modified value
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public int apply(int value);
|
||||
}
|
||||
|
||||
/**
|
||||
* Range iteration result data.
|
||||
* Code points from start to end map to the same value.
|
||||
* The value may have been modified by {@link ValueFilter#apply(int)},
|
||||
* or it may be the surrogateValue if a RangeOption other than "normal" was used.
|
||||
*
|
||||
* @see #getRange
|
||||
* @see #iterator
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public static final class Range {
|
||||
private int start;
|
||||
private int end;
|
||||
private int value;
|
||||
|
||||
/**
|
||||
* Constructor. Sets start and end to -1 and value to 0.
|
||||
*
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public Range() {
|
||||
start = end = -1;
|
||||
value = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* @return the start code point
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public int getStart() { return start; }
|
||||
/**
|
||||
* @return the (inclusive) end code point
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public int getEnd() { return end; }
|
||||
/**
|
||||
* @return the range value
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public int getValue() { return value; }
|
||||
/**
|
||||
* Sets the range. When using {@link #iterator()},
|
||||
* iteration will resume after the newly set end.
|
||||
*
|
||||
* @param start new start code point
|
||||
* @param end new end code point
|
||||
* @param value new value
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public void set(int start, int end, int value) {
|
||||
this.start = start;
|
||||
this.end = end;
|
||||
this.value = value;
|
||||
}
|
||||
}
|
||||
|
||||
private final class RangeIterator implements Iterator<Range> {
|
||||
private Range range = new Range();
|
||||
|
||||
@Override
|
||||
public boolean hasNext() {
|
||||
return -1 <= range.end && range.end < 0x10ffff;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Range next() {
|
||||
if (getRange(range.end + 1, null, range)) {
|
||||
return range;
|
||||
} else {
|
||||
throw new NoSuchElementException();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public final void remove() {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Iterates over code points of a string and fetches map values.
|
||||
* This does not implement java.util.Iterator.
|
||||
*
|
||||
* <pre>
|
||||
* void onString(CodePointMap map, CharSequence s, int start) {
|
||||
* CodePointMap.StringIterator iter = map.stringIterator(s, start);
|
||||
* while (iter.next()) {
|
||||
* int end = iter.getIndex(); // code point from between start and end
|
||||
* useValue(s, start, end, iter.getCodePoint(), iter.getValue());
|
||||
* start = end;
|
||||
* }
|
||||
* }
|
||||
* </pre>
|
||||
*
|
||||
* <p>This class is not intended for public subclassing.
|
||||
*
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public class StringIterator {
|
||||
/**
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
@Deprecated
|
||||
protected CharSequence s;
|
||||
/**
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
@Deprecated
|
||||
protected int sIndex;
|
||||
/**
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
@Deprecated
|
||||
protected int c;
|
||||
/**
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
@Deprecated
|
||||
protected int value;
|
||||
|
||||
/**
|
||||
* @internal
|
||||
* @deprecated This API is ICU internal only.
|
||||
*/
|
||||
@Deprecated
|
||||
protected StringIterator(CharSequence s, int sIndex) {
|
||||
this.s = s;
|
||||
this.sIndex = sIndex;
|
||||
c = -1;
|
||||
value = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Resets the iterator to a new string and/or a new string index.
|
||||
*
|
||||
* @param s string to iterate over
|
||||
* @param sIndex string index where the iteration will start
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public void reset(CharSequence s, int sIndex) {
|
||||
this.s = s;
|
||||
this.sIndex = sIndex;
|
||||
c = -1;
|
||||
value = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the next code point, post-increments the string index,
|
||||
* and gets a value from the map.
|
||||
* Sets an implementation-defined error value if the code point is an unpaired surrogate.
|
||||
*
|
||||
* @return true if the string index was not yet at the end of the string;
|
||||
* otherwise the iterator did not advance
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public boolean next() {
|
||||
if (sIndex >= s.length()) {
|
||||
return false;
|
||||
}
|
||||
c = Character.codePointAt(s, sIndex);
|
||||
sIndex += Character.charCount(c);
|
||||
value = get(c);
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Reads the previous code point, pre-decrements the string index,
|
||||
* and gets a value from the map.
|
||||
* Sets an implementation-defined error value if the code point is an unpaired surrogate.
|
||||
*
|
||||
* @return true if the string index was not yet at the start of the string;
|
||||
* otherwise the iterator did not advance
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public boolean previous() {
|
||||
if (sIndex <= 0) {
|
||||
return false;
|
||||
}
|
||||
c = Character.codePointBefore(s, sIndex);
|
||||
sIndex -= Character.charCount(c);
|
||||
value = get(c);
|
||||
return true;
|
||||
}
|
||||
/**
|
||||
* @return the string index
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public final int getIndex() { return sIndex; }
|
||||
/**
|
||||
* @return the code point
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public final int getCodePoint() { return c; }
|
||||
/**
|
||||
* @return the map value,
|
||||
* or an implementation-defined error value if
|
||||
* the code point is an unpaired surrogate
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public final int getValue() { return value; }
|
||||
}
|
||||
|
||||
/**
|
||||
* Protected no-args constructor.
|
||||
*
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
protected CodePointMap() {
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the value for a code point as stored in the map, with range checking.
|
||||
* Returns an implementation-defined error value if c is not in the range 0..U+10FFFF.
|
||||
*
|
||||
* @param c the code point
|
||||
* @return the map value,
|
||||
* or an implementation-defined error value if
|
||||
* the code point is not in the range 0..U+10FFFF
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public abstract int get(int c);
|
||||
|
||||
/**
|
||||
* Sets the range object to a range of code points beginning with the start parameter.
|
||||
* The range start is the same as the start input parameter
|
||||
* (even if there are preceding code points that have the same value).
|
||||
* The range end is the last code point such that
|
||||
* all those from start to there have the same value.
|
||||
* Returns false if start is not 0..U+10FFFF.
|
||||
* Can be used to efficiently iterate over all same-value ranges in a map.
|
||||
* (This is normally faster than iterating over code points and get()ting each value,
|
||||
* but may be much slower than a data structure that stores ranges directly.)
|
||||
*
|
||||
* <p>If the {@link ValueFilter} parameter is not null, then
|
||||
* the value to be delivered is passed through that filter, and the return value is the end
|
||||
* of the range where all values are modified to the same actual value.
|
||||
* The value is unchanged if that parameter is null.
|
||||
*
|
||||
* <p>Example:
|
||||
* <pre>
|
||||
* int start = 0;
|
||||
* CodePointMap.Range range = new CodePointMap.Range();
|
||||
* while (map.getRange(start, null, range)) {
|
||||
* int end = range.getEnd();
|
||||
* int value = range.getValue();
|
||||
* // Work with the range start..end and its value.
|
||||
* start = end + 1;
|
||||
* }
|
||||
* </pre>
|
||||
*
|
||||
* @param start range start
|
||||
* @param filter an object that may modify the map data value,
|
||||
* or null if the values from the map are to be used unmodified
|
||||
* @param range the range object that will be set to the code point range and value
|
||||
* @return true if start is 0..U+10FFFF; otherwise no new range is fetched
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public abstract boolean getRange(int start, ValueFilter filter, Range range);
|
||||
|
||||
/**
|
||||
* Sets the range object to a range of code points beginning with the start parameter.
|
||||
* The range start is the same as the start input parameter
|
||||
* (even if there are preceding code points that have the same value).
|
||||
* The range end is the last code point such that
|
||||
* all those from start to there have the same value.
|
||||
* Returns false if start is not 0..U+10FFFF.
|
||||
*
|
||||
* <p>Same as the simpler {@link #getRange(int, ValueFilter, Range)} but optionally
|
||||
* modifies the range if it overlaps with surrogate code points.
|
||||
*
|
||||
* @param start range start
|
||||
* @param option defines whether surrogates are treated normally,
|
||||
* or as having the surrogateValue; usually {@link RangeOption#NORMAL}
|
||||
* @param surrogateValue value for surrogates; ignored if option=={@link RangeOption#NORMAL}
|
||||
* @param filter an object that may modify the map data value,
|
||||
* or null if the values from the map are to be used unmodified
|
||||
* @param range the range object that will be set to the code point range and value
|
||||
* @return true if start is 0..U+10FFFF; otherwise no new range is fetched
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public boolean getRange(int start, RangeOption option, int surrogateValue,
|
||||
ValueFilter filter, Range range) {
|
||||
assert option != null;
|
||||
if (!getRange(start, filter, range)) {
|
||||
return false;
|
||||
}
|
||||
if (option == RangeOption.NORMAL) {
|
||||
return true;
|
||||
}
|
||||
int surrEnd = option == RangeOption.FIXED_ALL_SURROGATES ? 0xdfff : 0xdbff;
|
||||
int end = range.end;
|
||||
if (end < 0xd7ff || start > surrEnd) {
|
||||
return true;
|
||||
}
|
||||
// The range overlaps with surrogates, or ends just before the first one.
|
||||
if (range.value == surrogateValue) {
|
||||
if (end >= surrEnd) {
|
||||
// Surrogates followed by a non-surrValue range,
|
||||
// or surrogates are part of a larger surrValue range.
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
if (start <= 0xd7ff) {
|
||||
range.end = 0xd7ff; // Non-surrValue range ends before surrValue surrogates.
|
||||
return true;
|
||||
}
|
||||
// Start is a surrogate with a non-surrValue code *unit* value.
|
||||
// Return a surrValue code *point* range.
|
||||
range.value = surrogateValue;
|
||||
if (end > surrEnd) {
|
||||
range.end = surrEnd; // Surrogate range ends before non-surrValue rest of range.
|
||||
return true;
|
||||
}
|
||||
}
|
||||
// See if the surrValue surrogate range can be merged with
|
||||
// an immediately following range.
|
||||
if (getRange(surrEnd + 1, filter, range) && range.value == surrogateValue) {
|
||||
range.start = start;
|
||||
return true;
|
||||
}
|
||||
range.start = start;
|
||||
range.end = surrEnd;
|
||||
range.value = surrogateValue;
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Convenience iterator over same-map-value code point ranges.
|
||||
* Same as looping over all ranges with {@link #getRange(int, ValueFilter, Range)}
|
||||
* without filtering.
|
||||
* Adjacent ranges have different map values.
|
||||
*
|
||||
* <p>The iterator always returns the same Range object.
|
||||
*
|
||||
* @return a Range iterator
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
@Override
|
||||
public Iterator<Range> iterator() {
|
||||
return new RangeIterator();
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns an iterator (not a java.util.Iterator) over code points of a string
|
||||
* for fetching map values.
|
||||
*
|
||||
* @param s string to iterate over
|
||||
* @param sIndex string index where the iteration will start
|
||||
* @return the iterator
|
||||
* @draft ICU 63
|
||||
* @provisional This API might change or be removed in a future release.
|
||||
*/
|
||||
public StringIterator stringIterator(CharSequence s, int sIndex) {
|
||||
return new StringIterator(s, sIndex);
|
||||
}
|
||||
}
|
1310
src/java.base/share/classes/sun/text/normalizer/CodePointTrie.java
Normal file
1310
src/java.base/share/classes/sun/text/normalizer/CodePointTrie.java
Normal file
File diff suppressed because it is too large
Load diff
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -119,10 +119,7 @@ public final class ICUBinary {
|
|||
} else if (capacity < 0x4000) {
|
||||
capacity *= 2; // Grow faster until we reach 16kB.
|
||||
}
|
||||
// TODO Java 6 replace new byte[] and arraycopy(): bytes = Arrays.copyOf(bytes, capacity);
|
||||
byte[] newBytes = new byte[capacity];
|
||||
System.arraycopy(bytes, 0, newBytes, 0, length);
|
||||
bytes = newBytes;
|
||||
bytes = Arrays.copyOf(bytes, capacity);
|
||||
bytes[length++] = (byte) nextByte;
|
||||
}
|
||||
}
|
||||
|
@ -264,6 +261,36 @@ public final class ICUBinary {
|
|||
}
|
||||
}
|
||||
|
||||
public static byte[] getBytes(ByteBuffer bytes, int length, int additionalSkipLength) {
|
||||
byte[] dest = new byte[length];
|
||||
bytes.get(dest);
|
||||
if (additionalSkipLength > 0) {
|
||||
skipBytes(bytes, additionalSkipLength);
|
||||
}
|
||||
return dest;
|
||||
}
|
||||
|
||||
public static String getString(ByteBuffer bytes, int length, int additionalSkipLength) {
|
||||
CharSequence cs = bytes.asCharBuffer();
|
||||
String s = cs.subSequence(0, length).toString();
|
||||
skipBytes(bytes, length * 2 + additionalSkipLength);
|
||||
return s;
|
||||
}
|
||||
|
||||
public static char[] getChars(ByteBuffer bytes, int length, int additionalSkipLength) {
|
||||
char[] dest = new char[length];
|
||||
bytes.asCharBuffer().get(dest);
|
||||
skipBytes(bytes, length * 2 + additionalSkipLength);
|
||||
return dest;
|
||||
}
|
||||
|
||||
public static int[] getInts(ByteBuffer bytes, int length, int additionalSkipLength) {
|
||||
int[] dest = new int[length];
|
||||
bytes.asIntBuffer().get(dest);
|
||||
skipBytes(bytes, length * 4 + additionalSkipLength);
|
||||
return dest;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a VersionInfo for the bytes in the compact version integer.
|
||||
*/
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2015, 2019, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -265,7 +265,7 @@ final class Norm2AllModes {
|
|||
private static final class Norm2AllModesSingleton {
|
||||
private Norm2AllModesSingleton(String name) {
|
||||
try {
|
||||
String DATA_FILE_NAME = "/sun/text/resources/" + name + ".icu";
|
||||
String DATA_FILE_NAME = "/sun/text/resources/" + name + ".nrm";
|
||||
NormalizerImpl impl=new NormalizerImpl().load(DATA_FILE_NAME);
|
||||
allModes=new Norm2AllModes(impl);
|
||||
} catch (RuntimeException e) {
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
/*
|
||||
* Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved.
|
||||
* Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved.
|
||||
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||
*
|
||||
* This code is free software; you can redistribute it and/or modify it
|
||||
|
@ -61,7 +61,7 @@ public final class NormalizerImpl {
|
|||
return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
|
||||
}
|
||||
|
||||
/**
|
||||
/**
|
||||
* Decomposes c, which must be a Hangul syllable, into buffer
|
||||
* and returns the length of the decomposition (2 or 3).
|
||||
*/
|
||||
|
@ -145,8 +145,7 @@ public final class NormalizerImpl {
|
|||
insert(c, cc);
|
||||
}
|
||||
}
|
||||
// s must be in NFD, otherwise change the implementation.
|
||||
public void append(CharSequence s, int start, int limit,
|
||||
public void append(CharSequence s, int start, int limit, boolean isNFD,
|
||||
int leadCC, int trailCC) {
|
||||
if(start==limit) {
|
||||
return;
|
||||
|
@ -167,8 +166,11 @@ public final class NormalizerImpl {
|
|||
c=Character.codePointAt(s, start);
|
||||
start+=Character.charCount(c);
|
||||
if(start<limit) {
|
||||
// s must be in NFD, otherwise we need to use getCC().
|
||||
leadCC=getCCFromYesOrMaybe(impl.getNorm16(c));
|
||||
if (isNFD) {
|
||||
leadCC = getCCFromYesOrMaybe(impl.getNorm16(c));
|
||||
} else {
|
||||
leadCC = impl.getCC(impl.getNorm16(c));
|
||||
}
|
||||
} else {
|
||||
leadCC=trailCC;
|
||||
}
|
||||
|
@ -310,6 +312,12 @@ public final class NormalizerImpl {
|
|||
// TODO: Propose widening UTF16 methods that take char to take int.
|
||||
// TODO: Propose widening UTF16 methods that take String to take CharSequence.
|
||||
public static final class UTF16Plus {
|
||||
/**
|
||||
* Is this code point a lead surrogate (U+d800..U+dbff)?
|
||||
* @param c code unit or code point
|
||||
* @return true or false
|
||||
*/
|
||||
public static boolean isLeadSurrogate(int c) { return (c & 0xfffffc00) == 0xd800; }
|
||||
/**
|
||||
* Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
|
||||
* is it a lead surrogate?
|
||||
|
@ -350,7 +358,7 @@ public final class NormalizerImpl {
|
|||
|
||||
private static final class IsAcceptable implements ICUBinary.Authenticate {
|
||||
public boolean isDataVersionAcceptable(byte version[]) {
|
||||
return version[0]==3;
|
||||
return version[0]==4;
|
||||
}
|
||||
}
|
||||
private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
|
||||
|
@ -387,8 +395,9 @@ public final class NormalizerImpl {
|
|||
// Read the normTrie.
|
||||
int offset=inIndexes[IX_NORM_TRIE_OFFSET];
|
||||
int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
|
||||
normTrie=Trie2_16.createFromSerialized(bytes);
|
||||
int trieLength=normTrie.getSerializedLength();
|
||||
int triePosition = bytes.position();
|
||||
normTrie = CodePointTrie.Fast16.fromBinary(bytes);
|
||||
int trieLength = bytes.position() - triePosition;
|
||||
if(trieLength>(nextOffset-offset)) {
|
||||
throw new InternalError("Normalizer2 data: not enough bytes for normTrie");
|
||||
}
|
||||
|
@ -398,13 +407,8 @@ public final class NormalizerImpl {
|
|||
offset=nextOffset;
|
||||
nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
|
||||
int numChars=(nextOffset-offset)/2;
|
||||
char[] chars;
|
||||
if(numChars!=0) {
|
||||
chars=new char[numChars];
|
||||
for(int i=0; i<numChars; ++i) {
|
||||
chars[i]=bytes.getChar();
|
||||
}
|
||||
maybeYesCompositions=new String(chars);
|
||||
maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0);
|
||||
extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
|
||||
}
|
||||
|
||||
|
@ -422,8 +426,12 @@ public final class NormalizerImpl {
|
|||
return load(ICUBinary.getRequiredData(name));
|
||||
}
|
||||
|
||||
|
||||
public int getNorm16(int c) { return normTrie.get(c); }
|
||||
// The trie stores values for lead surrogate code *units*.
|
||||
// Surrogate code *points* are inert.
|
||||
public int getNorm16(int c) {
|
||||
return UTF16Plus.isLeadSurrogate(c) ? INERT : normTrie.get(c);
|
||||
}
|
||||
public int getRawNorm16(int c) { return normTrie.get(c); }
|
||||
public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }
|
||||
public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
|
||||
public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
|
||||
|
@ -486,7 +494,7 @@ public final class NormalizerImpl {
|
|||
}
|
||||
// Maps to an isCompYesAndZeroCC.
|
||||
c=mapAlgorithmic(c, norm16);
|
||||
norm16=getNorm16(c);
|
||||
norm16=getRawNorm16(c);
|
||||
}
|
||||
}
|
||||
if(norm16<=minYesNo || isHangulLVT(norm16)) {
|
||||
|
@ -519,7 +527,7 @@ public final class NormalizerImpl {
|
|||
// Maps to an isCompYesAndZeroCC.
|
||||
decomp=c=mapAlgorithmic(c, norm16);
|
||||
// The mapping might decompose further.
|
||||
norm16 = getNorm16(c);
|
||||
norm16 = getRawNorm16(c);
|
||||
}
|
||||
if (norm16 < minYesNo) {
|
||||
if(decomp<0) {
|
||||
|
@ -641,27 +649,23 @@ public final class NormalizerImpl {
|
|||
// count code units below the minimum or with irrelevant data for the quick check
|
||||
for(prevSrc=src; src!=limit;) {
|
||||
if( (c=s.charAt(src))<minNoCP ||
|
||||
isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
|
||||
isMostDecompYesAndZeroCC(norm16=normTrie.bmpGet(c))
|
||||
) {
|
||||
++src;
|
||||
} else if(!UTF16.isSurrogate((char)c)) {
|
||||
} else if(!UTF16Plus.isLeadSurrogate(c)) {
|
||||
break;
|
||||
} else {
|
||||
char c2;
|
||||
if(UTF16Plus.isSurrogateLead(c)) {
|
||||
if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
|
||||
c=Character.toCodePoint((char)c, c2);
|
||||
if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
|
||||
c = Character.toCodePoint((char)c, c2);
|
||||
norm16 = normTrie.suppGet(c);
|
||||
if (isMostDecompYesAndZeroCC(norm16)) {
|
||||
src += 2;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
} else /* trail surrogate */ {
|
||||
if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
|
||||
--src;
|
||||
c=Character.toCodePoint(c2, (char)c);
|
||||
}
|
||||
}
|
||||
if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
|
||||
src+=Character.charCount(c);
|
||||
} else {
|
||||
break;
|
||||
++src; // unpaired lead surrogate: inert
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -721,7 +725,7 @@ public final class NormalizerImpl {
|
|||
c=Character.codePointAt(s, src);
|
||||
cc=getCC(getNorm16(c));
|
||||
};
|
||||
buffer.append(s, 0, src, firstCC, prevCC);
|
||||
buffer.append(s, 0, src, false, firstCC, prevCC);
|
||||
buffer.append(s, src, limit);
|
||||
}
|
||||
|
||||
|
@ -749,28 +753,22 @@ public final class NormalizerImpl {
|
|||
return true;
|
||||
}
|
||||
if( (c=s.charAt(src))<minNoMaybeCP ||
|
||||
isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
|
||||
isCompYesAndZeroCC(norm16=normTrie.bmpGet(c))
|
||||
) {
|
||||
++src;
|
||||
} else {
|
||||
prevSrc = src++;
|
||||
if(!UTF16.isSurrogate((char)c)) {
|
||||
if (!UTF16Plus.isLeadSurrogate(c)) {
|
||||
break;
|
||||
} else {
|
||||
char c2;
|
||||
if(UTF16Plus.isSurrogateLead(c)) {
|
||||
if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
|
||||
++src;
|
||||
c=Character.toCodePoint((char)c, c2);
|
||||
if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) {
|
||||
++src;
|
||||
c = Character.toCodePoint((char)c, c2);
|
||||
norm16 = normTrie.suppGet(c);
|
||||
if (!isCompYesAndZeroCC(norm16)) {
|
||||
break;
|
||||
}
|
||||
} else /* trail surrogate */ {
|
||||
if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
|
||||
--prevSrc;
|
||||
c=Character.toCodePoint(c2, (char)c);
|
||||
}
|
||||
}
|
||||
if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -991,28 +989,22 @@ public final class NormalizerImpl {
|
|||
return (src<<1)|qcResult; // "yes" or "maybe"
|
||||
}
|
||||
if( (c=s.charAt(src))<minNoMaybeCP ||
|
||||
isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
|
||||
isCompYesAndZeroCC(norm16=normTrie.bmpGet(c))
|
||||
) {
|
||||
++src;
|
||||
} else {
|
||||
prevSrc = src++;
|
||||
if(!UTF16.isSurrogate((char)c)) {
|
||||
if (!UTF16Plus.isLeadSurrogate(c)) {
|
||||
break;
|
||||
} else {
|
||||
char c2;
|
||||
if(UTF16Plus.isSurrogateLead(c)) {
|
||||
if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
|
||||
++src;
|
||||
c=Character.toCodePoint((char)c, c2);
|
||||
if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) {
|
||||
++src;
|
||||
c = Character.toCodePoint((char)c, c2);
|
||||
norm16 = normTrie.suppGet(c);
|
||||
if (!isCompYesAndZeroCC(norm16)) {
|
||||
break;
|
||||
}
|
||||
} else /* trail surrogate */ {
|
||||
if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
|
||||
--prevSrc;
|
||||
c=Character.toCodePoint(c2, (char)c);
|
||||
}
|
||||
}
|
||||
if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -1134,17 +1126,10 @@ public final class NormalizerImpl {
|
|||
prevFCD16=0;
|
||||
++src;
|
||||
} else {
|
||||
if(UTF16.isSurrogate((char)c)) {
|
||||
if (UTF16Plus.isLeadSurrogate(c)) {
|
||||
char c2;
|
||||
if(UTF16Plus.isSurrogateLead(c)) {
|
||||
if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
|
||||
c=Character.toCodePoint((char)c, c2);
|
||||
}
|
||||
} else /* trail surrogate */ {
|
||||
if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
|
||||
--src;
|
||||
c=Character.toCodePoint(c2, (char)c);
|
||||
}
|
||||
if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
|
||||
c = Character.toCodePoint((char)c, c2);
|
||||
}
|
||||
}
|
||||
if((fcd16=getFCD16FromNormData(c))<=0xff) {
|
||||
|
@ -1430,7 +1415,7 @@ public final class NormalizerImpl {
|
|||
}
|
||||
// Maps to an isCompYesAndZeroCC.
|
||||
c=mapAlgorithmic(c, norm16);
|
||||
norm16=getNorm16(c);
|
||||
norm16=getRawNorm16(c);
|
||||
}
|
||||
if (norm16 < minYesNo) {
|
||||
// c does not decompose
|
||||
|
@ -1451,7 +1436,7 @@ public final class NormalizerImpl {
|
|||
leadCC=0;
|
||||
}
|
||||
++mapping; // skip over the firstUnit
|
||||
buffer.append(extraData, mapping, mapping+length, leadCC, trailCC);
|
||||
buffer.append(extraData, mapping, mapping+length, true, leadCC, trailCC);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1643,7 +1628,7 @@ public final class NormalizerImpl {
|
|||
// Is the composite a starter that combines forward?
|
||||
if((compositeAndFwd&1)!=0) {
|
||||
compositionsList=
|
||||
getCompositionsListForComposite(getNorm16(composite));
|
||||
getCompositionsListForComposite(getRawNorm16(composite));
|
||||
} else {
|
||||
compositionsList=-1;
|
||||
}
|
||||
|
@ -2196,9 +2181,8 @@ public final class NormalizerImpl {
|
|||
private int centerNoNoDelta;
|
||||
private int minMaybeYes;
|
||||
|
||||
private Trie2_16 normTrie;
|
||||
private CodePointTrie.Fast16 normTrie;
|
||||
private String maybeYesCompositions;
|
||||
private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters
|
||||
private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0
|
||||
|
||||
}
|
||||
}
|
||||
|
|
Binary file not shown.
Binary file not shown.
BIN
src/java.base/share/classes/sun/text/resources/nfkc.nrm
Normal file
BIN
src/java.base/share/classes/sun/text/resources/nfkc.nrm
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -1,38 +1,11 @@
|
|||
## International Components for Unicode (ICU4J) v62.1
|
||||
## International Components for Unicode (ICU4J) v64.2
|
||||
|
||||
### ICU4J License
|
||||
|
||||
```
|
||||
COPYRIGHT AND PERMISSION NOTICE (ICU 58 and later)
|
||||
|
||||
UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
|
||||
Unicode Data Files include all data files under the directories
|
||||
http://www.unicode.org/Public/, http://www.unicode.org/reports/,
|
||||
http://www.unicode.org/cldr/data/,
|
||||
http://source.icu-project.org/repos/icu/, and
|
||||
http://www.unicode.org/utility/trac/browser/.
|
||||
|
||||
Unicode Data Files do not include PDF online code charts under the
|
||||
directory http://www.unicode.org/Public/.
|
||||
|
||||
Software includes any source code published in the Unicode Standard
|
||||
or under the directories
|
||||
http://www.unicode.org/Public/, http://www.unicode.org/reports/,
|
||||
http://www.unicode.org/cldr/data/,
|
||||
http://source.icu-project.org/repos/icu/, and
|
||||
http://www.unicode.org/utility/trac/browser/.
|
||||
|
||||
NOTICE TO USER: Carefully read the following legal agreement.
|
||||
BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
|
||||
DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
|
||||
YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
|
||||
TERMS AND CONDITIONS OF THIS AGREEMENT.
|
||||
IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
|
||||
THE DATA FILES OR SOFTWARE.
|
||||
|
||||
COPYRIGHT AND PERMISSION NOTICE
|
||||
|
||||
Copyright © 1991-2018 Unicode, Inc. All rights reserved.
|
||||
Distributed under the Terms of Use in http://www.unicode.org/copyright.html.
|
||||
Copyright © 1991-2019 Unicode, Inc. All rights reserved.
|
||||
Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of the Unicode data files and any associated documentation
|
||||
|
@ -63,4 +36,354 @@ shall not be used in advertising or otherwise to promote the sale,
|
|||
use or other dealings in these Data Files or Software without prior
|
||||
written authorization of the copyright holder.
|
||||
|
||||
```
|
||||
---------------------
|
||||
|
||||
Third-Party Software Licenses
|
||||
|
||||
This section contains third-party software notices and/or additional
|
||||
terms for licensed third-party software components included within ICU
|
||||
libraries.
|
||||
|
||||
1. ICU License - ICU 1.8.1 to ICU 57.1
|
||||
|
||||
COPYRIGHT AND PERMISSION NOTICE
|
||||
|
||||
Copyright (c) 1995-2016 International Business Machines Corporation and others
|
||||
All rights reserved.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, and/or sell copies of the Software, and to permit persons
|
||||
to whom the Software is furnished to do so, provided that the above
|
||||
copyright notice(s) and this permission notice appear in all copies of
|
||||
the Software and that both the above copyright notice(s) and this
|
||||
permission notice appear in supporting documentation.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
|
||||
OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
|
||||
HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY
|
||||
SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
|
||||
RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
|
||||
CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
|
||||
CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
||||
|
||||
Except as contained in this notice, the name of a copyright holder
|
||||
shall not be used in advertising or otherwise to promote the sale, use
|
||||
or other dealings in this Software without prior written authorization
|
||||
of the copyright holder.
|
||||
|
||||
All trademarks and registered trademarks mentioned herein are the
|
||||
property of their respective owners.
|
||||
|
||||
2. Chinese/Japanese Word Break Dictionary Data (cjdict.txt)
|
||||
|
||||
# The Google Chrome software developed by Google is licensed under
|
||||
# the BSD license. Other software included in this distribution is
|
||||
# provided under other licenses, as set forth below.
|
||||
#
|
||||
# The BSD License
|
||||
# http://opensource.org/licenses/bsd-license.php
|
||||
# Copyright (C) 2006-2008, Google Inc.
|
||||
#
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# Redistributions of source code must retain the above copyright notice,
|
||||
# this list of conditions and the following disclaimer.
|
||||
# Redistributions in binary form must reproduce the above
|
||||
# copyright notice, this list of conditions and the following
|
||||
# disclaimer in the documentation and/or other materials provided with
|
||||
# the distribution.
|
||||
# Neither the name of Google Inc. nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
|
||||
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
|
||||
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
|
||||
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
#
|
||||
#
|
||||
# The word list in cjdict.txt are generated by combining three word lists
|
||||
# listed below with further processing for compound word breaking. The
|
||||
# frequency is generated with an iterative training against Google web
|
||||
# corpora.
|
||||
#
|
||||
# * Libtabe (Chinese)
|
||||
# - https://sourceforge.net/project/?group_id=1519
|
||||
# - Its license terms and conditions are shown below.
|
||||
#
|
||||
# * IPADIC (Japanese)
|
||||
# - http://chasen.aist-nara.ac.jp/chasen/distribution.html
|
||||
# - Its license terms and conditions are shown below.
|
||||
#
|
||||
# ---------COPYING.libtabe ---- BEGIN--------------------
|
||||
#
|
||||
# /*
|
||||
# * Copyright (c) 1999 TaBE Project.
|
||||
# * Copyright (c) 1999 Pai-Hsiang Hsiao.
|
||||
# * All rights reserved.
|
||||
# *
|
||||
# * Redistribution and use in source and binary forms, with or without
|
||||
# * modification, are permitted provided that the following conditions
|
||||
# * are met:
|
||||
# *
|
||||
# * . Redistributions of source code must retain the above copyright
|
||||
# * notice, this list of conditions and the following disclaimer.
|
||||
# * . Redistributions in binary form must reproduce the above copyright
|
||||
# * notice, this list of conditions and the following disclaimer in
|
||||
# * the documentation and/or other materials provided with the
|
||||
# * distribution.
|
||||
# * . Neither the name of the TaBE Project nor the names of its
|
||||
# * contributors may be used to endorse or promote products derived
|
||||
# * from this software without specific prior written permission.
|
||||
# *
|
||||
# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
# * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
# * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
# * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
# * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
# * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||
# * OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
# */
|
||||
#
|
||||
# /*
|
||||
# * Copyright (c) 1999 Computer Systems and Communication Lab,
|
||||
# * Institute of Information Science, Academia
|
||||
# * Sinica. All rights reserved.
|
||||
# *
|
||||
# * Redistribution and use in source and binary forms, with or without
|
||||
# * modification, are permitted provided that the following conditions
|
||||
# * are met:
|
||||
# *
|
||||
# * . Redistributions of source code must retain the above copyright
|
||||
# * notice, this list of conditions and the following disclaimer.
|
||||
# * . Redistributions in binary form must reproduce the above copyright
|
||||
# * notice, this list of conditions and the following disclaimer in
|
||||
# * the documentation and/or other materials provided with the
|
||||
# * distribution.
|
||||
# * . Neither the name of the Computer Systems and Communication Lab
|
||||
# * nor the names of its contributors may be used to endorse or
|
||||
# * promote products derived from this software without specific
|
||||
# * prior written permission.
|
||||
# *
|
||||
# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
# * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
# * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
|
||||
# * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
# * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
# * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||
# * OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
# */
|
||||
#
|
||||
# Copyright 1996 Chih-Hao Tsai @ Beckman Institute,
|
||||
# University of Illinois
|
||||
# c-tsai4@uiuc.edu http://casper.beckman.uiuc.edu/~c-tsai4
|
||||
#
|
||||
# ---------------COPYING.libtabe-----END--------------------------------
|
||||
#
|
||||
#
|
||||
# ---------------COPYING.ipadic-----BEGIN-------------------------------
|
||||
#
|
||||
# Copyright 2000, 2001, 2002, 2003 Nara Institute of Science
|
||||
# and Technology. All Rights Reserved.
|
||||
#
|
||||
# Use, reproduction, and distribution of this software is permitted.
|
||||
# Any copy of this software, whether in its original form or modified,
|
||||
# must include both the above copyright notice and the following
|
||||
# paragraphs.
|
||||
#
|
||||
# Nara Institute of Science and Technology (NAIST),
|
||||
# the copyright holders, disclaims all warranties with regard to this
|
||||
# software, including all implied warranties of merchantability and
|
||||
# fitness, in no event shall NAIST be liable for
|
||||
# any special, indirect or consequential damages or any damages
|
||||
# whatsoever resulting from loss of use, data or profits, whether in an
|
||||
# action of contract, negligence or other tortuous action, arising out
|
||||
# of or in connection with the use or performance of this software.
|
||||
#
|
||||
# A large portion of the dictionary entries
|
||||
# originate from ICOT Free Software. The following conditions for ICOT
|
||||
# Free Software applies to the current dictionary as well.
|
||||
#
|
||||
# Each User may also freely distribute the Program, whether in its
|
||||
# original form or modified, to any third party or parties, PROVIDED
|
||||
# that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
|
||||
# on, or be attached to, the Program, which is distributed substantially
|
||||
# in the same form as set out herein and that such intended
|
||||
# distribution, if actually made, will neither violate or otherwise
|
||||
# contravene any of the laws and regulations of the countries having
|
||||
# jurisdiction over the User or the intended distribution itself.
|
||||
#
|
||||
# NO WARRANTY
|
||||
#
|
||||
# The program was produced on an experimental basis in the course of the
|
||||
# research and development conducted during the project and is provided
|
||||
# to users as so produced on an experimental basis. Accordingly, the
|
||||
# program is provided without any warranty whatsoever, whether express,
|
||||
# implied, statutory or otherwise. The term "warranty" used herein
|
||||
# includes, but is not limited to, any warranty of the quality,
|
||||
# performance, merchantability and fitness for a particular purpose of
|
||||
# the program and the nonexistence of any infringement or violation of
|
||||
# any right of any third party.
|
||||
#
|
||||
# Each user of the program will agree and understand, and be deemed to
|
||||
# have agreed and understood, that there is no warranty whatsoever for
|
||||
# the program and, accordingly, the entire risk arising from or
|
||||
# otherwise connected with the program is assumed by the user.
|
||||
#
|
||||
# Therefore, neither ICOT, the copyright holder, or any other
|
||||
# organization that participated in or was otherwise related to the
|
||||
# development of the program and their respective officials, directors,
|
||||
# officers and other employees shall be held liable for any and all
|
||||
# damages, including, without limitation, general, special, incidental
|
||||
# and consequential damages, arising out of or otherwise in connection
|
||||
# with the use or inability to use the program or any product, material
|
||||
# or result produced or otherwise obtained by using the program,
|
||||
# regardless of whether they have been advised of, or otherwise had
|
||||
# knowledge of, the possibility of such damages at any time during the
|
||||
# project or thereafter. Each user will be deemed to have agreed to the
|
||||
# foregoing by his or her commencement of use of the program. The term
|
||||
# "use" as used herein includes, but is not limited to, the use,
|
||||
# modification, copying and distribution of the program and the
|
||||
# production of secondary products from the program.
|
||||
#
|
||||
# In the case where the program, whether in its original form or
|
||||
# modified, was distributed or delivered to or received by a user from
|
||||
# any person, organization or entity other than ICOT, unless it makes or
|
||||
# grants independently of ICOT any specific warranty to the user in
|
||||
# writing, such person, organization or entity, will also be exempted
|
||||
# from and not be held liable to the user for any such damages as noted
|
||||
# above as far as the program is concerned.
|
||||
#
|
||||
# ---------------COPYING.ipadic-----END----------------------------------
|
||||
|
||||
3. Lao Word Break Dictionary Data (laodict.txt)
|
||||
|
||||
# Copyright (c) 2013 International Business Machines Corporation
|
||||
# and others. All Rights Reserved.
|
||||
#
|
||||
# Project: http://code.google.com/p/lao-dictionary/
|
||||
# Dictionary: http://lao-dictionary.googlecode.com/git/Lao-Dictionary.txt
|
||||
# License: http://lao-dictionary.googlecode.com/git/Lao-Dictionary-LICENSE.txt
|
||||
# (copied below)
|
||||
#
|
||||
# This file is derived from the above dictionary, with slight
|
||||
# modifications.
|
||||
# ----------------------------------------------------------------------
|
||||
# Copyright (C) 2013 Brian Eugene Wilson, Robert Martin Campbell.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification,
|
||||
# are permitted provided that the following conditions are met:
|
||||
#
|
||||
#
|
||||
# Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer. Redistributions in
|
||||
# binary form must reproduce the above copyright notice, this list of
|
||||
# conditions and the following disclaimer in the documentation and/or
|
||||
# other materials provided with the distribution.
|
||||
#
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
|
||||
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
|
||||
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
||||
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
||||
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
|
||||
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||
# OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
4. Burmese Word Break Dictionary Data (burmesedict.txt)
|
||||
|
||||
# Copyright (c) 2014 International Business Machines Corporation
|
||||
# and others. All Rights Reserved.
|
||||
#
|
||||
# This list is part of a project hosted at:
|
||||
# github.com/kanyawtech/myanmar-karen-word-lists
|
||||
#
|
||||
# --------------------------------------------------------------------------
|
||||
# Copyright (c) 2013, LeRoy Benjamin Sharon
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions
|
||||
# are met: Redistributions of source code must retain the above
|
||||
# copyright notice, this list of conditions and the following
|
||||
# disclaimer. Redistributions in binary form must reproduce the
|
||||
# above copyright notice, this list of conditions and the following
|
||||
# disclaimer in the documentation and/or other materials provided
|
||||
# with the distribution.
|
||||
#
|
||||
# Neither the name Myanmar Karen Word Lists, nor the names of its
|
||||
# contributors may be used to endorse or promote products derived
|
||||
# from this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
|
||||
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
|
||||
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
|
||||
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
|
||||
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
|
||||
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
|
||||
# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
|
||||
# THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||
# SUCH DAMAGE.
|
||||
# --------------------------------------------------------------------------
|
||||
|
||||
5. Time Zone Database
|
||||
|
||||
ICU uses the public domain data and code derived from Time Zone
|
||||
Database for its time zone support. The ownership of the TZ database
|
||||
is explained in BCP 175: Procedure for Maintaining the Time Zone
|
||||
Database section 7.
|
||||
|
||||
# 7. Database Ownership
|
||||
#
|
||||
# The TZ database itself is not an IETF Contribution or an IETF
|
||||
# document. Rather it is a pre-existing and regularly updated work
|
||||
# that is in the public domain, and is intended to remain in the
|
||||
# public domain. Therefore, BCPs 78 [RFC5378] and 79 [RFC3979] do
|
||||
# not apply to the TZ Database or contributions that individuals make
|
||||
# to it. Should any claims be made and substantiated against the TZ
|
||||
# Database, the organization that is providing the IANA
|
||||
# Considerations defined in this RFC, under the memorandum of
|
||||
# understanding with the IETF, currently ICANN, may act in accordance
|
||||
# with all competent court orders. No ownership claims will be made
|
||||
# by ICANN or the IETF Trust on the database or the code. Any person
|
||||
# making a contribution to the database or code waives all rights to
|
||||
# future claims in that contribution or in the TZ Database.
|
||||
|
|
|
@ -1,25 +1,11 @@
|
|||
## The Unicode Standard, Unicode Character Database, Version 11.0.0
|
||||
|
||||
## The Unicode Standard, Unicode Character Database, Version 12.1.0
|
||||
|
||||
### Unicode Character Database
|
||||
|
||||
```
|
||||
|
||||
UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
|
||||
Unicode Data Files include all data files under the directories
|
||||
http://www.unicode.org/Public/, http://www.unicode.org/reports/,
|
||||
http://www.unicode.org/cldr/data/,
|
||||
http://source.icu-project.org/repos/icu/, and
|
||||
http://www.unicode.org/utility/trac/browser/.
|
||||
|
||||
Unicode Data Files do not include PDF online code charts under the
|
||||
directory http://www.unicode.org/Public/.
|
||||
|
||||
Software includes any source code published in the Unicode Standard
|
||||
or under the directories
|
||||
http://www.unicode.org/Public/, http://www.unicode.org/reports/,
|
||||
http://www.unicode.org/cldr/data/,
|
||||
http://source.icu-project.org/repos/icu/, and
|
||||
http://www.unicode.org/utility/trac/browser/.
|
||||
See Terms of Use for definitions of Unicode Inc.'s
|
||||
Data Files and Software.
|
||||
|
||||
NOTICE TO USER: Carefully read the following legal agreement.
|
||||
BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
|
||||
|
@ -31,8 +17,8 @@ THE DATA FILES OR SOFTWARE.
|
|||
|
||||
COPYRIGHT AND PERMISSION NOTICE
|
||||
|
||||
Copyright © 1991-2018 Unicode, Inc. All rights reserved.
|
||||
Distributed under the Terms of Use in http://www.unicode.org/copyright.html.
|
||||
Copyright © 1991-2019 Unicode, Inc. All rights reserved.
|
||||
Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of the Unicode data files and any associated documentation
|
||||
|
@ -62,5 +48,3 @@ Except as contained in this notice, the name of a copyright holder
|
|||
shall not be used in advertising or otherwise to promote the sale,
|
||||
use or other dealings in these Data Files or Software without prior
|
||||
written authorization of the copyright holder.
|
||||
|
||||
```
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue