8221431: Support for Unicode 12.1

Reviewed-by: erikj, rriggs
This commit is contained in:
Naoto Sato 2019-05-23 12:21:21 -07:00
parent e4f31b1cd7
commit 93fabcdc5a
66 changed files with 60279 additions and 38178 deletions

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,46 @@
/*
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package java.util.regex;
/**
* Holds data contained in the Unicode Technical Standard #51: Unicode
* Emoji.
*
* Currently it is only used for the rule "GB11" in UAX #29 Unicode Text
* Segmentation.
*/
final class EmojiData {
/**
* Returns whether the code point is an extended pictographic or not.
*
* @param cp code point to examine
* @return true if {@code cp} is an extended pictographic
*/
static boolean isExtendedPictographic(int cp) {
return
%%%EXTPICT%%%
}
}

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2016, 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -25,17 +25,56 @@
package java.util.regex;
import java.util.Objects;
final class Grapheme {
/**
* Determines if there is an extended grapheme cluster boundary between two
* continuing characters {@code cp1} and {@code cp2}.
* Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
* the start of the char sequence is a boundary.
* <p>
* See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
* for the extended grapheme cluster boundary rules
* for the extended grapheme cluster boundary rules. The following implementation
* is based on version 12.0 of the annex.
* (http://www.unicode.org/reports/tr29/tr29-35.html)
*
* @param src the {@code CharSequence} to be scanned
* @param off offset to start looking for the next boundary in the src
* @param limit limit offset in the src (exclusive)
* @return the next possible boundary
*/
static boolean isBoundary(int cp1, int cp2) {
return rules[getType(cp1)][getType(cp2)];
static int nextBoundary(CharSequence src, int off, int limit) {
Objects.checkFromToIndex(off, limit, src.length());
int ch0 = Character.codePointAt(src, 0);
int ret = Character.charCount(ch0);
int ch1;
// indicates whether gb11 or gb12 is underway
boolean gb11 = EmojiData.isExtendedPictographic(ch0);
int riCount = getType(ch0) == RI ? 1 : 0;
while (ret < limit) {
ch1 = Character.codePointAt(src, ret);
int t0 = getType(ch0);
int t1 = getType(ch1);
if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
gb11 = false;
} else if (riCount % 2 == 1 && t0 == RI && t1 == RI) {
// continue for gb12
} else if (rules[t0][t1]) {
if (ret > off) {
break;
} else {
gb11 = EmojiData.isExtendedPictographic(ch1);
riCount = 0;
}
}
riCount += getType(ch1) == RI ? 1 : 0;
ch0 = ch1;
ret += Character.charCount(ch1);
}
return ret;
}
// types
@ -44,22 +83,24 @@ final class Grapheme {
private static final int LF = 2;
private static final int CONTROL = 3;
private static final int EXTEND = 4;
private static final int RI = 5;
private static final int PREPEND = 6;
private static final int SPACINGMARK = 7;
private static final int L = 8;
private static final int V = 9;
private static final int T = 10;
private static final int LV = 11;
private static final int LVT = 12;
private static final int ZWJ = 5;
private static final int RI = 6;
private static final int PREPEND = 7;
private static final int SPACINGMARK = 8;
private static final int L = 9;
private static final int V = 10;
private static final int T = 11;
private static final int LV = 12;
private static final int LVT = 13;
private static final int EXTENDED_PICTOGRAPHIC = 14;
private static final int FIRST_TYPE = 0;
private static final int LAST_TYPE = 12;
private static final int LAST_TYPE = 14;
private static boolean[][] rules;
static {
rules = new boolean[LAST_TYPE + 1][LAST_TYPE + 1];
// default, any + any
// GB 999 Any + Any -> default
for (int i = FIRST_TYPE; i <= LAST_TYPE; i++)
for (int j = FIRST_TYPE; j <= LAST_TYPE; j++)
rules[i][j] = true;
@ -76,13 +117,12 @@ final class Grapheme {
// GB 8 (LVT | T) x T
rules[LVT][T] = false;
rules[T][T] = false;
// GB 8a RI x RI
rules[RI][RI] = false;
// GB 9 x Extend
// GB 9 x (Extend|ZWJ)
// GB 9a x Spacing Mark
// GB 9b Prepend x
for (int i = FIRST_TYPE; i <= LAST_TYPE; i++) {
rules[i][EXTEND] = false;
rules[i][ZWJ] = false;
rules[i][SPACINGMARK] = false;
rules[PREPEND][i] = false;
}
@ -95,7 +135,9 @@ final class Grapheme {
}
// GB 3 CR x LF
rules[CR][LF] = false;
// GB 10 Any + Any -> default
// GB 11 Exended_Pictographic x (Extend|ZWJ)
rules[EXTENDED_PICTOGRAPHIC][EXTEND] = false;
rules[EXTENDED_PICTOGRAPHIC][ZWJ] = false;
}
// Hangul syllables
@ -123,6 +165,10 @@ final class Grapheme {
@SuppressWarnings("fallthrough")
private static int getType(int cp) {
if (EmojiData.isExtendedPictographic(cp)) {
return EXTENDED_PICTOGRAPHIC;
}
int type = Character.getType(cp);
switch(type) {
case Character.CONTROL:
@ -131,29 +177,36 @@ final class Grapheme {
if (cp == 0x000A)
return LF;
return CONTROL;
case Character.UNASSIGNED:
case Character.UNASSIGNED:
// NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control
// but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other"
// so type it as "Other" to make the test happy
if (cp == 0x0378)
return OTHER;
if (cp == 0x0378)
return OTHER;
case Character.LINE_SEPARATOR:
case Character.PARAGRAPH_SEPARATOR:
case Character.SURROGATE:
return CONTROL;
case Character.FORMAT:
if (cp == 0x200C || cp == 0x200D)
if (cp == 0x200C ||
cp >= 0xE0020 && cp <= 0xE007F)
return EXTEND;
if (cp == 0x200D)
return ZWJ;
if (cp >= 0x0600 && cp <= 0x0605 ||
cp == 0x06DD || cp == 0x070F || cp == 0x08E2 ||
cp == 0x110BD || cp == 0x110CD)
return PREPEND;
return CONTROL;
case Character.NON_SPACING_MARK:
case Character.ENCLOSING_MARK:
// NOTE:
// #tr29 "plus a few General_Category = Spacing_Mark needed for
// canonical equivalence."
// but for "extended grapheme clusters" support, there is no
// need actually to diff "extend" and "spackmark" given GB9, GB9a
return EXTEND;
// NOTE:
// #tr29 "plus a few General_Category = Spacing_Mark needed for
// canonical equivalence."
// but for "extended grapheme clusters" support, there is no
// need actually to diff "extend" and "spackmark" given GB9, GB9a
return EXTEND;
case Character.COMBINING_SPACING_MARK:
if (isExcludedSpacingMark(cp))
return OTHER;
@ -167,9 +220,11 @@ final class Grapheme {
return RI;
return OTHER;
case Character.MODIFIER_LETTER:
case Character.MODIFIER_SYMBOL:
// WARNING:
// not mentioned in #tr29 but listed in GraphemeBreakProperty.txt
if (cp == 0xFF9E || cp == 0xFF9F)
if (cp == 0xFF9E || cp == 0xFF9F ||
cp >= 0x1F3FB && cp <= 0x1F3FF)
return EXTEND;
return OTHER;
case Character.OTHER_LETTER:
@ -199,6 +254,22 @@ final class Grapheme {
return V;
if (cp >= 0xD7CB && cp <= 0xD7FB)
return T;
// Prepend
switch (cp) {
case 0x0D4E:
case 0x111C2:
case 0x111C3:
case 0x11A3A:
case 0x11A84:
case 0x11A85:
case 0x11A86:
case 0x11A87:
case 0x11A88:
case 0x11A89:
case 0x11D46:
return PREPEND;
}
}
return OTHER;
}

View file

@ -540,7 +540,7 @@ import jdk.internal.util.ArraysSupport;
* <p> This class is in conformance with Level 1 of <a
* href="http://www.unicode.org/reports/tr18/"><i>Unicode Technical
* Standard #18: Unicode Regular Expression</i></a>, plus RL2.1
* Canonical Equivalents.
* Canonical Equivalents and RL2.2 Extended Grapheme Clusters.
* <p>
* <b>Unicode escape sequences</b> such as <code>&#92;u2014</code> in Java source code
* are processed as described in section 3.3 of
@ -1501,15 +1501,8 @@ public final class Pattern
off++;
continue;
}
int j = off + Character.charCount(ch0);
int j = Grapheme.nextBoundary(src, off, limit);
int ch1;
while (j < limit) {
ch1 = src.codePointAt(j);
if (Grapheme.isBoundary(ch0, ch1))
break;
ch0 = ch1;
j += Character.charCount(ch1);
}
String seq = src.substring(off, j);
String nfd = Normalizer.normalize(seq, Normalizer.Form.NFD);
off = j;
@ -3975,14 +3968,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
if (i < matcher.to) {
int ch0 = Character.codePointAt(seq, i);
int n = Character.charCount(ch0);
int j = i + n;
while (j < matcher.to) {
int ch1 = Character.codePointAt(seq, j);
if (Grapheme.isBoundary(ch0, ch1))
break;
ch0 = ch1;
j += Character.charCount(ch1);
}
int j = Grapheme.nextBoundary(seq, i, matcher.to);
if (i + n == j) { // single, assume nfc cp
if (predicate.is(ch0))
return next.match(matcher, j, seq);
@ -4021,15 +4007,7 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
static class XGrapheme extends Node {
boolean match(Matcher matcher, int i, CharSequence seq) {
if (i < matcher.to) {
int ch0 = Character.codePointAt(seq, i);
i += Character.charCount(ch0);
while (i < matcher.to) {
int ch1 = Character.codePointAt(seq, i);
if (Grapheme.isBoundary(ch0, ch1))
break;
ch0 = ch1;
i += Character.charCount(ch1);
}
i = Grapheme.nextBoundary(seq, i, matcher.to);
return next.match(matcher, i, seq);
}
matcher.hitEnd = true;
@ -4059,8 +4037,9 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
}
if (i < endIndex) {
if (Character.isSurrogatePair(seq.charAt(i-1), seq.charAt(i)) ||
!Grapheme.isBoundary(Character.codePointBefore(seq, i),
Character.codePointAt(seq, i))) {
Grapheme.nextBoundary(seq,
i - Character.charCount(Character.codePointBefore(seq, i)),
i + Character.charCount(Character.codePointAt(seq, i))) > i) {
return false;
}
} else {

View file

@ -0,0 +1,501 @@
/*
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
// (c) 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
// created: 2018may10 Markus W. Scherer
package sun.text.normalizer;
import java.util.Iterator;
import java.util.NoSuchElementException;
/**
* Abstract map from Unicode code points (U+0000..U+10FFFF) to integer values.
* This does not implement java.util.Map.
*
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public abstract class CodePointMap implements Iterable<CodePointMap.Range> {
/**
* Selectors for how getRange() should report value ranges overlapping with surrogates.
* Most users should use NORMAL.
*
* @see #getRange
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public enum RangeOption {
/**
* getRange() enumerates all same-value ranges as stored in the map.
* Most users should use this option.
*
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
NORMAL,
/**
* getRange() enumerates all same-value ranges as stored in the map,
* except that lead surrogates (U+D800..U+DBFF) are treated as having the
* surrogateValue, which is passed to getRange() as a separate parameter.
* The surrogateValue is not transformed via filter().
* See {@link Character#isHighSurrogate}.
*
* <p>Most users should use NORMAL instead.
*
* <p>This option is useful for maps that map surrogate code *units* to
* special values optimized for UTF-16 string processing
* or for special error behavior for unpaired surrogates,
* but those values are not to be associated with the lead surrogate code *points*.
*
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
FIXED_LEAD_SURROGATES,
/**
* getRange() enumerates all same-value ranges as stored in the map,
* except that all surrogates (U+D800..U+DFFF) are treated as having the
* surrogateValue, which is passed to getRange() as a separate parameter.
* The surrogateValue is not transformed via filter().
* See {@link Character#isSurrogate}.
*
* <p>Most users should use NORMAL instead.
*
* <p>This option is useful for maps that map surrogate code *units* to
* special values optimized for UTF-16 string processing
* or for special error behavior for unpaired surrogates,
* but those values are not to be associated with the lead surrogate code *points*.
*
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
FIXED_ALL_SURROGATES
}
/**
* Callback function interface: Modifies a map value.
* Optionally called by getRange().
* The modified value will be returned by the getRange() function.
*
* <p>Can be used to ignore some of the value bits,
* make a filter for one of several values,
* return a value index computed from the map value, etc.
*
* @see #getRange
* @see #iterator
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public interface ValueFilter {
/**
* Modifies the map value.
*
* @param value map value
* @return modified value
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public int apply(int value);
}
/**
* Range iteration result data.
* Code points from start to end map to the same value.
* The value may have been modified by {@link ValueFilter#apply(int)},
* or it may be the surrogateValue if a RangeOption other than "normal" was used.
*
* @see #getRange
* @see #iterator
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public static final class Range {
private int start;
private int end;
private int value;
/**
* Constructor. Sets start and end to -1 and value to 0.
*
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public Range() {
start = end = -1;
value = 0;
}
/**
* @return the start code point
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public int getStart() { return start; }
/**
* @return the (inclusive) end code point
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public int getEnd() { return end; }
/**
* @return the range value
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public int getValue() { return value; }
/**
* Sets the range. When using {@link #iterator()},
* iteration will resume after the newly set end.
*
* @param start new start code point
* @param end new end code point
* @param value new value
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public void set(int start, int end, int value) {
this.start = start;
this.end = end;
this.value = value;
}
}
private final class RangeIterator implements Iterator<Range> {
private Range range = new Range();
@Override
public boolean hasNext() {
return -1 <= range.end && range.end < 0x10ffff;
}
@Override
public Range next() {
if (getRange(range.end + 1, null, range)) {
return range;
} else {
throw new NoSuchElementException();
}
}
@Override
public final void remove() {
throw new UnsupportedOperationException();
}
}
/**
* Iterates over code points of a string and fetches map values.
* This does not implement java.util.Iterator.
*
* <pre>
* void onString(CodePointMap map, CharSequence s, int start) {
* CodePointMap.StringIterator iter = map.stringIterator(s, start);
* while (iter.next()) {
* int end = iter.getIndex(); // code point from between start and end
* useValue(s, start, end, iter.getCodePoint(), iter.getValue());
* start = end;
* }
* }
* </pre>
*
* <p>This class is not intended for public subclassing.
*
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public class StringIterator {
/**
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
protected CharSequence s;
/**
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
protected int sIndex;
/**
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
protected int c;
/**
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
protected int value;
/**
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
protected StringIterator(CharSequence s, int sIndex) {
this.s = s;
this.sIndex = sIndex;
c = -1;
value = 0;
}
/**
* Resets the iterator to a new string and/or a new string index.
*
* @param s string to iterate over
* @param sIndex string index where the iteration will start
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public void reset(CharSequence s, int sIndex) {
this.s = s;
this.sIndex = sIndex;
c = -1;
value = 0;
}
/**
* Reads the next code point, post-increments the string index,
* and gets a value from the map.
* Sets an implementation-defined error value if the code point is an unpaired surrogate.
*
* @return true if the string index was not yet at the end of the string;
* otherwise the iterator did not advance
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public boolean next() {
if (sIndex >= s.length()) {
return false;
}
c = Character.codePointAt(s, sIndex);
sIndex += Character.charCount(c);
value = get(c);
return true;
}
/**
* Reads the previous code point, pre-decrements the string index,
* and gets a value from the map.
* Sets an implementation-defined error value if the code point is an unpaired surrogate.
*
* @return true if the string index was not yet at the start of the string;
* otherwise the iterator did not advance
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public boolean previous() {
if (sIndex <= 0) {
return false;
}
c = Character.codePointBefore(s, sIndex);
sIndex -= Character.charCount(c);
value = get(c);
return true;
}
/**
* @return the string index
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public final int getIndex() { return sIndex; }
/**
* @return the code point
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public final int getCodePoint() { return c; }
/**
* @return the map value,
* or an implementation-defined error value if
* the code point is an unpaired surrogate
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public final int getValue() { return value; }
}
/**
* Protected no-args constructor.
*
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
protected CodePointMap() {
}
/**
* Returns the value for a code point as stored in the map, with range checking.
* Returns an implementation-defined error value if c is not in the range 0..U+10FFFF.
*
* @param c the code point
* @return the map value,
* or an implementation-defined error value if
* the code point is not in the range 0..U+10FFFF
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public abstract int get(int c);
/**
* Sets the range object to a range of code points beginning with the start parameter.
* The range start is the same as the start input parameter
* (even if there are preceding code points that have the same value).
* The range end is the last code point such that
* all those from start to there have the same value.
* Returns false if start is not 0..U+10FFFF.
* Can be used to efficiently iterate over all same-value ranges in a map.
* (This is normally faster than iterating over code points and get()ting each value,
* but may be much slower than a data structure that stores ranges directly.)
*
* <p>If the {@link ValueFilter} parameter is not null, then
* the value to be delivered is passed through that filter, and the return value is the end
* of the range where all values are modified to the same actual value.
* The value is unchanged if that parameter is null.
*
* <p>Example:
* <pre>
* int start = 0;
* CodePointMap.Range range = new CodePointMap.Range();
* while (map.getRange(start, null, range)) {
* int end = range.getEnd();
* int value = range.getValue();
* // Work with the range start..end and its value.
* start = end + 1;
* }
* </pre>
*
* @param start range start
* @param filter an object that may modify the map data value,
* or null if the values from the map are to be used unmodified
* @param range the range object that will be set to the code point range and value
* @return true if start is 0..U+10FFFF; otherwise no new range is fetched
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public abstract boolean getRange(int start, ValueFilter filter, Range range);
/**
* Sets the range object to a range of code points beginning with the start parameter.
* The range start is the same as the start input parameter
* (even if there are preceding code points that have the same value).
* The range end is the last code point such that
* all those from start to there have the same value.
* Returns false if start is not 0..U+10FFFF.
*
* <p>Same as the simpler {@link #getRange(int, ValueFilter, Range)} but optionally
* modifies the range if it overlaps with surrogate code points.
*
* @param start range start
* @param option defines whether surrogates are treated normally,
* or as having the surrogateValue; usually {@link RangeOption#NORMAL}
* @param surrogateValue value for surrogates; ignored if option=={@link RangeOption#NORMAL}
* @param filter an object that may modify the map data value,
* or null if the values from the map are to be used unmodified
* @param range the range object that will be set to the code point range and value
* @return true if start is 0..U+10FFFF; otherwise no new range is fetched
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public boolean getRange(int start, RangeOption option, int surrogateValue,
ValueFilter filter, Range range) {
assert option != null;
if (!getRange(start, filter, range)) {
return false;
}
if (option == RangeOption.NORMAL) {
return true;
}
int surrEnd = option == RangeOption.FIXED_ALL_SURROGATES ? 0xdfff : 0xdbff;
int end = range.end;
if (end < 0xd7ff || start > surrEnd) {
return true;
}
// The range overlaps with surrogates, or ends just before the first one.
if (range.value == surrogateValue) {
if (end >= surrEnd) {
// Surrogates followed by a non-surrValue range,
// or surrogates are part of a larger surrValue range.
return true;
}
} else {
if (start <= 0xd7ff) {
range.end = 0xd7ff; // Non-surrValue range ends before surrValue surrogates.
return true;
}
// Start is a surrogate with a non-surrValue code *unit* value.
// Return a surrValue code *point* range.
range.value = surrogateValue;
if (end > surrEnd) {
range.end = surrEnd; // Surrogate range ends before non-surrValue rest of range.
return true;
}
}
// See if the surrValue surrogate range can be merged with
// an immediately following range.
if (getRange(surrEnd + 1, filter, range) && range.value == surrogateValue) {
range.start = start;
return true;
}
range.start = start;
range.end = surrEnd;
range.value = surrogateValue;
return true;
}
/**
* Convenience iterator over same-map-value code point ranges.
* Same as looping over all ranges with {@link #getRange(int, ValueFilter, Range)}
* without filtering.
* Adjacent ranges have different map values.
*
* <p>The iterator always returns the same Range object.
*
* @return a Range iterator
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
@Override
public Iterator<Range> iterator() {
return new RangeIterator();
}
/**
* Returns an iterator (not a java.util.Iterator) over code points of a string
* for fetching map values.
*
* @param s string to iterate over
* @param sIndex string index where the iteration will start
* @return the iterator
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public StringIterator stringIterator(CharSequence s, int sIndex) {
return new StringIterator(s, sIndex);
}
}

File diff suppressed because it is too large Load diff

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -119,10 +119,7 @@ public final class ICUBinary {
} else if (capacity < 0x4000) {
capacity *= 2; // Grow faster until we reach 16kB.
}
// TODO Java 6 replace new byte[] and arraycopy(): bytes = Arrays.copyOf(bytes, capacity);
byte[] newBytes = new byte[capacity];
System.arraycopy(bytes, 0, newBytes, 0, length);
bytes = newBytes;
bytes = Arrays.copyOf(bytes, capacity);
bytes[length++] = (byte) nextByte;
}
}
@ -264,6 +261,36 @@ public final class ICUBinary {
}
}
public static byte[] getBytes(ByteBuffer bytes, int length, int additionalSkipLength) {
byte[] dest = new byte[length];
bytes.get(dest);
if (additionalSkipLength > 0) {
skipBytes(bytes, additionalSkipLength);
}
return dest;
}
public static String getString(ByteBuffer bytes, int length, int additionalSkipLength) {
CharSequence cs = bytes.asCharBuffer();
String s = cs.subSequence(0, length).toString();
skipBytes(bytes, length * 2 + additionalSkipLength);
return s;
}
public static char[] getChars(ByteBuffer bytes, int length, int additionalSkipLength) {
char[] dest = new char[length];
bytes.asCharBuffer().get(dest);
skipBytes(bytes, length * 2 + additionalSkipLength);
return dest;
}
public static int[] getInts(ByteBuffer bytes, int length, int additionalSkipLength) {
int[] dest = new int[length];
bytes.asIntBuffer().get(dest);
skipBytes(bytes, length * 4 + additionalSkipLength);
return dest;
}
/**
* Returns a VersionInfo for the bytes in the compact version integer.
*/

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015, 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -265,7 +265,7 @@ final class Norm2AllModes {
private static final class Norm2AllModesSingleton {
private Norm2AllModesSingleton(String name) {
try {
String DATA_FILE_NAME = "/sun/text/resources/" + name + ".icu";
String DATA_FILE_NAME = "/sun/text/resources/" + name + ".nrm";
NormalizerImpl impl=new NormalizerImpl().load(DATA_FILE_NAME);
allModes=new Norm2AllModes(impl);
} catch (RuntimeException e) {

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2009, 2018, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2009, 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -61,7 +61,7 @@ public final class NormalizerImpl {
return 0<=c && c<HANGUL_COUNT && c%JAMO_T_COUNT==0;
}
/**
/**
* Decomposes c, which must be a Hangul syllable, into buffer
* and returns the length of the decomposition (2 or 3).
*/
@ -145,8 +145,7 @@ public final class NormalizerImpl {
insert(c, cc);
}
}
// s must be in NFD, otherwise change the implementation.
public void append(CharSequence s, int start, int limit,
public void append(CharSequence s, int start, int limit, boolean isNFD,
int leadCC, int trailCC) {
if(start==limit) {
return;
@ -167,8 +166,11 @@ public final class NormalizerImpl {
c=Character.codePointAt(s, start);
start+=Character.charCount(c);
if(start<limit) {
// s must be in NFD, otherwise we need to use getCC().
leadCC=getCCFromYesOrMaybe(impl.getNorm16(c));
if (isNFD) {
leadCC = getCCFromYesOrMaybe(impl.getNorm16(c));
} else {
leadCC = impl.getCC(impl.getNorm16(c));
}
} else {
leadCC=trailCC;
}
@ -310,6 +312,12 @@ public final class NormalizerImpl {
// TODO: Propose widening UTF16 methods that take char to take int.
// TODO: Propose widening UTF16 methods that take String to take CharSequence.
public static final class UTF16Plus {
/**
* Is this code point a lead surrogate (U+d800..U+dbff)?
* @param c code unit or code point
* @return true or false
*/
public static boolean isLeadSurrogate(int c) { return (c & 0xfffffc00) == 0xd800; }
/**
* Assuming c is a surrogate code point (UTF16.isSurrogate(c)),
* is it a lead surrogate?
@ -350,7 +358,7 @@ public final class NormalizerImpl {
private static final class IsAcceptable implements ICUBinary.Authenticate {
public boolean isDataVersionAcceptable(byte version[]) {
return version[0]==3;
return version[0]==4;
}
}
private static final IsAcceptable IS_ACCEPTABLE = new IsAcceptable();
@ -387,8 +395,9 @@ public final class NormalizerImpl {
// Read the normTrie.
int offset=inIndexes[IX_NORM_TRIE_OFFSET];
int nextOffset=inIndexes[IX_EXTRA_DATA_OFFSET];
normTrie=Trie2_16.createFromSerialized(bytes);
int trieLength=normTrie.getSerializedLength();
int triePosition = bytes.position();
normTrie = CodePointTrie.Fast16.fromBinary(bytes);
int trieLength = bytes.position() - triePosition;
if(trieLength>(nextOffset-offset)) {
throw new InternalError("Normalizer2 data: not enough bytes for normTrie");
}
@ -398,13 +407,8 @@ public final class NormalizerImpl {
offset=nextOffset;
nextOffset=inIndexes[IX_SMALL_FCD_OFFSET];
int numChars=(nextOffset-offset)/2;
char[] chars;
if(numChars!=0) {
chars=new char[numChars];
for(int i=0; i<numChars; ++i) {
chars[i]=bytes.getChar();
}
maybeYesCompositions=new String(chars);
maybeYesCompositions=ICUBinary.getString(bytes, numChars, 0);
extraData=maybeYesCompositions.substring((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT);
}
@ -422,8 +426,12 @@ public final class NormalizerImpl {
return load(ICUBinary.getRequiredData(name));
}
public int getNorm16(int c) { return normTrie.get(c); }
// The trie stores values for lead surrogate code *units*.
// Surrogate code *points* are inert.
public int getNorm16(int c) {
return UTF16Plus.isLeadSurrogate(c) ? INERT : normTrie.get(c);
}
public int getRawNorm16(int c) { return normTrie.get(c); }
public boolean isAlgorithmicNoNo(int norm16) { return limitNoNo<=norm16 && norm16<minMaybeYes; }
public boolean isCompNo(int norm16) { return minNoNo<=norm16 && norm16<minMaybeYes; }
public boolean isDecompYes(int norm16) { return norm16<minYesNo || minMaybeYes<=norm16; }
@ -486,7 +494,7 @@ public final class NormalizerImpl {
}
// Maps to an isCompYesAndZeroCC.
c=mapAlgorithmic(c, norm16);
norm16=getNorm16(c);
norm16=getRawNorm16(c);
}
}
if(norm16<=minYesNo || isHangulLVT(norm16)) {
@ -519,7 +527,7 @@ public final class NormalizerImpl {
// Maps to an isCompYesAndZeroCC.
decomp=c=mapAlgorithmic(c, norm16);
// The mapping might decompose further.
norm16 = getNorm16(c);
norm16 = getRawNorm16(c);
}
if (norm16 < minYesNo) {
if(decomp<0) {
@ -641,27 +649,23 @@ public final class NormalizerImpl {
// count code units below the minimum or with irrelevant data for the quick check
for(prevSrc=src; src!=limit;) {
if( (c=s.charAt(src))<minNoCP ||
isMostDecompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
isMostDecompYesAndZeroCC(norm16=normTrie.bmpGet(c))
) {
++src;
} else if(!UTF16.isSurrogate((char)c)) {
} else if(!UTF16Plus.isLeadSurrogate(c)) {
break;
} else {
char c2;
if(UTF16Plus.isSurrogateLead(c)) {
if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
c=Character.toCodePoint((char)c, c2);
if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
c = Character.toCodePoint((char)c, c2);
norm16 = normTrie.suppGet(c);
if (isMostDecompYesAndZeroCC(norm16)) {
src += 2;
} else {
break;
}
} else /* trail surrogate */ {
if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
--src;
c=Character.toCodePoint(c2, (char)c);
}
}
if(isMostDecompYesAndZeroCC(norm16=getNorm16(c))) {
src+=Character.charCount(c);
} else {
break;
++src; // unpaired lead surrogate: inert
}
}
}
@ -721,7 +725,7 @@ public final class NormalizerImpl {
c=Character.codePointAt(s, src);
cc=getCC(getNorm16(c));
};
buffer.append(s, 0, src, firstCC, prevCC);
buffer.append(s, 0, src, false, firstCC, prevCC);
buffer.append(s, src, limit);
}
@ -749,28 +753,22 @@ public final class NormalizerImpl {
return true;
}
if( (c=s.charAt(src))<minNoMaybeCP ||
isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
isCompYesAndZeroCC(norm16=normTrie.bmpGet(c))
) {
++src;
} else {
prevSrc = src++;
if(!UTF16.isSurrogate((char)c)) {
if (!UTF16Plus.isLeadSurrogate(c)) {
break;
} else {
char c2;
if(UTF16Plus.isSurrogateLead(c)) {
if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
++src;
c=Character.toCodePoint((char)c, c2);
if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) {
++src;
c = Character.toCodePoint((char)c, c2);
norm16 = normTrie.suppGet(c);
if (!isCompYesAndZeroCC(norm16)) {
break;
}
} else /* trail surrogate */ {
if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
--prevSrc;
c=Character.toCodePoint(c2, (char)c);
}
}
if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
break;
}
}
}
@ -991,28 +989,22 @@ public final class NormalizerImpl {
return (src<<1)|qcResult; // "yes" or "maybe"
}
if( (c=s.charAt(src))<minNoMaybeCP ||
isCompYesAndZeroCC(norm16=normTrie.getFromU16SingleLead((char)c))
isCompYesAndZeroCC(norm16=normTrie.bmpGet(c))
) {
++src;
} else {
prevSrc = src++;
if(!UTF16.isSurrogate((char)c)) {
if (!UTF16Plus.isLeadSurrogate(c)) {
break;
} else {
char c2;
if(UTF16Plus.isSurrogateLead(c)) {
if(src!=limit && Character.isLowSurrogate(c2=s.charAt(src))) {
++src;
c=Character.toCodePoint((char)c, c2);
if (src != limit && Character.isLowSurrogate(c2 = s.charAt(src))) {
++src;
c = Character.toCodePoint((char)c, c2);
norm16 = normTrie.suppGet(c);
if (!isCompYesAndZeroCC(norm16)) {
break;
}
} else /* trail surrogate */ {
if(prevBoundary<prevSrc && Character.isHighSurrogate(c2=s.charAt(prevSrc-1))) {
--prevSrc;
c=Character.toCodePoint(c2, (char)c);
}
}
if(!isCompYesAndZeroCC(norm16=getNorm16(c))) {
break;
}
}
}
@ -1134,17 +1126,10 @@ public final class NormalizerImpl {
prevFCD16=0;
++src;
} else {
if(UTF16.isSurrogate((char)c)) {
if (UTF16Plus.isLeadSurrogate(c)) {
char c2;
if(UTF16Plus.isSurrogateLead(c)) {
if((src+1)!=limit && Character.isLowSurrogate(c2=s.charAt(src+1))) {
c=Character.toCodePoint((char)c, c2);
}
} else /* trail surrogate */ {
if(prevSrc<src && Character.isHighSurrogate(c2=s.charAt(src-1))) {
--src;
c=Character.toCodePoint(c2, (char)c);
}
if ((src + 1) != limit && Character.isLowSurrogate(c2 = s.charAt(src + 1))) {
c = Character.toCodePoint((char)c, c2);
}
}
if((fcd16=getFCD16FromNormData(c))<=0xff) {
@ -1430,7 +1415,7 @@ public final class NormalizerImpl {
}
// Maps to an isCompYesAndZeroCC.
c=mapAlgorithmic(c, norm16);
norm16=getNorm16(c);
norm16=getRawNorm16(c);
}
if (norm16 < minYesNo) {
// c does not decompose
@ -1451,7 +1436,7 @@ public final class NormalizerImpl {
leadCC=0;
}
++mapping; // skip over the firstUnit
buffer.append(extraData, mapping, mapping+length, leadCC, trailCC);
buffer.append(extraData, mapping, mapping+length, true, leadCC, trailCC);
}
}
@ -1643,7 +1628,7 @@ public final class NormalizerImpl {
// Is the composite a starter that combines forward?
if((compositeAndFwd&1)!=0) {
compositionsList=
getCompositionsListForComposite(getNorm16(composite));
getCompositionsListForComposite(getRawNorm16(composite));
} else {
compositionsList=-1;
}
@ -2196,9 +2181,8 @@ public final class NormalizerImpl {
private int centerNoNoDelta;
private int minMaybeYes;
private Trie2_16 normTrie;
private CodePointTrie.Fast16 normTrie;
private String maybeYesCompositions;
private String extraData; // mappings and/or compositions for yesYes, yesNo & noNo characters
private byte[] smallFCD; // [0x100] one bit per 32 BMP code points, set if any FCD!=0
}
}

View file

@ -1,38 +1,11 @@
## International Components for Unicode (ICU4J) v62.1
## International Components for Unicode (ICU4J) v64.2
### ICU4J License
```
COPYRIGHT AND PERMISSION NOTICE (ICU 58 and later)
UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
Unicode Data Files include all data files under the directories
http://www.unicode.org/Public/, http://www.unicode.org/reports/,
http://www.unicode.org/cldr/data/,
http://source.icu-project.org/repos/icu/, and
http://www.unicode.org/utility/trac/browser/.
Unicode Data Files do not include PDF online code charts under the
directory http://www.unicode.org/Public/.
Software includes any source code published in the Unicode Standard
or under the directories
http://www.unicode.org/Public/, http://www.unicode.org/reports/,
http://www.unicode.org/cldr/data/,
http://source.icu-project.org/repos/icu/, and
http://www.unicode.org/utility/trac/browser/.
NOTICE TO USER: Carefully read the following legal agreement.
BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"),
YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE
TERMS AND CONDITIONS OF THIS AGREEMENT.
IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE
THE DATA FILES OR SOFTWARE.
COPYRIGHT AND PERMISSION NOTICE
Copyright © 1991-2018 Unicode, Inc. All rights reserved.
Distributed under the Terms of Use in http://www.unicode.org/copyright.html.
Copyright © 1991-2019 Unicode, Inc. All rights reserved.
Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
Permission is hereby granted, free of charge, to any person obtaining
a copy of the Unicode data files and any associated documentation
@ -63,4 +36,354 @@ shall not be used in advertising or otherwise to promote the sale,
use or other dealings in these Data Files or Software without prior
written authorization of the copyright holder.
```
---------------------
Third-Party Software Licenses
This section contains third-party software notices and/or additional
terms for licensed third-party software components included within ICU
libraries.
1. ICU License - ICU 1.8.1 to ICU 57.1
COPYRIGHT AND PERMISSION NOTICE
Copyright (c) 1995-2016 International Business Machines Corporation and others
All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining
a copy of this software and associated documentation files (the
"Software"), to deal in the Software without restriction, including
without limitation the rights to use, copy, modify, merge, publish,
distribute, and/or sell copies of the Software, and to permit persons
to whom the Software is furnished to do so, provided that the above
copyright notice(s) and this permission notice appear in all copies of
the Software and that both the above copyright notice(s) and this
permission notice appear in supporting documentation.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY
SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
Except as contained in this notice, the name of a copyright holder
shall not be used in advertising or otherwise to promote the sale, use
or other dealings in this Software without prior written authorization
of the copyright holder.
All trademarks and registered trademarks mentioned herein are the
property of their respective owners.
2. Chinese/Japanese Word Break Dictionary Data (cjdict.txt)
# The Google Chrome software developed by Google is licensed under
# the BSD license. Other software included in this distribution is
# provided under other licenses, as set forth below.
#
# The BSD License
# http://opensource.org/licenses/bsd-license.php
# Copyright (C) 2006-2008, Google Inc.
#
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# Redistributions of source code must retain the above copyright notice,
# this list of conditions and the following disclaimer.
# Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided with
# the distribution.
# Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
#
# The word list in cjdict.txt are generated by combining three word lists
# listed below with further processing for compound word breaking. The
# frequency is generated with an iterative training against Google web
# corpora.
#
# * Libtabe (Chinese)
# - https://sourceforge.net/project/?group_id=1519
# - Its license terms and conditions are shown below.
#
# * IPADIC (Japanese)
# - http://chasen.aist-nara.ac.jp/chasen/distribution.html
# - Its license terms and conditions are shown below.
#
# ---------COPYING.libtabe ---- BEGIN--------------------
#
# /*
# * Copyright (c) 1999 TaBE Project.
# * Copyright (c) 1999 Pai-Hsiang Hsiao.
# * All rights reserved.
# *
# * Redistribution and use in source and binary forms, with or without
# * modification, are permitted provided that the following conditions
# * are met:
# *
# * . Redistributions of source code must retain the above copyright
# * notice, this list of conditions and the following disclaimer.
# * . Redistributions in binary form must reproduce the above copyright
# * notice, this list of conditions and the following disclaimer in
# * the documentation and/or other materials provided with the
# * distribution.
# * . Neither the name of the TaBE Project nor the names of its
# * contributors may be used to endorse or promote products derived
# * from this software without specific prior written permission.
# *
# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# * OF THE POSSIBILITY OF SUCH DAMAGE.
# */
#
# /*
# * Copyright (c) 1999 Computer Systems and Communication Lab,
# * Institute of Information Science, Academia
# * Sinica. All rights reserved.
# *
# * Redistribution and use in source and binary forms, with or without
# * modification, are permitted provided that the following conditions
# * are met:
# *
# * . Redistributions of source code must retain the above copyright
# * notice, this list of conditions and the following disclaimer.
# * . Redistributions in binary form must reproduce the above copyright
# * notice, this list of conditions and the following disclaimer in
# * the documentation and/or other materials provided with the
# * distribution.
# * . Neither the name of the Computer Systems and Communication Lab
# * nor the names of its contributors may be used to endorse or
# * promote products derived from this software without specific
# * prior written permission.
# *
# * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# * OF THE POSSIBILITY OF SUCH DAMAGE.
# */
#
# Copyright 1996 Chih-Hao Tsai @ Beckman Institute,
# University of Illinois
# c-tsai4@uiuc.edu http://casper.beckman.uiuc.edu/~c-tsai4
#
# ---------------COPYING.libtabe-----END--------------------------------
#
#
# ---------------COPYING.ipadic-----BEGIN-------------------------------
#
# Copyright 2000, 2001, 2002, 2003 Nara Institute of Science
# and Technology. All Rights Reserved.
#
# Use, reproduction, and distribution of this software is permitted.
# Any copy of this software, whether in its original form or modified,
# must include both the above copyright notice and the following
# paragraphs.
#
# Nara Institute of Science and Technology (NAIST),
# the copyright holders, disclaims all warranties with regard to this
# software, including all implied warranties of merchantability and
# fitness, in no event shall NAIST be liable for
# any special, indirect or consequential damages or any damages
# whatsoever resulting from loss of use, data or profits, whether in an
# action of contract, negligence or other tortuous action, arising out
# of or in connection with the use or performance of this software.
#
# A large portion of the dictionary entries
# originate from ICOT Free Software. The following conditions for ICOT
# Free Software applies to the current dictionary as well.
#
# Each User may also freely distribute the Program, whether in its
# original form or modified, to any third party or parties, PROVIDED
# that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
# on, or be attached to, the Program, which is distributed substantially
# in the same form as set out herein and that such intended
# distribution, if actually made, will neither violate or otherwise
# contravene any of the laws and regulations of the countries having
# jurisdiction over the User or the intended distribution itself.
#
# NO WARRANTY
#
# The program was produced on an experimental basis in the course of the
# research and development conducted during the project and is provided
# to users as so produced on an experimental basis. Accordingly, the
# program is provided without any warranty whatsoever, whether express,
# implied, statutory or otherwise. The term "warranty" used herein
# includes, but is not limited to, any warranty of the quality,
# performance, merchantability and fitness for a particular purpose of
# the program and the nonexistence of any infringement or violation of
# any right of any third party.
#
# Each user of the program will agree and understand, and be deemed to
# have agreed and understood, that there is no warranty whatsoever for
# the program and, accordingly, the entire risk arising from or
# otherwise connected with the program is assumed by the user.
#
# Therefore, neither ICOT, the copyright holder, or any other
# organization that participated in or was otherwise related to the
# development of the program and their respective officials, directors,
# officers and other employees shall be held liable for any and all
# damages, including, without limitation, general, special, incidental
# and consequential damages, arising out of or otherwise in connection
# with the use or inability to use the program or any product, material
# or result produced or otherwise obtained by using the program,
# regardless of whether they have been advised of, or otherwise had
# knowledge of, the possibility of such damages at any time during the
# project or thereafter. Each user will be deemed to have agreed to the
# foregoing by his or her commencement of use of the program. The term
# "use" as used herein includes, but is not limited to, the use,
# modification, copying and distribution of the program and the
# production of secondary products from the program.
#
# In the case where the program, whether in its original form or
# modified, was distributed or delivered to or received by a user from
# any person, organization or entity other than ICOT, unless it makes or
# grants independently of ICOT any specific warranty to the user in
# writing, such person, organization or entity, will also be exempted
# from and not be held liable to the user for any such damages as noted
# above as far as the program is concerned.
#
# ---------------COPYING.ipadic-----END----------------------------------
3. Lao Word Break Dictionary Data (laodict.txt)
# Copyright (c) 2013 International Business Machines Corporation
# and others. All Rights Reserved.
#
# Project: http://code.google.com/p/lao-dictionary/
# Dictionary: http://lao-dictionary.googlecode.com/git/Lao-Dictionary.txt
# License: http://lao-dictionary.googlecode.com/git/Lao-Dictionary-LICENSE.txt
# (copied below)
#
# This file is derived from the above dictionary, with slight
# modifications.
# ----------------------------------------------------------------------
# Copyright (C) 2013 Brian Eugene Wilson, Robert Martin Campbell.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification,
# are permitted provided that the following conditions are met:
#
#
# Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer. Redistributions in
# binary form must reproduce the above copyright notice, this list of
# conditions and the following disclaimer in the documentation and/or
# other materials provided with the distribution.
#
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
# COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
# OF THE POSSIBILITY OF SUCH DAMAGE.
# --------------------------------------------------------------------------
4. Burmese Word Break Dictionary Data (burmesedict.txt)
# Copyright (c) 2014 International Business Machines Corporation
# and others. All Rights Reserved.
#
# This list is part of a project hosted at:
# github.com/kanyawtech/myanmar-karen-word-lists
#
# --------------------------------------------------------------------------
# Copyright (c) 2013, LeRoy Benjamin Sharon
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met: Redistributions of source code must retain the above
# copyright notice, this list of conditions and the following
# disclaimer. Redistributions in binary form must reproduce the
# above copyright notice, this list of conditions and the following
# disclaimer in the documentation and/or other materials provided
# with the distribution.
#
# Neither the name Myanmar Karen Word Lists, nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
# CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
# INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
# TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
# THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
# --------------------------------------------------------------------------
5. Time Zone Database
ICU uses the public domain data and code derived from Time Zone
Database for its time zone support. The ownership of the TZ database
is explained in BCP 175: Procedure for Maintaining the Time Zone
Database section 7.
# 7. Database Ownership
#
# The TZ database itself is not an IETF Contribution or an IETF
# document. Rather it is a pre-existing and regularly updated work
# that is in the public domain, and is intended to remain in the
# public domain. Therefore, BCPs 78 [RFC5378] and 79 [RFC3979] do
# not apply to the TZ Database or contributions that individuals make
# to it. Should any claims be made and substantiated against the TZ
# Database, the organization that is providing the IANA
# Considerations defined in this RFC, under the memorandum of
# understanding with the IETF, currently ICANN, may act in accordance
# with all competent court orders. No ownership claims will be made
# by ICANN or the IETF Trust on the database or the code. Any person
# making a contribution to the database or code waives all rights to
# future claims in that contribution or in the TZ Database.

View file

@ -1,25 +1,11 @@
## The Unicode Standard, Unicode Character Database, Version 11.0.0
## The Unicode Standard, Unicode Character Database, Version 12.1.0
### Unicode Character Database
```
UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE
Unicode Data Files include all data files under the directories
http://www.unicode.org/Public/, http://www.unicode.org/reports/,
http://www.unicode.org/cldr/data/,
http://source.icu-project.org/repos/icu/, and
http://www.unicode.org/utility/trac/browser/.
Unicode Data Files do not include PDF online code charts under the
directory http://www.unicode.org/Public/.
Software includes any source code published in the Unicode Standard
or under the directories
http://www.unicode.org/Public/, http://www.unicode.org/reports/,
http://www.unicode.org/cldr/data/,
http://source.icu-project.org/repos/icu/, and
http://www.unicode.org/utility/trac/browser/.
See Terms of Use for definitions of Unicode Inc.'s
Data Files and Software.
NOTICE TO USER: Carefully read the following legal agreement.
BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S
@ -31,8 +17,8 @@ THE DATA FILES OR SOFTWARE.
COPYRIGHT AND PERMISSION NOTICE
Copyright © 1991-2018 Unicode, Inc. All rights reserved.
Distributed under the Terms of Use in http://www.unicode.org/copyright.html.
Copyright © 1991-2019 Unicode, Inc. All rights reserved.
Distributed under the Terms of Use in https://www.unicode.org/copyright.html.
Permission is hereby granted, free of charge, to any person obtaining
a copy of the Unicode data files and any associated documentation
@ -62,5 +48,3 @@ Except as contained in this notice, the name of a copyright holder
shall not be used in advertising or otherwise to promote the sale,
use or other dealings in these Data Files or Software without prior
written authorization of the copyright holder.
```