8174270: Consolidate ICU sources in one location

Reviewed-by: srl, joehw
This commit is contained in:
Naoto Sato 2020-01-13 08:05:59 -08:00
parent 91bb1d3700
commit 1b24cf800f
57 changed files with 251 additions and 178 deletions

View file

@ -1,511 +0,0 @@
/*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 2003-2004, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
//
// CHANGELOG
// 2005-05-19 Edward Wang
// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/Punycode.java
// - move from package com.ibm.icu.text to package sun.net.idn
// - use ParseException instead of StringPrepParseException
// 2007-08-14 Martin Buchholz
// - remove redundant casts
//
package sun.net.idn;
import java.text.ParseException;
import sun.text.normalizer.UCharacter;
import sun.text.normalizer.UTF16;
/**
* Ported code from ICU punycode.c
* @author ram
*/
/* Package Private class */
public final class Punycode {
/* Punycode parameters for Bootstring */
// Number of distinct digit values; basicToDigit maps letters to 0..25 and 0-9 to 26..35.
private static final int BASE = 36;
// Lower and upper clamp applied to the per-digit threshold t in encode() and decode().
private static final int TMIN = 1;
private static final int TMAX = 26;
// Constants consumed by adaptBias() when recomputing the bias.
private static final int SKEW = 38;
private static final int DAMP = 700;
// Starting values of bias and n in both encode() and decode().
private static final int INITIAL_BIAS = 72;
private static final int INITIAL_N = 0x80;
/* "Basic" Unicode/ASCII code points */
private static final int HYPHEN = 0x2d;
// Character that separates the literal basic code points from the encoded part.
private static final int DELIMITER = HYPHEN;
private static final int ZERO = 0x30;
private static final int NINE = 0x39;
private static final int SMALL_A = 0x61;
private static final int SMALL_Z = 0x7a;
private static final int CAPITAL_A = 0x41;
private static final int CAPITAL_Z = 0x5a;
// TODO: eliminate the 256 limitation
// Maximum number of code points encode() accepts; also used to size working buffers.
private static final int MAX_CP_COUNT = 256;
// NOTE(review): the two *_MAGIC constants are not referenced in the visible part of this class.
private static final int UINT_MAGIC = 0x80000000;
private static final long ULONG_MAGIC = 0x8000000000000000L;
/**
 * Bias adaptation step of the Bootstring algorithm.
 *
 * @param delta     the delta that was just encoded or decoded
 * @param length    number of code points handled so far (must be non-zero)
 * @param firstTime true for the very first adaptation, which damps by DAMP
 *                  instead of halving
 * @return the new bias
 */
private static int adaptBias(int delta, int length, boolean firstTime) {
    final int range = BASE - TMIN;
    final int limit = (range * TMAX) / 2;

    int scaled = delta / (firstTime ? DAMP : 2);
    scaled += scaled / length;

    int steps = 0;
    while (scaled > limit) {
        scaled /= range;
        steps += BASE;
    }
    return steps + ((range + 1) * scaled) / (scaled + SKEW);
}
/**
* basicToDigit[] maps a basic code point b (used as the array index) to its
* numeric digit value in the range 0 to BASE-1, or to -1 if b does not
* represent a digit. Both 'A'-'Z' and 'a'-'z' map to 0..25; '0'-'9' map to
* 26..35. The table has 256 entries.
*/
static final int[] basicToDigit= new int[]{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
26, 27, 28, 29, 30, 31, 32, 33, 34, 35, -1, -1, -1, -1, -1, -1,
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
/**
 * Forces the case of an ASCII letter; any other character is returned as is.
 *
 * @param b         the character to map
 * @param uppercase true to map a-z to A-Z, false to map A-Z to a-z
 * @return the case-mapped character
 */
private static char asciiCaseMap(char b, boolean uppercase) {
    final int caseOffset = SMALL_A - CAPITAL_A;
    if (uppercase && SMALL_A <= b && b <= SMALL_Z) {
        return (char) (b - caseOffset);
    }
    if (!uppercase && CAPITAL_A <= b && b <= CAPITAL_Z) {
        return (char) (b + caseOffset);
    }
    return b;
}
/**
 * Returns the basic code point that represents the given digit value.
 * Values 0..25 become a letter (A..Z when {@code uppercase} is true,
 * a..z otherwise); values 26..35 become the ASCII digits 0..9.
 *
 * @param digit     digit value, expected to be in the range 0 to BASE-1
 * @param uppercase selects the uppercase letter form for values below 26
 * @return the basic code point for {@code digit}
 */
private static char digitToBasic(int digit, boolean uppercase) {
    if (digit >= 26) {
        /* 26..35 map to ASCII 0..9 */
        return (char) ((ZERO - 26) + digit);
    }
    /* 0..25 map to ASCII a..z or A..Z */
    final int firstLetter = uppercase ? CAPITAL_A : SMALL_A;
    return (char) (firstLetter + digit);
}
/**
 * Converts Unicode to Punycode.
 * The input string must not contain single, unpaired surrogates.
 * The output consists of ASCII code points only.
 *
 * <p>The output is accumulated in a growable buffer, so it is never
 * truncated. (A fixed 256-char output array previously caused digits to be
 * dropped silently once it was full.) The limit of MAX_CP_COUNT input code
 * points is unchanged.
 *
 * @param src       the Unicode text to encode
 * @param caseFlags optional per-UTF-16-unit flags, indexed like {@code src};
 *                  a true flag upper-cases a basic code point, or marks an
 *                  extended code point so that its final digit is written in
 *                  uppercase; may be null
 * @return a new StringBuffer holding the Punycode form of {@code src}
 * @throws ParseException if {@code src} contains an unpaired surrogate
 * @throws IndexOutOfBoundsException if {@code src} has more than
 *         MAX_CP_COUNT code points
 */
public static StringBuffer encode(StringBuffer src, boolean[] caseFlags) throws ParseException{
    int[] cpBuffer = new int[MAX_CP_COUNT];
    int n, delta, handledCPCount, basicLength, bias, j, m, q, k, t, srcCPCount;
    char c, c2;
    int srcLength = src.length();
    StringBuffer dest = new StringBuffer();
    /*
     * Handle the basic code points and
     * convert extended ones to UTF-32 in cpBuffer (caseFlag in sign bit):
     */
    srcCPCount=0;
    for(j=0; j<srcLength; ++j) {
        if(srcCPCount==MAX_CP_COUNT) {
            /* too many input code points */
            throw new IndexOutOfBoundsException();
        }
        c=src.charAt(j);
        if(isBasic(c)) {
            /* basic code points are copied literally; 0 marks them in cpBuffer */
            cpBuffer[srcCPCount++]=0;
            dest.append(caseFlags!=null ? asciiCaseMap(c, caseFlags[j]) : c);
        } else {
            n=((caseFlags!=null && caseFlags[j])? 1 : 0)<<31;
            if(!UTF16.isSurrogate(c)) {
                n|=c;
            } else if(UTF16.isLeadSurrogate(c) && (j+1)<srcLength && UTF16.isTrailSurrogate(c2=src.charAt(j+1))) {
                ++j;
                n|=UCharacter.getCodePoint(c, c2);
            } else {
                /* error: unmatched surrogate */
                throw new ParseException("Illegal char found", -1);
            }
            cpBuffer[srcCPCount++]=n;
        }
    }
    /* Finish the basic string - if it is not empty - with a delimiter. */
    basicLength=dest.length();
    if(basicLength>0) {
        /* the cast matters: append(int) would append the decimal text "45" */
        dest.append((char)DELIMITER);
    }
    /*
     * handledCPCount is the number of code points that have been handled
     * basicLength is the number of basic code points
     */
    /* Initialize the state: */
    n=INITIAL_N;
    delta=0;
    bias=INITIAL_BIAS;
    /* Main encoding loop: */
    for(handledCPCount=basicLength; handledCPCount<srcCPCount; /* no op */) {
        /*
         * All non-basic code points < n have been handled already.
         * Find the next larger one:
         */
        for(m=0x7fffffff, j=0; j<srcCPCount; ++j) {
            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
            if(n<=q && q<m) {
                m=q;
            }
        }
        /*
         * Increase delta enough to advance the decoder's
         * <n,i> state to <m,0>, but guard against overflow:
         */
        if(m-n>(0x7fffffff-MAX_CP_COUNT-delta)/(handledCPCount+1)) {
            throw new RuntimeException("Internal program error");
        }
        delta+=(m-n)*(handledCPCount+1);
        n=m;
        /* Encode a sequence of same code points n */
        for(j=0; j<srcCPCount; ++j) {
            q=cpBuffer[j]&0x7fffffff; /* remove case flag from the sign bit */
            if(q<n) {
                ++delta;
            } else if(q==n) {
                /* Represent delta as a generalized variable-length integer: */
                for(q=delta, k=BASE; /* no condition */; k+=BASE) {
                    /* threshold clamped to TMIN..TMAX */
                    t=k-bias;
                    if(t<TMIN) {
                        t=TMIN;
                    } else if(k>=(bias+TMAX)) {
                        t=TMAX;
                    }
                    if(q<t) {
                        break;
                    }
                    dest.append(digitToBasic(t+(q-t)%(BASE-t), false));
                    q=(q-t)/(BASE-t);
                }
                /* the final digit carries the case flag stored in the sign bit */
                dest.append(digitToBasic(q, (cpBuffer[j]<0)));
                bias=adaptBias(delta, handledCPCount+1,(handledCPCount==basicLength));
                delta=0;
                ++handledCPCount;
            }
        }
        ++delta;
        ++n;
    }
    return dest;
}
/** Returns true if {@code ch} lies below INITIAL_N, i.e. is a basic code point. */
private static boolean isBasic(int ch) {
    return INITIAL_N > ch;
}
/** Returns true if {@code ch} is one of the ASCII capital letters A..Z. */
private static boolean isBasicUpperCase(int ch) {
    return !(ch < CAPITAL_A || ch > CAPITAL_Z);
}
/** Returns true if {@code ch} is in the surrogate range U+D800..U+DFFF. */
private static boolean isSurrogate(int ch) {
    final int block = ch & 0xfffff800; /* drop the low 11 bits */
    return block == 0xd800;
}
/**
 * Converts Punycode to Unicode.
 * The result has at most as many code points as the Punycode string has
 * chars; since a supplementary code point takes two UTF-16 units, the
 * result may be longer than the input when measured in chars.
 *
 * <p>Changes relative to the previous revision of this method:
 * <ul>
 * <li>A character after the delimiter that lies outside the basicToDigit
 *     table is rejected with a ParseException. The former {@code (byte)}
 *     cast produced a negative array index for 0x80..0xFF and treated
 *     larger chars as the ASCII char sharing their low byte.</li>
 * <li>The output buffer is sized from the input length, and the capacity
 *     test accepts an exactly full buffer, so an insertion is no longer
 *     skipped while destLength still advances.</li>
 * </ul>
 *
 * @param src       the Punycode text to decode
 * @param caseFlags optional array that receives, per output UTF-16 unit,
 *                  whether the corresponding input char was an uppercase
 *                  letter; it is indexed by output position, so the caller
 *                  must size it for the decoded result; may be null
 * @return a new StringBuffer holding the decoded text
 * @throws ParseException if the input is not valid Punycode
 */
public static StringBuffer decode(StringBuffer src, boolean[] caseFlags)
        throws ParseException{
    int srcLength = src.length();
    StringBuffer result = new StringBuffer();
    int n, destLength, i, bias, basicLength, j, in, oldi, w, k, digit, t,
        destCPCount, firstSupplementaryIndex, cpLength;
    char b;
    /*
     * Each input char yields at most one code point, i.e. at most two
     * UTF-16 units, so 2*srcLength is always large enough.
     */
    int destCapacity = Math.max(MAX_CP_COUNT, 2*srcLength);
    char[] dest = new char[destCapacity];
    /*
     * Handle the basic code points:
     * Let basicLength be the number of input code points
     * before the last delimiter, or 0 if there is none,
     * then copy the first basicLength code points to the output.
     *
     * The two following loops iterate backward.
     */
    for(j=srcLength; j>0;) {
        if(src.charAt(--j)==DELIMITER) {
            break;
        }
    }
    destLength=basicLength=destCPCount=j;
    while(j>0) {
        b=src.charAt(--j);
        if(!isBasic(b)) {
            throw new ParseException("Illegal char found", -1);
        }
        if(j<destCapacity) {
            dest[j]= b;
            if(caseFlags!=null) {
                caseFlags[j]=isBasicUpperCase(b);
            }
        }
    }
    /* Initialize the state: */
    n=INITIAL_N;
    i=0;
    bias=INITIAL_BIAS;
    firstSupplementaryIndex=1000000000;
    /*
     * Main decoding loop:
     * Start just after the last delimiter if any
     * basic code points were copied; start at the beginning otherwise.
     */
    for(in=basicLength>0 ? basicLength+1 : 0; in<srcLength; /* no op */) {
        /*
         * in is the index of the next character to be consumed, and
         * destCPCount is the number of code points in the output array.
         *
         * Decode a generalized variable-length integer into delta,
         * which gets added to i. The overflow checking is easier
         * if we increase i as we go, then subtract off its starting
         * value at the end to obtain delta.
         */
        for(oldi=i, w=1, k=BASE; /* no condition */; k+=BASE) {
            if(in>=srcLength) {
                throw new ParseException("Illegal char found", -1);
            }
            /* chars outside the lookup table can never be digits */
            char digitChar=src.charAt(in++);
            digit=digitChar<basicToDigit.length ? basicToDigit[digitChar] : -1;
            if(digit<0) {
                throw new ParseException("Invalid char found", -1);
            }
            if(digit>(0x7fffffff-i)/w) {
                /* integer overflow */
                throw new ParseException("Illegal char found", -1);
            }
            i+=digit*w;
            t=k-bias;
            if(t<TMIN) {
                t=TMIN;
            } else if(k>=(bias+TMAX)) {
                t=TMAX;
            }
            if(digit<t) {
                break;
            }
            if(w>0x7fffffff/(BASE-t)) {
                /* integer overflow */
                throw new ParseException("Illegal char found", -1);
            }
            w*=BASE-t;
        }
        /*
         * Modification from sample code:
         * Increments destCPCount here,
         * where needed instead of in for() loop tail.
         */
        ++destCPCount;
        bias=adaptBias(i-oldi, destCPCount, (oldi==0));
        /*
         * i was supposed to wrap around from (incremented) destCPCount to 0,
         * incrementing n each time, so we'll fix that now:
         */
        if(i/destCPCount>(0x7fffffff-n)) {
            /* integer overflow */
            throw new ParseException("Illegal char found", -1);
        }
        n+=i/destCPCount;
        i%=destCPCount;
        /* not needed for Punycode: */
        /* if (decode_digit(n) <= BASE) return punycode_invalid_input; */
        if(n>0x10ffff || isSurrogate(n)) {
            /* Unicode code point overflow */
            throw new ParseException("Illegal char found", -1);
        }
        /* Insert n at position i of the output: */
        cpLength=UTF16.getCharCount(n);
        if((destLength+cpLength)<=destCapacity) {
            int codeUnitIndex;
            /*
             * Handle indexes when supplementary code points are present.
             *
             * In almost all cases, there will be only BMP code points before i
             * and even in the entire string.
             * This is handled with the same efficiency as with UTF-32.
             *
             * Only the rare cases with supplementary code points are handled
             * more slowly - but not too bad since this is an insertion anyway.
             */
            if(i<=firstSupplementaryIndex) {
                codeUnitIndex=i;
                if(cpLength>1) {
                    firstSupplementaryIndex=codeUnitIndex;
                } else {
                    ++firstSupplementaryIndex;
                }
            } else {
                codeUnitIndex=firstSupplementaryIndex;
                codeUnitIndex=UTF16.moveCodePointOffset(dest, 0, destLength, codeUnitIndex, i-codeUnitIndex);
            }
            /* use the UChar index codeUnitIndex instead of the code point index i */
            if(codeUnitIndex<destLength) {
                /* shift the tail right to make room for the inserted code point */
                System.arraycopy(dest, codeUnitIndex,
                                 dest, codeUnitIndex+cpLength,
                                 (destLength-codeUnitIndex));
                if(caseFlags!=null) {
                    System.arraycopy(caseFlags, codeUnitIndex,
                                     caseFlags, codeUnitIndex+cpLength,
                                     destLength-codeUnitIndex);
                }
            }
            if(cpLength==1) {
                /* BMP, insert one code unit */
                dest[codeUnitIndex]=(char)n;
            } else {
                /* supplementary character, insert two code units */
                dest[codeUnitIndex]=UTF16.getLeadSurrogate(n);
                dest[codeUnitIndex+1]=UTF16.getTrailSurrogate(n);
            }
            if(caseFlags!=null) {
                /* Case of last character determines uppercase flag: */
                caseFlags[codeUnitIndex]=isBasicUpperCase(src.charAt(in-1));
                if(cpLength==2) {
                    caseFlags[codeUnitIndex+1]=false;
                }
            }
        }
        destLength+=cpLength;
        ++i;
    }
    result.append(dest, 0, destLength);
    return result;
}
}

View file

@ -1,486 +0,0 @@
/*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
/*
*******************************************************************************
* Copyright (C) 2003-2004, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
//
// CHANGELOG
// 2005-05-19 Edward Wang
// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/text/StringPrep.java
// - move from package com.ibm.icu.text to package sun.net.idn
// - use ParseException instead of StringPrepParseException
// - change 'Normalizer.getUnicodeVersion()' to 'NormalizerImpl.getUnicodeVersion()'
// - remove all @deprecated tag to make compiler happy
// 2007-08-14 Martin Buchholz
// - remove redundant casts
//
package sun.net.idn;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.text.ParseException;
import sun.text.Normalizer;
import sun.text.normalizer.CharTrie;
import sun.text.normalizer.Trie;
import sun.text.normalizer.VersionInfo;
import sun.text.normalizer.UCharacter;
import sun.text.normalizer.UCharacterIterator;
import sun.text.normalizer.UTF16;
import sun.net.idn.UCharacterDirection;
import sun.net.idn.StringPrepDataReader;
/**
* StringPrep API implements the StringPrep framework as described by
* <a href="http://www.ietf.org/rfc/rfc3454.txt">RFC 3454</a>.
* StringPrep prepares Unicode strings for use in network protocols.
* Profiles of StringPrep are sets of rules and data according to which the
* Unicode strings are prepared. Each profile contains tables which describe
* how a code point should be treated. The tables are broadly classified into
* <ul>
* <li> Unassigned Table: Contains code points that are unassigned
* in the Unicode Version supported by StringPrep. Currently
* RFC 3454 supports Unicode 3.2. </li>
* <li> Prohibited Table: Contains code points that are prohibited from
* the output of the StringPrep processing function. </li>
* <li> Mapping Table: Contains code points that are deleted from the output or case mapped. </li>
* </ul>
*
* The procedure for preparing Unicode strings:
* <ol>
* <li> Map: For each character in the input, check if it has a mapping
* and, if so, replace it with its mapping. </li>
* <li> Normalize: Possibly normalize the result of step 1 using Unicode
* normalization. </li>
* <li> Prohibit: Check for any characters that are not allowed in the
* output. If any are found, return an error.</li>
* <li> Check bidi: Possibly check for right-to-left characters, and if
* any are found, make sure that the whole string satisfies the
* requirements for bidirectional strings. If the string does not
* satisfy the requirements for bidirectional strings, return an
* error. </li>
* </ol>
* @author Ram Viswanadha
* @draft ICU 2.8
*/
public final class StringPrep {
/**
* Option to prohibit processing of unassigned code points in the input
*
* @see #prepare
* @draft ICU 2.8
*/
public static final int DEFAULT = 0x0000;
/**
* Option to allow processing of unassigned code points in the input
*
* @see #prepare
* @draft ICU 2.8
*/
public static final int ALLOW_UNASSIGNED = 0x0001;
/* Code point types, stored in Values.type by getValues() */
private static final int UNASSIGNED = 0x0000;
private static final int MAP = 0x0001;
private static final int PROHIBITED = 0x0002;
private static final int DELETE = 0x0003;
/* Type set for a trie word of 0: the code point is copied unchanged */
private static final int TYPE_LIMIT = 0x0004;
/* Bit masks tested against indexes[OPTIONS] */
private static final int NORMALIZATION_ON = 0x0001;
private static final int CHECK_BIDI_ON = 0x0002;
/* Trie words at or above this value encode a type as (word - TYPE_THRESHOLD) */
private static final int TYPE_THRESHOLD = 0xFFF0;
/* A trie word whose value (word >> 2) equals this marks a deleted code point */
private static final int MAX_INDEX_VALUE = 0x3FBF; /*16139*/
private static final int MAX_INDEX_TOP_LENGTH = 0x0003;
/* indexes[] value names */
private static final int INDEX_TRIE_SIZE = 0; /* number of bytes in normalization trie */
private static final int INDEX_MAPPING_DATA_SIZE = 1; /* The array that contains the mapping */
private static final int NORM_CORRECTNS_LAST_UNI_VERSION = 2; /* The index of Unicode version of last entry in NormalizationCorrections.txt */
private static final int ONE_UCHAR_MAPPING_INDEX_START = 3; /* The starting index of 1 UChar mapping index in the mapping data array */
private static final int TWO_UCHARS_MAPPING_INDEX_START = 4; /* The starting index of 2 UChars mapping index in the mapping data array */
private static final int THREE_UCHARS_MAPPING_INDEX_START = 5; /* The starting index of 3 UChars mapping index in the mapping data array */
private static final int FOUR_UCHARS_MAPPING_INDEX_START = 6; /* Mappings at or beyond this index store their length in the mapping data itself (see map()) */
private static final int OPTIONS = 7; /* Bit set of options to turn on in the profile */
private static final int INDEX_TOP = 16; /* changing this requires a new formatVersion */
/**
* Default buffer size of datafile; used for the BufferedInputStream
* created in the constructor.
*/
private static final int DATA_BUFFER_SIZE = 25000;
/*
 * Holder for the profile's CharTrie; it also serves as the
 * Trie.DataManipulate callback handed to that trie on construction.
 */
private static final class StringPrepTrieImpl implements Trie.DataManipulate{
    /* Assigned by the StringPrep constructor once the trie bytes are loaded. */
    private CharTrie sprepTrie;

    /**
     * Maps the trie data of a lead surrogate to the index array offset
     * for that lead surrogate. In this format the data value is the offset.
     *
     * @param value data value for a surrogate from the trie, including
     *              the folding offset
     * @return {@code value}, unchanged
     */
    public int getFoldingOffset(int value){
        final int foldingOffset = value;
        return foldingOffset;
    }
}
// Holder of the CharTrie used to look up the 16-bit data word of a code point
private StringPrepTrieImpl sprepTrieImpl;
// Indexes read from the data file (see the "indexes[] value names" constants)
private int[] indexes;
// mapping data read from the data file; map() copies replacement text from it
private char[] mappingData;
// format version of the data file, as reported by the StringPrepDataReader
private byte[] formatVersion;
// the version of Unicode supported by the data file
private VersionInfo sprepUniVer;
// the Unicode version of last entry in the
// NormalizationCorrections.txt file if normalization
// is turned on
private VersionInfo normCorrVer;
// Option to turn on Normalization; set from the NORMALIZATION_ON bit of indexes[OPTIONS]
private boolean doNFKC;
// Option to turn on checking for BiDi rules; set from the CHECK_BIDI_ON bit of indexes[OPTIONS]
private boolean checkBiDi;
/** Looks up the 16-bit trie word stored for code point {@code ch}. */
private char getCodePointValue(int ch){
    final CharTrie trie = sprepTrieImpl.sprepTrie;
    return trie.getCodePointValue(ch);
}
/**
 * Unpacks a version stored as four bytes of an int, most significant
 * byte first: major, minor, milli, micro.
 */
private static VersionInfo getVersionInfo(int comp){
    return VersionInfo.getInstance(
            (comp >> 24) & 0xFF,
            (comp >> 16) & 0xFF,
            (comp >> 8) & 0xFF,
            comp & 0xFF);
}
/**
 * Builds a VersionInfo from a four-element byte array
 * (major, minor, milli, micro).
 *
 * @return the version, or null if {@code version} does not have exactly
 *         four elements
 */
private static VersionInfo getVersionInfo(byte[] version){
    if(version.length == 4){
        final int major = version[0];
        final int minor = version[1];
        final int milli = version[2];
        final int micro = version[3];
        return VersionInfo.getInstance(major, minor, milli, micro);
    }
    return null;
}
/**
 * Creates a StringPrep object after reading the input stream.
 * The object does not hold a reference to the input stream. The buffered
 * wrapper created here is closed before the constructor returns, whether
 * it completes normally or throws. (It was previously left open when
 * reading or validation failed.)
 *
 * @param inputStream The stream for reading the StringPrep profile binary
 * @throws IOException if the profile cannot be read, or if normalization
 *         is enabled and the normalization data's Unicode version is older
 *         than the versions required by the profile
 * @draft ICU 2.8
 */
public StringPrep(InputStream inputStream) throws IOException{
    BufferedInputStream b = new BufferedInputStream(inputStream,DATA_BUFFER_SIZE);
    try {
        StringPrepDataReader reader = new StringPrepDataReader(b);
        // read the indexes
        indexes = reader.readIndexes(INDEX_TOP);
        byte[] sprepBytes = new byte[indexes[INDEX_TRIE_SIZE]];
        // indexes[INDEX_MAPPING_DATA_SIZE] stores the size of mappingData in bytes
        mappingData = new char[indexes[INDEX_MAPPING_DATA_SIZE]/2];
        // load the rest of the data and initialize the data members
        reader.read(sprepBytes,mappingData);
        sprepTrieImpl = new StringPrepTrieImpl();
        sprepTrieImpl.sprepTrie = new CharTrie( new ByteArrayInputStream(sprepBytes),sprepTrieImpl );
        // get the data format version
        formatVersion = reader.getDataFormatVersion();
        // get the options
        doNFKC = ((indexes[OPTIONS] & NORMALIZATION_ON) > 0);
        checkBiDi = ((indexes[OPTIONS] & CHECK_BIDI_ON) > 0);
        sprepUniVer = getVersionInfo(reader.getUnicodeVersion());
        normCorrVer = getVersionInfo(indexes[NORM_CORRECTNS_LAST_UNI_VERSION]);
        VersionInfo normUniVer = UCharacter.getUnicodeVersion();
        // Reject the profile when normalization is on and the normalization
        // data's Unicode version is older than both the profile's Unicode
        // version and the NormalizationCorrections.txt version.
        if(normUniVer.compareTo(sprepUniVer) < 0 &&
           normUniVer.compareTo(normCorrVer) < 0 &&
           doNFKC
           ){
            throw new IOException("Normalization Correction version not supported");
        }
    } finally {
        // also runs on failure so the stream is not leaked
        b.close();
    }
}
/* Decoded form of one trie word; filled in by getValues(). */
private static final class Values{
    /* true when value is an index into mappingData rather than a delta */
    boolean isIndex;
    /* mapping index or delta, depending on isIndex */
    int value;
    /* one of UNASSIGNED, MAP, PROHIBITED, DELETE, TYPE_LIMIT; -1 after reset() */
    int type;

    /* Puts every field back to its "nothing decoded" state. */
    public void reset(){
        type = -1;
        value = 0;
        isIndex = false;
    }
}
/**
 * Decodes a 16-bit trie word into its type and, for mappings, its value.
 *
 * @param trieWord the word read from the trie for a code point
 * @param values   receives the decoded fields; it is reset first
 */
private static final void getValues(char trieWord,Values values){
    values.reset();
    if(trieWord == 0){
        /*
         * Initial value stored in the mapping table:
         * report TYPE_LIMIT so that the source code point
         * is copied to the destination.
         */
        values.type = TYPE_LIMIT;
        return;
    }
    if(trieWord >= TYPE_THRESHOLD){
        values.type = (trieWord - TYPE_THRESHOLD);
        return;
    }
    if((trieWord>>2) == MAX_INDEX_VALUE){
        /* deletion marker: isIndex and value keep their reset state (false, 0) */
        values.type = DELETE;
        return;
    }
    values.type = MAP;
    /* bit 1 tells whether the remaining bits are an index or a signed delta */
    values.isIndex = (trieWord & 0x02) != 0;
    if(values.isIndex){
        values.value = trieWord >> 2;
    }else{
        /* sign-extend the 16-bit word before dropping the two flag bits */
        values.value = ((short)trieWord) >> 2;
    }
}
/**
 * Mapping step: replaces, deletes or copies every code point of the input
 * according to the profile's trie data.
 *
 * @param iter    iterator over the source text
 * @param options bit set of options; ALLOW_UNASSIGNED lets unassigned code
 *                points through
 * @return a new StringBuffer holding the mapped text
 * @throws ParseException if an unassigned code point is found and
 *         ALLOW_UNASSIGNED is not set
 */
private StringBuffer map( UCharacterIterator iter, int options)
        throws ParseException {
    final Values val = new Values();
    final StringBuffer dest = new StringBuffer();
    final boolean allowUnassigned = ((options & ALLOW_UNASSIGNED)>0);

    for(int ch = iter.nextCodePoint();
            ch != UCharacterIterator.DONE;
            ch = iter.nextCodePoint()){
        getValues(getCodePointValue(ch), val);

        if(val.type == UNASSIGNED && !allowUnassigned){
            throw new ParseException("An unassigned code point was found in the input " +
                                     iter.getText(), iter.getIndex());
        }
        if(val.type == DELETE){
            // just consume the code point
            continue;
        }
        if(val.type == MAP){
            if(val.isIndex){
                int index = val.value;
                final int length;
                if(index >= indexes[ONE_UCHAR_MAPPING_INDEX_START] &&
                        index < indexes[TWO_UCHARS_MAPPING_INDEX_START]){
                    length = 1;
                }else if(index >= indexes[TWO_UCHARS_MAPPING_INDEX_START] &&
                        index < indexes[THREE_UCHARS_MAPPING_INDEX_START]){
                    length = 2;
                }else if(index >= indexes[THREE_UCHARS_MAPPING_INDEX_START] &&
                        index < indexes[FOUR_UCHARS_MAPPING_INDEX_START]){
                    length = 3;
                }else{
                    // the length is stored in front of the mapping itself
                    length = mappingData[index++];
                }
                /* copy mapping to destination */
                dest.append(mappingData,index,length);
                continue;
            }
            // delta mapping: shift the code point itself
            ch -= val.value;
        }
        // copy the (possibly shifted) source code point into the destination
        UTF16.append(dest,ch);
    }
    return dest;
}
/**
 * Normalizes the text to NFKC using the Unicode 3.2 option.
 *
 * @param src the text to normalize
 * @return a new StringBuffer holding the normalized text
 */
private StringBuffer normalize(StringBuffer src){
    /*
     * Option UNORM_BEFORE_PRI_29:
     *
     * IDNA, as interpreted by IETF members (unicode mailing list, 2004H1),
     * demands strict Unicode 3.2 normalization, including the buggy
     * composition that predates the fix for Public Review Issue #29.
     * As a consequence some valid but nonsensical text is either
     * corrupted or rejected, depending on the text.
     * See http://www.unicode.org/review/resolved-pri.html#pri29
     * See unorm.cpp and cnormtst.c
     */
    final String text = src.toString();
    return new StringBuffer(
            Normalizer.normalize(text,
                                 java.text.Normalizer.Form.NFKC,
                                 Normalizer.UNICODE_3_2));
}
/*
boolean isLabelSeparator(int ch){
int result = getCodePointValue(ch);
if( (result & 0x07) == LABEL_SEPARATOR){
return true;
}
return false;
}
*/
/*
1) Map -- For each character in the input, check if it has a mapping
and, if so, replace it with its mapping.
2) Normalize -- Possibly normalize the result of step 1 using Unicode
normalization.
3) Prohibit -- Check for any characters that are not allowed in the
output. If any are found, return an error.
4) Check bidi -- Possibly check for right-to-left characters, and if
any are found, make sure that the whole string satisfies the
requirements for bidirectional strings. If the string does not
satisfy the requirements for bidirectional strings, return an
error.
[Unicode3.2] defines several bidirectional categories; each character
has one bidirectional category assigned to it. For the purposes of
the requirements below, an "RandALCat character" is a character that
has Unicode bidirectional categories "R" or "AL"; an "LCat character"
is a character that has Unicode bidirectional category "L". Note
that there are many characters which fall in neither of the above
definitions; Latin digits (<U+0030> through <U+0039>) are examples of
this because they have bidirectional category "EN".
In any profile that specifies bidirectional character handling, all
three of the following requirements MUST be met:
1) The characters in section 5.8 MUST be prohibited.
2) If a string contains any RandALCat character, the string MUST NOT
contain any LCat character.
3) If a string contains any RandALCat character, a RandALCat
character MUST be the first character of the string, and a
RandALCat character MUST be the last character of the string.
*/
/**
 * Prepares the input for use in applications with the given profile.
 * The text is mapped, normalized with NFKC when the profile enables it,
 * checked for prohibited code points and, when the profile enables it,
 * checked against the BiDi rules, in the order defined by RFC 3454.
 *
 * @param src A UCharacterIterator object containing the source string
 * @param options A bit set of options:
 *
 * - StringPrep.DEFAULT Prohibit processing of unassigned code points in the input
 *
 * - StringPrep.ALLOW_UNASSIGNED Treat the unassigned code points in the input
 * as normal Unicode code points.
 *
 * @return StringBuffer A StringBuffer containing the output
 * @throws ParseException if an unassigned code point is found and not
 *         allowed, a prohibited code point is found, or the BiDi check fails
 * @draft ICU 2.8
 */
public StringBuffer prepare(UCharacterIterator src, int options)
        throws ParseException{
    // map, then optionally normalize
    final StringBuffer mapped = map(src,options);
    final StringBuffer normOut = doNFKC ? normalize(mapped) : mapped;

    final UCharacterIterator iter = UCharacterIterator.getInstance(normOut);
    final Values val = new Values();
    final int unset = UCharacterDirection.CHAR_DIRECTION_COUNT;
    int firstDir = unset;
    int lastDir = unset;
    int lastRtlPos = -1;
    int lastLtrPos = -1;
    boolean sawRtl = false;
    boolean sawLtr = false;

    for(int ch = iter.nextCodePoint();
            ch != UCharacterIterator.DONE;
            ch = iter.nextCodePoint()){
        getValues(getCodePointValue(ch), val);
        if(val.type == PROHIBITED){
            throw new ParseException("A prohibited code point was found in the input" +
                                     iter.getText(), val.value);
        }
        lastDir = UCharacter.getDirection(ch);
        if(firstDir == unset){
            firstDir = lastDir;
        }
        if(lastDir == UCharacterDirection.LEFT_TO_RIGHT){
            sawLtr = true;
            lastLtrPos = iter.getIndex()-1;
        }
        if(lastDir == UCharacterDirection.RIGHT_TO_LEFT
                || lastDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC){
            sawRtl = true;
            lastRtlPos = iter.getIndex()-1;
        }
    }

    if(checkBiDi){
        final boolean firstIsRtl = firstDir == UCharacterDirection.RIGHT_TO_LEFT
                || firstDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC;
        final boolean lastIsRtl = lastDir == UCharacterDirection.RIGHT_TO_LEFT
                || lastDir == UCharacterDirection.RIGHT_TO_LEFT_ARABIC;
        // rule 2: no mixing of L with R/AL characters
        final boolean mixedDirections = sawLtr && sawRtl;
        // rule 3: text containing R/AL must start and end with R/AL
        final boolean badEnds = sawRtl && !(firstIsRtl && lastIsRtl);
        if(mixedDirections || badEnds){
            throw new ParseException("The input does not conform to the rules for BiDi code points." +
                                     iter.getText(),
                                     Math.max(lastRtlPos, lastLtrPos));
        }
    }
    return normOut;
}
}

View file

@ -1,127 +0,0 @@
/*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
/*
******************************************************************************
* Copyright (C) 2003, International Business Machines Corporation and *
* others. All Rights Reserved. *
******************************************************************************
*
* Created on May 2, 2003
*
* To change the template for this generated file go to
* Window>Preferences>Java>Code Generation>Code and Comments
*/
// CHANGELOG
// 2005-05-19 Edward Wang
// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/impl/StringPrepDataReader.java
// - move from package com.ibm.icu.impl to package sun.net.idn
//
package sun.net.idn;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import sun.text.normalizer.ICUBinary;
/**
 * Reader for the binary data file identified by the "SPRP" data format.
 * The constructor validates the file header through
 * {@code ICUBinary.readHeader}; the remaining methods pull the integer
 * indexes, the trie bytes and the mapping characters from the same stream.
 *
 * @author ram
 */
final class StringPrepDataReader implements ICUBinary.Authenticate {

    /**
     * Validates the header of the data file and prepares the stream for the
     * subsequent {@code readIndexes} / {@code read} calls.
     *
     * @param inputStream stream positioned at the start of the data file header
     * @exception IOException thrown if the header cannot be read or fails
     *            authentication
     * @draft 2.1
     */
    public StringPrepDataReader(InputStream inputStream) throws IOException {
        // readHeader calls back into isDataVersionAcceptable(), which only
        // touches static state, so passing 'this' here is safe.
        unicodeVersion = ICUBinary.readHeader(inputStream, DATA_FORMAT_ID, this);
        dataInputStream = new DataInputStream(inputStream);
    }

    /**
     * Fills both buffers completely from the stream.
     *
     * @param idnaBytes    receives the bytes that make up the trie; exactly
     *                     {@code idnaBytes.length} bytes are consumed
     * @param mappingTable receives the extra (mapping) data; exactly
     *                     {@code mappingTable.length} chars are consumed
     * @exception IOException if the stream ends early (as
     *            {@code EOFException}) or another I/O error occurs
     */
    public void read(byte[] idnaBytes,
                     char[] mappingTable)
            throws IOException {

        // Read the bytes that make up the idnaTrie. readFully is required:
        // a plain read(byte[]) may return after a partial read and would
        // leave the tail of idnaBytes silently zeroed.
        dataInputStream.readFully(idnaBytes);

        // Read the extra data
        for (int i = 0; i < mappingTable.length; i++) {
            mappingTable[i] = dataInputStream.readChar();
        }
    }

    /**
     * Returns the data format version this reader was written for.
     *
     * @return the shared version array (not a copy; callers must not modify it)
     */
    public byte[] getDataFormatVersion() {
        return DATA_FORMAT_VERSION;
    }

    /**
     * Decides whether a data file with the given format version can be read.
     * Elements 0, 2 and 3 must match {@code DATA_FORMAT_VERSION};
     * element 1 is not compared.
     *
     * @param version four-element format version taken from the file header
     * @return {@code true} if the version is acceptable
     */
    public boolean isDataVersionAcceptable(byte[] version) {
        return version[0] == DATA_FORMAT_VERSION[0]
               && version[2] == DATA_FORMAT_VERSION[2]
               && version[3] == DATA_FORMAT_VERSION[3];
    }

    /**
     * Reads {@code length} consecutive ints from the stream.
     *
     * @param length number of index values to read
     * @return a new array holding the values in stream order
     * @exception IOException if the stream ends early or another I/O error occurs
     */
    public int[] readIndexes(int length) throws IOException {
        int[] indexes = new int[length];
        // Read the indexes
        for (int i = 0; i < length; i++) {
            indexes[i] = dataInputStream.readInt();
        }
        return indexes;
    }

    /**
     * Returns the value obtained from {@code ICUBinary.readHeader} when the
     * header was validated.
     *
     * @return the version bytes reported by the header reader
     */
    public byte[] getUnicodeVersion() {
        return unicodeVersion;
    }

    // private data members -------------------------------------------------

    /**
     * ICU data file input stream
     */
    private final DataInputStream dataInputStream;

    /** Result of the header validation performed in the constructor. */
    private final byte[] unicodeVersion;

    /**
     * File format version that this class understands.
     * No guarantees are made if an older version is used
     * see store.c of gennorm for more information and values
     */
    /* dataFormat="SPRP" 0x53, 0x50, 0x52, 0x50 */
    private static final byte[] DATA_FORMAT_ID = {(byte)0x53, (byte)0x50,
                                                  (byte)0x52, (byte)0x50};
    private static final byte[] DATA_FORMAT_VERSION = {(byte)0x3, (byte)0x2,
                                                       (byte)0x5, (byte)0x2};
}

View file

@ -1,112 +0,0 @@
/*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
/**
*******************************************************************************
* Copyright (C) 1996-2004, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
// CHANGELOG
// 2005-05-19 Edward Wang
// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/lang/UCharacterDirection.java
// - move from package com.ibm.icu.lang to package sun.net.idn
//
package sun.net.idn;
/**
 * Human-readable names for the character direction constants declared in
 * {@link UCharacterEnums.ECharacterDirection}.
 * <p>
 * This class is a static holder: it is final and cannot be instantiated.
 * </p>
 * @author Syn Wee Quek
 * @stable ICU 2.1
 */
@SuppressWarnings("deprecation")
final class UCharacterDirection implements UCharacterEnums.ECharacterDirection {

    // private constructor =========================================

    ///CLOVER:OFF
    /**
     * Never called; exists only to forbid instantiation.
     */
    private UCharacterDirection()
    {
    }
    ///CLOVER:ON

    /**
     * Maps a direction constant to its display name.
     * @param dir one of the {@code int} direction constants of
     *            {@link UCharacterEnums.ECharacterDirection}
     * @return the name of the direction, or {@code "Unassigned"} when
     *         {@code dir} matches none of the known constants
     * @stable ICU 2.1
     */
    public static String toString(int dir) {
        final String name;
        switch (dir) {
            case LEFT_TO_RIGHT:
                name = "Left-to-Right";
                break;
            case RIGHT_TO_LEFT:
                name = "Right-to-Left";
                break;
            case EUROPEAN_NUMBER:
                name = "European Number";
                break;
            case EUROPEAN_NUMBER_SEPARATOR:
                name = "European Number Separator";
                break;
            case EUROPEAN_NUMBER_TERMINATOR:
                name = "European Number Terminator";
                break;
            case ARABIC_NUMBER:
                name = "Arabic Number";
                break;
            case COMMON_NUMBER_SEPARATOR:
                name = "Common Number Separator";
                break;
            case BLOCK_SEPARATOR:
                // the block separator is reported under its paragraph name
                name = "Paragraph Separator";
                break;
            case SEGMENT_SEPARATOR:
                name = "Segment Separator";
                break;
            case WHITE_SPACE_NEUTRAL:
                name = "Whitespace";
                break;
            case OTHER_NEUTRAL:
                name = "Other Neutrals";
                break;
            case LEFT_TO_RIGHT_EMBEDDING:
                name = "Left-to-Right Embedding";
                break;
            case LEFT_TO_RIGHT_OVERRIDE:
                name = "Left-to-Right Override";
                break;
            case RIGHT_TO_LEFT_ARABIC:
                name = "Right-to-Left Arabic";
                break;
            case RIGHT_TO_LEFT_EMBEDDING:
                name = "Right-to-Left Embedding";
                break;
            case RIGHT_TO_LEFT_OVERRIDE:
                name = "Right-to-Left Override";
                break;
            case POP_DIRECTIONAL_FORMAT:
                name = "Pop Directional Format";
                break;
            case DIR_NON_SPACING_MARK:
                name = "Non-Spacing Mark";
                break;
            case BOUNDARY_NEUTRAL:
                name = "Boundary Neutral";
                break;
            default:
                name = "Unassigned";
                break;
        }
        return name;
    }
}

View file

@ -1,587 +0,0 @@
/*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
/**
*******************************************************************************
* Copyright (C) 2004, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
// CHANGELOG
// 2005-05-19 Edward Wang
// - copy this file from icu4jsrc_3_2/src/com/ibm/icu/lang/UCharacterEnums.java
// - move from package com.ibm.icu.lang to package sun.net.idn
//
// 2011-09-06 Kurchi Subhra Hazra
// - Added @Deprecated tag to the following:
// - class UCharacterEnums
// - interfaces ECharacterCategory, ECharacterDirection
// - fields INITIAL_QUOTE_PUNCTUATION, FINAL_QUOTE_PUNCTUATION,
// DIRECTIONALITY_LEFT_TO_RIGHT, DIRECTIONALITY_RIGHT_TO_LEFT,
// DIRECTIONALITY_EUROPEAN_NUMBER, DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR
// DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR, DIRECTIONALITY_ARABIC_NUMBER,
// DIRECTIONALITY_COMMON_NUMBER_SEPARATOR, DIRECTIONALITY_PARAGRAPH_SEPARATOR,
// DIRECTIONALITY_SEGMENT_SEPARATOR, DIRECTIONALITY_WHITESPACE,
// DIRECTIONALITY_OTHER_NEUTRALS, DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING,
// DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE, DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC,
// DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING, DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE,
// DIRECTIONALITY_POP_DIRECTIONAL_FORMAT, DIRECTIONALITY_NON_SPACING_MARK,
// DIRECTIONALITY_BOUNDARY_NEUTRAL, DIRECTIONALITY_UNDEFINED
//
package sun.net.idn;
/**
 * Namespace for the groups of {@code int}/{@code byte} constants
 * ('enumerated types') used by UCharacter.
 * @draft ICU 3.0
 * @deprecated This is a draft API and might change in a future release of ICU.
 */
@Deprecated
class UCharacterEnums {

    /** Pure namespace: instances are never created. */
    private UCharacterEnums() {}

    /**
     * Character category constants. The names follow those in
     * <code>java.lang.Character</code>, <b>but the values differ</b>.
     * @see UCharacterCategory
     * @draft ICU 3.0
     * @deprecated This is a draft API and might change in a future release of ICU.
     */
    @Deprecated
    public interface ECharacterCategory {
        /**
         * Category of unassigned characters.
         * @stable ICU 2.1
         */
        int UNASSIGNED = 0;

        /**
         * Category Cn: not assigned (no characters in [UnicodeData.txt]
         * have this property). Same value as {@link #UNASSIGNED}.
         * @stable ICU 2.6
         */
        int GENERAL_OTHER_TYPES = 0;

        /**
         * Category Lu.
         * @stable ICU 2.1
         */
        int UPPERCASE_LETTER = 1;

        /**
         * Category Ll.
         * @stable ICU 2.1
         */
        int LOWERCASE_LETTER = 2;

        /**
         * Category Lt.
         * @stable ICU 2.1
         */
        int TITLECASE_LETTER = 3;

        /**
         * Category Lm.
         * @stable ICU 2.1
         */
        int MODIFIER_LETTER = 4;

        /**
         * Category Lo.
         * @stable ICU 2.1
         */
        int OTHER_LETTER = 5;

        /**
         * Category Mn.
         * @stable ICU 2.1
         */
        int NON_SPACING_MARK = 6;

        /**
         * Category Me.
         * @stable ICU 2.1
         */
        int ENCLOSING_MARK = 7;

        /**
         * Category Mc.
         * @stable ICU 2.1
         */
        int COMBINING_SPACING_MARK = 8;

        /**
         * Category Nd.
         * @stable ICU 2.1
         */
        int DECIMAL_DIGIT_NUMBER = 9;

        /**
         * Category Nl.
         * @stable ICU 2.1
         */
        int LETTER_NUMBER = 10;

        /**
         * Category No.
         * @stable ICU 2.1
         */
        int OTHER_NUMBER = 11;

        /**
         * Category Zs.
         * @stable ICU 2.1
         */
        int SPACE_SEPARATOR = 12;

        /**
         * Category Zl.
         * @stable ICU 2.1
         */
        int LINE_SEPARATOR = 13;

        /**
         * Category Zp.
         * @stable ICU 2.1
         */
        int PARAGRAPH_SEPARATOR = 14;

        /**
         * Category Cc.
         * @stable ICU 2.1
         */
        int CONTROL = 15;

        /**
         * Category Cf.
         * @stable ICU 2.1
         */
        int FORMAT = 16;

        /**
         * Category Co.
         * @stable ICU 2.1
         */
        int PRIVATE_USE = 17;

        /**
         * Category Cs.
         * @stable ICU 2.1
         */
        int SURROGATE = 18;

        /**
         * Category Pd.
         * @stable ICU 2.1
         */
        int DASH_PUNCTUATION = 19;

        /**
         * Category Ps.
         * @stable ICU 2.1
         */
        int START_PUNCTUATION = 20;

        /**
         * Category Pe.
         * @stable ICU 2.1
         */
        int END_PUNCTUATION = 21;

        /**
         * Category Pc.
         * @stable ICU 2.1
         */
        int CONNECTOR_PUNCTUATION = 22;

        /**
         * Category Po.
         * @stable ICU 2.1
         */
        int OTHER_PUNCTUATION = 23;

        /**
         * Category Sm.
         * @stable ICU 2.1
         */
        int MATH_SYMBOL = 24;

        /**
         * Category Sc.
         * @stable ICU 2.1
         */
        int CURRENCY_SYMBOL = 25;

        /**
         * Category Sk.
         * @stable ICU 2.1
         */
        int MODIFIER_SYMBOL = 26;

        /**
         * Category So.
         * @stable ICU 2.1
         */
        int OTHER_SYMBOL = 27;

        /**
         * Category Pi.
         * @see #INITIAL_QUOTE_PUNCTUATION
         * @stable ICU 2.1
         */
        int INITIAL_PUNCTUATION = 28;

        /**
         * Category Pi, under the name java.lang.Character uses for it.
         * @see #INITIAL_PUNCTUATION
         * @draft ICU 2.8
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        int INITIAL_QUOTE_PUNCTUATION = 28;

        /**
         * Category Pf.
         * @see #FINAL_QUOTE_PUNCTUATION
         * @stable ICU 2.1
         */
        int FINAL_PUNCTUATION = 29;

        /**
         * Category Pf, under the name java.lang.Character uses for it.
         * @see #FINAL_PUNCTUATION
         * @draft ICU 2.8
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        int FINAL_QUOTE_PUNCTUATION = 29;

        /**
         * Number of character categories.
         * @stable ICU 2.1
         */
        int CHAR_CATEGORY_COUNT = 30;
    }

    /**
     * Character direction constants. Each direction is declared twice:
     * once as an {@code int} under its ICU name and once as a {@code byte}
     * under the JDK-style {@code DIRECTIONALITY_*} name. The JDK-style
     * names follow those in <code>java.lang.Character</code>,
     * <b>but the values differ</b>.
     * @see UCharacterDirection
     * @draft ICU 3.0
     * @deprecated This is a draft API and might change in a future release of ICU.
     */
    @Deprecated
    public interface ECharacterDirection {
        /**
         * Directional type L.
         * @stable ICU 2.1
         */
        int LEFT_TO_RIGHT = 0;

        /**
         * JDK-compatible synonym for {@link #LEFT_TO_RIGHT}.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_LEFT_TO_RIGHT = (byte) LEFT_TO_RIGHT;

        /**
         * Directional type R.
         * @stable ICU 2.1
         */
        int RIGHT_TO_LEFT = 1;

        /**
         * JDK-compatible synonym for {@link #RIGHT_TO_LEFT}.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_RIGHT_TO_LEFT = (byte) RIGHT_TO_LEFT;

        /**
         * Directional type EN.
         * @stable ICU 2.1
         */
        int EUROPEAN_NUMBER = 2;

        /**
         * JDK-compatible synonym for {@link #EUROPEAN_NUMBER}.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_EUROPEAN_NUMBER = (byte) EUROPEAN_NUMBER;

        /**
         * Directional type ES.
         * @stable ICU 2.1
         */
        int EUROPEAN_NUMBER_SEPARATOR = 3;

        /**
         * JDK-compatible synonym for {@link #EUROPEAN_NUMBER_SEPARATOR}.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_EUROPEAN_NUMBER_SEPARATOR = (byte) EUROPEAN_NUMBER_SEPARATOR;

        /**
         * Directional type ET.
         * @stable ICU 2.1
         */
        int EUROPEAN_NUMBER_TERMINATOR = 4;

        /**
         * JDK-compatible synonym for {@link #EUROPEAN_NUMBER_TERMINATOR}.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_EUROPEAN_NUMBER_TERMINATOR = (byte) EUROPEAN_NUMBER_TERMINATOR;

        /**
         * Directional type AN.
         * @stable ICU 2.1
         */
        int ARABIC_NUMBER = 5;

        /**
         * JDK-compatible synonym for {@link #ARABIC_NUMBER}.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_ARABIC_NUMBER = (byte) ARABIC_NUMBER;

        /**
         * Directional type CS.
         * @stable ICU 2.1
         */
        int COMMON_NUMBER_SEPARATOR = 6;

        /**
         * JDK-compatible synonym for {@link #COMMON_NUMBER_SEPARATOR}.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_COMMON_NUMBER_SEPARATOR = (byte) COMMON_NUMBER_SEPARATOR;

        /**
         * Directional type B.
         * @stable ICU 2.1
         */
        int BLOCK_SEPARATOR = 7;

        /**
         * JDK-compatible synonym for {@link #BLOCK_SEPARATOR}.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_PARAGRAPH_SEPARATOR = (byte) BLOCK_SEPARATOR;

        /**
         * Directional type S.
         * @stable ICU 2.1
         */
        int SEGMENT_SEPARATOR = 8;

        /**
         * JDK-compatible synonym for {@link #SEGMENT_SEPARATOR}.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_SEGMENT_SEPARATOR = (byte) SEGMENT_SEPARATOR;

        /**
         * Directional type WS.
         * @stable ICU 2.1
         */
        int WHITE_SPACE_NEUTRAL = 9;

        /**
         * JDK-compatible synonym for {@link #WHITE_SPACE_NEUTRAL}.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_WHITESPACE = (byte) WHITE_SPACE_NEUTRAL;

        /**
         * Directional type ON.
         * @stable ICU 2.1
         */
        int OTHER_NEUTRAL = 10;

        /**
         * JDK-compatible synonym for {@link #OTHER_NEUTRAL}.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_OTHER_NEUTRALS = (byte) OTHER_NEUTRAL;

        /**
         * Directional type LRE.
         * @stable ICU 2.1
         */
        int LEFT_TO_RIGHT_EMBEDDING = 11;

        /**
         * JDK-compatible synonym for {@link #LEFT_TO_RIGHT_EMBEDDING}.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_LEFT_TO_RIGHT_EMBEDDING = (byte) LEFT_TO_RIGHT_EMBEDDING;

        /**
         * Directional type LRO.
         * @stable ICU 2.1
         */
        int LEFT_TO_RIGHT_OVERRIDE = 12;

        /**
         * JDK-compatible synonym for {@link #LEFT_TO_RIGHT_OVERRIDE}.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_LEFT_TO_RIGHT_OVERRIDE = (byte) LEFT_TO_RIGHT_OVERRIDE;

        /**
         * Directional type AL.
         * @stable ICU 2.1
         */
        int RIGHT_TO_LEFT_ARABIC = 13;

        /**
         * JDK-compatible synonym for {@link #RIGHT_TO_LEFT_ARABIC}.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC = (byte) RIGHT_TO_LEFT_ARABIC;

        /**
         * Directional type RLE.
         * @stable ICU 2.1
         */
        int RIGHT_TO_LEFT_EMBEDDING = 14;

        /**
         * JDK-compatible synonym for {@link #RIGHT_TO_LEFT_EMBEDDING}.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_RIGHT_TO_LEFT_EMBEDDING = (byte) RIGHT_TO_LEFT_EMBEDDING;

        /**
         * Directional type RLO.
         * @stable ICU 2.1
         */
        int RIGHT_TO_LEFT_OVERRIDE = 15;

        /**
         * JDK-compatible synonym for {@link #RIGHT_TO_LEFT_OVERRIDE}.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_RIGHT_TO_LEFT_OVERRIDE = (byte) RIGHT_TO_LEFT_OVERRIDE;

        /**
         * Directional type PDF.
         * @stable ICU 2.1
         */
        int POP_DIRECTIONAL_FORMAT = 16;

        /**
         * JDK-compatible synonym for {@link #POP_DIRECTIONAL_FORMAT}.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_POP_DIRECTIONAL_FORMAT = (byte) POP_DIRECTIONAL_FORMAT;

        /**
         * Directional type NSM.
         * @stable ICU 2.1
         */
        int DIR_NON_SPACING_MARK = 17;

        /**
         * JDK-compatible synonym for {@link #DIR_NON_SPACING_MARK}.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_NON_SPACING_MARK = (byte) DIR_NON_SPACING_MARK;

        /**
         * Directional type BN.
         * @stable ICU 2.1
         */
        int BOUNDARY_NEUTRAL = 18;

        /**
         * JDK-compatible synonym for {@link #BOUNDARY_NEUTRAL}.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_BOUNDARY_NEUTRAL = (byte) BOUNDARY_NEUTRAL;

        /**
         * Number of directional types.
         * @stable ICU 2.1
         */
        int CHAR_DIRECTION_COUNT = 19;

        /**
         * Undefined bidirectional character type. Undefined <code>char</code>
         * values have undefined directionality in the Unicode specification.
         * @draft ICU 3.0
         * @deprecated This is a draft API and might change in a future release of ICU.
         */
        @Deprecated
        byte DIRECTIONALITY_UNDEFINED = -1;
    }
}

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2000, 2005, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2000, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -25,7 +25,7 @@
package sun.text;
import sun.text.normalizer.NormalizerBase;
import jdk.internal.icu.text.NormalizerBase;
public class CollatorUtilities {

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2001, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -25,8 +25,8 @@
package sun.text;
import sun.text.normalizer.NormalizerBase;
import sun.text.normalizer.NormalizerImpl;
import jdk.internal.icu.impl.NormalizerImpl;
import jdk.internal.icu.text.NormalizerBase;
public final class ComposedCharIter {
/**

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2005, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -25,8 +25,8 @@
package sun.text;
import sun.text.normalizer.NormalizerBase;
import sun.text.normalizer.UCharacter;
import jdk.internal.icu.lang.UCharacter;
import jdk.internal.icu.text.NormalizerBase;
/**
* This Normalizer is for Unicode 3.2 support for IDNA only.

File diff suppressed because it is too large Load diff

View file

@ -1,835 +0,0 @@
/*
* Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 2001-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
/* Written by Simon Montagu, Matitiahu Allouche
* (ported from C code written by Markus W. Scherer)
*/
package sun.text.bidi;
import java.text.Bidi;
import java.util.Arrays;
final class BidiLine {
/*
* General remarks about the functions in this file:
*
* These functions deal with the aspects of potentially mixed-directional
* text in a single paragraph or in a line of a single paragraph
* which has already been processed according to
* the Unicode 3.0 Bidi algorithm as defined in
* http://www.unicode.org/unicode/reports/tr9/ , version 13,
* also described in The Unicode Standard, Version 4.0.1 .
*
* This means that there is a Bidi object with a levels
* and a dirProps array.
* paraLevel and direction are also set.
* Only if the length of the text is zero, then levels==dirProps==NULL.
*
* The overall directionality of the paragraph
* or line is used to bypass the reordering steps if possible.
* Even purely RTL text does not need reordering there because
* the getLogical/VisualIndex() methods can compute the
* index on the fly in such a case.
*
* The implementation of the access to same-level-runs and of the reordering
* do attempt to provide better performance and less memory usage compared to
* a direct implementation of especially rule (L2) with an array of
* one (32-bit) integer per text character.
*
* Here, the levels array is scanned as soon as necessary, and a vector of
* same-level-runs is created. Reordering then is done on this vector.
* For each run of text positions that were resolved to the same level,
* only 8 bytes are stored: the first text position of the run and the visual
* position behind the run after reordering.
* One sign bit is used to hold the directionality of the run.
* This is inefficient if there are many very short runs. If the average run
* length is <2, then this uses more memory.
*
* In a further attempt to save memory, the levels array is never changed
* after all the resolution rules (Xn, Wn, Nn, In).
* Many methods have to consider the field trailingWSStart:
* if it is less than length, then there is an implicit trailing run
* at the paraLevel,
* which is not reflected in the levels array.
* This allows a line Bidi object to use the same levels array as
* its paragraph parent object.
*
* When a Bidi object is created for a line of a paragraph, then the
* paragraph's levels and dirProps arrays are reused by way of setting
* a pointer into them, not by copying. This again saves memory and forbids to
* change the now shared levels for (L1).
*/
/* handle trailing WS (L1) -------------------------------------------------- */
/**
 * Sets {@code bidiBase.trailingWSStart}, the start index of the trailing
 * run of WS in the line. This is another form of performing (L1): the
 * levels array itself is left untouched.
 *
 * To make subsequent operations easier, the run before the WS is merged
 * into the trailing run when it is at the paraLevel.
 *
 * This method is called only from setLine(), so paraLevel is
 * set correctly for the line even when contextual multiple paragraphs.
 *
 * A zero-length line yields {@code trailingWSStart == 0} instead of
 * indexing {@code dirProps[-1]}.
 *
 * @param bidiBase line object whose dirProps, levels, length and paraLevel
 *                 are read and whose trailingWSStart is written
 */
static void setTrailingWSStart(BidiBase bidiBase)
{
    byte[] dirProps = bidiBase.dirProps;
    byte[] levels = bidiBase.levels;
    int start = bidiBase.length;
    byte paraLevel = bidiBase.paraLevel;

    /* If the line is terminated by a block separator, all preceding WS etc...
       are already set to paragraph level.
       Setting trailingWSStart to pBidi->length will avoid changing the
       level of B chars from 0 to paraLevel in getLevels when
       orderParagraphsLTR==TRUE
    */
    /* start > 0 guards the empty line, like the two loops below */
    if (start > 0 && dirProps[start - 1] == BidiBase.B) {
        bidiBase.trailingWSStart = start;   /* currently == bidiBase.length */
        return;
    }

    /* go backwards across all WS, BN, explicit codes */
    while (start > 0 &&
            (BidiBase.DirPropFlag(dirProps[start - 1]) & BidiBase.MASK_WS) != 0) {
        --start;
    }

    /* if the WS run can be merged with the previous run then do so here */
    while (start > 0 && levels[start - 1] == paraLevel) {
        --start;
    }

    bidiBase.trailingWSStart = start;
}
/**
 * Initializes lineBidi so that it describes the part [start, limit) of the
 * text already analyzed in paraBidi.
 *
 * The text, dirProps and levels of that range are copied (System.arraycopy)
 * into arrays owned by lineBidi; paraBidi itself is only read. The line's
 * direction and trailingWSStart are then recomputed for the shorter range,
 * and paraLevel is adjusted to an even/odd value when the line turns out to
 * be purely LTR/RTL.
 *
 * @param paraBidi paragraph object the line is taken from
 * @param newBidi  value handed back to the caller; this method does not
 *                 otherwise use it
 * @param lineBidi object that receives the line state
 * @param start    index in paraBidi's text of the first character of the line
 * @param limit    index in paraBidi's text just behind the line
 * @return newBidi, unchanged
 */
static Bidi setLine(BidiBase paraBidi,
Bidi newBidi, BidiBase lineBidi,
int start, int limit) {
int length;
/* set the values in lineBidi from its paraBidi parent */
/* class members are already initialized to 0 */
// lineBidi.paraBidi = null; /* mark unfinished setLine */
// lineBidi.flags = 0;
// lineBidi.controlCount = 0;
/* all three length fields start out as the number of chars in the line */
length = lineBidi.length = lineBidi.originalLength =
lineBidi.resultLength = limit - start;
lineBidi.text = new char[length];
System.arraycopy(paraBidi.text, start, lineBidi.text, 0, length);
lineBidi.paraLevel = paraBidi.GetParaLevelAt(start);
lineBidi.paraCount = paraBidi.paraCount;
lineBidi.runs = new BidiRun[0];
lineBidi.reorderingMode = paraBidi.reorderingMode;
lineBidi.reorderingOptions = paraBidi.reorderingOptions;
/* only scan for Bidi control chars if the paragraph has any at all;
   the ones found in the line are subtracted from resultLength */
if (paraBidi.controlCount > 0) {
int j;
for (j = start; j < limit; j++) {
if (BidiBase.IsBidiControlChar(paraBidi.text[j])) {
lineBidi.controlCount++;
}
}
lineBidi.resultLength -= lineBidi.controlCount;
}
/* copy proper subset of DirProps */
lineBidi.getDirPropsMemory(length);
lineBidi.dirProps = lineBidi.dirPropsMemory;
System.arraycopy(paraBidi.dirProps, start, lineBidi.dirProps, 0,
length);
/* copy proper subset of Levels */
lineBidi.getLevelsMemory(length);
lineBidi.levels = lineBidi.levelsMemory;
System.arraycopy(paraBidi.levels, start, lineBidi.levels, 0,
length);
/* NOTE(review): -1 presumably marks the runs as not computed yet -
   confirm against the code that reads runCount */
lineBidi.runCount = -1;
if (paraBidi.direction != BidiBase.MIXED) {
/* the parent is already trivial */
lineBidi.direction = paraBidi.direction;
/*
* The parent's levels are all either
* implicitly or explicitly ==paraLevel;
* do the same here.
*/
/* translate the parent's trailingWSStart into line coordinates,
   clamped to the range 0..length */
if (paraBidi.trailingWSStart <= start) {
lineBidi.trailingWSStart = 0;
} else if (paraBidi.trailingWSStart < limit) {
lineBidi.trailingWSStart = paraBidi.trailingWSStart - start;
} else {
lineBidi.trailingWSStart = length;
}
} else {
byte[] levels = lineBidi.levels;
int i, trailingWSStart;
byte level;
setTrailingWSStart(lineBidi);
trailingWSStart = lineBidi.trailingWSStart;
/* recalculate lineBidiBase.direction */
if (trailingWSStart == 0) {
/* all levels are at paraLevel */
lineBidi.direction = (byte)(lineBidi.paraLevel & 1);
} else {
/* get the level of the first character */
/* (only the low bit is kept: 0 for an even level, 1 for an odd one) */
level = (byte)(levels[0] & 1);
/* if there is anything of a different level, then the line
is mixed */
if (trailingWSStart < length &&
(lineBidi.paraLevel & 1) != level) {
/* the trailing WS is at paraLevel, which differs from
levels[0] */
lineBidi.direction = BidiBase.MIXED;
} else {
/* see if levels[1..trailingWSStart-1] have the same
direction as levels[0] and paraLevel */
/* the loop has no condition: it ends through one of the two breaks */
for (i = 1; ; i++) {
if (i == trailingWSStart) {
/* the direction values match those in level */
lineBidi.direction = level;
break;
} else if ((levels[i] & 1) != level) {
lineBidi.direction = BidiBase.MIXED;
break;
}
}
}
}
/* for a line that is not mixed, normalize paraLevel and make the whole
   line an implicit run at paraLevel */
switch(lineBidi.direction) {
case Bidi.DIRECTION_LEFT_TO_RIGHT:
/* make sure paraLevel is even */
lineBidi.paraLevel = (byte)
((lineBidi.paraLevel + 1) & ~1);
/* all levels are implicitly at paraLevel (important for
getLevels()) */
lineBidi.trailingWSStart = 0;
break;
case Bidi.DIRECTION_RIGHT_TO_LEFT:
/* make sure paraLevel is odd */
lineBidi.paraLevel |= 1;
/* all levels are implicitly at paraLevel (important for
getLevels()) */
lineBidi.trailingWSStart = 0;
break;
default:
break;
}
}
lineBidi.paraBidi = paraBidi; /* mark successful setLine */
return newBidi;
}
/**
 * Returns the level of the character at charIndex.
 *
 * @param bidiBase  object whose direction, trailingWSStart and levels are read
 * @param charIndex index of the character
 * @return the stored level when the text is MIXED and the index lies before
 *         the trailing WS run; otherwise the paragraph level at that index
 */
static byte getLevelAt(BidiBase bidiBase, int charIndex)
{
    /* only a mixed-direction text carries per-character levels, and only
       for the part in front of the trailing WS run */
    boolean hasStoredLevel = bidiBase.direction == BidiBase.MIXED
            && charIndex < bidiBase.trailingWSStart;
    if (hasStoredLevel) {
        return bidiBase.levels[charIndex];
    }
    return bidiBase.GetParaLevelAt(charIndex);
}
/**
 * Returns the levels of all {@code bidiBase.length} characters.
 *
 * If there is an implicit trailing WS run (trailingWSStart != length), it
 * is first written into the levels array at paraLevel and trailingWSStart
 * is moved to length, so the array fully reflects the levels afterwards.
 *
 * @param bidiBase object whose levels are requested; its levels array and
 *                 trailingWSStart may be updated as described above
 * @return a copy truncated to length when the levels array is longer than
 *         length, otherwise the levels array itself (not a copy)
 */
static byte[] getLevels(BidiBase bidiBase)
{
    final int length = bidiBase.length;
    final int wsStart = bidiBase.trailingWSStart;

    if (wsStart != length) {
        /* materialize the implicit trailing run;
           bidiBase.paraLevel is ok even if contextual multiple paragraphs,
           since bidiBase is a line object */
        Arrays.fill(bidiBase.levels, wsStart, length, bidiBase.paraLevel);

        /* from now on the levels array reflects the WS run */
        bidiBase.trailingWSStart = length;
    }

    /* hand out exactly length entries */
    return (length < bidiBase.levels.length)
            ? Arrays.copyOf(bidiBase.levels, length)
            : bidiBase.levels;
}
/**
 * Builds a new BidiRun for the run stored at runIndex.
 *
 * The result starts at the stored run's start and ends at start plus the
 * difference between this run's limit and the preceding run's limit
 * (or plus this run's limit for the first run).
 *
 * @param bidiBase object whose runs array is read
 * @param runIndex index into {@code bidiBase.runs}
 * @return a new BidiRun(start, limit, level) for that run
 */
static BidiRun getVisualRun(BidiBase bidiBase, int runIndex) {
    final BidiRun[] runs = bidiBase.runs;
    final BidiRun run = runs[runIndex];

    /* the first run has no predecessor, so nothing is subtracted */
    final int precedingLimit = (runIndex > 0) ? runs[runIndex - 1].limit : 0;

    return new BidiRun(run.start,
                       run.start + run.limit - precedingLimit,
                       run.level);
}
/**
 * Trivial case: the whole text forms one run. Called by getRuns().
 *
 * Points {@code runs} at the {@code simpleRuns} array, sets runCount to 1
 * and stores a run covering [0, length) at the given level in slot 0.
 *
 * @param bidiBase object whose runs, runCount and simpleRuns[0] are set
 * @param level    level of the single run
 */
private static void getSingleRun(BidiBase bidiBase, byte level) {
    /* exactly one run, kept in the simpleRuns array */
    bidiBase.runCount = 1;
    bidiBase.runs = bidiBase.simpleRuns;

    /* the run spans the whole text */
    bidiBase.runs[0] = new BidiRun(0, bidiBase.length, level);
}
/* reorder the runs array (L2) ---------------------------------------------- */
/*
* Reorder the same-level runs in the runs array.
* Here, runCount>1 and maxLevel>=minLevel>=paraLevel.
* All the visualStart fields=logical start before reordering.
* The "odd" bits are not set yet.
*
* Reordering with this data structure lends itself to some handy shortcuts:
*
* Since each run is moved but not modified, and since at the initial maxLevel
* each sequence of same-level runs consists of only one run each, we
* don't need to do anything there and can predecrement maxLevel.
* In many simple cases, the reordering is thus done entirely in the
* index mapping.
* Also, reordering occurs only down to the lowest odd level that occurs,
* which is minLevel|1. However, if the lowest level itself is odd, then
* in the last reordering the sequence of the runs at this level or higher
* will be all runs, and we don't need the elaborate loop to search for them.
* This is covered by ++minLevel instead of minLevel|=1 followed
* by an extra reorder-all after the reorder-some loop.
* About a trailing WS run:
* Such a run would need special treatment because its level is not
* reflected in levels[] if this is not a paragraph object.
* Instead, all characters from trailingWSStart on are implicitly at
* paraLevel.
* However, for all maxLevel>paraLevel, this run will never be reordered
* and does not need to be taken into account. maxLevel==paraLevel is only reordered
* if minLevel==paraLevel is odd, which is done in the extra segment.
* This means that for the main reordering loop we don't need to consider
* this run and can --runCount. If it is later part of the all-runs
* reordering, then runCount is adjusted accordingly.
*/
private static void reorderLine(BidiBase bidiBase, byte minLevel, byte maxLevel) {
    /* a single level, or only the lowest odd level: nothing to reverse */
    if (maxLevel <= (minLevel | 1)) {
        return;
    }

    /*
     * Always step past the original minLevel: the partial reversals below
     * stop one level above it, and an odd original minLevel (now even) is
     * handled by the reverse-everything pass at the end.
     */
    ++minLevel;

    final BidiRun[] runs = bidiBase.runs;
    final byte[] levels = bidiBase.levels;

    /* the trailing WS run (at paraLevel) takes no part in the partial reversals */
    int scanLimit = bidiBase.runCount;
    if (bidiBase.trailingWSStart < bidiBase.length) {
        --scanLimit;
    }

    /* the starting maxLevel is skipped: its sequences are single runs */
    for (int level = maxLevel - 1; level >= minLevel; --level) {
        int pos = 0;
        while (pos < scanLimit) {
            /* skip runs below the current level */
            if (levels[runs[pos].start] < level) {
                ++pos;
                continue;
            }
            /* find the run just behind the sequence of runs at >= level */
            int seqLimit = pos + 1;
            while (seqLimit < scanLimit && levels[runs[seqLimit].start] >= level) {
                ++seqLimit;
            }
            /* reverse runs[pos .. seqLimit-1] in place */
            for (int lo = pos, hi = seqLimit - 1; lo < hi; ++lo, --hi) {
                final BidiRun swapped = runs[lo];
                runs[lo] = runs[hi];
                runs[hi] = swapped;
            }
            /* runs[seqLimit], if any, is known to be below level */
            pos = seqLimit + 1;
        }
    }

    /* original minLevel was odd: reverse all runs, trailing WS run included */
    if ((minLevel & 1) == 0) {
        int hi = scanLimit;
        /* without a WS run scanLimit is still a count; turn it into the last index */
        if (bidiBase.trailingWSStart == bidiBase.length) {
            --hi;
        }
        for (int lo = 0; lo < hi; ++lo, --hi) {
            final BidiRun swapped = runs[lo];
            runs[lo] = runs[hi];
            runs[hi] = swapped;
        }
    }
}
/* compute the runs array --------------------------------------------------- */
/**
 * Finds the run (by its index in visual order) that contains a logical
 * character index.
 *
 * @param bidiBase     the Bidi object whose {@code runs}/{@code runCount}
 *                     are already set
 * @param logicalIndex logical position to look up
 * @return index into {@code bidiBase.runs} of the containing run
 * @throws IllegalStateException if no run covers {@code logicalIndex}
 */
static int getRunFromLogicalIndex(BidiBase bidiBase, int logicalIndex) {
    final BidiRun[] runs = bidiBase.runs;
    final int total = bidiBase.runCount;
    /* visual limit of the previous run; limits are cumulative */
    int precedingLimit = 0;
    for (int runIndex = 0; runIndex < total; ++runIndex) {
        final BidiRun run = runs[runIndex];
        final int runLength = run.limit - precedingLimit;
        final int logicalFirst = run.start;
        if (logicalIndex >= logicalFirst && logicalIndex < (logicalFirst + runLength)) {
            return runIndex;
        }
        precedingLimit += runLength;
    }
    /* every logical index is expected to fall into some run */
    throw new IllegalStateException("Internal ICU error in getRunFromLogicalIndex");
}
/*
* Compute the runs array from the levels array.
* This method has no return value. When it computes the runs itself
* (runCount was negative on entry), it stores the number of runs found in
* bidiBase.runCount and leaves bidiBase.runs reordered into visual order.
* While the runs are being collected, BidiRun.limit temporarily holds the
* run length; before returning it is converted into a cumulative visual limit.
* Odd-level runs have visualStart on their visual right edge and
* they progress visually to the left.
* If insert points were recorded (bidiBase.insertPoints.size > 0),
* insertRemove of the affected runs will contain the
* sum of appropriate LRM/RLM_BEFORE/AFTER flags.
* If Bidi controls were counted (bidiBase.controlCount > 0),
* insertRemove will contain the
* negative number of BiDi control characters within this run.
*/
static void getRuns(BidiBase bidiBase) {
/*
* This method returns immediately if the runs are already set. This
* includes the case of length==0 (handled in setPara)..
*/
if (bidiBase.runCount >= 0) {
return;
}
if (bidiBase.direction != BidiBase.MIXED) {
/* simple, single-run case - this covers length==0 */
/* bidiBase.paraLevel is ok even for contextual multiple paragraphs */
getSingleRun(bidiBase, bidiBase.paraLevel);
} else /* BidiBase.MIXED, length>0 */ {
/* mixed directionality */
int length = bidiBase.length, limit;
byte[] levels = bidiBase.levels;
int i, runCount;
byte level = -1; /* initialize with no valid level */
/*
* If there are WS characters at the end of the line
* and the run preceding them has a level different from
* paraLevel, then they will form their own run at paraLevel (L1).
* Count them separately.
* We need some special treatment for this in order to not
* modify the levels array which a line Bidi object shares
* with its paragraph parent and its other line siblings.
* In other words, for the trailing WS, it may be
* levels[]!=paraLevel but we have to treat it like it were so.
*/
limit = bidiBase.trailingWSStart;
/* count the runs, there is at least one non-WS run, and limit>0 */
runCount = 0;
for (i = 0; i < limit; ++i) {
/* increment runCount at the start of each run */
if (levels[i] != level) {
++runCount;
level = levels[i];
}
}
/*
* We don't need to see if the last run can be merged with a trailing
* WS run because setTrailingWSStart() would have done that.
*/
if (runCount == 1 && limit == length) {
/* There is only one non-WS run and no trailing WS-run. */
getSingleRun(bidiBase, levels[0]);
} else /* runCount>1 || limit<length */ {
/* allocate and set the runs */
BidiRun[] runs;
int runIndex, start;
/* start min above and max below every level that can occur */
byte minLevel = BidiBase.MAX_EXPLICIT_LEVEL + 1;
byte maxLevel=0;
/* now, count a (non-mergeable) WS run */
if (limit < length) {
++runCount;
}
/* runCount > 1 */
bidiBase.getRunsMemory(runCount);
runs = bidiBase.runsMemory;
/* set the runs */
/* FOOD FOR THOUGHT: this could be optimized, e.g.:
* 464->444, 484->444, 575->555, 595->555
* However, that would take longer. Check also how it would
* interact with BiDi control removal and inserting Marks.
*/
runIndex = 0;
/* search for the run limits and initialize visualLimit values with the run lengths */
i = 0;
do {
/* prepare this run */
start = i;
level = levels[i];
if (level < minLevel) {
minLevel = level;
}
if (level > maxLevel) {
maxLevel = level;
}
/* look for the run limit */
while (++i < limit && levels[i] == level) {}
/* i is another run limit */
/* the second argument is the run LENGTH for now, not a limit */
runs[runIndex] = new BidiRun(start, i - start, level);
++runIndex;
} while (i < limit);
/* here runIndex equals the number of non-WS runs */
if (limit < length) {
/* there is a separate WS run */
runs[runIndex] = new BidiRun(limit, length - limit, bidiBase.paraLevel);
/* For the trailing WS run, bidiBase.paraLevel is ok even
if contextual multiple paragraphs. */
if (bidiBase.paraLevel < minLevel) {
minLevel = bidiBase.paraLevel;
}
}
/* set the object fields */
bidiBase.runs = runs;
bidiBase.runCount = runCount;
/* reorderLine() reads runs/runCount from bidiBase, so they must be set first */
reorderLine(bidiBase, minLevel, maxLevel);
/* now add the direction flags and adjust the visualLimit's to be just that */
/* this loop will also handle the trailing WS run */
limit = 0;
for (i = 0; i < runCount; ++i) {
runs[i].level = levels[runs[i].start];
/* running sum: each stored length becomes a cumulative visual limit */
limit = (runs[i].limit += limit);
}
/* Set the embedding level for the trailing WS run. */
/* For a RTL paragraph, it will be the *first* run in visual order. */
/* For the trailing WS run, bidiBase.paraLevel is ok even if
contextual multiple paragraphs. */
/* runIndex < runCount holds exactly when a WS run was added above */
if (runIndex < runCount) {
int trailingRun = ((bidiBase.paraLevel & 1) != 0)? 0 : runIndex;
runs[trailingRun].level = bidiBase.paraLevel;
}
}
}
/* handle insert LRM/RLM BEFORE/AFTER run */
if (bidiBase.insertPoints.size > 0) {
BidiBase.Point point;
int runIndex, ip;
for (ip = 0; ip < bidiBase.insertPoints.size; ip++) {
point = bidiBase.insertPoints.points[ip];
/* OR the point's flag into the run that contains its logical position */
runIndex = getRunFromLogicalIndex(bidiBase, point.pos);
bidiBase.runs[runIndex].insertRemove |= point.flag;
}
}
/* handle remove BiDi control characters */
if (bidiBase.controlCount > 0) {
int runIndex, ic;
char c;
for (ic = 0; ic < bidiBase.length; ic++) {
c = bidiBase.text[ic];
if (BidiBase.IsBidiControlChar(c)) {
/* one decrement per control character found in the run */
runIndex = getRunFromLogicalIndex(bidiBase, ic);
bidiBase.runs[runIndex].insertRemove--;
}
}
}
}
/**
 * Validates a levels array, reports its smallest and largest level and
 * creates an identity index map of the same length.
 *
 * @param levels    the embedding levels to inspect
 * @param pMinLevel one-element output array; receives the lowest level
 * @param pMaxLevel one-element output array; receives the highest level
 * @return a new array with {@code map[i] == i}, or {@code null} when
 *         {@code levels} is {@code null} or empty or contains a value
 *         outside {@code 0..MAX_EXPLICIT_LEVEL+1} (the output arrays are
 *         left untouched in that case)
 */
static int[] prepareReorder(byte[] levels, byte[] pMinLevel, byte[] pMaxLevel)
{
    if (levels == null || levels.length <= 0) {
        return null;
    }

    /* seed the extremes so that any valid level replaces them */
    byte lowest = BidiBase.MAX_EXPLICIT_LEVEL + 1;
    byte highest = 0;
    for (byte current : levels) {
        if (current < 0 || current > (BidiBase.MAX_EXPLICIT_LEVEL + 1)) {
            return null;
        }
        if (current < lowest) {
            lowest = current;
        }
        if (current > highest) {
            highest = current;
        }
    }
    pMinLevel[0] = lowest;
    pMaxLevel[0] = highest;

    /* identity map: every position initially maps to itself */
    final int[] indexMap = new int[levels.length];
    for (int i = 0; i < indexMap.length; ++i) {
        indexMap[i] = i;
    }
    return indexMap;
}
/**
 * Computes an index map for a plain array of embedding levels by
 * repeatedly reversing every maximal stretch of positions whose level is at
 * least the level currently being processed, from the highest level down
 * to the lowest odd level.
 *
 * @param levels the embedding levels, one per position
 * @return the reordered index map, or {@code null} when
 *         {@code prepareReorder} rejects {@code levels}
 */
static int[] reorderVisual(byte[] levels)
{
    final byte[] minHolder = new byte[1];
    final byte[] maxHolder = new byte[1];
    final int[] indexMap = prepareReorder(levels, minHolder, maxHolder);
    if (indexMap == null) {
        return null;
    }

    byte lowest = minHolder[0];
    byte highest = maxHolder[0];

    /* a single even level needs no reordering at all */
    if (lowest == highest && (lowest & 1) == 0) {
        return indexMap;
    }

    /* reversals are only needed down to the lowest odd level */
    lowest |= 1;

    final int count = levels.length;
    do {
        int pos = 0;
        while (pos < count) {
            /* skip positions below the level being processed */
            if (levels[pos] < highest) {
                ++pos;
                continue;
            }
            /* find the index just behind the stretch at >= highest */
            int seqLimit = pos + 1;
            while (seqLimit < count && levels[seqLimit] >= highest) {
                ++seqLimit;
            }
            /*
             * Reverse the map entries of the stretch.  The levels
             * themselves stay where they are: the stretches examined at
             * lower levels always contain this one completely.
             */
            for (int lo = pos, hi = seqLimit - 1; lo < hi; ++lo, --hi) {
                final int swapped = indexMap[lo];
                indexMap[lo] = indexMap[hi];
                indexMap[hi] = swapped;
            }
            /* position seqLimit, if it exists, is below highest */
            pos = seqLimit + 1;
        }
    } while (--highest >= lowest);

    return indexMap;
}
/*
* Build the visual-to-logical index map of a Bidi object from its runs[].
* The map is first filled for the plain text, then adjusted in place:
* - if insert points were recorded, entries are shifted towards the end and
*   MAP_NOWHERE is written at the positions of the inserted marks;
* - otherwise, if Bidi controls were counted, entries are shifted towards
*   the start and the entries of control characters are dropped.
* The returned array always has bidiBase.resultLength entries.
*/
static int[] getVisualMap(BidiBase bidiBase)
{
/* fill a visual-to-logical index map using the runs[] */
BidiRun[] runs = bidiBase.runs;
int logicalStart, visualStart, visualLimit;
/* work in an array big enough for both the source and the result length */
int allocLength = bidiBase.length > bidiBase.resultLength ? bidiBase.length
: bidiBase.resultLength;
int[] indexMap = new int[allocLength];
visualStart = 0;
int idx = 0;
for (int j = 0; j < bidiBase.runCount; ++j) {
logicalStart = runs[j].start;
visualLimit = runs[j].limit;
if (runs[j].isEvenRun()) {
do { /* LTR */
indexMap[idx++] = logicalStart++;
} while (++visualStart < visualLimit);
} else {
logicalStart += visualLimit - visualStart; /* logicalLimit */
do { /* RTL */
indexMap[idx++] = --logicalStart;
} while (++visualStart < visualLimit);
}
/* visualStart==visualLimit; */
}
if (bidiBase.insertPoints.size > 0) {
int markFound = 0, runCount = bidiBase.runCount;
int insertRemove, i, j, k;
runs = bidiBase.runs;
/* count all inserted marks */
for (i = 0; i < runCount; i++) {
insertRemove = runs[i].insertRemove;
if ((insertRemove & (BidiBase.LRM_BEFORE|BidiBase.RLM_BEFORE)) > 0) {
markFound++;
}
if ((insertRemove & (BidiBase.LRM_AFTER|BidiBase.RLM_AFTER)) > 0) {
markFound++;
}
}
/* move back indexes by number of preceding marks */
/* walk the runs from last to first, filling the map from its end (k),
and stop as soon as no mark is left to account for */
k = bidiBase.resultLength;
for (i = runCount - 1; i >= 0 && markFound > 0; i--) {
insertRemove = runs[i].insertRemove;
if ((insertRemove & (BidiBase.LRM_AFTER|BidiBase.RLM_AFTER)) > 0) {
indexMap[--k] = BidiBase.MAP_NOWHERE;
markFound--;
}
visualStart = i > 0 ? runs[i-1].limit : 0;
for (j = runs[i].limit - 1; j >= visualStart && markFound > 0; j--) {
indexMap[--k] = indexMap[j];
}
if ((insertRemove & (BidiBase.LRM_BEFORE|BidiBase.RLM_BEFORE)) > 0) {
indexMap[--k] = BidiBase.MAP_NOWHERE;
markFound--;
}
}
}
else if (bidiBase.controlCount > 0) {
int runCount = bidiBase.runCount, logicalEnd;
int insertRemove, length, i, j, k, m;
char uchar;
boolean evenRun;
runs = bidiBase.runs;
visualStart = 0;
/* move forward indexes by number of preceding controls */
/* k is the next write position in the compacted map */
k = 0;
for (i = 0; i < runCount; i++, visualStart += length) {
length = runs[i].limit - visualStart;
insertRemove = runs[i].insertRemove;
/* if no control found yet, nothing to do in this run */
if ((insertRemove == 0) && (k == visualStart)) {
k += length;
continue;
}
/* if no control in this run */
if (insertRemove == 0) {
/* earlier runs lost entries: copy this run's entries down unchanged */
visualLimit = runs[i].limit;
for (j = visualStart; j < visualLimit; j++) {
indexMap[k++] = indexMap[j];
}
continue;
}
/* this run contains controls: regenerate its entries, skipping them */
logicalStart = runs[i].start;
evenRun = runs[i].isEvenRun();
logicalEnd = logicalStart + length - 1;
for (j = 0; j < length; j++) {
/* m is the logical index at visual offset j within the run */
m = evenRun ? logicalStart + j : logicalEnd - j;
uchar = bidiBase.text[m];
if (!BidiBase.IsBidiControlChar(uchar)) {
indexMap[k++] = m;
}
}
}
}
if (allocLength == bidiBase.resultLength) {
return indexMap;
}
/* the work array was sized for the longer source text: trim to resultLength */
int[] newMap = new int[bidiBase.resultLength];
System.arraycopy(indexMap, 0, newMap, 0, bidiBase.resultLength);
return newMap;
}
}

View file

@ -1,124 +0,0 @@
/*
* Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
/* Written by Simon Montagu, Matitiahu Allouche
* (ported from C code written by Markus W. Scherer)
*/
package sun.text.bidi;
/**
 * Describes one "run": a sequence of characters that all share the same
 * embedding level. The Bidi algorithm splits a piece of text into such
 * runs.
 *
 * <p>Only the essential properties of the run are stored here; the
 * characters themselves are not copied.
 *
 * <p>A {@code limit} is always the position directly behind the last
 * character, i.e. last position plus one.
 *
 * <p>The class and all of its members are package-private, so code outside
 * this package can neither create nor modify instances.
 */
class BidiRun {

    /** First logical position of the run. */
    int start;
    /** One past the last visual position of the run (see the constructor notes). */
    int limit;
    /**
     * If positive: flags for inserting LRM/RLM before/after the run.
     * If negative: count of Bidi controls within the run.
     */
    int insertRemove;
    /** Embedding level of the run. */
    byte level;

    /*
     * Creates a run with start 0, limit 0 and level 0.
     *
     * The meaning of start and limit depends on where the instance lives:
     *
     * As an element of the runs array of a Bidi object,
     *  - start is the first logical position of the run in the source text,
     *  - limit is one after the last visual position of the run.
     *
     * As an object handed out by getLogicalRun() or getVisualRun(),
     *  - start is the first logical position of the run in the source text,
     *  - limit is one after the last logical position of the run.
     */
    BidiRun()
    {
        /* all fields keep their zero defaults */
    }

    /*
     * Creates a run from explicit values; insertRemove starts out as 0.
     */
    BidiRun(int start, int limit, byte embeddingLevel)
    {
        this.level = embeddingLevel;
        this.limit = limit;
        this.start = start;
    }

    /*
     * Overwrites every field of this run with the values of another run.
     */
    void copyFrom(BidiRun run)
    {
        this.insertRemove = run.insertRemove;
        this.level = run.level;
        this.limit = run.limit;
        this.start = run.start;
    }

    /**
     * Returns the embedding level of this run.
     */
    byte getEmbeddingLevel()
    {
        return this.level;
    }

    /**
     * Tells whether this is a left-to-right run.
     * @return true if the lowest bit of the embedding level is clear,
     *         i.e. the level is even.
     */
    boolean isEvenRun()
    {
        return (this.level & 1) != 1;
    }
}

View file

@ -1,452 +0,0 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 2001-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
/* Written by Simon Montagu, Matitiahu Allouche
* (ported from C code written by Markus W. Scherer)
*/
package sun.text.bidi;
import sun.text.normalizer.UCharacter;
import sun.text.normalizer.UTF16;
final class BidiWriter {
/** Bidi control code points */
/* U+200E, appended by writeReordered() where an LRM_BEFORE/LRM_AFTER flag is set */
static final char LRM_CHAR = 0x200e;
/* U+200F, appended by writeReordered() where an RLM_BEFORE/RLM_AFTER flag is set */
static final char RLM_CHAR = 0x200f;
/* bit set with one bit for each of the two right-to-left direction values;
   writeReordered() tests it against BidiBase.DirPropFlag(...) results */
static final int MASK_R_AL = (1 << UCharacter.RIGHT_TO_LEFT |
1 << UCharacter.RIGHT_TO_LEFT_ARABIC);
/*
 * Tells whether a character type value is one of the three mark types
 * (NON_SPACING_MARK, COMBINING_SPACING_MARK, ENCLOSING_MARK).
 * The check is done with bit sets: bit "type" against the three mark bits.
 */
private static boolean IsCombining(int type) {
    final int markTypes = 1 << UCharacter.NON_SPACING_MARK
            | 1 << UCharacter.COMBINING_SPACING_MARK
            | 1 << UCharacter.ENCLOSING_MARK;
    final int typeBit = 1 << type;
    return (typeBit & markTypes) != 0;
}
/*
 * Copies a run in forward (logical) order, optionally removing Bidi
 * control characters and/or replacing characters by their mirror image.
 *
 * Why mirroring can be requested for forward output: with OUTPUT_REVERSE
 * set on writeReordered(), RTL runs are semantically written in reverse
 * and then reversed again.  Instead of doing that twice, they are written
 * forward right away.  If such a run was to be mirrored, the mirroring has
 * to happen here, because the implicit second reversal must not do it.
 * Mirroring in LTR output looks strange, but it only occurs because RTL
 * output is being written in reverse.
 *
 * Only the REMOVE_BIDI_CONTROLS and DO_MIRRORING bits of options are
 * looked at.  Returns src itself when neither bit is set.
 */
private static String doWriteForward(String src, int options) {
    final int selected =
            options & (BidiBase.REMOVE_BIDI_CONTROLS | BidiBase.DO_MIRRORING);

    if (selected == 0) {
        /* nothing to change: hand back the run as it is */
        return src;
    }

    if (selected == BidiBase.REMOVE_BIDI_CONTROLS) {
        /* no mirroring: copy code unit by code unit, leaving out controls */
        final StringBuilder kept = new StringBuilder(src.length());
        int pos = 0;
        do {
            final char unit = src.charAt(pos++);
            if (!BidiBase.IsBidiControlChar(unit)) {
                kept.append(unit);
            }
        } while (pos < src.length());
        return kept.toString();
    }

    /* mirroring requested, with or without removal of Bidi controls */
    final boolean stripControls = selected != BidiBase.DO_MIRRORING;
    final StringBuffer mirrored = new StringBuffer(src.length());
    int pos = 0;
    do {
        /* work on whole code points so that surrogate pairs stay intact */
        final int codePoint = UTF16.charAt(src, pos);
        pos += UTF16.getCharCount(codePoint);
        if (!stripControls || !BidiBase.IsBidiControlChar(codePoint)) {
            UTF16.append(mirrored, UCharacter.getMirror(codePoint));
        }
    } while (pos < src.length());
    return mirrored.toString();
}
/*
 * Convenience overload: writes the segment text[start..limit) forward by
 * turning it into a String and delegating to doWriteForward(String, int).
 */
private static String doWriteForward(char[] text, int start, int limit,
                                     int options) {
    final int count = limit - start;
    final String segment = new String(text, start, count);
    return doWriteForward(segment, options);
}
/*
* Returns the text of src in reverse order of code points, honoring the
* REMOVE_BIDI_CONTROLS, DO_MIRRORING and KEEP_BASE_COMBINING bits of options
* (all other bits are ignored here).
* NOTE(review): every branch uses do/while and reads the code point at
* srcLength - 1 before testing the length, so an empty src is not handled
* by this method - confirm that callers never pass one.
*/
static String writeReverse(String src, int options) {
/*
* RTL run -
*
* RTL runs need to be copied to the destination in reverse order
* of code points, not code units, to keep Unicode characters intact.
*
* The general strategy for this is to read the source text
* in backward order, collect all code units for a code point
* (and optionally following combining characters, see below),
* and copy all these code units in ascending order
* to the destination for this run.
*
* Several options request whether combining characters
* should be kept after their base characters,
* whether Bidi control characters should be removed, and
* whether characters should be replaced by their mirror-image
* equivalent Unicode characters.
*/
StringBuffer dest = new StringBuffer(src.length());
/* optimize for several combinations of options */
switch (options &
(BidiBase.REMOVE_BIDI_CONTROLS |
BidiBase.DO_MIRRORING |
BidiBase.KEEP_BASE_COMBINING)) {
case 0:
/*
* With none of the "complicated" options set, the destination
* run will have the same length as the source run,
* and there is no mirroring and no keeping combining characters
* with their base characters.
*
* XXX: or dest = UTF16.reverse(new StringBuffer(src));
*/
/* srcLength is declared here but, being scoped to the whole switch,
is reused (re-assigned) by the other cases below */
int srcLength = src.length();
/* preserve character integrity */
do {
/* i is always after the last code unit known to need to be kept
* in this segment */
int i = srcLength;
/* collect code units for one base character */
srcLength -= UTF16.getCharCount(UTF16.charAt(src,
srcLength - 1));
/* copy this base character */
dest.append(src.substring(srcLength, i));
} while(srcLength > 0);
break;
case BidiBase.KEEP_BASE_COMBINING:
/*
* Here, too, the destination
* run will have the same length as the source run,
* and there is no mirroring.
* We do need to keep combining characters with their base
* characters.
*/
srcLength = src.length();
/* preserve character integrity */
do {
/* i is always after the last code unit known to need to be kept
* in this segment */
int c;
int i = srcLength;
/* collect code units and modifier letters for one base
* character */
/* step backwards until a non-combining code point (the base) is reached */
do {
c = UTF16.charAt(src, srcLength - 1);
srcLength -= UTF16.getCharCount(c);
} while(srcLength > 0 && IsCombining(UCharacter.getType(c)));
/* copy this "user character" */
dest.append(src.substring(srcLength, i));
} while(srcLength > 0);
break;
default:
/*
* With several "complicated" options set, this is the most
* general and the slowest copying of an RTL run.
* We will do mirroring, remove Bidi controls, and
* keep combining characters with their base characters
* as requested.
*/
srcLength = src.length();
/* preserve character integrity */
do {
/* i is always after the last code unit known to need to be kept
* in this segment */
int i = srcLength;
/* collect code units for one base character */
int c = UTF16.charAt(src, srcLength - 1);
srcLength -= UTF16.getCharCount(c);
if ((options & BidiBase.KEEP_BASE_COMBINING) != 0) {
/* collect modifier letters for this base character */
while(srcLength > 0 && IsCombining(UCharacter.getType(c))) {
c = UTF16.charAt(src, srcLength - 1);
srcLength -= UTF16.getCharCount(c);
}
}
if ((options & BidiBase.REMOVE_BIDI_CONTROLS) != 0 &&
BidiBase.IsBidiControlChar(c)) {
/* do not copy this Bidi control character */
/* in a do/while, continue jumps to the srcLength > 0 test */
continue;
}
/* copy this "user character" */
int j = srcLength;
if((options & BidiBase.DO_MIRRORING) != 0) {
/* mirror only the base character */
c = UCharacter.getMirror(c);
UTF16.append(dest, c);
/* skip the base character in the source; only what follows it is copied */
j += UTF16.getCharCount(c);
}
dest.append(src.substring(j, i));
} while(srcLength > 0);
break;
} /* end of switch */
return dest.toString();
}
/*
 * Convenience wrapper: writes the segment text[start..limit) in reverse by
 * turning it into a String and delegating to writeReverse(String, int).
 */
static String doWriteReverse(char[] text, int start, int limit, int options) {
    final int count = limit - start;
    final String segment = new String(text, start, count);
    return writeReverse(segment, options);
}
/*
* Produce the reordered text of a Bidi object by walking its visual runs
* and appending each run's text, written forward or reversed depending on
* the run direction and on the OUTPUT_REVERSE option.
* The options passed in are first adjusted according to the object's
* reorderingOptions and reorderingMode; when INSERT_LRM_FOR_NUMERIC remains
* set, LRM/RLM characters are appended around runs as described below.
* Returns the assembled text as a new String.
*/
static String writeReordered(BidiBase bidi, int options) {
int run, runCount;
StringBuilder dest;
char[] text = bidi.text;
runCount = bidi.countRuns();
/*
* Option "insert marks" implies BidiBase.INSERT_LRM_FOR_NUMERIC if the
* reordering mode (checked below) is appropriate.
*/
if ((bidi.reorderingOptions & BidiBase.OPTION_INSERT_MARKS) != 0) {
options |= BidiBase.INSERT_LRM_FOR_NUMERIC;
options &= ~BidiBase.REMOVE_BIDI_CONTROLS;
}
/*
* Option "remove controls" implies BidiBase.REMOVE_BIDI_CONTROLS
* and cancels BidiBase.INSERT_LRM_FOR_NUMERIC.
*/
/* this check runs second, so it wins when both reordering options are set */
if ((bidi.reorderingOptions & BidiBase.OPTION_REMOVE_CONTROLS) != 0) {
options |= BidiBase.REMOVE_BIDI_CONTROLS;
options &= ~BidiBase.INSERT_LRM_FOR_NUMERIC;
}
/*
* If we do not perform the "inverse Bidi" algorithm, then we
* don't need to insert any LRMs, and don't need to test for it.
*/
if ((bidi.reorderingMode != BidiBase.REORDER_INVERSE_NUMBERS_AS_L) &&
(bidi.reorderingMode != BidiBase.REORDER_INVERSE_LIKE_DIRECT) &&
(bidi.reorderingMode != BidiBase.REORDER_INVERSE_FOR_NUMBERS_SPECIAL) &&
(bidi.reorderingMode != BidiBase.REORDER_RUNS_ONLY)) {
options &= ~BidiBase.INSERT_LRM_FOR_NUMERIC;
}
/* initial capacity only: doubled when marks may be inserted */
dest = new StringBuilder((options & BidiBase.INSERT_LRM_FOR_NUMERIC) != 0 ?
bidi.length * 2 : bidi.length);
/*
* Iterate through all visual runs and copy the run text segments to
* the destination, according to the options.
*
* The tests for where to insert LRMs ignore the fact that there may be
* BN codes or non-BMP code points at the beginning and end of a run;
* they may insert LRMs unnecessarily but the tests are faster this way
* (this would have to be improved for UTF-8).
*/
if ((options & BidiBase.OUTPUT_REVERSE) == 0) {
/* forward output */
if ((options & BidiBase.INSERT_LRM_FOR_NUMERIC) == 0) {
/* do not insert Bidi controls */
for (run = 0; run < runCount; ++run) {
BidiRun bidiRun = bidi.getVisualRun(run);
if (bidiRun.isEvenRun()) {
/* DO_MIRRORING is masked out for even (LTR) runs */
dest.append(doWriteForward(text, bidiRun.start,
bidiRun.limit,
options & ~BidiBase.DO_MIRRORING));
} else {
dest.append(doWriteReverse(text, bidiRun.start,
bidiRun.limit, options));
}
}
} else {
/* insert Bidi controls for "inverse Bidi" */
byte[] dirProps = bidi.dirProps;
char uc;
int markFlag;
for (run = 0; run < runCount; ++run) {
BidiRun bidiRun = bidi.getVisualRun(run);
/* NOTE(review): this store is overwritten by the next statement */
markFlag=0;
/* check if something relevant in insertPoints */
markFlag = bidi.runs[run].insertRemove;
if (markFlag < 0) { /* bidi controls count */
markFlag = 0;
}
if (bidiRun.isEvenRun()) {
if (bidi.isInverse() &&
dirProps[bidiRun.start] != BidiBase.L) {
markFlag |= BidiBase.LRM_BEFORE;
}
/* an LRM flag takes precedence over an RLM flag; uc == 0 means no mark */
if ((markFlag & BidiBase.LRM_BEFORE) != 0) {
uc = LRM_CHAR;
} else if ((markFlag & BidiBase.RLM_BEFORE) != 0) {
uc = RLM_CHAR;
} else {
uc = 0;
}
if (uc != 0) {
dest.append(uc);
}
dest.append(doWriteForward(text,
bidiRun.start, bidiRun.limit,
options & ~BidiBase.DO_MIRRORING));
if (bidi.isInverse() &&
dirProps[bidiRun.limit - 1] != BidiBase.L) {
markFlag |= BidiBase.LRM_AFTER;
}
if ((markFlag & BidiBase.LRM_AFTER) != 0) {
uc = LRM_CHAR;
} else if ((markFlag & BidiBase.RLM_AFTER) != 0) {
uc = RLM_CHAR;
} else {
uc = 0;
}
if (uc != 0) {
dest.append(uc);
}
} else { /* RTL run */
/* the run is written reversed, so its last logical character comes first */
if (bidi.isInverse() &&
!bidi.testDirPropFlagAt(MASK_R_AL,
bidiRun.limit - 1)) {
markFlag |= BidiBase.RLM_BEFORE;
}
if ((markFlag & BidiBase.LRM_BEFORE) != 0) {
uc = LRM_CHAR;
} else if ((markFlag & BidiBase.RLM_BEFORE) != 0) {
uc = RLM_CHAR;
} else {
uc = 0;
}
if (uc != 0) {
dest.append(uc);
}
dest.append(doWriteReverse(text, bidiRun.start,
bidiRun.limit, options));
if(bidi.isInverse() &&
(MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.start])) == 0) {
markFlag |= BidiBase.RLM_AFTER;
}
if ((markFlag & BidiBase.LRM_AFTER) != 0) {
uc = LRM_CHAR;
} else if ((markFlag & BidiBase.RLM_AFTER) != 0) {
uc = RLM_CHAR;
} else {
uc = 0;
}
if (uc != 0) {
dest.append(uc);
}
}
}
}
} else {
/* reverse output */
/* runs are visited from last to first, and each run's writer is swapped:
even runs are written reversed, odd runs forward */
if((options & BidiBase.INSERT_LRM_FOR_NUMERIC) == 0) {
/* do not insert Bidi controls */
for(run = runCount; --run >= 0; ) {
BidiRun bidiRun = bidi.getVisualRun(run);
if (bidiRun.isEvenRun()) {
dest.append(doWriteReverse(text,
bidiRun.start, bidiRun.limit,
options & ~BidiBase.DO_MIRRORING));
} else {
dest.append(doWriteForward(text, bidiRun.start,
bidiRun.limit, options));
}
}
} else {
/* insert Bidi controls for "inverse Bidi" */
/* unlike the forward branch, this one looks only at dirProps:
insertRemove flags and isInverse() are not consulted here */
byte[] dirProps = bidi.dirProps;
for (run = runCount; --run >= 0; ) {
/* reverse output */
BidiRun bidiRun = bidi.getVisualRun(run);
if (bidiRun.isEvenRun()) {
if (dirProps[bidiRun.limit - 1] != BidiBase.L) {
dest.append(LRM_CHAR);
}
dest.append(doWriteReverse(text, bidiRun.start,
bidiRun.limit, options & ~BidiBase.DO_MIRRORING));
if (dirProps[bidiRun.start] != BidiBase.L) {
dest.append(LRM_CHAR);
}
} else {
if ((MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.start])) == 0) {
dest.append(RLM_CHAR);
}
dest.append(doWriteForward(text, bidiRun.start,
bidiRun.limit, options));
if ((MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.limit - 1])) == 0) {
dest.append(RLM_CHAR);
}
}
}
}
}
return dest.toString();
}
}

View file

@ -1,526 +0,0 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
******************************************************************************
*
* Copyright (C) 2009-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
*/
package sun.text.normalizer;
import sun.text.normalizer.UnicodeSet.SpanCondition;
/**
* Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points.
*
* Latin-1: Look up bytes.
* 2-byte characters: Bits organized vertically.
* 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges.
* Supplementary characters: Call contains() on the parent set.
*/
final class BMPSet {

    /**
     * One boolean ('true' or 'false') per Latin-1 character (U+0000..U+00FF).
     */
    private final boolean[] latin1Contains;

    /**
     * One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points
     * correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6}
     * trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead)
     *
     * In this class the bits for U+0000..U+00FF are never populated by initBits() and never read:
     * contains()/span()/spanBack() answer those code points from latin1Contains[] instead.
     */
    private final int[] table7FF;

    /**
     * One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks
     * correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12}
     * t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit
     * indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed
     * and the inversion list must be searched (containsSlow()).
     *
     * initBits() fills these bits only for code points from U+0800 upwards, directly from the inversion
     * list (surrogate code points U+D800..U+DFFF included).
     */
    private final int[] bmpBlockBits;

    /**
     * Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000,
     * U+2000, .., U+F000, U+10000). U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
     * always looked up in the bit tables. The last pair of indexes (0x10, 0x11) is for finding supplementary
     * code points; entry 0x11 is listLength-1.
     */
    private final int[] list4kStarts;

    /**
     * The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for
     * supplementary code points. The list is terminated with list[listLength-1]=0x110000.
     */
    private final int[] list;

    private final int listLength; // length used; list may be longer to minimize reallocs

    /**
     * Builds the lookup tables for the given inversion list. The array is kept by reference, not copied.
     *
     * @param parentList       inversion list of the parent set
     * @param parentListLength number of entries of parentList that are in use
     */
    public BMPSet(final int[] parentList, int parentListLength) {
        list = parentList;
        listLength = parentListLength;
        latin1Contains = new boolean[0x100];
        table7FF = new int[64];
        bmpBlockBits = new int[64];
        list4kStarts = new int[18];

        /*
         * Set the list indexes for binary searches for U+0800, U+1000, U+2000, .., U+F000, U+10000. U+0800 is the
         * first 3-byte-UTF-8 code point. Lower code points are looked up in the bit tables. The last pair of
         * indexes is for finding supplementary code points.
         */
        list4kStarts[0] = findCodePoint(0x800, 0, listLength - 1);
        int i;
        for (i = 1; i <= 0x10; ++i) {
            // Each search starts where the previous one ended: the boundaries are increasing.
            list4kStarts[i] = findCodePoint(i << 12, list4kStarts[i - 1], listLength - 1);
        }
        list4kStarts[0x11] = listLength - 1;

        initBits();
    }

    /**
     * Tests whether the set contains the code point.
     *
     * @param c code point; must not be negative (a negative value indexes latin1Contains[] out of bounds)
     * @return true if c is in the set; false for values above U+10FFFF
     */
    public boolean contains(int c) {
        if (c <= 0xff) {
            return (latin1Contains[c]);
        } else if (c <= 0x7ff) {
            return ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0);
        } else if (c < 0xd800 || (c >= 0xe000 && c <= 0xffff)) {
            int lead = c >> 12;
            // Extract the "all-in" bit (position lead) and the "mixed" bit (position lead+16).
            int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
            if (twoBits <= 1) {
                // All 64 code points with the same bits 15..6
                // are either in the set or not.
                return (0 != twoBits);
            } else {
                // Look up the code point in its 4k block of code points.
                return containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1]);
            }
        } else if (c <= 0x10ffff) {
            // surrogate or supplementary code point
            return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
        } else {
            // Out-of-range code points get false, consistent with long-standing
            // behavior of UnicodeSet.contains(c).
            return false;
        }
    }

    /**
     * Span the initial substring for which each character c has spanCondition==contains(c).
     * SpanCondition.NOT_CONTAINED spans characters that are not in the set; every other
     * condition value spans characters that are in the set.
     *
     * @param s             the text to scan
     * @param start         The start index
     * @param spanCondition which kind of characters to span
     * @param outCount      If not null: Receives the number of code points in the span.
     * @return the limit (exclusive end) of the span
     *
     * NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for
     * sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points
     * as usual in ICU.
     */
    public final int span(CharSequence s, int start, SpanCondition spanCondition,
            OutputInt outCount) {
        char c, c2;
        int i = start;
        int limit = s.length();
        int numSupplementary = 0;
        if (SpanCondition.NOT_CONTAINED != spanCondition) {
            // span
            while (i < limit) {
                c = s.charAt(i);
                if (c <= 0xff) {
                    if (!latin1Contains[c]) {
                        break;
                    }
                } else if (c <= 0x7ff) {
                    if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
                        break;
                    }
                } else if (c < 0xd800 ||
                           c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
                    // c is not a lead surrogate, or it is not followed by a trail surrogate:
                    // treat the single code unit as a BMP code point.
                    int lead = c >> 12;
                    int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
                    if (twoBits <= 1) {
                        // All 64 code points with the same bits 15..6
                        // are either in the set or not.
                        if (twoBits == 0) {
                            break;
                        }
                    } else {
                        // Look up the code point in its 4k block of code points.
                        if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
                            break;
                        }
                    }
                } else {
                    // surrogate pair
                    int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
                    if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
                        break;
                    }
                    ++numSupplementary;
                    ++i; // skip the trail unit as well
                }
                ++i;
            }
        } else {
            // span not
            while (i < limit) {
                c = s.charAt(i);
                if (c <= 0xff) {
                    if (latin1Contains[c]) {
                        break;
                    }
                } else if (c <= 0x7ff) {
                    if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
                        break;
                    }
                } else if (c < 0xd800 ||
                           c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
                    int lead = c >> 12;
                    int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
                    if (twoBits <= 1) {
                        // All 64 code points with the same bits 15..6
                        // are either in the set or not.
                        if (twoBits != 0) {
                            break;
                        }
                    } else {
                        // Look up the code point in its 4k block of code points.
                        if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
                            break;
                        }
                    }
                } else {
                    // surrogate pair
                    int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
                    if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
                        break;
                    }
                    ++numSupplementary;
                    ++i; // skip the trail unit as well
                }
                ++i;
            }
        }
        if (outCount != null) {
            int spanLength = i - start;
            // Each supplementary code point occupied two code units.
            outCount.value = spanLength - numSupplementary; // number of code points
        }
        return i;
    }

    /**
     * Symmetrical with span().
     * Span the trailing substring for which each character c has spanCondition==contains(c).
     * SpanCondition.NOT_CONTAINED spans characters that are not in the set; every other
     * condition value spans characters that are in the set.
     *
     * <p>A code unit is combined with its predecessor into a supplementary code point only when
     * it is a trail surrogate (U+DC00..U+DFFF) that is preceded by a lead surrogate
     * (U+D800..U+DBFF). Every other code unit, including BMP characters U+E000..U+FFFF that
     * happen to follow a lead surrogate, is looked up on its own.
     *
     * @param s             the text to scan
     * @param limit         exclusive end of the text to scan; must be greater than 0 and at most s.length()
     *                      (the first step reads s.charAt(limit - 1))
     * @param spanCondition which kind of characters to span
     * @return The string index which starts the span (i.e. inclusive).
     */
    public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) {
        char c, c2;

        if (SpanCondition.NOT_CONTAINED != spanCondition) {
            // span
            for (;;) {
                c = s.charAt(--limit);
                if (c <= 0xff) {
                    if (!latin1Contains[c]) {
                        break;
                    }
                } else if (c <= 0x7ff) {
                    if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
                        break;
                    }
                } else if (c < 0xdc00 || c >= 0xe000 ||
                           0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
                    // c is not a trail surrogate, or it is not preceded by a lead surrogate:
                    // treat the single code unit as a BMP code point.
                    int lead = c >> 12;
                    int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
                    if (twoBits <= 1) {
                        // All 64 code points with the same bits 15..6
                        // are either in the set or not.
                        if (twoBits == 0) {
                            break;
                        }
                    } else {
                        // Look up the code point in its 4k block of code points.
                        if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
                            break;
                        }
                    }
                } else {
                    // surrogate pair
                    int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
                    if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
                        break;
                    }
                    --limit; // step over the lead unit as well
                }
                if (0 == limit) {
                    return 0;
                }
            }
        } else {
            // span not
            for (;;) {
                c = s.charAt(--limit);
                if (c <= 0xff) {
                    if (latin1Contains[c]) {
                        break;
                    }
                } else if (c <= 0x7ff) {
                    if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
                        break;
                    }
                } else if (c < 0xdc00 || c >= 0xe000 ||
                           0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
                    // c is not a trail surrogate, or it is not preceded by a lead surrogate:
                    // treat the single code unit as a BMP code point.
                    int lead = c >> 12;
                    int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
                    if (twoBits <= 1) {
                        // All 64 code points with the same bits 15..6
                        // are either in the set or not.
                        if (twoBits != 0) {
                            break;
                        }
                    } else {
                        // Look up the code point in its 4k block of code points.
                        if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
                            break;
                        }
                    }
                } else {
                    // surrogate pair
                    int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
                    if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
                        break;
                    }
                    --limit; // step over the lead unit as well
                }
                if (0 == limit) {
                    return 0;
                }
            }
        }
        // limit indexes the code unit that stopped the span; the span starts right after it.
        return limit + 1;
    }

    /**
     * Set bits in a bit rectangle in "vertical" bit organization. start<limit<=0x800
     *
     * @param table 64-word table; word index = low 6 bits of the position, bit index = upper bits
     * @param start first position to set (inclusive)
     * @param limit end of the positions to set (exclusive)
     */
    private static void set32x64Bits(int[] table, int start, int limit) {
        assert (64 == table.length);
        int lead = start >> 6;  // Named for UTF-8 2-byte lead byte with upper 5 bits.
        int trail = start & 0x3f;  // Named for UTF-8 2-byte trail byte with lower 6 bits.

        // Set one bit indicating an all-one block.
        int bits = 1 << lead;
        if ((start + 1) == limit) { // Single-character shortcut.
            table[trail] |= bits;
            return;
        }

        int limitLead = limit >> 6;
        int limitTrail = limit & 0x3f;

        if (lead == limitLead) {
            // Partial vertical bit column.
            while (trail < limitTrail) {
                table[trail++] |= bits;
            }
        } else {
            // Partial vertical bit column,
            // followed by a bit rectangle,
            // followed by another partial vertical bit column.
            if (trail > 0) {
                do {
                    table[trail++] |= bits;
                } while (trail < 64);
                ++lead;
            }
            if (lead < limitLead) {
                // All bit positions lead..limitLead-1, in every one of the 64 words.
                bits = ~((1 << lead) - 1);
                if (limitLead < 0x20) {
                    bits &= (1 << limitLead) - 1;
                }
                for (trail = 0; trail < 64; ++trail) {
                    table[trail] |= bits;
                }
            }
            // limit<=0x800. If limit==0x800 then limitLead=32 and limitTrail=0.
            // In that case, bits=1<<limitLead == 1<<0 == 1
            // (because Java << uses only the lower 5 bits of the shift operand)
            // but the bits value is not used because trail<limitTrail is already false.
            bits = 1 << limitLead;
            for (trail = 0; trail < limitTrail; ++trail) {
                table[trail] |= bits;
            }
        }
    }

    /**
     * Fills latin1Contains[], table7FF[] and bmpBlockBits[] by walking the inversion list once,
     * range by range (each range is a [start, limit) pair of consecutive list entries).
     */
    private void initBits() {
        int start, limit;
        int listIndex = 0;

        // Set latin1Contains[].
        do {
            start = list[listIndex++];
            if (listIndex < listLength) {
                limit = list[listIndex++];
            } else {
                limit = 0x110000;
            }
            if (start >= 0x100) {
                break;
            }
            do {
                latin1Contains[start++] = true;
            } while (start < limit && start < 0x100);
        } while (limit <= 0x100);

        // Set table7FF[].
        while (start < 0x800) {
            set32x64Bits(table7FF, start, limit <= 0x800 ? limit : 0x800);
            if (limit > 0x800) {
                // The current range continues into the bmpBlockBits[] area.
                start = 0x800;
                break;
            }

            start = list[listIndex++];
            if (listIndex < listLength) {
                limit = list[listIndex++];
            } else {
                limit = 0x110000;
            }
        }

        // Set bmpBlockBits[].
        int minStart = 0x800;
        while (start < 0x10000) {
            if (limit > 0x10000) {
                limit = 0x10000;
            }

            if (start < minStart) {
                start = minStart;
            }
            if (start < limit) { // Else: Another range entirely in a known mixed-value block.
                if (0 != (start & 0x3f)) {
                    // Mixed-value block of 64 code points.
                    start >>= 6;
                    bmpBlockBits[start & 0x3f] |= 0x10001 << (start >> 6);
                    start = (start + 1) << 6; // Round up to the next block boundary.
                    minStart = start; // Ignore further ranges in this block.
                }
                if (start < limit) {
                    if (start < (limit & ~0x3f)) {
                        // Multiple all-ones blocks of 64 code points each.
                        set32x64Bits(bmpBlockBits, start >> 6, limit >> 6);
                    }

                    if (0 != (limit & 0x3f)) {
                        // Mixed-value block of 64 code points.
                        limit >>= 6;
                        bmpBlockBits[limit & 0x3f] |= 0x10001 << (limit >> 6);
                        limit = (limit + 1) << 6; // Round up to the next block boundary.
                        minStart = limit; // Ignore further ranges in this block.
                    }
                }
            }

            if (limit == 0x10000) {
                break;
            }

            start = list[listIndex++];
            if (listIndex < listLength) {
                limit = list[listIndex++];
            } else {
                limit = 0x110000;
            }
        }
    }

    /**
     * Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code
     * points in a certain range.
     *
     * For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and
     * hi=findCodePoint(end) with 0<=lo<=hi<len. findCodePoint(c) defaults to lo=0 and hi=len-1.
     *
     * @param c
     *            a character in a subrange of MIN_VALUE..MAX_VALUE
     * @param lo
     *            The lowest index to be returned.
     * @param hi
     *            The highest index to be returned.
     * @return the smallest integer i in the range lo..hi, inclusive, such that c < list[i]
     */
    private int findCodePoint(int c, int lo, int hi) {
        /* Examples:
                                           findCodePoint(c)
           set              list[]         c=0 1 3 4 7 8
           ===              ==============   ===========
           []               [110000]         0 0 0 0 0 0
           [\u0000-\u0003]  [0, 4, 110000]   1 1 1 2 2 2
           [\u0004-\u0007]  [4, 8, 110000]   0 0 0 1 1 2
           [:Any:]          [0, 110000]      1 1 1 1 1 1
         */

        // Return the smallest i such that c < list[i]. Assume
        // list[len - 1] == HIGH and that c is legal (0..HIGH-1).
        if (c < list[lo])
            return lo;
        // High runner test. c is often after the last range, so an
        // initial check for this condition pays off.
        if (lo >= hi || c >= list[hi - 1])
            return hi;
        // invariant: c >= list[lo]
        // invariant: c < list[hi]
        for (;;) {
            int i = (lo + hi) >>> 1;
            if (i == lo) {
                break; // Found!
            } else if (c < list[i]) {
                hi = i;
            } else {
                lo = i;
            }
        }
        return hi;
    }

    /**
     * Looks c up in the inversion list, searching only indexes lo..hi.
     * An odd result index means c lies inside a range of the set.
     */
    private final boolean containsSlow(int c, int lo, int hi) {
        return (0 != (findCodePoint(c, lo, hi) & 1));
    }
}

View file

@ -1,175 +0,0 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
******************************************************************************
* Copyright (C) 1996-2014, International Business Machines Corporation and
* others. All Rights Reserved.
******************************************************************************
*/
package sun.text.normalizer;
import java.io.DataInputStream;
import java.io.InputStream;
import java.io.IOException;
/**
* Trie implementation which stores data in char, 16 bits.
* @author synwee
* @see com.ibm.icu.impl.Trie
* @since release 2.1, Jan 01 2002
*/
// note that i need to handle the block calculations later, since chartrie
// in icu4c uses the same index array.
public class CharTrie extends Trie
{
    // public constructors ---------------------------------------------

    /**
     * <p>Builds a char trie from serialized data.</p>
     * <p>The 32-bit-aligned input stream is unserialized by the superclass
     * constructor; this constructor then verifies that the data really
     * describes a trie with char values.</p>
     * @param inputStream file input stream to a ICU data file, containing
     *                    the trie
     * @param dataManipulate object which provides methods to parse the char
     *                       data
     * @throws IOException thrown when data reading fails
     * @throws IllegalArgumentException if the data is not a char trie
     * @draft 2.1
     */
    public CharTrie(InputStream inputStream,
                    DataManipulate dataManipulate) throws IOException
    {
        super(inputStream, dataManipulate);

        if (isCharTrie()) {
            return;
        }
        throw new IllegalArgumentException(
                           "Data given does not belong to a char trie.");
    }

    // public methods --------------------------------------------------

    /**
     * Looks up the value stored for a code point.
     * When the offset lookup reports an error (negative offset), the
     * trie's initial value is returned instead.
     * @param ch codepoint
     * @return the char value for ch, or the initial value
     */
    public final char getCodePointValue(int ch)
    {
        if (ch < 0 || ch >= UTF16.LEAD_SURROGATE_MIN_VALUE) {
            // U+D800..U+10FFFF, or not a code point at all: general lookup.
            // A negative offset signals an error; fall back to the default.
            final int generalOffset = getCodePointOffset(ch);
            return (generalOffset < 0) ? m_initialValue_ : m_data_[generalOffset];
        }

        // fastpath for U+0000..U+D7FF (inlined copy of getRawOffset())
        final int blockStart =
            m_index_[ch >> INDEX_STAGE_1_SHIFT_] << INDEX_STAGE_2_SHIFT_;
        return m_data_[blockStart + (ch & INDEX_STAGE_3_MASK_)];
    }

    /**
     * Looks up the data value that a lead surrogate code unit points to.
     * The returned data may contain folding offset information for the next
     * trailing surrogate character.
     * This method does not guarantee correct results for trail surrogates.
     * @param ch lead surrogate character
     * @return data value
     */
    public final char getLeadValue(char ch)
    {
        final int leadOffset = getLeadOffset(ch);
        return m_data_[leadOffset];
    }

    // protected methods -----------------------------------------------

    /**
     * <p>Reads the trie content from the stream. Index and data live in one
     * char array that serves as both the index array and the data array.</p>
     * @param inputStream data input stream containing trie data
     * @exception IOException thrown when data reading fails
     */
    protected final void unserialize(InputStream inputStream)
                                                throws IOException
    {
        final DataInputStream reader = new DataInputStream(inputStream);
        final int unitCount = m_dataOffset_ + m_dataLength_;

        final char[] units = new char[unitCount];
        m_index_ = units;
        int pos = 0;
        while (pos < unitCount) {
            units[pos] = reader.readChar();
            pos++;
        }

        m_data_           = units;
        m_initialValue_   = units[m_dataOffset_];
    }

    /**
     * Computes the data offset for a surrogate pair.
     * @param lead lead surrogate
     * @param trail trailing surrogate
     * @return offset to data, or -1 if the lead value yields no positive
     *         folding offset
     * @throws NullPointerException if this trie has no DataManipulate
     * @draft 2.1
     */
    protected final int getSurrogateOffset(char lead, char trail)
    {
        if (null == m_dataManipulate_) {
            throw new NullPointerException(
                             "The field DataManipulate in this Trie is null");
        }

        // fold position for the trail surrogate that follows this lead
        final int foldingOffset =
            m_dataManipulate_.getFoldingOffset(getLeadValue(lead));

        if (foldingOffset <= 0) {
            // error indicator; callers substitute the default value
            // m_initialValue_
            return -1;
        }

        // real data offset from the folded lead/trail units
        return getRawOffset(foldingOffset, (char)(trail & SURROGATE_MASK_));
    }

    // private data members --------------------------------------------

    /**
     * Default value
     */
    private char m_initialValue_;

    /**
     * Array of char data
     */
    private char[] m_data_;
}

View file

@ -1,145 +0,0 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
import java.text.CharacterIterator;
/**
* This class is a wrapper around CharacterIterator and implements the
* UCharacterIterator protocol
* @author ram
*/
class CharacterIteratorWrapper extends UCharacterIterator {

    /** The wrapped iterator; never null once constructed. */
    private CharacterIterator iterator;

    /**
     * Wraps the given iterator.
     * @param iter the iterator to delegate to
     * @throws IllegalArgumentException if iter is null
     */
    public CharacterIteratorWrapper(CharacterIterator iter){
        if (null == iter) {
            throw new IllegalArgumentException();
        }
        this.iterator = iter;
    }

    /**
     * Returns the code unit at the current index, or DONE when the wrapped
     * iterator reports CharacterIterator.DONE.
     * @see UCharacterIterator#current()
     */
    public int current() {
        final int unit = iterator.current();
        return (unit == CharacterIterator.DONE) ? DONE : unit;
    }

    /**
     * Returns the number of code units between the wrapped iterator's
     * begin and end index.
     * @see UCharacterIterator#getLength()
     */
    public int getLength() {
        final int end = iterator.getEndIndex();
        final int begin = iterator.getBeginIndex();
        return end - begin;
    }

    /**
     * @see UCharacterIterator#getIndex()
     */
    public int getIndex() {
        return iterator.getIndex();
    }

    /**
     * Returns the code unit at the current index and then advances the
     * wrapped iterator; returns DONE when that code unit is
     * CharacterIterator.DONE.
     * @see UCharacterIterator#next()
     */
    public int next() {
        final int unit = iterator.current();
        iterator.next();
        return (unit == CharacterIterator.DONE) ? DONE : unit;
    }

    /**
     * Moves the wrapped iterator back by one and returns the code unit
     * there; returns DONE when the wrapped iterator reports
     * CharacterIterator.DONE.
     * @see UCharacterIterator#previous()
     */
    public int previous() {
        final int unit = iterator.previous();
        return (unit == CharacterIterator.DONE) ? DONE : unit;
    }

    /**
     * @see UCharacterIterator#setIndex(int)
     */
    public void setIndex(int index) {
        iterator.setIndex(index);
    }

    /**
     * Copies the text of the wrapped iterator into fillIn, starting at
     * offset. The wrapped iterator's index is restored afterwards.
     * @return the text length (end index minus begin index)
     * @throws IndexOutOfBoundsException if offset is negative or the text
     *         does not fit into fillIn at that offset
     * @see UCharacterIterator#getText(char[])
     */
    public int getText(char[] fillIn, int offset){
        final int length = iterator.getEndIndex() - iterator.getBeginIndex();
        final int savedIndex = iterator.getIndex();

        final boolean fits = offset >= 0 && offset + length <= fillIn.length;
        if (!fits) {
            throw new IndexOutOfBoundsException(Integer.toString(length));
        }

        int dest = offset;
        char ch = iterator.first();
        while (ch != CharacterIterator.DONE) {
            fillIn[dest++] = ch;
            ch = iterator.next();
        }

        // put the wrapped iterator back where the caller left it
        iterator.setIndex(savedIndex);
        return length;
    }

    /**
     * Creates a clone of this iterator. Clones the underlying character iterator.
     * @return the copy, or null if cloning is not supported
     * @see UCharacterIterator#clone()
     */
    public Object clone(){
        final CharacterIteratorWrapper copy;
        try {
            copy = (CharacterIteratorWrapper) super.clone();
        } catch (CloneNotSupportedException e) {
            return null; // only invoked if bad underlying character iterator
        }
        copy.iterator = (CharacterIterator) this.iterator.clone();
        return copy;
    }
}

View file

@ -1,501 +0,0 @@
/*
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
// (c) 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html#License
// created: 2018may10 Markus W. Scherer
package sun.text.normalizer;
import java.util.Iterator;
import java.util.NoSuchElementException;
/**
* Abstract map from Unicode code points (U+0000..U+10FFFF) to integer values.
* This does not implement java.util.Map.
*
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public abstract class CodePointMap implements Iterable<CodePointMap.Range> {
/**
* Selectors for how getRange() should report value ranges overlapping with surrogates.
* Most users should use NORMAL.
*
* @see #getRange
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public enum RangeOption {
/**
* getRange() enumerates all same-value ranges as stored in the map.
* Most users should use this option.
*
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
NORMAL,
/**
* getRange() enumerates all same-value ranges as stored in the map,
* except that lead surrogates (U+D800..U+DBFF) are treated as having the
* surrogateValue, which is passed to getRange() as a separate parameter.
* The surrogateValue is not transformed via filter().
* See {@link Character#isHighSurrogate}.
*
* <p>Most users should use NORMAL instead.
*
* <p>This option is useful for maps that map surrogate code *units* to
* special values optimized for UTF-16 string processing
* or for special error behavior for unpaired surrogates,
* but those values are not to be associated with the lead surrogate code *points*.
*
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
FIXED_LEAD_SURROGATES,
/**
* getRange() enumerates all same-value ranges as stored in the map,
* except that all surrogates (U+D800..U+DFFF) are treated as having the
* surrogateValue, which is passed to getRange() as a separate parameter.
* The surrogateValue is not transformed via filter().
* See {@link Character#isSurrogate}.
*
* <p>Most users should use NORMAL instead.
*
* <p>This option is useful for maps that map surrogate code *units* to
* special values optimized for UTF-16 string processing
* or for special error behavior for unpaired surrogates,
* but those values are not to be associated with the surrogate code *points*
* (lead and trail alike).
*
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
FIXED_ALL_SURROGATES
}
/**
* Callback function interface: Modifies a map value.
* Optionally called by getRange().
* The modified value will be returned by the getRange() function.
*
* <p>Can be used to ignore some of the value bits,
* make a filter for one of several values,
* return a value index computed from the map value, etc.
*
* @see #getRange
* @see #iterator
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public interface ValueFilter {

    /**
     * Maps a value read from the map to the value that should be reported.
     *
     * @param value map value
     * @return modified value
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    int apply(int value);
}
/**
* Range iteration result data.
* Code points from start to end map to the same value.
* The value may have been modified by {@link ValueFilter#apply(int)},
* or it may be the surrogateValue if a RangeOption other than "normal" was used.
*
* @see #getRange
* @see #iterator
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public static final class Range {
    private int start;
    private int end;
    private int value;

    /**
     * Constructor. Sets start and end to -1 and value to 0.
     *
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public Range() {
        set(-1, -1, 0);
    }

    /**
     * @return the start code point
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public int getStart() {
        return this.start;
    }

    /**
     * @return the (inclusive) end code point
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public int getEnd() {
        return this.end;
    }

    /**
     * @return the range value
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public int getValue() {
        return this.value;
    }

    /**
     * Overwrites all three fields of this range. When using {@link #iterator()},
     * iteration will resume after the newly set end.
     *
     * @param start new start code point
     * @param end new end code point
     * @param value new value
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public void set(int start, int end, int value) {
        this.value = value;
        this.end = end;
        this.start = start;
    }
}
private final class RangeIterator implements Iterator<Range> {
    // Reused for every step; its end field records how far iteration has come.
    private Range range = new Range();

    /**
     * There is more to iterate while the last reported end is below U+10FFFF
     * (a fresh Range starts with end == -1).
     */
    @Override
    public boolean hasNext() {
        final int lastEnd = range.end;
        return lastEnd >= -1 && lastEnd < 0x10ffff;
    }

    /**
     * Fetches the range that starts right after the previous one.
     * The same Range object is returned on every call.
     */
    @Override
    public Range next() {
        final int nextStart = range.end + 1;
        if (!getRange(nextStart, null, range)) {
            throw new NoSuchElementException();
        }
        return range;
    }

    /** Removal is not supported. */
    @Override
    public final void remove() {
        throw new UnsupportedOperationException();
    }
}
/**
* Iterates over code points of a string and fetches map values.
* This does not implement java.util.Iterator.
*
* <pre>
* void onString(CodePointMap map, CharSequence s, int start) {
* CodePointMap.StringIterator iter = map.stringIterator(s, start);
* while (iter.next()) {
* int end = iter.getIndex(); // code point from between start and end
* useValue(s, start, end, iter.getCodePoint(), iter.getValue());
* start = end;
* }
* }
* </pre>
*
* <p>This class is not intended for public subclassing.
*
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public class StringIterator {
    /**
     * The text being iterated.
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    protected CharSequence s;

    /**
     * Current position in {@code s}, in code units.
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    protected int sIndex;

    /**
     * Code point read by the latest successful step; -1 before the first step.
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    protected int c;

    /**
     * Map value for {@code c}; 0 before the first step.
     *
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    protected int value;

    /**
     * @internal
     * @deprecated This API is ICU internal only.
     */
    @Deprecated
    protected StringIterator(CharSequence s, int sIndex) {
        this.c = -1;
        this.value = 0;
        this.s = s;
        this.sIndex = sIndex;
    }

    /**
     * Resets the iterator to a new string and/or a new string index.
     * The code point is set back to -1 and the value to 0.
     *
     * @param s string to iterate over
     * @param sIndex string index where the iteration will start
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public void reset(CharSequence s, int sIndex) {
        this.c = -1;
        this.value = 0;
        this.s = s;
        this.sIndex = sIndex;
    }

    /**
     * Reads the next code point, post-increments the string index,
     * and gets a value from the map.
     * Sets an implementation-defined error value if the code point is an unpaired surrogate.
     *
     * @return true if the string index was not yet at the end of the string;
     *         otherwise the iterator did not advance
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public boolean next() {
        final boolean hasMore = sIndex < s.length();
        if (hasMore) {
            c = Character.codePointAt(s, sIndex);
            sIndex += Character.charCount(c);
            value = get(c);
        }
        return hasMore;
    }

    /**
     * Reads the previous code point, pre-decrements the string index,
     * and gets a value from the map.
     * Sets an implementation-defined error value if the code point is an unpaired surrogate.
     *
     * @return true if the string index was not yet at the start of the string;
     *         otherwise the iterator did not advance
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public boolean previous() {
        final boolean hasMore = sIndex > 0;
        if (hasMore) {
            c = Character.codePointBefore(s, sIndex);
            sIndex -= Character.charCount(c);
            value = get(c);
        }
        return hasMore;
    }

    /**
     * @return the string index
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public final int getIndex() {
        return sIndex;
    }

    /**
     * @return the code point
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public final int getCodePoint() {
        return c;
    }

    /**
     * @return the map value,
     *         or an implementation-defined error value if
     *         the code point is an unpaired surrogate
     * @draft ICU 63
     * @provisional This API might change or be removed in a future release.
     */
    public final int getValue() {
        return value;
    }
}
/**
* Protected no-args constructor.
* The body is empty: this constructor performs no initialization of its own.
*
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
protected CodePointMap() {
}
/**
* Returns the value for a code point as stored in the map, with range checking.
* Returns an implementation-defined error value if {@code c} is not in the range 0..U+10FFFF.
*
* <p>This method is abstract: the actual lookup is supplied by the concrete map.
* The string iterator's {@code next()} and {@code previous()} call it for
* every code point they read.
*
* @param c the code point
* @return the map value,
* or an implementation-defined error value if
* the code point is not in the range 0..U+10FFFF
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public abstract int get(int c);
/**
* Sets the range object to a range of code points beginning with the start parameter.
* The range start is the same as the start input parameter
* (even if there are preceding code points that have the same value).
* The range end is the last code point such that
* all those from start to there have the same value.
* Returns false if start is not 0..U+10FFFF.
* Can be used to efficiently iterate over all same-value ranges in a map.
* (This is normally faster than iterating over code points and get()ting each value,
* but may be much slower than a data structure that stores ranges directly.)
*
* <p>If the {@link ValueFilter} parameter is not null, then
* the value to be delivered is passed through that filter, and the return value is the end
* of the range where all values are modified to the same actual value.
* The value is unchanged if that parameter is null.
*
* <p>This method is abstract. The five-argument {@code getRange} overload
* delegates to it and then optionally adjusts the result for surrogates.
*
* <p>Example:
* <pre>
* int start = 0;
* CodePointMap.Range range = new CodePointMap.Range();
* while (map.getRange(start, null, range)) {
* int end = range.getEnd();
* int value = range.getValue();
* // Work with the range start..end and its value.
* start = end + 1;
* }
* </pre>
*
* @param start range start
* @param filter an object that may modify the map data value,
* or null if the values from the map are to be used unmodified
* @param range the range object that will be set to the code point range and value
* @return true if start is 0..U+10FFFF; otherwise no new range is fetched
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public abstract boolean getRange(int start, ValueFilter filter, Range range);
/**
* Sets the range object to a range of code points beginning with the start parameter.
* The range start is the same as the start input parameter
* (even if there are preceding code points that have the same value).
* The range end is the last code point such that
* all those from start to there have the same value.
* Returns false if start is not 0..U+10FFFF.
*
* <p>Same as the simpler {@link #getRange(int, ValueFilter, Range)} but optionally
* modifies the range if it overlaps with surrogate code points.
*
* <p>Implementation outline: the plain range is fetched first. Unless the option is
* {@link RangeOption#NORMAL}, the result is then clipped so that it does not run into
* the surrogate block with a different value, or replaced by a surrogateValue range that
* covers the surrogate block and, where possible, is merged with a following
* surrogateValue range.
*
* @param start range start
* @param option defines whether surrogates are treated normally,
* or as having the surrogateValue; usually {@link RangeOption#NORMAL}
* @param surrogateValue value for surrogates; ignored if option=={@link RangeOption#NORMAL}
* @param filter an object that may modify the map data value,
* or null if the values from the map are to be used unmodified
* @param range the range object that will be set to the code point range and value
* @return true if start is 0..U+10FFFF; otherwise no new range is fetched
* @draft ICU 63
* @provisional This API might change or be removed in a future release.
*/
public boolean getRange(int start, RangeOption option, int surrogateValue,
ValueFilter filter, Range range) {
assert option != null;
// Fetch the unmodified range first; a false result means start is out of range.
if (!getRange(start, filter, range)) {
return false;
}
if (option == RangeOption.NORMAL) {
return true;
}
// Last code point that is forced to surrogateValue:
// U+DFFF (end of all surrogates) for FIXED_ALL_SURROGATES,
// otherwise U+DBFF (end of the lead surrogates).
int surrEnd = option == RangeOption.FIXED_ALL_SURROGATES ? 0xdfff : 0xdbff;
int end = range.end;
// U+D7FF is the last code point before the surrogate block.
// Nothing to adjust if the range ends before U+D7FF or starts after the fixed surrogates.
if (end < 0xd7ff || start > surrEnd) {
return true;
}
// The range overlaps with surrogates, or ends just before the first one.
if (range.value == surrogateValue) {
if (end >= surrEnd) {
// Surrogates followed by a non-surrValue range,
// or surrogates are part of a larger surrValue range.
return true;
}
// Otherwise the surrValue range ends before surrEnd: fall through and try to
// extend it across the rest of the fixed surrogates.
} else {
if (start <= 0xd7ff) {
range.end = 0xd7ff; // Non-surrValue range ends before surrValue surrogates.
return true;
}
// Start is a surrogate with a non-surrValue code *unit* value.
// Return a surrValue code *point* range.
range.value = surrogateValue;
if (end > surrEnd) {
range.end = surrEnd; // Surrogate range ends before non-surrValue rest of range.
return true;
}
}
// See if the surrValue surrogate range can be merged with
// an immediately following range.
// Note: this call overwrites range, so start/end/value are restored below as needed.
if (getRange(surrEnd + 1, filter, range) && range.value == surrogateValue) {
range.start = start;
return true;
}
// No merge possible: report exactly start..surrEnd with the surrogate value.
range.start = start;
range.end = surrEnd;
range.value = surrogateValue;
return true;
}
/**
 * Creates a convenience iterator over same-map-value code point ranges.
 * Equivalent to looping over all ranges with
 * {@link #getRange(int, ValueFilter, Range)} without filtering.
 * Adjacent ranges have different map values.
 *
 * <p>The iterator always returns the same Range object.
 *
 * @return a Range iterator
 * @draft ICU 63
 * @provisional This API might change or be removed in a future release.
 */
@Override
public Iterator<Range> iterator() {
    final Iterator<Range> ranges = new RangeIterator();
    return ranges;
}
/**
 * Creates an iterator (not a java.util.Iterator) that walks the code points
 * of a string and fetches their map values.
 *
 * @param s string to iterate over
 * @param sIndex string index where the iteration will start
 * @return the iterator
 * @draft ICU 63
 * @provisional This API might change or be removed in a future release.
 */
public StringIterator stringIterator(CharSequence s, int sIndex) {
    final StringIterator textIterator = new StringIterator(s, sIndex);
    return textIterator;
}
}

View file

@ -1,266 +0,0 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 2009-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.IOException;
/**
 * Normalization filtered by a UnicodeSet.
 * Text is split into alternating runs of characters that are inside the filter set
 * and runs that are outside of it. Only the in-filter runs are handed to the wrapped
 * normalizer; out-of-filter runs are passed through unchanged and are treated as
 * "is normalized" and "quick check yes".
 * Runs are found via UnicodeSet.span(..., UnicodeSet.SpanCondition.SIMPLE).
 * This class implements all of (and only) the Normalizer2 API.
 * An instance of this class is unmodifiable/immutable.
 * @stable ICU 4.4
 * @author Markus W. Scherer
 */
class FilteredNormalizer2 extends Normalizer2 {

    /** The wrapped normalizer; applied to in-filter text only. */
    private Normalizer2 norm2;

    /** The filter: characters in this set are normalized, all others are left alone. */
    private UnicodeSet set;

    /**
     * Constructs a filtered normalizer wrapping any Normalizer2 instance
     * and a filter set.
     * Both are aliased and must not be modified or deleted while this object
     * is used.
     * The filter set should be frozen; otherwise the performance will suffer greatly.
     * @param n2 wrapped Normalizer2 instance
     * @param filterSet UnicodeSet which determines the characters to be normalized
     * @stable ICU 4.4
     */
    public FilteredNormalizer2(Normalizer2 n2, UnicodeSet filterSet) {
        this.norm2 = n2;
        this.set = filterSet;
    }

    /**
     * {@inheritDoc}
     * @stable ICU 4.4
     */
    @Override
    public StringBuilder normalize(CharSequence src, StringBuilder dest) {
        rejectSameObject(dest, src);
        dest.setLength(0);
        normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
        return dest;
    }

    /**
     * {@inheritDoc}
     * @stable ICU 4.6
     */
    @Override
    public Appendable normalize(CharSequence src, Appendable dest) {
        rejectSameObject(dest, src);
        return normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
    }

    /**
     * {@inheritDoc}
     * @stable ICU 4.4
     */
    @Override
    public StringBuilder normalizeSecondAndAppend(
            StringBuilder first, CharSequence second) {
        return normalizeSecondAndAppend(first, second, true);
    }

    /**
     * {@inheritDoc}
     * @stable ICU 4.4
     */
    @Override
    public StringBuilder append(StringBuilder first, CharSequence second) {
        return normalizeSecondAndAppend(first, second, false);
    }

    /**
     * {@inheritDoc}
     * @stable ICU 4.6
     */
    @Override
    public String getDecomposition(int c) {
        if (!set.contains(c)) {
            return null;    // out-of-filter characters are never decomposed
        }
        return norm2.getDecomposition(c);
    }

    /**
     * {@inheritDoc}
     * @stable ICU 49
     */
    @Override
    public int getCombiningClass(int c) {
        if (!set.contains(c)) {
            return 0;       // out-of-filter characters are reported with class 0
        }
        return norm2.getCombiningClass(c);
    }

    /**
     * {@inheritDoc}
     * @stable ICU 4.4
     */
    @Override
    public boolean isNormalized(CharSequence s) {
        // Runs alternate between in-filter and out-of-filter, starting in-filter.
        boolean inFilter = true;
        int runStart = 0;
        while (runStart < s.length()) {
            final int runLimit = set.span(s, runStart,
                    inFilter ? UnicodeSet.SpanCondition.SIMPLE
                             : UnicodeSet.SpanCondition.NOT_CONTAINED);
            if (inFilter && !norm2.isNormalized(s.subSequence(runStart, runLimit))) {
                return false;
            }
            inFilter = !inFilter;
            runStart = runLimit;
        }
        return true;
    }

    /**
     * {@inheritDoc}
     * @stable ICU 4.4
     */
    @Override
    public int spanQuickCheckYes(CharSequence s) {
        // Runs alternate between in-filter and out-of-filter, starting in-filter.
        boolean inFilter = true;
        int runStart = 0;
        while (runStart < s.length()) {
            final int runLimit = set.span(s, runStart,
                    inFilter ? UnicodeSet.SpanCondition.SIMPLE
                             : UnicodeSet.SpanCondition.NOT_CONTAINED);
            if (inFilter) {
                final int yesLimit =
                        runStart + norm2.spanQuickCheckYes(s.subSequence(runStart, runLimit));
                if (yesLimit < runLimit) {
                    // The wrapped normalizer stopped inside this run.
                    return yesLimit;
                }
            }
            inFilter = !inFilter;
            runStart = runLimit;
        }
        return s.length();
    }

    /**
     * {@inheritDoc}
     * @stable ICU 4.4
     */
    @Override
    public boolean hasBoundaryBefore(int c) {
        if (set.contains(c)) {
            return norm2.hasBoundaryBefore(c);
        }
        return true;
    }

    // Shared argument check: source and destination must be distinct objects.
    private static void rejectSameObject(Object dest, Object src) {
        if (dest == src) {
            throw new IllegalArgumentException();
        }
    }

    // Internal: No argument checking, and appends to dest.
    // The caller passes the spanCondition that is likely to yield a non-empty
    // run at the start of src.
    // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
    // UnicodeSet.SpanCondition.SIMPLE should be passed in for the start of src
    // and UnicodeSet.SpanCondition.NOT_CONTAINED should be passed in if we continue after
    // an in-filter prefix.
    private Appendable normalize(CharSequence src, Appendable dest,
                                 UnicodeSet.SpanCondition spanCondition) {
        // One scratch buffer is reused for every in-filter run.
        final StringBuilder scratch = new StringBuilder();
        UnicodeSet.SpanCondition condition = spanCondition;
        try {
            int runStart = 0;
            while (runStart < src.length()) {
                final int runLimit = set.span(src, runStart, condition);
                final boolean runIsEmpty = runLimit == runStart;
                if (condition == UnicodeSet.SpanCondition.NOT_CONTAINED) {
                    // Out-of-filter text is copied verbatim.
                    if (!runIsEmpty) {
                        dest.append(src, runStart, runLimit);
                    }
                    condition = UnicodeSet.SpanCondition.SIMPLE;
                } else {
                    if (!runIsEmpty) {
                        // Not norm2.normalizeSecondAndAppend() because we do not want
                        // to modify the non-filter part of dest.
                        final CharSequence run = src.subSequence(runStart, runLimit);
                        dest.append(norm2.normalize(run, scratch));
                    }
                    condition = UnicodeSet.SpanCondition.NOT_CONTAINED;
                }
                runStart = runLimit;
            }
        } catch (IOException e) {
            throw new InternalError(e.toString(), e);
        }
        return dest;
    }

    // Shared implementation of normalizeSecondAndAppend() (doNormalize=true)
    // and append() (doNormalize=false).
    private StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second,
                                                   boolean doNormalize) {
        if (first == second) {
            throw new IllegalArgumentException();
        }
        if (first.length() == 0) {
            // Nothing to merge with: either normalize second into first or copy it.
            return doNormalize ? normalize(second, first) : first.append(second);
        }
        // merge the in-filter suffix of the first string with the in-filter prefix of the second
        final int prefixLimit = set.span(second, 0, UnicodeSet.SpanCondition.SIMPLE);
        if (prefixLimit != 0) {
            final CharSequence prefix = second.subSequence(0, prefixLimit);
            final int suffixStart =
                    set.spanBack(first, Integer.MAX_VALUE, UnicodeSet.SpanCondition.SIMPLE);
            // If all of first is in-filter, merge directly into it; otherwise merge
            // into a copy of its in-filter suffix so the rest of first stays untouched.
            final StringBuilder mergeTarget = (suffixStart == 0)
                    ? first
                    : new StringBuilder(first.subSequence(suffixStart, first.length()));
            if (doNormalize) {
                norm2.normalizeSecondAndAppend(mergeTarget, prefix);
            } else {
                norm2.append(mergeTarget, prefix);
            }
            if (mergeTarget != first) {
                // Replace the old suffix with the merged text.
                first.delete(suffixStart, Integer.MAX_VALUE).append(mergeTarget);
            }
        }
        if (prefixLimit < second.length()) {
            final CharSequence rest = second.subSequence(prefixLimit, second.length());
            if (doNormalize) {
                // rest starts with out-of-filter text, hence NOT_CONTAINED.
                normalize(rest, first, UnicodeSet.SpanCondition.NOT_CONTAINED);
            } else {
                first.append(rest);
            }
        }
        return first;
    }
}

View file

@ -1,323 +0,0 @@
/*
* Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 1996-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Arrays;
import java.security.AccessController;
import java.security.PrivilegedAction;
/**
 * Static helpers for loading ICU binary data items and for validating and
 * reading their headers.
 */
public final class ICUBinary {

    // private constants -------------------------------------------------

    /**
     * Magic numbers to authenticate the data file
     */
    private static final byte MAGIC1 = (byte)0xda;
    private static final byte MAGIC2 = (byte)0x27;

    /**
     * File format authentication values
     */
    private static final byte BIG_ENDIAN_ = 1;
    private static final byte CHAR_SET_ = 0;
    private static final byte CHAR_SIZE_ = 2;

    /**
     * Error messages
     */
    private static final String MAGIC_NUMBER_AUTHENTICATION_FAILED_ =
                       "ICUBinary data file error: Magic number authentication failed";
    private static final String HEADER_AUTHENTICATION_FAILED_ =
        "ICUBinary data file error: Header authentication failed";

    /** Accepts only data whose first version byte is 1. */
    private static final class IsAcceptable implements Authenticate {
        @Override
        public boolean isDataVersionAcceptable(byte[] version) {
            return version[0] == 1;
        }
    }

    // public inner interface ------------------------------------------------

    /**
     * Special interface for data authentication
     */
    public static interface Authenticate
    {
        /**
         * Method used in ICUBinary.readHeader() to provide data format
         * authentication.
         * @param version version of the current data
         * @return true if dataformat is an acceptable version, false otherwise
         */
        public boolean isDataVersionAcceptable(byte[] version);
    }

    // public methods --------------------------------------------------------

    /**
     * Loads an ICU binary data file and returns it as a ByteBuffer.
     * The returned buffer wraps a freshly read byte array; its position is 0 and
     * its limit is the number of bytes read. Callers should treat the contents as
     * read-only, but the position etc. can be modified.
     *
     * @param itemPath ICU data item path, resolved as a resource relative to this class
     * @return The data as a ByteBuffer.
     * @throws java.util.MissingResourceException if the resource cannot be located
     * @throws UncheckedIOException if reading the resource fails
     */
    public static ByteBuffer getRequiredData(String itemPath) {
        final Class<ICUBinary> root = ICUBinary.class;

        try (InputStream is = AccessController.doPrivileged(new PrivilegedAction<InputStream>() {
                @Override
                public InputStream run() {
                    return root.getResourceAsStream(itemPath);
                }
            })) {

            // getResourceAsStream() returns null for a missing resource. Report that
            // explicitly instead of failing with a bare NullPointerException below.
            if (is == null) {
                throw new java.util.MissingResourceException(
                        "could not locate data " + itemPath, root.getName(), itemPath);
            }

            // is.available() may return 0, or 1, or the total number of bytes in the stream,
            // or some other number.
            // Do not try to use is.available() == 0 to find the end of the stream!
            byte[] bytes;
            int avail = is.available();
            if (avail > 32) {
                // There are more bytes available than just the ICU data header length.
                // With luck, it is the total number of bytes.
                bytes = new byte[avail];
            } else {
                bytes = new byte[128];  // empty .res files are even smaller
            }
            // Call is.read(...) until one returns a negative value.
            int length = 0;
            for (;;) {
                if (length < bytes.length) {
                    int numRead = is.read(bytes, length, bytes.length - length);
                    if (numRead < 0) {
                        break;  // end of stream
                    }
                    length += numRead;
                } else {
                    // See if we are at the end of the stream before we grow the array.
                    int nextByte = is.read();
                    if (nextByte < 0) {
                        break;
                    }
                    int capacity = 2 * bytes.length;
                    if (capacity < 128) {
                        capacity = 128;
                    } else if (capacity < 0x4000) {
                        capacity *= 2;  // Grow faster until we reach 16kB.
                    }
                    bytes = Arrays.copyOf(bytes, capacity);
                    bytes[length++] = (byte) nextByte;
                }
            }
            return ByteBuffer.wrap(bytes, 0, length);
        }
        catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }

    /**
     * Same as readHeader(), but returns a VersionInfo rather than a compact int.
     */
    public static VersionInfo readHeaderAndDataVersion(ByteBuffer bytes,
                                                       int dataFormat,
                                                       Authenticate authenticate)
                                                                throws IOException {
        return getVersionInfoFromCompactInt(readHeader(bytes, dataFormat, authenticate));
    }

    /**
     * Reads and validates an ICU data header from a stream.
     * The stream is read through a DataInputStream, and the header must declare
     * big-endian data, the expected charset family and UChar size, and the
     * expected data format ID. On return the stream is positioned just past
     * the header.
     *
     * @param inputStream stream positioned at the start of the ICU data item
     * @param dataFormatIDExpected the four data format ID bytes the item must carry
     * @param authenticate optional check of the data version bytes; may be null
     * @return the four Unicode version bytes stored in the header
     * @throws IOException if the header is invalid or truncated, or if reading fails
     */
    public static final byte[] readHeader(InputStream inputStream,
                                        byte[] dataFormatIDExpected,
                                        Authenticate authenticate)
                                                          throws IOException
    {
        DataInputStream input = new DataInputStream(inputStream);
        char headersize = input.readChar();
        int readcount = 2;
        //reading the header format
        byte magic1 = input.readByte();
        byte magic2 = input.readByte();
        readcount += 2;
        if (magic1 != MAGIC1 || magic2 != MAGIC2) {
            throw new IOException(MAGIC_NUMBER_AUTHENTICATION_FAILED_);
        }

        input.readChar(); // reading size
        input.readChar(); // reading reserved word
        readcount += 4;
        byte bigendian = input.readByte();
        byte charset = input.readByte();
        byte charsize = input.readByte();
        input.readByte(); // reading reserved byte
        readcount += 4;

        byte[] dataFormatID = new byte[4];
        input.readFully(dataFormatID);
        byte[] dataVersion = new byte[4];
        input.readFully(dataVersion);
        byte[] unicodeVersion = new byte[4];
        input.readFully(unicodeVersion);
        readcount += 12;
        if (headersize < readcount) {
            throw new IOException("Internal Error: Header size error");
        }
        // Skip the remainder of the header (padding/comment).
        skipFully(input, headersize - readcount);

        if (bigendian != BIG_ENDIAN_ || charset != CHAR_SET_
            || charsize != CHAR_SIZE_
            || !Arrays.equals(dataFormatIDExpected, dataFormatID)
            || (authenticate != null
                && !authenticate.isDataVersionAcceptable(dataVersion))) {
            throw new IOException(HEADER_AUTHENTICATION_FAILED_);
        }
        return unicodeVersion;
    }

    /**
     * Skips exactly count bytes of the stream.
     * DataInputStream.skipBytes() may skip fewer bytes than requested, so the
     * call is repeated; when it makes no progress, a single-byte read
     * distinguishes a stalled skip from the end of the stream.
     */
    private static void skipFully(DataInputStream input, int count) throws IOException {
        int remaining = count;
        while (remaining > 0) {
            int skipped = input.skipBytes(remaining);
            if (skipped <= 0) {
                if (input.read() < 0) {
                    throw new java.io.EOFException(
                            "ICUBinary data file error: Header is truncated");
                }
                skipped = 1;
            }
            remaining -= skipped;
        }
    }

    /**
     * Reads an ICU data header, checks the data format, and returns the data version.
     *
     * <p>Assumes that the ByteBuffer position is 0 on input.
     * The buffer byte order is set according to the data.
     * The buffer position is advanced past the header (including UDataInfo and comment).
     *
     * <p>See C++ ucmndata.h and unicode/udata.h.
     *
     * @param bytes buffer holding the ICU data item, starting at index 0
     * @param dataFormat expected data format ID, packed big-endian into an int
     * @param authenticate optional check of the format version bytes; may be null
     * @return dataVersion
     * @throws IOException if this is not a valid ICU data item of the expected dataFormat
     */
    public static int readHeader(ByteBuffer bytes, int dataFormat, Authenticate authenticate)
            throws IOException {
        assert bytes.position() == 0;
        byte magic1 = bytes.get(2);
        byte magic2 = bytes.get(3);
        if (magic1 != MAGIC1 || magic2 != MAGIC2) {
            throw new IOException(MAGIC_NUMBER_AUTHENTICATION_FAILED_);
        }

        byte isBigEndian = bytes.get(8);
        byte charsetFamily = bytes.get(9);
        byte sizeofUChar = bytes.get(10);
        if (isBigEndian < 0 || 1 < isBigEndian ||
                charsetFamily != CHAR_SET_ || sizeofUChar != CHAR_SIZE_) {
            throw new IOException(HEADER_AUTHENTICATION_FAILED_);
        }
        // The multi-byte fields below must be read in the byte order of the data.
        bytes.order(isBigEndian != 0 ? ByteOrder.BIG_ENDIAN : ByteOrder.LITTLE_ENDIAN);

        int headerSize = bytes.getChar(0);
        int sizeofUDataInfo = bytes.getChar(4);
        if (sizeofUDataInfo < 20 || headerSize < (sizeofUDataInfo + 4)) {
            throw new IOException("Internal Error: Header size error");
        }
        // TODO: Change Authenticate to take int major, int minor, int milli, int micro
        // to avoid array allocation.
        byte[] formatVersion = new byte[] {
            bytes.get(16), bytes.get(17), bytes.get(18), bytes.get(19)
        };
        if (bytes.get(12) != (byte)(dataFormat >> 24) ||
                bytes.get(13) != (byte)(dataFormat >> 16) ||
                bytes.get(14) != (byte)(dataFormat >> 8) ||
                bytes.get(15) != (byte)dataFormat ||
                (authenticate != null && !authenticate.isDataVersionAcceptable(formatVersion))) {
            throw new IOException(HEADER_AUTHENTICATION_FAILED_ +
                    String.format("; data format %02x%02x%02x%02x, format version %d.%d.%d.%d",
                            bytes.get(12), bytes.get(13), bytes.get(14), bytes.get(15),
                            formatVersion[0] & 0xff, formatVersion[1] & 0xff,
                            formatVersion[2] & 0xff, formatVersion[3] & 0xff));
        }

        bytes.position(headerSize);
        return  // dataVersion
                ((int)bytes.get(20) << 24) |
                ((bytes.get(21) & 0xff) << 16) |
                ((bytes.get(22) & 0xff) << 8) |
                (bytes.get(23) & 0xff);
    }

    /**
     * Advances the buffer position by skipLength bytes.
     * Does nothing if skipLength is zero or negative.
     */
    public static void skipBytes(ByteBuffer bytes, int skipLength) {
        if (skipLength > 0) {
            bytes.position(bytes.position() + skipLength);
        }
    }

    /**
     * Reads length bytes at the current buffer position into a new array,
     * then skips additionalSkipLength further bytes.
     */
    public static byte[] getBytes(ByteBuffer bytes, int length, int additionalSkipLength) {
        byte[] dest = new byte[length];
        bytes.get(dest);
        // skipBytes() itself ignores non-positive lengths.
        skipBytes(bytes, additionalSkipLength);
        return dest;
    }

    /**
     * Reads length 16-bit chars at the current buffer position as a String,
     * then advances the position past them plus additionalSkipLength bytes.
     */
    public static String getString(ByteBuffer bytes, int length, int additionalSkipLength) {
        CharSequence cs = bytes.asCharBuffer();
        String s = cs.subSequence(0, length).toString();
        skipBytes(bytes, length * 2 + additionalSkipLength);
        return s;
    }

    /**
     * Reads length 16-bit chars at the current buffer position into a new array,
     * then advances the position past them plus additionalSkipLength bytes.
     */
    public static char[] getChars(ByteBuffer bytes, int length, int additionalSkipLength) {
        char[] dest = new char[length];
        // The char view does not move the byte buffer's position; skipBytes() does.
        bytes.asCharBuffer().get(dest);
        skipBytes(bytes, length * 2 + additionalSkipLength);
        return dest;
    }

    /**
     * Reads length 32-bit ints at the current buffer position into a new array,
     * then advances the position past them plus additionalSkipLength bytes.
     */
    public static int[] getInts(ByteBuffer bytes, int length, int additionalSkipLength) {
        int[] dest = new int[length];
        // The int view does not move the byte buffer's position; skipBytes() does.
        bytes.asIntBuffer().get(dest);
        skipBytes(bytes, length * 4 + additionalSkipLength);
        return dest;
    }

    /**
     * Returns a VersionInfo for the bytes in the compact version integer.
     */
    public static VersionInfo getVersionInfoFromCompactInt(int version) {
        return VersionInfo.getInstance(
                version >>> 24, (version >> 16) & 0xff, (version >> 8) & 0xff, version & 0xff);
    }
}

View file

@ -1,287 +0,0 @@
/*
* Copyright (c) 2015, 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 2009-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.IOException;
/**
 * Bundles a NormalizerImpl with the Normalizer2 front ends (composing and
 * decomposing) that are built on top of it, and caches the NFC and NFKC bundles.
 */
final class Norm2AllModes {

    // Public API dispatch via Normalizer2 subclasses -------------------------- ***

    /** Normalizer2 implementation for the old UNORM_NONE: text passes through unchanged. */
    public static final class NoopNormalizer2 extends Normalizer2 {
        @Override
        public StringBuilder normalize(CharSequence src, StringBuilder dest) {
            if (dest == src) {
                throw new IllegalArgumentException();
            }
            dest.setLength(0);
            return dest.append(src);
        }

        @Override
        public Appendable normalize(CharSequence src, Appendable dest) {
            if (dest == src) {
                throw new IllegalArgumentException();
            }
            try {
                return dest.append(src);
            } catch (IOException e) {
                throw new InternalError(e.toString(), e);
            }
        }

        @Override
        public StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second) {
            if (first == second) {
                throw new IllegalArgumentException();
            }
            return first.append(second);
        }

        @Override
        public StringBuilder append(StringBuilder first, CharSequence second) {
            if (first == second) {
                throw new IllegalArgumentException();
            }
            return first.append(second);
        }

        @Override
        public String getDecomposition(int c) {
            return null;
        }

        // No need to override the default getRawDecomposition().

        @Override
        public boolean isNormalized(CharSequence s) {
            return true;
        }

        @Override
        public int spanQuickCheckYes(CharSequence s) {
            return s.length();
        }

        @Override
        public boolean hasBoundaryBefore(int c) {
            return true;
        }
    }

    /**
     * Intermediate class:
     * Has NormalizerImpl and does boilerplate argument checking and setup.
     */
    public abstract static class Normalizer2WithImpl extends Normalizer2 {

        /** The implementation that does the actual work. */
        public final NormalizerImpl impl;

        public Normalizer2WithImpl(NormalizerImpl ni) {
            this.impl = ni;
        }

        // normalize

        @Override
        public StringBuilder normalize(CharSequence src, StringBuilder dest) {
            if (dest == src) {
                throw new IllegalArgumentException();
            }
            dest.setLength(0);
            final NormalizerImpl.ReorderingBuffer buffer =
                    new NormalizerImpl.ReorderingBuffer(impl, dest, src.length());
            normalize(src, buffer);
            return dest;
        }

        @Override
        public Appendable normalize(CharSequence src, Appendable dest) {
            if (dest == src) {
                throw new IllegalArgumentException();
            }
            final NormalizerImpl.ReorderingBuffer buffer =
                    new NormalizerImpl.ReorderingBuffer(impl, dest, src.length());
            normalize(src, buffer);
            buffer.flush();
            return dest;
        }

        /** Mode-specific normalization of src into buffer. */
        protected abstract void normalize(CharSequence src, NormalizerImpl.ReorderingBuffer buffer);

        // normalize and append

        @Override
        public StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second) {
            return normalizeSecondAndAppend(first, second, true);
        }

        @Override
        public StringBuilder append(StringBuilder first, CharSequence second) {
            return normalizeSecondAndAppend(first, second, false);
        }

        /** Shared implementation of normalizeSecondAndAppend() and append(). */
        public StringBuilder normalizeSecondAndAppend(
                StringBuilder first, CharSequence second, boolean doNormalize) {
            if (first == second) {
                throw new IllegalArgumentException();
            }
            final NormalizerImpl.ReorderingBuffer buffer =
                    new NormalizerImpl.ReorderingBuffer(
                            impl, first, first.length() + second.length());
            normalizeAndAppend(second, doNormalize, buffer);
            return first;
        }

        /** Mode-specific append of src to buffer, normalizing src if doNormalize is set. */
        protected abstract void normalizeAndAppend(
                CharSequence src, boolean doNormalize, NormalizerImpl.ReorderingBuffer buffer);

        @Override
        public String getDecomposition(int c) {
            return impl.getDecomposition(c);
        }

        @Override
        public int getCombiningClass(int c) {
            return impl.getCC(impl.getNorm16(c));
        }

        // quick checks

        @Override
        public boolean isNormalized(CharSequence s) {
            // Normalized exactly when the quick-check span covers the whole string.
            final int length = s.length();
            return length == spanQuickCheckYes(s);
        }
    }

    /** Decomposing normalizer built on a NormalizerImpl. */
    public static final class DecomposeNormalizer2 extends Normalizer2WithImpl {
        public DecomposeNormalizer2(NormalizerImpl ni) {
            super(ni);
        }

        @Override
        protected void normalize(CharSequence src, NormalizerImpl.ReorderingBuffer buffer) {
            impl.decompose(src, 0, src.length(), buffer);
        }

        @Override
        protected void normalizeAndAppend(
                CharSequence src, boolean doNormalize, NormalizerImpl.ReorderingBuffer buffer) {
            impl.decomposeAndAppend(src, doNormalize, buffer);
        }

        @Override
        public int spanQuickCheckYes(CharSequence s) {
            // No buffer: only the span is computed, nothing is written.
            return impl.decompose(s, 0, s.length(), null);
        }

        @Override
        public boolean hasBoundaryBefore(int c) {
            return impl.hasDecompBoundaryBefore(c);
        }
    }

    /** Composing normalizer built on a NormalizerImpl. */
    public static final class ComposeNormalizer2 extends Normalizer2WithImpl {

        /** Passed through to the NormalizerImpl compose calls as their onlyContiguous argument. */
        private final boolean onlyContiguous;

        public ComposeNormalizer2(NormalizerImpl ni, boolean fcc) {
            super(ni);
            this.onlyContiguous = fcc;
        }

        @Override
        protected void normalize(CharSequence src, NormalizerImpl.ReorderingBuffer buffer) {
            impl.compose(src, 0, src.length(), onlyContiguous, true, buffer);
        }

        @Override
        protected void normalizeAndAppend(
                CharSequence src, boolean doNormalize, NormalizerImpl.ReorderingBuffer buffer) {
            impl.composeAndAppend(src, doNormalize, onlyContiguous, buffer);
        }

        @Override
        public boolean isNormalized(CharSequence s) {
            final int limit = s.length();
            // 5: small destCapacity for substring normalization
            final NormalizerImpl.ReorderingBuffer scratch =
                    new NormalizerImpl.ReorderingBuffer(impl, new StringBuilder(), 5);
            return impl.compose(s, 0, limit, onlyContiguous, false, scratch);
        }

        @Override
        public int spanQuickCheckYes(CharSequence s) {
            // The span length is carried in the bits above the lowest bit of the result.
            final int quickCheck = impl.composeQuickCheck(s, 0, s.length(), onlyContiguous, true);
            return quickCheck >>> 1;
        }

        @Override
        public boolean hasBoundaryBefore(int c) {
            return impl.hasCompBoundaryBefore(c);
        }
    }

    // instance cache ---------------------------------------------------------- ***

    public final NormalizerImpl impl;
    public final ComposeNormalizer2 comp;
    public final DecomposeNormalizer2 decomp;

    public static final NoopNormalizer2 NOOP_NORMALIZER2 = new NoopNormalizer2();

    private Norm2AllModes(NormalizerImpl ni) {
        this.impl = ni;
        this.comp = new ComposeNormalizer2(ni, false);
        this.decomp = new DecomposeNormalizer2(ni);
    }

    /** Returns the loaded bundle, or rethrows the failure recorded while loading it. */
    private static Norm2AllModes getInstanceFromSingleton(Norm2AllModesSingleton singleton) {
        final RuntimeException failure = singleton.exception;
        if (failure == null) {
            return singleton.allModes;
        }
        throw failure;
    }

    public static Norm2AllModes getNFCInstance() {
        return getInstanceFromSingleton(NFCSingleton.INSTANCE);
    }

    public static Norm2AllModes getNFKCInstance() {
        return getInstanceFromSingleton(NFKCSingleton.INSTANCE);
    }

    /**
     * Loads one normalization data file and keeps either the resulting bundle
     * or the RuntimeException that loading raised.
     */
    private static final class Norm2AllModesSingleton {
        private Norm2AllModes allModes;
        private RuntimeException exception;

        private Norm2AllModesSingleton(String name) {
            try {
                final String dataFileName = "/sun/text/resources/" + name + ".nrm";
                final NormalizerImpl loaded = new NormalizerImpl().load(dataFileName);
                allModes = new Norm2AllModes(loaded);
            } catch (RuntimeException e) {
                // Remember the failure; it is rethrown on every later lookup.
                exception = e;
            }
        }
    }

    /** Holder class for the NFC bundle; created when the holder is first initialized. */
    private static final class NFCSingleton {
        private static final Norm2AllModesSingleton INSTANCE = new Norm2AllModesSingleton("nfc");
    }

    /** Holder class for the NFKC bundle; created when the holder is first initialized. */
    private static final class NFKCSingleton {
        private static final Norm2AllModesSingleton INSTANCE = new Norm2AllModesSingleton("nfkc");
    }
}

View file

@ -1,273 +0,0 @@
/*
* Copyright (c) 2015, 2018, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 2009-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
/**
* Unicode normalization functionality for standard Unicode normalization or
* for using custom mapping tables.
* All instances of this class are unmodifiable/immutable.
* The Normalizer2 class is not intended for public subclassing.
* <p>
* The primary functions are to produce a normalized string and to detect whether
* a string is already normalized.
* The most commonly used normalization forms are those defined in
* http://www.unicode.org/unicode/reports/tr15/
* However, this API supports additional normalization forms for specialized purposes.
* For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)
* and can be used in implementations of UTS #46.
* <p>
* Not only are the standard compose and decompose modes supplied,
* but additional modes are provided as documented in the Mode enum.
* <p>
* Some of the functions in this class identify normalization boundaries.
* At a normalization boundary, the portions of the string
* before it and starting from it do not interact and can be handled independently.
* <p>
* The spanQuickCheckYes() stops at a normalization boundary.
* When the goal is a normalized string, then the text before the boundary
* can be copied, and the remainder can be processed with normalizeSecondAndAppend().
* <p>
* The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
* a character is guaranteed to be at a normalization boundary,
* regardless of context.
* This is used for moving from one normalization boundary to the next
* or preceding boundary, and for performing iterative normalization.
* <p>
* Iterative normalization is useful when only a small portion of a
* longer string needs to be processed.
* For example, in ICU, iterative normalization is used by the NormalizationTransliterator
* (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
* (to process only the substring for which sort key bytes are computed).
* <p>
* The set of normalization boundaries returned by these functions may not be
* complete: There may be more boundaries that could be returned.
* Different functions may return different boundaries.
* @stable ICU 4.4
* @author Markus W. Scherer
*/
abstract class Normalizer2 {
/**
 * Returns a Normalizer2 instance for Unicode NFC normalization:
 * the {@code comp} member of {@code Norm2AllModes.getNFCInstance()}.
 * Corresponds to ICU4J's getInstance(null, "nfc", Mode.COMPOSE), which this
 * class does not declare.
 * Returns an unmodifiable singleton instance.
 * @return the requested Normalizer2, if successful
 * @stable ICU 49
 */
public static Normalizer2 getNFCInstance() {
    Norm2AllModes nfcModes = Norm2AllModes.getNFCInstance();
    return nfcModes.comp;
}
/**
 * Returns a Normalizer2 instance for Unicode NFD normalization:
 * the {@code decomp} member of {@code Norm2AllModes.getNFCInstance()}
 * (NFD uses the NFC data set).
 * Corresponds to ICU4J's getInstance(null, "nfc", Mode.DECOMPOSE), which
 * this class does not declare.
 * Returns an unmodifiable singleton instance.
 * @return the requested Normalizer2, if successful
 * @stable ICU 49
 */
public static Normalizer2 getNFDInstance() {
    Norm2AllModes nfcModes = Norm2AllModes.getNFCInstance();
    return nfcModes.decomp;
}
/**
 * Returns a Normalizer2 instance for Unicode NFKC normalization:
 * the {@code comp} member of {@code Norm2AllModes.getNFKCInstance()}.
 * Corresponds to ICU4J's getInstance(null, "nfkc", Mode.COMPOSE), which
 * this class does not declare.
 * Returns an unmodifiable singleton instance.
 * @return the requested Normalizer2, if successful
 * @stable ICU 49
 */
public static Normalizer2 getNFKCInstance() {
    Norm2AllModes nfkcModes = Norm2AllModes.getNFKCInstance();
    return nfkcModes.comp;
}
/**
 * Returns a Normalizer2 instance for Unicode NFKD normalization:
 * the {@code decomp} member of {@code Norm2AllModes.getNFKCInstance()}
 * (NFKD uses the NFKC data set).
 * Corresponds to ICU4J's getInstance(null, "nfkc", Mode.DECOMPOSE), which
 * this class does not declare.
 * Returns an unmodifiable singleton instance.
 * @return the requested Normalizer2, if successful
 * @stable ICU 49
 */
public static Normalizer2 getNFKDInstance() {
    Norm2AllModes nfkcModes = Norm2AllModes.getNFKCInstance();
    return nfkcModes.decomp;
}
/**
 * Returns the normalized form of the source string.
 * <p>
 * When {@code src} is a {@code String}, {@link #spanQuickCheckYes} is
 * consulted first: if the whole string passes, {@code src} itself is
 * returned; if a non-empty prefix passes, that prefix is copied and only the
 * remainder goes through {@link #normalizeSecondAndAppend}. In every other
 * case the whole input is normalized into a new StringBuilder.
 * @param src source string
 * @return normalized src
 * @stable ICU 4.4
 */
public String normalize(CharSequence src) {
    if (!(src instanceof String)) {
        return normalize(src, new StringBuilder(src.length())).toString();
    }
    // Fastpath: Do not construct a new String if the src is a String
    // and is already normalized.
    int yesEnd = spanQuickCheckYes(src);
    int srcLength = src.length();
    if (yesEnd == srcLength) {
        return (String) src;
    }
    if (yesEnd == 0) {
        // No usable prefix: normalize everything from scratch.
        return normalize(src, new StringBuilder(srcLength)).toString();
    }
    StringBuilder prefix = new StringBuilder(srcLength).append(src, 0, yesEnd);
    CharSequence remainder = src.subSequence(yesEnd, srcLength);
    return normalizeSecondAndAppend(prefix, remainder).toString();
}
/**
* Writes the normalized form of the source string to the destination string
* (replacing its contents) and returns the destination string.
* The source and destination strings must be different objects.
* <p>
* {@link #normalize(CharSequence)} calls this method with a newly created
* StringBuilder.
* @param src source string
* @param dest destination string; its contents are replaced with normalized src
* @return dest
* @stable ICU 4.4
*/
public abstract StringBuilder normalize(CharSequence src, StringBuilder dest);
/**
* Writes the normalized form of the source string to the destination Appendable
* and returns the destination Appendable.
* The source and destination strings must be different objects.
*
* <p>This method declares no checked exceptions, so an
* {@link java.io.IOException} raised by {@code dest} cannot propagate as such.
* The ICU4J documentation states that it is wrapped into a
* {@code com.ibm.icu.util.ICUUncheckedIOException}.
* NOTE(review): that type is not referenced anywhere in the visible sources;
* confirm which unchecked exception the implementations actually throw.
*
* @param src source string
* @param dest destination Appendable; gets normalized src appended
* @return dest
* @stable ICU 4.6
*/
public abstract Appendable normalize(CharSequence src, Appendable dest);
/**
* Appends the normalized form of the second string to the first string
* (merging them at the boundary) and returns the first string.
* The result is normalized if the first string was normalized.
* The first and second strings must be different objects.
* <p>
* {@link #normalize(CharSequence)} uses this method to append the remainder
* of its input after the prefix accepted by {@link #spanQuickCheckYes}.
* @param first string, should be normalized
* @param second string, will be normalized
* @return first
* @stable ICU 4.4
*/
public abstract StringBuilder normalizeSecondAndAppend(
StringBuilder first, CharSequence second);
/**
* Appends the second string to the first string
* (merging them at the boundary) and returns the first string.
* The result is normalized if both the strings were normalized.
* The first and second strings must be different objects.
* @param first string, should be normalized; it receives the appended text
* @param second string, should be normalized
* @return {@code first}
* @stable ICU 4.4
*/
public abstract StringBuilder append(StringBuilder first, CharSequence second);
/**
* Gets the decomposition mapping of {@code c}.
* Roughly equivalent to normalizing the String form of {@code c}
* on a DECOMPOSE Normalizer2 instance, but much faster, and except that this function
* returns {@code null} if {@code c} does not have a decomposition mapping in this
* instance's data.
* This function is independent of the mode of the Normalizer2.
* @param c code point
* @return c's decomposition mapping, if any; otherwise {@code null}
* @stable ICU 4.6
*/
public abstract String getDecomposition(int c);
/**
* Gets the combining class of {@code c}.
* The default implementation returns 0 for every input
* but all standard implementations return the Unicode Canonical_Combining_Class value.
* @param c code point
* @return c's combining class
* @stable ICU 49
*/
public int getCombiningClass(int c) { return 0; }
/**
* Tests if the string is normalized.
* Internally, in cases where the quickCheck() method would return "maybe"
* (which is only possible for the two COMPOSE modes) this method
* resolves to "yes" or "no" to provide a definitive result,
* at the cost of doing more work in those cases.
* (quickCheck() belongs to the ICU4J API; this class does not declare it.)
* @param s input string
* @return {@code true} if s is normalized
* @stable ICU 4.4
*/
public abstract boolean isNormalized(CharSequence s);
/**
* Returns the end of the normalized substring of the input string.
* In other words, with {@code end=spanQuickCheckYes(s);}
* the substring {@code s.subSequence(0, end)}
* will pass the quick check with a "yes" result.
* <p>
* The returned end index is usually one or more characters before the
* "no" or "maybe" character: The end index is at a normalization boundary.
* (See the class documentation for more about normalization boundaries.)
* <p>
* When the goal is a normalized string and most input strings are expected
* to be normalized already, then call this method,
* and if it returns a prefix shorter than the input string,
* copy that prefix and use normalizeSecondAndAppend() for the remainder.
* {@link #normalize(CharSequence)} follows exactly this pattern for
* {@code String} inputs.
* @param s input string
* @return "yes" span end index
* @stable ICU 4.4
*/
public abstract int spanQuickCheckYes(CharSequence s);
/**
* Tests if the character always has a normalization boundary before it,
* regardless of context.
* If {@code true}, then the character does not normalization-interact with
* preceding characters.
* In other words, a string containing this character can be normalized
* by processing portions before this character and starting from this
* character independently.
* This is used for iterative normalization. See the class documentation for details.
* @param c character to test, passed as an {@code int}
* @return {@code true} if c has a normalization boundary before it
* @stable ICU 4.4
*/
public abstract boolean hasBoundaryBefore(int c);
/**
* Sole constructor. (For invocation by subclass constructors,
* typically implicit.)
* This class declares no fields, so there is nothing to initialize.
* @internal
* deprecated This API is ICU internal only.
*/
protected Normalizer2() {
}
}

View file

@ -1,782 +0,0 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 2000-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
import java.text.CharacterIterator;
import java.text.Normalizer;
/**
* Unicode Normalization
*
* <h2>Unicode normalization API</h2>
*
* <code>normalize</code> transforms Unicode text into an equivalent composed or
* decomposed form, allowing for easier sorting and searching of text.
* <code>normalize</code> supports the standard normalization forms described in
* <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
* Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
*
* Characters with accents or other adornments can be encoded in
* several different ways in Unicode. For example, take the character A-acute.
* In Unicode, this can be encoded as a single character (the
* "composed" form):
*
* <pre>
* 00C1 LATIN CAPITAL LETTER A WITH ACUTE
* </pre>
*
* or as two separate characters (the "decomposed" form):
*
* <pre>
* 0041 LATIN CAPITAL LETTER A
* 0301 COMBINING ACUTE ACCENT
* </pre>
*
* To a user of your program, however, both of these sequences should be
* treated as the same "user-level" character "A with acute accent". When you
* are searching or comparing text, you must ensure that these two sequences are
* treated equivalently. In addition, you must handle characters with more than
* one accent. Sometimes the order of a character's combining accents is
* significant, while in other cases accent sequences in different orders are
* really equivalent.
*
* Similarly, the string "ffi" can be encoded as three separate letters:
*
* <pre>
* 0066 LATIN SMALL LETTER F
* 0066 LATIN SMALL LETTER F
* 0069 LATIN SMALL LETTER I
* </pre>
*
* or as the single character
*
* <pre>
* FB03 LATIN SMALL LIGATURE FFI
* </pre>
*
* The ffi ligature is not a distinct semantic character, and strictly speaking
* it shouldn't be in Unicode at all, but it was included for compatibility
* with existing character sets that already provided it. The Unicode standard
* identifies such characters by giving them "compatibility" decompositions
* into the corresponding semantic characters. When sorting and searching, you
* will often want to use these mappings.
*
* <code>normalize</code> helps solve these problems by transforming text into
* the canonical composed and decomposed forms as shown in the first example
* above. In addition, you can have it perform compatibility decompositions so
* that you can treat compatibility characters the same as their equivalents.
* Finally, <code>normalize</code> rearranges accents into the proper canonical
* order, so that you do not have to worry about accent rearrangement on your
* own.
*
* Form FCD, "Fast C or D", is also designed for collation.
* It allows working with strings that are not necessarily normalized
* with an algorithm (like in collation) that works under "canonical closure",
* i.e., it treats precomposed characters and their decomposed equivalents the
* same.
*
* It is not a normalization form because it does not provide for uniqueness of
* representation. Multiple strings may be canonically equivalent (their NFDs
* are identical) and may all conform to FCD without being identical themselves.
*
* The form is defined such that the "raw decomposition", the recursive
* canonical decomposition of each character, results in a string that is
* canonically ordered. This means that precomposed characters are allowed for
* as long as their decompositions do not need canonical reordering.
*
* Its advantage for a process like collation is that all NFD and most NFC texts
* - and many unnormalized texts - already conform to FCD and do not need to be
* normalized (NFD) for such a process. The FCD quick check will return YES for
* most strings in practice.
*
* normalize(FCD) may be implemented with NFD.
*
* For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications):
* http://www.unicode.org/notes/tn5/#FCD
*
* ICU collation performs either NFD or FCD normalization automatically if
* normalization is turned on for the collator object. Beyond collation and
* string search, normalized strings may be useful for string equivalence
* comparisons, transliteration/transcription, unique representations, etc.
*
* The W3C generally recommends to exchange texts in NFC.
* Note also that most legacy character encodings use only precomposed forms and
* often do not encode any combining marks by themselves. For conversion to such
* character encodings the Unicode text needs to be normalized to NFC.
* For more usage examples, see the Unicode Standard Annex.
*
* Note: The Normalizer class also provides API for iterative normalization.
* While the setIndex() and getIndex() refer to indices in the
* underlying Unicode input text, the next() and previous() methods
* iterate through characters in the normalized output.
* This means that there is not necessarily a one-to-one correspondence
* between characters returned by next() and previous() and the indices
* passed to and returned from setIndex() and getIndex().
* It is for this reason that Normalizer does not implement the CharacterIterator interface.
*
* @stable ICU 2.8
*/
// Original filename in ICU4J: Normalizer.java
public final class NormalizerBase implements Cloneable {
// The input text and our position in it
private UCharacterIterator text;
// Normalizer obtained from mode.getNormalizer2(options); re-fetched by setMode().
private Normalizer2 norm2;
// Current mode and the option bits handed to mode.getNormalizer2().
private Mode mode;
private int options;
// The normalization buffer is the result of normalization
// of the source in [currentIndex..nextIndex] .
private int currentIndex;
private int nextIndex;
// A buffer for holding intermediate results
private StringBuilder buffer;
// Read position inside buffer, counted in chars: next() advances it by
// Character.charCount(c) and previous() moves it back by the same amount.
private int bufferPos;
// Helper classes to defer loading of normalization data.
/** Immutable wrapper around a single {@link Normalizer2}. */
private static final class ModeImpl {
    private final Normalizer2 normalizer2;

    private ModeImpl(Normalizer2 n2) {
        this.normalizer2 = n2;
    }
}
// Holder for the NFD ModeImpl; Normalizer2.getNFDInstance() is called when
// this class is initialized, i.e. on first access to INSTANCE.
private static final class NFDModeImpl {
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance());
}
// Holder for the NFKD ModeImpl; Normalizer2.getNFKDInstance() is called when
// this class is initialized, i.e. on first access to INSTANCE.
private static final class NFKDModeImpl {
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance());
}
// Holder for the NFC ModeImpl; Normalizer2.getNFCInstance() is called when
// this class is initialized, i.e. on first access to INSTANCE.
private static final class NFCModeImpl {
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance());
}
// Holder for the NFKC ModeImpl; Normalizer2.getNFKCInstance() is called when
// this class is initialized, i.e. on first access to INSTANCE.
private static final class NFKCModeImpl {
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance());
}
// Holder for the frozen UnicodeSet built from the pattern "[:age=3.2:]".
// The *32ModeImpl holders pass it to FilteredNormalizer2 as the filter set.
private static final class Unicode32 {
private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze();
}
// Holder for the Unicode 3.2 variant of NFD: the NFD normalizer wrapped in a
// FilteredNormalizer2 together with Unicode32.INSTANCE.
private static final class NFD32ModeImpl {
private static final ModeImpl INSTANCE =
new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(),
Unicode32.INSTANCE));
}
// Holder for the Unicode 3.2 variant of NFKD: the NFKD normalizer wrapped in a
// FilteredNormalizer2 together with Unicode32.INSTANCE.
private static final class NFKD32ModeImpl {
private static final ModeImpl INSTANCE =
new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(),
Unicode32.INSTANCE));
}
// Holder for the Unicode 3.2 variant of NFC: the NFC normalizer wrapped in a
// FilteredNormalizer2 together with Unicode32.INSTANCE.
private static final class NFC32ModeImpl {
private static final ModeImpl INSTANCE =
new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(),
Unicode32.INSTANCE));
}
// Holder for the Unicode 3.2 variant of NFKC: the NFKC normalizer wrapped in a
// FilteredNormalizer2 together with Unicode32.INSTANCE.
private static final class NFKC32ModeImpl {
private static final ModeImpl INSTANCE =
new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(),
Unicode32.INSTANCE));
}
/**
* Options bit set value to select Unicode 3.2 normalization
* (except NormalizationCorrections).
* At most one Unicode version can be selected at a time.
* The built-in modes test this bit in {@code getNormalizer2(int)} to choose
* between the plain and the Unicode 3.2 filtered normalizer.
* @stable ICU 2.6
*/
public static final int UNICODE_3_2=0x20;
/** Alias for {@link #UNICODE_3_2}; holds the same value. */
public static final int UNICODE_3_2_0_ORIGINAL=UNICODE_3_2;
/**
* Default option for the latest Unicode normalization. This option is
* provided mainly for testing.
* The value zero means that normalization is done with the fixes for
* - Corrigendum 4 (Five CJK Canonical Mapping Errors)
* - Corrigendum 5 (Normalization Idempotency)
* The normalize and isNormalized overloads that take only a
* {@code Normalizer.Form} pass this value as their options.
*/
public static final int UNICODE_LATEST = 0x00;
/**
* Constant indicating that the end of the iteration has been reached.
* This is guaranteed to have the same value as {@link UCharacterIterator#DONE}.
* Returned by {@link #current}, {@link #next} and {@link #previous} when no
* character is available.
* @stable ICU 2.8
*/
public static final int DONE = UCharacterIterator.DONE;
/**
* Constants for normalization modes.
* <p>
* The Mode class is not intended for public subclassing.
* Only the Mode constants provided by the Normalizer class should be used,
* and any fields or methods should not be called or overridden by users.
* @stable ICU 2.8
*/
public abstract static class Mode {
/**
* Sole constructor
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
protected Mode() {
}
/**
* Returns the Normalizer2 that implements this mode for the given options.
* @param options option bits; the built-in modes test {@link #UNICODE_3_2}
* @return the Normalizer2 to use
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
protected abstract Normalizer2 getNormalizer2(int options);
}
/**
 * Maps a {@link Normalizer.Form} onto the Mode constant of the same name.
 *
 * @param form the public normalization form
 * @return the matching Mode
 * @throws IllegalArgumentException if {@code form} matches none of the cases
 */
private static Mode toMode(Normalizer.Form form) {
    final Mode mode;
    switch (form) {
        case NFC:
            mode = NFC;
            break;
        case NFD:
            mode = NFD;
            break;
        case NFKC:
            mode = NFKC;
            break;
        case NFKD:
            mode = NFKD;
            break;
        default:
            throw new IllegalArgumentException("Unexpected normalization form: " +
                                               form);
    }
    return mode;
}
private static final class NONEMode extends Mode {
protected Normalizer2 getNormalizer2(int options) { return Norm2AllModes.NOOP_NORMALIZER2; }
}
private static final class NFDMode extends Mode {
protected Normalizer2 getNormalizer2(int options) {
return (options&UNICODE_3_2) != 0 ?
NFD32ModeImpl.INSTANCE.normalizer2 :
NFDModeImpl.INSTANCE.normalizer2;
}
}
private static final class NFKDMode extends Mode {
protected Normalizer2 getNormalizer2(int options) {
return (options&UNICODE_3_2) != 0 ?
NFKD32ModeImpl.INSTANCE.normalizer2 :
NFKDModeImpl.INSTANCE.normalizer2;
}
}
private static final class NFCMode extends Mode {
protected Normalizer2 getNormalizer2(int options) {
return (options&UNICODE_3_2) != 0 ?
NFC32ModeImpl.INSTANCE.normalizer2 :
NFCModeImpl.INSTANCE.normalizer2;
}
}
private static final class NFKCMode extends Mode {
protected Normalizer2 getNormalizer2(int options) {
return (options&UNICODE_3_2) != 0 ?
NFKC32ModeImpl.INSTANCE.normalizer2 :
NFKCModeImpl.INSTANCE.normalizer2;
}
}
/**
* No decomposition/composition.
* @stable ICU 2.8
*/
public static final Mode NONE = new NONEMode();
/**
* Canonical decomposition.
* @stable ICU 2.8
*/
public static final Mode NFD = new NFDMode();
/**
* Compatibility decomposition.
* @stable ICU 2.8
*/
public static final Mode NFKD = new NFKDMode();
/**
* Canonical decomposition followed by canonical composition.
* @stable ICU 2.8
*/
public static final Mode NFC = new NFCMode();
/**
* Compatibility decomposition followed by canonical composition.
*/
public static final Mode NFKC =new NFKCMode();
//-------------------------------------------------------------------------
// Iterator constructors
//-------------------------------------------------------------------------
/**
 * Creates a new {@code NormalizerBase} object for iterating over the
 * normalized form of a given string.
 * <p>
 * The {@code opt} parameter specifies which optional
 * {@code NormalizerBase} features are to be enabled for this object.
 *
 * @param str  The string to be normalized. The normalization
 *             will start at the beginning of the string.
 *
 * @param mode The normalization mode.
 *
 * @param opt  Any optional features to be enabled.
 *             Currently the only available option is {@link #UNICODE_3_2}.
 *             If you want the default behavior corresponding to one of the
 *             standard Unicode Normalization Forms, use 0 for this argument.
 * @stable ICU 2.6
 */
public NormalizerBase(String str, Mode mode, int opt) {
    this.text = UCharacterIterator.getInstance(str);
    this.norm2 = mode.getNormalizer2(opt);
    this.mode = mode;
    this.options = opt;
    this.buffer = new StringBuilder();
}
/**
 * Creates a {@code NormalizerBase} over {@code str} with no options set;
 * same as {@code NormalizerBase(str, mode, 0)}.
 *
 * @param str  the string to be normalized
 * @param mode the normalization mode
 */
public NormalizerBase(String str, Mode mode) {
    this(str, mode, UNICODE_LATEST);
}
/**
 * Creates a new {@code NormalizerBase} object for iterating over the
 * normalized form of the given text. The iterator is cloned; the caller's
 * instance is not retained.
 *
 * @param iter The input text to be normalized. The normalization
 *             will start at the beginning of the string.
 *
 * @param mode The normalization mode.
 *
 * @param opt  Any optional features to be enabled.
 *             Currently the only available option is {@link #UNICODE_3_2}.
 *             If you want the default behavior corresponding to one of the
 *             standard Unicode Normalization Forms, use 0 for this argument.
 * @stable ICU 2.6
 */
public NormalizerBase(CharacterIterator iter, Mode mode, int opt) {
    CharacterIterator privateCopy = (CharacterIterator) iter.clone();
    this.text = UCharacterIterator.getInstance(privateCopy);
    this.norm2 = mode.getNormalizer2(opt);
    this.mode = mode;
    this.options = opt;
    this.buffer = new StringBuilder();
}
/**
 * Creates a {@code NormalizerBase} over {@code iter} with no options set;
 * same as {@code NormalizerBase(iter, mode, 0)}.
 *
 * @param iter the input text to be normalized
 * @param mode the normalization mode
 */
public NormalizerBase(CharacterIterator iter, Mode mode) {
    this(iter, mode, UNICODE_LATEST);
}
/**
 * Clones this {@code NormalizerBase} object. All properties of this
 * object are duplicated in the new object, including the cloning of any
 * {@link CharacterIterator} that was passed in to the constructor
 * or to {@link #setText(CharacterIterator) setText}.
 * However, the text storage underlying
 * the {@code CharacterIterator} is not duplicated unless the
 * iterator's {@code clone} method does so.
 * @return an independent copy of this object
 * @stable ICU 2.8
 */
@Override
public Object clone() {
    try {
        // Object.clone() has already copied mode, options, norm2, bufferPos,
        // currentIndex and nextIndex field by field; only the two mutable
        // members need copies of their own.
        NormalizerBase duplicate = (NormalizerBase) super.clone();
        duplicate.text = (UCharacterIterator) text.clone();
        duplicate.buffer = new StringBuilder(buffer);
        return duplicate;
    } catch (CloneNotSupportedException e) {
        throw new InternalError(e.toString(), e);
    }
}
/**
 * Normalizes a {@code String} using the given normalization operation.
 * <p>
 * The {@code options} parameter specifies which optional
 * {@code NormalizerBase} features are to be enabled for this operation.
 * Currently the only available option is {@link #UNICODE_3_2}.
 * If you want the default behavior corresponding to one of the standard
 * Unicode Normalization Forms, use 0 for this argument.
 *
 * @param str     the input string to be normalized.
 * @param mode    the normalization mode
 * @param options the optional features to be enabled.
 * @return the normalized string
 * @stable ICU 2.6
 */
public static String normalize(String str, Mode mode, int options) {
    Normalizer2 normalizer = mode.getNormalizer2(options);
    return normalizer.normalize(str);
}
/**
 * Normalizes {@code str} into the given form using {@link #UNICODE_LATEST}
 * as the options.
 *
 * @param str  the input string to be normalized
 * @param form the normalization form
 * @return the normalized string
 */
public static String normalize(String str, Normalizer.Form form) {
    Mode mode = toMode(form);
    return NormalizerBase.normalize(str, mode, UNICODE_LATEST);
}
/**
 * Normalizes {@code str} into the given form with the given options.
 *
 * @param str     the input string to be normalized
 * @param form    the normalization form
 * @param options option bits such as {@link #UNICODE_3_2}
 * @return the normalized string
 */
public static String normalize(String str, Normalizer.Form form, int options) {
    Mode mode = toMode(form);
    return NormalizerBase.normalize(str, mode, options);
}
/**
 * Test if a string is in a given normalization form.
 * This is semantically equivalent to source.equals(normalize(source, mode)).
 *
 * Unlike quickCheck(), this function returns a definitive result,
 * never a "maybe".
 * For NFD, NFKD, and FCD, both functions work exactly the same.
 * For NFC and NFKC where quickCheck may return "maybe", this function will
 * perform further tests to arrive at a true/false result.
 * @param str     the input string to be checked to see if it is
 *                normalized
 * @param mode    the normalization mode
 * @param options Options for use with exclusion set and tailored Normalization
 *                The only option that is currently recognized is UNICODE_3_2
 * @return the result of {@code isNormalized(str)} on the Normalizer2 that
 *         {@code mode} selects for {@code options}
 * @stable ICU 2.6
 */
public static boolean isNormalized(String str, Mode mode, int options) {
    Normalizer2 normalizer = mode.getNormalizer2(options);
    return normalizer.isNormalized(str);
}
/**
 * Tests whether {@code str} is in the given form, using
 * {@link #UNICODE_LATEST} as the options.
 *
 * @param str  the input string to be checked
 * @param form the normalization form
 * @return {@code true} if {@code str} is normalized
 */
public static boolean isNormalized(String str, Normalizer.Form form) {
    Mode mode = toMode(form);
    return NormalizerBase.isNormalized(str, mode, UNICODE_LATEST);
}
/**
 * Tests whether {@code str} is in the given form with the given options.
 *
 * @param str     the input string to be checked
 * @param form    the normalization form
 * @param options option bits such as {@link #UNICODE_3_2}
 * @return {@code true} if {@code str} is normalized
 */
public static boolean isNormalized(String str, Normalizer.Form form, int options) {
    Mode mode = toMode(form);
    return NormalizerBase.isNormalized(str, mode, options);
}
//-------------------------------------------------------------------------
// Iteration API
//-------------------------------------------------------------------------
/**
 * Return the current character in the normalized text without moving
 * the iteration position.
 * @return The codepoint as an int, or {@link #DONE} if the buffer is
 *         exhausted and {@code nextNormalize()} returns false
 * @stable ICU 2.8
 */
public int current() {
    // Only try to refill the buffer once it has been fully consumed.
    if (bufferPos >= buffer.length() && !nextNormalize()) {
        return DONE;
    }
    return buffer.codePointAt(bufferPos);
}
/**
 * Return the next character in the normalized text and advance
 * the iteration position by one. If the end
 * of the text has already been reached, {@link #DONE} is returned.
 * @return The codepoint as an int
 * @stable ICU 2.8
 */
public int next() {
    // Only try to refill the buffer once it has been fully consumed.
    if (bufferPos >= buffer.length() && !nextNormalize()) {
        return DONE;
    }
    int codePoint = buffer.codePointAt(bufferPos);
    bufferPos += Character.charCount(codePoint);
    return codePoint;
}
/**
 * Return the previous character in the normalized text and decrement
 * the iteration position by one. If the beginning
 * of the text has already been reached, {@link #DONE} is returned.
 * @return The codepoint as an int
 * @stable ICU 2.8
 */
public int previous() {
    // Only normalize the preceding segment when the buffer start is reached.
    if (bufferPos <= 0 && !previousNormalize()) {
        return DONE;
    }
    int codePoint = buffer.codePointBefore(bufferPos);
    bufferPos -= Character.charCount(codePoint);
    return codePoint;
}
/**
 * Reset the index to the beginning of the text.
 * This has the same effect as {@code setIndexOnly(0)}.
 * @stable ICU 2.8
 */
public void reset() {
    text.setIndex(0);
    nextIndex = 0;
    currentIndex = 0;
    clearBuffer();
}
/**
 * Set the iteration position in the input text that is being normalized,
 * without any immediate normalization.
 * After setIndexOnly(), getIndex() will return the same index that is
 * specified here.
 *
 * @param index the desired index in the input text.
 * @stable ICU 2.8
 */
public void setIndexOnly(int index) {
    text.setIndex(index); // validates index
    nextIndex = index;
    currentIndex = index;
    clearBuffer();
}
/**
 * Set the iteration position in the input text that is being normalized
 * and return the first normalized character at that position.
 * <p>
 * <b>Note:</b> This method sets the position in the <em>input</em> text,
 * while {@link #next} and {@link #previous} iterate through characters
 * in the normalized <em>output</em>. This means that there is not
 * necessarily a one-to-one correspondence between characters returned
 * by {@code next} and {@code previous} and the indices passed to and
 * returned from {@code setIndex} and {@link #getIndex}.
 *
 * @param index the desired index in the input text.
 *
 * @return the first normalized character that is the result of iterating
 *         forward starting at the given index.
 *
 * @throws IllegalArgumentException if the given index is less than
 *         {@link #getBeginIndex} or greater than {@link #getEndIndex}.
 * deprecated ICU 3.2
 * @obsolete ICU 3.2
 */
public int setIndex(int index) {
    setIndexOnly(index);
    int firstNormalized = current();
    return firstNormalized;
}
/**
* Retrieve the index of the start of the input text. This is the begin
* index of the {@code CharacterIterator} or the start (i.e. 0) of the
* {@code String} over which this {@code NormalizerBase} is iterating
* @deprecated ICU 2.2. Use startIndex() instead.
* @return the start index; this implementation always returns 0
* @see #startIndex
*/
@Deprecated
public int getBeginIndex() {
return 0;
}
/**
* Retrieve the index of the end of the input text. This is the end index
* of the {@code CharacterIterator} or the length of the {@code String}
* over which this {@code NormalizerBase} is iterating
* @deprecated ICU 2.2. Use endIndex() instead.
* @return the end index, as returned by {@link #endIndex()}
* @see #endIndex
*/
@Deprecated
public int getEndIndex() {
return endIndex();
}
/**
 * Retrieve the current iteration position in the input text that is
 * being normalized. This method is useful in applications such as
 * searching, where you need to be able to determine the position in
 * the input text that corresponds to a given normalized output character.
 * <p>
 * <b>Note:</b> This method reports a position in the <em>input</em>, while
 * {@link #next} and {@link #previous} iterate through characters in the
 * <em>output</em>. This means that there is not necessarily a one-to-one
 * correspondence between characters returned by {@code next} and
 * {@code previous} and the indices passed to and returned from
 * {@code setIndex} and {@link #getIndex}.
 * @return {@code currentIndex} while unread characters remain in the
 *         buffer, otherwise {@code nextIndex}
 * @stable ICU 2.8
 */
public int getIndex() {
    return (bufferPos < buffer.length()) ? currentIndex : nextIndex;
}
/**
* Retrieve the index of the end of the input text. This is the end index
* of the {@code CharacterIterator} or the length of the {@code String}
* over which this {@code NormalizerBase} is iterating
* @return the length reported by the underlying text iterator
* @stable ICU 2.8
*/
public int endIndex() {
return text.getLength();
}
//-------------------------------------------------------------------------
// Iterator attributes
//-------------------------------------------------------------------------
/**
* Set the normalization mode for this object.
* <p>
* <b>Note:</b> If the normalization mode is changed while iterating
* over a string, calls to {@link #next} and {@link #previous} may
* return previously buffered characters in the old normalization mode
* until the iteration is able to re-sync at the next base character.
* It is safest to call {@link #setText setText()}, {@link #first},
* {@link #last}, etc. after calling {@code setMode}.
*
* @param newMode the new mode for this {@code NormalizerBase}.
* The supported modes are:
* <ul>
* <li>{@link #NFC} - Unicode canonical decomposition
* followed by canonical composition.
* <li>{@link #NFKC} - Unicode compatibility decomposition
* followed by canonical composition.
* <li>{@link #NFD} - Unicode canonical decomposition
* <li>{@link #NFKD} - Unicode compatibility decomposition.
* <li>{@link #NONE} - Do nothing but return characters
* from the underlying input text.
* </ul>
*
* @see #getMode
* @stable ICU 2.8
*/
public void setMode(Mode newMode) {
mode = newMode;
// Re-derive the normalizer implementation from the new mode and the current options.
norm2 = mode.getNormalizer2(options);
}
/**
* Return the basic operation performed by this {@code NormalizerBase}.
*
* @return the mode most recently stored in this object's {@code mode} field
* @see #setMode
* @stable ICU 2.8
*/
public Mode getMode() {
return mode;
}
/**
 * Replace the input text over which this {@code NormalizerBase} iterates
 * with the given string, then call {@code reset()} so that iteration
 * starts over on the new text.
 * @param newText The new string to be normalized.
 * @throws IllegalStateException if no {@code UCharacterIterator} could be
 *         obtained for {@code newText}
 * @stable ICU 2.8
 */
public void setText(String newText) {
    final UCharacterIterator replacement = UCharacterIterator.getInstance(newText);
    if (replacement == null) {
        throw new IllegalStateException("Could not create a new UCharacterIterator");
    }
    // Only swap the text in once a usable iterator is available.
    text = replacement;
    reset();
}
/**
 * Replace the input text over which this {@code NormalizerBase} iterates
 * with the given {@code CharacterIterator}. Both iteration indices are
 * set back to 0 and the normalization buffer is emptied.
 * @param newText The new text to be normalized.
 * @throws IllegalStateException if no {@code UCharacterIterator} could be
 *         obtained for {@code newText}
 * @stable ICU 2.8
 */
public void setText(CharacterIterator newText) {
    final UCharacterIterator replacement = UCharacterIterator.getInstance(newText);
    if (replacement == null) {
        throw new IllegalStateException("Could not create a new UCharacterIterator");
    }
    text = replacement;
    // Restart iteration bookkeeping for the new text.
    nextIndex = 0;
    currentIndex = 0;
    clearBuffer();
}
/**
* Empties the normalization output buffer and moves the buffer read
* position back to its start.
*/
private void clearBuffer() {
buffer.setLength(0);
bufferPos=0;
}
/**
 * Normalizes the next segment of the input text into the buffer.
 * Starting at {@code nextIndex}, code points are collected until the end
 * of the text or until a code point is reached that the current
 * normalizer reports as having a boundary before it; that code point is
 * left unread. {@code currentIndex} is moved to the old {@code nextIndex}
 * and {@code nextIndex} to the position after the collected segment.
 *
 * @return {@code false} if no code point could be read or the normalized
 *         segment is empty, {@code true} otherwise
 */
private boolean nextNormalize() {
    clearBuffer();
    currentIndex = nextIndex;
    text.setIndex(nextIndex);

    // Always consume at least one code point so that iteration advances.
    int cp = text.nextCodePoint();
    if (cp < 0) {
        return false;
    }
    final StringBuilder segment = new StringBuilder().appendCodePoint(cp);

    for (cp = text.nextCodePoint(); cp >= 0; cp = text.nextCodePoint()) {
        if (norm2.hasBoundaryBefore(cp)) {
            // This code point starts the following segment: step back over it.
            text.moveCodePointIndex(-1);
            break;
        }
        segment.appendCodePoint(cp);
    }

    nextIndex = text.getIndex();
    norm2.normalize(segment, buffer);
    return buffer.length() > 0;
}
/**
 * Normalizes the segment of the input text that ends at
 * {@code currentIndex} into the buffer. Code points are read backwards
 * and prepended to the segment until the start of the text or until a
 * code point is consumed that the current normalizer reports as having a
 * boundary before it. {@code nextIndex} is moved to the old
 * {@code currentIndex}, {@code currentIndex} to the start of the
 * collected segment, and the buffer position to the end of the buffer.
 *
 * @return {@code true} if the normalized segment is non-empty
 */
private boolean previousNormalize() {
    clearBuffer();
    nextIndex = currentIndex;
    text.setIndex(currentIndex);

    final StringBuilder segment = new StringBuilder();
    for (int cp = text.previousCodePoint(); cp >= 0; cp = text.previousCodePoint()) {
        if (cp > 0xffff) {
            // Supplementary code point: prepend its surrogate pair.
            segment.insert(0, Character.toChars(cp));
        } else {
            segment.insert(0, (char) cp);
        }
        if (norm2.hasBoundaryBefore(cp)) {
            break;
        }
    }

    currentIndex = text.getIndex();
    norm2.normalize(segment, buffer);
    // Backward iteration reads the buffer from its end.
    bufferPos = buffer.length();
    return buffer.length() > 0;
}
}

View file

@ -1,50 +0,0 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
/**
 * Minimal mutable holder for a single {@code int}, intended for use as an
 * output parameter. It plays the role of <code>Output&lt;Integer&gt;</code>
 * while avoiding auto-boxing.
 *
 * @internal but could become public
 * deprecated This API is ICU internal only.
 */
class OutputInt {

    /**
     * The held value; read and written directly by callers.
     *
     * @internal
     * deprecated This API is ICU internal only.
     */
    public int value;
}

View file

@ -1,121 +0,0 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
/**
 * <code>Replaceable</code> is an interface representing a string of
 * characters that supports the replacement of a range of itself with a
 * new string of characters. It is used by APIs that change a piece of
 * text while retaining metadata, i.e. data other than the Unicode
 * characters themselves. Style attributes are one example of metadata;
 * an edit history that marks each character with an author and revision
 * number is another.
 *
 * <p>An implicit aspect of the <code>Replaceable</code> API is that
 * during a replace operation, new characters take on the metadata of
 * the old characters. For example, if the string "the <b>bold</b>
 * font" has range (4, 8) replaced with "strong", then it becomes "the
 * <b>strong</b> font".
 *
 * <p>Ranges are given as a start offset and a limit offset and cover the
 * characters at offsets start..limit-1: the start offset is inclusive
 * and the limit offset is exclusive.
 *
 * <p>For a subclass to support metadata, typical behavior of
 * <code>replace()</code> is the following:
 * <ul>
 *     <li>Set the metadata of the new text to the metadata of the first
 *     character replaced</li>
 *     <li>If no characters are replaced, use the metadata of the
 *     previous character</li>
 *     <li>If there is no previous character (i.e. start == 0), use the
 *     following character</li>
 *     <li>If there is no following character (i.e. the replaceable was
 *     empty), use default metadata</li>
 *     <li>If the code point U+FFFF is seen, it should be interpreted as
 *     a special marker having no metadata</li>
 * </ul>
 * If this is not the behavior, the subclass should document any differences.
 *
 * <p>Note that the interface as declared here contains only the read
 * accessors <code>length()</code>, <code>charAt()</code> and
 * <code>getChars()</code>; the <code>replace()</code>,
 * <code>char32At()</code> and <code>extractBetween()</code> operations
 * referred to in this description are not declared in it.
 *
 * <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
 *
 * @author Alan Liu
 * @stable ICU 2.0
 */
interface Replaceable {

    /**
     * Returns the number of 16-bit code units in the text.
     * @return number of 16-bit code units in text
     * @stable ICU 2.0
     */
    int length();

    /**
     * Returns the 16-bit code unit at the given offset into the text.
     * @param offset an integer between 0 and <code>length()</code>-1
     * inclusive
     * @return 16-bit code unit of text at given offset
     * @stable ICU 2.0
     */
    char charAt(int offset);

    /**
     * Copies characters from this object into the destination
     * character array. Copying starts with the character at index
     * <code>srcStart</code> and ends with the character at index
     * <code>srcLimit-1</code>, so <code>srcLimit-srcStart</code>
     * characters are copied in total. They are stored in
     * <code>dst</code> at indices <code>dstStart</code> through
     * <code>dstStart + (srcLimit-srcStart) - 1</code>.
     *
     * @param srcStart the beginning index to copy, inclusive;
     *        {@code 0 <= srcStart <= srcLimit}.
     * @param srcLimit the ending index to copy, exclusive;
     *        {@code srcStart <= srcLimit <= length()}.
     * @param dst the destination array.
     * @param dstStart the start offset in the destination array.
     * @stable ICU 2.0
     */
    void getChars(int srcStart, int srcLimit, char[] dst, int dstStart);
}

View file

@ -1,118 +0,0 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 1996-2009, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package sun.text.normalizer;
/**
* <code>ReplaceableString</code> is an adapter class that implements the
* <code>Replaceable</code> API around an ordinary <code>StringBuffer</code>.
*
* <p><em>Note:</em> This class does not support attributes and is not
* intended for general use. Most clients will need to implement
* {@link Replaceable} in their text representation class.
*
* <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
*
* @see Replaceable
* @author Alan Liu
* @stable ICU 2.0
*/
class ReplaceableString implements Replaceable {
private StringBuffer buf;
/**
* Construct a new object with the given initial contents.
* @param str initial contents
* @stable ICU 2.0
*/
public ReplaceableString(String str) {
buf = new StringBuffer(str);
}
/**
* Construct a new object using <code>buf</code> for internal
* storage. The contents of <code>buf</code> at the time of
* construction are used as the initial contents. <em>Note!
* Modifications to <code>buf</code> will modify this object, and
* vice versa.</em>
* @param buf object to be used as internal storage
* @stable ICU 2.0
*/
public ReplaceableString(StringBuffer buf) {
this.buf = buf;
}
/**
* Return the number of characters contained in this object.
* <code>Replaceable</code> API.
* @stable ICU 2.0
*/
public int length() {
return buf.length();
}
/**
* Return the character at the given position in this object.
* <code>Replaceable</code> API.
* @param offset offset into the contents, from 0 to
* <code>length()</code> - 1
* @stable ICU 2.0
*/
public char charAt(int offset) {
return buf.charAt(offset);
}
/**
* Copies characters from this object into the destination
* character array. The first character to be copied is at index
* <code>srcStart</code>; the last character to be copied is at
* index <code>srcLimit-1</code> (thus the total number of
* characters to be copied is <code>srcLimit-srcStart</code>). The
* characters are copied into the subarray of <code>dst</code>
* starting at index <code>dstStart</code> and ending at index
* <code>dstStart + (srcLimit-srcStart) - 1</code>.
*
* @param srcStart the beginning index to copy, inclusive;
* {@code 0 <= start <= limit}.
* @param srcLimit the ending index to copy, exclusive;
* {@code start <= limit <= length()}.
* @param dst the destination array.
* @param dstStart the start offset in the destination array.
* @stable ICU 2.0
*/
public void getChars(int srcStart, int srcLimit, char dst[], int dstStart) {
if (srcStart != srcLimit) {
buf.getChars(srcStart, srcLimit, dst, dstStart);
}
}
}

View file

@ -1,186 +0,0 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
/**
 * A {@code UCharacterIterator} over the 16-bit code units of a
 * {@code Replaceable}.
 *
 * Open design notes carried over from the original documentation:
 * <ul>
 *     <li>The docs must define the behavior when the Replaceable is
 *     mutated underneath the iterator.</li>
 *     <li>This class and ICUCharacterIterator share some code; maybe they
 *     should share an implementation, or the common state and
 *     implementation should be moved up into UCharacterIterator.</li>
 *     <li>It is unclear why first, last, and getBeginIndex would belong
 *     here.</li>
 * </ul>
 */
class ReplaceableUCharacterIterator extends UCharacterIterator {

    // private data members ----------------------------------------------------

    /**
     * Text being iterated over.
     */
    private Replaceable replaceable;

    /**
     * Current position, as an offset in 16-bit code units.
     */
    private int currentIndex;

    // public constructors -----------------------------------------------------

    /**
     * Creates an iterator positioned at index 0 of the given string.
     * @param str text which the iterator will be based on
     * @exception IllegalArgumentException if {@code str} is null
     */
    public ReplaceableUCharacterIterator(String str) {
        if (str == null) {
            throw new IllegalArgumentException();
        }
        replaceable = new ReplaceableString(str);
        currentIndex = 0;
    }

    /**
     * Creates an iterator positioned at index 0 of the given buffer.
     * @param buf buffer of text on which the iterator will be based
     * @exception IllegalArgumentException if {@code buf} is null
     */
    public ReplaceableUCharacterIterator(StringBuffer buf) {
        if (buf == null) {
            throw new IllegalArgumentException();
        }
        replaceable = new ReplaceableString(buf);
        currentIndex = 0;
    }

    // public methods ----------------------------------------------------------

    /**
     * Creates a copy of this iterator via {@code super.clone()}; the
     * underlying <code>Replaceable</code> object is not cloned.
     * @return copy of this iterator, or null if cloning is not supported
     */
    public Object clone() {
        Object copy;
        try {
            copy = super.clone();
        } catch (CloneNotSupportedException e) {
            copy = null; // never invoked
        }
        return copy;
    }

    /**
     * Returns the UTF16 code unit at the current index without moving.
     * @return current UTF16 character, or DONE if the current index is at
     *         or past the end of the text
     */
    public int current() {
        if (currentIndex >= replaceable.length()) {
            return DONE;
        }
        return replaceable.charAt(currentIndex);
    }

    /**
     * Returns the length of the text.
     * @return length of the text
     */
    public int getLength() {
        return replaceable.length();
    }

    /**
     * Gets the current index in the text.
     * @return current index in the text
     */
    public int getIndex() {
        return currentIndex;
    }

    /**
     * Returns the UTF16 code unit at the current index and then advances
     * the index by 1. When the index is already at or past the end of the
     * text, the index is left unchanged and DONE is returned.
     * @return next UTF16 character in text, or DONE at the end of the text
     */
    public int next() {
        if (currentIndex >= replaceable.length()) {
            return DONE;
        }
        // Advance first, then read the unit at the position just left.
        final int readAt = currentIndex++;
        return replaceable.charAt(readAt);
    }

    /**
     * Moves the index back by 1 and returns the UTF16 code unit at the
     * new index. When the index is already 0 (or less), the index is left
     * unchanged and DONE is returned.
     * @return previous UTF16 character in text, or DONE at the start of
     *         the text
     */
    public int previous() {
        if (currentIndex <= 0) {
            return DONE;
        }
        currentIndex -= 1;
        return replaceable.charAt(currentIndex);
    }

    /**
     * Sets the current index to the given position in the text.
     * This assumes the text is stored as 16-bit code units.
     * @param currentIndex the new index within the text; may equal the
     *        text length
     * @exception IllegalArgumentException is thrown if an invalid index is
     *            supplied, i.e. the index is negative or greater than the
     *            text length
     */
    public void setIndex(int currentIndex) {
        final boolean inRange =
            currentIndex >= 0 && currentIndex <= replaceable.length();
        if (!inRange) {
            throw new IllegalArgumentException();
        }
        this.currentIndex = currentIndex;
    }

    /**
     * Copies the whole text into {@code fillIn}, starting at
     * {@code offset}.
     * @param fillIn destination array
     * @param offset index in {@code fillIn} at which the copy starts
     * @return the length of the text
     * @exception IndexOutOfBoundsException if {@code offset} is negative
     *            or the text does not fit into {@code fillIn} at
     *            {@code offset}; the message is the text length
     */
    public int getText(char[] fillIn, int offset) {
        final int length = replaceable.length();
        final boolean fits = offset >= 0 && offset + length <= fillIn.length;
        if (!fits) {
            throw new IndexOutOfBoundsException(Integer.toString(length));
        }
        replaceable.getChars(0, length, fillIn, offset);
        return length;
    }
}

View file

@ -1,364 +0,0 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
******************************************************************************
* Copyright (C) 1996-2014, International Business Machines Corporation and
* others. All Rights Reserved.
******************************************************************************
*/
package sun.text.normalizer;
import java.io.DataInputStream;
import java.io.InputStream;
import java.io.IOException;
/**
* <p>A trie is a kind of compressed, serializable table of values
* associated with Unicode code points (0..0x10ffff).</p>
* <p>This class defines the basic structure of a trie and provides methods
* to <b>retrieve the offsets to the actual data</b>.</p>
* <p>Data will be the form of an array of basic types, char or int.</p>
* <p>The actual data format will have to be specified by the user in the
* inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p>
* <p>This trie implementation is optimized for getting offset while walking
* forward through a UTF-16 string.
* Therefore, the simplest and fastest access macros are the
* fromLead() and fromOffsetTrail() methods.
* The fromBMP() method are a little more complicated; they get offsets even
* for lead surrogate codepoints, while the fromLead() method get special
* "folded" offsets for lead surrogate code units if there is relevant data
* associated with them.
* From such a folded offsets, an offset needs to be extracted to supply
* to the fromOffsetTrail() methods.
* To handle such supplementary codepoints, some offset information are kept
* in the data.</p>
* <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve
* that offset from the folded value for the lead surrogate unit.</p>
* <p>For examples of use, see com.ibm.icu.impl.CharTrie or
* com.ibm.icu.impl.IntTrie.</p>
* @author synwee
* @see com.ibm.icu.impl.CharTrie
* @see com.ibm.icu.impl.IntTrie
* @since release 2.1, Jan 01 2002
*/
public abstract class Trie
{
// public class declaration ----------------------------------------
/**
* Character data in a Trie has different user-specified formats
* for different purposes.
* This interface specifies the method to be implemented so that the
* Trie can extract the surrogate offset information encapsulated within
* the data.
*/
public static interface DataManipulate
{
/**
* Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's
* data
* the index array offset of the indexes for that lead surrogate.
* @param value data value for a surrogate from the trie, including the
* folding offset
* @return data offset or 0 if there is no data for the lead surrogate
*/
public int getFoldingOffset(int value);
}
// default implementation: the data value itself is used as the folding offset
private static class DefaultGetFoldingOffset implements DataManipulate {
public int getFoldingOffset(int value) {
return value;
}
}
// protected constructor -------------------------------------------
/**
* Trie constructor for CharTrie use.
* Reads, in this order, four ints from the stream: the signature, the
* options, the data offset (which is also the index array length) and
* the data length; then calls {@link #unserialize} to read the index.
* @param inputStream ICU data file input stream which contains the
* trie
* @param dataManipulate object containing the information to parse the
* trie data; if null, a default implementation that
* returns the value unchanged is used
* @throws IOException thrown when reading from the input stream fails.
* @throws IllegalArgumentException thrown when the signature or the
* options read from the stream do not pass the header check.
*/
protected Trie(InputStream inputStream,
DataManipulate dataManipulate) throws IOException
{
DataInputStream input = new DataInputStream(inputStream);
// Magic number to authenticate the data.
int signature = input.readInt();
// The options must be read before checkHeader(), which inspects m_options_.
m_options_ = input.readInt();
if (!checkHeader(signature)) {
throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
}
if(dataManipulate != null) {
m_dataManipulate_ = dataManipulate;
} else {
m_dataManipulate_ = new DefaultGetFoldingOffset();
}
m_isLatin1Linear_ = (m_options_ &
HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
m_dataOffset_ = input.readInt();
m_dataLength_ = input.readInt();
// Continues reading from the same underlying stream, right after the header.
unserialize(inputStream);
}
// protected data members ------------------------------------------
/**
* Lead surrogate code points' index displacement in the index array.
* <pre>{@code
* 0x10000-0xd800=0x2800
* 0x2800 >> INDEX_STAGE_1_SHIFT_
* }</pre>
*/
protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
/**
* Shift size for shifting right the input index. 1..9
*/
protected static final int INDEX_STAGE_1_SHIFT_ = 5;
/**
* Shift size for shifting left the index array values.
* Increases possible data size with 16-bit index values at the cost
* of compactability.
* This requires blocks of stage 2 data to be aligned by
* DATA_GRANULARITY.
* 0..INDEX_STAGE_1_SHIFT
*/
protected static final int INDEX_STAGE_2_SHIFT_ = 2;
/**
* Number of data values in a stage 2 (data array) block.
*/
protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;
/**
* Mask for getting the lower bits from the input index.
* DATA_BLOCK_LENGTH - 1.
*/
protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;
/**
* Surrogate mask to use when shifting offset to retrieve supplementary
* values
*/
protected static final int SURROGATE_MASK_ = 0x3FF;
/**
* Index array, stored as UTF16 characters (16-bit values)
*/
protected char m_index_[];
/**
* Internal TrieValue which handles the parsing of the data value.
* This class is to be implemented by the user
*/
protected DataManipulate m_dataManipulate_;
/**
* Start index of the data portion of the trie. CharTrie combines
* index and data into a char array, so this is used to indicate the
* initial offset to the data portion.
* Note this index always points to the initial value.
*/
protected int m_dataOffset_;
/**
* Length of the data array
*/
protected int m_dataLength_;
// protected methods -----------------------------------------------
/**
* Gets the offset to the data which the surrogate pair points to.
* @param lead lead surrogate
* @param trail trailing surrogate
* @return offset to data
*/
protected abstract int getSurrogateOffset(char lead, char trail);
/**
* Gets the offset to the data which the index ch after variable offset
* points to.
* Note for locating a non-supplementary character data offset, calling
* <p>
* getRawOffset(0, ch);
* </p>
* will do. Otherwise if it is a supplementary character formed by
* surrogates lead and trail. Then we would have to call getRawOffset()
* with getFoldingIndexOffset(). See getSurrogateOffset().
* @param offset index offset which ch is to start from
* @param ch index to be used after offset
* @return offset to the data
*/
protected final int getRawOffset(int offset, char ch)
{
// index entry for the block containing ch, scaled up to a data offset,
// plus the position of ch within that block
return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)]
<< INDEX_STAGE_2_SHIFT_)
+ (ch & INDEX_STAGE_3_MASK_);
}
/**
* Gets the offset to data which the BMP character points to
* Treats a lead surrogate as a normal code point.
* @param ch BMP character
* @return offset to data
*/
protected final int getBMPOffset(char ch)
{
return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE
&& ch <= UTF16.LEAD_SURROGATE_MAX_VALUE)
? getRawOffset(LEAD_INDEX_OFFSET_, ch)
: getRawOffset(0, ch);
// using a getRawOffset(ch) makes no diff
}
/**
* Gets the offset to the data which this lead surrogate character points
* to.
* Data at the returned offset may contain folding offset information for
* the next trailing surrogate character.
* @param ch lead surrogate character
* @return offset to data
*/
protected final int getLeadOffset(char ch)
{
return getRawOffset(0, ch);
}
/**
* Internal trie getter from a code point.
* Could be faster(?) but longer with
* {@code if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }}
* Gets the offset to data which the codepoint points to
* @param ch codepoint
* @return offset to data, or -1 if ch is negative or greater than
* UCharacter.MAX_VALUE
*/
protected final int getCodePointOffset(int ch)
{
// if ((ch >> 16) == 0) slower
if (ch < 0) {
return -1;
} else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
// fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works
return getRawOffset(0, (char)ch);
} else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
// BMP codepoint
return getBMPOffset((char)ch);
} else if (ch <= UCharacter.MAX_VALUE) {
// look at the construction of supplementary characters
// trail forms the ends of it.
return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
(char)(ch & SURROGATE_MASK_));
} else {
// return -1 if there is an error; in this case ch is greater than
// UCharacter.MAX_VALUE
return -1;
}
}
/**
* <p>Parses the inputstream and creates the trie index with it.</p>
* <p>This is overridden by the child classes.
* This base implementation reads m_dataOffset_ chars into m_index_.
* @param inputStream input stream containing the trie information
* @exception IOException thrown when data reading fails.
*/
protected void unserialize(InputStream inputStream) throws IOException
{
//indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_
m_index_ = new char[m_dataOffset_];
DataInputStream input = new DataInputStream(inputStream);
for (int i = 0; i < m_dataOffset_; i ++) {
m_index_[i] = input.readChar();
}
}
/**
* Determines if this is a 16 bit trie
* @return true if this is a 16 bit trie
*/
protected final boolean isCharTrie()
{
return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
}
// header constants and private data members -----------------------
/**
* Latin 1 option mask
*/
protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
/**
* Constant number to authenticate the byte block
*/
protected static final int HEADER_SIGNATURE_ = 0x54726965;
/**
* Header option formatting
*/
private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;
protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;
protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;
/**
* Flag indicator for Latin quick access data block.
* Assigned in the constructor; not read anywhere in this class.
*/
private boolean m_isLatin1Linear_;
/**
* <p>Trie options field.</p>
* <p>options bit field, as validated by checkHeader():<br>
* 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br>
* 8 0 = 16-bit data, 1=32-bit data<br>
* 7..4 INDEX_STAGE_2_SHIFT // 0..INDEX_STAGE_1_SHIFT<br>
* 3..0 INDEX_STAGE_1_SHIFT // 1..9<br>
*/
private int m_options_;
// private methods ---------------------------------------------------
/**
* Authenticates raw data header.
* Checking the header information, signature and options.
* The options are taken from m_options_, which must already have been
* assigned when this method is called.
* @param signature the signature value read from the data header
* @return true if the header is authenticated valid
*/
private final boolean checkHeader(int signature)
{
// check the signature
// Trie in big-endian US-ASCII (0x54726965).
// Magic number to authenticate the data.
if (signature != HEADER_SIGNATURE_) {
return false;
}
// bits 3..0 must equal the stage 1 shift, bits 7..4 the stage 2 shift
if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) !=
INDEX_STAGE_1_SHIFT_ ||
((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) &
HEADER_OPTIONS_SHIFT_MASK_)
!= INDEX_STAGE_2_SHIFT_) {
return false;
}
return true;
}
}

View file

@ -1,655 +0,0 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 2009-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Iterator;
import java.util.NoSuchElementException;
/**
* This is the interface and common implementation of a Unicode Trie2.
* It is a kind of compressed table that maps from Unicode code points (0..0x10ffff)
* to 16- or 32-bit integer values. It works best when there are ranges of
* characters with the same value, which is generally the case with Unicode
* character properties.
*
* This is the second common version of a Unicode trie (hence the name Trie2).
*
*/
abstract class Trie2 implements Iterable<Trie2.Range> {
/**
 * Create a Trie2 from its serialized form. Inverse of utrie2_serialize().
 *
 * Reading starts at the buffer's current position and leaves the position
 * just past the end of the trie. The buffer's byte order is restored to
 * what it was on entry before this method returns or throws.
 *
 * The serialized format is identical between ICU4C and ICU4J, and the
 * serialized data may be in either little or big endian byte order: a
 * byte-swapped signature makes this method read the rest of the trie in
 * the opposite byte order.
 *
 * This implementation only builds Trie2_16 instances; data whose header
 * options have any of the value-bits set is rejected.
 *
 * @param bytes a byte buffer to the serialized form of a UTrie2.
 * @return An unserialized Trie2, ready for use.
 * @throws IllegalArgumentException if the buffer does not start with a
 *         UTrie2 signature, or if the value-bits of the options are non-zero.
 * @throws IOException if a read error occurs in the buffer.
 */
public static Trie2 createFromSerialized(ByteBuffer bytes) throws IOException {
    // Serialized layout, from ICU4C utrie2_impl.h:
    //
    //   UTrie2Header header;
    //   uint16_t index[header.indexLength];
    //   uint16_t data[header.shiftedDataLength<<2];  -- or uint32_t data[...]
    //
    // UTrie2Header fields, in stream order:
    //   uint32_t signature          "Tri2" in big-endian US-ASCII (0x54726932)
    //   uint16_t options            15..4 reserved (0), 3..0 UTrie2ValueBits
    //   uint16_t indexLength        UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH
    //   uint16_t shiftedDataLength  data length >> UTRIE2_INDEX_SHIFT
    //   uint16_t index2NullOffset   null index block, not shifted
    //   uint16_t dataNullOffset     null data block, not shifted
    //   uint16_t shiftedHighStart   first code point of the single-value range
    //                               ending with U+10ffff, rounded up and then
    //                               shifted right by UTRIE2_SHIFT_1
    final ByteOrder callerOrder = bytes.order();
    try {
        final UTrie2Header hdr = new UTrie2Header();

        /* check the signature */
        final int rawSignature = bytes.getInt();
        if (rawSignature == 0x54726932) {
            // The buffer is already set to the trie data byte order.
            hdr.signature = rawSignature;
        } else if (rawSignature == 0x32697254) {
            // Byte-swapped signature: temporarily reverse the byte order.
            bytes.order(callerOrder == ByteOrder.BIG_ENDIAN
                        ? ByteOrder.LITTLE_ENDIAN
                        : ByteOrder.BIG_ENDIAN);
            hdr.signature = 0x54726932;
        } else {
            throw new IllegalArgumentException("Buffer does not contain a serialized UTrie2");
        }

        // Remaining header fields are unsigned 16-bit values.
        hdr.options = bytes.getChar();
        hdr.indexLength = bytes.getChar();
        hdr.shiftedDataLength = bytes.getChar();
        hdr.index2NullOffset = bytes.getChar();
        hdr.dataNullOffset = bytes.getChar();
        hdr.shiftedHighStart = bytes.getChar();

        if ((hdr.options & UTRIE2_OPTIONS_VALUE_BITS_MASK) != 0) {
            throw new IllegalArgumentException("UTrie2 serialized format error.");
        }

        final Trie2 trie = new Trie2_16();
        trie.header = hdr;

        /* get the length values and offsets */
        trie.indexLength = hdr.indexLength;
        trie.dataLength = hdr.shiftedDataLength << UTRIE2_INDEX_SHIFT;
        trie.index2NullOffset = hdr.index2NullOffset;
        trie.dataNullOffset = hdr.dataNullOffset;
        trie.highStart = hdr.shiftedHighStart << UTRIE2_SHIFT_1;
        trie.highValueIndex =
            trie.indexLength + trie.dataLength - UTRIE2_DATA_GRANULARITY;

        // With 16 bit data a single char array holds the index immediately
        // followed by the data, so both parts are read in one pass.
        final int totalUnits = trie.indexLength + trie.dataLength;
        trie.index = new char[totalUnits];
        trie.data16 = trie.indexLength;
        for (int pos = 0; pos < totalUnits; pos++) {
            trie.index[pos] = bytes.getChar();
        }

        trie.data32 = null;
        trie.initialValue = trie.index[trie.dataNullOffset];
        trie.errorValue = trie.index[trie.data16 + UTRIE2_BAD_UTF8_DATA_OFFSET];
        return trie;
    } finally {
        bytes.order(callerOrder);
    }
}
/**
* Get the value for a code point as stored in the Trie2.
*
* For input in the lead surrogate range this is the value stored for the
* code point; the alternate value stored for the lead surrogate code unit
* is returned by getFromU16SingleLead() instead.
*
* @param codePoint the code point
* @return the value
*/
public abstract int get(int codePoint);
/**
* Get the trie value for a UTF-16 code unit.
*
* A Trie2 stores two distinct values for input in the lead surrogate
* range, one for lead surrogates, which is the value that will be
* returned by this function, and a second value that is returned
* by Trie2.get().
*
* For code units outside of the lead surrogate range, this function
* returns the same result as Trie2.get().
*
* This function, together with the alternate value for lead surrogates,
* makes possible very efficient processing of UTF-16 strings without
* first converting surrogate pairs to their corresponding 32 bit code point
* values.
*
* At build-time, enumerate the contents of the Trie2 to see if there
* is non-trivial (non-initialValue) data for any of the supplementary
* code points associated with a lead surrogate.
* If so, then set a special (application-specific) value for the
* lead surrogate code _unit_, with Trie2Writable.setForLeadSurrogateCodeUnit().
*
* At runtime, use Trie2.getFromU16SingleLead(). If there is non-trivial
* data and the code unit is a lead surrogate, then check if a trail surrogate
* follows. If so, assemble the supplementary code point and look up its value
* with Trie2.get(); otherwise reset the lead
* surrogate's value or do a code point lookup for it.
*
* If there is only trivial data for lead and trail surrogates, then processing
* can often skip them. For example, in normalization or case mapping
* all characters that do not have any mappings are simply copied as is.
*
* @param c the UTF-16 code unit to look up; may be a lead surrogate.
* @return the value
*/
public abstract int getFromU16SingleLead(char c);
/**
 * Element type produced when iterating over the contents of a Trie2.
 * Each Range describes one contiguous run of code points (or, when
 * leadSurrogate is true, lead surrogate code units) having the same value.
 *
 * The iterator reuses one Range object and returns it for every step.
 * If you need to retain complete iteration results, copy the fields of
 * each returned Range before advancing to the next iteration step.
 */
public static class Range {
    public int startCodePoint;
    public int endCodePoint; // Inclusive.
    public int value;
    public boolean leadSurrogate;

    /**
     * Two ranges are equal when they are of exactly the same class and
     * all four fields match.
     */
    @Override
    public boolean equals(Object other) {
        if (other == null) {
            return false;
        }
        if (other.getClass() != getClass()) {
            return false;
        }
        final Range that = (Range) other;
        if (startCodePoint != that.startCodePoint) {
            return false;
        }
        if (endCodePoint != that.endCodePoint) {
            return false;
        }
        if (value != that.value) {
            return false;
        }
        return leadSurrogate == that.leadSurrogate;
    }

    /**
     * Hash over the same four fields that equals() compares, built with
     * the hashing helpers of the enclosing Trie2.
     */
    @Override
    public int hashCode() {
        int hash = hashUChar32(initHash(), startCodePoint);
        hash = hashUChar32(hash, endCodePoint);
        hash = hashInt(hash, value);
        return hashByte(hash, leadSurrogate ? 1 : 0);
    }
}
/**
* Create an iterator over the value ranges in this Trie2.
* Values from the Trie2 are not remapped or filtered, but are returned as they
* are stored in the Trie2. This simply delegates to iterator(ValueMapper)
* with the identity mapper.
*
* @return an Iterator
*/
public Iterator<Range> iterator() {
return iterator(defaultValueMapper);
}
/**
 * Identity mapper: returns every trie value unchanged.
 * Used by iterator() so that ranges are formed from the raw stored values.
 * Declared final because it is a shared constant that is never reassigned.
 */
private static final ValueMapper defaultValueMapper = new ValueMapper() {
    @Override
    public int map(int in) {
        return in;
    }
};
/**
* Create an iterator over the value ranges from this Trie2.
* Values from the Trie2 are passed through a caller-supplied remapping function,
* and it is the remapped values that determine the ranges that
* will be produced by the iterator.
*
* @param mapper provides a function to remap values obtained from the Trie2.
* @return an Iterator over the complete contents of this Trie2
*/
public Iterator<Range> iterator(ValueMapper mapper) {
return new Trie2Iterator(mapper);
}
/**
* When iterating over the contents of a Trie2, an instance of ValueMapper may
* be used to remap the values from the Trie2. The remapped values will be used
* both in determining the ranges of codepoints and as the value to be returned
* for each range.
*
* Example of use, with an anonymous implementation of ValueMapper:
*
* <pre>{@code
* ValueMapper m = new ValueMapper() {
*     public int map(int in) {return in & 0x1f;}
* };
* for (Iterator<Trie2.Range> iter = trie.iterator(m); iter.hasNext(); ) {
*     Trie2.Range r = iter.next();
*     ... // Do something with the range r.
* }
* }</pre>
*
*/
public interface ValueMapper {
public int map(int originalVal);
}
//--------------------------------------------------------------------------------
//
// Below this point are internal implementation items. No further public API.
//
//--------------------------------------------------------------------------------
/**
* Trie2 data structure in serialized form:
*
* UTrie2Header header;
* uint16_t index[header.indexLength];
* uint16_t data[header.shiftedDataLength<<2]; -- or uint32_t data[...]
*
* For Java, this is read from the stream into an instance of UTrie2Header.
* (The C version just places a struct over the raw serialized data.)
* The uint16_t fields are held in Java ints; createFromSerialized() fills
* them from unsigned 16-bit reads.
*
* @internal
*/
static class UTrie2Header {
/** "Tri2" in big-endian US-ASCII (0x54726932) */
int signature;
/**
* options bit field (uint16_t):
* 15.. 4 reserved (0)
* 3.. 0 UTrie2ValueBits valueBits
*/
int options;
/** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH (uint16_t) */
int indexLength;
/** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT (uint16_t) */
int shiftedDataLength;
/** Null index and data blocks, not shifted. (uint16_t) */
int index2NullOffset, dataNullOffset;
/**
* First code point of the single-value range ending with U+10ffff,
* rounded up and then shifted right by UTRIE2_SHIFT_1. (uint16_t)
*/
int shiftedHighStart;
}
//
// Data members of UTrie2.
// All of them are filled in by createFromSerialized().
//
// The header exactly as read from the serialized form.
UTrie2Header header;
char index[]; // Index array. Includes data for 16 bit Tries.
int data16; // Offset to data portion of the index array, if 16 bit data.
// zero if 32 bit data.
int data32[]; // NULL if 16b data is used via index
// Number of 16-bit units in the index part (header.indexLength).
int indexLength;
// Number of data units (header.shiftedDataLength << UTRIE2_INDEX_SHIFT).
int dataLength;
int index2NullOffset; // 0xffff if there is no dedicated index-2 null block
// Value read from index[dataNullOffset] when the trie is loaded.
int initialValue;
/** Value returned for out-of-range code points and illegal UTF-8. */
int errorValue;
/* Start of the last range which ends at U+10ffff, and its value. */
int highStart;
// Position in index[] of the value shared by highStart..U+10ffff;
// set to indexLength + dataLength - UTRIE2_DATA_GRANULARITY on load.
int highValueIndex;
// Offset of the null data block, copied from the header (not shifted).
int dataNullOffset;
/*
* Trie2 constants, defining shift widths, index array lengths, etc.
*
* These are needed for the runtime macros but users can treat these as
* implementation details and skip to the actual public API further below.
*/
/**
* Mask for the value-bits field (bits 3..0) of the header options.
* createFromSerialized() rejects data in which any of these bits is set.
*/
static final int UTRIE2_OPTIONS_VALUE_BITS_MASK=0x000f;
/** Shift size for getting the index-1 table offset. */
static final int UTRIE2_SHIFT_1=6+5;
/** Shift size for getting the index-2 table offset. */
static final int UTRIE2_SHIFT_2=5;
/**
* Difference between the two shift sizes,
* for getting an index-1 offset from an index-2 offset. 6=11-5
*/
static final int UTRIE2_SHIFT_1_2=UTRIE2_SHIFT_1-UTRIE2_SHIFT_2;
/**
* Number of index-1 entries for the BMP. 32=0x20
* This part of the index-1 table is omitted from the serialized form.
*/
static final int UTRIE2_OMITTED_BMP_INDEX_1_LENGTH=0x10000>>UTRIE2_SHIFT_1;
/** Number of entries in an index-2 block. 64=0x40 */
static final int UTRIE2_INDEX_2_BLOCK_LENGTH=1<<UTRIE2_SHIFT_1_2;
/** Mask for getting the lower bits for the in-index-2-block offset. */
static final int UTRIE2_INDEX_2_MASK=UTRIE2_INDEX_2_BLOCK_LENGTH-1;
/** Number of entries in a data block. 32=0x20 */
static final int UTRIE2_DATA_BLOCK_LENGTH=1<<UTRIE2_SHIFT_2;
/** Mask for getting the lower bits for the in-data-block offset. */
static final int UTRIE2_DATA_MASK=UTRIE2_DATA_BLOCK_LENGTH-1;
/**
* Shift size for shifting left the index array values.
* Increases possible data size with 16-bit index values at the cost
* of compactability.
* This requires data blocks to be aligned by UTRIE2_DATA_GRANULARITY.
*/
static final int UTRIE2_INDEX_SHIFT=2;
/** The alignment size of a data block. Also the granularity for compaction. */
static final int UTRIE2_DATA_GRANULARITY=1<<UTRIE2_INDEX_SHIFT;
/**
* The part of the index-2 table for U+D800..U+DBFF stores values for
* lead surrogate code _units_ not code _points_.
* Values for lead surrogate code _points_ are indexed with this portion of the table.
* Length=32=0x20=0x400>>UTRIE2_SHIFT_2. (There are 1024=0x400 lead surrogates.)
*/
static final int UTRIE2_LSCP_INDEX_2_OFFSET=0x10000>>UTRIE2_SHIFT_2;
/** Length of the lead-surrogate code point part of the index-2 table. 32=0x20 */
static final int UTRIE2_LSCP_INDEX_2_LENGTH=0x400>>UTRIE2_SHIFT_2;
/** Count the lengths of both BMP pieces. 2080=0x820 */
static final int UTRIE2_INDEX_2_BMP_LENGTH=UTRIE2_LSCP_INDEX_2_OFFSET+UTRIE2_LSCP_INDEX_2_LENGTH;
/**
* The 2-byte UTF-8 version of the index-2 table follows at offset 2080=0x820.
* Length 32=0x20 for lead bytes C0..DF, regardless of UTRIE2_SHIFT_2.
*/
static final int UTRIE2_UTF8_2B_INDEX_2_OFFSET=UTRIE2_INDEX_2_BMP_LENGTH;
static final int UTRIE2_UTF8_2B_INDEX_2_LENGTH=0x800>>6; /* U+0800 is the first code point after 2-byte UTF-8 */
/**
* The index-1 table, only used for supplementary code points, at offset 2112=0x840.
* Variable length, for code points up to highStart, where the last single-value range starts.
* Maximum length 512=0x200=0x100000>>UTRIE2_SHIFT_1.
* (For 0x100000 supplementary code points U+10000..U+10ffff.)
*
* The part of the index-2 table for supplementary code points starts
* after this index-1 table.
*
* Both the index-1 table and the following part of the index-2 table
* are omitted completely if there is only BMP data.
*/
static final int UTRIE2_INDEX_1_OFFSET=UTRIE2_UTF8_2B_INDEX_2_OFFSET+UTRIE2_UTF8_2B_INDEX_2_LENGTH;
/**
* The illegal-UTF-8 data block follows the ASCII block, at offset 128=0x80.
* Used with linear access for single bytes 0..0xbf for simple error handling.
* Length 64=0x40, not UTRIE2_DATA_BLOCK_LENGTH.
*/
static final int UTRIE2_BAD_UTF8_DATA_OFFSET=0x80;
/**
 * Implementation class for an iterator over a Trie2.
 *
 * Iteration first returns all of the ranges that are indexed by code points,
 * then returns the special alternate values for the lead surrogate code
 * units (U+D800..U+DBFF). The same Range object is returned from every
 * call to next().
 *
 * @internal
 */
class Trie2Iterator implements Iterator<Range> {
    //
    // Iteration State Variables
    //
    private ValueMapper mapper;
    private Range returnValue = new Range();
    // The starting code point for the next range to be returned.
    private int nextStart;
    // The upper limit for the last normal range to be returned. Normally 0x110000, but
    // may be lower when iterating over the code points for a single lead surrogate.
    private int limitCP;
    // True while iterating over the Trie2 values for code points.
    // False while iterating over the alternate values for lead surrogates.
    private boolean doingCodePoints = true;
    // True if the iterator should iterate the special values for lead surrogates in
    // addition to the normal values for code points.
    private boolean doLeadSurrogates = true;

    // The normal constructor that configures the iterator to cover the complete
    // contents of the Trie2
    Trie2Iterator(ValueMapper vm) {
        mapper = vm;
        nextStart = 0;
        limitCP = 0x110000;
        doLeadSurrogates = true;
    }

    /**
     * Returns the next range: the longest run starting at the current
     * position whose mapped values are all equal.
     *
     * @throws NoSuchElementException if the iteration is exhausted
     */
    public Range next() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        if (nextStart >= limitCP) {
            // Switch over from iterating normal code point values to
            // doing the alternate lead-surrogate values.
            doingCodePoints = false;
            nextStart = 0xd800;
        }

        int mappedValue;
        int last;
        if (doingCodePoints) {
            // Iteration over code point values.
            int raw = get(nextStart);
            mappedValue = mapper.map(raw);
            last = rangeEnd(nextStart, limitCP, raw);
            // Each pass absorbs one more run of identical raw values, for as
            // long as that run maps to the same mapped value.
            while (last < limitCP - 1) {
                raw = get(last + 1);
                if (mapper.map(raw) != mappedValue) {
                    break;
                }
                last = rangeEnd(last + 1, limitCP, raw);
            }
        } else {
            // Iteration over the alternate lead surrogate values.
            int raw = getFromU16SingleLead((char) nextStart);
            mappedValue = mapper.map(raw);
            last = rangeEndLS((char) nextStart);
            // Same merging of adjacent runs as above, bounded by the last
            // lead surrogate U+DBFF.
            while (last < 0xdbff) {
                raw = getFromU16SingleLead((char) (last + 1));
                if (mapper.map(raw) != mappedValue) {
                    break;
                }
                last = rangeEndLS((char) (last + 1));
            }
        }

        returnValue.startCodePoint = nextStart;
        returnValue.endCodePoint = last;
        returnValue.value = mappedValue;
        returnValue.leadSurrogate = !doingCodePoints;
        nextStart = last + 1;
        return returnValue;
    }

    /**
     * @return true while there is another range to return from next()
     */
    public boolean hasNext() {
        if (nextStart < 0xdc00) {
            return true;
        }
        return doingCodePoints && (doLeadSurrogates || nextStart < limitCP);
    }

    /**
     * Finds the last lead surrogate code unit, starting from startingLS,
     * of the run that shares startingLS's code-unit value.
     */
    private int rangeEndLS(char startingLS) {
        if (startingLS >= 0xdbff) {
            return 0xdbff;
        }
        final int runValue = getFromU16SingleLead(startingLS);
        int last = startingLS;
        while (last < 0xdbff && getFromU16SingleLead((char) (last + 1)) == runValue) {
            last++;
        }
        return last;
    }
}
/**
 * Find the last code point in a contiguous range of code points that
 * all have the given Trie2 value, scanning upward from a starting point.
 *
 * Scanning stops at the smaller of highStart and limitp. If the scan
 * reaches highStart, the range is taken to extend to limitp - 1.
 *
 * @param start  The code point to begin with; assumed to have value val.
 * @param limitp Exclusive upper limit for the range.
 * @param val    The value that every code point in the range must have.
 * @return The last contiguous code point with the same value.
 */
int rangeEnd(int start, int limitp, int val) {
    final int scanLimit = Math.min(highStart, limitp);
    int c = start + 1;
    while (c < scanLimit && get(c) == val) {
        c++;
    }
    final int endExclusive = (c >= highStart) ? limitp : c;
    return endExclusive - 1;
}
//
// Hashing implementation functions. FNV hash. Respected public domain algorithm.
//
// Returns the starting value for a hash computation; the other hash
// helpers below fold further data into it.
private static int initHash() {
return 0x811c9DC5; // unsigned 2166136261
}
// Folds one value into the running hash: multiply the hash by the
// constant 16777619, then XOR in b.
private static int hashByte(int h, int b) {
    return (h * 16777619) ^ b;
}
// Folds a code point into the running hash in three steps: the low byte,
// the second byte, and finally everything from bit 16 upward (unmasked).
private static int hashUChar32(int h, int c) {
    final int afterLow = Trie2.hashByte(h, c & 255);
    final int afterMid = Trie2.hashByte(afterLow, (c >> 8) & 255);
    return Trie2.hashByte(afterMid, c >> 16);
}
// Folds all four bytes of an int into the running hash, lowest byte first.
private static int hashInt(int h, int i) {
    int hash = h;
    for (int shift = 0; shift < 32; shift += 8) {
        hash = Trie2.hashByte(hash, (i >> shift) & 255);
    }
    return hash;
}
}

View file

@ -1,167 +0,0 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 2009-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.IOException;
import java.nio.ByteBuffer;
/**
 * A read-only Trie2, holding 16 bit data values.
 *
 * A Trie2 is a highly optimized data structure for mapping from Unicode
 * code points (values ranging from 0 to 0x10ffff) to a 16 or 32 bit value.
 *
 * See class Trie2 for descriptions of the API for accessing the contents of a trie.
 *
 * The 16 bit data lives in the same char array as the index, so every
 * lookup here ends with a read from {@code index[]}.
 *
 * @author aheninger
 */
public final class Trie2_16 extends Trie2 {

    /**
     * Internal constructor, not for general use.
     */
    Trie2_16() {
    }

    /**
     * Create a Trie2_16 from its serialized form. Inverse of utrie2_serialize().
     * The work is done by Trie2.createFromSerialized(); this method only
     * narrows the result type.
     *
     * The serialized Trie2 in the bytes may be in either little or big endian byte order.
     *
     * @param bytes a byte buffer to the serialized form of a UTrie2.
     * @return An unserialized Trie2_16, ready for use.
     * @throws IllegalArgumentException if the buffer does not contain a serialized Trie2.
     * @throws IOException if a read error occurs in the buffer.
     * @throws ClassCastException if the unserialized Trie2 is not a Trie2_16
     */
    public static Trie2_16 createFromSerialized(ByteBuffer bytes) throws IOException {
        final Trie2 parsed = Trie2.createFromSerialized(bytes);
        return (Trie2_16) parsed;
    }

    /**
     * Get the value for a code point as stored in the Trie2.
     *
     * @param codePoint the code point
     * @return the value, or errorValue when the code point is outside 0..0x10ffff
     */
    @Override
    public final int get(int codePoint) {
        if (codePoint < 0) {
            return errorValue;
        }

        // Offset of the code point inside its data block.
        final int inBlock = codePoint & UTRIE2_DATA_MASK;

        if (codePoint <= 0xffff) {
            // BMP: single level lookup. Ordinary code points use the index
            // starting at offset 0. Lead surrogate code points use a separate
            // index section, because the main index holds the code unit data
            // for them.
            final boolean leadSurrogate = codePoint >= 0xd800 && codePoint <= 0xdbff;
            final int index2Pos = leadSurrogate
                ? UTRIE2_LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> UTRIE2_SHIFT_2)
                : codePoint >> UTRIE2_SHIFT_2;
            return index[(index[index2Pos] << UTRIE2_INDEX_SHIFT) + inBlock];
        }

        if (codePoint < highStart) {
            // Supplemental code point, use two-level lookup.
            final int index1Pos =
                (UTRIE2_INDEX_1_OFFSET - UTRIE2_OMITTED_BMP_INDEX_1_LENGTH)
                + (codePoint >> UTRIE2_SHIFT_1);
            final int index2Pos =
                index[index1Pos] + ((codePoint >> UTRIE2_SHIFT_2) & UTRIE2_INDEX_2_MASK);
            return index[(index[index2Pos] << UTRIE2_INDEX_SHIFT) + inBlock];
        }

        if (codePoint <= 0x10ffff) {
            // Inside the single-value range that ends at U+10ffff.
            return index[highValueIndex];
        }

        // The code point is outside of the legal range of 0..0x10ffff.
        return errorValue;
    }

    /**
     * Get a Trie2 value for a UTF-16 code unit.
     *
     * This function returns the same value as get() if the input
     * character is outside of the lead surrogate range
     *
     * There are two values stored in a Trie2 for inputs in the lead
     * surrogate range. This function returns the alternate value,
     * while Trie2.get() returns the main value.
     *
     * @param codeUnit a 16 bit code unit or lead surrogate value.
     * @return the value
     */
    @Override
    public int getFromU16SingleLead(char codeUnit) {
        // A char is always in the BMP range, so the single level lookup
        // through the main index applies without any range tests.
        final int dataBlock = index[codeUnit >> UTRIE2_SHIFT_2] << UTRIE2_INDEX_SHIFT;
        return index[dataBlock + (codeUnit & UTRIE2_DATA_MASK)];
    }

    /**
     * @return the number of bytes of the serialized trie
     */
    public int getSerializedLength() {
        // 16 header bytes, then two bytes per index unit and per data unit.
        final int unitCount = header.indexLength + dataLength;
        return 16 + unitCount * 2;
    }
}

View file

@ -1,267 +0,0 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
*
* Copyright (C) 2004-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: UBiDiProps.java
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2005jan16
* created by: Markus W. Scherer
*
* Low-level Unicode bidi/shaping properties access.
* Java port of ubidi_props.h/.c.
*/
package sun.text.normalizer;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.MissingResourceException;
public final class UBiDiProps {
// constructors etc. --------------------------------------------------- ***
// port of ubidi_openProps()
private UBiDiProps() throws IOException{
ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME);
readData(bytes);
}
private void readData(ByteBuffer bytes) throws IOException {
// read the header
ICUBinary.readHeader(bytes, FMT, new IsAcceptable());
// read indexes[]
int i, count;
count=bytes.getInt();
if(count<IX_TOP) {
throw new IOException("indexes[0] too small in "+DATA_FILE_NAME);
}
indexes=new int[count];
indexes[0]=count;
for(i=1; i<count; ++i) {
indexes[i]=bytes.getInt();
}
// read the trie
trie=Trie2_16.createFromSerialized(bytes);
int expectedTrieLength=indexes[IX_TRIE_SIZE];
int trieLength=trie.getSerializedLength();
if(trieLength>expectedTrieLength) {
throw new IOException(DATA_FILE_NAME+": not enough bytes for the trie");
}
// skip padding after trie bytes
ICUBinary.skipBytes(bytes, expectedTrieLength-trieLength);
// read mirrors[]
count=indexes[IX_MIRROR_LENGTH];
if(count>0) {
mirrors=new int[count];
for(i=0; i<count; ++i) {
mirrors[i]=bytes.getInt();
}
}
// read jgArray[]
count=indexes[IX_JG_LIMIT]-indexes[IX_JG_START];
jgArray=new byte[count];
for(i=0; i<count; ++i) {
jgArray[i]=bytes.get();
}
// read jgArray2[]
count=indexes[IX_JG_LIMIT2]-indexes[IX_JG_START2];
jgArray2=new byte[count];
for(i=0; i<count; ++i) {
jgArray2[i]=bytes.get();
}
}
// implement ICUBinary.Authenticate
private static final class IsAcceptable implements ICUBinary.Authenticate {
public boolean isDataVersionAcceptable(byte version[]) {
return version[0]==2;
}
}
// property access functions ------------------------------------------- ***
public final int getClass(int c) {
return getClassFromProps(trie.get(c));
}
private final int getMirror(int c, int props) {
int delta=getMirrorDeltaFromProps(props);
if(delta!=ESC_MIRROR_DELTA) {
return c+delta;
} else {
/* look for mirror code point in the mirrors[] table */
int m;
int i, length;
int c2;
length=indexes[IX_MIRROR_LENGTH];
/* linear search */
for(i=0; i<length; ++i) {
m=mirrors[i];
c2=getMirrorCodePoint(m);
if(c==c2) {
/* found c, return its mirror code point using the index in m */
return getMirrorCodePoint(mirrors[getMirrorIndex(m)]);
} else if(c<c2) {
break;
}
}
/* c not found, return it itself */
return c;
}
}
public final int getMirror(int c) {
int props=trie.get(c);
return getMirror(c, props);
}
public final int getJoiningType(int c) {
return (trie.get(c)&JT_MASK)>>JT_SHIFT;
}
public final int getJoiningGroup(int c) {
int start, limit;
start=indexes[IX_JG_START];
limit=indexes[IX_JG_LIMIT];
if(start<=c && c<limit) {
return (int)jgArray[c-start]&0xff;
}
start=indexes[IX_JG_START2];
limit=indexes[IX_JG_LIMIT2];
if(start<=c && c<limit) {
return (int)jgArray2[c-start]&0xff;
}
return UCharacter.JoiningGroup.NO_JOINING_GROUP;
}
public final int getPairedBracketType(int c) {
return (trie.get(c)&BPT_MASK)>>BPT_SHIFT;
}
public final int getPairedBracket(int c) {
int props=trie.get(c);
if((props&BPT_MASK)==0) {
return c;
} else {
return getMirror(c, props);
}
}
// data members -------------------------------------------------------- ***
private int indexes[];
private int mirrors[];
private byte jgArray[];
private byte jgArray2[];
private Trie2_16 trie;
// data format constants ----------------------------------------------- ***
private static final String DATA_FILE_NAME = "/sun/text/resources/ubidi.icu";
/* format "BiDi" */
private static final int FMT=0x42694469;
/* indexes into indexes[] */
private static final int IX_TRIE_SIZE=2;
private static final int IX_MIRROR_LENGTH=3;
private static final int IX_JG_START=4;
private static final int IX_JG_LIMIT=5;
private static final int IX_JG_START2=6; /* new in format version 2.2, ICU 54 */
private static final int IX_JG_LIMIT2=7;
private static final int IX_TOP=16;
// definitions for 16-bit bidi/shaping properties word ----------------- ***
/* CLASS_SHIFT=0, */ /* bidi class: 5 bits (4..0) */
private static final int JT_SHIFT=5; /* joining type: 3 bits (7..5) */
private static final int BPT_SHIFT=8; /* Bidi_Paired_Bracket_Type(bpt): 2 bits (9..8) */
private static final int MIRROR_DELTA_SHIFT=13; /* bidi mirroring delta: 3 bits (15..13) */
private static final int CLASS_MASK= 0x0000001f;
private static final int JT_MASK= 0x000000e0;
private static final int BPT_MASK= 0x00000300;
/**
 * Isolates the bidi class field of a properties word.
 *
 * @param props a properties word
 * @return {@code props} masked with {@code CLASS_MASK}
 */
private static final int getClassFromProps(int props) {
    final int bidiClass = props & CLASS_MASK;
    return bidiClass;
}
/**
 * Tests a single bit of a properties word.
 *
 * @param props a properties word
 * @param shift position of the bit to test
 * @return {@code true} when the bit at position {@code shift} is set
 */
private static final boolean getFlagFromProps(int props, int shift) {
    // Java masks an int shift count to its low five bits for both << and >>,
    // so testing against (1 << shift) selects the same bit as (props >> shift) & 1.
    final int bit = 1 << shift;
    return (props & bit) != 0;
}
/**
 * Extracts the signed mirroring delta from a properties word.
 *
 * @param props a properties word
 * @return the bits from {@code MIRROR_DELTA_SHIFT} up to bit 15, sign-extended
 */
private static final int getMirrorDeltaFromProps(int props) {
    // Narrowing to short keeps the low 16 bits and makes bit 15 the sign bit,
    // so the arithmetic shift below produces a signed delta.
    final short word = (short) props;
    return word >> MIRROR_DELTA_SHIFT;
}
/*
* Reserved mirror-delta value.
* NOTE(review): presumably the marker getMirror(c, props) checks for before
* falling back to the mirrors[] search; that comparison is outside this
* chunk -- confirm against the full method.
*/
private static final int ESC_MIRROR_DELTA=-4;
// definitions for 32-bit mirror table entry --------------------------- ***
/*
* The source Unicode code point takes 21 bits (20..0); getMirrorCodePoint()
* masks them out with 0x1fffff. The bits from MIRROR_INDEX_SHIFT upward hold
* an index into mirrors[], extracted by getMirrorIndex().
*/
private static final int MIRROR_INDEX_SHIFT=21;
/**
 * Returns the code point stored in the low 21 bits of a mirror table entry.
 *
 * @param m a mirror table entry
 * @return {@code m} with everything above bit 20 cleared
 */
private static final int getMirrorCodePoint(int m) {
    final int codePointMask = (1 << 21) - 1;  // 0x1fffff
    return m & codePointMask;
}
/**
 * Returns the index stored in the upper bits of a mirror table entry.
 *
 * @param m a mirror table entry
 * @return {@code m} shifted right (unsigned) by {@code MIRROR_INDEX_SHIFT}
 */
private static final int getMirrorIndex(int m) {
    final int index = m >>> MIRROR_INDEX_SHIFT;
    return index;
}
/*
* public singleton instance
*/
public static final UBiDiProps INSTANCE;

// This static initializer block must be placed after
// other static member initialization
static {
    try {
        INSTANCE = new UBiDiProps();
    } catch (IOException e) {
        // MissingResourceException offers no public (message, cause)
        // constructor, so the IOException is attached with initCause() to
        // keep its stack trace available to whoever diagnoses the failure.
        MissingResourceException missing =
            new MissingResourceException(e.getMessage(), DATA_FILE_NAME, "");
        missing.initCause(e);
        throw missing;
    }
}
}

View file

@ -1,539 +0,0 @@
/*
* Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
*******************************************************************************
* Copyright (C) 1996-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
/**
* <p>The UCharacter class provides extensions to the
* <a href="http://java.sun.com/j2se/1.5/docs/api/java/lang/Character.html">
* java.lang.Character</a> class. These extensions provide support for
* more Unicode properties and together with the <a href=../text/UTF16.html>UTF16</a>
* class, provide support for supplementary characters (those with code
* points above U+FFFF).
* Each ICU release supports the latest version of Unicode available at that time.
*
* <p>Code points are represented in these API using ints. While it would be
* more convenient in Java to have a separate primitive datatype for them,
* ints suffice in the meantime.
*
* <p>To use this class please add the jar file name icu4j.jar to the
* class path, since it contains data files which supply the information used
* by this file.<br>
* E.g. In Windows <br>
* <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/ucharacter.jar</code>.<br>
* Otherwise, another method would be to copy the files uprops.dat and
* unames.icu from the icu4j source subdirectory
* <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory
* <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>.
*
* <p>Aside from the additions for UTF-16 support, and the updated Unicode
* properties, the main differences between UCharacter and Character are:
* <ul>
* <li> UCharacter is not designed to be a char wrapper and does not have
* APIs that involve management of that single char.<br>
* These include:
* <ul>
* <li> char charValue(),
* <li> int compareTo(java.lang.Character, java.lang.Character), etc.
* </ul>
* <li> UCharacter does not include Character APIs that are deprecated, nor
* does it include the Java-specific character information, such as
* boolean isJavaIdentifierPart(char ch).
* <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric
* values '10' - '35'. UCharacter also does this in digit and
* getNumericValue, to adhere to the java semantics of these
* methods. New methods unicodeDigit, and
* getUnicodeNumericValue do not treat the above code points
* as having numeric values. This is a semantic change from ICU4J 1.3.1.
* </ul>
* <p>
* Further detail on differences can be determined using the program
* <a href=
* "http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java">
* com.ibm.icu.dev.test.lang.UCharacterCompare</a>
* </p>
* <p>
* In addition to Java compatibility functions, which calculate derived properties,
* this API provides low-level access to the Unicode Character Database.
* </p>
* <p>
* Unicode assigns each code point (not just assigned character) values for
* many properties.
* Most of them are simple boolean flags, or constants from a small enumerated list.
* For some properties, values are strings or other relatively more complex types.
* </p>
* <p>
* For more information see
* <a href="http://www.unicode.org/ucd/">"About the Unicode Character Database"</a>
* (http://www.unicode.org/ucd/)
* and the <a href="http://www.icu-project.org/userguide/properties.html">ICU
* User Guide chapter on Properties</a>
* (http://www.icu-project.org/userguide/properties.html).
* </p>
* <p>
* There are also functions that provide easy migration from C/POSIX functions
* like isblank(). Their use is generally discouraged because the C/POSIX
* standards do not define their semantics beyond the ASCII range, which means
* that different implementations exhibit very different behavior.
* Instead, Unicode properties should be used directly.
* </p>
* <p>
* There are also only a few, broad C/POSIX character classes, and they tend
* to be used for conflicting purposes. For example, the "isalpha()" class
* is sometimes used to determine word boundaries, while a more sophisticated
* approach would at least distinguish initial letters from continuation
* characters (the latter including combining marks).
* (In ICU, BreakIterator is the most sophisticated API for word boundaries.)
* Another example: There is no "istitle()" class for titlecase characters.
* </p>
* <p>
* ICU 3.4 and later provides API access for all twelve C/POSIX character classes.
* ICU implements them according to the Standard Recommendations in
* Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions
* (http://www.unicode.org/reports/tr18/#Compatibility_Properties).
* </p>
* <p>
* API access for C/POSIX character classes is as follows:
* <pre>{@code
* - alpha: isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC)
* - lower: isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE)
* - upper: isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE)
* - punct: ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)|
* (1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)|
* (1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0
* - digit: isDigit(c) or getType(c)==DECIMAL_DIGIT_NUMBER
* - xdigit: hasBinaryProperty(c, UProperty.POSIX_XDIGIT)
* - alnum: hasBinaryProperty(c, UProperty.POSIX_ALNUM)
* - space: isUWhiteSpace(c) or hasBinaryProperty(c, UProperty.WHITE_SPACE)
* - blank: hasBinaryProperty(c, UProperty.POSIX_BLANK)
* - cntrl: getType(c)==CONTROL
* - graph: hasBinaryProperty(c, UProperty.POSIX_GRAPH)
* - print: hasBinaryProperty(c, UProperty.POSIX_PRINT)
* }</pre>
* </p>
* <p>
* The C/POSIX character classes are also available in UnicodeSet patterns,
* using patterns like [:graph:] or \p{graph}.
* </p>
*
* There are several ICU (and Java) whitespace functions.
* Comparison:<ul>
* <li> isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
* most of general categories "Z" (separators) + most whitespace ISO controls
* (including no-break spaces, but excluding IS1..IS4 and ZWSP)
* <li> isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
* <li> isSpaceChar: just Z (including no-break spaces)</ul>
* </p>
* <p>
* This class is not subclassable.
* </p>
* @author Syn Wee Quek
* @stable ICU 2.1
* @see com.ibm.icu.lang.UCharacterEnums
*/
public final class UCharacter
{
/**
* Joining Group constants.
* @see UProperty#JOINING_GROUP
* @stable ICU 2.4
*/
public static interface JoiningGroup
{
    // Interface fields are implicitly public static final.

    /**
     * Value denoting the absence of a joining group.
     * @stable ICU 2.4
     */
    int NO_JOINING_GROUP = 0;
}
/**
* Numeric Type constants.
* @see UProperty#NUMERIC_TYPE
* @stable ICU 2.4
*/
public static interface NumericType
{
    // Interface fields are implicitly public static final.

    /** @stable ICU 2.4 */
    int NONE = 0;

    /** @stable ICU 2.4 */
    int DECIMAL = 1;

    /** @stable ICU 2.4 */
    int DIGIT = 2;

    /** @stable ICU 2.4 */
    int NUMERIC = 3;

    /**
     * One more than the largest constant defined above.
     * @stable ICU 2.4
     */
    int COUNT = 4;
}
/**
* Hangul Syllable Type constants.
*
* @see UProperty#HANGUL_SYLLABLE_TYPE
* @stable ICU 2.6
*/
public static interface HangulSyllableType
{
    // Interface fields are implicitly public static final.
    // The bracketed tags are the short labels carried over from the original source.

    /** [NA] @stable ICU 2.6 */
    int NOT_APPLICABLE = 0;

    /** [L] @stable ICU 2.6 */
    int LEADING_JAMO = 1;

    /** [V] @stable ICU 2.6 */
    int VOWEL_JAMO = 2;

    /** [T] @stable ICU 2.6 */
    int TRAILING_JAMO = 3;

    /** [LV] @stable ICU 2.6 */
    int LV_SYLLABLE = 4;

    /** [LVT] @stable ICU 2.6 */
    int LVT_SYLLABLE = 5;

    /**
     * One more than the largest constant defined above.
     * @stable ICU 2.6
     */
    int COUNT = 6;
}
// public data members -----------------------------------------------
/**
* The lowest Unicode code point value, taken from
* {@code UTF16.CODEPOINT_MIN_VALUE}. Used as the lower bound of the range
* check in {@link #getAge(int)}.
* @stable ICU 2.1
*/
public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;
/**
* The highest Unicode code point value (scalar value) according to the
* Unicode Standard, taken from {@code UTF16.CODEPOINT_MAX_VALUE}.
* This is a 21-bit value (21 bits, rounded up).<br>
* Up-to-date Unicode counterpart of java.lang.Character.MAX_VALUE.
* Used as the upper bound of the range check in {@link #getAge(int)}.
* @stable ICU 2.1
*/
public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;
// public methods ----------------------------------------------------
/**
* Returns the numeric value of a decimal digit code point.
* <br>This method observes the semantics of
* <code>java.lang.Character.digit()</code>. Note that this
* will return positive values for code points for which isDigit
* returns false, just like java.lang.Character.
* <br><em>Semantic Change:</em> In release 1.3.1 and
* prior, this did not treat the European letters as having a
* digit value, and also treated numeric letters and other numbers as
* digits.
* This has been changed to conform to the java semantics.
* <br>A code point is a valid digit if and only if:
* <ul>
* <li>ch is a decimal digit or one of the european letters, and
* <li>the value of ch is less than the specified radix.
* </ul>
* @param ch the code point to query
* @param radix the radix
* @return the numeric value represented by the code point in the
* specified radix, or -1 if the code point is not a decimal digit
* or if its value is too large for the radix
* @stable ICU 2.1
*/
public static int digit(int ch, int radix)
{
    // Only radices 2..36 are supported; anything else is rejected up front.
    if (radix < 2 || radix > 36) {
        return -1;
    }

    int digitValue = digit(ch);
    if (digitValue < 0) {
        // ch is not a decimal digit, so fall back to the Latin-letter values
        digitValue = UCharacterProperty.getEuropeanDigit(ch);
    }

    // A value that does not fit the radix is reported as "no digit".
    if (digitValue >= radix) {
        return -1;
    }
    return digitValue;
}
/**
* Returns the numeric value of a decimal digit code point.
* <br>This is a convenience overload of <code>digit(int, int)</code>
* that provides a decimal radix.
* <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
* treated numeric letters and other numbers as digits. This has
* been changed to conform to the java semantics.
* @param ch the code point to query
* @return the numeric value represented by the code point,
* or -1 if the code point is not a decimal digit or if its
* value is too large for a decimal radix
* @stable ICU 2.1
*/
public static int digit(int ch)
{
    // Delegates to the shared character-property singleton.
    final UCharacterProperty properties = UCharacterProperty.INSTANCE;
    return properties.digit(ch);
}
/**
* Returns a value indicating a code point's Unicode category.
* Up-to-date Unicode implementation of java.lang.Character.getType()
* except for the above mentioned code points that had their category
* changed.<br>
* Return results are constants from the interface
* <a href=UCharacterCategory.html>UCharacterCategory</a><br>
* <em>NOTE:</em> the UCharacterCategory values are <em>not</em> compatible with
* those returned by java.lang.Character.getType. UCharacterCategory values
* match the ones used in ICU4C, while java.lang.Character type
* values, though similar, skip the value 17.</p>
* @param ch code point whose type is to be determined
* @return category which is a value of UCharacterCategory
* @stable ICU 2.1
*/
public static int getType(int ch)
{
    // Delegates to the shared character-property singleton.
    final UCharacterProperty properties = UCharacterProperty.INSTANCE;
    return properties.getType(ch);
}
/**
* Returns the Bidirection property of a code point.
* For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional
* property.<br>
* Result returned belongs to the interface
* <a href=UCharacterDirection.html>UCharacterDirection</a>
* @param ch the code point whose direction is to be determined
* @return direction constant from UCharacterDirection.
* @stable ICU 2.1
*/
public static int getDirection(int ch)
{
    // The bidi class is served by the UBiDiProps singleton.
    final UBiDiProps bidiProps = UBiDiProps.INSTANCE;
    return bidiProps.getClass(ch);
}
/**
* Maps the specified code point to a "mirror-image" code point.
* For code points with the "mirrored" property, implementations sometimes
* need a "poor man's" mapping to another code point such that the default
* glyph may serve as the mirror-image of the default glyph of the
* specified code point.<br>
* This is useful for text conversion to and from codepages with visual
* order, and for displays without glyph selection capabilities.
* @param ch code point whose mirror is to be retrieved
* @return another code point that may serve as a mirror-image substitute,
* or ch itself if there is no such mapping or ch does not have the
* "mirrored" property
* @stable ICU 2.1
*/
public static int getMirror(int ch)
{
    // The mirror mapping is served by the UBiDiProps singleton.
    final UBiDiProps bidiProps = UBiDiProps.INSTANCE;
    return bidiProps.getMirror(ch);
}
/**
* Maps the specified character to its paired bracket character.
* For Bidi_Paired_Bracket_Type!=None, this is the same as getMirror(int).
* Otherwise c itself is returned.
* See http://www.unicode.org/reports/tr9/
*
* @param c the code point to be mapped
* @return the paired bracket code point,
* or c itself if there is no such mapping
* (Bidi_Paired_Bracket_Type=None)
*
* @see UProperty#BIDI_PAIRED_BRACKET
* @see UProperty#BIDI_PAIRED_BRACKET_TYPE
* @see #getMirror(int)
* @stable ICU 52
*/
public static int getBidiPairedBracket(int c) {
    // The paired-bracket mapping is served by the UBiDiProps singleton.
    final UBiDiProps bidiProps = UBiDiProps.INSTANCE;
    return bidiProps.getPairedBracket(c);
}
/**
* Returns the combining class of the argument codepoint
* @param ch code point whose combining is to be retrieved
* @return the combining class of the codepoint
* @stable ICU 2.1
*/
public static int getCombiningClass(int ch)
{
    // The combining class is read through the NFD normalizer instance.
    final Normalizer2 nfd = Normalizer2.getNFDInstance();
    return nfd.getCombiningClass(ch);
}
/**
* Returns the version of Unicode data used.
* @return the unicode version number used
* @stable ICU 2.1
*/
public static VersionInfo getUnicodeVersion()
{
    // The version is a field of the shared character-property singleton.
    final VersionInfo version = UCharacterProperty.INSTANCE.m_unicodeVersion_;
    return version;
}
/**
* Returns a code point corresponding to the two UTF16 characters.
* @param lead the lead char
* @param trail the trail char
* @return code point if surrogate characters are valid.
* @exception IllegalArgumentException thrown when argument characters do
* not form a valid codepoint
* @stable ICU 2.1
*/
public static int getCodePoint(char lead, char trail)
{
    // Both halves must be surrogates of the right kind; the trail check is
    // only evaluated when the lead check passes.
    final boolean validPair =
        UTF16.isLeadSurrogate(lead) && UTF16.isTrailSurrogate(trail);
    if (!validPair) {
        throw new IllegalArgumentException("Illegal surrogate characters");
    }
    return UCharacterProperty.getRawSupplementary(lead, trail);
}
/**
* Returns the "age" of the code point.</p>
* <p>The "age" is the Unicode version when the code point was first
* designated (as a non-character or for Private Use) or assigned a
* character.
* <p>This can be useful to avoid emitting code points to receiving
* processes that do not accept newer characters.</p>
* <p>The data is from the UCD file DerivedAge.txt.</p>
* @param ch The code point.
* @return the Unicode version number
* @stable ICU 2.6
*/
public static VersionInfo getAge(int ch)
{
    // Validate here: the property-level getAge() is documented as not
    // checking the code point itself.
    final boolean inRange = MIN_VALUE <= ch && ch <= MAX_VALUE;
    if (!inRange) {
        throw new IllegalArgumentException("Codepoint out of bounds");
    }
    final UCharacterProperty properties = UCharacterProperty.INSTANCE;
    return properties.getAge(ch);
}
/**
* Returns the property value for an Unicode property type of a code point.
* Also returns binary and mask property values.</p>
* <p>Unicode, especially in version 3.2, defines many more properties than
* the original set in UnicodeData.txt.</p>
* <p>The properties APIs are intended to reflect Unicode properties as
* defined in the Unicode Character Database (UCD) and Unicode Technical
* Reports (UTR). For details about the properties see
* http://www.unicode.org/.</p>
* <p>For names of Unicode properties see the UCD file PropertyAliases.txt.
* </p>
* <pre>
* Sample usage:
* int ea = UCharacter.getIntPropertyValue(c, UProperty.EAST_ASIAN_WIDTH);
* int ideo = UCharacter.getIntPropertyValue(c, UProperty.IDEOGRAPHIC);
* boolean b = (ideo == 1) ? true : false;
* </pre>
* @param ch code point to test.
* @param type UProperty selector constant, identifies which binary
* property to check. Must be
* UProperty.BINARY_START &lt;= type &lt; UProperty.BINARY_LIMIT or
* UProperty.INT_START &lt;= type &lt; UProperty.INT_LIMIT or
* UProperty.MASK_START &lt;= type &lt; UProperty.MASK_LIMIT.
* @return numeric value that is directly the property value or,
* for enumerated properties, corresponds to the numeric value of
* the enumerated constant of the respective property value
* enumeration type (cast to enum type if necessary).
* Returns 0 or 1 (for false / true) for binary Unicode properties.
* Returns a bit-mask for mask properties.
* Returns 0 if 'type' is out of bounds or if the Unicode version
* does not have data for the property at all, or not for this code
* point.
* @see UProperty
* @see #hasBinaryProperty
* @see #getIntPropertyMinValue
* @see #getIntPropertyMaxValue
* @see #getUnicodeVersion
* @stable ICU 2.4
*/
// for BiDiBase.java
public static int getIntPropertyValue(int ch, int type) {
    // Delegates to the shared character-property singleton.
    final UCharacterProperty properties = UCharacterProperty.INSTANCE;
    return properties.getIntPropertyValue(ch, type);
}
// private constructor -----------------------------------------------
/**
* Private constructor to prevent instantiation
*/
private UCharacter() {
    // intentionally empty: the class only exposes static members
}
/*
* Copied from UCharacterEnums.java
*/
/**
* Character type Mn (non-spacing mark).
* @stable ICU 2.1
*/
public static final byte NON_SPACING_MARK = 6;
/**
* Character type Me (enclosing mark).
* @stable ICU 2.1
*/
public static final byte ENCLOSING_MARK = 7;
/**
* Character type Mc (combining spacing mark).
* @stable ICU 2.1
*/
public static final byte COMBINING_SPACING_MARK = 8;
/**
* Character type count: the number of character type values.
* @stable ICU 2.1
*/
public static final byte CHAR_CATEGORY_COUNT = 30;
/**
* Directional type R (right-to-left).
* @stable ICU 2.1
*/
public static final int RIGHT_TO_LEFT = 1;
/**
* Directional type AL (right-to-left Arabic).
* @stable ICU 2.1
*/
public static final int RIGHT_TO_LEFT_ARABIC = 13;
}

View file

@ -1,313 +0,0 @@
/*
* Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 1996-2014, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package sun.text.normalizer;
import java.text.CharacterIterator;
/**
* Abstract class that defines an API for iteration on text objects. This is an
* interface for forward and backward iteration and random access into a text
* object. Forward iteration is done with post-increment and backward iteration
* is done with pre-decrement semantics, while the
* <code>java.text.CharacterIterator</code> interface methods provided forward
* iteration with "pre-increment" and backward iteration with pre-decrement
* semantics. This API is more efficient for forward iteration over code points.
* The other major difference is that this API can do both code unit and code point
* iteration, <code>java.text.CharacterIterator</code> can only iterate over
* code units and is limited to BMP (0 - 0xFFFF)
* @author Ram
* @stable ICU 2.4
*/
public abstract class UCharacterIterator
implements Cloneable {
/**
* Protected default constructor for the subclasses
* @stable ICU 2.4
*/
protected UCharacterIterator() {
    // nothing to initialize at this level
}
/**
* Sentinel returned by the iteration methods when an end of the UTF16 text
* has been reached. Its value, -1, lies outside the {@code char} range, so it
* cannot be confused with a UTF-16 code unit.
* Moved from UForwardCharacterIterator.java
* @stable ICU 2.4
*/
public static final int DONE = -1;
// static final methods ----------------------------------------------------
/**
* Returns a <code>UCharacterIterator</code> object given a
* source string.
* @param source a string
* @return UCharacterIterator object
* @exception IllegalArgumentException if the argument is null
* @stable ICU 2.4
*/
public static final UCharacterIterator getInstance(String source){
    // String sources are wrapped in a ReplaceableUCharacterIterator.
    final UCharacterIterator iterator = new ReplaceableUCharacterIterator(source);
    return iterator;
}
/**
* Returns a <code>UCharacterIterator</code> object given a
* source StringBuffer.
* @param source a string buffer of UTF-16 code units
* @return UCharacterIterator object
* @exception IllegalArgumentException if the argument is null
* @stable ICU 2.4
*/
public static final UCharacterIterator getInstance(StringBuffer source){
    // StringBuffer sources are wrapped in a ReplaceableUCharacterIterator.
    final UCharacterIterator iterator = new ReplaceableUCharacterIterator(source);
    return iterator;
}
/**
* Returns a <code>UCharacterIterator</code> object given a
* CharacterIterator.
* @param source a valid CharacterIterator object.
* @return UCharacterIterator object
* @exception IllegalArgumentException if the argument is null
* @stable ICU 2.4
*/
public static final UCharacterIterator getInstance(CharacterIterator source){
    // java.text iterators are adapted through CharacterIteratorWrapper.
    final UCharacterIterator iterator = new CharacterIteratorWrapper(source);
    return iterator;
}
// public methods ----------------------------------------------------------
/**
* Returns the length of the text being iterated over.
* {@link #getText()} allocates a {@code char[]} of exactly this size before
* asking the iterator to fill it.
* @return length of the text
* @stable ICU 2.4
*/
public abstract int getLength();
/**
* Gets the current index in text.
* {@link #moveCodePointIndex(int)} returns this value after it has moved
* the iterator.
* @return current index in text.
* @stable ICU 2.4
*/
public abstract int getIndex();
/**
* Returns the UTF16 code unit at the current index, and then advances to
* the next code unit (post-increment semantics). If the index is out of
* range, {@link #DONE} is returned, and the iterator is reset to the limit
* of the text.
* {@link #nextCodePoint()} is built on this method.
* @return the next UTF16 code unit, or DONE if the index is at the limit
* of the text.
* @stable ICU 2.4
*/
public abstract int next();
/**
* Returns the code point at index, and increments to the next code
* point (post-increment semantics). If index does not point to a
* valid surrogate pair, the behavior is the same as
* <code>next()</code>. Otherwise the iterator is incremented past
* the surrogate pair, and the code point represented by the pair
* is returned.
* @return the next codepoint in text, or DONE if the index is at
* the limit of the text.
* @stable ICU 2.4
*/
public int nextCodePoint(){
    final int first = next();
    if (!UTF16.isLeadSurrogate((char) first)) {
        // ordinary code unit (or DONE): nothing to combine
        return first;
    }

    final int second = next();
    if (UTF16.isTrailSurrogate((char) second)) {
        return UCharacterProperty.getRawSupplementary((char) first,
                                                      (char) second);
    }
    if (second != DONE) {
        // unmatched surrogate so back out
        previous();
    }
    return first;
}
/**
* Decrements to the position of the previous code unit in the
* text, and returns it (pre-decrement semantics). If the
* resulting index is less than 0, the index is reset to 0 and
* {@link #DONE} is returned.
* {@link #previousCodePoint()} is built on this method.
* @return the previous code unit in the text, or DONE if the new
* index is before the start of the text.
* @stable ICU 2.4
*/
public abstract int previous();
/**
* Retreat to the start of the previous code point in the text,
* and return it (pre-decrement semantics). If the index is not
* preceded by a valid surrogate pair, the behavior is the same
* as <code>previous()</code>. Otherwise the iterator is
* decremented to the start of the surrogate pair, and the code
* point represented by the pair is returned.
* @return the previous code point in the text, or DONE if the new
* index is before the start of the text.
* @stable ICU 2.4
*/
public int previousCodePoint(){
    final int last = previous();
    if (!UTF16.isTrailSurrogate((char) last)) {
        // ordinary code unit (or DONE): nothing to combine
        return last;
    }

    final int beforeLast = previous();
    if (UTF16.isLeadSurrogate((char) beforeLast)) {
        return UCharacterProperty.getRawSupplementary((char) beforeLast,
                                                      (char) last);
    }
    if (beforeLast != DONE) {
        // unmatched trail surrogate so back out
        next();
    }
    return last;
}
/**
* Sets the index to the specified index in the text.
* {@link #setToStart()} calls this method with 0.
* @param index the index within the text.
* @exception IndexOutOfBoundsException is thrown if an invalid index is
* supplied
* @stable ICU 2.4
*/
public abstract void setIndex(int index);
/**
* Sets the current index to the start.
* @stable ICU 2.4
*/
public void setToStart() {
    final int start = 0;
    setIndex(start);
}
/**
* Fills the buffer with the underlying text storage of the iterator.
* If the buffer capacity is not enough, an exception is thrown. The capacity
* of the fill-in buffer should be at least equal to the length of the text in
* the iterator, as obtained by calling <code>getLength()</code>.
* <b>Usage:</b>
*
* <pre>{@code
* UCharacterIterator iter = UCharacterIterator.getInstance(text);
* char[] buf = new char[iter.getLength()];
* iter.getText(buf);
*
* OR
* char[] buf= new char[1];
* int len = 0;
* for(;;){
* try{
* len = iter.getText(buf);
* break;
* }catch(IndexOutOfBoundsException e){
* buf = new char[iter.getLength()];
* }
* }
* }</pre>
*
* @param fillIn an array of chars to fill with the underlying UTF-16 code
* units.
* @param offset the position within the array to start putting the data.
* @return the number of code units added to fillIn, as a convenience
* @exception IndexOutOfBoundsException exception if there is not enough
* room after offset in the array, or if offset < 0.
* @stable ICU 2.4
*/
public abstract int getText(char[] fillIn, int offset);
/**
* Convenience override for <code>getText(char[], int)</code> that provides
* an offset of 0.
* @param fillIn an array of chars to fill with the underlying UTF-16 code
* units.
* @return the number of code units added to fillIn, as a convenience
* @exception IndexOutOfBoundsException exception if there is not enough
* room in the array.
* @stable ICU 2.4
*/
public final int getText(char[] fillIn) {
    // Same as the two-argument form, writing from the start of the array.
    final int unitsCopied = getText(fillIn, 0);
    return unitsCopied;
}
/**
* Convenience method for returning the underlying text storage as a string
* @return the underlying text storage in the iterator as a string
* @stable ICU 2.4
*/
public String getText() {
    // Size the buffer from getLength() so the fill below has exactly enough room.
    final int length = getLength();
    final char[] buffer = new char[length];
    getText(buffer);
    return new String(buffer);
}
/**
* Moves the current position by the number of code points
* specified, either forward or backward depending on the sign of
* delta (positive or negative respectively). If the current index
* is at a trail surrogate then the first adjustment is by code
* unit, and the remaining adjustments are by code points. If the
* resulting index would be less than zero, the index is set to
* zero, and if the resulting index would be greater than limit,
* the index is set to limit.
* @param delta the number of code points to move the current index.
* @return the new index
* @exception IndexOutOfBoundsException is thrown if an invalid delta is
* supplied
* @stable ICU 2.4
*
*/
public int moveCodePointIndex(int delta){
    int remaining = delta;
    if (remaining > 0) {
        // step forward one code point at a time until done or out of text
        for (; remaining > 0; remaining--) {
            if (nextCodePoint() == DONE) {
                break;
            }
        }
    } else {
        // step backward one code point at a time until done or out of text
        for (; remaining < 0; remaining++) {
            if (previousCodePoint() == DONE) {
                break;
            }
        }
    }

    // Steps left over mean the text ran out before the move completed.
    if (remaining != 0) {
        throw new IndexOutOfBoundsException();
    }
    return getIndex();
}
/**
* Creates a copy of this iterator, independent from other iterators.
* If it is not possible to clone the iterator, the
* <code>CloneNotSupportedException</code> from <code>super.clone()</code>
* is propagated to the caller.
* @return copy of this iterator
* @stable ICU 2.4
*/
@Override
public Object clone() throws CloneNotSupportedException{
    // Copying is left entirely to super.clone().
    final Object copy = super.clone();
    return copy;
}
}

View file

@ -1,607 +0,0 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 1996-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Iterator;
import java.util.MissingResourceException;
import sun.text.normalizer.UCharacter.HangulSyllableType;
import sun.text.normalizer.UCharacter.NumericType;
/**
* <p>Internal class used for Unicode character property database.</p>
* <p>This class stores binary data read from uprops.icu.
* It does not have the capability to parse the data into more high-level
* information. It only returns bytes of information when required.</p>
* <p>Due to the form most commonly used for retrieval, array of char is used
* to store the binary data.</p>
* <p>UCharacterPropertyDB also contains information on accessing indexes to
* significant points in the binary data.</p>
* <p>Responsibility for molding the binary data into a more meaningful form lies with
* <a href=UCharacter.html>UCharacter</a>.</p>
* @author Syn Wee Quek
* @since release 2.1, february 1st 2002
*/
final class UCharacterProperty
{
// public data members -----------------------------------------------
/*
* public singleton instance
*/
public static final UCharacterProperty INSTANCE;
/**
* Trie data
*/
public Trie2_16 m_trie_;
/**
* Unicode version
*/
public VersionInfo m_unicodeVersion_;
/**
* Character type mask
*/
public static final int TYPE_MASK = 0x1F;
// uprops.h enum UPropertySource --------------------------------------- ***
/** From uchar.c/uprops.icu main trie */
public static final int SRC_CHAR=1;
/** From uchar.c/uprops.icu properties vectors trie */
public static final int SRC_PROPSVEC=2;
/** From ubidi_props.c/ubidi.icu */
public static final int SRC_BIDI=5;
/** From normalizer2impl.cpp/nfc.nrm */
public static final int SRC_NFC=8;
/** From normalizer2impl.cpp/nfkc.nrm */
public static final int SRC_NFKC=9;
// public methods ----------------------------------------------------
/**
 * Looks up the main properties word of a code point in the main trie.
 * @param ch code point whose property value is to be retrieved
 * @return the value stored in {@code m_trie_} for {@code ch}
 */
public final int getProperty(int ch)
{
    final int props = m_trie_.get(ch);
    return props;
}
/**
 * Reads one word of the additional ("properties vectors") data for a code point.
 * Java version of C u_getUnicodeProperties().
 * The additional trie yields the index of the code point's first vector word;
 * {@code column} selects the word relative to that index.
 * @param codepoint code point whose additional properties are to be retrieved
 * @param column the column index; must not be negative (checked by assertion only)
 * @return the vector word, or 0 when {@code column} is not below the number of
 *         additional columns
 */
public int getAdditional(int codepoint, int column) {
    assert column >= 0;
    if (column < m_additionalColumnsCount_) {
        final int rowStart = m_additionalTrie_.get(codepoint);
        return m_additionalVectors_[rowStart + column];
    }
    return 0;
}
/**
 * <p>Get the "age" of the code point.</p>
 * <p>The "age" is the Unicode version when the code point was first
 * designated (as a non-character or for Private Use) or assigned a
 * character.</p>
 * <p>The value is taken from the top bits of additional-properties column 0:
 * after shifting by {@code AGE_SHIFT_}, the upper nibble is used as the major
 * and the lower nibble as the minor version number.</p>
 * <p>This API does not check the validity of the codepoint.</p>
 * @param codepoint The code point.
 * @return the Unicode version number (major, minor, 0, 0)
 */
public VersionInfo getAge(int codepoint)
{
    final int packed = getAdditional(codepoint, 0) >> AGE_SHIFT_;
    final int major = (packed >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_;
    final int minor = packed & LAST_NIBBLE_MASK_;
    return VersionInfo.getInstance(major, minor, 0, 0);
}
// int-value and enumerated properties --------------------------------- ***
/**
 * Extracts the character type bits from the main properties word.
 * @param c code point to look up
 * @return the main property value of {@code c} masked with {@code TYPE_MASK}
 */
public int getType(int c) {
    final int props = getProperty(c);
    return props & TYPE_MASK;
}
/*
* Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
* Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
*/
private static final int /* UHangulSyllableType */ gcbToHst[]={
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_OTHER */
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CONTROL */
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CR */
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_EXTEND */
HangulSyllableType.LEADING_JAMO, /* U_GCB_L */
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_LF */
HangulSyllableType.LV_SYLLABLE, /* U_GCB_LV */
HangulSyllableType.LVT_SYLLABLE, /* U_GCB_LVT */
HangulSyllableType.TRAILING_JAMO, /* U_GCB_T */
HangulSyllableType.VOWEL_JAMO /* U_GCB_V */
/*
* Omit GCB values beyond what we need for hst.
* The code below checks for the array length.
*/
};
/*
 * Describes how to read one integer-valued property.
 * Either a (column, mask, shift) triple addressing bits of an additional
 * properties vector word, or just a "source" id, in which case mask is 0.
 */
private class IntProperty {
    int column; // SRC_PROPSVEC column, or "source" if mask==0
    int mask;
    int shift;

    /* Property stored directly in a properties vector word. */
    IntProperty(int column, int mask, int shift) {
        this.column = column;
        this.mask = mask;
        this.shift = shift;
    }

    /* Property identified only by its source; mask and shift stay 0. */
    IntProperty(int source) {
        this(source, 0, 0);
    }

    /* Reads the vector word, isolates the masked bits and shifts them down. */
    int getValue(int c) {
        // systematic, directly stored properties
        final int word = getAdditional(c, column);
        return (word & mask) >>> shift;
    }
}
/* Integer property whose source is SRC_BIDI; subclasses supply getValue(). */
private class BiDiIntProperty extends IntProperty {

    BiDiIntProperty() {
        super(SRC_BIDI);
    }

}
/* Integer property that is identified only by the given source id. */
private class CombiningClassIntProperty extends IntProperty {

    CombiningClassIntProperty(int source) {
        super(source);
    }

}
/*
 * UCHAR_NF*_QUICK_CHECK properties: a source-only IntProperty that also
 * records which property it stands for and that property's maximum value.
 */
private class NormQuickCheckIntProperty extends IntProperty {
    int which;
    int max;

    NormQuickCheckIntProperty(int source, int which, int max) {
        super(source);
        this.max = max;
        this.which = which;
    }
}
/*
 * BIDI_PAIRED_BRACKET_TYPE: the value is not read from the properties
 * vectors; getValue() is overridden to ask UBiDiProps instead.
 */
private IntProperty intProp = new BiDiIntProperty() {
    int getValue(int c) {
        return UBiDiProps.INSTANCE.getPairedBracketType(c);
    }
};
/**
 * Returns the value of an integer property for a code point.
 * Only {@code BIDI_PAIRED_BRACKET_TYPE} is handled here.
 * @param c code point to look up
 * @param which property selector
 * @return the property value, or 0 for any other (undefined) selector
 */
public int getIntPropertyValue(int c, int which) {
    return (which == BIDI_PAIRED_BRACKET_TYPE)
            ? intProp.getValue(c)
            : 0; // undefined
}
/**
 * Combines a surrogate pair into a supplementary code point.<br>
 * For internal use only: the arguments are not checked for being valid
 * surrogates.
 * @param lead lead surrogate character
 * @param trail trailing surrogate character
 * @return code point of the supplementary character
 */
public static int getRawSupplementary(char lead, char trail)
{
    final int shiftedLead = lead << LEAD_SURROGATE_SHIFT_;
    return shiftedLead + trail + SURROGATE_OFFSET_;
}
/**
 * Builds the single-bit mask for a character type.
 * @param type character type, used as the bit number
 * @return {@code 1 << type}
 */
public static final int getMask(int type)
{
    final int bit = 1 << type;
    return bit;
}
/**
 * Returns the digit value (10..35) of a Latin letter used as a digit.
 * Four ranges are recognized: U+0041..U+005A, U+0061..U+007A,
 * U+FF21..U+FF3A and U+FF41..U+FF5A; the first code point of each range
 * maps to 10. This method assumes that the other digit characters are
 * checked by the calling method.
 * @param ch character to test
 * @return -1 if ch is outside all four ranges, otherwise its
 *         corresponding digit value
 */
public static int getEuropeanDigit(int ch) {
    final int rangeStart;
    if (ch >= 0x41 && ch <= 0x5a) {
        rangeStart = 0x41;
    } else if (ch >= 0x61 && ch <= 0x7a) {
        rangeStart = 0x61;
    } else if (ch >= 0xff21 && ch <= 0xff3a) {
        rangeStart = 0xff21;
    } else if (ch >= 0xff41 && ch <= 0xff5a) {
        rangeStart = 0xff41;
    } else {
        return -1;
    }
    return ch + 10 - rangeStart;
}
/**
 * Returns the decimal digit value of a code point.
 * The numeric type/value field of the main properties word is rebased by
 * {@code NTV_DECIMAL_START_}; results above 9 are rejected.
 * @param c code point to look up
 * @return the rebased value if it is at most 9, otherwise -1
 */
public int digit(int c) {
    final int ntv = getNumericTypeValue(getProperty(c));
    final int candidate = ntv - NTV_DECIMAL_START_;
    return (candidate <= 9) ? candidate : -1;
}
// protected variables -----------------------------------------------
/**
* Extra property trie
*/
Trie2_16 m_additionalTrie_;
/**
* Extra property vectors, 1st column for age and second for binary
* properties.
*/
int m_additionalVectors_[];
/**
* Number of additional columns
*/
int m_additionalColumnsCount_;
/**
* Maximum values for block, bits used as in vector word
* 0
*/
int m_maxBlockScriptValue_;
/**
* Maximum values for script, bits used as in vector word
* 0
*/
int m_maxJTGValue_;
/**
* Script_Extensions data
*/
public char[] m_scriptExtensions_;
// private variables -------------------------------------------------
/**
* Default name of the datafile
*/
private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";
/**
* Shift value for lead surrogate to form a supplementary character.
*/
private static final int LEAD_SURROGATE_SHIFT_ = 10;
/**
* Offset to add to combined surrogate pair to avoid masking.
*/
private static final int SURROGATE_OFFSET_ =
UTF16.SUPPLEMENTARY_MIN_VALUE -
(UTF16.SURROGATE_MIN_VALUE <<
LEAD_SURROGATE_SHIFT_) -
UTF16.TRAIL_SURROGATE_MIN_VALUE;
// property data constants -------------------------------------------------
/**
* Numeric types and values in the main properties words.
*/
private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6;
/* Extracts the numeric type/value field from a main properties word. */
private static final int getNumericTypeValue(int props) {
    final int ntv = props >> NUMERIC_TYPE_VALUE_SHIFT_;
    return ntv;
}
/* constants for the storage form of numeric types and values */
/** No numeric value. */
private static final int NTV_NONE_ = 0;
/** Decimal digits: nv=0..9 */
private static final int NTV_DECIMAL_START_ = 1;
/** Other digits: nv=0..9 */
private static final int NTV_DIGIT_START_ = 11;
/** Small integers: nv=0..154 */
private static final int NTV_NUMERIC_START_ = 21;
/*
 * Classifies a stored numeric type/value by the range it falls into:
 * NTV_NONE_ -> NONE, below NTV_DIGIT_START_ -> DECIMAL,
 * below NTV_NUMERIC_START_ -> DIGIT, everything else -> NUMERIC.
 */
private static final int ntvGetType(int ntv) {
    if (ntv == NTV_NONE_) {
        return NumericType.NONE;
    }
    if (ntv < NTV_DIGIT_START_) {
        return NumericType.DECIMAL;
    }
    if (ntv < NTV_NUMERIC_START_) {
        return NumericType.DIGIT;
    }
    return NumericType.NUMERIC;
}
/*
* Properties in vector word 0
* Bits
* 31..24 DerivedAge version major/minor one nibble each
* 23..22 3..1: Bits 7..0 = Script_Extensions index
* 3: Script value from Script_Extensions
* 2: Script=Inherited
* 1: Script=Common
* 0: Script=bits 7..0
* 21..20 reserved
* 19..17 East Asian Width
* 16.. 8 UBlockCode
* 7.. 0 UScriptCode
*/
/**
* Script_Extensions: mask includes Script
*/
public static final int SCRIPT_X_MASK = 0x00c000ff;
//private static final int SCRIPT_X_SHIFT = 22;
/**
* Integer properties mask and shift values for East Asian cell width.
* Equivalent to icu4c UPROPS_EA_MASK
*/
private static final int EAST_ASIAN_MASK_ = 0x000e0000;
/**
* Integer properties mask and shift values for East Asian cell width.
* Equivalent to icu4c UPROPS_EA_SHIFT
*/
private static final int EAST_ASIAN_SHIFT_ = 17;
/**
* Integer properties mask and shift values for blocks.
* Equivalent to icu4c UPROPS_BLOCK_MASK
*/
private static final int BLOCK_MASK_ = 0x0001ff00;
/**
* Integer properties mask and shift values for blocks.
* Equivalent to icu4c UPROPS_BLOCK_SHIFT
*/
private static final int BLOCK_SHIFT_ = 8;
/**
* Integer properties mask and shift values for scripts.
* Equivalent to icu4c UPROPS_SCRIPT_MASK
*/
public static final int SCRIPT_MASK_ = 0x000000ff;
/**
* Additional properties used in internal trie data
*/
/*
* Properties in vector word 1
* Each bit encodes one binary property.
* The following constants represent the bit number, use 1<<UPROPS_XYZ.
* UPROPS_BINARY_1_TOP<=32!
*
* Keep this list of property enums in sync with
* propListNames[] in icu/source/tools/genprops/props2.c!
*
* ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
*/
private static final int WHITE_SPACE_PROPERTY_ = 0;
private static final int DASH_PROPERTY_ = 1;
private static final int HYPHEN_PROPERTY_ = 2;
private static final int QUOTATION_MARK_PROPERTY_ = 3;
private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4;
private static final int MATH_PROPERTY_ = 5;
private static final int HEX_DIGIT_PROPERTY_ = 6;
private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7;
private static final int ALPHABETIC_PROPERTY_ = 8;
private static final int IDEOGRAPHIC_PROPERTY_ = 9;
private static final int DIACRITIC_PROPERTY_ = 10;
private static final int EXTENDER_PROPERTY_ = 11;
private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12;
private static final int GRAPHEME_EXTEND_PROPERTY_ = 13;
private static final int GRAPHEME_LINK_PROPERTY_ = 14;
private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15;
private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16;
private static final int RADICAL_PROPERTY_ = 17;
private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18;
private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19;
private static final int DEPRECATED_PROPERTY_ = 20;
private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21;
private static final int XID_START_PROPERTY_ = 22;
private static final int XID_CONTINUE_PROPERTY_ = 23;
private static final int ID_START_PROPERTY_ = 24;
private static final int ID_CONTINUE_PROPERTY_ = 25;
private static final int GRAPHEME_BASE_PROPERTY_ = 26;
private static final int S_TERM_PROPERTY_ = 27;
private static final int VARIATION_SELECTOR_PROPERTY_ = 28;
private static final int PATTERN_SYNTAX = 29; /* new in ICU 3.4 and Unicode 4.1 */
private static final int PATTERN_WHITE_SPACE = 30;
/*
* Properties in vector word 2
* Bits
* 31..26 reserved
* 25..20 Line Break
* 19..15 Sentence Break
* 14..10 Word Break
* 9.. 5 Grapheme Cluster Break
* 4.. 0 Decomposition Type
*/
private static final int LB_MASK = 0x03f00000;
private static final int LB_SHIFT = 20;
private static final int SB_MASK = 0x000f8000;
private static final int SB_SHIFT = 15;
private static final int WB_MASK = 0x00007c00;
private static final int WB_SHIFT = 10;
private static final int GCB_MASK = 0x000003e0;
private static final int GCB_SHIFT = 5;
/**
* Integer properties mask for decomposition type.
* Equivalent to icu4c UPROPS_DT_MASK.
*/
private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;
/**
* First nibble shift
*/
private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
/**
* Second nibble mask
*/
private static final int LAST_NIBBLE_MASK_ = 0xF;
/**
* Age value shift
*/
private static final int AGE_SHIFT_ = 24;
// private constructors --------------------------------------------------
/**
* Constructor. Loads the data file named by DATA_FILE_NAME_ through ICUBinary,
* checks its header, and then reads, strictly in file order: the index values,
* the main properties trie, (if there are additional columns) the
* additional-properties trie and its vectors, and the Script_Extensions data.
* The reads consume the buffer sequentially, so statement order matters.
* @exception IOException thrown when data reading fails or data corrupted
*/
private UCharacterProperty() throws IOException
{
// jar access
ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME_);
// Validate the header (format id and version acceptance) and keep the data version.
m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable());
// Read or skip the 16 indexes.
// The offsets are multiplied by 4 before being used as byte counts below,
// so they appear to be counted in 32-bit units.
int propertyOffset = bytes.getInt();
/* exceptionOffset = */ bytes.getInt();
/* caseOffset = */ bytes.getInt();
int additionalOffset = bytes.getInt();
int additionalVectorsOffset = bytes.getInt();
m_additionalColumnsCount_ = bytes.getInt();
int scriptExtensionsOffset = bytes.getInt();
int reservedOffset7 = bytes.getInt();
/* reservedOffset8 = */ bytes.getInt();
/* dataTopOffset = */ bytes.getInt();
m_maxBlockScriptValue_ = bytes.getInt();
m_maxJTGValue_ = bytes.getInt();
// 12 of the 16 index ints have been consumed above; skip the remaining 4 (4 bytes each).
ICUBinary.skipBytes(bytes, (16 - 12) << 2);
// read the main properties trie
m_trie_ = Trie2_16.createFromSerialized(bytes);
// Space reserved for the main trie: from the end of the 16 indexes up to propertyOffset.
int expectedTrieLength = (propertyOffset - 16) * 4;
int trieLength = m_trie_.getSerializedLength();
if(trieLength > expectedTrieLength) {
throw new IOException("uprops.icu: not enough bytes for main trie");
}
// skip padding after trie bytes
ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);
// skip unused intervening data structures
ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4);
if(m_additionalColumnsCount_ > 0) {
// reads the additional property block
m_additionalTrie_ = Trie2_16.createFromSerialized(bytes);
// Space reserved for the additional trie: from additionalOffset up to additionalVectorsOffset.
expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4;
trieLength = m_additionalTrie_.getSerializedLength();
if(trieLength > expectedTrieLength) {
throw new IOException("uprops.icu: not enough bytes for additional-properties trie");
}
// skip padding after trie bytes
ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);
// additional properties
// One int per vector word, from additionalVectorsOffset up to scriptExtensionsOffset.
int size = scriptExtensionsOffset - additionalVectorsOffset;
m_additionalVectors_ = new int[size];
for (int i = 0; i < size; i ++) {
m_additionalVectors_[i] = bytes.getInt();
}
}
// Script_Extensions
// Two chars are read per index unit between scriptExtensionsOffset and reservedOffset7.
int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2;
if(numChars > 0) {
m_scriptExtensions_ = new char[numChars];
for(int i = 0; i < numChars; ++i) {
m_scriptExtensions_[i] = bytes.getChar();
}
}
}
/* Accepts data only when the first byte of its version array is 7. */
private static final class IsAcceptable implements ICUBinary.Authenticate {
    // @Override when we switch to Java 6
    public boolean isDataVersionAcceptable(byte version[]) {
        final byte leading = version[0];
        return leading == 7;
    }
}
private static final int DATA_FORMAT = 0x5550726F; // "UPro"
/**
 * Adds to {@code set} the start code point of each same-value range of the
 * properties vectors trie, stopping at the first range that is flagged as a
 * lead-surrogate range.
 * @param set the set receiving the range starts
 */
public void upropsvec_addPropertyStarts(UnicodeSet set) {
    if (m_additionalColumnsCount_ <= 0) {
        /* without additional columns the properties vectors trie may not be there at all */
        return;
    }
    final Iterator<Trie2.Range> ranges = m_additionalTrie_.iterator();
    while (ranges.hasNext()) {
        final Trie2.Range current = ranges.next();
        if (current.leadSurrogate) {
            break;
        }
        set.add(current.startCodePoint);
    }
}
// This static initializer block must be placed after
// other static member initialization.
// Creates the singleton. A failure to read the data file is reported as a
// MissingResourceException naming the data file; the original IOException is
// attached as its cause so the underlying I/O failure is not lost.
static {
    try {
        INSTANCE = new UCharacterProperty();
    }
    catch (IOException e) {
        MissingResourceException missing =
            new MissingResourceException(e.getMessage(), DATA_FILE_NAME_, "");
        // MissingResourceException has no public constructor taking a cause.
        missing.initCause(e);
        throw missing;
    }
}
// Moved from UProperty.java
/**
* Enumerated property Bidi_Paired_Bracket_Type (new in Unicode 6.3).
* Used in UAX #9: Unicode Bidirectional Algorithm
* (http://www.unicode.org/reports/tr9/)
* Returns UCharacter.BidiPairedBracketType values.
* @stable ICU 52
*/
public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015;
}

View file

@ -1,616 +0,0 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
*******************************************************************************
* Copyright (C) 1996-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
/**
* <p>Standalone utility class providing UTF16 character conversions and
* indexing conversions.
* <p>Code that uses strings alone rarely needs modification.
* By design, UTF-16 does not allow overlap, so searching for strings is a safe
* operation. Similarly, concatenation is always safe. Substringing is safe if
* the start and end are both on UTF-32 boundaries. In normal code, the values
* for start and end are on those boundaries, since they arose from operations
* like searching. If not, the nearest UTF-32 boundaries can be determined
* using <code>bounds()</code>.
* <strong>Examples:</strong>
* <p>The following examples illustrate use of some of these methods.
* <pre>{@code
* // iteration forwards: Original
* for (int i = 0; i < s.length(); ++i) {
* char ch = s.charAt(i);
* doSomethingWith(ch);
* }
*
* // iteration forwards: Changes for UTF-32
* int ch;
* for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
* ch = UTF16.charAt(s, i);
* doSomethingWith(ch);
* }
*
* // iteration backwards: Original
* for (int i = s.length() - 1; i >= 0; --i) {
* char ch = s.charAt(i);
* doSomethingWith(ch);
* }
*
* // iteration backwards: Changes for UTF-32
* int ch;
* for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) {
* ch = UTF16.charAt(s, i);
* doSomethingWith(ch);
* }
* }</pre>
* <strong>Notes:</strong>
* <ul>
* <li>
* <strong>Naming:</strong> For clarity, High and Low surrogates are called
* <code>Lead</code> and <code>Trail</code> in the API, which gives a better
* sense of their ordering in a string. <code>offset16</code> and
* <code>offset32</code> are used to distinguish offsets to UTF-16
* boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is
* used to contain UTF-32 characters, as opposed to <code>char16</code>,
* which is a UTF-16 code unit.
* </li>
* <li>
* <strong>Roundtripping Offsets:</strong> You can always roundtrip from a
* UTF-32 offset to a UTF-16 offset and back. Because of the difference in
* structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and
* back if and only if <code>bounds(string, offset16) != TRAIL</code>.
* </li>
* <li>
* <strong>Exceptions:</strong> The error checking will throw an exception
* if indices are out of bounds. Other than that, all methods will
* behave reasonably, even if unmatched surrogates or out-of-bounds UTF-32
* values are present. <code>UCharacter.isLegal()</code> can be used to check
* for validity if desired.
* </li>
* <li>
* <strong>Unmatched Surrogates:</strong> If the string contains unmatched
* surrogates, then these are counted as one UTF-32 value. This matches
* their iteration behavior, which is vital. It also matches common display
* practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5).
* </li>
* <li>
* <strong>Optimization:</strong> The method implementations may need
* optimization if the compiler doesn't fold static final methods. Since
* surrogate pairs will form an exceedingly small percentage of all the text
* in the world, the singleton case should always be optimized for.
* </li>
* </ul>
* @author Mark Davis, with help from Markus Scherer
* @stable ICU 2.1
*/
public final class UTF16
{
// public variables ---------------------------------------------------
/**
* The lowest Unicode code point value.
* @stable ICU 2.1
*/
public static final int CODEPOINT_MIN_VALUE = 0;
/**
* The highest Unicode code point value (scalar value) according to the
* Unicode Standard.
* @stable ICU 2.1
*/
public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
/**
* The minimum value for Supplementary code points
* @stable ICU 2.1
*/
public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
/**
* Lead surrogate minimum value
* @stable ICU 2.1
*/
public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
/**
* Trail surrogate minimum value
* @stable ICU 2.1
*/
public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
/**
* Lead surrogate maximum value
* @stable ICU 2.1
*/
public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
/**
* Trail surrogate maximum value
* @stable ICU 2.1
*/
public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
/**
* Surrogate minimum value
* @stable ICU 2.1
*/
public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
/**
* Lead surrogate bitmask
*/
private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;
/**
* Trail surrogate bitmask
*/
private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;
/**
* Surrogate bitmask
*/
private static final int SURROGATE_BITMASK = 0xFFFFF800;
/**
* Lead surrogate bits
*/
private static final int LEAD_SURROGATE_BITS = 0xD800;
/**
* Trail surrogate bits
*/
private static final int TRAIL_SURROGATE_BITS = 0xDC00;
/**
* Surrogate bits
*/
private static final int SURROGATE_BITS = 0xD800;
// constructor --------------------------------------------------------
// /CLOVER:OFF
/**
 * Private constructor: this class only has static members and is not
 * meant to be instantiated.
 */
private UTF16() {
    // intentionally empty
}
// /CLOVER:ON
// public method ------------------------------------------------------
/**
 * Extract a single UTF-32 value from a string.
 * Code units below the lead surrogate range are returned directly; anything
 * else is resolved by looking at the neighbouring code unit. If the char
 * retrieved is part of a surrogate pair, its supplementary character is
 * returned; an unmatched surrogate is returned as is.
 * @param source array of UTF-16 chars
 * @param offset16 UTF-16 offset to the start of the character.
 * @return UTF-32 value for the UTF-32 value that contains the char at
 *         offset16.
 * @exception IndexOutOfBoundsException thrown if offset16 is out of
 *            bounds.
 * @stable ICU 2.1
 */
public static int charAt(String source, int offset16) {
    final char unit = source.charAt(offset16);
    return (unit < LEAD_SURROGATE_MIN_VALUE)
            ? unit
            : _charAt(source, offset16, unit);
}
/*
 * Slow path of charAt(String, int) for code units at or above the lead
 * surrogate range. A lead surrogate is combined with a following trail
 * surrogate, a trail surrogate with a preceding lead surrogate; otherwise
 * the code unit itself is returned.
 */
private static int _charAt(String source, int offset16, char single) {
    if (single > TRAIL_SURROGATE_MAX_VALUE) {
        return single;
    }
    // Convert the UTF-16 surrogate pair if necessary.
    // For simplicity in usage, and because the frequency of pairs is
    // low, look both directions.
    if (single <= LEAD_SURROGATE_MAX_VALUE) {
        final int next = offset16 + 1;
        if (next != source.length()) {
            final char trail = source.charAt(next);
            if (TRAIL_SURROGATE_MIN_VALUE <= trail && trail <= TRAIL_SURROGATE_MAX_VALUE) {
                return UCharacterProperty.getRawSupplementary(single, trail);
            }
        }
        return single; // unmatched lead surrogate
    }
    // single is a trail surrogate, so look backwards
    final int previous = offset16 - 1;
    if (previous >= 0) {
        final char lead = source.charAt(previous);
        if (LEAD_SURROGATE_MIN_VALUE <= lead && lead <= LEAD_SURROGATE_MAX_VALUE) {
            return UCharacterProperty.getRawSupplementary(lead, single);
        }
    }
    return single; // unmatched trail surrogate
}
/**
 * Extract a single UTF-32 value from a character sequence.
 * Code units below the lead surrogate range are returned directly; anything
 * else is resolved by looking at the neighbouring code unit. If the char
 * retrieved is part of a surrogate pair, its supplementary character is
 * returned; an unmatched surrogate is returned as is.
 * @param source array of UTF-16 chars
 * @param offset16 UTF-16 offset to the start of the character.
 * @return UTF-32 value for the UTF-32 value that contains the char at
 *         offset16.
 * @exception IndexOutOfBoundsException thrown if offset16 is out of bounds.
 * @stable ICU 2.1
 */
public static int charAt(CharSequence source, int offset16) {
    final char unit = source.charAt(offset16);
    return (unit < UTF16.LEAD_SURROGATE_MIN_VALUE)
            ? unit
            : _charAt(source, offset16, unit);
}
/*
 * Slow path of charAt(CharSequence, int) for code units at or above the
 * lead surrogate range. A lead surrogate is combined with a following trail
 * surrogate, a trail surrogate with a preceding lead surrogate; otherwise
 * the code unit itself is returned.
 */
private static int _charAt(CharSequence source, int offset16, char single) {
    if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
        return single;
    }
    // Convert the UTF-16 surrogate pair if necessary.
    // For simplicity in usage, and because the frequency of pairs is
    // low, look both directions.
    if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
        final int next = offset16 + 1;
        if (next != source.length()) {
            final char trail = source.charAt(next);
            if (UTF16.TRAIL_SURROGATE_MIN_VALUE <= trail
                    && trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
                return UCharacterProperty.getRawSupplementary(single, trail);
            }
        }
        return single; // unmatched lead surrogate
    }
    // single is a trail surrogate, so look backwards
    final int previous = offset16 - 1;
    if (previous >= 0) {
        final char lead = source.charAt(previous);
        if (UTF16.LEAD_SURROGATE_MIN_VALUE <= lead
                && lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
            return UCharacterProperty.getRawSupplementary(lead, single);
        }
    }
    return single; // unmatched trail surrogate
}
/**
 * Extract a single UTF-32 value from a substring of a char array.
 * If the char retrieved is part of a surrogate pair that lies inside
 * [start, limit), its supplementary character is returned; an unmatched
 * surrogate is returned as is.
 *
 * @param source Array of UTF-16 chars
 * @param start Offset to substring in the source array for analyzing
 * @param limit Offset to substring in the source array for analyzing
 * @param offset16 UTF-16 offset relative to start
 * @return UTF-32 value for the UTF-32 value that contains the char at offset16.
 * @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit.
 * @stable ICU 2.1
 */
public static int charAt(char source[], int start, int limit, int offset16) {
    final int index = offset16 + start;
    if (index < start || index >= limit) {
        throw new ArrayIndexOutOfBoundsException(index);
    }
    final char single = source[index];
    if (!isSurrogate(single)) {
        return single;
    }
    // Convert the UTF-16 surrogate pair if necessary.
    // For simplicity in usage, and because the frequency of pairs is
    // low, look both directions.
    if (single <= LEAD_SURROGATE_MAX_VALUE) {
        final int next = index + 1;
        if (next >= limit) {
            return single;
        }
        final char trail = source[next];
        if (isTrailSurrogate(trail)) {
            return UCharacterProperty.getRawSupplementary(single, trail);
        }
        return single; // unmatched lead surrogate
    }
    // single is a trail surrogate, so look backwards
    if (index == start) {
        return single;
    }
    final char lead = source[index - 1];
    return isLeadSurrogate(lead)
            ? UCharacterProperty.getRawSupplementary(lead, single)
            : single; // unmatched trail surrogate
}
/**
 * Determines how many chars this char32 requires.
 * No validity check is performed on char32.
 * @param char32 the input codepoint.
 * @return 2 if is in supplementary space, otherwise 1.
 * @stable ICU 2.1
 */
public static int getCharCount(int char32)
{
    return (char32 < SUPPLEMENTARY_MIN_VALUE) ? 1 : 2;
}
/**
 * Determines whether the code value is a surrogate, by comparing its
 * high bits with the surrogate bit pattern.
 * @param char16 the input character.
 * @return true if the input character is a surrogate.
 * @stable ICU 2.1
 */
public static boolean isSurrogate(char char16)
{
    final int highBits = char16 & SURROGATE_BITMASK;
    return highBits == SURROGATE_BITS;
}
/**
 * Determines whether the character is a trail surrogate, by comparing its
 * high bits with the trail surrogate bit pattern.
 * @param char16 the input character.
 * @return true if the input character is a trail surrogate.
 * @stable ICU 2.1
 */
public static boolean isTrailSurrogate(char char16)
{
    final int highBits = char16 & TRAIL_SURROGATE_BITMASK;
    return highBits == TRAIL_SURROGATE_BITS;
}
/**
 * Determines whether the character is a lead surrogate, by comparing its
 * high bits with the lead surrogate bit pattern.
 * @param char16 the input character.
 * @return true if the input character is a lead surrogate
 * @stable ICU 2.1
 */
public static boolean isLeadSurrogate(char char16)
{
    final int highBits = char16 & LEAD_SURROGATE_BITMASK;
    return highBits == LEAD_SURROGATE_BITS;
}
/**
 * Returns the lead surrogate.
 * No validity check is performed on char32.
 * @param char32 the input character.
 * @return lead surrogate if the getCharCount(ch) is 2; <br>
 *         and 0 otherwise (note: 0 is not a valid lead surrogate).
 * @stable ICU 2.1
 */
public static char getLeadSurrogate(int char32)
{
    if (char32 < SUPPLEMENTARY_MIN_VALUE) {
        return 0;
    }
    final int shifted = char32 >> LEAD_SURROGATE_SHIFT_;
    return (char) (LEAD_SURROGATE_OFFSET_ + shifted);
}
/**
 * Returns the trail surrogate.
 * No validity check is performed on char32.
 * @param char32 the input character.
 * @return the trail surrogate if the getCharCount(ch) is 2; <br> otherwise
 *         the character itself
 * @stable ICU 2.1
 */
public static char getTrailSurrogate(int char32)
{
    if (char32 < SUPPLEMENTARY_MIN_VALUE) {
        return (char) char32;
    }
    final int lowBits = char32 & TRAIL_SURROGATE_MASK_;
    return (char) (TRAIL_SURROGATE_MIN_VALUE + lowBits);
}
/**
 * Convenience method corresponding to String.valueOf(char). Returns a one
 * or two char string containing the UTF-32 value in UTF16 format.
 * The code point is range-checked and then converted by {@code toString(int)}.
 * @param char32 the input character.
 * @return string value of char32 in UTF16 format
 * @exception IllegalArgumentException thrown if char32 is outside
 *            CODEPOINT_MIN_VALUE..CODEPOINT_MAX_VALUE.
 * @stable ICU 2.1
 */
public static String valueOf(int char32)
{
    final boolean inRange =
            CODEPOINT_MIN_VALUE <= char32 && char32 <= CODEPOINT_MAX_VALUE;
    if (!inRange) {
        throw new IllegalArgumentException("Illegal codepoint");
    }
    return toString(char32);
}
/**
 * Append a single UTF-32 value to the end of a StringBuffer.
 * Supplementary code points are appended as lead and trail surrogate,
 * all others as a single char.
 * @param target the buffer to append to
 * @param char32 value to append.
 * @return the updated StringBuffer
 * @exception IllegalArgumentException thrown when char32 does not lie
 *            within the range of the Unicode codepoints
 * @stable ICU 2.1
 */
public static StringBuffer append(StringBuffer target, int char32)
{
    // Check for irregular values
    final boolean inRange =
            CODEPOINT_MIN_VALUE <= char32 && char32 <= CODEPOINT_MAX_VALUE;
    if (!inRange) {
        throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
    }
    // Write the UTF-16 values
    if (char32 < SUPPLEMENTARY_MIN_VALUE) {
        target.append((char) char32);
        return target;
    }
    final char lead = getLeadSurrogate(char32);
    target.append(lead);
    final char trail = getTrailSurrogate(char32);
    target.append(trail);
    return target;
}
/**
* Shifts offset16 by the argument number of codepoints within a subarray.
* @param source char array
* @param start position of the subarray to be performed on
* @param limit position of the subarray to be performed on
* @param offset16 UTF16 position to shift relative to start
* @param shift32 number of codepoints to shift
* @return new shifted offset16 relative to start
* @exception IndexOutOfBoundsException if the new offset16 is out of
* bounds with respect to the subarray or the subarray bounds
* are out of range.
* @stable ICU 2.1
*/
public static int moveCodePointOffset(char source[], int start, int limit,
int offset16, int shift32)
{
int size = source.length;
// Remaining number of code points still to be skipped; must reach 0.
int count;
char ch;
// Absolute index into source (offset16 is relative to start).
int result = offset16 + start;
// Validate the subarray [start, limit) against the array itself.
if (start < 0 || limit < start) {
throw new StringIndexOutOfBoundsException(start);
}
if (limit > size) {
throw new StringIndexOutOfBoundsException(limit);
}
// The starting position must lie inside the subarray.
if (offset16 < 0 || result > limit) {
throw new StringIndexOutOfBoundsException(offset16);
}
if (shift32 > 0) {
// Quick rejection: every code point occupies at least one char.
// NOTE(review): this compares against the array length, not limit;
// the loop condition and the final count check below still enforce limit.
if (shift32 + result > size) {
throw new StringIndexOutOfBoundsException(result);
}
count = shift32;
// Walk forward one code point per iteration, never crossing limit.
while (result < limit && count > 0)
{
ch = source[result];
// A lead surrogate followed by a trail surrogate (both inside the
// subarray) forms one code point, so step over two chars.
if (isLeadSurrogate(ch) && (result + 1 < limit) &&
isTrailSurrogate(source[result + 1])) {
result++;
}
count--;
result++;
}
} else {
// Zero or negative shift: quick rejection using the same
// "at least one char per code point" reasoning, then walk backward.
if (result + shift32 < start) {
throw new StringIndexOutOfBoundsException(result);
}
for (count = -shift32; count > 0; count--) {
result--;
// Ran off the front of the subarray; count stays non-zero so the
// check after the loop reports the failure.
if (result < start) {
break;
}
ch = source[result];
// Landed on the trail half of a surrogate pair: move back once more
// so that result points at the lead surrogate.
if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) {
result--;
}
}
}
// The subarray was exhausted before shift32 code points were skipped.
if (count != 0) {
throw new StringIndexOutOfBoundsException(shift32);
}
// Convert back to an offset relative to start.
result -= start;
return result;
}
// private data members -------------------------------------------------
/**
 * Number of bits a supplementary code point is shifted right by when its
 * lead surrogate is computed (see getLeadSurrogate).
 */
private static final int LEAD_SURROGATE_SHIFT_ = 10;
/**
 * Mask selecting the low bits of a supplementary code point that are
 * added to TRAIL_SURROGATE_MIN_VALUE to form its trail surrogate
 * (see getTrailSurrogate).
 */
private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;
/**
 * Constant added to (code point >> LEAD_SURROGATE_SHIFT_) to obtain the
 * lead surrogate. It is the first lead surrogate minus the shifted value of
 * the first supplementary code point, so that SUPPLEMENTARY_MIN_VALUE maps
 * to LEAD_SURROGATE_MIN_VALUE.
 */
private static final int LEAD_SURROGATE_OFFSET_ =
LEAD_SURROGATE_MIN_VALUE -
(SUPPLEMENTARY_MIN_VALUE
>> LEAD_SURROGATE_SHIFT_);
// private methods ------------------------------------------------------
/**
* <p>Converts argument code point and returns a String object representing
* the code point's value in UTF16 format.
* <p>This method does not check the validity of the codepoint; the
* results are not guaranteed if an invalid codepoint is passed as
* argument.
* <p>The result is a string whose length is 1 for non-supplementary code
* points, 2 otherwise.
* @param ch code point
* @return string representation of the code point
*/
private static String toString(int ch)
{
    if (ch >= SUPPLEMENTARY_MIN_VALUE) {
        // Supplementary code point: two code units, lead surrogate first.
        return new StringBuilder()
            .append(getLeadSurrogate(ch))
            .append(getTrailSurrogate(ch))
            .toString();
    }
    // Everything else fits in a single char.
    return String.valueOf((char) ch);
}
}

File diff suppressed because it is too large Load diff

View file

@ -1,273 +0,0 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 1996-2011, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.IOException;
import java.util.Locale;
final class Utility {

    /**
     * Converts characters outside the range U+0020 to U+007F to
     * Unicode escapes, and converts backslash to a double backslash.
     * Code points up to U+FFFF are written as a backslash followed by
     * 'u' and 4 hex digits; larger ones as a backslash followed by 'U'
     * and 8 hex digits.
     *
     * @param s the string to escape
     * @return the escaped string
     */
    public static final String escape(String s) {
        StringBuilder buf = new StringBuilder();
        for (int i=0; i<s.length(); ) {
            int c = Character.codePointAt(s, i);
            i += UTF16.getCharCount(c);
            if (c >= ' ' && c <= 0x007F) {
                if (c == '\\') {
                    buf.append("\\\\"); // That is, "\\"
                } else {
                    buf.append((char)c);
                }
            } else {
                boolean four = c <= 0xFFFF;
                buf.append(four ? "\\u" : "\\U");
                buf.append(hex(c, four ? 4 : 8));
            }
        }
        return buf.toString();
    }

    /*
     * Pairs of (escape letter, replacement character).
     * This map must be in ASCENDING ORDER OF THE ESCAPE CODE: the lookup
     * in unescapeAt() stops as soon as it passes the wanted letter.
     */
    private static final char[] UNESCAPE_MAP = {
        /*"   0x22, 0x22 */
        /*'   0x27, 0x27 */
        /*?   0x3F, 0x3F */
        /*\   0x5C, 0x5C */
        /*a*/ 0x61, 0x07,
        /*b*/ 0x62, 0x08,
        /*e*/ 0x65, 0x1b,
        /*f*/ 0x66, 0x0c,
        /*n*/ 0x6E, 0x0a,
        /*r*/ 0x72, 0x0d,
        /*t*/ 0x74, 0x09,
        /*v*/ 0x76, 0x0b
    };

    /**
     * Convert an escape to a 32-bit code point value.  We attempt
     * to parallel the icu4c unescapeAt() function.
     *
     * @param s the string containing the escape sequence
     * @param offset16 an array containing offset to the character
     * <em>after</em> the backslash.  Upon return offset16[0] will
     * be updated to point after the escape sequence; it is left
     * untouched when -1 is returned.
     * @return character value from 0 to 10FFFF, or -1 on error.
     */
    public static int unescapeAt(String s, int[] offset16) {
        int c;
        int result = 0;
        int n = 0;          // number of digits consumed so far
        int minDig = 0;     // 0 means "not a numeric escape"
        int maxDig = 0;
        int bitsPerDigit = 4;
        int dig;
        int i;
        boolean braces = false;

        /* Check that offset is in range */
        int offset = offset16[0];
        int length = s.length();
        if (offset < 0 || offset >= length) {
            return -1;
        }

        /* Fetch first UChar after '\\' */
        c = Character.codePointAt(s, offset);
        offset += UTF16.getCharCount(c);

        /* Convert hexadecimal and octal escapes */
        switch (c) {
        case 'u':
            minDig = maxDig = 4;
            break;
        case 'U':
            minDig = maxDig = 8;
            break;
        case 'x':
            minDig = 1;
            if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) {
                ++offset;
                braces = true;
                maxDig = 8;
            } else {
                maxDig = 2;
            }
            break;
        default:
            dig = UCharacter.digit(c, 8);
            if (dig >= 0) {
                minDig = 1;
                maxDig = 3;
                n = 1; /* Already have first octal digit */
                bitsPerDigit = 3;
                result = dig;
            }
            break;
        }
        if (minDig != 0) {
            while (offset < length && n < maxDig) {
                c = UTF16.charAt(s, offset);
                dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
                if (dig < 0) {
                    break;
                }
                result = (result << bitsPerDigit) | dig;
                offset += UTF16.getCharCount(c);
                ++n;
            }
            if (n < minDig) {
                return -1;
            }
            if (braces) {
                // Inspect the code unit at the current offset instead of the
                // last value of c: when the digit loop stops because maxDig
                // digits were read, c still holds the final digit and a valid
                // closing brace would otherwise be rejected.
                if (offset >= length || s.charAt(offset) != 0x7D /*}*/) {
                    return -1;
                }
                ++offset;
            }
            if (result < 0 || result >= 0x110000) {
                return -1;
            }
            // If an escape sequence specifies a lead surrogate, see
            // if there is a trail surrogate after it, either as an
            // escape or as a literal.  If so, join them up into a
            // supplementary.
            // The explicit <= 0xFFFF guards keep a supplementary value such
            // as U+1D800 from being truncated to a surrogate by the char cast.
            if (offset < length && result <= 0xFFFF &&
                    UTF16.isLeadSurrogate((char) result)) {
                int ahead = offset+1;
                c = s.charAt(offset); // [sic] get 16-bit code unit
                if (c == '\\' && ahead < length) {
                    int o[] = new int[] { ahead };
                    c = unescapeAt(s, o);
                    ahead = o[0];
                }
                if (c <= 0xFFFF && UTF16.isTrailSurrogate((char) c)) {
                    offset = ahead;
                    result = UCharacterProperty.getRawSupplementary(
                                  (char) result, (char) c);
                }
            }
            offset16[0] = offset;
            return result;
        }

        /* Convert C-style escapes in table */
        for (i=0; i<UNESCAPE_MAP.length; i+=2) {
            if (c == UNESCAPE_MAP[i]) {
                offset16[0] = offset;
                return UNESCAPE_MAP[i+1];
            } else if (c < UNESCAPE_MAP[i]) {
                break;
            }
        }

        /* Map \cX to control-X: X & 0x1F */
        if (c == 'c' && offset < length) {
            c = UTF16.charAt(s, offset);
            offset16[0] = offset + UTF16.getCharCount(c);
            return 0x1F & c;
        }

        /* If no special forms are recognized, then consider
         * the backslash to generically escape the next character. */
        offset16[0] = offset;
        return c;
    }

    /**
     * Supplies a zero-padded hex representation of an integer (without 0x).
     * Negative values are rendered as '-' followed by the padded magnitude.
     *
     * @param i the value to format
     * @param places minimum number of hex digits; shorter results are
     * left-padded with zeros, longer ones are returned unpadded
     * @return the upper-case hexadecimal string
     */
    public static String hex(long i, int places) {
        // -Long.MIN_VALUE overflows, so that value is spelled out directly.
        if (i == Long.MIN_VALUE) return "-8000000000000000";

        boolean negative = i < 0;
        if (negative) {
            i = -i;
        }
        String result = Long.toString(i, 16).toUpperCase(Locale.ENGLISH);
        // Pad with a loop rather than a fixed 16-character template so that
        // any value of places is accepted.
        int padding = places - result.length();
        if (padding > 0) {
            StringBuilder padded = new StringBuilder(places);
            for (int k = 0; k < padding; k++) {
                padded.append('0');
            }
            result = padded.append(result).toString();
        }
        if (negative) {
            return '-' + result;
        }
        return result;
    }

    /** Digit characters indexed by digit value, for radixes up to 36. */
    static final char DIGITS[] = {
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
        'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
        'U', 'V', 'W', 'X', 'Y', 'Z'
    };

    /**
     * Return true if the character is NOT printable ASCII, that is, if it
     * lies outside U+0020..U+007E.  The tab, carriage return and linefeed
     * characters are therefore considered unprintable.
     *
     * @param c the code point to test
     * @return true if c is outside the printable ASCII range
     */
    public static boolean isUnprintable(int c) {
        //0x20 = 32 and 0x7E = 126
        return !(c >= 0x20 && c <= 0x7E);
    }

    /**
     * Escape unprintable characters using a backslash followed by 'u' and
     * four hex digits for U+0000 to U+FFFF, and a backslash followed by 'U'
     * and eight hex digits for U+10000 and above.  If the character is
     * printable ASCII, then do nothing and return FALSE.  Otherwise, append
     * the escaped notation and return TRUE.
     *
     * @param result the destination to append to
     * @param c the code point to escape
     * @return true if an escape was appended, false if nothing was written
     * @throws IllegalArgumentException wrapping any IOException raised by
     * the destination
     */
    public static <T extends Appendable> boolean escapeUnprintable(T result, int c) {
        try {
            if (isUnprintable(c)) {
                result.append('\\');
                if ((c & ~0xFFFF) != 0) {
                    // Above the BMP: emit the four high nibbles first.
                    result.append('U');
                    result.append(DIGITS[0xF&(c>>28)]);
                    result.append(DIGITS[0xF&(c>>24)]);
                    result.append(DIGITS[0xF&(c>>20)]);
                    result.append(DIGITS[0xF&(c>>16)]);
                } else {
                    result.append('u');
                }
                result.append(DIGITS[0xF&(c>>12)]);
                result.append(DIGITS[0xF&(c>>8)]);
                result.append(DIGITS[0xF&(c>>4)]);
                result.append(DIGITS[0xF&c]);
                return true;
            }
            return false;
        } catch (IOException e) {
            throw new IllegalArgumentException(e);
        }
    }
}

View file

@ -1,185 +0,0 @@
/*
* Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
import java.util.HashMap;
/**
* Class to store version numbers of the form major.minor.milli.micro.
* @author synwee
* @stable ICU 2.6
*/
public final class VersionInfo
{
    // public methods ------------------------------------------------------

    /**
     * Returns an instance of VersionInfo with the argument version.
     * @param version version String in the format of "major.minor.milli.micro"
     *                or "major.minor.milli" or "major.minor" or "major",
     *                where major, minor, milli, micro are non-negative numbers
     *                {@literal <=} 255. If the trailing version numbers are
     *                not specified they are taken as 0s. E.g. Version "3.1" is
     *                equivalent to "3.1.0.0".
     * @return an instance of VersionInfo with the argument version.
     * @throws IllegalArgumentException when the argument version
     *                is not in the right format
     * @stable ICU 2.6
     */
    public static VersionInfo getInstance(String version)
    {
        int length  = version.length();
        int array[] = {0, 0, 0, 0};
        int count   = 0;
        int index   = 0;

        while (count < 4 && index < length) {
            char c = version.charAt(index);
            if (c == '.') {
                count ++;
            }
            else {
                int digit = c - '0';
                if (digit < 0 || digit > 9) {
                    throw new IllegalArgumentException(INVALID_VERSION_NUMBER_);
                }
                array[count] *= 10;
                array[count] += digit;
                // Check the range after every digit: a long digit run would
                // otherwise overflow int and could wrap back into 0..255.
                if (array[count] > 255) {
                    throw new IllegalArgumentException(INVALID_VERSION_NUMBER_);
                }
            }
            index ++;
        }
        // Leftover characters mean there were more than four fields.
        if (index != length) {
            throw new IllegalArgumentException(
                "Invalid version number: String '" + version + "' exceeds version format");
        }
        for (int i = 0; i < 4; i ++) {
            if (array[i] < 0 || array[i] > 255) {
                throw new IllegalArgumentException(INVALID_VERSION_NUMBER_);
            }
        }

        return getInstance(array[0], array[1], array[2], array[3]);
    }

    /**
     * Returns an instance of VersionInfo with the argument version.
     * Instances are cached, so equal versions yield the same object.
     * @param major major version, non-negative number {@literal <=} 255.
     * @param minor minor version, non-negative number {@literal <=} 255.
     * @param milli milli version, non-negative number {@literal <=} 255.
     * @param micro micro version, non-negative number {@literal <=} 255.
     * @return an instance of VersionInfo with the argument version.
     * @throws IllegalArgumentException when any argument is
     *                                  negative or {@literal >} 255
     * @stable ICU 2.6
     */
    public static VersionInfo getInstance(int major, int minor, int milli,
                                          int micro)
    {
        if (major < 0 || major > 255 || minor < 0 || minor > 255 ||
            milli < 0 || milli > 255 || micro < 0 || micro > 255) {
            throw new IllegalArgumentException(INVALID_VERSION_NUMBER_);
        }
        int version = getInt(major, minor, milli, micro);
        Integer key = Integer.valueOf(version);
        // Reuse the cached singleton for this packed version if there is one.
        VersionInfo result = MAP_.get(key);
        if (result == null) {
            result = new VersionInfo(version);
            MAP_.put(key, result);
        }
        return result;
    }

    /**
     * Compares other with this VersionInfo.
     * @param other VersionInfo to be compared
     * @return 0 if the argument is a VersionInfo object that has version
     *         information equals to this object.
     *         Less than 0 if the argument is a VersionInfo object that has
     *         version information greater than this object.
     *         Greater than 0 if the argument is a VersionInfo object that
     *         has version information less than this object.
     * @stable ICU 2.6
     */
    public int compareTo(VersionInfo other)
    {
        // The major number occupies the top byte, so a major of 128 or more
        // sets the sign bit of the packed int. Compare as unsigned values;
        // plain subtraction would order such versions below smaller ones.
        return Integer.compareUnsigned(m_version_, other.m_version_);
    }

    // private data members ----------------------------------------------

    /**
     * Version number stored as a byte for each of the major, minor, milli and
     * micro numbers in the 32 bit int.
     * Most significant for the major and the least significant contains the
     * micro numbers.
     */
    private final int m_version_;
    /**
     * Map of singletons, keyed by the packed version int.
     * NOTE(review): plain HashMap with no synchronization in this class;
     * confirm whether callers may use getInstance from several threads.
     */
    private static final HashMap<Integer, VersionInfo> MAP_ = new HashMap<>();
    /**
     * Error statement string
     */
    private static final String INVALID_VERSION_NUMBER_ =
        "Invalid version number: Version number may be negative or greater than 255";

    // private constructor -----------------------------------------------

    /**
     * Constructor with int
     * @param compactversion a 32 bit int with each byte representing a number
     */
    private VersionInfo(int compactversion)
    {
        m_version_ = compactversion;
    }

    /**
     * Packs the four version numbers into one int, major in the most
     * significant byte and micro in the least significant byte.
     * @param major non-negative version number
     * @param minor non-negative version number
     * @param milli non-negative version number
     * @param micro non-negative version number
     * @return the packed version
     */
    private static int getInt(int major, int minor, int milli, int micro)
    {
        return (major << 24) | (minor << 16) | (milli << 8) | micro;
    }
}