8187443: Forest Consolidation: Move files to unified layout

Reviewed-by: darcy, ihse
This commit is contained in:
Erik Joelsson 2017-09-12 19:03:39 +02:00
parent 270fe13182
commit 3789983e89
56923 changed files with 3 additions and 15727 deletions

View file

@ -0,0 +1,306 @@
/*
* Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*
* (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
* (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
*
* The original version of this source code and documentation
* is copyrighted and owned by Taligent, Inc., a wholly-owned
* subsidiary of IBM. These materials are provided under terms
* of a License Agreement between Taligent and Sun. This technology
* is protected by multiple US and International patents.
*
* This notice and attribution to Taligent may not be removed.
* Taligent is a registered trademark of Taligent, Inc.
*/
package sun.text;
import java.nio.BufferUnderflowException;
import java.nio.ByteBuffer;
import java.util.MissingResourceException;
import sun.text.CompactByteArray;
import sun.text.SupplementaryCharacterData;
/**
* This is the class that represents the list of known words used by
* DictionaryBasedBreakIterator. The conceptual data structure used
* here is a trie: there is a node hanging off the root node for every
* letter that can start a word. Each of these nodes has a node hanging
* off of it for every letter that can be the second letter of a word
* if this node is the first letter, and so on. The trie is represented
* as a two-dimensional array that can be treated as a table of state
* transitions. Indexes are used to compress this array, taking
* advantage of the fact that this array will always be very sparse.
*/
class BreakDictionary {

    //=========================================================================
    // data members
    //=========================================================================

    /**
     * The only dictionary format version this class accepts.
     */
    private static final int SUPPORTED_VERSION = 1;

    /**
     * Maps from characters to column numbers.  The main use of this is to
     * avoid making room in the array for empty columns.
     */
    private CompactByteArray columnMap = null;
    private SupplementaryCharacterData supplementaryCharColumnMap = null;

    /**
     * The number of actual columns in the table
     */
    private int numCols;

    /**
     * Columns are organized into groups of 32.  This says how many
     * column groups.  (We could calculate this, but we store the
     * value to avoid having to repeatedly calculate it.)
     */
    private int numColGroups;

    /**
     * The actual compressed state table.  Each conceptual row represents
     * a state, and the cells in it contain the row numbers of the states
     * to transition to for each possible letter.  0 is used to indicate
     * an illegal combination of letters (i.e., the error state).  The
     * table is compressed by eliminating all the unpopulated (i.e., zero)
     * cells.  Multiple conceptual rows can then be doubled up in a single
     * physical row by sliding them up and possibly shifting them to one
     * side or the other so the populated cells don't collide.  Indexes
     * are used to identify unpopulated cells and to locate populated cells.
     */
    private short[] table = null;

    /**
     * This index maps logical row numbers to physical row numbers
     */
    private short[] rowIndex = null;

    /**
     * A bitmap is used to tell which cells in the conceptual table are
     * populated.  This array contains all the unique bit combinations
     * in that bitmap.  If the table is more than 32 columns wide,
     * successive entries in this array are used for a single row.
     */
    private int[] rowIndexFlags = null;

    /**
     * This index maps from a logical row number into the bitmap table above.
     * (This keeps us from storing duplicate bitmap combinations.)  Since there
     * are a lot of rows with only one populated cell, instead of wasting space
     * in the bitmap table, we just store a negative number in this index for
     * rows with one populated cell.  The absolute value of that number is
     * the column number of the populated cell.
     */
    private short[] rowIndexFlagsIndex = null;

    /**
     * For each logical row, this index contains a constant that is added to
     * the logical column number to get the physical column number
     */
    private byte[] rowIndexShifts = null;

    //=========================================================================
    // deserialization
    //=========================================================================

    /**
     * Builds a dictionary from its serialized binary form.
     *
     * @param dictionaryName name of the dictionary, used only in exception details
     * @param dictionaryData the serialized dictionary bytes
     * @throws MissingResourceException if the version is unsupported, the
     *         declared size does not match the data, or the data is truncated
     *         or declares an impossible array length
     */
    BreakDictionary(String dictionaryName, byte[] dictionaryData) {
        try {
            setupDictionary(dictionaryName, dictionaryData);
        } catch (BufferUnderflowException bue) {
            MissingResourceException e;
            e = new MissingResourceException("Corrupted dictionary data",
                                             dictionaryName, "");
            e.initCause(bue);
            throw e;
        }
    }

    /**
     * Parses the serialized dictionary and populates every field of this
     * object.  The sections are read in the exact order in which they are
     * laid out in the data.
     */
    private void setupDictionary(String dictionaryName, byte[] dictionaryData) {
        ByteBuffer bb = ByteBuffer.wrap(dictionaryData);

        // check version
        int version = bb.getInt();
        if (version != SUPPORTED_VERSION) {
            throw new MissingResourceException("Dictionary version(" + version + ") is unsupported",
                                               dictionaryName, "");
        }

        // Check data size
        int len = bb.getInt();
        if (bb.position() + len != bb.limit()) {
            throw new MissingResourceException("Dictionary size is wrong: " + bb.limit(),
                                               dictionaryName, "");
        }

        // read in the column map for BMP characters (this is serialized in
        // its internal form: an index array followed by a data array)
        short[] columnMapIndex = readShorts(bb, dictionaryName);
        byte[] columnMapData = readBytes(bb, dictionaryName);
        columnMap = new CompactByteArray(columnMapIndex, columnMapData);

        // read in numCols and numColGroups
        numCols = bb.getInt();
        numColGroups = bb.getInt();

        // read in the row-number index
        rowIndex = readShorts(bb, dictionaryName);

        // load in the populated-cells bitmap: index first, then bitmap list
        rowIndexFlagsIndex = readShorts(bb, dictionaryName);
        rowIndexFlags = readInts(bb, dictionaryName);

        // load in the row-shift index
        rowIndexShifts = readBytes(bb, dictionaryName);

        // load in the actual state table
        table = readShorts(bb, dictionaryName);

        // finally, prepare the column map for supplementary characters
        int[] supplementaryData = readInts(bb, dictionaryName);
        assert bb.position() == bb.limit();

        supplementaryCharColumnMap = new SupplementaryCharacterData(supplementaryData);
    }

    /**
     * Reads a 32-bit element count and verifies that it is non-negative and
     * that the buffer still holds that many elements of the given size.
     * Checking before allocating keeps corrupt data from surfacing as a
     * NegativeArraySizeException or triggering an oversized allocation.
     */
    private static int readLength(ByteBuffer bb, int elementSize, String dictionaryName) {
        int len = bb.getInt();
        if (len < 0 || len > bb.remaining() / elementSize) {
            throw new MissingResourceException("Corrupted dictionary data",
                                               dictionaryName, "");
        }
        return len;
    }

    /** Reads a length-prefixed array of bytes. */
    private static byte[] readBytes(ByteBuffer bb, String dictionaryName) {
        byte[] result = new byte[readLength(bb, Byte.BYTES, dictionaryName)];
        bb.get(result);
        return result;
    }

    /** Reads a length-prefixed array of 16-bit values. */
    private static short[] readShorts(ByteBuffer bb, String dictionaryName) {
        short[] result = new short[readLength(bb, Short.BYTES, dictionaryName)];
        for (int i = 0; i < result.length; i++) {
            result[i] = bb.getShort();
        }
        return result;
    }

    /** Reads a length-prefixed array of 32-bit values. */
    private static int[] readInts(ByteBuffer bb, String dictionaryName) {
        int[] result = new int[readLength(bb, Integer.BYTES, dictionaryName)];
        for (int i = 0; i < result.length; i++) {
            result[i] = bb.getInt();
        }
        return result;
    }

    //=========================================================================
    // access to the words
    //=========================================================================

    /**
     * Uses the column map to map the character to a column number, then
     * passes the row and column number to getNextState()
     * @param row The current state
     * @param ch The character whose column we're interested in
     * @return The new state to transition to
     */
    public final short getNextStateFromCharacter(int row, int ch) {
        int col;
        if (ch < Character.MIN_SUPPLEMENTARY_CODE_POINT) {
            // BMP characters are looked up in the compact byte array
            col = columnMap.elementAt((char)ch);
        } else {
            col = supplementaryCharColumnMap.getValue(ch);
        }
        return getNextState(row, col);
    }

    /**
     * Returns the value in the cell with the specified (logical) row and
     * column numbers.  In DictionaryBasedBreakIterator, the row number is
     * a state number, the column number is an input, and the return value
     * is the row number of the new state to transition to.  (0 is the
     * "error" state, and -1 is the "end of word" state in a dictionary)
     * @param row The row number of the current state
     * @param col The column number of the input character (0 means "not a
     * dictionary character")
     * @return The row number of the new state to transition to
     */
    public final short getNextState(int row, int col) {
        if (cellIsPopulated(row, col)) {
            // we map from logical to physical row number by looking up the
            // mapping in rowIndex; we map from logical column number to
            // physical column number by looking up a shift value for this
            // logical row and offsetting the logical column number by
            // the shift amount.  Then we can use internalAt() to actually
            // get the value out of the table.
            return internalAt(rowIndex[row], col + rowIndexShifts[row]);
        }
        else {
            return 0;
        }
    }

    /**
     * Given (logical) row and column numbers, returns true if the
     * cell in that position is populated
     */
    private boolean cellIsPopulated(int row, int col) {
        // look up the entry in the bitmap index for the specified row.
        // If it's a negative number, it's the column number of the only
        // populated cell in the row
        if (rowIndexFlagsIndex[row] < 0) {
            return col == -rowIndexFlagsIndex[row];
        }

        // if it's a positive number, it's the offset of an entry in the bitmap
        // list.  If the table is more than 32 columns wide, the bitmap is stored
        // in successive entries in the bitmap list, so we have to divide the column
        // number by 32 and offset the number we got out of the index by the result.
        // Once we have the appropriate piece of the bitmap, test the appropriate
        // bit and return the result.
        else {
            int flags = rowIndexFlags[rowIndexFlagsIndex[row] + (col >> 5)];
            return (flags & (1 << (col & 0x1f))) != 0;
        }
    }

    /**
     * Implementation of getNextState() when we know the specified cell is
     * populated.
     * @param row The PHYSICAL row number of the cell
     * @param col The PHYSICAL column number of the cell
     * @return The value stored in the cell
     */
    private short internalAt(int row, int col) {
        // the table is a one-dimensional array, so this just does the math necessary
        // to treat it as a two-dimensional array (we don't just use a two-dimensional
        // array because two-dimensional arrays are inefficient in Java)
        return table[row * numCols + col];
    }
}

View file

@ -0,0 +1,65 @@
/*
* Copyright (c) 2000, 2005, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package sun.text;
import sun.text.normalizer.NormalizerBase;
public class CollatorUtilities {

    /**
     * Converts a normalizer mode to the index it occupies in
     * {@link #legacyModeMap}.
     *
     * @param mode the normalizer mode to look up
     * @return the index of {@code mode} in the table, or 0
     *         (Collator.NO_DECOMPOSITION) if it is not in the table
     */
    public static int toLegacyMode(NormalizerBase.Mode mode) {
        // Scan from the end; index 0 is both a valid match and the default,
        // so it never needs to be compared explicitly.
        for (int i = legacyModeMap.length - 1; i > 0; i--) {
            if (legacyModeMap[i] == mode) {
                return i;
            }
        }
        return 0;
    }

    /**
     * Converts a legacy mode value to the normalizer mode stored at that
     * index of {@link #legacyModeMap}.
     *
     * @param mode the legacy mode value, used as an index into the table
     * @return the mode at that index, or {@code NormalizerBase.NONE} when
     *         {@code mode} is outside the table
     */
    public static NormalizerBase.Mode toNormalizerMode(int mode) {
        // Explicit range check instead of catching
        // ArrayIndexOutOfBoundsException for control flow.
        if (mode < 0 || mode >= legacyModeMap.length) {
            return NormalizerBase.NONE;
        }
        return legacyModeMap[mode];
    }

    /** Normalizer modes indexed by the legacy mode value they correspond to. */
    static NormalizerBase.Mode[] legacyModeMap = {
        NormalizerBase.NONE,   // Collator.NO_DECOMPOSITION
        NormalizerBase.NFD,    // Collator.CANONICAL_DECOMPOSITION
        NormalizerBase.NFKD,   // Collator.FULL_DECOMPOSITION
    };
}

View file

@ -0,0 +1,352 @@
/*
* Copyright (c) 1996, 2011, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
* (C) Copyright Taligent, Inc. 1996 - All Rights Reserved
* (C) Copyright IBM Corp. 1996 - All Rights Reserved
*
* The original version of this source code and documentation is copyrighted
* and owned by Taligent, Inc., a wholly-owned subsidiary of IBM. These
* materials are provided under terms of a License Agreement between Taligent
* and Sun. This technology is protected by multiple US and International
* patents. This notice and attribution to Taligent may not be removed.
* Taligent is a registered trademark of Taligent, Inc.
*
*/
package sun.text;
/**
* class CompactATypeArray : use only on primitive data types
* Provides a compact way to store information that is indexed by Unicode
* values, such as character properties, types, keyboard values, etc.This
* is very useful when you have a block of Unicode data that contains
* significant values while the rest of the Unicode data is unused in the
* application or when you have a lot of redundance, such as where all 21,000
* Han ideographs have the same value. However, lookup is much faster than a
* hash table.
* A compact array of any primitive data type serves two purposes:
* <UL type = circle>
* <LI>Fast access of the indexed values.
* <LI>Smaller memory footprint.
* </UL>
* A compact array is composed of an index array and a value array. The index
* array maps each block of Unicode characters to its offset in the value array.
*
* @see CompactIntArray
* @see CompactShortArray
* @author Helena Shih
*/
public final class CompactByteArray implements Cloneable {
/**
* The total number of Unicode characters.
*/
public static final int UNICODECOUNT =65536;
/**
* Constructor for CompactByteArray.
* @param defaultValue the default value of the compact array.
*/
public CompactByteArray(byte defaultValue)
{
int i;
values = new byte[UNICODECOUNT];
indices = new short[INDEXCOUNT];
hashes = new int[INDEXCOUNT];
for (i = 0; i < UNICODECOUNT; ++i) {
values[i] = defaultValue;
}
for (i = 0; i < INDEXCOUNT; ++i) {
indices[i] = (short)(i<<BLOCKSHIFT);
hashes[i] = 0;
}
isCompact = false;
}
/**
* Constructor for CompactByteArray.
* @param indexArray the indicies of the compact array.
* @param newValues the values of the compact array.
* @exception IllegalArgumentException If index is out of range.
*/
public CompactByteArray(short indexArray[],
byte newValues[])
{
int i;
if (indexArray.length != INDEXCOUNT)
throw new IllegalArgumentException("Index out of bounds!");
for (i = 0; i < INDEXCOUNT; ++i) {
short index = indexArray[i];
if ((index < 0) || (index >= newValues.length+BLOCKCOUNT))
throw new IllegalArgumentException("Index out of bounds!");
}
indices = indexArray;
values = newValues;
isCompact = true;
}
/**
* Get the mapped value of a Unicode character.
* @param index the character to get the mapped value with
* @return the mapped value of the given character
*/
public byte elementAt(char index)
{
return (values[(indices[index >> BLOCKSHIFT] & 0xFFFF)
+ (index & BLOCKMASK)]);
}
/**
* Set a new value for a Unicode character.
* Set automatically expands the array if it is compacted.
* @param index the character to set the mapped value with
* @param value the new mapped value
*/
public void setElementAt(char index, byte value)
{
if (isCompact)
expand();
values[(int)index] = value;
touchBlock(index >> BLOCKSHIFT, value);
}
/**
* Set new values for a range of Unicode character.
* @param start the starting offset o of the range
* @param end the ending offset of the range
* @param value the new mapped value
*/
public void setElementAt(char start, char end, byte value)
{
int i;
if (isCompact) {
expand();
}
for (i = start; i <= end; ++i) {
values[i] = value;
touchBlock(i >> BLOCKSHIFT, value);
}
}
/**
*Compact the array.
*/
public void compact()
{
if (!isCompact) {
int limitCompacted = 0;
int iBlockStart = 0;
short iUntouched = -1;
for (int i = 0; i < indices.length; ++i, iBlockStart += BLOCKCOUNT) {
indices[i] = -1;
boolean touched = blockTouched(i);
if (!touched && iUntouched != -1) {
// If no values in this block were set, we can just set its
// index to be the same as some other block with no values
// set, assuming we've seen one yet.
indices[i] = iUntouched;
} else {
int jBlockStart = 0;
int j = 0;
for (j = 0; j < limitCompacted;
++j, jBlockStart += BLOCKCOUNT) {
if (hashes[i] == hashes[j] &&
arrayRegionMatches(values, iBlockStart,
values, jBlockStart, BLOCKCOUNT)) {
indices[i] = (short)jBlockStart;
break;
}
}
if (indices[i] == -1) {
// we didn't match, so copy & update
System.arraycopy(values, iBlockStart,
values, jBlockStart, BLOCKCOUNT);
indices[i] = (short)jBlockStart;
hashes[j] = hashes[i];
++limitCompacted;
if (!touched) {
// If this is the first untouched block we've seen,
// remember its index.
iUntouched = (short)jBlockStart;
}
}
}
}
// we are done compacting, so now make the array shorter
int newSize = limitCompacted*BLOCKCOUNT;
byte[] result = new byte[newSize];
System.arraycopy(values, 0, result, 0, newSize);
values = result;
isCompact = true;
hashes = null;
}
}
/**
* Convenience utility to compare two arrays of doubles.
* @param len the length to compare.
* The start indices and start+len must be valid.
*/
static final boolean arrayRegionMatches(byte[] source, int sourceStart,
byte[] target, int targetStart,
int len)
{
int sourceEnd = sourceStart + len;
int delta = targetStart - sourceStart;
for (int i = sourceStart; i < sourceEnd; i++) {
if (source[i] != target[i + delta])
return false;
}
return true;
}
/**
* Remember that a specified block was "touched", i.e. had a value set.
* Untouched blocks can be skipped when compacting the array
*/
private final void touchBlock(int i, int value) {
hashes[i] = (hashes[i] + (value<<1)) | 1;
}
/**
* Query whether a specified block was "touched", i.e. had a value set.
* Untouched blocks can be skipped when compacting the array
*/
private final boolean blockTouched(int i) {
return hashes[i] != 0;
}
/** For internal use only. Do not modify the result, the behavior of
* modified results are undefined.
*/
public short getIndexArray()[]
{
return indices;
}
/** For internal use only. Do not modify the result, the behavior of
* modified results are undefined.
*/
public byte getStringArray()[]
{
return values;
}
/**
* Overrides Cloneable
*/
public Object clone()
{
try {
CompactByteArray other = (CompactByteArray) super.clone();
other.values = values.clone();
other.indices = indices.clone();
if (hashes != null) other.hashes = hashes.clone();
return other;
} catch (CloneNotSupportedException e) {
throw new InternalError(e);
}
}
/**
* Compares the equality of two compact array objects.
* @param obj the compact array object to be compared with this.
* @return true if the current compact array object is the same
* as the compact array object obj; false otherwise.
*/
public boolean equals(Object obj) {
if (obj == null) return false;
if (this == obj) // quick check
return true;
if (getClass() != obj.getClass()) // same class?
return false;
CompactByteArray other = (CompactByteArray) obj;
for (int i = 0; i < UNICODECOUNT; i++) {
// could be sped up later
if (elementAt((char)i) != other.elementAt((char)i))
return false;
}
return true; // we made it through the guantlet.
}
/**
* Generates the hash code for the compact array object
*/
public int hashCode() {
int result = 0;
int increment = Math.min(3, values.length/16);
for (int i = 0; i < values.length; i+= increment) {
result = result * 37 + values[i];
}
return result;
}
// --------------------------------------------------------------
// package private
// --------------------------------------------------------------
/**
* Expanding takes the array back to a 65536 element array.
*/
private void expand()
{
int i;
if (isCompact) {
byte[] tempArray;
hashes = new int[INDEXCOUNT];
tempArray = new byte[UNICODECOUNT];
for (i = 0; i < UNICODECOUNT; ++i) {
byte value = elementAt((char)i);
tempArray[i] = value;
touchBlock(i >> BLOCKSHIFT, value);
}
for (i = 0; i < INDEXCOUNT; ++i) {
indices[i] = (short)(i<<BLOCKSHIFT);
}
values = null;
values = tempArray;
isCompact = false;
}
}
private byte[] getArray()
{
return values;
}
private static final int BLOCKSHIFT =7;
private static final int BLOCKCOUNT =(1<<BLOCKSHIFT);
private static final int INDEXSHIFT =(16-BLOCKSHIFT);
private static final int INDEXCOUNT =(1<<INDEXSHIFT);
private static final int BLOCKMASK = BLOCKCOUNT - 1;
private byte[] values; // char -> short (char parameterized short)
private short indices[];
private boolean isCompact;
private int[] hashes;
};

View file

@ -0,0 +1,83 @@
/*
* Copyright (c) 2001, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package sun.text;
import sun.text.normalizer.NormalizerBase;
import sun.text.normalizer.NormalizerImpl;
public final class ComposedCharIter {

    /**
     * Value returned by {@link #next} once every cached character has been
     * handed out.
     */
    public static final int DONE = NormalizerBase.DONE;

    // The decomposition table is loaded once and shared by every iterator,
    // so later instances do not have to fetch the data again.
    private static final int[] chars;
    private static final String[] decomps;
    private static final int decompNum;

    static {
        final int capacity = 2100;
        chars = new int[capacity];
        decomps = new String[capacity];
        // fills both arrays and reports how many entries were written
        decompNum = NormalizerImpl.getDecompose(chars, decomps);
    }

    // index of the character most recently returned by next(); -1 before
    // the first call
    private int curChar = -1;

    /**
     * Construct a new {@code ComposedCharIter} positioned before the first
     * cached character.  The set of characters iterated over is whatever
     * {@code NormalizerImpl.getDecompose} supplied when this class was loaded.
     */
    public ComposedCharIter() { }

    /**
     * Returns the next character of the cached table.
     * Repeated calls walk the table in storage order; once the last entry
     * has been returned, every further call returns {@link #DONE}.
     *
     * @return the next character, or {@link #DONE} when none remain
     */
    public int next() {
        if (curChar != decompNum - 1) {
            curChar++;
            return chars[curChar];
        }
        return DONE;
    }

    /**
     * Returns the decomposition stored for the character most recently
     * returned by {@link #next}.  Must not be called before the first call
     * to {@code next}.
     *
     * @return the decomposition string of the current character
     */
    public String decomposition() {
        return decomps[curChar];
    }
}

View file

@ -0,0 +1,526 @@
/*
* Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*
* (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
* (C) Copyright IBM Corp. 1996 - 2002 - All Rights Reserved
*
* The original version of this source code and documentation
* is copyrighted and owned by Taligent, Inc., a wholly-owned
* subsidiary of IBM. These materials are provided under terms
* of a License Agreement between Taligent and Sun. This technology
* is protected by multiple US and International patents.
*
* This notice and attribution to Taligent may not be removed.
* Taligent is a registered trademark of Taligent, Inc.
*/
package sun.text;
import java.text.CharacterIterator;
import java.util.ArrayList;
import java.util.List;
import java.util.Stack;
/**
* A subclass of RuleBasedBreakIterator that adds the ability to use a dictionary
* to further subdivide ranges of text beyond what is possible using just the
* state-table-based algorithm. This is necessary, for example, to handle
* word and line breaking in Thai, which doesn't use spaces between words. The
* state-table-based algorithm used by RuleBasedBreakIterator is used to divide
* up text as far as possible, and then contiguous ranges of letters are
* repeatedly compared against a list of known words (i.e., the dictionary)
* to divide them up into words.
*
* DictionaryBasedBreakIterator uses the same rule language as RuleBasedBreakIterator,
* but adds one more special substitution name: &lt;dictionary&gt;. This substitution
* name is used to identify characters in words in the dictionary. The idea is that
* if the iterator passes over a chunk of text that includes two or more characters
* in a row that are included in &lt;dictionary&gt;, it goes back through that range and
* derives additional break positions (if possible) using the dictionary.
*
* DictionaryBasedBreakIterator is also constructed with the filename of a dictionary
* file. It follows a prescribed search path to locate the dictionary (right now,
* it looks for it in /com/ibm/text/resources in each directory in the classpath,
* and won't find it in JAR files, but this location is likely to change). The
* dictionary file is in a serialized binary format. We have a very primitive (and
* slow) BuildDictionaryFile utility for creating dictionary files, but aren't
* currently making it public. Contact us for help.
*/
public class DictionaryBasedBreakIterator extends RuleBasedBreakIterator {
/**
* a list of known words that is used to divide up contiguous ranges of letters,
* stored in a compressed, indexed, format that offers fast access
*/
private BreakDictionary dictionary;
/**
* a list of flags indicating which character categories are contained in
* the dictionary file (this is used to determine which ranges of characters
* to apply the dictionary to)
*/
private boolean[] categoryFlags;
/**
* a temporary hiding place for the number of dictionary characters in the
* last range passed over by next()
*/
private int dictionaryCharCount;
/**
* when a range of characters is divided up using the dictionary, the break
* positions that are discovered are stored here, preventing us from having
* to use either the dictionary or the state table again until the iterator
* leaves this range of text
*/
private int[] cachedBreakPositions;
/**
* if cachedBreakPositions is not null, this indicates which item in the
* cache the current iteration position refers to
*/
private int positionInCache;
/**
 * Constructs a DictionaryBasedBreakIterator.
 * After the superclass has been initialized from the rule data, any
 * additional data it exposes is converted into the category flags and then
 * cleared from the superclass; finally the dictionary is loaded.
 *
 * @param ruleFile       the name of the rule data file
 * @param ruleData       the rule data loaded from the rule data file
 * @param dictionaryFile the name of the dictionary file
 * @param dictionaryData the dictionary data loaded from the dictionary file
 * @throws MissingResourceException if rule data or dictionary initialization failed
 */
public DictionaryBasedBreakIterator(String ruleFile, byte[] ruleData,
                                    String dictionaryFile, byte[] dictionaryData) {
    super(ruleFile, ruleData);

    byte[] additionalData = super.getAdditionalData();
    if (additionalData != null) {
        prepareCategoryFlags(additionalData);
        // the flags now live in this class; drop the superclass's copy
        super.setAdditionalData(null);
    }

    dictionary = new BreakDictionary(dictionaryFile, dictionaryData);
}
/**
 * Converts the raw per-category bytes into boolean flags: a category is
 * flagged exactly when its byte equals 1.
 */
private void prepareCategoryFlags(byte[] data) {
    boolean[] flags = new boolean[data.length];
    int next = 0;
    for (byte marker : data) {
        flags[next++] = (marker == (byte)1);
    }
    categoryFlags = flags;
}
/**
 * Installs new text to iterate over and discards all cached dictionary
 * break state left over from the previous text.
 */
@Override
public void setText(CharacterIterator newText) {
    super.setText(newText);
    // cached positions refer to the old text, so start from a clean slate
    positionInCache = 0;
    dictionaryCharCount = 0;
    cachedBreakPositions = null;
}
/**
 * Moves the iterator to the beginning of the text (i.e., the
 * CharacterIterator's starting offset), dropping any cached break
 * positions first.
 * @return The offset of the beginning of the text.
 */
@Override
public int first() {
    // reset the dictionary cache before delegating
    positionInCache = 0;
    dictionaryCharCount = 0;
    cachedBreakPositions = null;
    return super.first();
}
/**
 * Moves the iterator to the end of the text (i.e., the
 * CharacterIterator's ending offset), dropping any cached break
 * positions first.
 * @return The text's past-the-end offset.
 */
@Override
public int last() {
    // reset the dictionary cache before delegating
    positionInCache = 0;
    dictionaryCharCount = 0;
    cachedBreakPositions = null;
    return super.last();
}
/**
 * Moves the iterator one boundary backwards.
 * @return The position of the last boundary position before the
 * current iteration position
 */
@Override
public int previous() {
    CharacterIterator text = getText();

    // No usable cache entry behind us: dump the cache and let the inherited
    // previous() do the work.  That call may refill the cache, in which case
    // our position within it has to be marked.
    if (cachedBreakPositions == null || positionInCache <= 0) {
        cachedBreakPositions = null;
        int result = super.previous();
        if (cachedBreakPositions != null) {
            positionInCache = cachedBreakPositions.length - 2;
        }
        return result;
    }

    // Still inside the cached range: just step one entry back in the cache.
    --positionInCache;
    int boundary = cachedBreakPositions[positionInCache];
    text.setIndex(boundary);
    return boundary;
}
/**
 * Moves the iteration position to the last boundary position
 * before the specified position.
 * @param offset The position to begin searching from
 * @return The position of the last boundary before "offset"
 */
@Override
public int preceding(int offset) {
    CharacterIterator text = getText();
    checkOffset(offset, text);

    int[] cache = cachedBreakPositions;
    boolean coveredByCache = cache != null
            && offset > cache[0]
            && offset <= cache[cache.length - 1];

    // Outside the cached range (or no cache at all): drop the cache and use
    // the inherited routine, which may call back into this class and rebuild
    // the cache.
    if (!coveredByCache) {
        cachedBreakPositions = null;
        return super.preceding(offset);
    }

    // Inside the cached range: find the first cached position that is not
    // before "offset"; the entry just ahead of it is the answer.
    int slot = 0;
    while (slot < cache.length && offset > cache[slot]) {
        slot++;
    }
    positionInCache = slot - 1;
    text.setIndex(cache[positionInCache]);
    return text.getIndex();
}
/**
 * Moves the iteration position to the first boundary position after
 * the specified position.
 * @param offset The position to begin searching forward from
 * @return The position of the first boundary after "offset"
 */
@Override
public int following(int offset) {
    CharacterIterator text = getText();
    checkOffset(offset, text);

    int[] cache = cachedBreakPositions;
    boolean coveredByCache = cache != null
            && offset >= cache[0]
            && offset < cache[cache.length - 1];

    // Outside the cached range (or no cache at all): drop the cache and use
    // the inherited following(), which may call back into this class and
    // rebuild the cache.
    if (!coveredByCache) {
        cachedBreakPositions = null;
        return super.following(offset);
    }

    // Inside the cached range: the first cached position strictly greater
    // than "offset" is the answer.
    int slot = 0;
    while (slot < cache.length && offset >= cache[slot]) {
        slot++;
    }
    positionInCache = slot;
    text.setIndex(cache[positionInCache]);
    return text.getIndex();
}
/**
 * Implementation function for next(): serves boundaries from the cache of
 * dictionary break positions when possible and rebuilds that cache when the
 * iterator walks off its end.
 */
@Override
protected int handleNext() {
    CharacterIterator text = getText();

    boolean cacheExhausted = cachedBreakPositions == null
            || positionInCache == cachedBreakPositions.length - 1;

    if (cacheExhausted) {
        // Ask the inherited handleNext() for a tentative boundary. While it
        // runs, dictionaryCharCount records how many dictionary characters
        // were passed over on the way there.
        int rangeStart = text.getIndex();
        dictionaryCharCount = 0;
        int tentative = super.handleNext();

        boolean needsDictionary =
                dictionaryCharCount > 1 && tentative - rangeStart > 1;
        if (!needsDictionary) {
            // the inherited result stands on its own; no cache is needed
            cachedBreakPositions = null;
            return tentative;
        }

        // more than one dictionary character was crossed: split the range
        // with the dictionary, which regenerates the cache
        divideUpDictionaryRange(rangeStart, tentative);
    }

    if (cachedBreakPositions == null) {
        return -9999; // SHOULD NEVER GET HERE!
    }

    // the cache exists (freshly built or pre-existing): advance one slot
    int boundary = cachedBreakPositions[++positionInCache];
    text.setIndex(boundary);
    return boundary;
}
/**
 * Looks up a character category for a character, counting dictionary
 * characters as a side effect.
 */
@Override
protected int lookupCategory(int c) {
    // The inherited lookupCategory() does the real work. This override only
    // exists to bump dictionaryCharCount whenever the resulting category is
    // one of those flagged in categoryFlags.
    int category = super.lookupCategory(c);
    boolean isDictionaryChar =
            category != RuleBasedBreakIterator.IGNORE && categoryFlags[category];
    if (isDictionaryChar) {
        dictionaryCharCount++;
    }
    return category;
}
/**
 * This is the function that actually implements the dictionary-based
 * algorithm. Given the endpoints of a range of text, it uses the
 * dictionary to determine the positions of any boundaries in this
 * range. It stores all the boundary positions it discovers in
 * cachedBreakPositions so that we only have to do this work once
 * for each time we enter the range.
 *
 * On return, cachedBreakPositions[0] is startPos, the last entry is endPos,
 * and positionInCache is reset to 0.
 *
 * @param startPos text index at which the range to divide begins
 * @param endPos text index at which the range to divide ends
 */
@SuppressWarnings("unchecked")
private void divideUpDictionaryRange(int startPos, int endPos) {
CharacterIterator text = getText();
// the range we're dividing may begin or end with non-dictionary characters
// (i.e., for line breaking, we may have leading or trailing punctuation
// that needs to be kept with the word). Seek from the beginning of the
// range to the first dictionary character
// NOTE(review): this loop has no explicit bound on endPos; it presumably
// relies on the caller only passing ranges that contain a dictionary
// character -- confirm against handleNext().
text.setIndex(startPos);
int c = getCurrent();
int category = lookupCategory(c);
while (category == IGNORE || !categoryFlags[category]) {
c = getNext();
category = lookupCategory(c);
}
// initialize. We maintain two stacks: currentBreakPositions contains
// the list of break positions that will be returned if we successfully
// finish traversing the whole range now. possibleBreakPositions lists
// all other possible word ends we've passed along the way. (Whenever
// we reach an error [a sequence of characters that can't begin any word
// in the dictionary], we back up, possibly delete some breaks from
// currentBreakPositions, move a break from possibleBreakPositions
// to currentBreakPositions, and start over from there. This process
// continues in this way until we either successfully make it all the way
// across the range, or exhaust all of our combinations of break
// positions.)
Stack<Integer> currentBreakPositions = new Stack<>();
Stack<Integer> possibleBreakPositions = new Stack<>();
List<Integer> wrongBreakPositions = new ArrayList<>();
// the dictionary is implemented as a trie, which is treated as a state
// machine. -1 represents the end of a legal word. Every word in the
// dictionary is represented by a path from the root node to -1. A path
// that ends in state 0 is an illegal combination of characters.
int state = 0;
// these two variables are used for error handling. We keep track of the
// farthest we've gotten through the range being divided, and the combination
// of breaks that got us that far. If we use up all possible break
// combinations, the text contains an error or a word that's not in the
// dictionary. In this case, we "bless" the break positions that got us the
// farthest as real break positions, and then start over from scratch with
// the character where the error occurred.
int farthestEndPoint = text.getIndex();
Stack<Integer> bestBreakPositions = null;
// initialize (we always exit the loop with a break statement)
c = getCurrent();
while (true) {
// if we can transition to state "-1" from our current state, we're
// on the last character of a legal word. Push that position onto
// the possible-break-positions stack
if (dictionary.getNextState(state, 0) == -1) {
possibleBreakPositions.push(text.getIndex());
}
// look up the new state to transition to in the dictionary
state = dictionary.getNextStateFromCharacter(state, c);
// if the character we're sitting on causes us to transition to
// the "end of word" state, then it was a non-dictionary character
// and we've successfully traversed the whole range. Drop out
// of the loop.
if (state == -1) {
currentBreakPositions.push(text.getIndex());
break;
}
// if the character we're sitting on causes us to transition to
// the error state, or if we've gone off the end of the range
// without transitioning to the "end of word" state, we've hit
// an error...
else if (state == 0 || text.getIndex() >= endPos) {
// if this is the farthest we've gotten, take note of it in
// case there's an error in the text
if (text.getIndex() > farthestEndPoint) {
farthestEndPoint = text.getIndex();
// snapshot the current breaks; the clone is needed because
// currentBreakPositions keeps being modified below
@SuppressWarnings("unchecked")
Stack<Integer> currentBreakPositionsCopy = (Stack<Integer>) currentBreakPositions.clone();
bestBreakPositions = currentBreakPositionsCopy;
}
// wrongBreakPositions is a list of all break positions
// we've tried starting that didn't allow us to traverse
// all the way through the text. Every time we pop a
// break position off of currentBreakPositions, we put it
// into wrongBreakPositions to avoid trying it again later.
// If we make it to this spot, we're either going to back
// up to a break in possibleBreakPositions and try starting
// over from there, or we've exhausted all possible break
// positions and are going to do the fallback procedure.
// This loop prevents us from messing with anything in
// possibleBreakPositions that didn't work as a starting
// point the last time we tried it (this is to prevent a bunch of
// repetitive checks from slowing down some extreme cases)
while (!possibleBreakPositions.isEmpty()
&& wrongBreakPositions.contains(possibleBreakPositions.peek())) {
possibleBreakPositions.pop();
}
// if we've used up all possible break-position combinations, there's
// an error or an unknown word in the text. In this case, we start
// over, treating the farthest character we've reached as the beginning
// of the range, and "blessing" the break positions that got us that
// far as real break positions
if (possibleBreakPositions.isEmpty()) {
if (bestBreakPositions != null) {
currentBreakPositions = bestBreakPositions;
if (farthestEndPoint < endPos) {
// resume one character past the farthest point reached
text.setIndex(farthestEndPoint + 1);
}
else {
break;
}
}
else {
// no best attempt was ever recorded: record a break at the
// current index (unless it is already on top of the stack or is
// startPos), then step one character forward and record a break
// there as well
if ((currentBreakPositions.size() == 0 ||
currentBreakPositions.peek().intValue() != text.getIndex())
&& text.getIndex() != startPos) {
currentBreakPositions.push(text.getIndex());
}
getNext();
currentBreakPositions.push(text.getIndex());
}
}
// if we still have more break positions we can try, then promote the
// last break in possibleBreakPositions into currentBreakPositions,
// and get rid of all entries in currentBreakPositions that come after
// it. Then back up to that position and start over from there (i.e.,
// treat that position as the beginning of a new word)
else {
Integer temp = possibleBreakPositions.pop();
Integer temp2 = null;
while (!currentBreakPositions.isEmpty() && temp.intValue() <
currentBreakPositions.peek().intValue()) {
temp2 = currentBreakPositions.pop();
wrongBreakPositions.add(temp2);
}
currentBreakPositions.push(temp);
text.setIndex(currentBreakPositions.peek().intValue());
}
// re-sync "c" for the next go-round, and drop out of the loop if
// we've made it off the end of the range
c = getCurrent();
if (text.getIndex() >= endPos) {
break;
}
}
// if we didn't hit any exceptional conditions on this last iteration,
// just advance to the next character and loop
else {
c = getNext();
}
}
// dump the last break position in the list, and replace it with the actual
// end of the range (which may be the same character, or may be further on
// because the range actually ended with non-dictionary characters we want to
// keep with the word)
if (!currentBreakPositions.isEmpty()) {
currentBreakPositions.pop();
}
currentBreakPositions.push(endPos);
// create a regular array to hold the break positions and copy
// the break positions from the stack to the array (in addition,
// our starting position goes into this array as a break position).
// This array becomes the cache of break positions used by next()
// and previous(), so this is where we actually refresh the cache.
cachedBreakPositions = new int[currentBreakPositions.size() + 1];
cachedBreakPositions[0] = startPos;
for (int i = 0; i < currentBreakPositions.size(); i++) {
cachedBreakPositions[i + 1] = currentBreakPositions.elementAt(i).intValue();
}
// the iterator is logically sitting on startPos, i.e. cache slot 0
positionInCache = 0;
}
}

View file

@ -0,0 +1,271 @@
/*
* Copyright (c) 1998, 2011, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
* (C) Copyright Taligent, Inc. 1996,1997 - All Rights Reserved
* (C) Copyright IBM Corp. 1996, 1997 - All Rights Reserved
*/
package sun.text;
/**
 * Simple internal class for doing hash mapping. Much, much faster than the
 * standard Hashtable for integer to integer mappings,
 * and doesn't require object creation.<br>
 * If a key is not found, the defaultValue is returned.
 * Note: the keys are limited to values above Integer.MIN_VALUE+1
 * (the two lowest int values are reserved as EMPTY/DELETED slot markers).<br>
 */
public final class IntHashtable implements Cloneable {

    /** Creates a table using the prime at index 3 of {@code PRIMES} as capacity. */
    public IntHashtable() {
        initialize(3);
    }

    /**
     * Creates a table whose capacity is chosen from {@code PRIMES} based on
     * {@code initialSize / HIGH_WATER_FACTOR}.
     *
     * @param initialSize the expected number of entries
     */
    public IntHashtable(int initialSize) {
        initialize(leastGreaterPrimeIndex((int) (initialSize / HIGH_WATER_FACTOR)));
    }

    /** Returns the number of keys currently stored. */
    public int size() {
        return count;
    }

    /** Returns {@code true} when no keys are stored. */
    public boolean isEmpty() {
        return count == 0;
    }

    /**
     * Maps {@code key} to {@code value}, growing the table first when the
     * high-water mark has been exceeded.
     *
     * @throws IllegalArgumentException if {@code key} is one of the two
     *         reserved marker values
     */
    public void put(int key, int value) {
        if (count > highWaterMark) {
            rehash();
        }
        int index = find(key);
        if (keyList[index] <= MAX_UNUSED) {      // deleted or empty
            keyList[index] = key;
            ++count;
        }
        values[index] = value;                   // reset value
    }

    /**
     * Returns the value mapped to {@code key}, or the default value when the
     * key is absent.
     */
    public int get(int key) {
        return values[find(key)];
    }

    /** Removes {@code key} if present; the slot is marked DELETED. */
    public void remove(int key) {
        int index = find(key);
        if (keyList[index] > MAX_UNUSED) {       // neither deleted nor empty
            keyList[index] = DELETED;            // set to deleted
            values[index] = defaultValue;        // set to default
            --count;
            if (count < lowWaterMark) {
                rehash();
            }
        }
    }

    /** Returns the value reported by {@link #get} for absent keys. */
    public int getDefaultValue() {
        return defaultValue;
    }

    /**
     * Changes the default value. The table is rehashed so that every unused
     * slot carries the new default.
     */
    public void setDefaultValue(int newValue) {
        defaultValue = newValue;
        rehash();
    }

    /**
     * Two tables are equal when they have the same class, size and default
     * value, and every key of this table yields the same value in the other.
     * A {@code null} argument compares unequal instead of throwing.
     */
    @Override
    public boolean equals(Object that) {
        if (that == this) {
            return true;
        }
        if (that == null || that.getClass() != this.getClass()) {
            return false;
        }

        IntHashtable other = (IntHashtable) that;
        if (other.size() != count || other.defaultValue != defaultValue) {
            return false;
        }
        for (int i = 0; i < keyList.length; ++i) {
            int key = keyList[i];
            if (key > MAX_UNUSED && other.get(key) != values[i]) {
                return false;
            }
        }
        return true;
    }

    @Override
    public int hashCode() {
        // NOTE:  This function isn't actually used anywhere in this package, but it's here
        // in case this class is ever used to make sure we uphold the invariants about
        // hashCode() and equals()

        // WARNING:  This function hasn't undergone rigorous testing to make sure it actually
        // gives good distribution.  We've eyeballed the results, and they appear okay, but
        // you copy this algorithm (or these seed and multiplier values) at your own risk.
        //                                        --rtg 8/17/99

        // NOTE(review): every slot is folded in, including EMPTY/DELETED ones, so
        // the result depends on the physical table layout; two tables that compare
        // equal but differ in capacity or deletion history may hash differently.

        int result = 465;              // an arbitrary seed value
        int scrambler = 1362796821;    // an arbitrary multiplier.
        for (int i = 0; i < keyList.length; ++i) {
            // this line just scrambles the bits as each value is added into the
            // hash value.  This helps to make sure we affect all the bits and that
            // the same values in a different order will produce a different hash value
            result = result * scrambler + 1;
            result += keyList[i];
        }
        for (int i = 0; i < values.length; ++i) {
            result = result * scrambler + 1;
            result += values[i];
        }
        return result;
    }

    /**
     * Returns an independent copy of this table. The copy receives its own
     * key and value arrays; the receiver is left untouched.
     */
    @Override
    public Object clone() throws CloneNotSupportedException {
        IntHashtable result = (IntHashtable) super.clone();
        result.values = values.clone();
        result.keyList = keyList.clone();
        return result;
    }

    // =======================PRIVATES============================

    private int defaultValue = 0;

    // the tables have to have prime-number lengths. Rather than compute
    // primes, we just keep a table, with the current index we are using.
    private int primeIndex;

    // highWaterFactor determines the maximum number of elements before
    // a rehash. Can be tuned for different performance/storage characteristics.
    private static final float HIGH_WATER_FACTOR = 0.4F;
    private int highWaterMark;

    // lowWaterFactor determines the minimum number of elements before
    // a rehash. Can be tuned for different performance/storage characteristics.
    private static final float LOW_WATER_FACTOR = 0.0F;
    private int lowWaterMark;

    private int count;

    // we use two arrays to minimize allocations
    private int[] values;
    private int[] keyList;

    private static final int EMPTY      = Integer.MIN_VALUE;
    private static final int DELETED    = EMPTY + 1;
    private static final int MAX_UNUSED = DELETED;

    // Allocates fresh, empty arrays for the given PRIMES index (clamped to the
    // valid range) and recomputes the water marks.
    private void initialize(int primeIndex) {
        if (primeIndex < 0) {
            primeIndex = 0;
        } else if (primeIndex >= PRIMES.length) {
            System.out.println("TOO BIG");
            primeIndex = PRIMES.length - 1;
            // throw new java.util.IllegalArgumentError();
        }
        this.primeIndex = primeIndex;
        int initialSize = PRIMES[primeIndex];
        values = new int[initialSize];
        keyList = new int[initialSize];
        for (int i = 0; i < initialSize; ++i) {
            keyList[i] = EMPTY;
            values[i] = defaultValue;
        }
        count = 0;
        lowWaterMark = (int) (initialSize * LOW_WATER_FACTOR);
        highWaterMark = (int) (initialSize * HIGH_WATER_FACTOR);
    }

    // Re-inserts all live entries into freshly initialized arrays, growing or
    // shrinking the capacity according to the water marks.
    private void rehash() {
        int[] oldValues = values;
        int[] oldkeyList = keyList;
        int newPrimeIndex = primeIndex;
        if (count > highWaterMark) {
            ++newPrimeIndex;
        } else if (count < lowWaterMark) {
            newPrimeIndex -= 2;
        }
        initialize(newPrimeIndex);
        for (int i = oldValues.length - 1; i >= 0; --i) {
            int key = oldkeyList[i];
            if (key > MAX_UNUSED) {
                putInternal(key, oldValues[i]);
            }
        }
    }

    /**
     * Inserts a mapping without checking the high-water mark (used while
     * rehashing). A DELETED slot is treated as free, exactly as in
     * {@link #put}, so the key is stored and counted.
     */
    public void putInternal(int key, int value) {
        int index = find(key);
        if (keyList[index] <= MAX_UNUSED) {      // deleted or empty
            keyList[index] = key;
            ++count;
        }
        values[index] = value;                   // reset value
    }

    // Returns the slot holding "key", or the slot where it should be inserted
    // (the first DELETED slot met on the probe path, else the terminating
    // EMPTY slot).
    private int find(int key) {
        if (key <= MAX_UNUSED) {
            throw new IllegalArgumentException(
                    "key must be greater than Integer.MIN_VALUE + 1");
        }
        int firstDeleted = -1;  // assume invalid index
        int index = (key ^ 0x4000000) % keyList.length;
        if (index < 0) index = -index; // positive only
        int jump = 0; // lazy evaluate
        while (true) {
            int tableHash = keyList[index];
            if (tableHash == key) {                 // quick check
                return index;
            } else if (tableHash > MAX_UNUSED) {    // neither correct nor unused
                // ignore
            } else if (tableHash == EMPTY) {        // empty, end o' the line
                if (firstDeleted >= 0) {
                    index = firstDeleted;           // reset if had deleted slot
                }
                return index;
            } else if (firstDeleted < 0) {          // remember first deleted
                firstDeleted = index;
            }
            if (jump == 0) {                        // lazy compute jump
                jump = (key % (keyList.length - 1));
                if (jump < 0) jump = -jump;
                ++jump;
            }

            index = (index + jump) % keyList.length;
            if (index == firstDeleted) {
                // We've searched all entries for the given key.
                return index;
            }
        }
    }

    // Returns the index just below the first prime greater than "source"
    // (0 when "source" is smaller than every prime in the table).
    private static int leastGreaterPrimeIndex(int source) {
        int i;
        for (i = 0; i < PRIMES.length; ++i) {
            if (source < PRIMES[i]) {
                break;
            }
        }
        return (i == 0) ? 0 : (i - 1);
    }

    // This list is the result of buildList below. Can be tuned for different
    // performance/storage characteristics.
    private static final int[] PRIMES = {
        17, 37, 67, 131, 257,
        521, 1031, 2053, 4099, 8209, 16411, 32771, 65537,
        131101, 262147, 524309, 1048583, 2097169, 4194319, 8388617, 16777259,
        33554467, 67108879, 134217757, 268435459, 536870923, 1073741827, 2147483647
    };
}

View file

@ -0,0 +1,98 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package sun.text;
import sun.text.normalizer.NormalizerBase;
import sun.text.normalizer.UCharacter;
/**
 * This Normalizer is for Unicode 3.2 support for IDNA only.
 * Developers should not use this class.
 *
 * @since 1.6
 */
public final class Normalizer {

    /** Not instantiable: this class only exposes static helpers. */
    private Normalizer() {
    }

    /**
     * Option to select Unicode 3.2 (without corrigendum 4 corrections) for
     * normalization.
     */
    public static final int UNICODE_3_2 = NormalizerBase.UNICODE_3_2_0_ORIGINAL;

    /**
     * Normalize a sequence of char values.
     * The sequence will be normalized according to the specified normalization
     * form.
     * @param src        The sequence of char values to normalize.
     * @param form       The normalization form; one of
     *                   {@link java.text.Normalizer.Form#NFC},
     *                   {@link java.text.Normalizer.Form#NFD},
     *                   {@link java.text.Normalizer.Form#NFKC},
     *                   {@link java.text.Normalizer.Form#NFKD}
     * @param option     The normalization option;
     *                   {@link sun.text.Normalizer#UNICODE_3_2}
     * @return The normalized String
     * @throws NullPointerException If <code>src</code> or <code>form</code>
     * is null.
     */
    public static String normalize(CharSequence src,
                                   java.text.Normalizer.Form form,
                                   int option) {
        final String text = src.toString();
        return NormalizerBase.normalize(text, form, option);
    }

    /**
     * Determines if the given sequence of char values is normalized.
     * @param src        The sequence of char values to be checked.
     * @param form       The normalization form; one of
     *                   {@link java.text.Normalizer.Form#NFC},
     *                   {@link java.text.Normalizer.Form#NFD},
     *                   {@link java.text.Normalizer.Form#NFKC},
     *                   {@link java.text.Normalizer.Form#NFKD}
     * @param option     The normalization option;
     *                   {@link sun.text.Normalizer#UNICODE_3_2}
     * @return true if the sequence of char values is normalized;
     * false otherwise.
     * @throws NullPointerException If <code>src</code> or <code>form</code>
     * is null.
     */
    public static boolean isNormalized(CharSequence src,
                                       java.text.Normalizer.Form form,
                                       int option) {
        final String text = src.toString();
        return NormalizerBase.isNormalized(text, form, option);
    }

    /**
     * Returns the combining class of the given character
     * @param ch character to retrieve combining class of
     * @return combining class of the given character
     */
    public static final int getCombiningClass(int ch) {
        final int combiningClass = UCharacter.getCombiningClass(ch);
        return combiningClass;
    }
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,99 @@
/*
* Copyright (c) 2003, 2012, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package sun.text;
/**
 * SupplementaryCharacterData is an SMI-private class which was written for
 * RuleBasedBreakIterator and BreakDictionary.
 */
public final class SupplementaryCharacterData implements Cloneable {

    /**
     * A token used as a character-category value to identify ignore characters
     */
    private static final byte IGNORE = -1;

    /**
     * An array for supplementary characters and values.
     * The low byte of each entry holds a byte-value; the upper three bytes
     * hold the first supplementary character that has this value. The value
     * stays valid for all following supplementary characters up to (but not
     * including) the character stored in the next entry of
     * <code>dataTable</code>.
     * For example, if the value of <code>dataTable[2]</code> is
     * <code>0x01000123</code> and the value of <code>dataTable[3]</code> is
     * <code>0x01000567</code>, supplementary characters from
     * <code>0x10001</code> to <code>0x10004</code> have the value
     * <code>0x23</code>. And, <code>getValue(0x10003)</code> returns the value.
     */
    private int[] dataTable;

    /**
     * Creates a new SupplementaryCharacterData object with the given table.
     * The array is stored as-is, without copying.
     */
    public SupplementaryCharacterData(int[] table) {
        dataTable = table;
    }

    /**
     * Returns a corresponding value for the given supplementary code-point,
     * located by bisecting <code>dataTable</code>. A stored byte of
     * <code>0xFF</code> is reported as the IGNORE token (-1).
     */
    public int getValue(int index) {
        // Index should be a valid supplementary character.
        assert index >= Character.MIN_SUPPLEMENTARY_CODE_POINT &&
               index <= Character.MAX_CODE_POINT :
               "Invalid code point:" + Integer.toHexString(index);

        int low = 0;
        int high = dataTable.length - 1;

        while (true) {
            int mid = (low + high) / 2;

            // entry "mid" covers code points [rangeStart, rangeEnd)
            int rangeStart = dataTable[mid] >> 8;
            int rangeEnd = dataTable[mid + 1] >> 8;

            if (index < rangeStart) {
                high = mid;
                continue;
            }
            if (index >= rangeEnd) {
                low = mid;
                continue;
            }

            int stored = dataTable[mid] & 0xFF;
            if (stored == 0xFF) {
                return IGNORE;
            }
            return stored;
        }
    }

    /**
     * Returns the data array (the internal array itself, not a copy).
     */
    public int[] getArray() {
        return dataTable;
    }
}

View file

@ -0,0 +1,205 @@
/*
* Copyright (c) 2003, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package sun.text;
public final class UCompactIntArray implements Cloneable {
/**
* Default constructor for UCompactIntArray, the default value of the
* compact array is 0.
*/
public UCompactIntArray() {
values = new int[16][];
indices = new short[16][];
blockTouched = new boolean[16][];
planeTouched = new boolean[16];
}
public UCompactIntArray(int defaultValue) {
this();
this.defaultValue = defaultValue;
}
/**
* Get the mapped value of a Unicode character.
* @param index the character to get the mapped value with
* @return the mapped value of the given character
*/
public int elementAt(int index) {
int plane = (index & PLANEMASK) >> PLANESHIFT;
if (!planeTouched[plane]) {
return defaultValue;
}
index &= CODEPOINTMASK;
return values[plane][(indices[plane][index >> BLOCKSHIFT] & 0xFFFF)
+ (index & BLOCKMASK)];
}
/**
* Set a new value for a Unicode character.
* Set automatically expands the array if it is compacted.
* @param index the character to set the mapped value with
* @param value the new mapped value
*/
public void setElementAt(int index, int value) {
if (isCompact) {
expand();
}
int plane = (index & PLANEMASK) >> PLANESHIFT;
if (!planeTouched[plane]) {
initPlane(plane);
}
index &= CODEPOINTMASK;
values[plane][index] = value;
blockTouched[plane][index >> BLOCKSHIFT] = true;
}
/**
* Compact the array.
*/
public void compact() {
if (isCompact) {
return;
}
for (int plane = 0; plane < PLANECOUNT; plane++) {
if (!planeTouched[plane]) {
continue;
}
int limitCompacted = 0;
int iBlockStart = 0;
short iUntouched = -1;
for (int i = 0; i < indices[plane].length; ++i, iBlockStart += BLOCKCOUNT) {
indices[plane][i] = -1;
if (!blockTouched[plane][i] && iUntouched != -1) {
// If no values in this block were set, we can just set its
// index to be the same as some other block with no values
// set, assuming we've seen one yet.
indices[plane][i] = iUntouched;
} else {
int jBlockStart = limitCompacted * BLOCKCOUNT;
if (i > limitCompacted) {
System.arraycopy(values[plane], iBlockStart,
values[plane], jBlockStart, BLOCKCOUNT);
}
if (!blockTouched[plane][i]) {
// If this is the first untouched block we've seen, remember it.
iUntouched = (short)jBlockStart;
}
indices[plane][i] = (short)jBlockStart;
limitCompacted++;
}
}
// we are done compacting, so now make the array shorter
int newSize = limitCompacted * BLOCKCOUNT;
int[] result = new int[newSize];
System.arraycopy(values[plane], 0, result, 0, newSize);
values[plane] = result;
blockTouched[plane] = null;
}
isCompact = true;
}
// --------------------------------------------------------------
// private
// --------------------------------------------------------------
/**
* Expanded takes the array back to a 0x10ffff element array
*/
private void expand() {
int i;
if (isCompact) {
int[] tempArray;
for (int plane = 0; plane < PLANECOUNT; plane++) {
if (!planeTouched[plane]) {
continue;
}
blockTouched[plane] = new boolean[INDEXCOUNT];
tempArray = new int[UNICODECOUNT];
for (i = 0; i < UNICODECOUNT; ++i) {
tempArray[i] = values[plane][indices[plane][i >> BLOCKSHIFT]
& 0xffff + (i & BLOCKMASK)];
blockTouched[plane][i >> BLOCKSHIFT] = true;
}
for (i = 0; i < INDEXCOUNT; ++i) {
indices[plane][i] = (short)(i<<BLOCKSHIFT);
}
values[plane] = tempArray;
}
isCompact = false;
}
}
private void initPlane(int plane) {
values[plane] = new int[UNICODECOUNT];
indices[plane] = new short[INDEXCOUNT];
blockTouched[plane] = new boolean[INDEXCOUNT];
planeTouched[plane] = true;
if (planeTouched[0] && plane != 0) {
System.arraycopy(indices[0], 0, indices[plane], 0, INDEXCOUNT);
} else {
for (int i = 0; i < INDEXCOUNT; ++i) {
indices[plane][i] = (short)(i<<BLOCKSHIFT);
}
}
for (int i = 0; i < UNICODECOUNT; ++i) {
values[plane][i] = defaultValue;
}
}
/**
 * Returns the storage used by all touched planes, in units of 1024
 * bytes (integer division). Each value counts as 4 bytes and each
 * index entry as 2 bytes.
 *
 * @return the accumulated byte count divided by 1024
 */
public int getKSize() {
    int byteCount = 0;
    for (int plane = 0; plane < PLANECOUNT; plane++) {
        if (!planeTouched[plane]) {
            continue;
        }
        byteCount += values[plane].length * 4 + indices[plane].length * 2;
    }
    return byteCount / 1024;
}
// PLANEMASK, PLANESHIFT and CODEPOINTMASK are not used in the methods
// visible here; presumably they split a code point into its plane number
// and its 16-bit offset inside the plane -- confirm against the accessors.
// NOTE(review): 0x30000 only selects planes 0-3 while PLANECOUNT is 16;
// verify that this is intended.
private static final int PLANEMASK = 0x30000;
private static final int PLANESHIFT = 16;
// Number of planes iterated over by expand() and getKSize().
private static final int PLANECOUNT = 0x10;
private static final int CODEPOINTMASK = 0xffff;
// Number of value slots in one expanded plane.
private static final int UNICODECOUNT = 0x10000;
// A block holds 1 << BLOCKSHIFT consecutive values.
private static final int BLOCKSHIFT = 7;
// Number of values in one block (128).
private static final int BLOCKCOUNT = (1<<BLOCKSHIFT);
private static final int INDEXSHIFT = (16-BLOCKSHIFT);
// Number of blocks, i.e. index entries, per plane (512).
private static final int INDEXCOUNT = (1<<INDEXSHIFT);
// Extracts the offset of a value inside its block.
private static final int BLOCKMASK = BLOCKCOUNT - 1;
// Value that every slot of a newly initialized plane is filled with.
private int defaultValue;
// values[plane] holds the value slots of a plane (flat or compacted).
private int values[][];
// indices[plane][block] is the start offset of the block inside
// values[plane]; read as an unsigned 16-bit number.
private short indices[][];
// true after compaction, reset to false by expand().
private boolean isCompact;
// blockTouched[plane][block]; set to null for a plane when it is
// compacted and re-created by expand() and initPlane().
private boolean[][] blockTouched;
// planeTouched[plane] is set to true by initPlane(); untouched planes
// are skipped by expand() and getKSize().
private boolean[] planeTouched;
};

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,835 @@
/*
* Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 2001-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
/* Written by Simon Montagu, Matitiahu Allouche
* (ported from C code written by Markus W. Scherer)
*/
package sun.text.bidi;
import java.text.Bidi;
import java.util.Arrays;
final class BidiLine {
/*
* General remarks about the functions in this file:
*
* These functions deal with the aspects of potentially mixed-directional
* text in a single paragraph or in a line of a single paragraph
* which has already been processed according to
* the Unicode 3.0 Bidi algorithm as defined in
* http://www.unicode.org/unicode/reports/tr9/ , version 13,
* also described in The Unicode Standard, Version 4.0.1 .
*
* This means that there is a Bidi object with a levels
* and a dirProps array.
* paraLevel and direction are also set.
* Only if the length of the text is zero, then levels==dirProps==NULL.
*
* The overall directionality of the paragraph
* or line is used to bypass the reordering steps if possible.
* Even purely RTL text does not need reordering there because
* the getLogical/VisualIndex() methods can compute the
* index on the fly in such a case.
*
* The implementation of the access to same-level-runs and of the reordering
* do attempt to provide better performance and less memory usage compared to
* a direct implementation of especially rule (L2) with an array of
* one (32-bit) integer per text character.
*
* Here, the levels array is scanned as soon as necessary, and a vector of
* same-level-runs is created. Reordering then is done on this vector.
* For each run of text positions that were resolved to the same level,
* only 8 bytes are stored: the first text position of the run and the visual
* position behind the run after reordering.
* One sign bit is used to hold the directionality of the run.
* This is inefficient if there are many very short runs. If the average run
* length is <2, then this uses more memory.
*
* In a further attempt to save memory, the levels array is never changed
* after all the resolution rules (Xn, Wn, Nn, In).
* Many methods have to consider the field trailingWSStart:
* if it is less than length, then there is an implicit trailing run
* at the paraLevel,
* which is not reflected in the levels array.
* This allows a line Bidi object to use the same levels array as
* its paragraph parent object.
*
* When a Bidi object is created for a line of a paragraph, then the
* paragraph's levels and dirProps arrays are reused by way of setting
* a pointer into them, not by copying. This again saves memory and forbids to
* change the now shared levels for (L1).
*/
/* handle trailing WS (L1) -------------------------------------------------- */
/**
 * Sets {@code bidiBase.trailingWSStart}, the start index of the trailing
 * run of WS in the line. This is necessary because the paragraph's levels
 * array is not modified. Using trailingWSStart is another form of
 * performing (L1).
 *
 * <p>To make subsequent operations easier, the run before the WS is also
 * included if it is at the paraLevel - the two are merged here.
 *
 * <p>This method is called only from setLine(), so paraLevel is set
 * correctly for the line even when contextual multiple paragraphs.
 *
 * <p>For a zero-length line the result is 0.
 *
 * @param bidiBase the line object; its dirProps, levels, length and
 *                 paraLevel are read and its trailingWSStart is written
 */
static void setTrailingWSStart(BidiBase bidiBase)
{
    byte[] dirProps = bidiBase.dirProps;
    byte[] levels = bidiBase.levels;
    int start = bidiBase.length;
    byte paraLevel = bidiBase.paraLevel;

    /* If the line is terminated by a block separator, all preceding WS etc...
       are already set to paragraph level.
       Setting trailingWSStart to pBidi->length will avoid changing the
       level of B chars from 0 to paraLevel in getLevels when
       orderParagraphsLTR==TRUE
     */
    /* start > 0 keeps an empty line from indexing dirProps[-1]; the two
       loops below already carry the same guard */
    if (start > 0 && dirProps[start - 1] == BidiBase.B) {
        bidiBase.trailingWSStart = start;   /* currently == bidiBase.length */
        return;
    }

    /* go backwards across all WS, BN, explicit codes */
    while (start > 0 &&
            (BidiBase.DirPropFlag(dirProps[start - 1]) & BidiBase.MASK_WS) != 0) {
        --start;
    }

    /* if the WS run can be merged with the previous run then do so here */
    while (start > 0 && levels[start - 1] == paraLevel) {
        --start;
    }

    bidiBase.trailingWSStart = start;
}
/**
 * Initializes {@code lineBidi} to describe the line {@code [start, limit)}
 * of the paragraph held by {@code paraBidi}.
 *
 * <p>The text, dirProps and levels of that range are copied into
 * {@code lineBidi}; reordering mode, reordering options and paraCount are
 * taken over from the paragraph. Bidi control characters inside the range
 * are counted and subtracted from resultLength. Finally trailingWSStart,
 * direction and (for a non-mixed line) paraLevel are derived for the line.
 *
 * @param paraBidi the paragraph object the line is cut from
 * @param newBidi  returned as is; not otherwise used by this method
 * @param lineBidi the object to fill in
 * @param start    index of the first character of the line in the paragraph
 * @param limit    index just behind the last character of the line
 * @return {@code newBidi}
 */
static Bidi setLine(BidiBase paraBidi,
Bidi newBidi, BidiBase lineBidi,
int start, int limit) {
int length;
/* set the values in lineBidi from its paraBidi parent */
/* class members are already initialized to 0 */
// lineBidi.paraBidi = null; /* mark unfinished setLine */
// lineBidi.flags = 0;
// lineBidi.controlCount = 0;
length = lineBidi.length = lineBidi.originalLength =
lineBidi.resultLength = limit - start;
lineBidi.text = new char[length];
System.arraycopy(paraBidi.text, start, lineBidi.text, 0, length);
lineBidi.paraLevel = paraBidi.GetParaLevelAt(start);
lineBidi.paraCount = paraBidi.paraCount;
/* runCount is set to -1 below, so getRuns() will compute the runs later */
lineBidi.runs = new BidiRun[0];
lineBidi.reorderingMode = paraBidi.reorderingMode;
lineBidi.reorderingOptions = paraBidi.reorderingOptions;
/* only scan the line for controls if the paragraph contains any */
if (paraBidi.controlCount > 0) {
int j;
for (j = start; j < limit; j++) {
if (BidiBase.IsBidiControlChar(paraBidi.text[j])) {
lineBidi.controlCount++;
}
}
lineBidi.resultLength -= lineBidi.controlCount;
}
/* copy proper subset of DirProps */
lineBidi.getDirPropsMemory(length);
lineBidi.dirProps = lineBidi.dirPropsMemory;
System.arraycopy(paraBidi.dirProps, start, lineBidi.dirProps, 0,
length);
/* copy proper subset of Levels */
lineBidi.getLevelsMemory(length);
lineBidi.levels = lineBidi.levelsMemory;
System.arraycopy(paraBidi.levels, start, lineBidi.levels, 0,
length);
lineBidi.runCount = -1;
if (paraBidi.direction != BidiBase.MIXED) {
/* the parent is already trivial */
lineBidi.direction = paraBidi.direction;
/*
* The parent's levels are all either
* implicitly or explicitly ==paraLevel;
* do the same here.
*/
/* translate the parent's trailingWSStart into line coordinates,
   clamped to [0, length] */
if (paraBidi.trailingWSStart <= start) {
lineBidi.trailingWSStart = 0;
} else if (paraBidi.trailingWSStart < limit) {
lineBidi.trailingWSStart = paraBidi.trailingWSStart - start;
} else {
lineBidi.trailingWSStart = length;
}
} else {
byte[] levels = lineBidi.levels;
int i, trailingWSStart;
byte level;
setTrailingWSStart(lineBidi);
trailingWSStart = lineBidi.trailingWSStart;
/* recalculate lineBidi.direction */
if (trailingWSStart == 0) {
/* all levels are at paraLevel */
lineBidi.direction = (byte)(lineBidi.paraLevel & 1);
} else {
/* get the level of the first character */
/* only the low bit is kept: 0 for an even level, 1 for an odd level */
level = (byte)(levels[0] & 1);
/* if there is anything of a different level, then the line
is mixed */
if (trailingWSStart < length &&
(lineBidi.paraLevel & 1) != level) {
/* the trailing WS is at paraLevel, which differs from
levels[0] */
lineBidi.direction = BidiBase.MIXED;
} else {
/* see if levels[1..trailingWSStart-1] have the same
direction as levels[0] and paraLevel */
/* the loop ends at i == trailingWSStart at the latest;
   trailingWSStart >= 1 in this branch */
for (i = 1; ; i++) {
if (i == trailingWSStart) {
/* the direction values match those in level */
lineBidi.direction = level;
break;
} else if ((levels[i] & 1) != level) {
lineBidi.direction = BidiBase.MIXED;
break;
}
}
}
}
switch(lineBidi.direction) {
case Bidi.DIRECTION_LEFT_TO_RIGHT:
/* make sure paraLevel is even */
/* an odd paraLevel is rounded up to the next even value */
lineBidi.paraLevel = (byte)
((lineBidi.paraLevel + 1) & ~1);
/* all levels are implicitly at paraLevel (important for
getLevels()) */
lineBidi.trailingWSStart = 0;
break;
case Bidi.DIRECTION_RIGHT_TO_LEFT:
/* make sure paraLevel is odd */
lineBidi.paraLevel |= 1;
/* all levels are implicitly at paraLevel (important for
getLevels()) */
lineBidi.trailingWSStart = 0;
break;
default:
/* mixed line: paraLevel and trailingWSStart stay as computed */
break;
}
}
lineBidi.paraBidi = paraBidi; /* mark successful setLine */
return newBidi;
}
/**
 * Returns the level of the character at {@code charIndex}: the stored
 * level when the object is of mixed direction and the index lies before
 * the trailing WS run, the paragraph level at that index otherwise.
 *
 * @param bidiBase  the Bidi object to query
 * @param charIndex index of the character
 * @return the level for that character
 */
static byte getLevelAt(BidiBase bidiBase, int charIndex)
{
    final boolean useStoredLevel =
            bidiBase.direction == BidiBase.MIXED
            && charIndex < bidiBase.trailingWSStart;
    /* in the trailing WS run, or in a non-mixed object, the level is the
       paraLevel */
    return useStoredLevel
            ? bidiBase.levels[charIndex]
            : bidiBase.GetParaLevelAt(charIndex);
}
/**
 * Returns the levels of all {@code bidiBase.length} characters.
 *
 * <p>If there is an implicit trailing WS run (trailingWSStart != length),
 * the levels of that run are first written into {@code bidiBase.levels}
 * as paraLevel and trailingWSStart is moved to length. The stored array
 * is returned directly unless it is longer than the text, in which case a
 * copy truncated to the text length is returned.
 *
 * @param bidiBase the Bidi object to query
 * @return an array with one level per character
 */
static byte[] getLevels(BidiBase bidiBase)
{
    final int textLength = bidiBase.length;
    final int wsStart = bidiBase.trailingWSStart;

    if (wsStart != textLength) {
        /* the levels array does not reflect the WS run yet: make it
           explicit. bidiBase.paraLevel is ok even if contextual multiple
           paragraphs, since bidiBase is a line object */
        Arrays.fill(bidiBase.levels, wsStart, textLength, bidiBase.paraLevel);
        /* from now on the levels array reflects the WS run */
        bidiBase.trailingWSStart = textLength;
    }

    final byte[] stored = bidiBase.levels;
    if (textLength >= stored.length) {
        return stored;
    }
    /* the backing array is larger than the text: hand out a trimmed copy */
    return Arrays.copyOf(stored, textLength);
}
/**
 * Creates a new BidiRun describing the run at position {@code runIndex}
 * of {@code bidiBase.runs}. In the runs array, {@code limit} is a
 * cumulative visual limit; in the returned object {@code limit} is
 * {@code start} plus the length of the run.
 *
 * @param bidiBase the Bidi object whose runs array is read
 * @param runIndex index into the runs array
 * @return a new BidiRun with the run's start, start + length, and level
 */
static BidiRun getVisualRun(BidiBase bidiBase, int runIndex) {
    final BidiRun run = bidiBase.runs[runIndex];

    /* length = this visual limit minus the previous visual limit */
    int runLength = run.limit;
    if (runIndex > 0) {
        runLength -= bidiBase.runs[runIndex - 1].limit;
    }
    return new BidiRun(run.start, run.start + runLength, run.level);
}
/*
 * Trivial case, called by getRuns(): the whole text forms one run.
 * Points bidiBase.runs at bidiBase.simpleRuns, sets runCount to 1 and
 * stores a run covering [0, length) at the given level in slot 0.
 */
private static void getSingleRun(BidiBase bidiBase, byte level) {
    final BidiRun[] singleRunArray = bidiBase.simpleRuns;
    bidiBase.runs = singleRunArray;
    bidiBase.runCount = 1;

    /* the single run needs no reordering */
    singleRunArray[0] = new BidiRun(0, bidiBase.length, level);
}
/* reorder the runs array (L2) ---------------------------------------------- */
/*
* Reorder the same-level runs in the runs array.
* Here, runCount>1 and maxLevel>=minLevel>=paraLevel.
* All the visualStart fields=logical start before reordering.
* The "odd" bits are not set yet.
*
* Reordering with this data structure lends itself to some handy shortcuts:
*
* Since each run is moved but not modified, and since at the initial maxLevel
* each sequence of same-level runs consists of only one run each, we
* don't need to do anything there and can predecrement maxLevel.
* In many simple cases, the reordering is thus done entirely in the
* index mapping.
* Also, reordering occurs only down to the lowest odd level that occurs,
* which is minLevel|1. However, if the lowest level itself is odd, then
* in the last reordering the sequence of the runs at this level or higher
* will be all runs, and we don't need the elaborate loop to search for them.
* This is covered by ++minLevel instead of minLevel|=1 followed
* by an extra reorder-all after the reorder-some loop.
* About a trailing WS run:
* Such a run would need special treatment because its level is not
* reflected in levels[] if this is not a paragraph object.
* Instead, all characters from trailingWSStart on are implicitly at
* paraLevel.
* However, for all maxLevel>paraLevel, this run will never be reordered
* and does not need to be taken into account. maxLevel==paraLevel is only reordered
* if minLevel==paraLevel is odd, which is done in the extra segment.
* This means that for the main reordering loop we don't need to consider
* this run and can --runCount. If it is later part of the all-runs
* reordering, then runCount is adjusted accordingly.
*/
/**
 * Reorders {@code bidiBase.runs} in place.
 *
 * <p>For each level from {@code maxLevel - 1} down to
 * {@code minLevel + 1}, every maximal sequence of adjacent runs whose
 * level is at least that level is reversed. If {@code minLevel} is odd,
 * the complete runs array is reversed afterwards. The levels are looked
 * up in {@code bidiBase.levels} at each run's start; the run objects
 * themselves are only moved, never modified.
 *
 * @param bidiBase object whose runs, levels, runCount, trailingWSStart
 *                 and length are read
 * @param minLevel lowest level occurring in the runs
 * @param maxLevel highest level occurring in the runs
 */
private static void reorderLine(BidiBase bidiBase, byte minLevel, byte maxLevel) {
/* nothing to do? */
if (maxLevel<=(minLevel|1)) {
return;
}
BidiRun[] runs;
BidiRun tempRun;
byte[] levels;
int firstRun, endRun, limitRun, runCount;
/*
* Reorder only down to the lowest odd level
* and reorder at an odd minLevel in a separate, simpler loop.
* See comments above for why minLevel is always incremented.
*/
++minLevel;
runs = bidiBase.runs;
levels = bidiBase.levels;
runCount = bidiBase.runCount;
/* do not include the WS run at paraLevel<=old minLevel except in the simple loop */
if (bidiBase.trailingWSStart < bidiBase.length) {
--runCount;
}
/* maxLevel is decremented before its first use: the topmost level is
   never processed by this loop (see the comment above the method) */
while (--maxLevel >= minLevel) {
firstRun = 0;
/* loop for all sequences of runs */
for ( ; ; ) {
/* look for a sequence of runs that are all at >=maxLevel */
/* look for the first run of such a sequence */
while (firstRun < runCount && levels[runs[firstRun].start] < maxLevel) {
++firstRun;
}
if (firstRun >= runCount) {
break; /* no more such runs */
}
/* look for the limit run of such a sequence (the run behind it) */
for (limitRun = firstRun; ++limitRun < runCount &&
levels[runs[limitRun].start]>=maxLevel; ) {}
/* Swap the entire sequence of runs from firstRun to limitRun-1. */
endRun = limitRun - 1;
while (firstRun < endRun) {
tempRun = runs[firstRun];
runs[firstRun] = runs[endRun];
runs[endRun] = tempRun;
++firstRun;
--endRun;
}
if (limitRun == runCount) {
break; /* no more such runs */
} else {
/* the run at limitRun is below maxLevel, so it can be skipped */
firstRun = limitRun + 1;
}
}
}
/* now do maxLevel==old minLevel (==odd!), see above */
/* minLevel was incremented above, so "even now" means "odd originally" */
if ((minLevel & 1) == 0) {
firstRun = 0;
/* include the trailing WS run in this complete reordering */
/* runCount is turned into the index of the last run here: if there is a
   trailing WS run it was already decremented above, otherwise it is
   decremented now */
if (bidiBase.trailingWSStart == bidiBase.length) {
--runCount;
}
/* Swap the entire sequence of all runs. (endRun==runCount) */
while (firstRun < runCount) {
tempRun = runs[firstRun];
runs[firstRun] = runs[runCount];
runs[runCount] = tempRun;
++firstRun;
--runCount;
}
}
}
/* compute the runs array --------------------------------------------------- */
/**
 * Finds the run of {@code bidiBase.runs} that contains the given logical
 * index. The length of a run is the difference between its (cumulative)
 * limit and the limit of the run before it.
 *
 * @param bidiBase     the Bidi object whose runs are searched
 * @param logicalIndex logical position to look up
 * @return the index of the containing run in the runs array
 * @throws IllegalStateException if no run contains the index
 */
static int getRunFromLogicalIndex(BidiBase bidiBase, int logicalIndex) {
    final BidiRun[] runs = bidiBase.runs;
    final int runCount = bidiBase.runCount;

    int previousLimit = 0;
    for (int runIndex = 0; runIndex < runCount; runIndex++) {
        final BidiRun run = runs[runIndex];
        final int runLength = run.limit - previousLimit;
        final int runStart = run.start;
        if (logicalIndex >= runStart && logicalIndex < (runStart + runLength)) {
            return runIndex;
        }
        previousLimit += runLength;
    }

    /* we should never get here */
    throw new IllegalStateException("Internal ICU error in getRunFromLogicalIndex");
}
/*
* Compute the runs array from the levels array.
* After getRuns() returns, runCount is guaranteed to be >0
* and the runs are reordered.
* In the runs array, start is the logical start of a run and limit is the
* visual position just behind the run.
* If bidiBase.insertPoints is not empty, insertRemove of the affected runs
* will contain the sum of the flags of the insert points (LRM/RLM
* BEFORE/AFTER).
* If bidiBase.controlCount is >0, insertRemove is decremented once for each
* BiDi control character within the run.
*
* The method has no return value; its results are stored in bidiBase.runs
* and bidiBase.runCount.
*/
static void getRuns(BidiBase bidiBase) {
/*
* This method returns immediately if the runs are already set. This
* includes the case of length==0 (handled in setPara).
*/
if (bidiBase.runCount >= 0) {
return;
}
if (bidiBase.direction != BidiBase.MIXED) {
/* simple, single-run case - this covers length==0 */
/* bidiBase.paraLevel is ok even for contextual multiple paragraphs */
getSingleRun(bidiBase, bidiBase.paraLevel);
} else /* BidiBase.MIXED, length>0 */ {
/* mixed directionality */
int length = bidiBase.length, limit;
byte[] levels = bidiBase.levels;
int i, runCount;
byte level = -1; /* initialize with no valid level */
/*
* If there are WS characters at the end of the line
* and the run preceding them has a level different from
* paraLevel, then they will form their own run at paraLevel (L1).
* Count them separately.
* We need some special treatment for this in order to not
* modify the levels array which a line Bidi object shares
* with its paragraph parent and its other line siblings.
* In other words, for the trailing WS, it may be
* levels[]!=paraLevel but we have to treat it like it were so.
*/
limit = bidiBase.trailingWSStart;
/* count the runs, there is at least one non-WS run, and limit>0 */
runCount = 0;
for (i = 0; i < limit; ++i) {
/* increment runCount at the start of each run */
if (levels[i] != level) {
++runCount;
level = levels[i];
}
}
/*
* We don't need to see if the last run can be merged with a trailing
* WS run because setTrailingWSStart() would have done that.
*/
if (runCount == 1 && limit == length) {
/* There is only one non-WS run and no trailing WS-run. */
getSingleRun(bidiBase, levels[0]);
} else /* runCount>1 || limit<length */ {
/* allocate and set the runs */
BidiRun[] runs;
int runIndex, start;
/* start above any level accepted by this class so that the first run
   lowers it */
byte minLevel = BidiBase.MAX_EXPLICIT_LEVEL + 1;
byte maxLevel=0;
/* now, count a (non-mergeable) WS run */
if (limit < length) {
++runCount;
}
/* runCount > 1 */
bidiBase.getRunsMemory(runCount);
runs = bidiBase.runsMemory;
/* set the runs */
/* FOOD FOR THOUGHT: this could be optimized, e.g.:
* 464->444, 484->444, 575->555, 595->555
* However, that would take longer. Check also how it would
* interact with BiDi control removal and inserting Marks.
*/
runIndex = 0;
/* search for the run limits and initialize visualLimit values with the run lengths */
i = 0;
do {
/* prepare this run */
start = i;
level = levels[i];
if (level < minLevel) {
minLevel = level;
}
if (level > maxLevel) {
maxLevel = level;
}
/* look for the run limit */
while (++i < limit && levels[i] == level) {}
/* i is another run limit */
/* the limit field temporarily holds the run LENGTH; it is turned into
   a cumulative visual limit after reordering */
runs[runIndex] = new BidiRun(start, i - start, level);
++runIndex;
} while (i < limit);
if (limit < length) {
/* there is a separate WS run */
runs[runIndex] = new BidiRun(limit, length - limit, bidiBase.paraLevel);
/* For the trailing WS run, bidiBase.paraLevel is ok even
if contextual multiple paragraphs. */
if (bidiBase.paraLevel < minLevel) {
minLevel = bidiBase.paraLevel;
}
}
/* set the object fields */
bidiBase.runs = runs;
bidiBase.runCount = runCount;
reorderLine(bidiBase, minLevel, maxLevel);
/* now add the direction flags and adjust the visualLimit's to be just that */
/* this loop will also handle the trailing WS run */
/* 'limit' is reused as the running sum of the run lengths */
limit = 0;
for (i = 0; i < runCount; ++i) {
runs[i].level = levels[runs[i].start];
limit = (runs[i].limit += limit);
}
/* Set the embedding level for the trailing WS run. */
/* For a RTL paragraph, it will be the *first* run in visual order. */
/* For the trailing WS run, bidiBase.paraLevel is ok even if
contextual multiple paragraphs. */
/* runIndex < runCount exactly when a separate WS run was added above;
   the loop above took its level from levels[], which does not reflect
   the WS run */
if (runIndex < runCount) {
int trailingRun = ((bidiBase.paraLevel & 1) != 0)? 0 : runIndex;
runs[trailingRun].level = bidiBase.paraLevel;
}
}
}
/* handle insert LRM/RLM BEFORE/AFTER run */
if (bidiBase.insertPoints.size > 0) {
BidiBase.Point point;
int runIndex, ip;
for (ip = 0; ip < bidiBase.insertPoints.size; ip++) {
point = bidiBase.insertPoints.points[ip];
runIndex = getRunFromLogicalIndex(bidiBase, point.pos);
bidiBase.runs[runIndex].insertRemove |= point.flag;
}
}
/* handle remove BiDi control characters */
if (bidiBase.controlCount > 0) {
int runIndex, ic;
char c;
for (ic = 0; ic < bidiBase.length; ic++) {
c = bidiBase.text[ic];
if (BidiBase.IsBidiControlChar(c)) {
runIndex = getRunFromLogicalIndex(bidiBase, ic);
/* a negative insertRemove counts the controls inside the run */
bidiBase.runs[runIndex].insertRemove--;
}
}
}
}
/**
 * Validates a levels array, reports its lowest and highest level and
 * creates an identity index map for it.
 *
 * @param levels    the levels to examine
 * @param pMinLevel output: element 0 receives the lowest level found
 * @param pMaxLevel output: element 0 receives the highest level found
 * @return a new array with {@code map[i] == i} for every index of
 *         {@code levels}, or {@code null} if {@code levels} is null or
 *         empty or contains a level that is negative or greater than
 *         {@code BidiBase.MAX_EXPLICIT_LEVEL + 1}; in that case the
 *         output arrays are left untouched
 */
static int[] prepareReorder(byte[] levels, byte[] pMinLevel, byte[] pMaxLevel)
{
    if (levels == null || levels.length <= 0) {
        return null;
    }

    /* determine the lowest and highest level, rejecting invalid ones */
    byte lowest = BidiBase.MAX_EXPLICIT_LEVEL + 1;
    byte highest = 0;
    for (byte current : levels) {
        if (current < 0 || current > (BidiBase.MAX_EXPLICIT_LEVEL + 1)) {
            return null;
        }
        if (current < lowest) {
            lowest = current;
        }
        if (current > highest) {
            highest = current;
        }
    }
    pMinLevel[0] = lowest;
    pMaxLevel[0] = highest;

    /* initialize the index map with the identity mapping */
    final int[] indexMap = new int[levels.length];
    for (int i = 0; i < indexMap.length; i++) {
        indexMap[i] = i;
    }
    return indexMap;
}
/**
 * Computes an index map for a levels array by repeatedly reversing, from
 * the highest level down to the lowest odd level, every maximal sequence
 * of positions whose level is at least the current level. Only the index
 * map is permuted; {@code levels} itself is not modified.
 *
 * @param levels one level per position
 * @return the resulting index map, or {@code null} if prepareReorder()
 *         rejects {@code levels}
 */
static int[] reorderVisual(byte[] levels)
{
    final byte[] minHolder = new byte[1];
    final byte[] maxHolder = new byte[1];
    final int[] indexMap = prepareReorder(levels, minHolder, maxHolder);
    if (indexMap == null) {
        return null;
    }

    final int highest = maxHolder[0];
    final int lowest = minHolder[0];

    /* nothing to do? (one single, even level) */
    if (lowest == highest && (lowest & 1) == 0) {
        return indexMap;
    }

    /* reorder only down to the lowest odd level */
    final int lowestOdd = lowest | 1;
    final int count = levels.length;

    /* after the check above highest >= lowestOdd always holds, so this
       loop runs at least once */
    for (int current = highest; current >= lowestOdd; current--) {
        int pos = 0;
        while (pos < count) {
            /* skip positions below the current level */
            if (levels[pos] < current) {
                pos++;
                continue;
            }

            /* find the index behind the sequence starting at pos */
            int seqLimit = pos + 1;
            while (seqLimit < count && levels[seqLimit] >= current) {
                seqLimit++;
            }

            /*
             * Reverse the map entries in [pos, seqLimit). The levels need
             * not be swapped: the sequence of levels looked at does not
             * move.
             */
            for (int lo = pos, hi = seqLimit - 1; lo < hi; lo++, hi--) {
                final int saved = indexMap[lo];
                indexMap[lo] = indexMap[hi];
                indexMap[hi] = saved;
            }

            /* the position at seqLimit (if any) is below the current
               level and can be skipped */
            pos = seqLimit + 1;
        }
    }
    return indexMap;
}
/**
 * Builds a visual-to-logical index map from {@code bidiBase.runs}.
 *
 * <p>First one entry per character is written, run by run in the order of
 * the runs array: ascending logical indexes for an even-level run,
 * descending ones for an odd-level run. Then, if there are insert points,
 * the entries are shifted towards the end to make room for
 * {@code BidiBase.MAP_NOWHERE} entries standing for the inserted marks;
 * otherwise, if there are Bidi controls, the entries for control
 * characters are squeezed out.
 *
 * <p>Reads runs and runCount as they are; presumably the caller has run
 * getRuns() before -- TODO confirm against the callers.
 *
 * @param bidiBase the Bidi object to map
 * @return an array of {@code bidiBase.resultLength} elements
 */
static int[] getVisualMap(BidiBase bidiBase)
{
/* fill a visual-to-logical index map using the runs[] */
BidiRun[] runs = bidiBase.runs;
int logicalStart, visualStart, visualLimit;
/* work array large enough for both the plain and the adjusted map */
int allocLength = bidiBase.length > bidiBase.resultLength ? bidiBase.length
: bidiBase.resultLength;
int[] indexMap = new int[allocLength];
visualStart = 0;
int idx = 0;
for (int j = 0; j < bidiBase.runCount; ++j) {
logicalStart = runs[j].start;
visualLimit = runs[j].limit;
if (runs[j].isEvenRun()) {
do { /* LTR */
indexMap[idx++] = logicalStart++;
} while (++visualStart < visualLimit);
} else {
logicalStart += visualLimit - visualStart; /* logicalLimit */
do { /* RTL */
indexMap[idx++] = --logicalStart;
} while (++visualStart < visualLimit);
}
/* visualStart==visualLimit; */
}
if (bidiBase.insertPoints.size > 0) {
int markFound = 0, runCount = bidiBase.runCount;
int insertRemove, i, j, k;
runs = bidiBase.runs;
/* count all inserted marks */
for (i = 0; i < runCount; i++) {
insertRemove = runs[i].insertRemove;
if ((insertRemove & (BidiBase.LRM_BEFORE|BidiBase.RLM_BEFORE)) > 0) {
markFound++;
}
if ((insertRemove & (BidiBase.LRM_AFTER|BidiBase.RLM_AFTER)) > 0) {
markFound++;
}
}
/* move back indexes by number of preceding marks */
/* work from the last run to the first, writing from the end of the map
   (k) so that entries are not overwritten before they are moved; stop
   as soon as all marks are placed */
k = bidiBase.resultLength;
for (i = runCount - 1; i >= 0 && markFound > 0; i--) {
insertRemove = runs[i].insertRemove;
if ((insertRemove & (BidiBase.LRM_AFTER|BidiBase.RLM_AFTER)) > 0) {
indexMap[--k] = BidiBase.MAP_NOWHERE;
markFound--;
}
visualStart = i > 0 ? runs[i-1].limit : 0;
for (j = runs[i].limit - 1; j >= visualStart && markFound > 0; j--) {
indexMap[--k] = indexMap[j];
}
if ((insertRemove & (BidiBase.LRM_BEFORE|BidiBase.RLM_BEFORE)) > 0) {
indexMap[--k] = BidiBase.MAP_NOWHERE;
markFound--;
}
}
}
else if (bidiBase.controlCount > 0) {
int runCount = bidiBase.runCount, logicalEnd;
int insertRemove, length, i, j, k, m;
char uchar;
boolean evenRun;
runs = bidiBase.runs;
visualStart = 0;
/* move forward indexes by number of preceding controls */
/* k is the write position in the compacted map */
k = 0;
for (i = 0; i < runCount; i++, visualStart += length) {
length = runs[i].limit - visualStart;
insertRemove = runs[i].insertRemove;
/* if no control found yet, nothing to do in this run */
if ((insertRemove == 0) && (k == visualStart)) {
k += length;
continue;
}
/* if no control in this run */
if (insertRemove == 0) {
visualLimit = runs[i].limit;
for (j = visualStart; j < visualLimit; j++) {
indexMap[k++] = indexMap[j];
}
continue;
}
/* this run contains controls: regenerate its entries from the text,
   leaving out the control characters */
logicalStart = runs[i].start;
evenRun = runs[i].isEvenRun();
logicalEnd = logicalStart + length - 1;
for (j = 0; j < length; j++) {
m = evenRun ? logicalStart + j : logicalEnd - j;
uchar = bidiBase.text[m];
if (!BidiBase.IsBidiControlChar(uchar)) {
indexMap[k++] = m;
}
}
}
}
if (allocLength == bidiBase.resultLength) {
return indexMap;
}
/* the work array is longer than the result: return a trimmed copy */
int[] newMap = new int[bidiBase.resultLength];
System.arraycopy(indexMap, 0, newMap, 0, bidiBase.resultLength);
return newMap;
}
}

View file

@ -0,0 +1,124 @@
/*
* Copyright (c) 2009, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
/* Written by Simon Montagu, Matitiahu Allouche
* (ported from C code written by Markus W. Scherer)
*/
package sun.text.bidi;
/**
* A BidiRun represents a sequence of characters at the same embedding level.
* The Bidi algorithm decomposes a piece of text into sequences of characters
* at the same embedding level, each such sequence is called a "run".
*
* <p>A BidiRun represents such a run by storing its essential properties,
* but does not duplicate the characters which form the run.
*
* <p>The &quot;limit&quot; of the run is the position just after the
* last character, i.e., one more than that position.
*
* <p>This class has no public constructor, and its members cannot be
* modified by users.
*
* @see com.ibm.icu.text.Bidi
*/
class BidiRun {

    /** First logical position of the run. */
    int start;

    /** Last visual position of the run +1. */
    int limit;

    /**
     * If &gt;0, flags for inserting LRM/RLM before/after the run;
     * if &lt;0, count of bidi controls within the run.
     */
    int insertRemove;

    /** Embedding level of the run. */
    byte level;

    /*
     * Default constructor: start, limit and level are all 0.
     *
     * Note that members start and limit of a run instance have different
     * meanings depending whether the run is part of the runs array of a Bidi
     * object, or if it is a reference returned by getVisualRun() or
     * getLogicalRun().
     * For a member of the runs array of a Bidi object,
     *   - start is the first logical position of the run in the source text.
     *   - limit is one after the last visual position of the run.
     * For a reference returned by getLogicalRun() or getVisualRun(),
     *   - start is the first logical position of the run in the source text.
     *   - limit is one after the last logical position of the run.
     */
    BidiRun()
    {
        this.start = 0;
        this.limit = 0;
        this.level = (byte)0;
    }

    /*
     * Constructor taking the start, the limit and the embedding level.
     * insertRemove keeps its default of 0.
     */
    BidiRun(int start, int limit, byte embeddingLevel)
    {
        this.level = embeddingLevel;
        this.limit = limit;
        this.start = start;
    }

    /*
     * Copy all four members of another BidiRun instance into this one.
     */
    void copyFrom(BidiRun run)
    {
        final int otherStart = run.start;
        final int otherLimit = run.limit;
        final byte otherLevel = run.level;
        final int otherInsertRemove = run.insertRemove;

        this.start = otherStart;
        this.limit = otherLimit;
        this.level = otherLevel;
        this.insertRemove = otherInsertRemove;
    }

    /**
     * Get level of run
     * @return the embedding level of this run
     */
    byte getEmbeddingLevel()
    {
        return this.level;
    }

    /**
     * Check if run level is even
     * @return true if the embedding level of this run is even, i.e. it is a
     *         left-to-right run.
     */
    boolean isEvenRun()
    {
        final int lowBit = level & 1;
        return lowBit == 0;
    }
}

View file

@ -0,0 +1,452 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 2001-2010, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
/* Written by Simon Montagu, Matitiahu Allouche
* (ported from C code written by Markus W. Scherer)
*/
package sun.text.bidi;
import sun.text.normalizer.UCharacter;
import sun.text.normalizer.UTF16;
final class BidiWriter {
/** Bidi control code points */
/* U+200E, LEFT-TO-RIGHT MARK */
static final char LRM_CHAR = 0x200e;
/* U+200F, RIGHT-TO-LEFT MARK */
static final char RLM_CHAR = 0x200f;
/* bit set with one bit per direction value: the bits numbered
   UCharacter.RIGHT_TO_LEFT and UCharacter.RIGHT_TO_LEFT_ARABIC are set */
static final int MASK_R_AL = (1 << UCharacter.RIGHT_TO_LEFT |
1 << UCharacter.RIGHT_TO_LEFT_ARABIC);
/*
 * Tests whether a type value is one of UCharacter.NON_SPACING_MARK,
 * UCharacter.COMBINING_SPACING_MARK or UCharacter.ENCLOSING_MARK.
 * The test is done with a bit set: bit number 'type' is checked against
 * the mask made of the bits of those three values.
 */
private static boolean IsCombining(int type) {
    final int combiningMask = 1 << UCharacter.NON_SPACING_MARK
                            | 1 << UCharacter.COMBINING_SPACING_MARK
                            | 1 << UCharacter.ENCLOSING_MARK;
    final int typeBit = 1 << type;
    return (typeBit & combiningMask) != 0;
}
/*
 * When we have OUTPUT_REVERSE set on writeReordered(), then we
 * semantically write RTL runs in reverse and later reverse them again.
 * Instead, we actually write them in forward order to begin with.
 * However, if the RTL run was to be mirrored, we need to mirror here now
 * since the implicit second reversal must not do it.
 * It looks strange to do mirroring in LTR output, but it is only because
 * we are writing RTL output in reverse.
 *
 * Returns src in its original order, optionally with Bidi control
 * characters removed (BidiBase.REMOVE_BIDI_CONTROLS) and/or with each
 * code point replaced by UCharacter.getMirror() of it
 * (BidiBase.DO_MIRRORING). All other bits of options are ignored.
 * With neither option set, src itself is returned.
 * An empty src yields an empty string for every option combination.
 */
private static String doWriteForward(String src, int options) {
    /* optimize for several combinations of options */
    switch(options&(BidiBase.REMOVE_BIDI_CONTROLS|BidiBase.DO_MIRRORING)) {
    case 0: {
        /* simply return the LTR run */
        return src;
    }
    case BidiBase.DO_MIRRORING: {
        /* StringBuffer because UTF16.append() is called with one */
        StringBuffer dest = new StringBuffer(src.length());

        /* do mirroring, one code point at a time; the length is checked
           before the first read so that an empty run is not indexed */
        int i = 0;
        while (i < src.length()) {
            int c = UTF16.charAt(src, i);
            i += UTF16.getCharCount(c);
            UTF16.append(dest, UCharacter.getMirror(c));
        }
        return dest.toString();
    }
    case BidiBase.REMOVE_BIDI_CONTROLS: {
        StringBuilder dest = new StringBuilder(src.length());

        /* copy the LTR run and remove any Bidi control characters;
           this works on code units */
        int i = 0;
        while (i < src.length()) {
            char c = src.charAt(i++);
            if (!BidiBase.IsBidiControlChar(c)) {
                dest.append(c);
            }
        }
        return dest.toString();
    }
    default: {
        StringBuffer dest = new StringBuffer(src.length());

        /* remove Bidi control characters and do mirroring */
        int i = 0;
        while (i < src.length()) {
            int c = UTF16.charAt(src, i);
            i += UTF16.getCharCount(c);
            if (!BidiBase.IsBidiControlChar(c)) {
                UTF16.append(dest, UCharacter.getMirror(c));
            }
        }
        return dest.toString();
    }
    } /* end of switch */
}
/*
 * Array flavour of doWriteForward(String, int): wraps text[start, limit)
 * in a String and delegates.
 */
private static String doWriteForward(char[] text, int start, int limit,
                                     int options) {
    final int count = limit - start;
    final String run = new String(text, start, count);
    return doWriteForward(run, options);
}
/*
 * Returns "src" with its code points (not code units) in reverse order.
 *
 * Only the REMOVE_BIDI_CONTROLS, DO_MIRRORING and KEEP_BASE_COMBINING bits
 * of "options" are examined. An empty "src" yields an empty result; without
 * that guard every branch below would read index -1.
 */
static String writeReverse(String src, int options) {
    /*
     * RTL run -
     *
     * RTL runs need to be copied to the destination in reverse order
     * of code points, not code units, to keep Unicode characters intact.
     *
     * The general strategy for this is to read the source text
     * in backward order, collect all code units for a code point
     * (and optionally following combining characters, see below),
     * and copy all these code units in ascending order
     * to the destination for this run.
     *
     * Several options request whether combining characters
     * should be kept after their base characters,
     * whether Bidi control characters should be removed, and
     * whether characters should be replaced by their mirror-image
     * equivalent Unicode characters.
     */
    /* srcLength is the (exclusive) end of the part of src not yet copied */
    int srcLength = src.length();
    if (srcLength == 0) {
        /* nothing to reverse; the do-while loops below assume at least one unit */
        return "";
    }
    /* StringBuffer (not StringBuilder) because UTF16.append() is called with it */
    StringBuffer dest = new StringBuffer(srcLength);

    /* optimize for several combinations of options */
    switch (options &
            (BidiBase.REMOVE_BIDI_CONTROLS |
             BidiBase.DO_MIRRORING |
             BidiBase.KEEP_BASE_COMBINING)) {

    case 0:
        /*
         * With none of the "complicated" options set, the destination
         * run will have the same length as the source run,
         * and there is no mirroring and no keeping combining characters
         * with their base characters.
         */
        /* preserve character integrity */
        do {
            /* i is always after the last code unit known to need to be kept
             * in this segment */
            int i = srcLength;
            /* collect code units for one base character */
            srcLength -= UTF16.getCharCount(UTF16.charAt(src, srcLength - 1));
            /* copy this base character */
            dest.append(src, srcLength, i);
        } while (srcLength > 0);
        break;

    case BidiBase.KEEP_BASE_COMBINING:
        /*
         * Here, too, the destination
         * run will have the same length as the source run,
         * and there is no mirroring.
         * We do need to keep combining characters with their base
         * characters.
         */
        /* preserve character integrity */
        do {
            /* i is always after the last code unit known to need to be kept
             * in this segment */
            int c;
            int i = srcLength;
            /* collect code units and modifier letters for one base
             * character */
            do {
                c = UTF16.charAt(src, srcLength - 1);
                srcLength -= UTF16.getCharCount(c);
            } while (srcLength > 0 && IsCombining(UCharacter.getType(c)));
            /* copy this "user character" */
            dest.append(src, srcLength, i);
        } while (srcLength > 0);
        break;

    default:
        /*
         * With several "complicated" options set, this is the most
         * general and the slowest copying of an RTL run.
         * We will do mirroring, remove Bidi controls, and
         * keep combining characters with their base characters
         * as requested.
         */
        /* preserve character integrity */
        do {
            /* i is always after the last code unit known to need to be kept
             * in this segment */
            int i = srcLength;
            /* collect code units for one base character */
            int c = UTF16.charAt(src, srcLength - 1);
            srcLength -= UTF16.getCharCount(c);
            if ((options & BidiBase.KEEP_BASE_COMBINING) != 0) {
                /* collect modifier letters for this base character */
                while (srcLength > 0 && IsCombining(UCharacter.getType(c))) {
                    c = UTF16.charAt(src, srcLength - 1);
                    srcLength -= UTF16.getCharCount(c);
                }
            }
            if ((options & BidiBase.REMOVE_BIDI_CONTROLS) != 0 &&
                BidiBase.IsBidiControlChar(c)) {
                /* do not copy this Bidi control character;
                 * "continue" jumps to the loop condition below */
                continue;
            }
            /* copy this "user character" */
            int j = srcLength;
            if ((options & BidiBase.DO_MIRRORING) != 0) {
                /* mirror only the base character */
                c = UCharacter.getMirror(c);
                UTF16.append(dest, c);
                /* skip the base character in src; only what follows it is copied verbatim */
                j += UTF16.getCharCount(c);
            }
            dest.append(src, j, i);
        } while (srcLength > 0);
        break;
    } /* end of switch */

    return dest.toString();
}
/*
 * Array flavour of writeReverse(String, int): wraps text[start, limit)
 * in a String and delegates.
 */
static String doWriteReverse(char[] text, int start, int limit, int options) {
    final int count = limit - start;
    final String run = new String(text, start, count);
    return writeReverse(run, options);
}
/*
 * Builds the reordered text of "bidi" by walking its visual runs and
 * copying each run forward or reversed, according to "options" and to the
 * reordering options/mode stored in "bidi".
 *
 * The effective options are derived first: OPTION_INSERT_MARKS and
 * OPTION_REMOVE_CONTROLS on the BidiBase object adjust
 * INSERT_LRM_FOR_NUMERIC / REMOVE_BIDI_CONTROLS, and INSERT_LRM_FOR_NUMERIC
 * is dropped unless one of the "inverse"/"runs only" reordering modes is
 * in effect.
 */
static String writeReordered(BidiBase bidi, int options) {
    char[] text = bidi.text;
    int runCount = bidi.countRuns();

    /*
     * Option "insert marks" implies BidiBase.INSERT_LRM_FOR_NUMERIC if the
     * reordering mode (checked below) is appropriate.
     */
    if ((bidi.reorderingOptions & BidiBase.OPTION_INSERT_MARKS) != 0) {
        options |= BidiBase.INSERT_LRM_FOR_NUMERIC;
        options &= ~BidiBase.REMOVE_BIDI_CONTROLS;
    }
    /*
     * Option "remove controls" implies BidiBase.REMOVE_BIDI_CONTROLS
     * and cancels BidiBase.INSERT_LRM_FOR_NUMERIC.
     */
    if ((bidi.reorderingOptions & BidiBase.OPTION_REMOVE_CONTROLS) != 0) {
        options |= BidiBase.REMOVE_BIDI_CONTROLS;
        options &= ~BidiBase.INSERT_LRM_FOR_NUMERIC;
    }
    /*
     * If we do not perform the "inverse Bidi" algorithm, then we
     * don't need to insert any LRMs, and don't need to test for it.
     */
    if ((bidi.reorderingMode != BidiBase.REORDER_INVERSE_NUMBERS_AS_L) &&
        (bidi.reorderingMode != BidiBase.REORDER_INVERSE_LIKE_DIRECT) &&
        (bidi.reorderingMode != BidiBase.REORDER_INVERSE_FOR_NUMBERS_SPECIAL) &&
        (bidi.reorderingMode != BidiBase.REORDER_RUNS_ONLY)) {
        options &= ~BidiBase.INSERT_LRM_FOR_NUMERIC;
    }

    /* options is not modified past this point */
    final boolean insertMarks = (options & BidiBase.INSERT_LRM_FOR_NUMERIC) != 0;
    /* reserve extra room when marks may be inserted */
    StringBuilder dest = new StringBuilder(insertMarks ? bidi.length * 2 : bidi.length);

    /*
     * Iterate through all visual runs and copy the run text segments to
     * the destination, according to the options.
     *
     * The tests for where to insert LRMs ignore the fact that there may be
     * BN codes or non-BMP code points at the beginning and end of a run;
     * they may insert LRMs unnecessarily but the tests are faster this way
     * (this would have to be improved for UTF-8).
     */
    if ((options & BidiBase.OUTPUT_REVERSE) == 0) {
        /* forward output */
        if (!insertMarks) {
            /* do not insert Bidi controls */
            for (int run = 0; run < runCount; ++run) {
                BidiRun bidiRun = bidi.getVisualRun(run);
                if (bidiRun.isEvenRun()) {
                    dest.append(doWriteForward(text, bidiRun.start,
                                               bidiRun.limit,
                                               options & ~BidiBase.DO_MIRRORING));
                } else {
                    dest.append(doWriteReverse(text, bidiRun.start,
                                               bidiRun.limit, options));
                }
            }
        } else {
            /* insert Bidi controls for "inverse Bidi" */
            byte[] dirProps = bidi.dirProps;
            for (int run = 0; run < runCount; ++run) {
                BidiRun bidiRun = bidi.getVisualRun(run);
                /* check if something relevant in insertPoints */
                int markFlag = bidi.runs[run].insertRemove;
                if (markFlag < 0) { /* bidi controls count */
                    markFlag = 0;
                }
                if (bidiRun.isEvenRun()) {
                    if (bidi.isInverse() &&
                            dirProps[bidiRun.start] != BidiBase.L) {
                        markFlag |= BidiBase.LRM_BEFORE;
                    }
                    appendMark(dest, markFlag,
                               BidiBase.LRM_BEFORE, BidiBase.RLM_BEFORE);
                    dest.append(doWriteForward(text,
                                               bidiRun.start, bidiRun.limit,
                                               options & ~BidiBase.DO_MIRRORING));
                    if (bidi.isInverse() &&
                            dirProps[bidiRun.limit - 1] != BidiBase.L) {
                        markFlag |= BidiBase.LRM_AFTER;
                    }
                    appendMark(dest, markFlag,
                               BidiBase.LRM_AFTER, BidiBase.RLM_AFTER);
                } else { /* RTL run */
                    if (bidi.isInverse() &&
                            !bidi.testDirPropFlagAt(MASK_R_AL,
                                                    bidiRun.limit - 1)) {
                        markFlag |= BidiBase.RLM_BEFORE;
                    }
                    appendMark(dest, markFlag,
                               BidiBase.LRM_BEFORE, BidiBase.RLM_BEFORE);
                    dest.append(doWriteReverse(text, bidiRun.start,
                                               bidiRun.limit, options));
                    if (bidi.isInverse() &&
                            (MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.start])) == 0) {
                        markFlag |= BidiBase.RLM_AFTER;
                    }
                    appendMark(dest, markFlag,
                               BidiBase.LRM_AFTER, BidiBase.RLM_AFTER);
                }
            }
        }
    } else {
        /* reverse output: visit the visual runs from last to first */
        if (!insertMarks) {
            /* do not insert Bidi controls */
            for (int run = runCount; --run >= 0; ) {
                BidiRun bidiRun = bidi.getVisualRun(run);
                if (bidiRun.isEvenRun()) {
                    dest.append(doWriteReverse(text,
                                               bidiRun.start, bidiRun.limit,
                                               options & ~BidiBase.DO_MIRRORING));
                } else {
                    dest.append(doWriteForward(text, bidiRun.start,
                                               bidiRun.limit, options));
                }
            }
        } else {
            /* insert Bidi controls for "inverse Bidi" */
            byte[] dirProps = bidi.dirProps;
            for (int run = runCount; --run >= 0; ) {
                /* reverse output */
                BidiRun bidiRun = bidi.getVisualRun(run);
                if (bidiRun.isEvenRun()) {
                    if (dirProps[bidiRun.limit - 1] != BidiBase.L) {
                        dest.append(LRM_CHAR);
                    }
                    dest.append(doWriteReverse(text, bidiRun.start,
                                               bidiRun.limit, options & ~BidiBase.DO_MIRRORING));
                    if (dirProps[bidiRun.start] != BidiBase.L) {
                        dest.append(LRM_CHAR);
                    }
                } else {
                    if ((MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.start])) == 0) {
                        dest.append(RLM_CHAR);
                    }
                    dest.append(doWriteForward(text, bidiRun.start,
                                               bidiRun.limit, options));
                    if ((MASK_R_AL & BidiBase.DirPropFlag(dirProps[bidiRun.limit - 1])) == 0) {
                        dest.append(RLM_CHAR);
                    }
                }
            }
        }
    }
    return dest.toString();
}

/*
 * Appends LRM_CHAR if markFlag has lrmFlag set, otherwise RLM_CHAR if it
 * has rlmFlag set, otherwise nothing. The LRM flag takes precedence.
 */
private static void appendMark(StringBuilder dest, int markFlag,
                               int lrmFlag, int rlmFlag) {
    if ((markFlag & lrmFlag) != 0) {
        dest.append(LRM_CHAR);
    } else if ((markFlag & rlmFlag) != 0) {
        dest.append(RLM_CHAR);
    }
}
}

View file

@ -0,0 +1,526 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
******************************************************************************
*
* Copyright (C) 2009-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*
******************************************************************************
*/
package sun.text.normalizer;
import sun.text.normalizer.UnicodeSet.SpanCondition;
/**
* Helper class for frozen UnicodeSets, implements contains() and span() optimized for BMP code points.
*
* Latin-1: Look up bytes.
* 2-byte characters: Bits organized vertically.
* 3-byte characters: Use zero/one/mixed data per 64-block in U+0000..U+FFFF, with mixed for illegal ranges.
* Supplementary characters: Call contains() on the parent set.
*/
final class BMPSet {
/**
* One boolean ('true' or 'false') per Latin-1 character.
*/
private boolean[] latin1Contains;
/**
* One bit per code point from U+0000..U+07FF. The bits are organized vertically; consecutive code points
* correspond to the same bit positions in consecutive table words. With code point parts lead=c{10..6}
* trail=c{5..0} it is set.contains(c)==(table7FF[trail] bit lead)
*
* Bits for 0..7F (non-shortest forms) are set to the result of contains(FFFD) for faster validity checking at
* runtime.
*/
private int[] table7FF;
/**
* One bit per 64 BMP code points. The bits are organized vertically; consecutive 64-code point blocks
* correspond to the same bit position in consecutive table words. With code point parts lead=c{15..12}
* t1=c{11..6} test bits (lead+16) and lead in bmpBlockBits[t1]. If the upper bit is 0, then the lower bit
* indicates if contains(c) for all code points in the 64-block. If the upper bit is 1, then the block is mixed
* and set.contains(c) must be called.
*
* Bits for 0..7FF (non-shortest forms) and D800..DFFF are set to the result of contains(FFFD) for faster
* validity checking at runtime.
*/
private int[] bmpBlockBits;
/**
* Inversion list indexes for restricted binary searches in findCodePoint(), from findCodePoint(U+0800, U+1000,
* U+2000, .., U+F000, U+10000). U+0800 is the first 3-byte-UTF-8 code point. Code points below U+0800 are
* always looked up in the bit tables. The last pair of indexes is for finding supplementary code points.
*/
private int[] list4kStarts;
/**
* The inversion list of the parent set, for the slower contains() implementation for mixed BMP blocks and for
* supplementary code points. The list is terminated with list[listLength-1]=0x110000.
*/
private final int[] list;
private final int listLength; // length used; list may be longer to minimize reallocs
/**
 * Builds the lookup tables for the inversion list of a parent set.
 *
 * @param parentList the parent set's inversion list; it is kept by reference, not copied
 * @param parentListLength the number of entries of parentList that are in use
 */
public BMPSet(final int[] parentList, int parentListLength) {
    this.list = parentList;
    this.listLength = parentListLength;
    this.latin1Contains = new boolean[0x100];
    this.table7FF = new int[64];
    this.bmpBlockBits = new int[64];
    this.list4kStarts = new int[18];

    /*
     * Record the list indexes for U+0800, U+1000, U+2000, .., U+F000, U+10000 so that later
     * binary searches can be restricted. Each search starts at the index found for the previous
     * boundary. The final slot is the last list index and bounds searches for supplementary
     * code points.
     */
    final int lastIndex = listLength - 1;
    int boundaryIndex = findCodePoint(0x800, 0, lastIndex);
    list4kStarts[0] = boundaryIndex;
    for (int slot = 1; slot <= 0x10; ++slot) {
        boundaryIndex = findCodePoint(slot << 12, boundaryIndex, lastIndex);
        list4kStarts[slot] = boundaryIndex;
    }
    list4kStarts[0x11] = lastIndex;

    initBits();
}
/**
 * Tests whether a code point is in the set, using the fastest table that covers it.
 *
 * @param c the code point to test
 * @return true if c is in the set; false for any c above U+10FFFF
 */
public boolean contains(int c) {
    if (c <= 0xff) {
        // Latin-1: direct boolean lookup.
        return latin1Contains[c];
    }
    if (c <= 0x7ff) {
        // One bit per code point, indexed by the low 6 bits, bit number from the upper bits.
        final int leadBit = 1 << (c >> 6);
        return (table7FF[c & 0x3f] & leadBit) != 0;
    }
    if (c > 0x10ffff) {
        // Out-of-range code points get false, consistent with long-standing
        // behavior of UnicodeSet.contains(c).
        return false;
    }
    final boolean bmpNonSurrogate = c < 0xd800 || (c >= 0xe000 && c <= 0xffff);
    if (!bmpNonSurrogate) {
        // surrogate or supplementary code point
        return containsSlow(c, list4kStarts[0xd], list4kStarts[0x11]);
    }
    final int lead = c >> 12;
    final int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
    if (twoBits > 1) {
        // Mixed block: search the inversion list within this code point's 4k block.
        return containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1]);
    }
    // All 64 code points sharing bits 15..6 are uniformly in or out of the set.
    return twoBits != 0;
}
/**
* Span the initial substring for which each character c has spanCondition==contains(c). It must be
* spanCondition==0 or 1.
*
* In this implementation spanCondition is compared only against NOT_CONTAINED: that value selects the
* "span not" loop (stop at the first character that is in the set); every other value selects the "span"
* loop (stop at the first character that is not in the set).
*
* @param s The text to scan; scanning runs from start up to s.length().
* @param start The start index
* @param spanCondition Selects which of the two loops described above is run.
* @param outCount If not null: Receives the number of code points in the span.
* @return the limit (exclusive end) of the span
*
* NOTE: to reduce the overhead of function call to contains(c), it is manually inlined here. Check for
* sufficient length for trail unit for each surrogate pair. Handle single surrogates as surrogate code points
* as usual in ICU.
*/
public final int span(CharSequence s, int start, SpanCondition spanCondition,
OutputInt outCount) {
char c, c2;
int i = start;
int limit = s.length();
// Number of surrogate pairs consumed; used below to turn a code unit count into a code point count.
int numSupplementary = 0;
if (SpanCondition.NOT_CONTAINED != spanCondition) {
// span
while (i < limit) {
c = s.charAt(i);
if (c <= 0xff) {
if (!latin1Contains[c]) {
break;
}
} else if (c <= 0x7ff) {
if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
break;
}
} else if (c < 0xd800 ||
c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
// Reached for every c that is not a lead surrogate followed by a trail surrogate,
// i.e. also for unpaired surrogates. c2 is assigned only when the first three tests fail.
int lead = c >> 12;
int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
if (twoBits <= 1) {
// All 64 code points with the same bits 15..6
// are either in the set or not.
if (twoBits == 0) {
break;
}
} else {
// Look up the code point in its 4k block of code points.
if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
break;
}
}
} else {
// surrogate pair
int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
break;
}
++numSupplementary;
// Skip the trail unit here; the shared ++i below skips the lead unit.
++i;
}
++i;
}
} else {
// span not
// Same structure as the loop above with every membership test inverted.
while (i < limit) {
c = s.charAt(i);
if (c <= 0xff) {
if (latin1Contains[c]) {
break;
}
} else if (c <= 0x7ff) {
if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
break;
}
} else if (c < 0xd800 ||
c >= 0xdc00 || (i + 1) == limit || (c2 = s.charAt(i + 1)) < 0xdc00 || c2 >= 0xe000) {
int lead = c >> 12;
int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
if (twoBits <= 1) {
// All 64 code points with the same bits 15..6
// are either in the set or not.
if (twoBits != 0) {
break;
}
} else {
// Look up the code point in its 4k block of code points.
if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
break;
}
}
} else {
// surrogate pair
int supplementary = UCharacterProperty.getRawSupplementary(c, c2);
if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
break;
}
++numSupplementary;
++i;
}
++i;
}
}
if (outCount != null) {
// Each surrogate pair occupies two code units but counts as one code point.
int spanLength = i - start;
outCount.value = spanLength - numSupplementary; // number of code points
}
return i;
}
/**
* Symmetrical with span().
* Span the trailing substring for which each character c has spanCondition==contains(c). It must be s.length >=
* limit and spanCondition==0 or 1.
*
* In this implementation spanCondition is compared only against NOT_CONTAINED: that value selects the
* "span not" loop; every other value selects the "span" loop.
*
* NOTE(review): both loops read s.charAt(--limit) before testing limit, so the caller appears to be
* expected to pass limit > 0 -- confirm against the call sites.
*
* @param s The text to scan backwards.
* @param limit The exclusive end index at which the backward scan starts.
* @param spanCondition Selects which of the two loops described above is run.
* @return The string index which starts the span (i.e. inclusive).
*/
public final int spanBack(CharSequence s, int limit, SpanCondition spanCondition) {
char c, c2;
if (SpanCondition.NOT_CONTAINED != spanCondition) {
// span
for (;;) {
c = s.charAt(--limit);
if (c <= 0xff) {
if (!latin1Contains[c]) {
break;
}
} else if (c <= 0x7ff) {
if ((table7FF[c & 0x3f] & (1 << (c >> 6))) == 0) {
break;
}
} else if (c < 0xd800 ||
c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
// Reached for every c that is not a trail surrogate preceded by a lead surrogate.
// (The c < 0xd800 test is subsumed by c < 0xdc00.) c2 is assigned only when the
// first three tests fail.
int lead = c >> 12;
int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
if (twoBits <= 1) {
// All 64 code points with the same bits 15..6
// are either in the set or not.
if (twoBits == 0) {
break;
}
} else {
// Look up the code point in its 4k block of code points.
if (!containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
break;
}
}
} else {
// surrogate pair
int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
if (!containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
break;
}
// Step over the lead unit as well; the trail unit was consumed by --limit above.
--limit;
}
if (0 == limit) {
// The whole prefix was spanned.
return 0;
}
}
} else {
// span not
// Same structure as the loop above with every membership test inverted.
for (;;) {
c = s.charAt(--limit);
if (c <= 0xff) {
if (latin1Contains[c]) {
break;
}
} else if (c <= 0x7ff) {
if ((table7FF[c & 0x3f] & (1 << (c >> 6))) != 0) {
break;
}
} else if (c < 0xd800 ||
c < 0xdc00 || 0 == limit || (c2 = s.charAt(limit - 1)) < 0xd800 || c2 >= 0xdc00) {
int lead = c >> 12;
int twoBits = (bmpBlockBits[(c >> 6) & 0x3f] >> lead) & 0x10001;
if (twoBits <= 1) {
// All 64 code points with the same bits 15..6
// are either in the set or not.
if (twoBits != 0) {
break;
}
} else {
// Look up the code point in its 4k block of code points.
if (containsSlow(c, list4kStarts[lead], list4kStarts[lead + 1])) {
break;
}
}
} else {
// surrogate pair
int supplementary = UCharacterProperty.getRawSupplementary(c2, c);
if (containsSlow(supplementary, list4kStarts[0x10], list4kStarts[0x11])) {
break;
}
--limit;
}
if (0 == limit) {
return 0;
}
}
}
// A break left limit on the first unit of the character that ended the span; the span starts after it.
return limit + 1;
}
/**
 * Set bits in a bit rectangle in "vertical" bit organization. start<limit<=0x800
 *
 * A value v in [start, limit) maps to bit (v >> 6) of table[v & 0x3f].
 *
 * @param table the 64-word bit table to update in place
 * @param start the first value whose bit is set
 * @param limit the value just past the last one whose bit is set
 */
private static void set32x64Bits(int[] table, int start, int limit) {
    assert (64 == table.length);
    int column = start >> 6;         // bit number; upper bits of the value
    final int firstRow = start & 0x3f; // word index; lower 6 bits of the value

    // Single-value shortcut.
    if (start + 1 == limit) {
        table[firstRow] |= 1 << column;
        return;
    }

    final int limitColumn = limit >> 6;
    final int limitRow = limit & 0x3f;

    if (column == limitColumn) {
        // Everything lies in one partial vertical bit column.
        final int columnBit = 1 << column;
        for (int row = firstRow; row < limitRow; ++row) {
            table[row] |= columnBit;
        }
        return;
    }

    // Leading partial column, if start is not at the top of its column.
    if (firstRow > 0) {
        final int columnBit = 1 << column;
        for (int row = firstRow; row < 64; ++row) {
            table[row] |= columnBit;
        }
        ++column;
    }

    // Rectangle of complete columns [column, limitColumn).
    if (column < limitColumn) {
        int rectangle = ~((1 << column) - 1);
        if (limitColumn < 0x20) {
            rectangle &= (1 << limitColumn) - 1;
        }
        for (int row = 0; row < 64; ++row) {
            table[row] |= rectangle;
        }
    }

    // Trailing partial column.
    // limit<=0x800. If limit==0x800 then limitColumn=32 and limitRow=0.
    // In that case 1<<limitColumn == 1<<0 == 1
    // (because Java << uses only the lower 5 bits of the shift operand)
    // but the value is not used because the loop below runs zero times.
    final int trailingBit = 1 << limitColumn;
    for (int row = 0; row < limitRow; ++row) {
        table[row] |= trailingBit;
    }
}
/**
* Fills latin1Contains[], table7FF[] and bmpBlockBits[] from the inversion list.
*
* The list is read as consecutive [start, limit) pairs through one shared cursor (listIndex); when the
* list runs out before a limit is read, 0x110000 is used as the limit. Three loops consume the pairs in
* order: ranges starting below 0x100, then below 0x800, then below 0x10000. A pair that crosses one of
* these boundaries is carried over in start/limit into the next loop.
*/
private void initBits() {
int start, limit;
int listIndex = 0;
// Set latin1Contains[].
do {
start = list[listIndex++];
if (listIndex < listLength) {
limit = list[listIndex++];
} else {
limit = 0x110000;
}
if (start >= 0x100) {
break;
}
do {
latin1Contains[start++] = true;
} while (start < limit && start < 0x100);
} while (limit <= 0x100);
// Set table7FF[].
while (start < 0x800) {
// Clamp the range to the part below 0x800 that this table covers.
set32x64Bits(table7FF, start, limit <= 0x800 ? limit : 0x800);
if (limit > 0x800) {
start = 0x800;
break;
}
start = list[listIndex++];
if (listIndex < listLength) {
limit = list[listIndex++];
} else {
limit = 0x110000;
}
}
// Set bmpBlockBits[].
// minStart is the lowest code point not yet covered by a block already marked as mixed.
int minStart = 0x800;
while (start < 0x10000) {
if (limit > 0x10000) {
limit = 0x10000;
}
if (start < minStart) {
start = minStart;
}
if (start < limit) { // Else: Another range entirely in a known mixed-value block.
if (0 != (start & 0x3f)) {
// Mixed-value block of 64 code points.
// 0x10001 sets both the "contains" bit and the "mixed" bit (16 positions higher).
start >>= 6;
bmpBlockBits[start & 0x3f] |= 0x10001 << (start >> 6);
start = (start + 1) << 6; // Round up to the next block boundary.
minStart = start; // Ignore further ranges in this block.
}
if (start < limit) {
if (start < (limit & ~0x3f)) {
// Multiple all-ones blocks of 64 code points each.
set32x64Bits(bmpBlockBits, start >> 6, limit >> 6);
}
if (0 != (limit & 0x3f)) {
// Mixed-value block of 64 code points.
limit >>= 6;
bmpBlockBits[limit & 0x3f] |= 0x10001 << (limit >> 6);
limit = (limit + 1) << 6; // Round up to the next block boundary.
minStart = limit; // Ignore further ranges in this block.
}
}
}
if (limit == 0x10000) {
break;
}
start = list[listIndex++];
if (listIndex < listLength) {
limit = list[listIndex++];
} else {
limit = 0x110000;
}
}
}
/**
 * Same as UnicodeSet.findCodePoint(int c) except that the binary search is restricted for finding code
 * points in a certain range.
 *
 * For restricting the search for finding in the range start..end, pass in lo=findCodePoint(start) and
 * hi=findCodePoint(end) with 0<=lo<=hi<len. findCodePoint(c) defaults to lo=0 and hi=len-1.
 *
 * Examples for the unrestricted search:
 *
 *   set              list[]          c=0 1 3 4 7 8
 *   ===              ==============    ===========
 *   []               [110000]          0 0 0 0 0 0
 *   [\u0000-\u0003]  [0, 4, 110000]    1 1 1 2 2 2
 *   [\u0004-\u0007]  [4, 8, 110000]    0 0 0 1 1 2
 *   [:Any:]          [0, 110000]       1 1 1 1 1 1
 *
 * @param c
 *            a character in a subrange of MIN_VALUE..MAX_VALUE
 * @param lo
 *            The lowest index to be returned.
 * @param hi
 *            The highest index to be returned.
 * @return the smallest integer i in the range lo..hi, inclusive, such that c < list[i]
 */
private int findCodePoint(int c, int lo, int hi) {
    // Assumes list[len - 1] == HIGH and that c is legal (0..HIGH-1).
    if (c < list[lo]) {
        return lo;
    }
    // High runner test. c is often after the last range, so an
    // initial check for this condition pays off.
    if (lo >= hi || c >= list[hi - 1]) {
        return hi;
    }
    // Loop invariants: list[below] <= c < list[above].
    int below = lo;
    int above = hi;
    int middle;
    while ((middle = (below + above) >>> 1) != below) {
        if (c < list[middle]) {
            above = middle;
        } else {
            below = middle;
        }
    }
    // below and above are adjacent now: above is the answer.
    return above;
}
/**
 * Membership test through the inversion list, restricted to list indexes lo..hi.
 * An odd result index from findCodePoint() means that c is in the set.
 */
private final boolean containsSlow(int c, int lo, int hi) {
    final int index = findCodePoint(c, lo, hi);
    return (index & 1) != 0;
}
}

View file

@ -0,0 +1,175 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
******************************************************************************
* Copyright (C) 1996-2014, International Business Machines Corporation and
* others. All Rights Reserved.
******************************************************************************
*/
package sun.text.normalizer;
import java.io.DataInputStream;
import java.io.InputStream;
import java.io.IOException;
/**
* Trie implementation which stores data in char, 16 bits.
* @author synwee
* @see com.ibm.icu.impl.Trie
* @since release 2.1, Jan 01 2002
*/
// Note: the block calculations still need to be handled later, since CharTrie
// in ICU4C uses the same index array.
public class CharTrie extends Trie
{
// public constructors ---------------------------------------------
/**
* <p>Creates a new Trie with the settings for the trie data.</p>
* <p>Unserialize the 32-bit-aligned input stream and use the data for the
* trie.</p>
* @param inputStream file input stream to a ICU data file, containing
* the trie
* @param dataManipulate object which provides methods to parse the char
* data
* @throws IOException thrown when data reading fails
* @draft 2.1
*/
public CharTrie(InputStream inputStream,
DataManipulate dataManipulate) throws IOException
{
super(inputStream, dataManipulate);
if (!isCharTrie()) {
throw new IllegalArgumentException(
"Data given does not belong to a char trie.");
}
}
// public methods --------------------------------------------------
/**
 * Gets the value associated with the codepoint.
 * If no value is associated with the codepoint, the default value
 * (m_initialValue_) is returned.
 * @param ch codepoint
 * @return the trie value for ch
 */
public final char getCodePointValue(int ch)
{
    final boolean belowSurrogates =
        ch >= 0 && ch < UTF16.LEAD_SURROGATE_MIN_VALUE;

    if (!belowSurrogates) {
        // handle U+D800..U+10FFFF (and anything out of range);
        // a negative offset signals an error, answered with the default value
        final int slowOffset = getCodePointOffset(ch);
        if (slowOffset < 0) {
            return m_initialValue_;
        }
        return m_data_[slowOffset];
    }

    // fastpath for U+0000..U+D7FF: same computation as getRawOffset()
    final int blockStart =
        m_index_[ch >> INDEX_STAGE_1_SHIFT_] << INDEX_STAGE_2_SHIFT_;
    return m_data_[blockStart + (ch & INDEX_STAGE_3_MASK_)];
}
/**
 * Gets the data value which this lead surrogate character points to.
 * Returned data may contain folding offset information for the next
 * trailing surrogate character.
 * This method does not guarantee correct results for trail surrogates.
 * @param ch lead surrogate character
 * @return data value
 */
public final char getLeadValue(char ch)
{
    final int leadOffset = getLeadOffset(ch);
    return m_data_[leadOffset];
}
// protected methods -----------------------------------------------
/**
 * <p>Parses the input stream and stores its trie content into an index and
 * data array.</p>
 * <p>m_dataOffset_ + m_dataLength_ chars are read into one array that
 * serves as both m_index_ and m_data_; the default value is taken from
 * the entry at m_dataOffset_.</p>
 * @param inputStream data input stream containing trie data
 * @exception IOException thrown when data reading fails
 */
protected final void unserialize(InputStream inputStream)
                                            throws IOException
{
    final DataInputStream reader = new DataInputStream(inputStream);
    final int totalUnits = m_dataOffset_ + m_dataLength_;

    m_index_ = new char[totalUnits];
    int unit = 0;
    while (unit < totalUnits) {
        m_index_[unit] = reader.readChar();
        ++unit;
    }

    // index and data share the same backing array
    m_data_ = m_index_;
    m_initialValue_ = m_data_[m_dataOffset_];
}
/**
 * Gets the offset to the data which the surrogate pair points to.
 * @param lead lead surrogate
 * @param trail trailing surrogate
 * @return offset to data, or -1 if the folding offset obtained for the
 *         lead surrogate is not positive; callers then use the default
 *         value m_initialValue_
 * @throws NullPointerException if this trie has no DataManipulate object
 * @draft 2.1
 */
protected final int getSurrogateOffset(char lead, char trail)
{
    if (null == m_dataManipulate_) {
        throw new NullPointerException(
                         "The field DataManipulate in this Trie is null");
    }

    // fold position for the next trail surrogate
    final int foldingOffset =
        m_dataManipulate_.getFoldingOffset(getLeadValue(lead));
    if (foldingOffset <= 0) {
        // error indicator
        return -1;
    }

    // the real data comes from the folded lead/trail units
    final char maskedTrail = (char)(trail & SURROGATE_MASK_);
    return getRawOffset(foldingOffset, maskedTrail);
}
// private data members --------------------------------------------
/**
* Default value
*/
private char m_initialValue_;
/**
* Array of char data
*/
private char m_data_[];
}

View file

@ -0,0 +1,145 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
import java.text.CharacterIterator;
/**
* This class is a wrapper around CharacterIterator and implements the
* UCharacterIterator protocol
* @author ram
*/
class CharacterIteratorWrapper extends UCharacterIterator {

    /** The wrapped iterator; every operation of this class is forwarded to it. */
    private CharacterIterator iterator;

    /**
     * Wraps the given iterator.
     * @param iter the iterator to delegate to
     * @throws IllegalArgumentException if {@code iter} is null
     */
    public CharacterIteratorWrapper(CharacterIterator iter){
        if (iter == null) {
            throw new IllegalArgumentException();
        }
        this.iterator = iter;
    }

    /**
     * Returns the character at the current position without moving, or
     * {@code DONE} when the wrapped iterator reports CharacterIterator.DONE.
     * @see UCharacterIterator#current()
     */
    public int current() {
        final int ch = iterator.current();
        return (ch == CharacterIterator.DONE) ? DONE : ch;
    }

    /**
     * Returns the number of characters between the wrapped iterator's
     * begin index and end index.
     * @see UCharacterIterator#getLength()
     */
    public int getLength() {
        final int end = iterator.getEndIndex();
        final int begin = iterator.getBeginIndex();
        return end - begin;
    }

    /**
     * Returns the current index of the wrapped iterator.
     * @see UCharacterIterator#getIndex()
     */
    public int getIndex() {
        return iterator.getIndex();
    }

    /**
     * Returns the character at the current position and then advances the
     * wrapped iterator by one; returns {@code DONE} if the character read
     * was CharacterIterator.DONE.
     * @see UCharacterIterator#next()
     */
    public int next() {
        final int ch = iterator.current();
        // The wrapped iterator is advanced even when the character read was DONE.
        iterator.next();
        return (ch == CharacterIterator.DONE) ? DONE : ch;
    }

    /**
     * Moves the wrapped iterator back by one and returns the character
     * there, or {@code DONE} if it reports CharacterIterator.DONE.
     * @see UCharacterIterator#previous()
     */
    public int previous() {
        final int ch = iterator.previous();
        return (ch == CharacterIterator.DONE) ? DONE : ch;
    }

    /**
     * Repositions the wrapped iterator.
     * @see UCharacterIterator#setIndex(int)
     */
    public void setIndex(int index) {
        iterator.setIndex(index);
    }

    /**
     * Copies the whole text of the wrapped iterator into {@code fillIn},
     * starting at {@code offset}, and restores the iterator's index afterwards.
     * @param fillIn destination array
     * @param offset index in {@code fillIn} of the first character written
     * @return the text length (end index minus begin index)
     * @throws IndexOutOfBoundsException if {@code offset} is negative or the
     *         text does not fit; the message is the text length
     * @see UCharacterIterator#getText(char[])
     */
    public int getText(char[] fillIn, int offset){
        final int end = iterator.getEndIndex();
        final int begin = iterator.getBeginIndex();
        final int length = end - begin;
        final int savedIndex = iterator.getIndex();
        if (offset < 0 || offset + length > fillIn.length) {
            throw new IndexOutOfBoundsException(Integer.toString(length));
        }
        int writePos = offset;
        char ch = iterator.first();
        while (ch != CharacterIterator.DONE) {
            fillIn[writePos++] = ch;
            ch = iterator.next();
        }
        // Walking the text moved the iterator; put it back where the caller left it.
        iterator.setIndex(savedIndex);
        return length;
    }

    /**
     * Creates a clone of this iterator. Clones the underlying character iterator.
     * @return the copy, or null if cloning is not supported
     * @see UCharacterIterator#clone()
     */
    public Object clone(){
        try {
            final CharacterIteratorWrapper copy = (CharacterIteratorWrapper) super.clone();
            // Give the copy its own underlying iterator so the two move independently.
            copy.iterator = (CharacterIterator) this.iterator.clone();
            return copy;
        } catch (CloneNotSupportedException e) {
            return null; // only invoked if bad underlying character iterator
        }
    }
}

View file

@ -0,0 +1,266 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 2009-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.IOException;
/**
* Normalization filtered by a UnicodeSet.
* Normalizes portions of the text contained in the filter set and leaves
* portions not contained in the filter set unchanged.
* Filtering is done via UnicodeSet.span(..., UnicodeSet.SpanCondition.SIMPLE).
* Not-in-the-filter text is treated as "is normalized" and "quick check yes".
* This class implements all of (and only) the Normalizer2 API.
* An instance of this class is unmodifiable/immutable.
* @stable ICU 4.4
* @author Markus W. Scherer
*/
class FilteredNormalizer2 extends Normalizer2 {

    /**
     * Constructs a filtered normalizer wrapping any Normalizer2 instance
     * and a filter set.
     * Both are aliased and must not be modified or deleted while this object
     * is used.
     * The filter set should be frozen; otherwise the performance will suffer greatly.
     * @param n2 wrapped Normalizer2 instance
     * @param filterSet UnicodeSet which determines the characters to be normalized
     * @stable ICU 4.4
     */
    public FilteredNormalizer2(Normalizer2 n2, UnicodeSet filterSet) {
        this.norm2 = n2;
        this.set = filterSet;
    }

    /**
     * {@inheritDoc}
     * @stable ICU 4.4
     */
    @Override
    public StringBuilder normalize(CharSequence src, StringBuilder dest) {
        if (dest == src) {
            throw new IllegalArgumentException();
        }
        // Replace, rather than extend, whatever dest held before.
        dest.setLength(0);
        normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
        return dest;
    }

    /**
     * {@inheritDoc}
     * @stable ICU 4.6
     */
    @Override
    public Appendable normalize(CharSequence src, Appendable dest) {
        if (dest == src) {
            throw new IllegalArgumentException();
        }
        return normalize(src, dest, UnicodeSet.SpanCondition.SIMPLE);
    }

    /**
     * {@inheritDoc}
     * @stable ICU 4.4
     */
    @Override
    public StringBuilder normalizeSecondAndAppend(
            StringBuilder first, CharSequence second) {
        return normalizeSecondAndAppend(first, second, true);
    }

    /**
     * {@inheritDoc}
     * @stable ICU 4.4
     */
    @Override
    public StringBuilder append(StringBuilder first, CharSequence second) {
        return normalizeSecondAndAppend(first, second, false);
    }

    /**
     * {@inheritDoc}
     * Code points outside the filter set have no decomposition (null).
     * @stable ICU 4.6
     */
    @Override
    public String getDecomposition(int c) {
        if (!set.contains(c)) {
            return null;
        }
        return norm2.getDecomposition(c);
    }

    /**
     * {@inheritDoc}
     * Code points outside the filter set report combining class 0.
     * @stable ICU 49
     */
    @Override
    public int getCombiningClass(int c) {
        if (!set.contains(c)) {
            return 0;
        }
        return norm2.getCombiningClass(c);
    }

    /**
     * {@inheritDoc}
     * Only the in-filter spans are checked with the wrapped normalizer.
     * @stable ICU 4.4
     */
    @Override
    public boolean isNormalized(CharSequence s) {
        UnicodeSet.SpanCondition condition = UnicodeSet.SpanCondition.SIMPLE;
        int start = 0;
        while (start < s.length()) {
            final int limit = set.span(s, start, condition);
            if (condition != UnicodeSet.SpanCondition.NOT_CONTAINED) {
                // In-filter span: it must be normalized on its own.
                if (!norm2.isNormalized(s.subSequence(start, limit))) {
                    return false;
                }
                condition = UnicodeSet.SpanCondition.NOT_CONTAINED;
            } else {
                // Out-of-filter span: nothing to check, just alternate.
                condition = UnicodeSet.SpanCondition.SIMPLE;
            }
            start = limit;
        }
        return true;
    }

    /**
     * {@inheritDoc}
     * Out-of-filter spans count as "quick check yes".
     * @stable ICU 4.4
     */
    @Override
    public int spanQuickCheckYes(CharSequence s) {
        UnicodeSet.SpanCondition condition = UnicodeSet.SpanCondition.SIMPLE;
        int start = 0;
        while (start < s.length()) {
            final int limit = set.span(s, start, condition);
            if (condition != UnicodeSet.SpanCondition.NOT_CONTAINED) {
                final int yesLength = norm2.spanQuickCheckYes(s.subSequence(start, limit));
                final int yesLimit = start + yesLength;
                if (yesLimit < limit) {
                    // The wrapped normalizer stopped inside this span.
                    return yesLimit;
                }
                condition = UnicodeSet.SpanCondition.NOT_CONTAINED;
            } else {
                condition = UnicodeSet.SpanCondition.SIMPLE;
            }
            start = limit;
        }
        return s.length();
    }

    /**
     * {@inheritDoc}
     * Every code point outside the filter set has a boundary before it.
     * @stable ICU 4.4
     */
    @Override
    public boolean hasBoundaryBefore(int c) {
        if (set.contains(c)) {
            return norm2.hasBoundaryBefore(c);
        }
        return true;
    }

    // Internal: No argument checking, and appends to dest.
    // Pass as input spanCondition the one that is likely to yield a non-zero
    // span length at the start of src.
    // For set=[:age=3.2:], since almost all common characters were in Unicode 3.2,
    // UnicodeSet.SpanCondition.SIMPLE should be passed in for the start of src
    // and UnicodeSet.SpanCondition.NOT_CONTAINED should be passed in if we continue after
    // an in-filter prefix.
    private Appendable normalize(CharSequence src, Appendable dest,
                                 UnicodeSet.SpanCondition spanCondition) {
        // One scratch buffer is reused for every in-filter span.
        final StringBuilder scratch = new StringBuilder();
        try {
            int start = 0;
            while (start < src.length()) {
                final int limit = set.span(src, start, spanCondition);
                final boolean nonEmpty = (limit - start) != 0;
                if (spanCondition == UnicodeSet.SpanCondition.NOT_CONTAINED) {
                    // Out-of-filter text is copied through unchanged.
                    if (nonEmpty) {
                        dest.append(src, start, limit);
                    }
                    spanCondition = UnicodeSet.SpanCondition.SIMPLE;
                } else {
                    if (nonEmpty) {
                        // Not norm2.normalizeSecondAndAppend() because we do not want
                        // to modify the non-filter part of dest.
                        final CharSequence piece = src.subSequence(start, limit);
                        dest.append(norm2.normalize(piece, scratch));
                    }
                    spanCondition = UnicodeSet.SpanCondition.NOT_CONTAINED;
                }
                start = limit;
            }
        } catch (IOException e) {
            throw new InternalError(e.toString(), e);
        }
        return dest;
    }

    // Shared implementation of normalizeSecondAndAppend() (doNormalize=true)
    // and append() (doNormalize=false).
    private StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second,
                                                   boolean doNormalize) {
        if (first == second) {
            throw new IllegalArgumentException();
        }
        if (first.length() == 0) {
            // Nothing to merge with: plain normalize or plain copy.
            return doNormalize ? normalize(second, first) : first.append(second);
        }
        // merge the in-filter suffix of the first string with the in-filter prefix of the second
        final int prefixLimit = set.span(second, 0, UnicodeSet.SpanCondition.SIMPLE);
        if (prefixLimit != 0) {
            final CharSequence prefix = second.subSequence(0, prefixLimit);
            final int suffixStart =
                set.spanBack(first, Integer.MAX_VALUE, UnicodeSet.SpanCondition.SIMPLE);
            if (suffixStart != 0) {
                // Only the in-filter tail of first may be touched by the wrapped
                // normalizer, so merge on a copy of that tail and splice it back.
                final StringBuilder middle = new StringBuilder(
                    first.subSequence(suffixStart, first.length()));
                mergeWithNorm2(middle, prefix, doNormalize);
                first.delete(suffixStart, Integer.MAX_VALUE).append(middle);
            } else {
                // All of first is in the filter: merge directly.
                mergeWithNorm2(first, prefix, doNormalize);
            }
        }
        if (prefixLimit < second.length()) {
            final CharSequence rest = second.subSequence(prefixLimit, second.length());
            if (doNormalize) {
                // rest starts with out-of-filter text.
                normalize(rest, first, UnicodeSet.SpanCondition.NOT_CONTAINED);
            } else {
                first.append(rest);
            }
        }
        return first;
    }

    // Lets the wrapped normalizer merge piece onto target, normalizing piece
    // first when doNormalize is set.
    private void mergeWithNorm2(StringBuilder target, CharSequence piece, boolean doNormalize) {
        if (doNormalize) {
            norm2.normalizeSecondAndAppend(target, piece);
        } else {
            norm2.append(target, piece);
        }
    }

    /** The wrapped normalizer, applied to in-filter text only. */
    private Normalizer2 norm2;
    /** The filter: which code points get normalized. */
    private UnicodeSet set;
}

View file

@ -0,0 +1,266 @@
/*
* Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 1996-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.FileChannel;
import java.nio.file.FileSystems;
import java.util.Arrays;
import java.security.AccessController;
import java.security.PrivilegedAction;
public final class ICUBinary {

    /** Accepts only data whose version array starts with 1. */
    private static final class IsAcceptable implements Authenticate {
        @Override
        public boolean isDataVersionAcceptable(byte version[]) {
            return version[0] == 1;
        }
    }

    // public inner interface ------------------------------------------------

    /**
     * Special interface for data authentication
     */
    public static interface Authenticate
    {
        /**
         * Method used in ICUBinary.readHeader() to provide data format
         * authentication.
         * @param version version of the current data
         * @return true if dataformat is an acceptable version, false otherwise
         */
        public boolean isDataVersionAcceptable(byte version[]);
    }

    // public methods --------------------------------------------------------

    /**
     * Loads an ICU binary data file and returns it as a ByteBuffer.
     * The whole resource is read, however many read calls that takes and
     * whatever its size. The returned buffer wraps a freshly allocated array;
     * callers should treat the contents as read-only, but may modify the
     * buffer's position etc.
     *
     * @param itemPath Resource path of the ICU data item, resolved with
     *                 {@code ICUBinary.class.getResourceAsStream(itemPath)}.
     * @return The data as a ByteBuffer positioned at 0 whose limit is the data length.
     * @throws UncheckedIOException if the resource does not exist or cannot be read
     */
    public static ByteBuffer getRequiredData(String itemPath) {
        final Class<ICUBinary> root = ICUBinary.class;
        try (InputStream is = AccessController.doPrivileged(new PrivilegedAction<InputStream>() {
                public InputStream run() {
                    return root.getResourceAsStream(itemPath);
                }
            })) {
            if (is == null) {
                // getResourceAsStream() signals a missing resource with null.
                throw new IOException("ICU data file not found: " + itemPath);
            }
            return readAll(is, itemPath);
        }
        catch (IOException e) {
            throw new UncheckedIOException(e);
        }
    }

    /**
     * Reads {@code in} to end-of-stream into a growing array.
     * A single read() call may deliver fewer bytes than requested, so this
     * keeps reading until a read reports end-of-stream.
     */
    private static ByteBuffer readAll(InputStream in, String itemPath) throws IOException {
        // available() is only a sizing hint; it is never used to detect the end.
        byte[] data = new byte[Math.max(in.available(), MIN_BUFFER_SIZE)];
        int length = 0;
        for (;;) {
            if (length == data.length) {
                // Buffer is full: probe one byte so that an exactly-sized
                // buffer is not doubled just to discover end-of-stream.
                int next = in.read();
                if (next < 0) {
                    break;
                }
                if (data.length > Integer.MAX_VALUE / 2) {
                    throw new IOException("ICU data file too large: " + itemPath);
                }
                data = Arrays.copyOf(data, data.length * 2);
                data[length++] = (byte) next;
            }
            int n = in.read(data, length, data.length - length);
            if (n < 0) {
                break;
            }
            length += n;
        }
        return ByteBuffer.wrap(data, 0, length);
    }

    /**
     * Same as readHeader(), but returns a VersionInfo rather than a compact int.
     */
    public static VersionInfo readHeaderAndDataVersion(ByteBuffer bytes,
                                                       int dataFormat,
                                                       Authenticate authenticate)
                                                throws IOException {
        return getVersionInfoFromCompactInt(readHeader(bytes, dataFormat, authenticate));
    }

    private static final byte BIG_ENDIAN_ = 1;

    /**
     * Reads and validates an ICU data header from a stream, leaving the
     * stream positioned after the header.
     * The data must be big-endian, with the expected charset family and
     * char size, and its 4-byte format ID must equal
     * {@code dataFormatIDExpected}.
     *
     * @param inputStream stream positioned at the start of the header
     * @param dataFormatIDExpected the 4-byte data format ID to require
     * @param authenticate optional check of the 4 bytes following the format
     *                     ID; may be null to skip the check
     * @return the 4 bytes that follow the bytes passed to {@code authenticate}
     *         (held in the local {@code unicodeVersion})
     * @throws IOException if reading fails, the magic number is wrong, the
     *                     header size is too small, or validation fails
     */
    public static final byte[] readHeader(InputStream inputStream,
                                          byte dataFormatIDExpected[],
                                          Authenticate authenticate)
                                                          throws IOException
    {
        DataInputStream input = new DataInputStream(inputStream);
        char headersize = input.readChar();
        // readcount tracks the bytes consumed so far so that the rest of the
        // header can be skipped at the end.
        int readcount = 2;
        //reading the header format
        byte magic1 = input.readByte();
        readcount ++;
        byte magic2 = input.readByte();
        readcount ++;
        if (magic1 != MAGIC1 || magic2 != MAGIC2) {
            throw new IOException(MAGIC_NUMBER_AUTHENTICATION_FAILED_);
        }
        input.readChar(); // reading size
        readcount += 2;
        input.readChar(); // reading reserved word
        readcount += 2;
        byte bigendian = input.readByte();
        readcount ++;
        byte charset = input.readByte();
        readcount ++;
        byte charsize = input.readByte();
        readcount ++;
        input.readByte(); // reading reserved byte
        readcount ++;
        byte dataFormatID[] = new byte[4];
        input.readFully(dataFormatID);
        readcount += 4;
        byte dataVersion[] = new byte[4];
        input.readFully(dataVersion);
        readcount += 4;
        byte unicodeVersion[] = new byte[4];
        input.readFully(unicodeVersion);
        readcount += 4;
        if (headersize < readcount) {
            throw new IOException("Internal Error: Header size error");
        }
        // Skip whatever remains of the header beyond the fields read above.
        input.skipBytes(headersize - readcount);
        if (bigendian != BIG_ENDIAN_ || charset != CHAR_SET_
            || charsize != CHAR_SIZE_
            || !Arrays.equals(dataFormatIDExpected, dataFormatID)
            || (authenticate != null
                && !authenticate.isDataVersionAcceptable(dataVersion))) {
            throw new IOException(HEADER_AUTHENTICATION_FAILED_);
        }
        return unicodeVersion;
    }

    /**
     * Reads an ICU data header, checks the data format, and returns the data version.
     *
     * <p>Assumes that the ByteBuffer position is 0 on input.
     * The buffer byte order is set according to the data.
     * The buffer position is advanced past the header (including UDataInfo and comment).
     *
     * <p>See C++ ucmndata.h and unicode/udata.h.
     *
     * @param bytes the data, positioned at 0
     * @param dataFormat the expected data format, as the 4 format bytes packed
     *                   big-endian into an int
     * @param authenticate optional check of the format version bytes; may be null
     * @return dataVersion
     * @throws IOException if this is not a valid ICU data item of the expected dataFormat
     */
    public static int readHeader(ByteBuffer bytes, int dataFormat, Authenticate authenticate)
            throws IOException {
        assert bytes.position() == 0;
        byte magic1 = bytes.get(2);
        byte magic2 = bytes.get(3);
        if (magic1 != MAGIC1 || magic2 != MAGIC2) {
            throw new IOException(MAGIC_NUMBER_AUTHENTICATION_FAILED_);
        }

        byte isBigEndian = bytes.get(8);
        byte charsetFamily = bytes.get(9);
        byte sizeofUChar = bytes.get(10);
        if (isBigEndian < 0 || 1 < isBigEndian ||
                charsetFamily != CHAR_SET_ || sizeofUChar != CHAR_SIZE_) {
            throw new IOException(HEADER_AUTHENTICATION_FAILED_);
        }
        // The multi-byte reads below depend on the byte order set here.
        bytes.order(isBigEndian != 0 ? ByteOrder.BIG_ENDIAN : ByteOrder.LITTLE_ENDIAN);

        int headerSize = bytes.getChar(0);
        int sizeofUDataInfo = bytes.getChar(4);
        if (sizeofUDataInfo < 20 || headerSize < (sizeofUDataInfo + 4)) {
            throw new IOException("Internal Error: Header size error");
        }
        // TODO: Change Authenticate to take int major, int minor, int milli, int micro
        // to avoid array allocation.
        byte[] formatVersion = new byte[] {
            bytes.get(16), bytes.get(17), bytes.get(18), bytes.get(19)
        };
        if (bytes.get(12) != (byte)(dataFormat >> 24) ||
                bytes.get(13) != (byte)(dataFormat >> 16) ||
                bytes.get(14) != (byte)(dataFormat >> 8) ||
                bytes.get(15) != (byte)dataFormat ||
                (authenticate != null && !authenticate.isDataVersionAcceptable(formatVersion))) {
            throw new IOException(HEADER_AUTHENTICATION_FAILED_ +
                    String.format("; data format %02x%02x%02x%02x, format version %d.%d.%d.%d",
                            bytes.get(12), bytes.get(13), bytes.get(14), bytes.get(15),
                            formatVersion[0] & 0xff, formatVersion[1] & 0xff,
                            formatVersion[2] & 0xff, formatVersion[3] & 0xff));
        }

        bytes.position(headerSize);
        return  // dataVersion
                ((int)bytes.get(20) << 24) |
                ((bytes.get(21) & 0xff) << 16) |
                ((bytes.get(22) & 0xff) << 8) |
                (bytes.get(23) & 0xff);
    }

    /**
     * Advances the buffer position by {@code skipLength} bytes; does nothing
     * when {@code skipLength} is zero or negative.
     */
    public static void skipBytes(ByteBuffer bytes, int skipLength) {
        if (skipLength > 0) {
            bytes.position(bytes.position() + skipLength);
        }
    }

    /**
     * Returns a VersionInfo for the bytes in the compact version integer.
     */
    public static VersionInfo getVersionInfoFromCompactInt(int version) {
        return VersionInfo.getInstance(
                version >>> 24, (version >> 16) & 0xff, (version >> 8) & 0xff, version & 0xff);
    }

    // private variables -------------------------------------------------

    /**
     * Smallest array allocated when loading a data file
     */
    private static final int MIN_BUFFER_SIZE = 4096;

    /**
     * Magic numbers to authenticate the data file
     */
    private static final byte MAGIC1 = (byte)0xda;
    private static final byte MAGIC2 = (byte)0x27;

    /**
     * File format authentication values
     */
    private static final byte CHAR_SET_ = 0;
    private static final byte CHAR_SIZE_ = 2;

    /**
     * Error messages
     */
    private static final String MAGIC_NUMBER_AUTHENTICATION_FAILED_ =
                       "ICUBinary data file error: Magic number authentication failed";
    private static final String HEADER_AUTHENTICATION_FAILED_ =
        "ICUBinary data file error: Header authentication failed";
}

View file

@ -0,0 +1,287 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 2009-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.IOException;
final class Norm2AllModes {
    // Public API dispatch via Normalizer2 subclasses -------------------------- ***

    /**
     * Normalizer2 implementation for the old UNORM_NONE: text is copied
     * through unchanged and is always reported as normalized.
     */
    public static final class NoopNormalizer2 extends Normalizer2 {
        /** Replaces the contents of dest with a copy of src. */
        @Override
        public StringBuilder normalize(CharSequence src, StringBuilder dest) {
            if (dest == src) {
                throw new IllegalArgumentException();
            }
            dest.setLength(0);
            return dest.append(src);
        }

        /** Appends src to dest; an IOException from dest becomes an InternalError. */
        @Override
        public Appendable normalize(CharSequence src, Appendable dest) {
            if (dest == src) {
                throw new IllegalArgumentException();
            }
            try {
                return dest.append(src);
            } catch (IOException e) {
                throw new InternalError(e.toString(), e);
            }
        }

        /** Appends second to first without any normalization. */
        @Override
        public StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second) {
            if (first == second) {
                throw new IllegalArgumentException();
            }
            return first.append(second);
        }

        /** Appends second to first. */
        @Override
        public StringBuilder append(StringBuilder first, CharSequence second) {
            if (first == second) {
                throw new IllegalArgumentException();
            }
            return first.append(second);
        }

        /** No code point has a decomposition here. */
        @Override
        public String getDecomposition(int c) {
            return null;
        }

        // No need to override the default getRawDecomposition().

        @Override
        public boolean isNormalized(CharSequence s) {
            return true;
        }

        @Override
        public int spanQuickCheckYes(CharSequence s) {
            return s.length();
        }

        @Override
        public boolean hasBoundaryBefore(int c) {
            return true;
        }
    }

    /**
     * Intermediate class:
     * Has NormalizerImpl and does boilerplate argument checking and setup;
     * subclasses supply the actual normalize/append work.
     */
    public abstract static class Normalizer2WithImpl extends Normalizer2 {
        /** The implementation that subclasses delegate to. */
        public final NormalizerImpl impl;

        public Normalizer2WithImpl(NormalizerImpl ni) {
            this.impl = ni;
        }

        // normalize

        /** Replaces the contents of dest with the normalized form of src. */
        @Override
        public StringBuilder normalize(CharSequence src, StringBuilder dest) {
            if (dest == src) {
                throw new IllegalArgumentException();
            }
            dest.setLength(0);
            final NormalizerImpl.ReorderingBuffer buffer =
                new NormalizerImpl.ReorderingBuffer(impl, dest, src.length());
            normalize(src, buffer);
            return dest;
        }

        /** Appends the normalized form of src to dest, flushing the buffer at the end. */
        @Override
        public Appendable normalize(CharSequence src, Appendable dest) {
            if (dest == src) {
                throw new IllegalArgumentException();
            }
            final NormalizerImpl.ReorderingBuffer buffer =
                new NormalizerImpl.ReorderingBuffer(impl, dest, src.length());
            normalize(src, buffer);
            buffer.flush();
            return dest;
        }

        /** Normalizes all of src into buffer. */
        protected abstract void normalize(CharSequence src, NormalizerImpl.ReorderingBuffer buffer);

        // normalize and append

        @Override
        public StringBuilder normalizeSecondAndAppend(StringBuilder first, CharSequence second) {
            return normalizeSecondAndAppend(first, second, true);
        }

        @Override
        public StringBuilder append(StringBuilder first, CharSequence second) {
            return normalizeSecondAndAppend(first, second, false);
        }

        /**
         * Shared implementation of normalizeSecondAndAppend() and append().
         * @param first the string appended to
         * @param second the string to append
         * @param doNormalize passed through to normalizeAndAppend()
         * @return first
         * @throws IllegalArgumentException if first and second are the same object
         */
        public StringBuilder normalizeSecondAndAppend(
                StringBuilder first, CharSequence second, boolean doNormalize) {
            if (first == second) {
                throw new IllegalArgumentException();
            }
            final int capacity = first.length() + second.length();
            final NormalizerImpl.ReorderingBuffer buffer =
                new NormalizerImpl.ReorderingBuffer(impl, first, capacity);
            normalizeAndAppend(second, doNormalize, buffer);
            return first;
        }

        /** Appends src to buffer; doNormalize is forwarded by the public append methods. */
        protected abstract void normalizeAndAppend(
                CharSequence src, boolean doNormalize, NormalizerImpl.ReorderingBuffer buffer);

        @Override
        public String getDecomposition(int c) {
            return impl.getDecomposition(c);
        }

        @Override
        public int getCombiningClass(int c) {
            return impl.getCC(impl.getNorm16(c));
        }

        // quick checks

        /** True when the quick-check span covers the whole string. */
        @Override
        public boolean isNormalized(CharSequence s) {
            return spanQuickCheckYes(s) == s.length();
        }
    }

    /** Normalizer2 that delegates to the decompose operations of NormalizerImpl. */
    public static final class DecomposeNormalizer2 extends Normalizer2WithImpl {
        public DecomposeNormalizer2(NormalizerImpl ni) {
            super(ni);
        }

        @Override
        protected void normalize(CharSequence src, NormalizerImpl.ReorderingBuffer buffer) {
            impl.decompose(src, 0, src.length(), buffer);
        }

        @Override
        protected void normalizeAndAppend(
                CharSequence src, boolean doNormalize, NormalizerImpl.ReorderingBuffer buffer) {
            impl.decomposeAndAppend(src, doNormalize, buffer);
        }

        /** Calls decompose() with a null buffer and returns its result. */
        @Override
        public int spanQuickCheckYes(CharSequence s) {
            return impl.decompose(s, 0, s.length(), null);
        }

        @Override
        public boolean hasBoundaryBefore(int c) {
            return impl.hasDecompBoundary(c, true);
        }
    }

    /** Normalizer2 that delegates to the compose operations of NormalizerImpl. */
    public static final class ComposeNormalizer2 extends Normalizer2WithImpl {
        /** Passed through to every compose call of the implementation. */
        private final boolean onlyContiguous;

        public ComposeNormalizer2(NormalizerImpl ni, boolean fcc) {
            super(ni);
            this.onlyContiguous = fcc;
        }

        @Override
        protected void normalize(CharSequence src, NormalizerImpl.ReorderingBuffer buffer) {
            impl.compose(src, 0, src.length(), onlyContiguous, true, buffer);
        }

        @Override
        protected void normalizeAndAppend(
                CharSequence src, boolean doNormalize, NormalizerImpl.ReorderingBuffer buffer) {
            impl.composeAndAppend(src, doNormalize, onlyContiguous, buffer);
        }

        @Override
        public boolean isNormalized(CharSequence s) {
            // 5: small destCapacity for substring normalization
            final NormalizerImpl.ReorderingBuffer scratch =
                new NormalizerImpl.ReorderingBuffer(impl, new StringBuilder(), 5);
            return impl.compose(s, 0, s.length(), onlyContiguous, false, scratch);
        }

        /** The quick-check result is shifted right by one to obtain the span length. */
        @Override
        public int spanQuickCheckYes(CharSequence s) {
            return impl.composeQuickCheck(s, 0, s.length(), onlyContiguous, true) >>> 1;
        }

        @Override
        public boolean hasBoundaryBefore(int c) {
            return impl.hasCompBoundaryBefore(c);
        }
    }

    // instance cache ---------------------------------------------------------- ***

    /** Builds the compose and decompose normalizers on top of one implementation. */
    private Norm2AllModes(NormalizerImpl ni) {
        this.impl = ni;
        this.comp = new ComposeNormalizer2(ni, false);
        this.decomp = new DecomposeNormalizer2(ni);
    }

    public final NormalizerImpl impl;
    public final ComposeNormalizer2 comp;
    public final DecomposeNormalizer2 decomp;

    /** Returns the holder's instance, or rethrows the exception recorded while loading it. */
    private static Norm2AllModes getInstanceFromSingleton(Norm2AllModesSingleton singleton) {
        final RuntimeException loadFailure = singleton.exception;
        if (loadFailure != null) {
            throw loadFailure;
        }
        return singleton.allModes;
    }

    /** Returns the instance backed by the "nfc" data file. */
    public static Norm2AllModes getNFCInstance() {
        return getInstanceFromSingleton(NFCSingleton.INSTANCE);
    }

    /** Returns the instance backed by the "nfkc" data file. */
    public static Norm2AllModes getNFKCInstance() {
        return getInstanceFromSingleton(NFKCSingleton.INSTANCE);
    }

    public static final NoopNormalizer2 NOOP_NORMALIZER2 = new NoopNormalizer2();

    /**
     * Holds either a loaded Norm2AllModes or the RuntimeException that
     * loading it raised, so the failure can be rethrown on each access.
     */
    private static final class Norm2AllModesSingleton {
        private Norm2AllModes allModes;
        private RuntimeException exception;

        private Norm2AllModesSingleton(String name) {
            try {
                final String dataFile = "/sun/text/resources/" + name + ".icu";
                final NormalizerImpl loaded = new NormalizerImpl().load(dataFile);
                this.allModes = new Norm2AllModes(loaded);
            } catch (RuntimeException e) {
                this.exception = e;
            }
        }
    }

    /** Holder class whose INSTANCE is built from the "nfc" data. */
    private static final class NFCSingleton {
        private static final Norm2AllModesSingleton INSTANCE = new Norm2AllModesSingleton("nfc");
    }

    /** Holder class whose INSTANCE is built from the "nfkc" data. */
    private static final class NFKCSingleton {
        private static final Norm2AllModesSingleton INSTANCE = new Norm2AllModesSingleton("nfkc");
    }
}

View file

@ -0,0 +1,271 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 2009-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
/**
* Unicode normalization functionality for standard Unicode normalization or
* for using custom mapping tables.
* All instances of this class are unmodifiable/immutable.
* The Normalizer2 class is not intended for public subclassing.
* <p>
* The primary functions are to produce a normalized string and to detect whether
* a string is already normalized.
* The most commonly used normalization forms are those defined in
* http://www.unicode.org/unicode/reports/tr15/
* However, this API supports additional normalization forms for specialized purposes.
* For example, NFKC_Casefold is provided via getInstance("nfkc_cf", COMPOSE)
* and can be used in implementations of UTS #46.
* <p>
* Not only are the standard compose and decompose modes supplied,
* but additional modes are provided as documented in the Mode enum.
* <p>
* Some of the functions in this class identify normalization boundaries.
* At a normalization boundary, the portions of the string
* before it and starting from it do not interact and can be handled independently.
* <p>
* The spanQuickCheckYes() stops at a normalization boundary.
* When the goal is a normalized string, then the text before the boundary
* can be copied, and the remainder can be processed with normalizeSecondAndAppend().
* <p>
* The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
* a character is guaranteed to be at a normalization boundary,
* regardless of context.
* This is used for moving from one normalization boundary to the next
* or preceding boundary, and for performing iterative normalization.
* <p>
* Iterative normalization is useful when only a small portion of a
* longer string needs to be processed.
* For example, in ICU, iterative normalization is used by the NormalizationTransliterator
* (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
* (to process only the substring for which sort key bytes are computed).
* <p>
* The set of normalization boundaries returned by these functions may not be
* complete: There may be more boundaries that could be returned.
* Different functions may return different boundaries.
* @stable ICU 4.4
* @author Markus W. Scherer
*/
abstract class Normalizer2 {
/**
* Returns a Normalizer2 instance for Unicode NFC normalization.
* Same as getInstance(null, "nfc", Mode.COMPOSE).
* Returns an unmodifiable singleton instance.
* @return the requested Normalizer2, if successful
* @stable ICU 49
*/
public static Normalizer2 getNFCInstance() {
return Norm2AllModes.getNFCInstance().comp;
}
/**
* Returns a Normalizer2 instance for Unicode NFD normalization.
* Same as getInstance(null, "nfc", Mode.DECOMPOSE).
* Returns an unmodifiable singleton instance.
* @return the requested Normalizer2, if successful
* @stable ICU 49
*/
public static Normalizer2 getNFDInstance() {
return Norm2AllModes.getNFCInstance().decomp;
}
/**
* Returns a Normalizer2 instance for Unicode NFKC normalization.
* Same as getInstance(null, "nfkc", Mode.COMPOSE).
* Returns an unmodifiable singleton instance.
* @return the requested Normalizer2, if successful
* @stable ICU 49
*/
public static Normalizer2 getNFKCInstance() {
return Norm2AllModes.getNFKCInstance().comp;
}
/**
* Returns a Normalizer2 instance for Unicode NFKD normalization.
* Same as getInstance(null, "nfkc", Mode.DECOMPOSE).
* Returns an unmodifiable singleton instance.
* @return the requested Normalizer2, if successful
* @stable ICU 49
*/
public static Normalizer2 getNFKDInstance() {
return Norm2AllModes.getNFKCInstance().decomp;
}
/**
 * Returns the normalized form of the source string.
 * <p>
 * When {@code src} is a {@code String} that is already normalized,
 * that very object is returned and no new string is allocated.
 * @param src source string
 * @return normalized src
 * @stable ICU 4.4
 */
public String normalize(CharSequence src) {
    if(!(src instanceof String)) {
        // General case: normalize into a fresh builder.
        StringBuilder dest=new StringBuilder(src.length());
        return normalize(src, dest).toString();
    }
    // Fastpath for String input: find how much of it already passes
    // the quick check so an already-normalized String is returned as-is.
    final int yesEnd=spanQuickCheckYes(src);
    if(yesEnd==src.length()) {
        return (String)src;
    }
    // Copy the already-normalized prefix verbatim, then normalize only
    // the remainder and merge it at the boundary.
    StringBuilder result=new StringBuilder(src.length());
    result.append(src, 0, yesEnd);
    CharSequence remainder=src.subSequence(yesEnd, src.length());
    return normalizeSecondAndAppend(result, remainder).toString();
}
/**
* Writes the normalized form of the source string to the destination string
* (replacing its contents) and returns the destination string.
* The source and destination strings must be different objects.
* @param src source string
* @param dest destination string; its contents is replaced with normalized src
* @return dest
* @stable ICU 4.4
*/
public abstract StringBuilder normalize(CharSequence src, StringBuilder dest);
/**
* Writes the normalized form of the source string to the destination Appendable
* and returns the destination Appendable.
* The source and destination strings must be different objects.
*
* <p>Any {@link java.io.IOException} is wrapped into a {@link com.ibm.icu.util.ICUUncheckedIOException}.
*
* @param src source string
* @param dest destination Appendable; gets normalized src appended
* @return dest
* @stable ICU 4.6
*/
public abstract Appendable normalize(CharSequence src, Appendable dest);
/**
* Appends the normalized form of the second string to the first string
* (merging them at the boundary) and returns the first string.
* The result is normalized if the first string was normalized.
* The first and second strings must be different objects.
* @param first string, should be normalized
* @param second string, will be normalized
* @return first
* @stable ICU 4.4
*/
public abstract StringBuilder normalizeSecondAndAppend(
StringBuilder first, CharSequence second);
/**
* Appends the second string to the first string
* (merging them at the boundary) and returns the first string.
* The result is normalized if both the strings were normalized.
* The first and second strings must be different objects.
* @param first string, should be normalized
* @param second string, should be normalized
* @return first
* @stable ICU 4.4
*/
public abstract StringBuilder append(StringBuilder first, CharSequence second);
/**
* Gets the decomposition mapping of c.
* Roughly equivalent to normalizing the String form of c
* on a DECOMPOSE Normalizer2 instance, but much faster, and except that this function
* returns null if c does not have a decomposition mapping in this instance's data.
* This function is independent of the mode of the Normalizer2.
* @param c code point
* @return c's decomposition mapping, if any; otherwise null
* @stable ICU 4.6
*/
public abstract String getDecomposition(int c);
/**
 * Gets the combining class of c.
 * This default implementation always answers 0;
 * all standard implementations return the Unicode Canonical_Combining_Class value.
 * @param c code point
 * @return c's combining class
 * @stable ICU 49
 */
public int getCombiningClass(int c) {
    return 0;
}
/**
* Tests if the string is normalized.
* Internally, in cases where the quickCheck() method would return "maybe"
* (which is only possible for the two COMPOSE modes) this method
* resolves to "yes" or "no" to provide a definitive result,
* at the cost of doing more work in those cases.
* @param s input string
* @return true if s is normalized
* @stable ICU 4.4
*/
public abstract boolean isNormalized(CharSequence s);
/**
* Returns the end of the normalized substring of the input string.
* In other words, with <code>end=spanQuickCheckYes(s);</code>
* the substring <code>s.subSequence(0, end)</code>
* will pass the quick check with a "yes" result.
* <p>
* The returned end index is usually one or more characters before the
* "no" or "maybe" character: The end index is at a normalization boundary.
* (See the class documentation for more about normalization boundaries.)
* <p>
* When the goal is a normalized string and most input strings are expected
* to be normalized already, then call this method,
* and if it returns a prefix shorter than the input string,
* copy that prefix and use normalizeSecondAndAppend() for the remainder.
* @param s input string
* @return "yes" span end index
* @stable ICU 4.4
*/
public abstract int spanQuickCheckYes(CharSequence s);
/**
* Tests if the character always has a normalization boundary before it,
* regardless of context.
* If true, then the character does not normalization-interact with
* preceding characters.
* In other words, a string containing this character can be normalized
* by processing portions before this character and starting from this
* character independently.
* This is used for iterative normalization. See the class documentation for details.
* @param c character to test
* @return true if c has a normalization boundary before it
* @stable ICU 4.4
*/
public abstract boolean hasBoundaryBefore(int c);
/**
* Sole constructor. (For invocation by subclass constructors,
* typically implicit.)
* @internal
* deprecated This API is ICU internal only.
*/
protected Normalizer2() {
// Nothing to initialize: this class declares no instance fields.
}
}

View file

@ -0,0 +1,782 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 2000-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
import java.text.CharacterIterator;
import java.text.Normalizer;
/**
* Unicode Normalization
*
* <h2>Unicode normalization API</h2>
*
* <code>normalize</code> transforms Unicode text into an equivalent composed or
* decomposed form, allowing for easier sorting and searching of text.
* <code>normalize</code> supports the standard normalization forms described in
* <a href="http://www.unicode.org/unicode/reports/tr15/" target="unicode">
* Unicode Standard Annex #15 &mdash; Unicode Normalization Forms</a>.
*
* Characters with accents or other adornments can be encoded in
* several different ways in Unicode. For example, take the character A-acute.
* In Unicode, this can be encoded as a single character (the
* "composed" form):
*
* <pre>
* 00C1 LATIN CAPITAL LETTER A WITH ACUTE
* </pre>
*
* or as two separate characters (the "decomposed" form):
*
* <pre>
* 0041 LATIN CAPITAL LETTER A
* 0301 COMBINING ACUTE ACCENT
* </pre>
*
* To a user of your program, however, both of these sequences should be
* treated as the same "user-level" character "A with acute accent". When you
* are searching or comparing text, you must ensure that these two sequences are
* treated equivalently. In addition, you must handle characters with more than
* one accent. Sometimes the order of a character's combining accents is
* significant, while in other cases accent sequences in different orders are
* really equivalent.
*
* Similarly, the string "ffi" can be encoded as three separate letters:
*
* <pre>
* 0066 LATIN SMALL LETTER F
* 0066 LATIN SMALL LETTER F
* 0069 LATIN SMALL LETTER I
* </pre>
*
* or as the single character
*
* <pre>
* FB03 LATIN SMALL LIGATURE FFI
* </pre>
*
* The ffi ligature is not a distinct semantic character, and strictly speaking
* it shouldn't be in Unicode at all, but it was included for compatibility
* with existing character sets that already provided it. The Unicode standard
* identifies such characters by giving them "compatibility" decompositions
* into the corresponding semantic characters. When sorting and searching, you
* will often want to use these mappings.
*
* <code>normalize</code> helps solve these problems by transforming text into
* the canonical composed and decomposed forms as shown in the first example
* above. In addition, you can have it perform compatibility decompositions so
* that you can treat compatibility characters the same as their equivalents.
* Finally, <code>normalize</code> rearranges accents into the proper canonical
* order, so that you do not have to worry about accent rearrangement on your
* own.
*
* Form FCD, "Fast C or D", is also designed for collation.
* It allows working on strings that are not necessarily normalized
* with an algorithm (like in collation) that works under "canonical closure",
* i.e., it treats precomposed characters and their decomposed equivalents the
* same.
*
* It is not a normalization form because it does not provide for uniqueness of
* representation. Multiple strings may be canonically equivalent (their NFDs
* are identical) and may all conform to FCD without being identical themselves.
*
* The form is defined such that the "raw decomposition", the recursive
* canonical decomposition of each character, results in a string that is
* canonically ordered. This means that precomposed characters are allowed for
* as long as their decompositions do not need canonical reordering.
*
* Its advantage for a process like collation is that all NFD and most NFC texts
* - and many unnormalized texts - already conform to FCD and do not need to be
* normalized (NFD) for such a process. The FCD quick check will return YES for
* most strings in practice.
*
* normalize(FCD) may be implemented with NFD.
*
* For more details on FCD see Unicode Technical Note #5 (Canonical Equivalence in Applications):
* http://www.unicode.org/notes/tn5/#FCD
*
* ICU collation performs either NFD or FCD normalization automatically if
* normalization is turned on for the collator object. Beyond collation and
* string search, normalized strings may be useful for string equivalence
* comparisons, transliteration/transcription, unique representations, etc.
*
* The W3C generally recommends exchanging texts in NFC.
* Note also that most legacy character encodings use only precomposed forms and
* often do not encode any combining marks by themselves. For conversion to such
* character encodings the Unicode text needs to be normalized to NFC.
* For more usage examples, see the Unicode Standard Annex.
*
* Note: The Normalizer class also provides API for iterative normalization.
* While the setIndex() and getIndex() refer to indices in the
* underlying Unicode input text, the next() and previous() methods
* iterate through characters in the normalized output.
* This means that there is not necessarily a one-to-one correspondence
* between characters returned by next() and previous() and the indices
* passed to and returned from setIndex() and getIndex().
* It is for this reason that Normalizer does not implement the CharacterIterator interface.
*
* @stable ICU 2.8
*/
// Original filename in ICU4J: Normalizer.java
public final class NormalizerBase implements Cloneable {
// The input text and our position in it
private UCharacterIterator text;
private Normalizer2 norm2;
private Mode mode;
private int options;
// The normalization buffer is the result of normalization
// of the source in [currentIndex..nextIndex] .
private int currentIndex;
private int nextIndex;
// A buffer for holding intermediate results
private StringBuilder buffer;
private int bufferPos;
// Helper classes to defer loading of normalization data.
private static final class ModeImpl {
    /** The normalizer this holder wraps; assigned once by the constructor. */
    private final Normalizer2 normalizer2;

    private ModeImpl(Normalizer2 n2) {
        this.normalizer2 = n2;
    }
}
// Each holder below creates its ModeImpl in a static initializer, so the
// underlying Normalizer2 is only requested when that holder class is first used.
// Holder for the NFD normalizer.
private static final class NFDModeImpl {
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFDInstance());
}
// Holder for the NFKD normalizer.
private static final class NFKDModeImpl {
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKDInstance());
}
// Holder for the NFC normalizer.
private static final class NFCModeImpl {
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFCInstance());
}
// Holder for the NFKC normalizer.
private static final class NFKCModeImpl {
private static final ModeImpl INSTANCE = new ModeImpl(Normalizer2.getNFKCInstance());
}
// Holder for the frozen UnicodeSet built from the pattern "[:age=3.2:]";
// it is shared by all *32ModeImpl holders.
// NOTE(review): presumably the code points assigned as of Unicode 3.2 -- confirm against UnicodeSet.
private static final class Unicode32 {
private static final UnicodeSet INSTANCE = new UnicodeSet("[:age=3.2:]").freeze();
}
// The four holders below back the UNICODE_3_2 option: each pairs a standard
// normalizer with the Unicode32 set through FilteredNormalizer2.
// NOTE(review): presumably only characters inside the set get normalized -- confirm
// against FilteredNormalizer2.
// Holder for the Unicode 3.2 flavor of NFD.
private static final class NFD32ModeImpl {
private static final ModeImpl INSTANCE =
new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFDInstance(),
Unicode32.INSTANCE));
}
// Holder for the Unicode 3.2 flavor of NFKD.
private static final class NFKD32ModeImpl {
private static final ModeImpl INSTANCE =
new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKDInstance(),
Unicode32.INSTANCE));
}
// Holder for the Unicode 3.2 flavor of NFC.
private static final class NFC32ModeImpl {
private static final ModeImpl INSTANCE =
new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFCInstance(),
Unicode32.INSTANCE));
}
// Holder for the Unicode 3.2 flavor of NFKC.
private static final class NFKC32ModeImpl {
private static final ModeImpl INSTANCE =
new ModeImpl(new FilteredNormalizer2(Normalizer2.getNFKCInstance(),
Unicode32.INSTANCE));
}
/**
* Options bit set value to select Unicode 3.2 normalization
* (except NormalizationCorrections).
* At most one Unicode version can be selected at a time.
* @stable ICU 2.6
*/
public static final int UNICODE_3_2=0x20;
public static final int UNICODE_3_2_0_ORIGINAL=UNICODE_3_2;
/*
* Default option for the latest Unicode normalization. This option is
* provided mainly for testing.
* The value zero means that normalization is done with the fixes for
* - Corrigendum 4 (Five CJK Canonical Mapping Errors)
* - Corrigendum 5 (Normalization Idempotency)
*/
public static final int UNICODE_LATEST = 0x00;
/**
* Constant indicating that the end of the iteration has been reached.
* This is guaranteed to have the same value as {@link UCharacterIterator#DONE}.
* @stable ICU 2.8
*/
public static final int DONE = UCharacterIterator.DONE;
/**
* Constants for normalization modes.
* <p>
* The Mode class is not intended for public subclassing.
* Only the Mode constants provided by the Normalizer class should be used,
* and any fields or methods should not be called or overridden by users.
* @stable ICU 2.8
*/
public abstract static class Mode {
/**
* Sole constructor
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
protected Mode() {
}
/**
* Returns the {@link Normalizer2} that carries out this mode for the
* given option bits.
* @param options option bit set; the implementations in this file only
* test the {@link #UNICODE_3_2} bit
* @return the normalizer to use for this mode
* @internal
* @deprecated This API is ICU internal only.
*/
@Deprecated
protected abstract Normalizer2 getNormalizer2(int options);
}
/**
 * Maps a {@link java.text.Normalizer.Form} onto the matching {@link Mode} constant.
 * @param form the public normalization form
 * @return the equivalent Mode
 * @throws IllegalArgumentException if the form is not one of NFC, NFD, NFKC, NFKD
 */
private static Mode toMode(Normalizer.Form form) {
    switch (form) {
        case NFC:
            return NFC;
        case NFD:
            return NFD;
        case NFKC:
            return NFKC;
        case NFKD:
            return NFKD;
        default:
            throw new IllegalArgumentException("Unexpected normalization form: " +
                                               form);
    }
}
/** Mode that performs no normalization: always answers the no-op normalizer. */
private static final class NONEMode extends Mode {
    protected Normalizer2 getNormalizer2(int options) {
        // The option bits are irrelevant for a no-op.
        return Norm2AllModes.NOOP_NORMALIZER2;
    }
}
/** NFD mode: picks the standard or the Unicode 3.2 normalizer by option bit. */
private static final class NFDMode extends Mode {
    protected Normalizer2 getNormalizer2(int options) {
        if ((options & UNICODE_3_2) == 0) {
            return NFDModeImpl.INSTANCE.normalizer2;
        }
        return NFD32ModeImpl.INSTANCE.normalizer2;
    }
}
/** NFKD mode: picks the standard or the Unicode 3.2 normalizer by option bit. */
private static final class NFKDMode extends Mode {
    protected Normalizer2 getNormalizer2(int options) {
        if ((options & UNICODE_3_2) == 0) {
            return NFKDModeImpl.INSTANCE.normalizer2;
        }
        return NFKD32ModeImpl.INSTANCE.normalizer2;
    }
}
/** NFC mode: picks the standard or the Unicode 3.2 normalizer by option bit. */
private static final class NFCMode extends Mode {
    protected Normalizer2 getNormalizer2(int options) {
        if ((options & UNICODE_3_2) == 0) {
            return NFCModeImpl.INSTANCE.normalizer2;
        }
        return NFC32ModeImpl.INSTANCE.normalizer2;
    }
}
/** NFKC mode: picks the standard or the Unicode 3.2 normalizer by option bit. */
private static final class NFKCMode extends Mode {
    protected Normalizer2 getNormalizer2(int options) {
        if ((options & UNICODE_3_2) == 0) {
            return NFKCModeImpl.INSTANCE.normalizer2;
        }
        return NFKC32ModeImpl.INSTANCE.normalizer2;
    }
}
/**
* No decomposition/composition.
* @stable ICU 2.8
*/
public static final Mode NONE = new NONEMode();
/**
* Canonical decomposition.
* @stable ICU 2.8
*/
public static final Mode NFD = new NFDMode();
/**
* Compatibility decomposition.
* @stable ICU 2.8
*/
public static final Mode NFKD = new NFKDMode();
/**
* Canonical decomposition followed by canonical composition.
* @stable ICU 2.8
*/
public static final Mode NFC = new NFCMode();
/**
* Compatibility decomposition followed by canonical composition.
*/
public static final Mode NFKC =new NFKCMode();
//-------------------------------------------------------------------------
// Iterator constructors
//-------------------------------------------------------------------------
/**
 * Creates a new {@code NormalizerBase} object for iterating over the
 * normalized form of a given string.
 * <p>
 * The {@code opt} parameter specifies which optional
 * {@code NormalizerBase} features are to be enabled for this object.
 *
 * @param str  The string to be normalized. The normalization
 *             will start at the beginning of the string.
 * @param mode The normalization mode.
 * @param opt  Any optional features to be enabled.
 *             Currently the only available option is {@link #UNICODE_3_2}.
 *             If you want the default behavior corresponding to one of the
 *             standard Unicode Normalization Forms, use 0 for this argument.
 * @stable ICU 2.6
 */
public NormalizerBase(String str, Mode mode, int opt) {
    text = UCharacterIterator.getInstance(str);
    this.mode = mode;
    options = opt;
    // The normalizer is resolved once here; setMode() refreshes it later.
    norm2 = mode.getNormalizer2(opt);
    buffer = new StringBuilder();
}
/**
 * Creates a new {@code NormalizerBase} over a string with no optional
 * features enabled (option value 0).
 * @param str  The string to be normalized.
 * @param mode The normalization mode.
 */
public NormalizerBase(String str, Mode mode) {
    this(str, mode, UNICODE_LATEST);
}
/**
 * Creates a new {@code NormalizerBase} object for iterating over the
 * normalized form of the given text. The iterator passed in is cloned,
 * and the clone is what this object reads from.
 *
 * @param iter The input text to be normalized. The normalization
 *             will start at the beginning of the string.
 * @param mode The normalization mode.
 * @param opt  Any optional features to be enabled.
 *             Currently the only available option is {@link #UNICODE_3_2}.
 *             If you want the default behavior corresponding to one of the
 *             standard Unicode Normalization Forms, use 0 for this argument.
 * @stable ICU 2.6
 */
public NormalizerBase(CharacterIterator iter, Mode mode, int opt) {
    CharacterIterator privateCopy = (CharacterIterator) iter.clone();
    text = UCharacterIterator.getInstance(privateCopy);
    this.mode = mode;
    options = opt;
    norm2 = mode.getNormalizer2(opt);
    buffer = new StringBuilder();
}
/**
 * Creates a new {@code NormalizerBase} over the given text with no optional
 * features enabled (option value 0).
 * @param iter The input text to be normalized.
 * @param mode The normalization mode.
 */
public NormalizerBase(CharacterIterator iter, Mode mode) {
    this(iter, mode, UNICODE_LATEST);
}
/**
 * Clones this {@code NormalizerBase} object. All properties of this
 * object are duplicated in the new object; the text iterator and the
 * normalization buffer get their own copies.
 * However, the text storage underlying the iterator is not duplicated
 * unless the iterator's {@code clone} method does so.
 * @return an independent copy of this object
 * @stable ICU 2.8
 */
public Object clone() {
    final NormalizerBase duplicate;
    try {
        // super.clone() already copies mode, options, norm2, bufferPos,
        // currentIndex and nextIndex field by field.
        duplicate = (NormalizerBase) super.clone();
    } catch (CloneNotSupportedException e) {
        throw new InternalError(e.toString(), e);
    }
    // Only the two mutable reference fields need copies of their own.
    duplicate.text = (UCharacterIterator) text.clone();
    duplicate.buffer = new StringBuilder(buffer);
    return duplicate;
}
/**
 * Normalizes a {@code String} using the given normalization operation.
 * <p>
 * The {@code options} parameter specifies which optional
 * {@code NormalizerBase} features are to be enabled for this operation.
 * Currently the only available option is {@link #UNICODE_3_2}.
 * If you want the default behavior corresponding to one of the standard
 * Unicode Normalization Forms, use 0 for this argument.
 *
 * @param str     the input string to be normalized.
 * @param mode    the normalization mode
 * @param options the optional features to be enabled.
 * @return the normalized string
 * @stable ICU 2.6
 */
public static String normalize(String str, Mode mode, int options) {
    Normalizer2 normalizer = mode.getNormalizer2(options);
    return normalizer.normalize(str);
}
/**
 * Normalizes a string to the given {@link java.text.Normalizer.Form}
 * using the latest Unicode behavior ({@link #UNICODE_LATEST}).
 * @param str  the input string to be normalized
 * @param form the normalization form
 * @return the normalized string
 */
public static String normalize(String str, Normalizer.Form form) {
    Mode mode = toMode(form);
    return normalize(str, mode, UNICODE_LATEST);
}
/**
 * Normalizes a string to the given {@link java.text.Normalizer.Form}
 * with the given option bits.
 * @param str     the input string to be normalized
 * @param form    the normalization form
 * @param options the optional features to be enabled, e.g. {@link #UNICODE_3_2}
 * @return the normalized string
 */
public static String normalize(String str, Normalizer.Form form, int options) {
    Mode mode = toMode(form);
    return normalize(str, mode, options);
}
/**
 * Test if a string is in a given normalization form.
 * This is semantically equivalent to source.equals(normalize(source, mode)).
 *
 * Unlike quickCheck(), this function returns a definitive result,
 * never a "maybe".
 * For NFD, NFKD, and FCD, both functions work exactly the same.
 * For NFC and NFKC where quickCheck may return "maybe", this function will
 * perform further tests to arrive at a true/false result.
 *
 * @param str     the input string to be checked to see if it is normalized
 * @param mode    the normalization mode
 * @param options Options for use with exclusion set and tailored Normalization
 *                The only option that is currently recognized is UNICODE_3_2
 * @return true if {@code str} is normalized according to {@code mode}
 * @stable ICU 2.6
 */
public static boolean isNormalized(String str, Mode mode, int options) {
    Normalizer2 normalizer = mode.getNormalizer2(options);
    return normalizer.isNormalized(str);
}
/**
 * Tests whether a string is in the given {@link java.text.Normalizer.Form}
 * using the latest Unicode behavior ({@link #UNICODE_LATEST}).
 * @param str  the input string to be checked
 * @param form the normalization form
 * @return true if {@code str} is normalized
 */
public static boolean isNormalized(String str, Normalizer.Form form) {
    Mode mode = toMode(form);
    return isNormalized(str, mode, UNICODE_LATEST);
}
/**
 * Tests whether a string is in the given {@link java.text.Normalizer.Form}
 * with the given option bits.
 * @param str     the input string to be checked
 * @param form    the normalization form
 * @param options the optional features to be enabled, e.g. {@link #UNICODE_3_2}
 * @return true if {@code str} is normalized
 */
public static boolean isNormalized(String str, Normalizer.Form form, int options) {
    Mode mode = toMode(form);
    return isNormalized(str, mode, options);
}
//-------------------------------------------------------------------------
// Iteration API
//-------------------------------------------------------------------------
/**
 * Return the current character in the normalized text.
 * The iteration position is not advanced.
 * @return The codepoint as an int, or {@link #DONE} if there is none
 * @stable ICU 2.8
 */
public int current() {
    // Buffer exhausted and nothing more to normalize: end of text.
    if(bufferPos>=buffer.length() && !nextNormalize()) {
        return DONE;
    }
    return buffer.codePointAt(bufferPos);
}
/**
 * Return the next character in the normalized text and advance
 * the iteration position by one. If the end
 * of the text has already been reached, {@link #DONE} is returned.
 * @return The codepoint as an int
 * @stable ICU 2.8
 */
public int next() {
    // Buffer exhausted and nothing more to normalize: end of text.
    if(bufferPos>=buffer.length() && !nextNormalize()) {
        return DONE;
    }
    final int codePoint=buffer.codePointAt(bufferPos);
    // Step over one or two chars, depending on the code point.
    bufferPos+=Character.charCount(codePoint);
    return codePoint;
}
/**
 * Return the previous character in the normalized text and decrement
 * the iteration position by one. If the beginning
 * of the text has already been reached, {@link #DONE} is returned.
 * @return The codepoint as an int
 * @stable ICU 2.8
 */
public int previous() {
    // At the start of the buffer and nothing earlier to normalize: beginning of text.
    if(bufferPos<=0 && !previousNormalize()) {
        return DONE;
    }
    final int codePoint=buffer.codePointBefore(bufferPos);
    // Step back over one or two chars, depending on the code point.
    bufferPos-=Character.charCount(codePoint);
    return codePoint;
}
/**
 * Reset the index to the beginning of the text.
 * This performs the same steps as {@code setIndexOnly(0)}.
 * @stable ICU 2.8
 */
public void reset() {
    text.setIndex(0);
    currentIndex=0;
    nextIndex=0;
    clearBuffer();
}
/**
 * Set the iteration position in the input text that is being normalized,
 * without any immediate normalization.
 * After setIndexOnly(), getIndex() will return the same index that is
 * specified here.
 *
 * @param index the desired index in the input text.
 * @stable ICU 2.8
 */
public void setIndexOnly(int index) {
    // Done first so that an invalid index is rejected before any state changes.
    text.setIndex(index);
    currentIndex=index;
    nextIndex=index;
    clearBuffer();
}
/**
* Set the iteration position in the input text that is being normalized
* and return the first normalized character at that position.
* <p>
* <b>Note:</b> This method sets the position in the <em>input</em> text,
* while {@link #next} and {@link #previous} iterate through characters
* in the normalized <em>output</em>. This means that there is not
* necessarily a one-to-one correspondence between characters returned
* by {@code next} and {@code previous} and the indices passed to and
* returned from {@code setIndex} and {@link #getIndex}.
*
* @param index the desired index in the input text.
*
* @return the first normalized character that is the result of iterating
* forward starting at the given index.
*
* @throws IllegalArgumentException if the given index is less than
* {@link #getBeginIndex} or greater than {@link #getEndIndex}.
* deprecated ICU 3.2
* @obsolete ICU 3.2
*/
public int setIndex(int index) {
// Reposition without normalizing, then let current() normalize on demand.
setIndexOnly(index);
return current();
}
/**
* Retrieve the index of the start of the input text. This is the begin
* index of the {@code CharacterIterator} or the start (i.e. 0) of the
* {@code String} over which this {@code NormalizerBase} is iterating
* @deprecated ICU 2.2. Use startIndex() instead.
* @return the start index of the input text; this implementation always returns 0
* @see #startIndex
*/
@Deprecated
public int getBeginIndex() {
return 0;
}
/**
* Retrieve the index of the end of the input text. This is the end index
* of the {@code CharacterIterator} or the length of the {@code String}
* over which this {@code NormalizerBase} is iterating
* @deprecated ICU 2.2. Use endIndex() instead.
* @return the end index of the input text, as reported by {@link #endIndex}
* @see #endIndex
*/
@Deprecated
public int getEndIndex() {
// Pure alias kept for the deprecated name.
return endIndex();
}
/**
 * Retrieve the current iteration position in the input text that is
 * being normalized. This method is useful in applications such as
 * searching, where you need to be able to determine the position in
 * the input text that corresponds to a given normalized output character.
 * <p>
 * <b>Note:</b> This method returns a position in the <em>input</em>, while
 * {@link #next} and {@link #previous} iterate through characters in the
 * <em>output</em>. This means that there is not necessarily a one-to-one
 * correspondence between characters returned by {@code next} and
 * {@code previous} and the indices passed to and returned from
 * {@code setIndex} and {@link #getIndex}.
 * @return The current iteration position
 * @stable ICU 2.8
 */
public int getIndex() {
    // While normalized characters remain in the buffer, report the start of
    // the current input segment; once it is consumed, report the segment's end.
    final boolean bufferHasMore=bufferPos<buffer.length();
    return bufferHasMore ? currentIndex : nextIndex;
}
/**
* Retrieve the index of the end of the input text. This is the end index
* of the {@code CharacterIterator} or the length of the {@code String}
* over which this {@code NormalizerBase} is iterating
* @return the end index of the input text, i.e. the length reported by the text iterator
* @stable ICU 2.8
*/
public int endIndex() {
return text.getLength();
}
//-------------------------------------------------------------------------
// Iterator attributes
//-------------------------------------------------------------------------
/**
 * Set the normalization mode for this object.
 * <p>
 * <b>Note:</b> If the normalization mode is changed while iterating
 * over a string, calls to {@link #next} and {@link #previous} may
 * return previously buffered characters in the old normalization mode
 * until the iteration is able to re-sync at the next base character.
 * It is safest to call {@link #setText setText()}, {@link #reset} or
 * {@link #setIndexOnly} after calling {@code setMode}.
 *
 * @param newMode the new mode for this {@code NormalizerBase}.
 * The supported modes are:
 * <ul>
 *  <li>{@link #NFC}  - Unicode canonical decomposition
 *                      followed by canonical composition.
 *  <li>{@link #NFKC} - Unicode compatibility decomposition
 *                      followed by canonical composition.
 *  <li>{@link #NFD}  - Unicode canonical decomposition
 *  <li>{@link #NFKD} - Unicode compatibility decomposition.
 *  <li>{@link #NONE} - Do nothing but return characters
 *                      from the underlying input text.
 * </ul>
 * @throws NullPointerException if {@code newMode} is null; in that case
 *         this object is left unchanged
 *
 * @see #getMode
 * @stable ICU 2.8
 */
public void setMode(Mode newMode) {
    // Resolve the normalizer before touching any field: a null mode then
    // fails here and cannot leave mode and norm2 out of step.
    Normalizer2 newNorm2 = newMode.getNormalizer2(options);
    mode = newMode;
    norm2 = newNorm2;
}
/**
* Return the basic operation performed by this {@code NormalizerBase}
*
* @return the mode given to the constructor or to the latest {@link #setMode} call
* @see #setMode
* @stable ICU 2.8
*/
public Mode getMode() {
return mode;
}
/**
 * Set the input text over which this {@code NormalizerBase} will iterate.
 * The iteration position is set to the beginning of the input text.
 * @param newText The new string to be normalized.
 * @throws IllegalStateException if no iterator could be created for the text
 * @stable ICU 2.8
 */
public void setText(String newText) {
    final UCharacterIterator replacement = UCharacterIterator.getInstance(newText);
    if (replacement == null) {
        throw new IllegalStateException("Could not create a new UCharacterIterator");
    }
    // Swap the text in only after the iterator was created successfully.
    text = replacement;
    reset();
}
/**
 * Set the input text over which this {@code NormalizerBase} will iterate.
 * The iteration position is set to the beginning of the input text.
 * @param newText The new text to be normalized.
 * @throws IllegalStateException if no iterator could be created for the text
 * @stable ICU 2.8
 */
public void setText(CharacterIterator newText) {
    final UCharacterIterator replacement = UCharacterIterator.getInstance(newText);
    if (replacement == null) {
        throw new IllegalStateException("Could not create a new UCharacterIterator");
    }
    text = replacement;
    // Unlike reset(), the new iterator's own index is deliberately left untouched here.
    currentIndex = 0;
    nextIndex = 0;
    clearBuffer();
}
/** Empties the normalization buffer and rewinds the read position into it. */
private void clearBuffer() {
    bufferPos=0;
    buffer.setLength(0);
}
/**
 * Normalizes the input segment that starts at {@code nextIndex} into the
 * (cleared) buffer and moves {@code currentIndex}/{@code nextIndex} to
 * enclose that segment.
 * @return true if the buffer holds normalized text afterwards,
 *         false at the end of the input or if the result is empty
 */
private boolean nextNormalize() {
    clearBuffer();
    currentIndex=nextIndex;
    text.setIndex(nextIndex);
    // Always take at least one code point so the iteration makes progress.
    int cp=text.nextCodePoint();
    if(cp<0) {
        return false;
    }
    StringBuilder segment=new StringBuilder();
    segment.appendCodePoint(cp);
    // Extend the segment up to the next code point that has a boundary before it.
    for(cp=text.nextCodePoint(); cp>=0; cp=text.nextCodePoint()) {
        if(norm2.hasBoundaryBefore(cp)) {
            // That code point starts the following segment; un-read it.
            text.moveCodePointIndex(-1);
            break;
        }
        segment.appendCodePoint(cp);
    }
    nextIndex=text.getIndex();
    norm2.normalize(segment, buffer);
    return buffer.length()>0;
}
/**
 * Normalizes the input segment that ends at {@code currentIndex} into the
 * (cleared) buffer, moves {@code currentIndex}/{@code nextIndex} to enclose
 * that segment and positions {@code bufferPos} at the end of the buffer.
 * @return true if the buffer holds normalized text afterwards,
 *         false at the start of the input or if the result is empty
 */
private boolean previousNormalize() {
    clearBuffer();
    nextIndex=currentIndex;
    text.setIndex(currentIndex);
    StringBuilder segment=new StringBuilder();
    // Walk backwards, prepending code points, until one with a boundary
    // before it has been included.
    for(int cp=text.previousCodePoint(); cp>=0; cp=text.previousCodePoint()) {
        if(Character.isBmpCodePoint(cp)) {
            segment.insert(0, (char)cp);
        } else {
            segment.insert(0, Character.toChars(cp));
        }
        if(norm2.hasBoundaryBefore(cp)) {
            break;
        }
    }
    currentIndex=text.getIndex();
    norm2.normalize(segment, buffer);
    // Backward iteration reads the buffer from its end.
    bufferPos=buffer.length();
    return bufferPos>0;
}
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,50 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
/**
* Simple struct-like class for int output parameters.
* Like <code>Output&lt;Integer&gt;</code> but without auto-boxing.
* A caller passes an instance in and reads {@link #value} after the call.
*
* @internal but could become public
* deprecated This API is ICU internal only.
*/
class OutputInt {
/**
* The value field. Starts out as 0, the Java default for an int field,
* because no constructor assigns it.
*
* @internal
* deprecated This API is ICU internal only.
*/
public int value;
}

View file

@ -0,0 +1,121 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
/**
* <code>Replaceable</code> is an interface representing a
* string of characters that supports the replacement of a range of
* itself with a new string of characters. It is used by APIs that
* change a piece of text while retaining metadata. Metadata is data
* other than the Unicode characters returned by char32At(). One
* example of metadata is style attributes; another is an edit
* history, marking each character with an author and revision number.
*
* <p>An implicit aspect of the <code>Replaceable</code> API is that
* during a replace operation, new characters take on the metadata of
* the old characters. For example, if the string "the <b>bold</b>
* font" has range (4, 8) replaced with "strong", then it becomes "the
* <b>strong</b> font".
*
* <p><code>Replaceable</code> specifies ranges using a start
* offset and a limit offset. The range of characters thus specified
* includes the characters at offset start..limit-1. That is, the
* start offset is inclusive, and the limit offset is exclusive.
*
* <p><code>Replaceable</code> also includes API to access characters
* in the string: <code>length()</code>, <code>charAt()</code>,
* <code>char32At()</code>, and <code>extractBetween()</code>.
*
* <p>For a subclass to support metadata, typical behavior of
* <code>replace()</code> is the following:
* <ul>
* <li>Set the metadata of the new text to the metadata of the first
* character replaced</li>
* <li>If no characters are replaced, use the metadata of the
* previous character</li>
* <li>If there is no previous character (i.e. start == 0), use the
* following character</li>
* <li>If there is no following character (i.e. the replaceable was
* empty), use default metadata</li>
* <li>If the code point U+FFFF is seen, it should be interpreted as
* a special marker having no metadata</li>
* </ul>
* If this is not the behavior, the subclass should document any differences.
*
* <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
*
* @author Alan Liu
* @stable ICU 2.0
*/
interface Replaceable {
/**
* Returns the number of 16-bit code units in the text.
* @return number of 16-bit code units in text
* @stable ICU 2.0
*/
int length();
/**
* Returns the 16-bit code unit at the given offset into the text.
* This interface does not itself specify what happens for an offset
* outside the documented range; that is left to the implementation.
* @param offset an integer between 0 and <code>length()</code>-1
* inclusive
* @return 16-bit code unit of text at given offset
* @stable ICU 2.0
*/
char charAt(int offset);
/**
* Copies characters from this object into the destination
* character array. The first character to be copied is at index
* <code>srcStart</code>; the last character to be copied is at
* index <code>srcLimit-1</code> (thus the total number of
* characters to be copied is <code>srcLimit-srcStart</code>). The
* characters are copied into the subarray of <code>dst</code>
* starting at index <code>dstStart</code> and ending at index
* <code>dstStart + (srcLimit-srcStart) - 1</code>.
*
* @param srcStart the beginning index to copy, inclusive;
* {@code 0 <= srcStart <= srcLimit}.
* @param srcLimit the ending index to copy, exclusive;
* {@code srcStart <= srcLimit <= length()}.
* @param dst the destination array.
* @param dstStart the start offset in the destination array.
* @stable ICU 2.0
*/
void getChars(int srcStart, int srcLimit, char dst[], int dstStart);
}

View file

@ -0,0 +1,118 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 1996-2009, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package sun.text.normalizer;
/**
* <code>ReplaceableString</code> is an adapter class that implements the
* <code>Replaceable</code> API around an ordinary <code>StringBuffer</code>.
*
* <p><em>Note:</em> This class does not support attributes and is not
* intended for general use. Most clients will need to implement
* {@link Replaceable} in their text representation class.
*
* <p>Copyright &copy; IBM Corporation 1999. All rights reserved.
*
* @see Replaceable
* @author Alan Liu
* @stable ICU 2.0
*/
class ReplaceableString implements Replaceable {

    /** Backing storage; possibly shared with the caller (see the StringBuffer constructor). */
    private StringBuffer buf;

    /**
     * Creates an instance whose contents start out as a copy of {@code str},
     * held in a freshly allocated <code>StringBuffer</code>.
     * @param str initial contents
     * @stable ICU 2.0
     */
    public ReplaceableString(String str) {
        this(new StringBuffer(str));
    }

    /**
     * Creates an instance that adopts <code>buf</code> as its storage
     * without copying it. <em>Note! Because the buffer is shared, any later
     * change made to <code>buf</code> is visible through this object, and
     * vice versa.</em>
     * @param buf object to be used as internal storage
     * @stable ICU 2.0
     */
    public ReplaceableString(StringBuffer buf) {
        this.buf = buf;
    }

    /**
     * Reports how many characters the backing buffer currently holds.
     * <code>Replaceable</code> API.
     * @stable ICU 2.0
     */
    public int length() {
        return buf.length();
    }

    /**
     * Fetches the character stored at the given position of the backing
     * buffer. <code>Replaceable</code> API.
     * @param offset offset into the contents, from 0 to
     * <code>length()</code> - 1
     * @stable ICU 2.0
     */
    public char charAt(int offset) {
        return buf.charAt(offset);
    }

    /**
     * Copies the characters in the range <code>srcStart</code> (inclusive)
     * to <code>srcLimit</code> (exclusive) into <code>dst</code>, beginning
     * at <code>dstStart</code>; <code>srcLimit-srcStart</code> characters
     * are written in total. When the two source indexes are equal nothing
     * is copied and the backing buffer is not consulted at all.
     *
     * @param srcStart the beginning index to copy, inclusive;
     *        {@code 0 <= srcStart <= srcLimit}.
     * @param srcLimit the ending index to copy, exclusive;
     *        {@code srcStart <= srcLimit <= length()}.
     * @param dst the destination array.
     * @param dstStart the start offset in the destination array.
     * @stable ICU 2.0
     */
    public void getChars(int srcStart, int srcLimit, char[] dst, int dstStart) {
        if (srcStart == srcLimit) {
            // Empty range: return before delegating to the buffer.
            return;
        }
        buf.getChars(srcStart, srcLimit, dst, dstStart);
    }
}

View file

@ -0,0 +1,186 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. 1996-2005 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
/**
* DLF docs must define behavior when Replaceable is mutated underneath
* the iterator.
*
* This and ICUCharacterIterator share some code, maybe they should share
* an implementation, or the common state and implementation should be
* moved up into UCharacterIterator.
*
* What are first, last, and getBeginIndex doing here?!?!?!
*/
class ReplaceableUCharacterIterator extends UCharacterIterator {

    // public constructor ------------------------------------------------------

    /**
     * Creates an iterator over the given string, positioned at index 0.
     * The text is wrapped in a new {@link ReplaceableString}.
     * @param str text which the iterator will be based on
     * @throws IllegalArgumentException if {@code str} is null
     */
    public ReplaceableUCharacterIterator(String str){
        if(str==null){
            throw new IllegalArgumentException();
        }
        this.replaceable = new ReplaceableString(str);
        this.currentIndex = 0;
    }

    /**
     * Creates an iterator over the given buffer, positioned at index 0.
     * The buffer is wrapped, not copied, so later changes to it are seen
     * by this iterator.
     * @param buf buffer of text on which the iterator will be based
     * @throws IllegalArgumentException if {@code buf} is null
     */
    public ReplaceableUCharacterIterator(StringBuffer buf){
        if(buf==null){
            throw new IllegalArgumentException();
        }
        this.replaceable = new ReplaceableString(buf);
        this.currentIndex = 0;
    }

    // public methods ----------------------------------------------------------

    /**
     * Creates a copy of this iterator, does not clone the underlying
     * <code>Replaceable</code> object.
     * @return copy of this iterator, or null if the superclass refuses to
     *         clone (not expected to happen)
     */
    public Object clone(){
        try {
            return super.clone();
        } catch (CloneNotSupportedException e) {
            return null; // never invoked
        }
    }

    /**
     * Returns the UTF-16 code unit at the current index without moving.
     * @return current code unit, or DONE if the index is at the end of
     *         the text
     */
    public int current(){
        if (currentIndex < replaceable.length()) {
            return replaceable.charAt(currentIndex);
        }
        return DONE;
    }

    /**
     * Returns the length of the text.
     * @return length of the text in 16-bit code units
     */
    public int getLength(){
        return replaceable.length();
    }

    /**
     * Gets the current index in the text.
     * @return current index in the text
     */
    public int getIndex(){
        return currentIndex;
    }

    /**
     * Returns the UTF-16 code unit at the current index and then advances
     * the index by 1. When the index is already at the end of the text,
     * DONE is returned and the index is left unchanged.
     * @return code unit at the current index, or DONE at the end of the text
     */
    public int next(){
        if (currentIndex < replaceable.length()) {
            return replaceable.charAt(currentIndex++);
        }
        return DONE;
    }

    /**
     * Moves the index back by 1 and returns the UTF-16 code unit at the new
     * index. When the index is already 0, DONE is returned and the index is
     * left unchanged.
     * @return previous code unit, or DONE at the start of the text
     */
    public int previous(){
        if (currentIndex > 0) {
            return replaceable.charAt(--currentIndex);
        }
        return DONE;
    }

    /**
     * Sets the current index. Nothing is returned; use {@link #current()}
     * to read the code unit at the new position. The index may equal the
     * text length, which positions the iterator at the end.
     * @param currentIndex the new index within the text
     * @exception IllegalArgumentException is thrown if an invalid index is
     *            supplied, i.e. one that is negative or greater than the
     *            text length.
     */
    public void setIndex(int currentIndex) {
        if (currentIndex < 0 || currentIndex > replaceable.length()) {
            throw new IllegalArgumentException();
        }
        this.currentIndex = currentIndex;
    }

    /**
     * Copies the whole text into {@code fillIn}, starting at {@code offset}.
     * @param fillIn destination array
     * @param offset index in {@code fillIn} at which the first code unit is
     *               stored
     * @return number of code units copied (the text length)
     * @throws IndexOutOfBoundsException if {@code offset} is negative or the
     *         text does not fit into {@code fillIn} from {@code offset};
     *         the message is the text length
     */
    public int getText(char[] fillIn, int offset){
        int length = replaceable.length();
        // Written as a subtraction so a very large offset cannot overflow
        // int and slip past the check (offset + length could wrap negative).
        if(offset < 0 || offset > fillIn.length - length){
            throw new IndexOutOfBoundsException(Integer.toString(length));
        }
        replaceable.getChars(0,length,fillIn,offset);
        return length;
    }

    // private data members ----------------------------------------------------

    /**
     * Replaceable object
     */
    private Replaceable replaceable;

    /**
     * Current index into the text, in the range 0..length
     */
    private int currentIndex;
}

View file

@ -0,0 +1,364 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
******************************************************************************
* Copyright (C) 1996-2014, International Business Machines Corporation and
* others. All Rights Reserved.
******************************************************************************
*/
package sun.text.normalizer;
import java.io.DataInputStream;
import java.io.InputStream;
import java.io.IOException;
/**
* <p>A trie is a kind of compressed, serializable table of values
* associated with Unicode code points (0..0x10ffff).</p>
* <p>This class defines the basic structure of a trie and provides methods
* to <b>retrieve the offsets to the actual data</b>.</p>
* <p>Data will be the form of an array of basic types, char or int.</p>
* <p>The actual data format will have to be specified by the user in the
* inner static interface com.ibm.icu.impl.Trie.DataManipulate.</p>
* <p>This trie implementation is optimized for getting offset while walking
* forward through a UTF-16 string.
* Therefore, the simplest and fastest access macros are the
* fromLead() and fromOffsetTrail() methods.
* The fromBMP() methods are a little more complicated; they get offsets even
* for lead surrogate code points, while the fromLead() methods get special
* "folded" offsets for lead surrogate code units if there is relevant data
* associated with them.
* From such a folded offset, an offset needs to be extracted to supply
* to the fromOffsetTrail() methods.
* To handle such supplementary code points, some offset information is kept
* in the data.</p>
* <p>Methods in com.ibm.icu.impl.Trie.DataManipulate are called to retrieve
* that offset from the folded value for the lead surrogate unit.</p>
* <p>For examples of use, see com.ibm.icu.impl.CharTrie or
* com.ibm.icu.impl.IntTrie.</p>
* @author synwee
* @see com.ibm.icu.impl.CharTrie
* @see com.ibm.icu.impl.IntTrie
* @since release 2.1, Jan 01 2002
*/
public abstract class Trie
{
// public class declaration ----------------------------------------
/**
* Character data in com.ibm.impl.Trie have different user-specified format
* for different purposes.
* This interface specifies the method to be implemented so that the
* surrogate (folding) offset information encapsulated within a data value
* can be extracted.
*/
public static interface DataManipulate
{
/**
* Called by com.ibm.icu.impl.Trie to extract from a lead surrogate's
* data
* the index array offset of the indexes for that lead surrogate.
* @param value data value for a surrogate from the trie, including the
* folding offset
* @return data offset or 0 if there is no data for the lead surrogate
*/
public int getFoldingOffset(int value);
}
// default implementation: the data value itself is returned as the offset
private static class DefaultGetFoldingOffset implements DataManipulate {
public int getFoldingOffset(int value) {
return value;
}
}
// protected constructor -------------------------------------------
/**
* Trie constructor for CharTrie use.
* Reads four ints from the stream (signature, options, data offset, data
* length), validates the signature and options, and then passes the
* stream to {@link #unserialize(InputStream)} to load the index.
* @param inputStream ICU data file input stream which contains the
* trie
* @param dataManipulate object containing the information to parse the
* trie data; when null, a default that returns the value
* unchanged is installed
* @throws IOException thrown when reading from the stream fails
* @throws IllegalArgumentException thrown when the stream does not have
* the right header (signature or shift options mismatch)
*/
protected Trie(InputStream inputStream,
DataManipulate dataManipulate) throws IOException
{
DataInputStream input = new DataInputStream(inputStream);
// Magic number to authenticate the data.
int signature = input.readInt();
// The options must be stored before checkHeader() runs, because
// checkHeader() inspects m_options_.
m_options_ = input.readInt();
if (!checkHeader(signature)) {
throw new IllegalArgumentException("ICU data file error: Trie header authentication failed, please check if you have the most updated ICU data file");
}
if(dataManipulate != null) {
m_dataManipulate_ = dataManipulate;
} else {
m_dataManipulate_ = new DefaultGetFoldingOffset();
}
m_isLatin1Linear_ = (m_options_ &
HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_) != 0;
m_dataOffset_ = input.readInt();
m_dataLength_ = input.readInt();
// NOTE(review): unserialize() is overridable and is called from this
// constructor, i.e. before any subclass constructor body has run.
unserialize(inputStream);
}
// protected data members ------------------------------------------
/**
* Lead surrogate code points' index displacement in the index array.
* <pre>{@code
* 0x10000-0xd800=0x2800
* 0x2800 >> INDEX_STAGE_1_SHIFT_
* }</pre>
* The literal 5 below is the value of INDEX_STAGE_1_SHIFT_.
*/
protected static final int LEAD_INDEX_OFFSET_ = 0x2800 >> 5;
/**
* Shift size for shifting right the input index. 1..9
*/
protected static final int INDEX_STAGE_1_SHIFT_ = 5;
/**
* Shift size for shifting left the index array values.
* Increases possible data size with 16-bit index values at the cost
* of compactability.
* This requires blocks of stage 2 data to be aligned by
* DATA_GRANULARITY.
* 0..INDEX_STAGE_1_SHIFT
*/
protected static final int INDEX_STAGE_2_SHIFT_ = 2;
/**
* Number of data values in a stage 2 (data array) block.
*/
protected static final int DATA_BLOCK_LENGTH=1<<INDEX_STAGE_1_SHIFT_;
/**
* Mask for getting the lower bits from the input index.
* DATA_BLOCK_LENGTH - 1.
*/
protected static final int INDEX_STAGE_3_MASK_ = DATA_BLOCK_LENGTH - 1;
/**
* Surrogate mask to use when shifting offset to retrieve supplementary
* values
*/
protected static final int SURROGATE_MASK_ = 0x3FF;
/**
* Index array, stored as 16-bit chars. Allocated and filled by
* unserialize().
*/
protected char m_index_[];
/**
* Internal TrieValue which handles the parsing of the data value.
* Supplied by the user through the constructor; never null after
* construction because a default is installed when null is passed.
*/
protected DataManipulate m_dataManipulate_;
/**
* Start index of the data portion of the trie. CharTrie combines
* index and data into a char array, so this is used to indicate the
* initial offset to the data portion.
* Note this index always points to the initial value.
* The default unserialize() also uses this value as the number of index
* entries to read.
*/
protected int m_dataOffset_;
/**
* Length of the data array
*/
protected int m_dataLength_;
// protected methods -----------------------------------------------
/**
* Gets the offset to the data which the surrogate pair points to.
* @param lead lead surrogate
* @param trail trailing surrogate
* @return offset to data
*/
protected abstract int getSurrogateOffset(char lead, char trail);
/**
* Gets the offset to the data which the index ch after variable offset
* points to.
* Note for locating a non-supplementary character data offset, calling
* <p>
* getRawOffset(0, ch);
* </p>
* will do. Otherwise if it is a supplementary character formed by
* surrogates lead and trail. Then we would have to call getRawOffset()
* with getFoldingIndexOffset(). See getSurrogateOffset().
* <p>
* The result is the index entry for the block containing ch, shifted left
* by INDEX_STAGE_2_SHIFT_, plus the low INDEX_STAGE_1_SHIFT_ bits of ch.
* @param offset index offset which ch is to start from
* @param ch index to be used after offset
* @return offset to the data
*/
protected final int getRawOffset(int offset, char ch)
{
return (m_index_[offset + (ch >> INDEX_STAGE_1_SHIFT_)]
<< INDEX_STAGE_2_SHIFT_)
+ (ch & INDEX_STAGE_3_MASK_);
}
/**
* Gets the offset to data which the BMP character points to
* Treats a lead surrogate as a normal code point: lead surrogates are
* looked up through the index section displaced by LEAD_INDEX_OFFSET_,
* everything else through the index starting at 0.
* @param ch BMP character
* @return offset to data
*/
protected final int getBMPOffset(char ch)
{
return (ch >= UTF16.LEAD_SURROGATE_MIN_VALUE
&& ch <= UTF16.LEAD_SURROGATE_MAX_VALUE)
? getRawOffset(LEAD_INDEX_OFFSET_, ch)
: getRawOffset(0, ch);
// using a getRawOffset(ch) makes no diff
}
/**
* Gets the offset to the data which this lead surrogate character points
* to.
* Data at the returned offset may contain folding offset information for
* the next trailing surrogate character.
* @param ch lead surrogate character
* @return offset to data
*/
protected final int getLeadOffset(char ch)
{
return getRawOffset(0, ch);
}
/**
* Internal trie getter from a code point.
* Could be faster(?) but longer with
* {@code if((c32)<=0xd7ff) { (result)=_TRIE_GET_RAW(trie, data, 0, c32); }}
* Gets the offset to data which the codepoint points to
* @param ch codepoint
* @return offset to data, or -1 if ch is negative or greater than
* UCharacter.MAX_VALUE
*/
protected final int getCodePointOffset(int ch)
{
// if ((ch >> 16) == 0) slower
if (ch < 0) {
return -1;
} else if (ch < UTF16.LEAD_SURROGATE_MIN_VALUE) {
// fastpath for the part of the BMP below surrogates (D800) where getRawOffset() works
return getRawOffset(0, (char)ch);
} else if (ch < UTF16.SUPPLEMENTARY_MIN_VALUE) {
// BMP codepoint
return getBMPOffset((char)ch);
} else if (ch <= UCharacter.MAX_VALUE) {
// look at the construction of supplementary characters
// trail forms the ends of it.
return getSurrogateOffset(UTF16.getLeadSurrogate(ch),
(char)(ch & SURROGATE_MASK_));
} else {
// code point above UCharacter.MAX_VALUE: signal the error with -1
return -1;
}
}
/**
* <p>Parses the inputstream and creates the trie index with it.</p>
* <p>Intended to be overridden by the child classes. This base version
* reads m_dataOffset_ chars from the stream into a newly allocated
* m_index_.
* @param inputStream input stream containing the trie information
* @exception IOException thrown when data reading fails.
*/
protected void unserialize(InputStream inputStream) throws IOException
{
//indexLength is a multiple of 1024 >> INDEX_STAGE_2_SHIFT_
m_index_ = new char[m_dataOffset_];
DataInputStream input = new DataInputStream(inputStream);
for (int i = 0; i < m_dataOffset_; i ++) {
m_index_[i] = input.readChar();
}
}
/**
* Determines if this is a 16 bit trie
* @return true if this is a 16 bit trie, i.e. the 32-bit flag is not set
* in the header options
*/
protected final boolean isCharTrie()
{
return (m_options_ & HEADER_OPTIONS_DATA_IS_32_BIT_) == 0;
}
// header constants and private data members ------------------------
/**
* Latin 1 option mask
*/
protected static final int HEADER_OPTIONS_LATIN1_IS_LINEAR_MASK_ = 0x200;
/**
* Constant number to authenticate the byte block
*/
protected static final int HEADER_SIGNATURE_ = 0x54726965;
/**
* Header option formatting: a 4-bit mask for each shift field, and the
* bit position of the second shift field.
*/
private static final int HEADER_OPTIONS_SHIFT_MASK_ = 0xF;
protected static final int HEADER_OPTIONS_INDEX_SHIFT_ = 4;
protected static final int HEADER_OPTIONS_DATA_IS_32_BIT_ = 0x100;
/**
* Flag indicator for Latin quick access data block.
* Set by the constructor from the options field; not read anywhere in
* this class.
*/
private boolean m_isLatin1Linear_;
/**
* <p>Trie options field.</p>
* <p>options bit field, as checked by checkHeader() and the masks above:<br>
* 9 1 = Latin-1 data is stored linearly at data + DATA_BLOCK_LENGTH<br>
* 8 0 = 16-bit data, 1=32-bit data<br>
* 7..4 must equal INDEX_STAGE_2_SHIFT_<br>
* 3..0 must equal INDEX_STAGE_1_SHIFT_<br>
*/
private int m_options_;
// private methods ---------------------------------------------------
/**
* Authenticates raw data header.
* Checking the header information, signature and options.
* Reads m_options_, so that field must already hold the options value
* from the stream when this is called.
* @param signature the signature value read from the start of the stream
* @return true if the header is authenticated valid
*/
private final boolean checkHeader(int signature)
{
// check the signature
// Trie in big-endian US-ASCII (0x54726965).
// Magic number to authenticate the data.
if (signature != HEADER_SIGNATURE_) {
return false;
}
// the two 4-bit shift fields must match the compiled-in shift constants
if ((m_options_ & HEADER_OPTIONS_SHIFT_MASK_) !=
INDEX_STAGE_1_SHIFT_ ||
((m_options_ >> HEADER_OPTIONS_INDEX_SHIFT_) &
HEADER_OPTIONS_SHIFT_MASK_)
!= INDEX_STAGE_2_SHIFT_) {
return false;
}
return true;
}
}

View file

@ -0,0 +1,655 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 2009-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.util.Iterator;
import java.util.NoSuchElementException;
/**
* This is the interface and common implementation of a Unicode Trie2.
* It is a kind of compressed table that maps from Unicode code points (0..0x10ffff)
* to 16- or 32-bit integer values. It works best when there are ranges of
* characters with the same value, which is generally the case with Unicode
* character properties.
*
* This is the second common version of a Unicode trie (hence the name Trie2).
*
*/
abstract class Trie2 implements Iterable<Trie2.Range> {
/**
* Create a Trie2 from its serialized form. Inverse of utrie2_serialize().
*
* Reads from the current position and leaves the buffer after the end of the trie.
*
* The serialized format is identical between ICU4C and ICU4J, so this function
* will work with serialized Trie2s from either.
*
* The actual type of the returned Trie2 will be either Trie2_16 or Trie2_32, depending
* on the width of the data.
*
* To obtain the width of the Trie2, check the actual class type of the returned Trie2.
* Or use the createFromSerialized() function of Trie2_16 or Trie2_32, which will
* return only Tries of their specific type/size.
*
* The serialized Trie2 on the stream may be in either little or big endian byte order.
* This allows using serialized Tries from ICU4C without needing to consider the
* byte order of the system that created them.
*
* @param bytes a byte buffer to the serialized form of a UTrie2.
* @return An unserialized Trie2, ready for use.
* @throws IllegalArgumentException if the stream does not contain a serialized Trie2.
* @throws IOException if a read error occurs in the buffer.
*
*/
public static Trie2 createFromSerialized(ByteBuffer bytes) throws IOException {
// From ICU4C utrie2_impl.h
// * Trie2 data structure in serialized form:
// *
// * UTrie2Header header;
// * uint16_t index[header.index2Length];
// * uint16_t data[header.shiftedDataLength<<2]; -- or uint32_t data[...]
// * @internal
// */
// typedef struct UTrie2Header {
// /** "Tri2" in big-endian US-ASCII (0x54726932) */
// uint32_t signature;
// /**
// * options bit field:
// * 15.. 4 reserved (0)
// * 3.. 0 UTrie2ValueBits valueBits
// */
// uint16_t options;
//
// /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH */
// uint16_t indexLength;
//
// /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT */
// uint16_t shiftedDataLength;
//
// /** Null index and data blocks, not shifted. */
// uint16_t index2NullOffset, dataNullOffset;
//
// /**
// * First code point of the single-value range ending with U+10ffff,
// * rounded up and then shifted right by UTRIE2_SHIFT_1.
// */
// uint16_t shiftedHighStart;
// } UTrie2Header;
//
// NOTE(review): nothing in this body visibly throws the declared
// IOException; a short buffer would presumably surface as the ByteBuffer
// getters' BufferUnderflowException - confirm.

// Remember the caller's byte order; it is restored in the finally block
// whether parsing succeeds or fails.
ByteOrder outerByteOrder = bytes.order();
try {
UTrie2Header header = new UTrie2Header();
/* check the signature */
header.signature = bytes.getInt();
switch (header.signature) {
case 0x54726932:
// The buffer is already set to the trie data byte order.
break;
case 0x32697254:
// The signature reads byte-swapped, so the data is in the opposite
// byte order: temporarily reverse the buffer's order.
boolean isBigEndian = outerByteOrder == ByteOrder.BIG_ENDIAN;
bytes.order(isBigEndian ? ByteOrder.LITTLE_ENDIAN : ByteOrder.BIG_ENDIAN);
header.signature = 0x54726932;
break;
default:
throw new IllegalArgumentException("Buffer does not contain a serialized UTrie2");
}
// The remaining six header fields are unsigned 16-bit values, read in
// serialized order.
header.options = bytes.getChar();
header.indexLength = bytes.getChar();
header.shiftedDataLength = bytes.getChar();
header.index2NullOffset = bytes.getChar();
header.dataNullOffset = bytes.getChar();
header.shiftedHighStart = bytes.getChar();
// Any non-zero value-bits field is rejected, and a Trie2_16 is always
// created below, so this method only accepts 16-bit tries.
if ((header.options & UTRIE2_OPTIONS_VALUE_BITS_MASK) != 0) {
throw new IllegalArgumentException("UTrie2 serialized format error.");
}
Trie2 This;
This = new Trie2_16();
This.header = header;
/* get the length values and offsets */
This.indexLength = header.indexLength;
This.dataLength = header.shiftedDataLength << UTRIE2_INDEX_SHIFT;
This.index2NullOffset = header.index2NullOffset;
This.dataNullOffset = header.dataNullOffset;
This.highStart = header.shiftedHighStart << UTRIE2_SHIFT_1;
// highValueIndex is computed relative to the data and then offset by
// indexLength, because the data is stored after the index in the same
// array (see below).
This.highValueIndex = This.dataLength - UTRIE2_DATA_GRANULARITY;
This.highValueIndex += This.indexLength;
// Allocate the Trie2 index array. The data is 16 bits wide here, so the
// array also includes the space for the data.
int indexArraySize = This.indexLength;
indexArraySize += This.dataLength;
This.index = new char[indexArraySize];
/* Read in the index */
int i;
for (i=0; i<This.indexLength; i++) {
This.index[i] = bytes.getChar();
}
/* Read in the data. The 16 bit data goes in the same array as the index,
* starting at position data16 (== indexLength). No separate 32 bit data
* array is used by this method; data32 is set to null.
*/
This.data16 = This.indexLength;
for (i=0; i<This.dataLength; i++) {
This.index[This.data16 + i] = bytes.getChar();
}
This.data32 = null;
// Both special values are read out of the combined index/data array.
This.initialValue = This.index[This.dataNullOffset];
This.errorValue = This.index[This.data16+UTRIE2_BAD_UTF8_DATA_OFFSET];
return This;
} finally {
bytes.order(outerByteOrder);
}
}
/**
 * Get the value for a code point as stored in the Trie2.
 *
 * This method is abstract; the lookup itself is supplied by the concrete
 * Trie2 subclass.
 *
 * @param codePoint the code point
 * @return the value
 */
public abstract int get(int codePoint);
/**
 * Get the trie value for a UTF-16 code unit.
 *
 * A Trie2 stores two distinct values for input in the lead surrogate
 * range: one for lead surrogate code units, which is the value that will be
 * returned by this function, and a second value, for lead surrogate code
 * points, that is returned by Trie2.get().
 *
 * For code units outside of the lead surrogate range, this function
 * returns the same result as Trie2.get().
 *
 * This function, together with the alternate value for lead surrogates,
 * makes possible very efficient processing of UTF-16 strings without
 * first converting surrogate pairs to their corresponding 32 bit code point
 * values.
 *
 * At build-time, enumerate the contents of the Trie2 to see if there
 * is non-trivial (non-initialValue) data for any of the supplementary
 * code points associated with a lead surrogate.
 * If so, then set a special (application-specific) value for the
 * lead surrogate code _unit_, with Trie2Writable.setForLeadSurrogateCodeUnit().
 *
 * At runtime, use Trie2.getFromU16SingleLead(). If there is non-trivial
 * data and the code unit is a lead surrogate, then check if a trail surrogate
 * follows. If so, assemble the supplementary code point and look up its value
 * with Trie2.get(); otherwise reset the lead
 * surrogate's value or do a code point lookup for it.
 *
 * If there is only trivial data for lead and trail surrogates, then processing
 * can often skip them. For example, in normalization or case mapping
 * all characters that do not have any mappings are simply copied as is.
 *
 * This method is abstract; the lookup itself is supplied by the concrete
 * Trie2 subclass.
 *
 * @param c a UTF-16 code unit, which may be a lead surrogate.
 * @return the value
 */
public abstract int getFromU16SingleLead(char c);
/**
 * One element produced by iterating over a Trie2: a contiguous range of
 * code points (or of lead surrogate code units) that all have the same value.
 *
 * When iterating, the same Range object is reused and returned for each range.
 * If you need to retain complete iteration results, copy the fields of each
 * returned Range, or save the range in some other way, before advancing to
 * the next iteration step.
 */
public static class Range {
    /** First code point (or lead surrogate code unit) of the range. */
    public int startCodePoint;
    /** Last code point (or lead surrogate code unit) of the range, inclusive. */
    public int endCodePoint;
    /** The value shared by everything in the range. */
    public int value;
    /** True when the range describes the alternate values for lead surrogate code units. */
    public boolean leadSurrogate;

    /**
     * Two ranges are equal when they are of exactly the same class and all
     * four fields match.
     */
    @Override
    public boolean equals(Object other) {
        if (other == null) {
            return false;
        }
        if (!other.getClass().equals(getClass())) {
            return false;
        }
        Range that = (Range) other;
        if (startCodePoint != that.startCodePoint) {
            return false;
        }
        if (endCodePoint != that.endCodePoint) {
            return false;
        }
        if (value != that.value) {
            return false;
        }
        return leadSurrogate == that.leadSurrogate;
    }

    /**
     * Hash of the four fields, folded in the order start, end, value,
     * leadSurrogate using the enclosing class's hashing helpers.
     */
    @Override
    public int hashCode() {
        int hash = hashUChar32(initHash(), startCodePoint);
        hash = hashUChar32(hash, endCodePoint);
        hash = hashInt(hash, value);
        return hashByte(hash, leadSurrogate ? 1 : 0);
    }
}
/**
 * Create an iterator over the value ranges in this Trie2.
 * The values are passed through the default (identity) mapper, so they are
 * neither remapped nor filtered; ranges are reported with the values exactly
 * as stored in the Trie2.
 *
 * @return an Iterator
 */
public Iterator<Range> iterator() {
    final ValueMapper unmapped = defaultValueMapper;
    return iterator(unmapped);
}
/** Identity mapper: hands every trie value back unchanged. */
private static ValueMapper defaultValueMapper = new ValueMapper() {
    @Override
    public int map(int storedValue) {
        return storedValue;
    }
};
/**
 * Create an iterator over the value ranges from this Trie2.
 * Each value read from the Trie2 is first passed through the caller-supplied
 * remapping function; the ranges produced by the iterator are determined by,
 * and report, the remapped values.
 *
 * @param mapper provides a function to remap values obtained from the Trie2.
 * @return an Iterator
 */
public Iterator<Range> iterator(ValueMapper mapper) {
    Trie2Iterator ranges = new Trie2Iterator(mapper);
    return ranges;
}
/**
 * When iterating over the contents of a Trie2, an instance of ValueMapper may
 * be used to remap the values from the Trie2. The remapped values will be used
 * both in determining the ranges of codepoints and as the value to be returned
 * for each range.
 *
 * Example of use, with an anonymous implementation of ValueMapper:
 *
 * <pre>
 *     ValueMapper m = new ValueMapper() {
 *         public int map(int in) { return in &amp; 0x1f; }
 *     };
 *     for (Iterator&lt;Range&gt; iter = trie.iterator(m); iter.hasNext(); ) {
 *         Range r = iter.next();
 *         ...  // Do something with the range r.
 *     }
 * </pre>
 */
public interface ValueMapper {
    /**
     * Remap one value read from the Trie2.
     *
     * @param originalVal the value as stored in the Trie2
     * @return the value to use in its place
     */
    int map(int originalVal);
}
//--------------------------------------------------------------------------------
//
// Below this point are internal implementation items. No further public API.
//
//--------------------------------------------------------------------------------
/**
 * Trie2 data structure in serialized form:
 *
 * UTrie2Header header;
 * uint16_t index[header.indexLength];
 * uint16_t data[header.shiftedDataLength<<2]; -- or uint32_t data[...]
 *
 * For Java, this is read from the stream into an instance of UTrie2Header.
 * (The C version just places a struct over the raw serialized data.)
 *
 * All fields other than the signature are unsigned 16 bit values in the
 * serialized form; they are held in ints here.
 *
 * @internal
 */
static class UTrie2Header {
/** "Tri2" in big-endian US-ASCII (0x54726932) */
int signature;
/**
 * options bit field (uint16_t):
 * 15.. 4 reserved (0)
 * 3.. 0 UTrie2ValueBits valueBits
 */
int options;
/** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH (uint16_t) */
int indexLength;
/** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT (uint16_t) */
int shiftedDataLength;
/** Null index and data blocks, not shifted. (uint16_t) */
int index2NullOffset, dataNullOffset;
/**
 * First code point of the single-value range ending with U+10ffff,
 * rounded up and then shifted right by UTRIE2_SHIFT_1. (uint16_t)
 */
int shiftedHighStart;
}
//
// Data members of UTrie2.
// They are filled in by createFromSerialized() from the serialized header and arrays.
//
UTrie2Header header; // The header exactly as read from the serialized form.
char index[]; // Index array. Includes data for 16 bit Tries.
int data16; // Offset to data portion of the index array, if 16 bit data.
// zero if 32 bit data.
int data32[]; // NULL if 16b data is used via index
int indexLength; // Number of entries in the index portion (header.indexLength).
int dataLength; // Number of data entries (header.shiftedDataLength << UTRIE2_INDEX_SHIFT).
int index2NullOffset; // 0xffff if there is no dedicated index-2 null block
int initialValue; // Read from index[dataNullOffset] when the trie is unserialized.
/** Value returned for out-of-range code points and illegal UTF-8. */
int errorValue;
/* Start of the last range which ends at U+10ffff, and its value. */
int highStart; // header.shiftedHighStart << UTRIE2_SHIFT_1
int highValueIndex; // Position of the high range's value: indexLength + dataLength - UTRIE2_DATA_GRANULARITY
int dataNullOffset; // Offset of the null data block, not shifted (header.dataNullOffset).
/**
 * Trie2 constants, defining shift widths, index array lengths, etc.
 *
 * These are needed for the lookup code but users can treat these as
 * implementation details.
 */
/** Mask for the valueBits field, bits 3..0 of the header's options word. */
static final int UTRIE2_OPTIONS_VALUE_BITS_MASK=0x000f;
/** Shift size for getting the index-1 table offset. */
static final int UTRIE2_SHIFT_1=6+5;
/** Shift size for getting the index-2 table offset. */
static final int UTRIE2_SHIFT_2=5;
/**
 * Difference between the two shift sizes,
 * for getting an index-1 offset from an index-2 offset. 6=11-5
 */
static final int UTRIE2_SHIFT_1_2=UTRIE2_SHIFT_1-UTRIE2_SHIFT_2;
/**
 * Number of index-1 entries for the BMP. 32=0x20
 * This part of the index-1 table is omitted from the serialized form.
 */
static final int UTRIE2_OMITTED_BMP_INDEX_1_LENGTH=0x10000>>UTRIE2_SHIFT_1;
/** Number of entries in an index-2 block. 64=0x40 */
static final int UTRIE2_INDEX_2_BLOCK_LENGTH=1<<UTRIE2_SHIFT_1_2;
/** Mask for getting the lower bits for the in-index-2-block offset. */
static final int UTRIE2_INDEX_2_MASK=UTRIE2_INDEX_2_BLOCK_LENGTH-1;
/** Number of entries in a data block. 32=0x20 */
static final int UTRIE2_DATA_BLOCK_LENGTH=1<<UTRIE2_SHIFT_2;
/** Mask for getting the lower bits for the in-data-block offset. */
static final int UTRIE2_DATA_MASK=UTRIE2_DATA_BLOCK_LENGTH-1;
/**
 * Shift size for shifting left the index array values.
 * Increases possible data size with 16-bit index values at the cost
 * of compactability.
 * This requires data blocks to be aligned by UTRIE2_DATA_GRANULARITY.
 */
static final int UTRIE2_INDEX_SHIFT=2;
/** The alignment size of a data block. Also the granularity for compaction. */
static final int UTRIE2_DATA_GRANULARITY=1<<UTRIE2_INDEX_SHIFT;
/**
 * The part of the index-2 table for U+D800..U+DBFF stores values for
 * lead surrogate code _units_ not code _points_.
 * Values for lead surrogate code _points_ are indexed with this portion of the table.
 * Length=32=0x20=0x400>>UTRIE2_SHIFT_2. (There are 1024=0x400 lead surrogates.)
 */
static final int UTRIE2_LSCP_INDEX_2_OFFSET=0x10000>>UTRIE2_SHIFT_2;
/** Length of the lead-surrogate-code-point part of the index-2 table. 32=0x20 */
static final int UTRIE2_LSCP_INDEX_2_LENGTH=0x400>>UTRIE2_SHIFT_2;
/** Count the lengths of both BMP pieces. 2080=0x820 */
static final int UTRIE2_INDEX_2_BMP_LENGTH=UTRIE2_LSCP_INDEX_2_OFFSET+UTRIE2_LSCP_INDEX_2_LENGTH;
/**
 * The 2-byte UTF-8 version of the index-2 table follows at offset 2080=0x820.
 * Length 32=0x20 for lead bytes C0..DF, regardless of UTRIE2_SHIFT_2.
 */
static final int UTRIE2_UTF8_2B_INDEX_2_OFFSET=UTRIE2_INDEX_2_BMP_LENGTH;
/** Length of the 2-byte UTF-8 part of the index-2 table. 32=0x20 */
static final int UTRIE2_UTF8_2B_INDEX_2_LENGTH=0x800>>6; /* U+0800 is the first code point after 2-byte UTF-8 */
/**
 * The index-1 table, only used for supplementary code points, at offset 2112=0x840.
 * Variable length, for code points up to highStart, where the last single-value range starts.
 * Maximum length 512=0x200=0x100000>>UTRIE2_SHIFT_1.
 * (For 0x100000 supplementary code points U+10000..U+10ffff.)
 *
 * The part of the index-2 table for supplementary code points starts
 * after this index-1 table.
 *
 * Both the index-1 table and the following part of the index-2 table
 * are omitted completely if there is only BMP data.
 */
static final int UTRIE2_INDEX_1_OFFSET=UTRIE2_UTF8_2B_INDEX_2_OFFSET+UTRIE2_UTF8_2B_INDEX_2_LENGTH;
/**
 * The illegal-UTF-8 data block follows the ASCII block, at offset 128=0x80.
 * Used with linear access for single bytes 0..0xbf for simple error handling.
 * Length 64=0x40, not UTRIE2_DATA_BLOCK_LENGTH.
 */
static final int UTRIE2_BAD_UTF8_DATA_OFFSET=0x80;
/**
 * Implementation class for an iterator over a Trie2.
 *
 * Iteration over a Trie2 first returns all of the ranges that are indexed by code points,
 * then returns the special alternate values for the lead surrogates.
 *
 * A single Range object is reused for every call to next().
 *
 * @internal
 */
class Trie2Iterator implements Iterator<Range> {
    //
    // Iteration State Variables
    //
    private ValueMapper mapper;
    // The one Range instance handed back from every call to next().
    private Range returnValue = new Range();
    // The starting code point for the next range to be returned.
    private int nextStart;
    // The upper limit for the last normal range to be returned. Normally 0x110000, but
    // may be lower when iterating over the code points for a single lead surrogate.
    private int limitCP;
    // True while iterating over the Trie2 values for code points.
    // False while iterating over the alternate values for lead surrogates.
    private boolean doingCodePoints = true;
    // True if the iterator should iterate the special values for lead surrogates in
    // addition to the normal values for code points.
    private boolean doLeadSurrogates = true;

    // The normal constructor that configures the iterator to cover the complete
    // contents of the Trie2
    Trie2Iterator(ValueMapper vm) {
        mapper = vm;
        nextStart = 0;
        limitCP = 0x110000;
        doLeadSurrogates = true;
    }

    /**
     * The main next() function for Trie2 iterators.
     *
     * Adjacent stored ranges whose mapped values are equal are merged
     * into a single returned Range.
     *
     * @return the shared Range object, filled in for the next range
     * @throws NoSuchElementException if the iteration is exhausted
     */
    @Override
    public Range next() {
        if (!hasNext()) {
            throw new NoSuchElementException();
        }
        if (nextStart >= limitCP) {
            // Switch over from iterating normal code point values to
            // doing the alternate lead-surrogate values.
            doingCodePoints = false;
            nextStart = 0xd800;
        }

        final int mappedVal;
        int endOfRange;
        if (doingCodePoints) {
            // Iteration over code point values.
            int val = get(nextStart);
            mappedVal = mapper.map(val);
            endOfRange = rangeEnd(nextStart, limitCP, val);
            // One pass per following stored range; keep extending for as long as
            // the next range's mapped value equals the current mapped value.
            while (endOfRange < limitCP - 1) {
                val = get(endOfRange + 1);
                if (mapper.map(val) != mappedVal) {
                    break;
                }
                endOfRange = rangeEnd(endOfRange + 1, limitCP, val);
            }
        } else {
            // Iteration over the alternate lead surrogate values.
            int val = getFromU16SingleLead((char) nextStart);
            mappedVal = mapper.map(val);
            endOfRange = rangeEndLS((char) nextStart);
            // Same merging of equal mapped values, bounded by the last lead surrogate.
            while (endOfRange < 0xdbff) {
                val = getFromU16SingleLead((char) (endOfRange + 1));
                if (mapper.map(val) != mappedVal) {
                    break;
                }
                endOfRange = rangeEndLS((char) (endOfRange + 1));
            }
        }

        returnValue.startCodePoint = nextStart;
        returnValue.endCodePoint = endOfRange;
        returnValue.value = mappedVal;
        returnValue.leadSurrogate = !doingCodePoints;
        nextStart = endOfRange + 1;
        return returnValue;
    }

    /**
     * @return true while code point ranges or lead surrogate ranges remain
     */
    @Override
    public boolean hasNext() {
        if (doingCodePoints && (doLeadSurrogates || nextStart < limitCP)) {
            return true;
        }
        return nextStart < 0xdc00;
    }

    /**
     * Find the last lead surrogate code unit, at or after startingLS, of the run
     * of lead surrogates sharing the same (unmapped) code unit value.
     * Never returns more than 0xdbff.
     */
    private int rangeEndLS(char startingLS) {
        if (startingLS >= 0xdbff) {
            return 0xdbff;
        }
        final int val = getFromU16SingleLead(startingLS);
        int c = startingLS + 1;
        while (c <= 0x0dbff && getFromU16SingleLead((char) c) == val) {
            c++;
        }
        return c - 1;
    }
}
/**
 * Find the last code point in a contiguous range of code points, beginning
 * at start, whose Trie2 values all equal val.
 *
 * The scan stops at the smaller of highStart and limitp. If it gets as far
 * as highStart, the range is taken to extend to limitp - 1.
 *
 * @param start  The code point to begin with.
 * @param limitp Exclusive upper limit for the range.
 * @param val    The value the range must share; the callers in this class pass
 *               the value at start.
 * @return The last contiguous code point with the same value.
 */
int rangeEnd(int start, int limitp, int val) {
    final int scanLimit = Math.min(highStart, limitp);
    int cp = start + 1;
    while (cp < scanLimit && get(cp) == val) {
        cp++;
    }
    // Reaching highStart means the rest of the requested span has a single value.
    int endExclusive = (cp >= highStart) ? limitp : cp;
    return endExclusive - 1;
}
//
// Hashing implementation functions. FNV hash. Respected public domain algorithm.
//
/** Starting value for the FNV hash: 0x811c9DC5, i.e. unsigned 2166136261. */
private static int initHash() {
    final int offsetBasis = 0x811c9DC5;
    return offsetBasis;
}
/**
 * Fold one more value into a running hash: multiply by 16777619, then
 * exclusive-or with b.
 */
private static int hashByte(int h, int b) {
    return (h * 16777619) ^ b;
}
/**
 * Fold a code point into a running hash in three steps: the low byte,
 * the second byte, then everything above bit 15 (not masked).
 */
private static int hashUChar32(int h, int c) {
    final int low = c & 255;
    final int middle = (c >> 8) & 255;
    final int high = c >> 16;
    return Trie2.hashByte(Trie2.hashByte(Trie2.hashByte(h, low), middle), high);
}
/** Fold all four bytes of an int into a running hash, lowest byte first. */
private static int hashInt(int h, int i) {
    for (int shift = 0; shift <= 24; shift += 8) {
        h = Trie2.hashByte(h, (i >> shift) & 255);
    }
    return h;
}
}

View file

@ -0,0 +1,167 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 2009-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.IOException;
import java.nio.ByteBuffer;
/**
 * @author aheninger
 *
 * A read-only Trie2, holding 16 bit data values.
 *
 * A Trie2 is a highly optimized data structure for mapping from Unicode
 * code points (values ranging from 0 to 0x10ffff) to a 16 or 32 bit value.
 *
 * See class Trie2 for descriptions of the API for accessing the contents of a trie.
 *
 * The 16 bit data values live in the same char array as the index, so every
 * lookup here ends with a read from that shared array.
 */
public final class Trie2_16 extends Trie2 {

    /**
     * Internal constructor, not for general use.
     */
    Trie2_16() {
    }

    /**
     * Create a Trie2 from its serialized form. Inverse of utrie2_serialize().
     * The serialized format is identical between ICU4C and ICU4J, so this function
     * will work with serialized Trie2s from either.
     *
     * The serialized Trie2 in the bytes may be in either little or big endian byte order.
     * This allows using serialized Tries from ICU4C without needing to consider the
     * byte order of the system that created them.
     *
     * @param bytes a byte buffer to the serialized form of a UTrie2.
     * @return An unserialized Trie2_16, ready for use.
     * @throws IllegalArgumentException if the buffer does not contain a serialized Trie2.
     * @throws IOException if a read error occurs in the buffer.
     * @throws ClassCastException if the unserialized trie is not a Trie2_16
     */
    public static Trie2_16 createFromSerialized(ByteBuffer bytes) throws IOException {
        Trie2 unserialized = Trie2.createFromSerialized(bytes);
        return (Trie2_16) unserialized;
    }

    /**
     * Get the value for a code point as stored in the Trie2.
     *
     * @param codePoint the code point
     * @return the value, or the trie's error value when the code point is
     *         outside of the range 0..0x10ffff
     */
    @Override
    public final int get(int codePoint) {
        if (codePoint < 0) {
            return errorValue;
        }

        // Pick the index-2 entry for the code point; the range checks are made
        // in a fixed order, lowest range first.
        final int index2Entry;
        if (codePoint < 0x0d800 || (codePoint > 0x0dbff && codePoint <= 0x0ffff)) {
            // Ordinary BMP code point, excluding leading surrogates.
            // BMP uses a single level lookup. BMP index starts at offset 0 in the Trie2 index.
            index2Entry = index[codePoint >> UTRIE2_SHIFT_2];
        } else if (codePoint <= 0xffff) {
            // Lead Surrogate Code Point. A separate index section is stored for
            // lead surrogate code units and code points.
            // The main index has the code unit data.
            // For this function, we need the code point data.
            index2Entry = index[UTRIE2_LSCP_INDEX_2_OFFSET + ((codePoint - 0xd800) >> UTRIE2_SHIFT_2)];
        } else if (codePoint < highStart) {
            // Supplemental code point, use two-level lookup.
            int index1Pos = (UTRIE2_INDEX_1_OFFSET - UTRIE2_OMITTED_BMP_INDEX_1_LENGTH)
                    + (codePoint >> UTRIE2_SHIFT_1);
            int index2Pos = index[index1Pos] + ((codePoint >> UTRIE2_SHIFT_2) & UTRIE2_INDEX_2_MASK);
            index2Entry = index[index2Pos];
        } else if (codePoint <= 0x10ffff) {
            // At or above highStart: the single value of the high range.
            return index[highValueIndex];
        } else {
            // The code point is outside of the legal range of 0..0x10ffff.
            return errorValue;
        }

        // 16 bit data is stored in the index array itself.
        return index[(index2Entry << UTRIE2_INDEX_SHIFT) + (codePoint & UTRIE2_DATA_MASK)];
    }

    /**
     * Get a Trie2 value for a UTF-16 code unit.
     *
     * This function returns the same value as get() if the input
     * character is outside of the lead surrogate range
     *
     * There are two values stored in a Trie2 for inputs in the lead
     * surrogate range. This function returns the alternate value,
     * while Trie2.get() returns the main value.
     *
     * @param codeUnit a 16 bit code unit or lead surrogate value.
     * @return the value
     */
    @Override
    public int getFromU16SingleLead(char codeUnit) {
        // Because the input is a 16 bit char, we can skip the tests for it being in
        // the BMP range. It is.
        final int index2Entry = index[codeUnit >> UTRIE2_SHIFT_2];
        return index[(index2Entry << UTRIE2_INDEX_SHIFT) + (codeUnit & UTRIE2_DATA_MASK)];
    }

    /**
     * @return the number of bytes of the serialized trie: 16 bytes plus
     *         two bytes for each index and data entry
     */
    public int getSerializedLength() {
        final int entryCount = header.indexLength + dataLength;
        return 16 + entryCount * 2;
    }
}

View file

@ -0,0 +1,267 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
*
* Copyright (C) 2004-2014, International Business Machines
* Corporation and others. All Rights Reserved.
*
*******************************************************************************
* file name: UBiDiProps.java
* encoding: US-ASCII
* tab size: 8 (not used)
* indentation:4
*
* created on: 2005jan16
* created by: Markus W. Scherer
*
* Low-level Unicode bidi/shaping properties access.
* Java port of ubidi_props.h/.c.
*/
package sun.text.normalizer;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.MissingResourceException;
/**
 * Low-level Unicode bidi/shaping properties access, backed by the data file
 * named by DATA_FILE_NAME. The data is loaded once, when the class is
 * initialized, and is reached through {@link #INSTANCE}.
 *
 * Per code point, a 16-bit properties word is looked up in a Trie2_16; the
 * word packs the bidi class, joining type, paired bracket type and a small
 * mirroring delta. Mirrors that do not fit in the delta are found in the
 * mirrors[] table, and joining groups in the two jgArray tables.
 */
public final class UBiDiProps {
    // constructors etc. --------------------------------------------------- ***

    // port of ubidi_openProps()
    private UBiDiProps() throws IOException{
        ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME);
        readData(bytes);
    }

    /**
     * Reads, in order: the ICU data header, indexes[], the serialized trie
     * (plus any padding after it), mirrors[], jgArray[] and jgArray2[].
     *
     * @param bytes the contents of the data file
     * @throws IOException if indexes[0] is smaller than IX_TOP or the trie is
     *         longer than the size recorded in indexes[]
     */
    private void readData(ByteBuffer bytes) throws IOException {
        // read the header
        ICUBinary.readHeader(bytes, FMT, new IsAcceptable());

        // read indexes[]; the first int is the length of indexes[] itself
        int i, count;
        count=bytes.getInt();
        if(count<IX_TOP) {
            throw new IOException("indexes[0] too small in "+DATA_FILE_NAME);
        }
        indexes=new int[count];

        indexes[0]=count;
        for(i=1; i<count; ++i) {
            indexes[i]=bytes.getInt();
        }

        // read the trie
        trie=Trie2_16.createFromSerialized(bytes);
        int expectedTrieLength=indexes[IX_TRIE_SIZE];
        int trieLength=trie.getSerializedLength();
        if(trieLength>expectedTrieLength) {
            throw new IOException(DATA_FILE_NAME+": not enough bytes for the trie");
        }
        // skip padding after trie bytes
        ICUBinary.skipBytes(bytes, expectedTrieLength-trieLength);

        // read mirrors[]; the array is left null when the table is empty
        count=indexes[IX_MIRROR_LENGTH];
        if(count>0) {
            mirrors=new int[count];
            for(i=0; i<count; ++i) {
                mirrors[i]=bytes.getInt();
            }
        }

        // read jgArray[]
        count=indexes[IX_JG_LIMIT]-indexes[IX_JG_START];
        jgArray=new byte[count];
        for(i=0; i<count; ++i) {
            jgArray[i]=bytes.get();
        }

        // read jgArray2[]
        count=indexes[IX_JG_LIMIT2]-indexes[IX_JG_START2];
        jgArray2=new byte[count];
        for(i=0; i<count; ++i) {
            jgArray2[i]=bytes.get();
        }
    }

    // implement ICUBinary.Authenticate
    /** Accepts only data whose first version byte is 2. */
    private static final class IsAcceptable implements ICUBinary.Authenticate {
        public boolean isDataVersionAcceptable(byte version[]) {
            return version[0]==2;
        }
    }

    // property access functions ------------------------------------------- ***

    /**
     * @param c a code point
     * @return the bidi class bits of the properties word for c
     */
    public final int getClass(int c) {
        return getClassFromProps(trie.get(c));
    }

    /**
     * Resolve the mirror of c from its already looked-up properties word.
     * A delta other than ESC_MIRROR_DELTA is simply added to c; the escape
     * value means the mirror has to be looked up in mirrors[].
     *
     * @param c a code point
     * @param props the properties word for c
     * @return the mirror code point, or c itself if none is found
     */
    private final int getMirror(int c, int props) {
        int delta=getMirrorDeltaFromProps(props);
        if(delta!=ESC_MIRROR_DELTA) {
            return c+delta;
        } else {
            /* look for mirror code point in the mirrors[] table */
            int m;
            int i, length;
            int c2;

            length=indexes[IX_MIRROR_LENGTH];

            /* linear search; stops early once an entry's code point exceeds c */
            for(i=0; i<length; ++i) {
                m=mirrors[i];
                c2=getMirrorCodePoint(m);
                if(c==c2) {
                    /* found c, return its mirror code point using the index in m */
                    return getMirrorCodePoint(mirrors[getMirrorIndex(m)]);
                } else if(c<c2) {
                    break;
                }
            }

            /* c not found, return it itself */
            return c;
        }
    }

    /**
     * @param c a code point
     * @return the mirror code point of c, or c itself if it has none
     */
    public final int getMirror(int c) {
        int props=trie.get(c);
        return getMirror(c, props);
    }

    /**
     * @param c a code point
     * @return the joining type bits of the properties word for c
     */
    public final int getJoiningType(int c) {
        return (trie.get(c)&JT_MASK)>>JT_SHIFT;
    }

    /**
     * Look c up in the two joining group tables, each of which covers the
     * range [start, limit) given in indexes[].
     *
     * @param c a code point
     * @return the joining group as an unsigned byte value, or
     *         UCharacter.JoiningGroup.NO_JOINING_GROUP if c is in neither range
     */
    public final int getJoiningGroup(int c) {
        int start, limit;
        start=indexes[IX_JG_START];
        limit=indexes[IX_JG_LIMIT];
        if(start<=c && c<limit) {
            return (int)jgArray[c-start]&0xff;
        }
        start=indexes[IX_JG_START2];
        limit=indexes[IX_JG_LIMIT2];
        if(start<=c && c<limit) {
            return (int)jgArray2[c-start]&0xff;
        }
        return UCharacter.JoiningGroup.NO_JOINING_GROUP;
    }

    /**
     * @param c a code point
     * @return the Bidi_Paired_Bracket_Type bits of the properties word for c
     */
    public final int getPairedBracketType(int c) {
        return (trie.get(c)&BPT_MASK)>>BPT_SHIFT;
    }

    /**
     * @param c a code point
     * @return c itself when its paired bracket type bits are zero,
     *         otherwise the mirror code point of c
     */
    public final int getPairedBracket(int c) {
        int props=trie.get(c);
        if((props&BPT_MASK)==0) {
            return c;
        } else {
            return getMirror(c, props);
        }
    }

    // data members -------------------------------------------------------- ***
    private int indexes[];
    private int mirrors[];      // stays null when indexes[IX_MIRROR_LENGTH] is not positive
    private byte jgArray[];
    private byte jgArray2[];

    private Trie2_16 trie;

    // data format constants ----------------------------------------------- ***
    private static final String DATA_FILE_NAME = "/sun/text/resources/ubidi.icu";

    /* format "BiDi" */
    private static final int FMT=0x42694469;

    /* indexes into indexes[] */
    private static final int IX_TRIE_SIZE=2;
    private static final int IX_MIRROR_LENGTH=3;

    private static final int IX_JG_START=4;
    private static final int IX_JG_LIMIT=5;
    private static final int IX_JG_START2=6;  /* new in format version 2.2, ICU 54 */
    private static final int IX_JG_LIMIT2=7;

    private static final int IX_TOP=16;

    // definitions for 16-bit bidi/shaping properties word ----------------- ***

                          /* CLASS_SHIFT=0, */ /* bidi class: 5 bits (4..0) */
    private static final int JT_SHIFT=5;           /* joining type: 3 bits (7..5) */

    private static final int BPT_SHIFT=8;          /* Bidi_Paired_Bracket_Type(bpt): 2 bits (9..8) */

    private static final int MIRROR_DELTA_SHIFT=13;        /* bidi mirroring delta: 3 bits (15..13) */

    private static final int CLASS_MASK=    0x0000001f;
    private static final int JT_MASK=       0x000000e0;
    private static final int BPT_MASK=      0x00000300;

    private static final int getClassFromProps(int props) {
        return props&CLASS_MASK;
    }
    private static final boolean getFlagFromProps(int props, int shift) {
        return ((props>>shift)&1)!=0;
    }
    /* The cast to short happens before the shift, so the delta comes out signed. */
    private static final int getMirrorDeltaFromProps(int props) {
        return (short)props>>MIRROR_DELTA_SHIFT;
    }

    private static final int ESC_MIRROR_DELTA=-4;

    // definitions for 32-bit mirror table entry --------------------------- ***

    /* the source Unicode code point takes 21 bits (20..0) */
    private static final int MIRROR_INDEX_SHIFT=21;

    private static final int getMirrorCodePoint(int m) {
        return m&0x1fffff;
    }
    private static final int getMirrorIndex(int m) {
        return m>>>MIRROR_INDEX_SHIFT;
    }

    /*
     * public singleton instance
     */
    public static final UBiDiProps INSTANCE;

    // This static initializer block must be placed after
    // other static member initialization
    static {
        try {
            INSTANCE = new UBiDiProps();
        } catch (IOException e) {
            // MissingResourceException has no public constructor taking a cause,
            // so attach the IOException explicitly to keep its stack trace.
            MissingResourceException mre =
                    new MissingResourceException(e.getMessage(),DATA_FILE_NAME,"");
            mre.initCause(e);
            throw mre;
        }
    }
}

View file

@ -0,0 +1,539 @@
/*
* Copyright (c) 2009, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
*******************************************************************************
* Copyright (C) 1996-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
/**
* <p>The UCharacter class provides extensions to the
* <a href="http://java.sun.com/j2se/1.5/docs/api/java/lang/Character.html">
* java.lang.Character</a> class. These extensions provide support for
* more Unicode properties and together with the <a href=../text/UTF16.html>UTF16</a>
* class, provide support for supplementary characters (those with code
* points above U+FFFF).
* Each ICU release supports the latest version of Unicode available at that time.
*
* <p>Code points are represented in these API using ints. While it would be
* more convenient in Java to have a separate primitive datatype for them,
* ints suffice in the meantime.
*
* <p>To use this class please add the jar file name icu4j.jar to the
* class path, since it contains data files which supply the information used
* by this file.<br>
* E.g. In Windows <br>
* <code>set CLASSPATH=%CLASSPATH%;$JAR_FILE_PATH/ucharacter.jar</code>.<br>
* Otherwise, another method would be to copy the files uprops.dat and
* unames.icu from the icu4j source subdirectory
* <i>$ICU4J_SRC/src/com.ibm.icu.impl.data</i> to your class directory
* <i>$ICU4J_CLASS/com.ibm.icu.impl.data</i>.
*
* <p>Aside from the additions for UTF-16 support, and the updated Unicode
* properties, the main differences between UCharacter and Character are:
* <ul>
* <li> UCharacter is not designed to be a char wrapper and does not have
* APIs that involve management of that single char.<br>
* These include:
* <ul>
* <li> char charValue(),
* <li> int compareTo(java.lang.Character, java.lang.Character), etc.
* </ul>
* <li> UCharacter does not include Character APIs that are deprecated, nor
* does it include the Java-specific character information, such as
* boolean isJavaIdentifierPart(char ch).
* <li> Character maps characters 'A' - 'Z' and 'a' - 'z' to the numeric
* values '10' - '35'. UCharacter also does this in digit and
* getNumericValue, to adhere to the java semantics of these
* methods. New methods unicodeDigit, and
* getUnicodeNumericValue do not treat the above code points
* as having numeric values. This is a semantic change from ICU4J 1.3.1.
* </ul>
* <p>
* Further detail on differences can be determined using the program
* <a href=
* "http://source.icu-project.org/repos/icu/icu4j/trunk/src/com/ibm/icu/dev/test/lang/UCharacterCompare.java">
* com.ibm.icu.dev.test.lang.UCharacterCompare</a>
* </p>
* <p>
* In addition to Java compatibility functions, which calculate derived properties,
* this API provides low-level access to the Unicode Character Database.
* </p>
* <p>
* Unicode assigns each code point (not just assigned character) values for
* many properties.
* Most of them are simple boolean flags, or constants from a small enumerated list.
* For some properties, values are strings or other relatively more complex types.
* </p>
* <p>
* For more information see
* <a href="http://www.unicode.org/ucd/">"About the Unicode Character Database"</a>
* (http://www.unicode.org/ucd/)
* and the <a href="http://www.icu-project.org/userguide/properties.html">ICU
* User Guide chapter on Properties</a>
* (http://www.icu-project.org/userguide/properties.html).
* </p>
* <p>
* There are also functions that provide easy migration from C/POSIX functions
* like isblank(). Their use is generally discouraged because the C/POSIX
* standards do not define their semantics beyond the ASCII range, which means
* that different implementations exhibit very different behavior.
* Instead, Unicode properties should be used directly.
* </p>
* <p>
* There are also only a few, broad C/POSIX character classes, and they tend
* to be used for conflicting purposes. For example, the "isalpha()" class
* is sometimes used to determine word boundaries, while a more sophisticated
* approach would at least distinguish initial letters from continuation
* characters (the latter including combining marks).
* (In ICU, BreakIterator is the most sophisticated API for word boundaries.)
* Another example: There is no "istitle()" class for titlecase characters.
* </p>
* <p>
* ICU 3.4 and later provides API access for all twelve C/POSIX character classes.
* ICU implements them according to the Standard Recommendations in
* Annex C: Compatibility Properties of UTS #18 Unicode Regular Expressions
* (http://www.unicode.org/reports/tr18/#Compatibility_Properties).
* </p>
* <p>
* API access for C/POSIX character classes is as follows:
* <pre>{@code
* - alpha: isUAlphabetic(c) or hasBinaryProperty(c, UProperty.ALPHABETIC)
* - lower: isULowercase(c) or hasBinaryProperty(c, UProperty.LOWERCASE)
* - upper: isUUppercase(c) or hasBinaryProperty(c, UProperty.UPPERCASE)
* - punct: ((1<<getType(c)) & ((1<<DASH_PUNCTUATION)|(1<<START_PUNCTUATION)|
* (1<<END_PUNCTUATION)|(1<<CONNECTOR_PUNCTUATION)|(1<<OTHER_PUNCTUATION)|
* (1<<INITIAL_PUNCTUATION)|(1<<FINAL_PUNCTUATION)))!=0
* - digit: isDigit(c) or getType(c)==DECIMAL_DIGIT_NUMBER
* - xdigit: hasBinaryProperty(c, UProperty.POSIX_XDIGIT)
* - alnum: hasBinaryProperty(c, UProperty.POSIX_ALNUM)
* - space: isUWhiteSpace(c) or hasBinaryProperty(c, UProperty.WHITE_SPACE)
* - blank: hasBinaryProperty(c, UProperty.POSIX_BLANK)
* - cntrl: getType(c)==CONTROL
* - graph: hasBinaryProperty(c, UProperty.POSIX_GRAPH)
* - print: hasBinaryProperty(c, UProperty.POSIX_PRINT)
* }</pre>
* </p>
* <p>
* The C/POSIX character classes are also available in UnicodeSet patterns,
* using patterns like [:graph:] or \p{graph}.
* </p>
*
* There are several ICU (and Java) whitespace functions.
* Comparison:<ul>
* <li> isUWhiteSpace=UCHAR_WHITE_SPACE: Unicode White_Space property;
* most of general categories "Z" (separators) + most whitespace ISO controls
* (including no-break spaces, but excluding IS1..IS4 and ZWSP)
* <li> isWhitespace: Java isWhitespace; Z + whitespace ISO controls but excluding no-break spaces
* <li> isSpaceChar: just Z (including no-break spaces)</ul>
* </p>
* <p>
* This class is not subclassable.
* </p>
* @author Syn Wee Quek
* @stable ICU 2.1
* @see com.ibm.icu.lang.UCharacterEnums
*/
public final class UCharacter
{
    /**
     * Constants for the Joining_Group property.
     * @see UProperty#JOINING_GROUP
     * @stable ICU 2.4
     */
    public static interface JoiningGroup
    {
        /**
         * @stable ICU 2.4
         */
        int NO_JOINING_GROUP = 0;
    }

    /**
     * Constants for the Numeric_Type property.
     * @see UProperty#NUMERIC_TYPE
     * @stable ICU 2.4
     */
    public static interface NumericType
    {
        /**
         * @stable ICU 2.4
         */
        int NONE = 0;
        /**
         * @stable ICU 2.4
         */
        int DECIMAL = 1;
        /**
         * @stable ICU 2.4
         */
        int DIGIT = 2;
        /**
         * @stable ICU 2.4
         */
        int NUMERIC = 3;
        /**
         * Number of Numeric_Type constants defined above.
         * @stable ICU 2.4
         */
        int COUNT = 4;
    }

    /**
     * Constants for the Hangul_Syllable_Type property. The bracketed
     * abbreviation next to each constant is the short property value name.
     *
     * @see UProperty#HANGUL_SYLLABLE_TYPE
     * @stable ICU 2.6
     */
    public static interface HangulSyllableType
    {
        /**
         * @stable ICU 2.6
         */
        int NOT_APPLICABLE = 0; /*[NA]*/ /*See note !!*/
        /**
         * @stable ICU 2.6
         */
        int LEADING_JAMO = 1; /*[L]*/
        /**
         * @stable ICU 2.6
         */
        int VOWEL_JAMO = 2; /*[V]*/
        /**
         * @stable ICU 2.6
         */
        int TRAILING_JAMO = 3; /*[T]*/
        /**
         * @stable ICU 2.6
         */
        int LV_SYLLABLE = 4; /*[LV]*/
        /**
         * @stable ICU 2.6
         */
        int LVT_SYLLABLE = 5; /*[LVT]*/
        /**
         * Number of Hangul_Syllable_Type constants defined above.
         * @stable ICU 2.6
         */
        int COUNT = 6;
    }

    // public data members -----------------------------------------------

    /**
     * The lowest Unicode code point value.
     * @stable ICU 2.1
     */
    public static final int MIN_VALUE = UTF16.CODEPOINT_MIN_VALUE;

    /**
     * The highest Unicode code point value (scalar value) according to the
     * Unicode Standard.
     * This is a 21-bit value (21 bits, rounded up).<br>
     * Up-to-date Unicode implementation of java.lang.Character.MAX_VALUE
     * @stable ICU 2.1
     */
    public static final int MAX_VALUE = UTF16.CODEPOINT_MAX_VALUE;

    // public methods ----------------------------------------------------

    /**
     * Returns the value of a code point as a digit in the given radix.
     * <br>This method observes the semantics of
     * <code>java.lang.Character.digit()</code>: the decimal digit value of
     * the code point is tried first and, if there is none, the European
     * (Latin) letters are tried. It may therefore return non-negative
     * values for code points for which isDigit returns false, just like
     * java.lang.Character.
     * <br><em>Semantic Change:</em> In release 1.3.1 and
     * prior, this did not treat the European letters as having a
     * digit value, and also treated numeric letters and other numbers as
     * digits.
     * This has been changed to conform to the java semantics.
     * <br>A code point is a valid digit if and only if:
     * <ul>
     *   <li>ch is a decimal digit or one of the european letters, and
     *   <li>the value of ch is less than the specified radix.
     * </ul>
     * @param ch the code point to query
     * @param radix the radix; values outside 2..36 are rejected
     * @return the numeric value represented by the code point in the
     *         specified radix, or -1 if the radix is invalid, if the code
     *         point is not a digit, or if its value is too large for the
     *         radix
     * @stable ICU 2.1
     */
    public static int digit(int ch, int radix)
    {
        if (radix < 2 || radix > 36) {
            return -1; // invalid radix
        }
        int digitValue = digit(ch);
        if (digitValue < 0) {
            // ch is not a decimal digit, try latin letters
            digitValue = UCharacterProperty.getEuropeanDigit(ch);
        }
        if (digitValue >= radix) {
            return -1; // too large for this radix
        }
        return digitValue;
    }

    /**
     * Returns the numeric value of a decimal digit code point.
     * <br>Convenience variant of <code>digit(int, int)</code> for the
     * decimal radix; the lookup is delegated to the shared
     * UCharacterProperty instance.
     * <br><em>Semantic Change:</em> In release 1.3.1 and prior, this
     * treated numeric letters and other numbers as digits. This has
     * been changed to conform to the java semantics.
     * @param ch the code point to query
     * @return the numeric value represented by the code point,
     *         or -1 if the code point is not a decimal digit or if its
     *         value is too large for a decimal radix
     * @stable ICU 2.1
     */
    public static int digit(int ch)
    {
        final UCharacterProperty props = UCharacterProperty.INSTANCE;
        return props.digit(ch);
    }

    /**
     * Returns a value indicating a code point's Unicode category.
     * Up-to-date Unicode implementation of java.lang.Character.getType()
     * except for the code points that had their category changed.<br>
     * Return results are constants from the interface
     * <a href=UCharacterCategory.html>UCharacterCategory</a><br>
     * <em>NOTE:</em> the UCharacterCategory values are <em>not</em>
     * compatible with those returned by java.lang.Character.getType.
     * UCharacterCategory values match the ones used in ICU4C, while
     * java.lang.Character type values, though similar, skip the value 17.
     * @param ch code point whose type is to be determined
     * @return category which is a value of UCharacterCategory
     * @stable ICU 2.1
     */
    public static int getType(int ch)
    {
        final UCharacterProperty props = UCharacterProperty.INSTANCE;
        return props.getType(ch);
    }

    /**
     * Returns the bidirectional class of a code point.
     * For example, 0x0041 (letter A) has the LEFT_TO_RIGHT directional
     * property.<br>
     * Result returned belongs to the interface
     * <a href=UCharacterDirection.html>UCharacterDirection</a>
     * @param ch the code point whose direction is to be determined
     * @return direction constant from UCharacterDirection.
     * @stable ICU 2.1
     */
    public static int getDirection(int ch)
    {
        final UBiDiProps bidi = UBiDiProps.INSTANCE;
        return bidi.getClass(ch);
    }

    /**
     * Maps the specified code point to a "mirror-image" code point.
     * For code points with the "mirrored" property, implementations sometimes
     * need a "poor man's" mapping to another code point such that the default
     * glyph may serve as the mirror-image of the default glyph of the
     * specified code point.<br>
     * This is useful for text conversion to and from codepages with visual
     * order, and for displays without glyph selection capabilities.
     * @param ch code point whose mirror is to be retrieved
     * @return another code point that may serve as a mirror-image substitute,
     *         or ch itself if there is no such mapping or ch does not have
     *         the "mirrored" property
     * @stable ICU 2.1
     */
    public static int getMirror(int ch)
    {
        final UBiDiProps bidi = UBiDiProps.INSTANCE;
        return bidi.getMirror(ch);
    }

    /**
     * Maps the specified character to its paired bracket character.
     * For Bidi_Paired_Bracket_Type!=None, this is the same as getMirror(int).
     * Otherwise c itself is returned.
     * See http://www.unicode.org/reports/tr9/
     *
     * @param c the code point to be mapped
     * @return the paired bracket code point,
     *         or c itself if there is no such mapping
     *         (Bidi_Paired_Bracket_Type=None)
     *
     * @see UProperty#BIDI_PAIRED_BRACKET
     * @see UProperty#BIDI_PAIRED_BRACKET_TYPE
     * @see #getMirror(int)
     * @stable ICU 52
     */
    public static int getBidiPairedBracket(int c) {
        final UBiDiProps bidi = UBiDiProps.INSTANCE;
        return bidi.getPairedBracket(c);
    }

    /**
     * Returns the combining class of the argument code point, as reported
     * by the NFD normalizer instance.
     * @param ch code point whose combining class is to be retrieved
     * @return the combining class of the code point
     * @stable ICU 2.1
     */
    public static int getCombiningClass(int ch)
    {
        return Normalizer2
                .getNFDInstance()
                .getCombiningClass(ch);
    }

    /**
     * Returns the version of Unicode data used.
     * @return the Unicode version recorded by the shared
     *         UCharacterProperty instance
     * @stable ICU 2.1
     */
    public static VersionInfo getUnicodeVersion()
    {
        final UCharacterProperty props = UCharacterProperty.INSTANCE;
        return props.m_unicodeVersion_;
    }

    /**
     * Returns the code point formed by a UTF-16 surrogate pair.
     * @param lead the lead (first) surrogate
     * @param trail the trail (second) surrogate
     * @return the supplementary code point encoded by the pair
     * @exception IllegalArgumentException thrown when lead is not a lead
     *            surrogate or trail is not a trail surrogate
     * @stable ICU 2.1
     */
    public static int getCodePoint(char lead, char trail)
    {
        final boolean validPair =
                UTF16.isLeadSurrogate(lead) && UTF16.isTrailSurrogate(trail);
        if (!validPair) {
            throw new IllegalArgumentException("Illegal surrogate characters");
        }
        return UCharacterProperty.getRawSupplementary(lead, trail);
    }

    /**
     * Returns the "age" of the code point.
     * <p>The "age" is the Unicode version when the code point was first
     * designated (as a non-character or for Private Use) or assigned a
     * character.
     * <p>This can be useful to avoid emitting code points to receiving
     * processes that do not accept newer characters.
     * <p>The data is from the UCD file DerivedAge.txt.
     * @param ch The code point.
     * @return the Unicode version number
     * @exception IllegalArgumentException if ch lies outside
     *            MIN_VALUE..MAX_VALUE
     * @stable ICU 2.6
     */
    public static VersionInfo getAge(int ch)
    {
        final boolean inRange = MIN_VALUE <= ch && ch <= MAX_VALUE;
        if (!inRange) {
            throw new IllegalArgumentException("Codepoint out of bounds");
        }
        final UCharacterProperty props = UCharacterProperty.INSTANCE;
        return props.getAge(ch);
    }

    /**
     * Returns the property value for a Unicode property type of a code
     * point. Also returns binary and mask property values.
     * <p>Unicode, especially in version 3.2, defines many more properties
     * than the original set in UnicodeData.txt.
     * <p>The properties APIs are intended to reflect Unicode properties as
     * defined in the Unicode Character Database (UCD) and Unicode Technical
     * Reports (UTR). For details about the properties see
     * http://www.unicode.org/.
     * <p>For names of Unicode properties see the UCD file
     * PropertyAliases.txt.
     * <p>The call is forwarded unchanged to
     * UCharacterProperty.getIntPropertyValue, which decides which selectors
     * are actually supported.
     * <pre>
     * Sample usage:
     * int ea = UCharacter.getIntPropertyValue(c, UProperty.EAST_ASIAN_WIDTH);
     * int ideo = UCharacter.getIntPropertyValue(c, UProperty.IDEOGRAPHIC);
     * boolean b = (ideo == 1) ? true : false;
     * </pre>
     * @param ch code point to test.
     * @param type UProperty selector constant, identifies which binary
     *        property to check. Must be
     *        UProperty.BINARY_START &lt;= type &lt; UProperty.BINARY_LIMIT or
     *        UProperty.INT_START &lt;= type &lt; UProperty.INT_LIMIT or
     *        UProperty.MASK_START &lt;= type &lt; UProperty.MASK_LIMIT.
     * @return numeric value that is directly the property value or,
     *         for enumerated properties, corresponds to the numeric value of
     *         the enumerated constant of the respective property value
     *         enumeration type (cast to enum type if necessary).
     *         Returns 0 or 1 (for false / true) for binary Unicode properties.
     *         Returns a bit-mask for mask properties.
     *         Returns 0 if 'type' is out of bounds or if the Unicode version
     *         does not have data for the property at all, or not for this
     *         code point.
     * @see UProperty
     * @see #hasBinaryProperty
     * @see #getIntPropertyMinValue
     * @see #getIntPropertyMaxValue
     * @see #getUnicodeVersion
     * @stable ICU 2.4
     */
    // for BiDiBase.java
    public static int getIntPropertyValue(int ch, int type) {
        final UCharacterProperty props = UCharacterProperty.INSTANCE;
        return props.getIntPropertyValue(ch, type);
    }

    // private constructor -----------------------------------------------

    /**
     * Not instantiable: this class only offers static members.
     */
    private UCharacter() { }

    /*
     * Copied from UCharacterEnums.java
     */

    /**
     * Character type Mn
     * @stable ICU 2.1
     */
    public static final byte NON_SPACING_MARK        = 6;
    /**
     * Character type Me
     * @stable ICU 2.1
     */
    public static final byte ENCLOSING_MARK          = 7;
    /**
     * Character type Mc
     * @stable ICU 2.1
     */
    public static final byte COMBINING_SPACING_MARK  = 8;
    /**
     * Character type count
     * @stable ICU 2.1
     */
    public static final byte CHAR_CATEGORY_COUNT     = 30;

    /**
     * Directional type R
     * @stable ICU 2.1
     */
    public static final int RIGHT_TO_LEFT            = 1;
    /**
     * Directional type AL
     * @stable ICU 2.1
     */
    public static final int RIGHT_TO_LEFT_ARABIC     = 13;
}

View file

@ -0,0 +1,313 @@
/*
* Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 1996-2014, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package sun.text.normalizer;
import java.text.CharacterIterator;
/**
* Abstract class that defines an API for iteration on text objects. This is an
* interface for forward and backward iteration and random access into a text
* object. Forward iteration is done with post-increment and backward iteration
* is done with pre-decrement semantics, while the
* <code>java.text.CharacterIterator</code> interface methods provided forward
* iteration with "pre-increment" and backward iteration with pre-decrement
* semantics. This API is more efficient for forward iteration over code points.
* The other major difference is that this API can do both code unit and code point
* iteration, <code>java.text.CharacterIterator</code> can only iterate over
* code units and is limited to BMP (0 - 0xFFFF)
* @author Ram
* @stable ICU 2.4
*/
public abstract class UCharacterIterator
                      implements Cloneable {

    /**
     * Protected default constructor for the subclasses
     * @stable ICU 2.4
     */
    protected UCharacterIterator() {
    }

    /**
     * Sentinel returned by the iteration methods once an end of the
     * UTF16 text has been reached.
     * Moved from UForwardCharacterIterator.java
     * @stable ICU 2.4
     */
    public static final int DONE = -1;

    // static final methods ----------------------------------------------------

    /**
     * Creates a <code>UCharacterIterator</code> over a source string.
     * @param source a string
     * @return UCharacterIterator object
     * @exception IllegalArgumentException if the argument is null
     * @stable ICU 2.4
     */
    public static final UCharacterIterator getInstance(String source) {
        final UCharacterIterator iterator =
                new ReplaceableUCharacterIterator(source);
        return iterator;
    }

    /**
     * Creates a <code>UCharacterIterator</code> over a source StringBuffer.
     * @param source a string buffer of UTF-16 code units
     * @return UCharacterIterator object
     * @exception IllegalArgumentException if the argument is null
     * @stable ICU 2.4
     */
    public static final UCharacterIterator getInstance(StringBuffer source) {
        final UCharacterIterator iterator =
                new ReplaceableUCharacterIterator(source);
        return iterator;
    }

    /**
     * Creates a <code>UCharacterIterator</code> that wraps a
     * CharacterIterator.
     * @param source a valid CharacterIterator object.
     * @return UCharacterIterator object
     * @exception IllegalArgumentException if the argument is null
     * @stable ICU 2.4
     */
    public static final UCharacterIterator getInstance(CharacterIterator source) {
        final UCharacterIterator iterator =
                new CharacterIteratorWrapper(source);
        return iterator;
    }

    // public methods ----------------------------------------------------------

    /**
     * Returns the length of the text
     * @return length of the text
     * @stable ICU 2.4
     */
    public abstract int getLength();

    /**
     * Gets the current index in text.
     * @return current index in text.
     * @stable ICU 2.4
     */
    public abstract int getIndex();

    /**
     * Returns the UTF16 code unit at index, and increments to the next
     * code unit (post-increment semantics). If index is out of
     * range, DONE is returned, and the iterator is reset to the limit
     * of the text.
     * @return the next UTF16 code unit, or DONE if the index is at the limit
     *         of the text.
     * @stable ICU 2.4
     */
    public abstract int next();

    /**
     * Returns the code point at index, and increments to the next code
     * point (post-increment semantics). If index does not point to a
     * valid surrogate pair, the behavior is the same as
     * <code>next()</code>. Otherwise the iterator is incremented past
     * the surrogate pair, and the code point represented by the pair
     * is returned.
     * @return the next codepoint in text, or DONE if the index is at
     *         the limit of the text.
     * @stable ICU 2.4
     */
    public int nextCodePoint() {
        final int first = next();
        if (!UTF16.isLeadSurrogate((char) first)) {
            // ordinary code unit, lone trail surrogate, or DONE
            return first;
        }
        final int second = next();
        if (UTF16.isTrailSurrogate((char) second)) {
            return UCharacterProperty.getRawSupplementary((char) first,
                                                          (char) second);
        }
        if (second != DONE) {
            // unmatched surrogate so back out
            previous();
        }
        return first;
    }

    /**
     * Decrement to the position of the previous code unit in the
     * text, and return it (pre-decrement semantics). If the
     * resulting index is less than 0, the index is reset to 0 and
     * DONE is returned.
     * @return the previous code unit in the text, or DONE if the new
     *         index is before the start of the text.
     * @stable ICU 2.4
     */
    public abstract int previous();

    /**
     * Retreat to the start of the previous code point in the text,
     * and return it (pre-decrement semantics). If the index is not
     * preceded by a valid surrogate pair, the behavior is the same
     * as <code>previous()</code>. Otherwise the iterator is
     * decremented to the start of the surrogate pair, and the code
     * point represented by the pair is returned.
     * @return the previous code point in the text, or DONE if the new
     *         index is before the start of the text.
     * @stable ICU 2.4
     */
    public int previousCodePoint() {
        final int last = previous();
        if (!UTF16.isTrailSurrogate((char) last)) {
            // ordinary code unit, lone lead surrogate, or DONE
            return last;
        }
        final int beforeLast = previous();
        if (UTF16.isLeadSurrogate((char) beforeLast)) {
            return UCharacterProperty.getRawSupplementary((char) beforeLast,
                                                          (char) last);
        }
        if (beforeLast != DONE) {
            // unmatched trail surrogate so back out
            next();
        }
        return last;
    }

    /**
     * Sets the index to the specified index in the text.
     * @param index the index within the text.
     * @exception IndexOutOfBoundsException is thrown if an invalid index is
     *            supplied
     * @stable ICU 2.4
     */
    public abstract void setIndex(int index);

    /**
     * Sets the current index to the start, i.e. calls
     * <code>setIndex(0)</code>.
     * @stable ICU 2.4
     */
    public void setToStart() {
        final int start = 0;
        setIndex(start);
    }

    /**
     * Fills the buffer with the underlying text storage of the iterator.
     * If the buffer capacity is not enough an exception is thrown. The
     * capacity of the fill in buffer should at least be equal to length of
     * text in the iterator obtained by calling <code>getLength()</code>.
     * <b>Usage:</b>
     *
     * <pre>{@code
     *         UCharacterIterator iter = UCharacterIterator.getInstance(text);
     *         char[] buf = new char[iter.getLength()];
     *         iter.getText(buf);
     *
     *         OR
     *         char[] buf= new char[1];
     *         int len = 0;
     *         for(;;){
     *             try{
     *                 len = iter.getText(buf);
     *                 break;
     *             }catch(IndexOutOfBoundsException e){
     *                 buf = new char[iter.getLength()];
     *             }
     *         }
     * }</pre>
     *
     * @param fillIn an array of chars to fill with the underlying UTF-16 code
     *        units.
     * @param offset the position within the array to start putting the data.
     * @return the number of code units added to fillIn, as a convenience
     * @exception IndexOutOfBoundsException exception if there is not enough
     *            room after offset in the array, or if {@code offset < 0}.
     * @stable ICU 2.4
     */
    public abstract int getText(char[] fillIn, int offset);

    /**
     * Convenience override for <code>getText(char[], int)</code> that provides
     * an offset of 0.
     * @param fillIn an array of chars to fill with the underlying UTF-16 code
     *        units.
     * @return the number of code units added to fillIn, as a convenience
     * @exception IndexOutOfBoundsException exception if there is not enough
     *            room in the array.
     * @stable ICU 2.4
     */
    public final int getText(char[] fillIn) {
        final int offset = 0;
        return getText(fillIn, offset);
    }

    /**
     * Convenience method for returning the underlying text storage as a
     * string. A buffer of <code>getLength()</code> chars is filled through
     * <code>getText(char[])</code> and converted.
     * @return the underlying text storage in the iterator as a string
     * @stable ICU 2.4
     */
    public String getText() {
        final int length = getLength();
        final char[] units = new char[length];
        getText(units);
        return new String(units);
    }

    /**
     * Moves the current position by the number of code points
     * specified, either forward or backward depending on the sign of
     * delta (positive or negative respectively). If the current index
     * is at a trail surrogate then the first adjustment is by code
     * unit, and the remaining adjustments are by code points.
     * <p>Movement stops as soon as <code>nextCodePoint()</code> or
     * <code>previousCodePoint()</code> reports DONE. If that happens
     * before the full distance has been covered, an
     * IndexOutOfBoundsException is thrown and the iterator is left where
     * those calls put it.
     * @param delta the number of code points to move the current index.
     * @return the new index
     * @exception IndexOutOfBoundsException is thrown if the start or limit
     *            of the text is reached before delta code points were
     *            traversed
     * @stable ICU 2.4
     *
     */
    public int moveCodePointIndex(int delta) {
        int remaining = delta;
        if (remaining > 0) {
            while (remaining > 0) {
                if (nextCodePoint() == DONE) {
                    break;
                }
                remaining--;
            }
        } else {
            while (remaining < 0) {
                if (previousCodePoint() == DONE) {
                    break;
                }
                remaining++;
            }
        }
        if (remaining != 0) {
            throw new IndexOutOfBoundsException();
        }
        return getIndex();
    }

    /**
     * Creates a copy of this iterator, independent from other iterators.
     * This base implementation simply delegates to
     * <code>Object.clone()</code>.
     * @return copy of this iterator
     * @exception CloneNotSupportedException declared so that the underlying
     *            <code>Object.clone()</code> call and overriding subclasses
     *            can signal that cloning is not possible
     * @stable ICU 2.4
     */
    @Override
    public Object clone() throws CloneNotSupportedException {
        final Object copy = super.clone();
        return copy;
    }
}

View file

@ -0,0 +1,607 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 1996-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.util.Iterator;
import java.util.MissingResourceException;
import sun.text.normalizer.UCharacter.HangulSyllableType;
import sun.text.normalizer.UCharacter.NumericType;
/**
* <p>Internal class used for Unicode character property database.</p>
* <p>This class stores binary data read from uprops.icu.
* It does not have the capability to parse the data into more high-level
* information. It only returns bytes of information when required.</p>
* <p>Due to the form most commonly used for retrieval, array of char is used
* to store the binary data.</p>
* <p>UCharacterPropertyDB also contains information on accessing indexes to
* significant points in the binary data.</p>
* <p>Responsibility for molding the binary data into a more meaningful form lies on
* <a href=UCharacter.html>UCharacter</a>.</p>
* @author Syn Wee Quek
* @since release 2.1, february 1st 2002
*/
final class UCharacterProperty
{
// public data members -----------------------------------------------
/*
* public singleton instance
* (declared final without an initializer here, so it is assigned in a
* static initializer elsewhere in this class)
*/
public static final UCharacterProperty INSTANCE;
/**
* Trie data: the main property trie that getProperty(int) queries
*/
public Trie2_16 m_trie_;
/**
* Unicode version
*/
public VersionInfo m_unicodeVersion_;
/**
* Character type mask: selects the low 5 bits of a main-trie property
* value, which getType(int) returns as the character type
*/
public static final int TYPE_MASK = 0x1F;
// uprops.h enum UPropertySource --------------------------------------- ***
// The SRC_* values identify the data source a property is read from.
/** From uchar.c/uprops.icu main trie */
public static final int SRC_CHAR=1;
/** From uchar.c/uprops.icu properties vectors trie */
public static final int SRC_PROPSVEC=2;
/** From ubidi_props.c/ubidi.icu */
public static final int SRC_BIDI=5;
/** From normalizer2impl.cpp/nfc.nrm */
public static final int SRC_NFC=8;
/** From normalizer2impl.cpp/nfkc.nrm */
public static final int SRC_NFKC=9;
// public methods ----------------------------------------------------
/**
 * Looks up the main property value of a code point in the main trie.
 * @param ch code point whose property value is to be retrieved
 * @return the value stored for {@code ch} in {@code m_trie_}
 */
public final int getProperty(int ch)
{
    final Trie2_16 mainTrie = m_trie_;
    return mainTrie.get(ch);
}
/**
 * Gets the unicode additional properties.
 * Java version of C u_getUnicodeProperties().
 * <p>The additional trie maps the code point to the start of its row in
 * the additional-properties vector array; {@code column} selects the
 * entry within that row.
 * @param codepoint codepoint whose additional properties are to be
 *                  retrieved
 * @param column The column index; must not be negative.
 * @return the properties word in the requested column, or 0 if the
 *         column index is not below the number of available columns
 */
public int getAdditional(int codepoint, int column) {
    assert column >= 0;
    if (column < m_additionalColumnsCount_) {
        final int rowStart = m_additionalTrie_.get(codepoint);
        return m_additionalVectors_[rowStart + column];
    }
    // no data in this column
    return 0;
}
/**
 * <p>Get the "age" of the code point.</p>
 * <p>The "age" is the Unicode version when the code point was first
 * designated (as a non-character or for Private Use) or assigned a
 * character.</p>
 * <p>This can be useful to avoid emitting code points to receiving
 * processes that do not accept newer characters.</p>
 * <p>The data is from the UCD file DerivedAge.txt.</p>
 * <p>The age is taken from column 0 of the additional properties: the
 * word is shifted down by {@code AGE_SHIFT_} and the two lowest nibbles
 * of the result become the first two version fields.</p>
 * <p>This API does not check the validity of the codepoint.</p>
 * @param codepoint The code point.
 * @return the Unicode version number
 */
public VersionInfo getAge(int codepoint)
{
    final int ageBits = getAdditional(codepoint, 0) >> AGE_SHIFT_;
    // the masks below discard any sign-extension bits from the shift above
    final int highNibble = (ageBits >> FIRST_NIBBLE_SHIFT_) & LAST_NIBBLE_MASK_;
    final int lowNibble = ageBits & LAST_NIBBLE_MASK_;
    return VersionInfo.getInstance(highNibble, lowNibble, 0, 0);
}
// int-value and enumerated properties --------------------------------- ***
/**
 * Returns the character type of a code point: the bits of its main
 * property value selected by {@code TYPE_MASK}.
 * @param c the code point to query
 * @return the masked type bits of the main property value
 */
public int getType(int c) {
    final int propertyWord = getProperty(c);
    return propertyWord & TYPE_MASK;
}
/*
* Map some of the Grapheme Cluster Break values to Hangul Syllable Types.
* Hangul_Syllable_Type is fully redundant with a subset of Grapheme_Cluster_Break.
*
* The table is indexed by Grapheme Cluster Break value; the comment on
* each entry names the GCB value at that index, so the order of the
* entries must not be changed.
*/
private static final int /* UHangulSyllableType */ gcbToHst[]={
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_OTHER */
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CONTROL */
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_CR */
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_EXTEND */
HangulSyllableType.LEADING_JAMO, /* U_GCB_L */
HangulSyllableType.NOT_APPLICABLE, /* U_GCB_LF */
HangulSyllableType.LV_SYLLABLE, /* U_GCB_LV */
HangulSyllableType.LVT_SYLLABLE, /* U_GCB_LVT */
HangulSyllableType.TRAILING_JAMO, /* U_GCB_T */
HangulSyllableType.VOWEL_JAMO /* U_GCB_V */
/*
* Omit GCB values beyond what we need for hst.
* The code below checks for the array length.
* NOTE(review): no length-checking reader of this table is visible in
* this part of the file - confirm that one still exists.
*/
};
/**
 * Describes how to read one integer-valued property for a code point.
 * The default implementation extracts a bit field from a properties
 * vector word; subclasses may override {@link #getValue(int)}.
 */
private class IntProperty {
    /** SRC_PROPSVEC column, or "source" if mask==0. */
    int column;
    /** Bits of the vector word that hold the value; 0 for source-based properties. */
    int mask;
    /** Right shift applied after masking. */
    int shift;

    /** Creates a property stored as a bit field of a properties vector word. */
    IntProperty(int column, int mask, int shift) {
        this.column = column;
        this.mask = mask;
        this.shift = shift;
    }

    /** Creates a source-based property: mask and shift are both 0. */
    IntProperty(int source) {
        this(source, 0, 0);
    }

    /**
     * Returns the value of a systematic, directly stored property.
     * @param c code point
     * @return the masked and shifted bit field of the vector word
     */
    int getValue(int c) {
        final int word = getAdditional(c, column);
        return (word & mask) >>> shift;
    }
}
/**
* IntProperty whose source is SRC_BIDI. It does not override getValue()
* itself; the visible user (intProp) supplies the lookup in an anonymous
* subclass.
*/
private class BiDiIntProperty extends IntProperty {
BiDiIntProperty() {
// source-based property: mask stays 0
super(SRC_BIDI);
}
}
/**
* IntProperty for a source-based property; the source identifier is passed
* through unchanged to IntProperty(int).
* NOTE(review): the name suggests canonical combining class, but no use of
* this class is visible in this part of the file - confirm.
*/
private class CombiningClassIntProperty extends IntProperty {
CombiningClassIntProperty(int source) {
super(source);
}
}
/**
 * IntProperty for the UCHAR_NF*_QUICK_CHECK properties. Besides the
 * source it records which property is meant and a maximum value; this
 * class only stores them.
 */
private class NormQuickCheckIntProperty extends IntProperty {
    /** Property selector given at construction. */
    int which;
    /** Maximum value given at construction. */
    int max;

    NormQuickCheckIntProperty(int source, int which, int max) {
        super(source);
        this.max = max;
        this.which = which;
    }
}
// Property object used by getIntPropertyValue() for BIDI_PAIRED_BRACKET_TYPE.
// It overrides getValue() so the answer comes from UBiDiProps instead of the
// properties-vector bit field read by IntProperty.getValue().
private IntProperty intProp = new BiDiIntProperty() { // BIDI_PAIRED_BRACKET_TYPE
int getValue(int c) {
return UBiDiProps.INSTANCE.getPairedBracketType(c);
}
};
/**
 * Returns the value of an integer-valued property for a code point.
 * Only BIDI_PAIRED_BRACKET_TYPE is supported here.
 * @param c     code point
 * @param which property selector
 * @return the property value, or 0 for any other (undefined) selector
 */
public int getIntPropertyValue(int c, int which) {
    return (which == BIDI_PAIRED_BRACKET_TYPE)
        ? intProp.getValue(c)
        : 0; // undefined
}
/**
 * Combines a lead and a trail surrogate into a supplementary code point.<br>
 * This is for internal use, so the arguments are not checked for being
 * valid surrogate characters.
 * @param lead  lead surrogate character
 * @param trail trailing surrogate character
 * @return code point of the supplementary character
 */
public static int getRawSupplementary(char lead, char trail)
{
    final int shiftedLead = lead << LEAD_SURROGATE_SHIFT_;
    return shiftedLead + trail + SURROGATE_OFFSET_;
}
/**
 * Gets the single-bit mask for a character type.
 * @param type character type, used as the bit number
 * @return a mask with only bit {@code type} set
 */
public static final int getMask(int type)
{
    final int mask = 1 << type;
    return mask;
}
/**
 * Returns the digit values of characters like 'A' - 'Z', normal,
 * half-width and full-width. Upper- and lower-case letters both map to
 * 10..35. This method assumes that the other digit characters are
 * checked by the calling method.
 * @param ch character to test
 * @return -1 if ch is not a letter in one of the four accepted ranges,
 *         otherwise its corresponding digit value
 */
public static int getEuropeanDigit(int ch) {
    // ASCII 'A'..'Z'
    if (ch >= 0x41 && ch <= 0x5a) {
        return ch + 10 - 0x41;
    }
    // ASCII 'a'..'z'
    if (ch >= 0x61 && ch <= 0x7a) {
        return ch + 10 - 0x61;
    }
    // full-width upper-case letters
    if (ch >= 0xff21 && ch <= 0xff3a) {
        return ch + 10 - 0xff21;
    }
    // full-width lower-case letters
    if (ch >= 0xff41 && ch <= 0xff5a) {
        return ch + 10 - 0xff41;
    }
    return -1;
}
/**
 * Returns the decimal digit value of a code point.
 * The numeric type/value field of the main property word is rebased so
 * that the first decimal entry becomes 0.
 * @param c code point
 * @return the rebased value when it is at most 9 (a field of NTV_NONE_
 *         yields -1), otherwise -1
 */
public int digit(int c) {
    final int ntv = getNumericTypeValue(getProperty(c));
    final int value = ntv - NTV_DECIMAL_START_;
    return (value <= 9) ? value : -1;
}
// protected variables -----------------------------------------------
/**
* Extra property trie. Maps a code point to the index of its row in
* m_additionalVectors_ (see getAdditional()). Only read from the data file
* when m_additionalColumnsCount_ is greater than 0.
*/
Trie2_16 m_additionalTrie_;
/**
* Extra property vectors, 1st column for age and second for binary
* properties.
*/
int m_additionalVectors_[];
/**
* Number of additional columns; getAdditional() returns 0 for any column
* index at or above this count.
*/
int m_additionalColumnsCount_;
/**
* Maximum values for block and script, bits used as in vector word 0.
* Read from the index array of the data file.
*/
int m_maxBlockScriptValue_;
/**
* Further maximum values, read from the index array of the data file.
* NOTE(review): the original comment described this as "maximum values for
* script", which looks copied from m_maxBlockScriptValue_; the name suggests
* other properties (JTG) - confirm against the data format.
*/
int m_maxJTGValue_;
/**
* Script_Extensions data. Stays null when the data file contains no
* Script_Extensions section.
*/
public char[] m_scriptExtensions_;
// private variables -------------------------------------------------
/**
* Resource name of the properties data file loaded by the constructor.
*/
private static final String DATA_FILE_NAME_ = "/sun/text/resources/uprops.icu";
/**
* Shift value for lead surrogate to form a supplementary character.
*/
private static final int LEAD_SURROGATE_SHIFT_ = 10;
/**
* Offset added in getRawSupplementary() after shifting the lead surrogate
* and adding the trail surrogate; it removes the surrogate base values so
* that no masking is needed.
*/
private static final int SURROGATE_OFFSET_ =
UTF16.SUPPLEMENTARY_MIN_VALUE -
(UTF16.SURROGATE_MIN_VALUE <<
LEAD_SURROGATE_SHIFT_) -
UTF16.TRAIL_SURROGATE_MIN_VALUE;
// property data constants -------------------------------------------------
/**
 * Numeric types and values in the main properties words: the combined
 * type/value field starts at this bit.
 */
private static final int NUMERIC_TYPE_VALUE_SHIFT_ = 6;

/**
 * Extracts the combined numeric type/value field from a main property word.
 * @param props main property word
 * @return the word shifted right (arithmetically) by NUMERIC_TYPE_VALUE_SHIFT_
 */
private static final int getNumericTypeValue(int props) {
    final int ntv = props >> NUMERIC_TYPE_VALUE_SHIFT_;
    return ntv;
}
/* constants for the storage form of numeric types and values */
/** No numeric value. */
private static final int NTV_NONE_ = 0;
/** Decimal digits: nv=0..9; first storage value of that range. */
private static final int NTV_DECIMAL_START_ = 1;
/** Other digits: nv=0..9; first storage value of that range. */
private static final int NTV_DIGIT_START_ = 11;
/** Small integers: nv=0..154; first storage value of that range. */
private static final int NTV_NUMERIC_START_ = 21;
/**
 * Classifies a stored numeric type/value into its numeric type.
 * @param ntv stored numeric type/value
 * @return NumericType.NONE for NTV_NONE_, DECIMAL below NTV_DIGIT_START_,
 *         DIGIT below NTV_NUMERIC_START_, and NUMERIC otherwise
 */
private static final int ntvGetType(int ntv) {
    if (ntv == NTV_NONE_) {
        return NumericType.NONE;
    }
    if (ntv < NTV_DIGIT_START_) {
        return NumericType.DECIMAL;
    }
    if (ntv < NTV_NUMERIC_START_) {
        return NumericType.DIGIT;
    }
    return NumericType.NUMERIC;
}
/*
* Properties in vector word 0
* Bits
* 31..24 DerivedAge version major/minor one nibble each
* 23..22 3..1: Bits 7..0 = Script_Extensions index
* 3: Script value from Script_Extensions
* 2: Script=Inherited
* 1: Script=Common
* 0: Script=bits 7..0
* 21..20 reserved
* 19..17 East Asian Width
* 16.. 8 UBlockCode
* 7.. 0 UScriptCode
*/
/**
* Script_Extensions: mask includes Script
* (bits 23..22 together with bits 7..0 of vector word 0).
*/
public static final int SCRIPT_X_MASK = 0x00c000ff;
//private static final int SCRIPT_X_SHIFT = 22;
/**
* Mask for the East Asian cell width field (bits 19..17 of vector word 0).
* Equivalent to icu4c UPROPS_EA_MASK
*/
private static final int EAST_ASIAN_MASK_ = 0x000e0000;
/**
* Shift for the East Asian cell width field.
* Equivalent to icu4c UPROPS_EA_SHIFT
*/
private static final int EAST_ASIAN_SHIFT_ = 17;
/**
* Mask for the block field (bits 16..8 of vector word 0).
* Equivalent to icu4c UPROPS_BLOCK_MASK
*/
private static final int BLOCK_MASK_ = 0x0001ff00;
/**
* Shift for the block field.
* Equivalent to icu4c UPROPS_BLOCK_SHIFT
*/
private static final int BLOCK_SHIFT_ = 8;
/**
* Mask for the script field (bits 7..0 of vector word 0); no shift is needed.
* Equivalent to icu4c UPROPS_SCRIPT_MASK
*/
public static final int SCRIPT_MASK_ = 0x000000ff;
/*
* Additional properties used in internal trie data
*
* Properties in vector word 1
* Each bit encodes one binary property.
* The following constants represent the bit number, use 1<<UPROPS_XYZ.
* UPROPS_BINARY_1_TOP<=32!
*
* Keep this list of property enums in sync with
* propListNames[] in icu/source/tools/genprops/props2.c!
*
* ICU 2.6/uprops format version 3.2 stores full properties instead of "Other_".
*/
private static final int WHITE_SPACE_PROPERTY_ = 0;
private static final int DASH_PROPERTY_ = 1;
private static final int HYPHEN_PROPERTY_ = 2;
private static final int QUOTATION_MARK_PROPERTY_ = 3;
private static final int TERMINAL_PUNCTUATION_PROPERTY_ = 4;
private static final int MATH_PROPERTY_ = 5;
private static final int HEX_DIGIT_PROPERTY_ = 6;
private static final int ASCII_HEX_DIGIT_PROPERTY_ = 7;
private static final int ALPHABETIC_PROPERTY_ = 8;
private static final int IDEOGRAPHIC_PROPERTY_ = 9;
private static final int DIACRITIC_PROPERTY_ = 10;
private static final int EXTENDER_PROPERTY_ = 11;
private static final int NONCHARACTER_CODE_POINT_PROPERTY_ = 12;
private static final int GRAPHEME_EXTEND_PROPERTY_ = 13;
private static final int GRAPHEME_LINK_PROPERTY_ = 14;
private static final int IDS_BINARY_OPERATOR_PROPERTY_ = 15;
private static final int IDS_TRINARY_OPERATOR_PROPERTY_ = 16;
private static final int RADICAL_PROPERTY_ = 17;
private static final int UNIFIED_IDEOGRAPH_PROPERTY_ = 18;
private static final int DEFAULT_IGNORABLE_CODE_POINT_PROPERTY_ = 19;
private static final int DEPRECATED_PROPERTY_ = 20;
private static final int LOGICAL_ORDER_EXCEPTION_PROPERTY_ = 21;
private static final int XID_START_PROPERTY_ = 22;
private static final int XID_CONTINUE_PROPERTY_ = 23;
private static final int ID_START_PROPERTY_ = 24;
private static final int ID_CONTINUE_PROPERTY_ = 25;
private static final int GRAPHEME_BASE_PROPERTY_ = 26;
private static final int S_TERM_PROPERTY_ = 27;
private static final int VARIATION_SELECTOR_PROPERTY_ = 28;
private static final int PATTERN_SYNTAX = 29; /* new in ICU 3.4 and Unicode 4.1 */
private static final int PATTERN_WHITE_SPACE = 30;
/*
* Properties in vector word 2
* Bits
* 31..26 reserved
* 25..20 Line Break
* 19..15 Sentence Break
* 14..10 Word Break
* 9.. 5 Grapheme Cluster Break
* 4.. 0 Decomposition Type
*
* Each *_MASK selects one of the fields above and the matching *_SHIFT
* moves it down to bit 0.
*/
private static final int LB_MASK = 0x03f00000;
private static final int LB_SHIFT = 20;
private static final int SB_MASK = 0x000f8000;
private static final int SB_SHIFT = 15;
private static final int WB_MASK = 0x00007c00;
private static final int WB_SHIFT = 10;
private static final int GCB_MASK = 0x000003e0;
private static final int GCB_SHIFT = 5;
/**
* Integer properties mask for decomposition type (bits 4..0, no shift needed).
* Equivalent to icu4c UPROPS_DT_MASK.
*/
private static final int DECOMPOSITION_TYPE_MASK_ = 0x0000001f;
/**
* Shift that moves the upper nibble of the age byte down to bit 0;
* getAge() uses it to obtain the major version.
*/
private static final int FIRST_NIBBLE_SHIFT_ = 0x4;
/**
* Mask selecting a single nibble; getAge() applies it to both the major
* and the minor version.
*/
private static final int LAST_NIBBLE_MASK_ = 0xF;
/**
* Shift that moves the age byte (bits 31..24 of vector word 0) down to bit 0.
*/
private static final int AGE_SHIFT_ = 24;
// private constructors --------------------------------------------------
/**
* Constructor. Loads the properties data file DATA_FILE_NAME_ and reads, in
* file order: the header, the 16-entry index array, the main properties
* trie, the additional-properties trie with its vectors (only when there is
* at least one additional column), and the Script_Extensions data.
* The reads are strictly sequential on one buffer, so their order must not
* be changed.
* @exception IOException thrown when data reading fails or data corrupted
*/
private UCharacterProperty() throws IOException
{
// jar access
ByteBuffer bytes=ICUBinary.getRequiredData(DATA_FILE_NAME_);
// validates the header against DATA_FORMAT / IsAcceptable and keeps the returned version
m_unicodeVersion_ = ICUBinary.readHeaderAndDataVersion(bytes, DATA_FORMAT, new IsAcceptable());
// Read or skip the 16 indexes.
// The offsets are scaled by 4 below, i.e. they count 32-bit words.
int propertyOffset = bytes.getInt();
/* exceptionOffset = */ bytes.getInt();
/* caseOffset = */ bytes.getInt();
int additionalOffset = bytes.getInt();
int additionalVectorsOffset = bytes.getInt();
m_additionalColumnsCount_ = bytes.getInt();
int scriptExtensionsOffset = bytes.getInt();
int reservedOffset7 = bytes.getInt();
/* reservedOffset8 = */ bytes.getInt();
/* dataTopOffset = */ bytes.getInt();
m_maxBlockScriptValue_ = bytes.getInt();
m_maxJTGValue_ = bytes.getInt();
// 12 of the 16 indexes were consumed above; skip the remaining 4 ints (4 bytes each)
ICUBinary.skipBytes(bytes, (16 - 12) << 2);
// read the main properties trie
m_trie_ = Trie2_16.createFromSerialized(bytes);
// the trie occupies the words between the 16 indexes and propertyOffset
int expectedTrieLength = (propertyOffset - 16) * 4;
int trieLength = m_trie_.getSerializedLength();
if(trieLength > expectedTrieLength) {
throw new IOException("uprops.icu: not enough bytes for main trie");
}
// skip padding after trie bytes
ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);
// skip unused intervening data structures
ICUBinary.skipBytes(bytes, (additionalOffset - propertyOffset) * 4);
if(m_additionalColumnsCount_ > 0) {
// reads the additional property block
m_additionalTrie_ = Trie2_16.createFromSerialized(bytes);
expectedTrieLength = (additionalVectorsOffset-additionalOffset)*4;
trieLength = m_additionalTrie_.getSerializedLength();
if(trieLength > expectedTrieLength) {
throw new IOException("uprops.icu: not enough bytes for additional-properties trie");
}
// skip padding after trie bytes
ICUBinary.skipBytes(bytes, expectedTrieLength - trieLength);
// additional properties
// the vectors fill every word up to the Script_Extensions section
int size = scriptExtensionsOffset - additionalVectorsOffset;
m_additionalVectors_ = new int[size];
for (int i = 0; i < size; i ++) {
m_additionalVectors_[i] = bytes.getInt();
}
}
// Script_Extensions
// the section length is given in 32-bit words; each word holds two chars
int numChars = (reservedOffset7 - scriptExtensionsOffset) * 2;
if(numChars > 0) {
m_scriptExtensions_ = new char[numChars];
for(int i = 0; i < numChars; ++i) {
m_scriptExtensions_[i] = bytes.getChar();
}
}
}
/**
 * Version check handed to ICUBinary.readHeaderAndDataVersion(): only data
 * whose first version byte is 7 is accepted.
 */
private static final class IsAcceptable implements ICUBinary.Authenticate {
    // @Override when we switch to Java 6
    public boolean isDataVersionAcceptable(byte[] version) {
        final byte firstVersionByte = version[0];
        return firstVersionByte == 7;
    }
}
/** Data format signature that the constructor passes to ICUBinary.readHeaderAndDataVersion(). */
private static final int DATA_FORMAT = 0x5550726F; // "UPro"
/**
 * Adds to {@code set} the start code point of each same-value range of
 * the properties vectors trie. Iteration stops at the first range that
 * is flagged as a lead-surrogate range.
 * @param set set that receives the range start code points
 */
public void upropsvec_addPropertyStarts(UnicodeSet set) {
    if (m_additionalColumnsCount_ <= 0) {
        /* without additional columns the properties vectors trie may not be there at all */
        return;
    }
    final Iterator<Trie2.Range> ranges = m_additionalTrie_.iterator();
    while (ranges.hasNext()) {
        final Trie2.Range current = ranges.next();
        if (current.leadSurrogate) {
            break;
        }
        set.add(current.startCodePoint);
    }
}
// This static initializer block must be placed after
// other static member initialization
static {
    try {
        INSTANCE = new UCharacterProperty();
    }
    catch (IOException e) {
        // MissingResourceException has no constructor that takes a cause, so
        // chain the IOException explicitly; otherwise only its message
        // survives and the original stack trace is lost.
        MissingResourceException missing =
            new MissingResourceException(e.getMessage(), DATA_FILE_NAME_, "");
        missing.initCause(e);
        throw missing;
    }
}
// Moved from UProperty.java
/**
* Selector for the enumerated property Bidi_Paired_Bracket_Type (new in
* Unicode 6.3).
* Used in UAX #9: Unicode Bidirectional Algorithm
* (http://www.unicode.org/reports/tr9/)
* Pass it as the "which" argument of getIntPropertyValue(); the value
* returned for this property is a UCharacter.BidiPairedBracketType value.
* @stable ICU 52
*/
public static final int BIDI_PAIRED_BRACKET_TYPE = 0x1015;
}

View file

@ -0,0 +1,616 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/**
*******************************************************************************
* Copyright (C) 1996-2014, International Business Machines Corporation and
* others. All Rights Reserved.
*******************************************************************************
*/
package sun.text.normalizer;
/**
* <p>Standalone utility class providing UTF16 character conversions and
* indexing conversions.
* <p>Code that uses strings alone rarely needs modification.
* By design, UTF-16 does not allow overlap, so searching for strings is a safe
* operation. Similarly, concatenation is always safe. Substringing is safe if
* the start and end are both on UTF-32 boundaries. In normal code, the values
* for start and end are on those boundaries, since they arose from operations
* like searching. If not, the nearest UTF-32 boundaries can be determined
* using <code>bounds()</code>.
* <strong>Examples:</strong>
* <p>The following examples illustrate use of some of these methods.
* <pre>{@code
* // iteration forwards: Original
* for (int i = 0; i < s.length(); ++i) {
* char ch = s.charAt(i);
* doSomethingWith(ch);
* }
*
* // iteration forwards: Changes for UTF-32
* int ch;
* for (int i = 0; i < s.length(); i += UTF16.getCharCount(ch)) {
* ch = UTF16.charAt(s, i);
* doSomethingWith(ch);
* }
*
* // iteration backwards: Original
* for (int i = s.length() - 1; i >= 0; --i) {
* char ch = s.charAt(i);
* doSomethingWith(ch);
* }
*
* // iteration backwards: Changes for UTF-32
* int ch;
* for (int i = s.length() - 1; i >= 0; i -= UTF16.getCharCount(ch)) {
* ch = UTF16.charAt(s, i);
* doSomethingWith(ch);
* }
* }</pre>
* <strong>Notes:</strong>
* <ul>
* <li>
* <strong>Naming:</strong> For clarity, High and Low surrogates are called
* <code>Lead</code> and <code>Trail</code> in the API, which gives a better
* sense of their ordering in a string. <code>offset16</code> and
* <code>offset32</code> are used to distinguish offsets to UTF-16
* boundaries vs offsets to UTF-32 boundaries. <code>int char32</code> is
* used to contain UTF-32 characters, as opposed to <code>char16</code>,
* which is a UTF-16 code unit.
* </li>
* <li>
* <strong>Roundtripping Offsets:</strong> You can always roundtrip from a
* UTF-32 offset to a UTF-16 offset and back. Because of the difference in
* structure, you can roundtrip from a UTF-16 offset to a UTF-32 offset and
* back if and only if <code>bounds(string, offset16) != TRAIL</code>.
* </li>
* <li>
* <strong>Exceptions:</strong> The error checking will throw an exception
* if indices are out of bounds. Other than that, all methods will
* behave reasonably, even if unmatched surrogates or out-of-bounds UTF-32
* values are present. <code>UCharacter.isLegal()</code> can be used to check
* for validity if desired.
* </li>
* <li>
* <strong>Unmatched Surrogates:</strong> If the string contains unmatched
* surrogates, then these are counted as one UTF-32 value. This matches
* their iteration behavior, which is vital. It also matches common display
* practice as missing glyphs (see the Unicode Standard Section 5.4, 5.5).
* </li>
* <li>
* <strong>Optimization:</strong> The method implementations may need
* optimization if the compiler doesn't fold static final methods. Since
* surrogate pairs will form an exceedingly small percentage of all the text
* in the world, the singleton case should always be optimized for.
* </li>
* </ul>
* @author Mark Davis, with help from Markus Scherer
* @stable ICU 2.1
*/
public final class UTF16
{
// public variables ---------------------------------------------------
/**
* The lowest Unicode code point value.
* @stable ICU 2.1
*/
public static final int CODEPOINT_MIN_VALUE = 0;
/**
* The highest Unicode code point value (scalar value) according to the
* Unicode Standard.
* @stable ICU 2.1
*/
public static final int CODEPOINT_MAX_VALUE = 0x10ffff;
/**
* The minimum value for Supplementary code points
* @stable ICU 2.1
*/
public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000;
/**
* Lead surrogate minimum value
* @stable ICU 2.1
*/
public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800;
/**
* Trail surrogate minimum value
* @stable ICU 2.1
*/
public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00;
/**
* Lead surrogate maximum value
* @stable ICU 2.1
*/
public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF;
/**
* Trail surrogate maximum value
* @stable ICU 2.1
*/
public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF;
/**
* Surrogate minimum value
* @stable ICU 2.1
*/
public static final int SURROGATE_MIN_VALUE = LEAD_SURROGATE_MIN_VALUE;
/**
* Lead surrogate bitmask
*/
private static final int LEAD_SURROGATE_BITMASK = 0xFFFFFC00;
/**
* Trail surrogate bitmask
*/
private static final int TRAIL_SURROGATE_BITMASK = 0xFFFFFC00;
/**
* Surrogate bitmask
*/
private static final int SURROGATE_BITMASK = 0xFFFFF800;
/**
* Lead surrogate bits
*/
private static final int LEAD_SURROGATE_BITS = 0xD800;
/**
* Trail surrogate bits
*/
private static final int TRAIL_SURROGATE_BITS = 0xDC00;
/**
* Surrogate bits
*/
private static final int SURROGATE_BITS = 0xD800;
// constructor --------------------------------------------------------
// /CLOVER:OFF
/**
* Prevent instance from being created.
*/
private UTF16() {
}
// /CLOVER:ON
// public method ------------------------------------------------------
/**
* Extract a single UTF-32 value from a string.
* Used when iterating forwards or backwards (with
* <code>UTF16.getCharCount()</code>, as well as random access. If a
* validity check is required, use
* <code><a href="../lang/UCharacter.html#isLegal(char)">
* UCharacter.isLegal()</a></code> on the return value.
* If the char retrieved is part of a surrogate pair, its supplementary
* character will be returned. If a complete supplementary character is
* not found the incomplete character will be returned
* @param source array of UTF-16 chars
* @param offset16 UTF-16 offset to the start of the character.
* @return UTF-32 value for the UTF-32 value that contains the char at
* offset16. The boundaries of that codepoint are the same as in
* <code>bounds32()</code>.
* @exception IndexOutOfBoundsException thrown if offset16 is out of
* bounds.
* @stable ICU 2.1
*/
public static int charAt(String source, int offset16) {
char single = source.charAt(offset16);
if (single < LEAD_SURROGATE_MIN_VALUE) {
return single;
}
return _charAt(source, offset16, single);
}
private static int _charAt(String source, int offset16, char single) {
if (single > TRAIL_SURROGATE_MAX_VALUE) {
return single;
}
// Convert the UTF-16 surrogate pair if necessary.
// For simplicity in usage, and because the frequency of pairs is
// low, look both directions.
if (single <= LEAD_SURROGATE_MAX_VALUE) {
++offset16;
if (source.length() != offset16) {
char trail = source.charAt(offset16);
if (trail >= TRAIL_SURROGATE_MIN_VALUE && trail <= TRAIL_SURROGATE_MAX_VALUE) {
return UCharacterProperty.getRawSupplementary(single, trail);
}
}
} else {
--offset16;
if (offset16 >= 0) {
// single is a trail surrogate so
char lead = source.charAt(offset16);
if (lead >= LEAD_SURROGATE_MIN_VALUE && lead <= LEAD_SURROGATE_MAX_VALUE) {
return UCharacterProperty.getRawSupplementary(lead, single);
}
}
}
return single; // return unmatched surrogate
}
/**
* Extract a single UTF-32 value from a string.
* Used when iterating forwards or backwards (with
* <code>UTF16.getCharCount()</code>, as well as random access. If a
* validity check is required, use
* <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
* </a></code> on the return value.
* If the char retrieved is part of a surrogate pair, its supplementary
* character will be returned. If a complete supplementary character is
* not found the incomplete character will be returned
* @param source array of UTF-16 chars
* @param offset16 UTF-16 offset to the start of the character.
* @return UTF-32 value for the UTF-32 value that contains the char at
* offset16. The boundaries of that codepoint are the same as in
* <code>bounds32()</code>.
* @exception IndexOutOfBoundsException thrown if offset16 is out of bounds.
* @stable ICU 2.1
*/
public static int charAt(CharSequence source, int offset16) {
char single = source.charAt(offset16);
if (single < UTF16.LEAD_SURROGATE_MIN_VALUE) {
return single;
}
return _charAt(source, offset16, single);
}
private static int _charAt(CharSequence source, int offset16, char single) {
if (single > UTF16.TRAIL_SURROGATE_MAX_VALUE) {
return single;
}
// Convert the UTF-16 surrogate pair if necessary.
// For simplicity in usage, and because the frequency of pairs is
// low, look both directions.
if (single <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
++offset16;
if (source.length() != offset16) {
char trail = source.charAt(offset16);
if (trail >= UTF16.TRAIL_SURROGATE_MIN_VALUE
&& trail <= UTF16.TRAIL_SURROGATE_MAX_VALUE) {
return UCharacterProperty.getRawSupplementary(single, trail);
}
}
} else {
--offset16;
if (offset16 >= 0) {
// single is a trail surrogate so
char lead = source.charAt(offset16);
if (lead >= UTF16.LEAD_SURROGATE_MIN_VALUE
&& lead <= UTF16.LEAD_SURROGATE_MAX_VALUE) {
return UCharacterProperty.getRawSupplementary(lead, single);
}
}
}
return single; // return unmatched surrogate
}
/**
* Extract a single UTF-32 value from a substring. Used when iterating forwards or backwards
* (with <code>UTF16.getCharCount()</code>, as well as random access. If a validity check is
* required, use <code><a href="../lang/UCharacter.html#isLegal(char)">UCharacter.isLegal()
* </a></code>
* on the return value. If the char retrieved is part of a surrogate pair, its supplementary
* character will be returned. If a complete supplementary character is not found the incomplete
* character will be returned
*
* @param source Array of UTF-16 chars
* @param start Offset to substring in the source array for analyzing
* @param limit Offset to substring in the source array for analyzing
* @param offset16 UTF-16 offset relative to start
* @return UTF-32 value for the UTF-32 value that contains the char at offset16. The boundaries
* of that codepoint are the same as in <code>bounds32()</code>.
* @exception IndexOutOfBoundsException Thrown if offset16 is not within the range of start and limit.
* @stable ICU 2.1
*/
public static int charAt(char source[], int start, int limit, int offset16) {
offset16 += start;
if (offset16 < start || offset16 >= limit) {
throw new ArrayIndexOutOfBoundsException(offset16);
}
char single = source[offset16];
if (!isSurrogate(single)) {
return single;
}
// Convert the UTF-16 surrogate pair if necessary.
// For simplicity in usage, and because the frequency of pairs is
// low, look both directions.
if (single <= LEAD_SURROGATE_MAX_VALUE) {
offset16++;
if (offset16 >= limit) {
return single;
}
char trail = source[offset16];
if (isTrailSurrogate(trail)) {
return UCharacterProperty.getRawSupplementary(single, trail);
}
}
else { // isTrailSurrogate(single), so
if (offset16 == start) {
return single;
}
offset16--;
char lead = source[offset16];
if (isLeadSurrogate(lead))
return UCharacterProperty.getRawSupplementary(lead, single);
}
return single; // return unmatched surrogate
}
/**
* Determines how many chars this char32 requires.
* If a validity check is required, use <code>
* <a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code> on
* char32 before calling.
* @param char32 the input codepoint.
* @return 2 if is in supplementary space, otherwise 1.
* @stable ICU 2.1
*/
public static int getCharCount(int char32)
{
if (char32 < SUPPLEMENTARY_MIN_VALUE) {
return 1;
}
return 2;
}
/**
* Determines whether the code value is a surrogate.
* @param char16 the input character.
* @return true if the input character is a surrogate.
* @stable ICU 2.1
*/
public static boolean isSurrogate(char char16)
{
return (char16 & SURROGATE_BITMASK) == SURROGATE_BITS;
}
/**
* Determines whether the character is a trail surrogate.
* @param char16 the input character.
* @return true if the input character is a trail surrogate.
* @stable ICU 2.1
*/
public static boolean isTrailSurrogate(char char16)
{
return (char16 & TRAIL_SURROGATE_BITMASK) == TRAIL_SURROGATE_BITS;
}
/**
* Determines whether the character is a lead surrogate.
* @param char16 the input character.
* @return true if the input character is a lead surrogate
* @stable ICU 2.1
*/
public static boolean isLeadSurrogate(char char16)
{
return (char16 & LEAD_SURROGATE_BITMASK) == LEAD_SURROGATE_BITS;
}
/**
* Returns the lead surrogate.
* If a validity check is required, use
* <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
* on char32 before calling.
* @param char32 the input character.
* @return lead surrogate if the getCharCount(ch) is 2; <br>
* and 0 otherwise (note: 0 is not a valid lead surrogate).
* @stable ICU 2.1
*/
public static char getLeadSurrogate(int char32)
{
if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
return (char)(LEAD_SURROGATE_OFFSET_ +
(char32 >> LEAD_SURROGATE_SHIFT_));
}
return 0;
}
/**
* Returns the trail surrogate.
* If a validity check is required, use
* <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
* on char32 before calling.
* @param char32 the input character.
* @return the trail surrogate if the getCharCount(ch) is 2; <br> otherwise
* the character itself
* @stable ICU 2.1
*/
public static char getTrailSurrogate(int char32)
{
if (char32 >= SUPPLEMENTARY_MIN_VALUE) {
return (char)(TRAIL_SURROGATE_MIN_VALUE +
(char32 & TRAIL_SURROGATE_MASK_));
}
return (char) char32;
}
/**
* Convenience method corresponding to String.valueOf(char). Returns a one
* or two char string containing the UTF-32 value in UTF16 format. If a
* validity check is required, use
* <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
* on char32 before calling.
* @param char32 the input character.
* @return string value of char32 in UTF16 format
* @exception IllegalArgumentException thrown if char32 is a invalid
* codepoint.
* @stable ICU 2.1
*/
public static String valueOf(int char32)
{
if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
throw new IllegalArgumentException("Illegal codepoint");
}
return toString(char32);
}
/**
* Append a single UTF-32 value to the end of a StringBuffer.
* If a validity check is required, use
* <code><a href="../lang/UCharacter.html#isLegal(char)">isLegal()</a></code>
* on char32 before calling.
* @param target the buffer to append to
* @param char32 value to append.
* @return the updated StringBuffer
* @exception IllegalArgumentException thrown when char32 does not lie
* within the range of the Unicode codepoints
* @stable ICU 2.1
*/
public static StringBuffer append(StringBuffer target, int char32)
{
// Check for irregular values
if (char32 < CODEPOINT_MIN_VALUE || char32 > CODEPOINT_MAX_VALUE) {
throw new IllegalArgumentException("Illegal codepoint: " + Integer.toHexString(char32));
}
// Write the UTF-16 values
if (char32 >= SUPPLEMENTARY_MIN_VALUE)
{
target.append(getLeadSurrogate(char32));
target.append(getTrailSurrogate(char32));
}
else {
target.append((char) char32);
}
return target;
}
/**
* Shifts offset16 by the argument number of codepoints within a subarray.
* @param source char array
* @param start position of the subarray to be performed on
* @param limit position of the subarray to be performed on
* @param offset16 UTF16 position to shift relative to start
* @param shift32 number of codepoints to shift
* @return new shifted offset16 relative to start
* @exception IndexOutOfBoundsException if the new offset16 is out of
* bounds with respect to the subarray or the subarray bounds
* are out of range.
* @stable ICU 2.1
*/
public static int moveCodePointOffset(char source[], int start, int limit,
int offset16, int shift32)
{
int size = source.length;
int count;
char ch;
int result = offset16 + start;
if (start < 0 || limit < start) {
throw new StringIndexOutOfBoundsException(start);
}
if (limit > size) {
throw new StringIndexOutOfBoundsException(limit);
}
if (offset16 < 0 || result > limit) {
throw new StringIndexOutOfBoundsException(offset16);
}
if (shift32 > 0) {
if (shift32 + result > size) {
throw new StringIndexOutOfBoundsException(result);
}
count = shift32;
while (result < limit && count > 0)
{
ch = source[result];
if (isLeadSurrogate(ch) && (result + 1 < limit) &&
isTrailSurrogate(source[result + 1])) {
result++;
}
count--;
result++;
}
} else {
if (result + shift32 < start) {
throw new StringIndexOutOfBoundsException(result);
}
for (count = -shift32; count > 0; count--) {
result--;
if (result < start) {
break;
}
ch = source[result];
if (isTrailSurrogate(ch) && result > start && isLeadSurrogate(source[result - 1])) {
result--;
}
}
}
if (count != 0) {
throw new StringIndexOutOfBoundsException(shift32);
}
result -= start;
return result;
}
// private data members -------------------------------------------------
/**
* Shift value for lead surrogate to form a supplementary character.
*/
private static final int LEAD_SURROGATE_SHIFT_ = 10;
/**
* Mask to retrieve the significant value from a trail surrogate.
*/
private static final int TRAIL_SURROGATE_MASK_ = 0x3FF;
/**
* Value that all lead surrogate starts with
*/
private static final int LEAD_SURROGATE_OFFSET_ =
LEAD_SURROGATE_MIN_VALUE -
(SUPPLEMENTARY_MIN_VALUE
>> LEAD_SURROGATE_SHIFT_);
// private methods ------------------------------------------------------
/**
 * <p>Returns the UTF-16 string form of the argument code point.
 * <p>The code point is not validated; the result is unspecified when an
 * invalid code point is passed in.
 * <p>Code points below SUPPLEMENTARY_MIN_VALUE yield a string of length 1;
 * all others yield the lead surrogate followed by the trail surrogate.
 * @param ch code point
 * @return string representation of the code point
 */
private static String toString(int ch)
{
    if (ch >= SUPPLEMENTARY_MIN_VALUE) {
        // Supplementary code point: emit the surrogate pair.
        StringBuilder pair = new StringBuilder();
        pair.append(getLeadSurrogate(ch));
        pair.append(getTrailSurrogate(ch));
        return pair.toString();
    }
    // Fits in a single UTF-16 code unit.
    return String.valueOf((char) ch);
}
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,273 @@
/*
* Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* Copyright (C) 1996-2011, International Business Machines Corporation and *
* others. All Rights Reserved. *
*******************************************************************************
*/
package sun.text.normalizer;
import java.io.IOException;
import java.util.Locale;
/**
 * Package-private string helpers: escaping text to ASCII, parsing
 * backslash escape sequences, and zero-padded hexadecimal formatting.
 */
final class Utility {

    /**
     * Convert characters outside the range U+0020 to U+007F to
     * Unicode escapes, and convert backslash to a double backslash.
     * Code points up to U+FFFF are written as a backslash, a lowercase
     * 'u' and 4 hex digits; larger code points as a backslash, an
     * uppercase 'U' and 8 hex digits.
     *
     * @param s the text to escape
     * @return the escaped text
     */
    public static final String escape(String s) {
        StringBuilder buf = new StringBuilder();
        for (int i=0; i<s.length(); ) {
            int c = Character.codePointAt(s, i);
            i += UTF16.getCharCount(c);
            if (c >= ' ' && c <= 0x007F) {
                if (c == '\\') {
                    buf.append("\\\\"); // That is, "\\"
                } else {
                    buf.append((char)c);
                }
            } else {
                boolean four = c <= 0xFFFF;
                buf.append(four ? "\\u" : "\\U");
                buf.append(hex(c, four ? 4 : 8));
            }
        }
        return buf.toString();
    }

    /*
     * Pairs of (escape letter, resulting character) for the C-style
     * single-letter escapes.
     * This map must be in ASCENDING ORDER OF THE ESCAPE CODE: the lookup
     * loop in unescapeAt() stops as soon as it passes the wanted letter.
     */
    private static final char[] UNESCAPE_MAP = {
        /*"   0x22, 0x22 */
        /*'   0x27, 0x27 */
        /*?   0x3F, 0x3F */
        /*\   0x5C, 0x5C */
        /*a*/ 0x61, 0x07,
        /*b*/ 0x62, 0x08,
        /*e*/ 0x65, 0x1b,
        /*f*/ 0x66, 0x0c,
        /*n*/ 0x6E, 0x0a,
        /*r*/ 0x72, 0x0d,
        /*t*/ 0x74, 0x09,
        /*v*/ 0x76, 0x0b
    };

    /**
     * Convert an escape to a 32-bit code point value.  We attempt
     * to parallel the icu4c unescapeAt() function.
     * @param s the string containing the escape sequence
     * @param offset16 an array containing offset to the character
     * <em>after</em> the backslash.  Upon return offset16[0] will
     * be updated to point after the escape sequence.  It is left
     * unchanged when -1 is returned.
     * @return character value from 0 to 10FFFF, or -1 on error.
     */
    public static int unescapeAt(String s, int[] offset16) {
        int c;
        int result = 0;
        int n = 0;
        int minDig = 0;
        int maxDig = 0;
        int bitsPerDigit = 4;
        int dig;
        int i;
        boolean braces = false;

        /* Check that offset is in range */
        int offset = offset16[0];
        int length = s.length();
        if (offset < 0 || offset >= length) {
            return -1;
        }

        /* Fetch first UChar after '\\' */
        c = Character.codePointAt(s, offset);
        offset += UTF16.getCharCount(c);

        /* Convert hexadecimal and octal escapes */
        switch (c) {
        case 'u':
            minDig = maxDig = 4;
            break;
        case 'U':
            minDig = maxDig = 8;
            break;
        case 'x':
            minDig = 1;
            if (offset < length && UTF16.charAt(s, offset) == 0x7B /*{*/) {
                ++offset;
                braces = true;
                maxDig = 8;
            } else {
                maxDig = 2;
            }
            break;
        default:
            dig = UCharacter.digit(c, 8);
            if (dig >= 0) {
                minDig = 1;
                maxDig = 3;
                n = 1; /* Already have first octal digit */
                bitsPerDigit = 3;
                result = dig;
            }
            break;
        }
        /* minDig is non-zero only for the numeric escape forms above */
        if (minDig != 0) {
            while (offset < length && n < maxDig) {
                c = UTF16.charAt(s, offset);
                dig = UCharacter.digit(c, (bitsPerDigit == 3) ? 8 : 16);
                if (dig < 0) {
                    break;
                }
                result = (result << bitsPerDigit) | dig;
                offset += UTF16.getCharCount(c);
                ++n;
            }
            if (n < minDig) {
                return -1;
            }
            if (braces) {
                /* c still holds the last character examined by the digit
                 * loop, so the closing brace is only recognized when the
                 * loop stopped on it. */
                if (c != 0x7D /*}*/) {
                    return -1;
                }
                ++offset;
            }
            if (result < 0 || result >= 0x110000) {
                return -1;
            }
            // If an escape sequence specifies a lead surrogate, see
            // if there is a trail surrogate after it, either as an
            // escape or as a literal.  If so, join them up into a
            // supplementary.
            if (offset < length &&
                    UTF16.isLeadSurrogate((char) result)) {
                int ahead = offset+1;
                c = s.charAt(offset); // [sic] get 16-bit code unit
                if (c == '\\' && ahead < length) {
                    int o[] = new int[] { ahead };
                    c = unescapeAt(s, o);
                    ahead = o[0];
                }
                if (UTF16.isTrailSurrogate((char) c)) {
                    offset = ahead;
                    result = UCharacterProperty.getRawSupplementary(
                            (char) result, (char) c);
                }
            }
            offset16[0] = offset;
            return result;
        }

        /* Convert C-style escapes in table */
        for (i=0; i<UNESCAPE_MAP.length; i+=2) {
            if (c == UNESCAPE_MAP[i]) {
                offset16[0] = offset;
                return UNESCAPE_MAP[i+1];
            } else if (c < UNESCAPE_MAP[i]) {
                break;
            }
        }

        /* Map \cX to control-X: X & 0x1F */
        if (c == 'c' && offset < length) {
            c = UTF16.charAt(s, offset);
            offset16[0] = offset + UTF16.getCharCount(c);
            return 0x1F & c;
        }

        /* If no special forms are recognized, then consider
         * the backslash to generically escape the next character. */
        offset16[0] = offset;
        return c;
    }

    /**
     * Supplies a zero-padded hex representation of an integer (without 0x).
     * Digits are upper case.  A negative value is rendered as a minus sign
     * followed by the padded magnitude; Long.MIN_VALUE always yields
     * "-8000000000000000" regardless of {@code places}.
     *
     * @param i the value to format
     * @param places minimum number of hex digits; shorter results are
     * left-padded with zeros.  Any width is accepted (values not greater
     * than the natural digit count, including negative ones, add no
     * padding).
     * @return the formatted string
     */
    public static String hex(long i, int places) {
        if (i == Long.MIN_VALUE) return "-8000000000000000";
        boolean negative = i < 0;
        if (negative) {
            i = -i;
        }
        String digits = Long.toString(i, 16).toUpperCase(Locale.ENGLISH);
        StringBuilder result = new StringBuilder();
        if (negative) {
            result.append('-');
        }
        // Pad in a loop rather than slicing a fixed 16-character zero
        // string, so that widths above 16 no longer throw.
        for (int width = digits.length(); width < places; ++width) {
            result.append('0');
        }
        result.append(digits);
        return result.toString();
    }

    /**
     * Digit characters indexed by digit value: '0'-'9' followed by
     * 'A'-'Z'.  escapeUnprintable() indexes it with 4-bit values only.
     */
    static final char DIGITS[] = {
        '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J',
        'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T',
        'U', 'V', 'W', 'X', 'Y', 'Z'
    };

    /**
     * Return true if the character is NOT printable ASCII, i.e. lies
     * outside the range 0x20..0x7E.  Control characters such as tab and
     * line feed are therefore considered unprintable.
     *
     * @param c the code point to test
     * @return true if c is below 0x20 or above 0x7E
     */
    public static boolean isUnprintable(int c) {
        //0x20 = 32 and 0x7E = 126
        return !(c >= 0x20 && c <= 0x7E);
    }

    /**
     * Escape unprintable characters using &lt;backslash&gt;uxxxx notation
     * for U+0000 to U+FFFF and &lt;backslash&gt;Uxxxxxxxx for U+10000 and
     * above.  If the character is printable ASCII, then do nothing
     * and return FALSE.  Otherwise, append the escaped notation and
     * return TRUE.
     *
     * @param result the destination the escape is appended to
     * @param c the code point to escape
     * @return true if an escape was appended, false if c is printable
     * @throws IllegalArgumentException wrapping any IOException thrown
     * by the destination
     */
    public static <T extends Appendable> boolean escapeUnprintable(T result, int c) {
        try {
            if (isUnprintable(c)) {
                result.append('\\');
                if ((c & ~0xFFFF) != 0) {
                    // Above U+FFFF: eight hex digits, high four first.
                    result.append('U');
                    result.append(DIGITS[0xF&(c>>28)]);
                    result.append(DIGITS[0xF&(c>>24)]);
                    result.append(DIGITS[0xF&(c>>20)]);
                    result.append(DIGITS[0xF&(c>>16)]);
                } else {
                    result.append('u');
                }
                result.append(DIGITS[0xF&(c>>12)]);
                result.append(DIGITS[0xF&(c>>8)]);
                result.append(DIGITS[0xF&(c>>4)]);
                result.append(DIGITS[0xF&c]);
                return true;
            }
            return false;
        } catch (IOException e) {
            throw new IllegalArgumentException(e);
        }
    }
}

View file

@ -0,0 +1,185 @@
/*
* Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*******************************************************************************
* (C) Copyright IBM Corp. and others, 1996-2009 - All Rights Reserved *
* *
* The original version of this source code and documentation is copyrighted *
* and owned by IBM, These materials are provided under terms of a License *
* Agreement between IBM and Sun. This technology is protected by multiple *
* US and International patents. This notice and attribution to IBM may not *
* to removed. *
*******************************************************************************
*/
package sun.text.normalizer;
import java.util.HashMap;
/**
 * Class to store version numbers of the form major.minor.milli.micro.
 * Each of the four fields is a number from 0 to 255; the four are packed
 * into a single int with the major number in the most significant byte.
 * Instances are cached, so equal versions obtained through
 * {@code getInstance} share one object.
 * @author synwee
 * @stable ICU 2.6
 */
public final class VersionInfo
{
    // public methods ------------------------------------------------------

    /**
     * Returns an instance of VersionInfo with the argument version.
     * @param version version String in the format of "major.minor.milli.micro"
     *                or "major.minor.milli" or "major.minor" or "major",
     *                where major, minor, milli, micro are non-negative numbers
     *                {@literal <=} 255. If the trailing version numbers are
     *                not specified they are taken as 0s. E.g. Version "3.1" is
     *                equivalent to "3.1.0.0".
     * @return an instance of VersionInfo with the argument version.
     * @exception IllegalArgumentException when the argument version
     *                is not in the right format
     * @stable ICU 2.6
     */
    public static VersionInfo getInstance(String version)
    {
        int length  = version.length();
        int array[] = {0, 0, 0, 0};
        int count   = 0;
        int index   = 0;

        while (count < 4 && index < length) {
            char c = version.charAt(index);
            if (c == '.') {
                count ++;
            }
            else {
                int digit = c - '0';
                if (digit < 0 || digit > 9) {
                    throw new IllegalArgumentException(INVALID_VERSION_NUMBER_);
                }
                array[count] = array[count] * 10 + digit;
                // Reject as soon as the field exceeds its range; checking
                // only after the loop would let a long digit run overflow
                // the int and wrap back into 0..255.
                if (array[count] > 255) {
                    throw new IllegalArgumentException(INVALID_VERSION_NUMBER_);
                }
            }
            index ++;
        }
        // Anything left over means there were more than four fields.
        if (index != length) {
            throw new IllegalArgumentException(
                "Invalid version number: String '" + version + "' exceeds version format");
        }

        return getInstance(array[0], array[1], array[2], array[3]);
    }

    /**
     * Returns an instance of VersionInfo with the argument version.
     * @param major major version, non-negative number {@literal <=} 255.
     * @param minor minor version, non-negative number {@literal <=} 255.
     * @param milli milli version, non-negative number {@literal <=} 255.
     * @param micro micro version, non-negative number {@literal <=} 255.
     * @return the cached instance for this version, created on first use
     * @exception IllegalArgumentException when any argument is
     *                                     negative or {@literal >} 255
     * @stable ICU 2.6
     */
    public static VersionInfo getInstance(int major, int minor, int milli,
                                          int micro)
    {
        if (major < 0 || major > 255 || minor < 0 || minor > 255 ||
            milli < 0 || milli > 255 || micro < 0 || micro > 255) {
            throw new IllegalArgumentException(INVALID_VERSION_NUMBER_);
        }
        int version = getInt(major, minor, milli, micro);
        Integer key = Integer.valueOf(version);
        // Reuse the cached instance if there is one, else create and cache.
        // NOTE(review): the cache is a plain HashMap with no
        // synchronization; confirm callers do not race here.
        VersionInfo result = MAP_.get(key);
        if (result == null) {
            result = new VersionInfo(version);
            MAP_.put(key, result);
        }
        return result;
    }

    /**
     * Compares other with this VersionInfo.
     * @param other VersionInfo to be compared
     * @return 0 if the argument is a VersionInfo object that has version
     *           information equals to this object.
     *           Less than 0 if the argument is a VersionInfo object that has
     *           version information greater than this object.
     *           Greater than 0 if the argument is a VersionInfo object that
     *           has version information less than this object.
     * @stable ICU 2.6
     */
    public int compareTo(VersionInfo other)
    {
        // The packed value uses all 32 bits, so a major version >= 128
        // makes it negative as a signed int. Subtracting two such values
        // can flip the sign; compare them as unsigned instead.
        return Integer.compareUnsigned(m_version_, other.m_version_);
    }

    // private data members ----------------------------------------------

    /**
     * Version number stored as a byte for each of the major, minor, milli and
     * micro numbers in the 32 bit int.
     * Most significant for the major and the least significant contains the
     * micro numbers.
     */
    private final int m_version_;
    /**
     * Map of singletons, keyed by the packed version int
     */
    private static final HashMap<Integer, VersionInfo> MAP_ = new HashMap<>();
    /**
     * Error statement string
     */
    private static final String INVALID_VERSION_NUMBER_ =
        "Invalid version number: Version number may be negative or greater than 255";

    // private constructor -----------------------------------------------

    /**
     * Constructor with int
     * @param compactversion a 32 bit int with each byte representing a number
     */
    private VersionInfo(int compactversion)
    {
        m_version_ = compactversion;
    }

    /**
     * Gets the int from the version numbers
     * @param major non-negative version number
     * @param minor non-negative version number
     * @param milli non-negative version number
     * @param micro non-negative version number
     * @return the four numbers packed one per byte, major in the top byte
     */
    private static int getInt(int major, int minor, int milli, int micro)
    {
        return (major << 24) | (minor << 16) | (milli << 8) | micro;
    }
}

View file

@ -0,0 +1,67 @@
/*
* Copyright (c) 2003, 2005, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
* Licensed Materials - Property of IBM
*
* (C) Copyright IBM Corp. 1999 All Rights Reserved.
* (C) IBM Corp. 1997-1998. All Rights Reserved.
*
* The program is provided "as is" without any warranty express or
* implied, including the warranty of non-infringement and the implied
* warranties of merchantibility and fitness for a particular purpose.
* IBM will not be liable for any damages suffered by you as a result
* of using the Program. In no event will IBM be liable for any
* special, indirect or consequential damages or lost profits even if
* IBM has been advised of the possibility of their occurrence. IBM
* will not be liable for any third party claims against you.
*/
package sun.text.resources;
import java.util.ListResourceBundle;
/**
 * Resource bundle listing, for the built-in break iterators, the iterator
 * class names and the names of their rule data.
 */
public class BreakIteratorInfo extends ListResourceBundle {

    /**
     * Builds the bundle contents: one entry naming the iterator classes
     * and one entry per iterator type naming its rules data.
     *
     * @return a freshly allocated table of key/value pairs
     */
    @Override
    protected final Object[][] getContents() {
        // Class names to instantiate for each built-in type of
        // BreakIterator, in the order character, word, line, sentence.
        final String[] iteratorClasses = {
            "RuleBasedBreakIterator",
            "RuleBasedBreakIterator",
            "RuleBasedBreakIterator",
            "RuleBasedBreakIterator"
        };

        return new Object[][] {
            {"BreakIteratorClasses", iteratorClasses},

            // Rules filename for each break-iterator
            {"CharacterData", "CharacterBreakIteratorData"},
            {"WordData",      "WordBreakIteratorData"},
            {"LineData",      "LineBreakIteratorData"},
            {"SentenceData",  "SentenceBreakIteratorData"},
        };
    }
}

View file

@ -0,0 +1,35 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package sun.text.resources;
import java.util.spi.ResourceBundleProvider;
/**
* An interface for the internal locale data provider for which {@code ResourceBundle}
* searches.
*
* <p>It extends {@code ResourceBundleProvider} and declares no members of
* its own.
*/
public interface BreakIteratorInfoProvider extends ResourceBundleProvider {
}

View file

@ -0,0 +1,36 @@
/*
* Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package sun.text.resources;
import java.util.ResourceBundle;
import sun.util.resources.BreakIteratorResourceBundle;
/**
 * Break-iterator resource bundle whose iterator info is supplied by
 * {@code BreakIteratorInfo}.
 */
public class BreakIteratorResources extends BreakIteratorResourceBundle {

    /**
     * Supplies the bundle describing the break iterators.
     *
     * @return a new {@code BreakIteratorInfo} on every call
     */
    @Override
    protected ResourceBundle getBreakIteratorInfo() {
        final ResourceBundle info = new BreakIteratorInfo();
        return info;
    }
}

View file

@ -0,0 +1,379 @@
/*
* Copyright (c) 1999, 2007, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*/
/*
* Licensed Materials - Property of IBM
*
* (C) Copyright IBM Corp. 1999 All Rights Reserved.
* (C) IBM Corp. 1997-1998. All Rights Reserved.
*
* The program is provided "as is" without any warranty express or
* implied, including the warranty of non-infringement and the implied
* warranties of merchantibility and fitness for a particular purpose.
* IBM will not be liable for any damages suffered by you as a result
* of using the Program. In no event will IBM be liable for any
* special, indirect or consequential damages or lost profits even if
* IBM has been advised of the possibility of their occurrence. IBM
* will not be liable for any third party claims against you.
*/
package sun.text.resources;
import java.util.ListResourceBundle;
/**
* Default break-iterator rules. These rules are more or less general for
* all locales, although there are probably a few we're missing. The
* behavior currently mimics the behavior of BreakIterator in JDK 1.2.
* There are known deficiencies in this behavior, including the fact that
* the logic for handling CJK characters works for Japanese but not for
* Chinese, and that we don't currently have an appropriate locale for
* Thai. The resources will eventually be updated to fix these problems.
*/
/* Modified for Hindi 3/1/99. */
/*
* Since JDK 1.5.0, this file no longer goes to runtime and is used at J2SE
* build phase in order to create [Character|Word|Line|Sentence]BreakIteratorData
* files which are used on runtime instead.
*/
public class BreakIteratorRules extends ListResourceBundle {
protected final Object[][] getContents() {
return new Object[][] {
// rules describing how to break between logical characters
{ "CharacterBreakRules",
// ignore non-spacing marks and enclosing marks (since we never
// put a break before ignore characters, this keeps combining
// accents with the base characters they modify)
"<enclosing>=[:Mn::Me:];"
// other category definitions
+ "<choseong>=[\u1100-\u115f];"
+ "<jungseong>=[\u1160-\u11a7];"
+ "<jongseong>=[\u11a8-\u11ff];"
+ "<surr-hi>=[\ud800-\udbff];"
+ "<surr-lo>=[\udc00-\udfff];"
// break after every character, except as follows:
+ ".;"
// keep base and combining characters together
+ "<base>=[^<enclosing>^[:Cc::Cf::Zl::Zp:]];"
+ "<base><enclosing><enclosing>*;"
// keep CRLF sequences together
+ "\r\n;"
// keep surrogate pairs together
+ "<surr-hi><surr-lo>;"
// keep Hangul syllables spelled out using conjoining jamo together
+ "<choseong>*<jungseong>*<jongseong>*;"
// various additions for Hindi support
+ "<nukta>=[\u093c];"
+ "<danda>=[\u0964\u0965];"
+ "<virama>=[\u094d];"
+ "<devVowelSign>=[\u093e-\u094c\u0962\u0963];"
+ "<devConsonant>=[\u0915-\u0939];"
+ "<devNuktaConsonant>=[\u0958-\u095f];"
+ "<devCharEnd>=[\u0902\u0903\u0951-\u0954];"
+ "<devCAMN>=(<devConsonant>{<nukta>});"
+ "<devConsonant1>=(<devNuktaConsonant>|<devCAMN>);"
+ "<zwj>=[\u200d];"
+ "<devConjunct>=({<devConsonant1><virama>{<zwj>}}<devConsonant1>);"
+ "<devConjunct>{<devVowelSign>}{<devCharEnd>};"
+ "<danda><nukta>;"
},
// default rules for finding word boundaries
{ "WordBreakRules",
// ignore non-spacing marks, enclosing marks, and format characters,
// all of which should not influence the algorithm
//"<ignore>=[:Mn::Me::Cf:];"
"<ignore>=[:Cf:];"
+ "<enclosing>=[:Mn::Me:];"
// Hindi phrase separator, kanji, katakana, hiragana, CJK diacriticals,
// other letters, and digits
+ "<danda>=[\u0964\u0965];"
+ "<kanji>=[\u3005\u4e00-\u9fa5\uf900-\ufa2d];"
+ "<kata>=[\u30a1-\u30fa\u30fd\u30fe];"
+ "<hira>=[\u3041-\u3094\u309d\u309e];"
+ "<cjk-diacrit>=[\u3099-\u309c\u30fb\u30fc];"
+ "<letter-base>=[:L::Mc:^[<kanji><kata><hira><cjk-diacrit>]];"
+ "<let>=(<letter-base><enclosing>*);"
+ "<digit-base>=[:N:];"
+ "<dgt>=(<digit-base><enclosing>*);"
// punctuation that can occur in the middle of a word: currently
// dashes, apostrophes, quotation marks, and periods
+ "<mid-word>=[:Pd::Pc:\u00ad\u2027\\\"\\\'\\.];"
// punctuation that can occur in the middle of a number: currently
// apostrophes, quotation marks, periods, commas, and the Arabic
// decimal point
+ "<mid-num>=[\\\"\\\'\\,\u066b\\.];"
// punctuation that can occur at the beginning of a number: currently
// the period, the number sign, and all currency symbols except the cents sign
+ "<pre-num>=[:Sc:\\#\\.^\u00a2];"
// punctuation that can occur at the end of a number: currently
// the percent, per-thousand, per-ten-thousand, and Arabic percent
// signs, the cents sign, and the ampersand
+ "<post-num>=[\\%\\&\u00a2\u066a\u2030\u2031];"
// line separators: currently LF, FF, PS, and LS
+ "<ls>=[\n\u000c\u2028\u2029];"
// whitespace: all space separators and the tab character
+ "<ws-base>=[:Zs:\t];"
+ "<ws>=(<ws-base><enclosing>*);"
// a word is a sequence of letters that may contain internal
// punctuation, as long as it begins and ends with a letter and
// never contains two punctuation marks in a row
+ "<word>=((<let><let>*(<mid-word><let><let>*)*){<danda>});"
// a number is a sequence of digits that may contain internal
// punctuation, as long as it begins and ends with a digit and
// never contains two punctuation marks in a row.
+ "<number>=(<dgt><dgt>*(<mid-num><dgt><dgt>*)*);"
// break after every character, with the following exceptions
// (this will cause punctuation marks that aren't considered
// part of words or numbers to be treated as words unto themselves)
+ ".;"
// keep together any sequence of contiguous words and numbers
// (including just one of either), plus an optional trailing
// number-suffix character
+ "{<word>}(<number><word>)*{<number>{<post-num>}};"
// keep together any sequence of contiguous words and numbers
// that starts with a number-prefix character and a number,
// and may end with a number-suffix character
+ "<pre-num>(<number><word>)*{<number>{<post-num>}};"
// keep together runs of whitespace (optionally with a single trailing
// line separator or CRLF sequence)
+ "<ws>*{\r}{<ls>};"
// keep together runs of Katakana and CJK diacritical marks
+ "[<kata><cjk-diacrit>]*;"
// keep together runs of Hiragana and CJK diacritical marks
+ "[<hira><cjk-diacrit>]*;"
// keep together runs of Kanji
+ "<kanji>*;"
// keep together anything else and an enclosing mark
+ "<base>=[^<enclosing>^[:Cc::Cf::Zl::Zp:]];"
+ "<base><enclosing><enclosing>*;"
},
// default rules for determining legal line-breaking positions
{ "LineBreakRules",
// characters that always cause a break: ETX, tab, LF, FF, LS, and PS
"<break>=[\u0003\t\n\f\u2028\u2029];"
// ignore format characters and control characters EXCEPT for breaking chars
+ "<ignore>=[:Cf:[:Cc:^[<break>\r]]];"
// enclosing marks
+ "<enclosing>=[:Mn::Me:];"
// Hindi phrase separators
+ "<danda>=[\u0964\u0965];"
// characters that always prevent a break: the non-breaking space
// and similar characters
+ "<glue>=[\u00a0\u0f0c\u2007\u2011\u202f\ufeff];"
// whitespace: space separators and control characters, except for
// CR and the other characters mentioned above
+ "<space>=[:Zs::Cc:^[<glue><break>\r]];"
// dashes: dash punctuation and the discretionary hyphen, except for
// non-breaking hyphens
+ "<dash>=[:Pd:\u00ad^<glue>];"
// characters that stick to a word if they precede it: currency symbols
// (except the cents sign) and starting punctuation
+ "<pre-word>=[:Sc::Ps::Pi:^[\u00a2]\\\"\\\'];"
// characters that stick to a word if they follow it: ending punctuation,
// other punctuation that usually occurs at the end of a sentence,
// small Kana characters, some CJK diacritics, etc.
+ "<post-word>=[\\\":Pe::Pf:\\!\\%\\.\\,\\:\\;\\?\u00a2\u00b0\u066a\u2030-\u2034\u2103"
+ "\u2105\u2109\u3001\u3002\u3005\u3041\u3043\u3045\u3047\u3049\u3063"
+ "\u3083\u3085\u3087\u308e\u3099-\u309e\u30a1\u30a3\u30a5\u30a7\u30a9"
+ "\u30c3\u30e3\u30e5\u30e7\u30ee\u30f5\u30f6\u30fc-\u30fe\uff01\uff05"
+ "\uff0c\uff0e\uff1a\uff1b\uff1f];"
// Kanji: actually includes Kanji,Kana and Hangul syllables,
// except for small Kana and CJK diacritics
+ "<kanji>=[\u4e00-\u9fa5\uac00-\ud7a3\uf900-\ufa2d\ufa30-\ufa6a\u3041-\u3094\u30a1-\u30fa^[<post-word><ignore>]];"
// digits
+ "<digit>=[:Nd::No:];"
// punctuation that can occur in the middle of a number: periods and commas
+ "<mid-num>=[\\.\\,];"
// everything not mentioned above
+ "<char>=[^[<break><space><dash><kanji><glue><ignore><pre-word><post-word><mid-num>\r<danda>]];"
// a "number" is a run of prefix characters and dashes, followed by one or
// more digits with isolated number-punctuation characters interspersed
+ "<number>=([<pre-word><dash>]*<digit><digit>*(<mid-num><digit><digit>*)*);"
// the basic core of a word can be either a "number" as defined above, a single
// "Kanji" character, or a run of any number of not-explicitly-mentioned
// characters (this includes Latin letters)
+ "<word-core>=(<char>*|<kanji>|<number>);"
// a word may end with an optional suffix that can be either a run of one or
// more dashes or a run of word-suffix characters
+ "<word-suffix>=((<dash><dash>*|<post-word>*));"
// a word, thus, is an optional run of word-prefix characters, followed by
// a word core and a word suffix (the syntax of <word-core> and <word-suffix>
// actually allows either of them to match the empty string, putting a break
// between things like ")(" or "aaa(aaa"
+ "<word>=(<pre-word>*<word-core><word-suffix>);"
+ "<hack1>=[\\(];"
+ "<hack2>=[\\)];"
+ "<hack3>=[\\$\\'];"
// finally, the rule that does the work: Keep together any run of words that
// are joined by runs of one or more non-spacing marks. Also keep a trailing
// line-break character or CRLF combination with the word. (line separators
// "win" over nbsp's)
+ "<word>(((<space>*<glue><glue>*{<space>})|<hack3>)<word>)*<space>*{<enclosing>*}{<hack1><hack2><post-word>*}{<enclosing>*}{\r}{<break>};"
+ "\r<break>;"
},
// default rules for finding sentence boundaries
{ "SentenceBreakRules",
// ignore non-spacing marks, enclosing marks, and format characters
"<ignore>=[:Mn::Me::Cf:];"
// letters
+ "<letter>=[:L:];"
// lowercase letters
+ "<lc>=[:Ll:];"
// uppercase letters
+ "<uc>=[:Lu:];"
// NOT lowercase letters
+ "<notlc>=[<letter>^<lc>];"
// whitespace (line separators are treated as whitespace)
+ "<space>=[\t\r\f\n\u2028:Zs:];"
// punctuation which may occur at the beginning of a sentence: "starting
// punctuation" and quotation marks
+ "<start-punctuation>=[:Ps::Pi:\\\"\\\'];"
// punctuation which may occur at the end of a sentence: "ending punctuation"
// and quotation marks
+ "<end>=[:Pe::Pf:\\\"\\\'];"
// digits
+ "<digit>=[:N:];"
// characters that unambiguously signal the end of a sentence
+ "<term>=[\\!\\?\u3002\uff01\uff1f];"
// periods, which MAY signal the end of a sentence
+ "<period>=[\\.\uff0e];"
// characters that may occur at the beginning of a sentence: basically anything
// not mentioned above (letters and digits are specifically excluded)
+ "<sent-start>=[^[:L:<space><start-punctuation><end><digit><term><period>\u2029<ignore>]];"
// Hindi phrase separator
+ "<danda>=[\u0964\u0965];"
// always break sentences after paragraph separators
+ ".*?{\u2029};"
// always break after a danda, if it's followed by whitespace
+ ".*?<danda><space>*;"
// if you see a period, skip over additional periods and ending punctuation
// and if the next character is a paragraph separator, break after the
// paragraph separator
//+ ".*?<period>[<period><end>]*<space>*\u2029;"
//+ ".*?[<period><end>]*<space>*\u2029;"
// if you see a period, skip over additional periods and ending punctuation,
// followed by optional whitespace, followed by optional starting punctuation,
// and if the next character is something that can start a sentence
// (basically, a capital letter), then put the sentence break between the
// whitespace and the opening punctuation
+ ".*?<period>[<period><end>]*<space><space>*/<notlc>;"
+ ".*?<period>[<period><end>]*<space>*/[<start-punctuation><sent-start>][<start-punctuation><sent-start>]*<letter>;"
// if you see a sentence-terminating character, skip over any additional
// terminators, periods, or ending punctuation, followed by any whitespace,
// followed by a SINGLE optional paragraph separator, and put the break there
+ ".*?<term>[<term><period><end>]*<space>*{\u2029};"
// The following rules are here to aid in backwards iteration. The automatically
// generated backwards state table will rewind to the beginning of the
// paragraph all the time (or all the way to the beginning of the document
// if the document doesn't use the Unicode PS character) because the only
// unambiguous character pairs are those involving paragraph separators.
// These specify a few more unambiguous breaking situations.
// if you see a sentence-starting character, followed by starting punctuation
// (remember, we're iterating backwards), followed by an optional run of
// whitespace, followed by an optional run of ending punctuation, followed
// by a period, this is a safe place to turn around
+ "!<sent-start><start-punctuation>*<space>*<end>*<period>;"
// if you see a letter or a digit, followed by an optional run of
// starting punctuation, followed by an optional run of whitespace,
// followed by an optional run of ending punctuation, followed by
// a sentence terminator, this is a safe place to turn around
+ "![<sent-start><lc><digit>]<start-punctuation>*<space>*<end>*<term>;"
}
};
}
}

View file

@ -0,0 +1,55 @@
/*
* Copyright (c) 2005, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*/
/*
* (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
* (C) Copyright IBM Corp. 1996 - 1999 - All Rights Reserved
*
* The original version of this source code and documentation
* is copyrighted and owned by Taligent, Inc., a wholly-owned
* subsidiary of IBM. These materials are provided under terms
* of a License Agreement between Taligent and Sun. This technology
* is protected by multiple US and International patents.
*
* This notice and attribution to Taligent may not be removed.
* Taligent is a registered trademark of Taligent, Inc.
*
*/
package sun.text.resources;
import java.util.ListResourceBundle;
/**
 * Resource bundle exposing the collation {@code "Rule"} entry.
 *
 * <p>In this bundle the rule string is empty.
 */
public class CollationData extends ListResourceBundle {
    /**
     * Returns the key/value table backing this bundle.
     *
     * @return a newly allocated table with the single pair {@code "Rule"} -> {@code ""}
     */
    @Override
    protected final Object[][] getContents() {
        return new Object[][] {
            { "Rule", "" },
        };
    }
}

View file

@ -0,0 +1,35 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package sun.text.resources;
import java.util.spi.ResourceBundleProvider;
/**
 * An interface for the internal locale data provider for which {@code ResourceBundle}
 * searches.
 *
 * <p>This interface declares no members of its own; it only extends
 * {@link ResourceBundleProvider}.
 * NOTE(review): presumably the service type used to locate {@code CollationData}
 * bundles -- confirm against the module descriptor.
 */
public interface CollationDataProvider extends ResourceBundleProvider {
}

View file

@ -0,0 +1,863 @@
/*
* Copyright (c) 1996, 2014, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
*/
/*
* (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
* (C) Copyright IBM Corp. 1996 - 1999 - All Rights Reserved
*
* The original version of this source code and documentation
* is copyrighted and owned by Taligent, Inc., a wholly-owned
* subsidiary of IBM. These materials are provided under terms
* of a License Agreement between Taligent and Sun. This technology
* is protected by multiple US and International patents.
*
* This notice and attribution to Taligent may not be removed.
* Taligent is a registered trademark of Taligent, Inc.
*
*/
/*
* COPYRIGHT AND PERMISSION NOTICE
*
* Copyright (C) 1991-2012 Unicode, Inc. All rights reserved. Distributed under
* the Terms of Use in http://www.unicode.org/copyright.html.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of the Unicode data files and any associated documentation (the "Data
* Files") or Unicode software and any associated documentation (the
* "Software") to deal in the Data Files or Software without restriction,
* including without limitation the rights to use, copy, modify, merge,
* publish, distribute, and/or sell copies of the Data Files or Software, and
* to permit persons to whom the Data Files or Software are furnished to do so,
* provided that (a) the above copyright notice(s) and this permission notice
* appear with all copies of the Data Files or Software, (b) both the above
* copyright notice(s) and this permission notice appear in associated
* documentation, and (c) there is clear notice in each modified Data File or
* in the Software as well as in the documentation associated with the Data
* File(s) or Software that the data or software has been modified.
*
* THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY
* KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF
* THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS
* INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR
* CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
* DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
* TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
* OF THE DATA FILES OR SOFTWARE.
*
* Except as contained in this notice, the name of a copyright holder shall not
* be used in advertising or otherwise to promote the sale, use or other
* dealings in these Data Files or Software without prior written authorization
* of the copyright holder.
*/
package sun.text.resources;
import sun.util.resources.ParallelListResourceBundle;
/**
 * Format data (month/day names, era names, number symbols and patterns,
 * date/time patterns) for the bundle with no locale suffix.
 */
public class FormatData extends ParallelListResourceBundle {

    /**
     * Builds a number-symbol table that differs from the common layout only in
     * its zero digit and its NaN text. A fresh array is returned on every call.
     *
     * @param zeroDigit the native zero digit for the numbering system
     * @param nan       the text used for "not a number"
     * @return the eleven-element symbol array
     */
    private static String[] numberElements(String zeroDigit, String nan) {
        return new String[] {
            ".",        // decimal separator
            ",",        // group (thousands) separator
            ";",        // list separator
            "%",        // percent sign
            zeroDigit,  // native 0 digit
            "#",        // pattern digit
            "-",        // minus sign
            "E",        // exponential
            "\u2030",   // per mille
            "\u221e",   // infinity
            nan         // NaN
        };
    }

    /**
     * Overrides ParallelListResourceBundle.
     *
     * @return the table of key/value pairs for this bundle
     */
    @Override
    protected final Object[][] getContents() {
        // Julian calendar era strings (shared by "Eras" and "short.Eras")
        final String[] bcAdEras = { "BC", "AD" };

        // Thai Buddhist calendar era strings: BC, then Buddhist Era
        final String[] thaiBuddhistEras = { "BC", "B.E." };

        // Japanese imperial calendar era abbreviations
        final String[] japaneseEraInitials = { "", "M", "T", "S", "H", };

        // Japanese imperial calendar era strings
        final String[] japaneseEraNames = { "", "Meiji", "Taisho", "Showa", "Heisei", };

        return new Object[][] {
            // ---- month and day names (13th month slot is empty) ----
            { "MonthNames",
                new String[] {
                    "January", "February", "March", "April", "May", "June",
                    "July", "August", "September", "October", "November", "December",
                    "" // month 13 if applicable
                }
            },
            { "MonthAbbreviations",
                new String[] {
                    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
                    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
                    "" // abb month 13 if applicable
                }
            },
            { "MonthNarrows",
                new String[] {
                    "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "",
                }
            },
            { "DayNames",
                new String[] {
                    "Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"
                }
            },
            { "DayAbbreviations",
                new String[] { "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" }
            },
            { "DayNarrows",
                new String[] { "S", "M", "T", "W", "T", "F", "S", }
            },

            // ---- am/pm markers ----
            { "AmPmMarkers", new String[] { "AM", "PM" } },
            { "narrow.AmPmMarkers", new String[] { "a", "p" } },

            // ---- eras ----
            { "Eras", bcAdEras },
            { "short.Eras", bcAdEras },
            { "narrow.Eras", new String[] { "B", "A", } },
            { "buddhist.Eras", thaiBuddhistEras },
            { "buddhist.short.Eras", thaiBuddhistEras },
            { "buddhist.narrow.Eras", thaiBuddhistEras },
            { "japanese.Eras", japaneseEraNames },
            { "japanese.short.Eras", japaneseEraInitials },
            { "japanese.narrow.Eras", japaneseEraInitials },
            { "japanese.FirstYear",
                new String[] { // Japanese imperial calendar year name
                    // empty in English
                }
            },

            // ---- number patterns and symbols ----
            { "NumberPatterns",
                new String[] {
                    "#,##0.###;-#,##0.###", // decimal pattern
                    "\u00a4 #,##0.00;-\u00a4 #,##0.00", // currency pattern
                    "#,##0%" // percent pattern
                }
            },
            { "DefaultNumberingSystem", "" },
            { "NumberElements", numberElements("0", "\ufffd") },
            { "arab.NumberElements",
                new String[] {
                    "\u066b",
                    "\u066c",
                    "\u061b",
                    "\u066a",
                    "\u0660",
                    "#",
                    "-",
                    "\u0627\u0633",
                    "\u0609",
                    "\u221e",
                    "NaN",
                }
            },
            { "arabext.NumberElements",
                new String[] {
                    "\u066b",
                    "\u066c",
                    "\u061b",
                    "\u066a",
                    "\u06f0",
                    "#",
                    "-",
                    "\u00d7\u06f1\u06f0^",
                    "\u0609",
                    "\u221e",
                    "NaN",
                }
            },
            { "bali.NumberElements", numberElements("\u1b50", "NaN") },
            { "beng.NumberElements", numberElements("\u09e6", "NaN") },
            { "cham.NumberElements", numberElements("\uaa50", "NaN") },
            { "deva.NumberElements", numberElements("\u0966", "NaN") },
            { "fullwide.NumberElements", numberElements("\uff10", "NaN") },
            { "gujr.NumberElements", numberElements("\u0ae6", "NaN") },
            { "guru.NumberElements", numberElements("\u0a66", "NaN") },
            { "java.NumberElements", numberElements("\ua9d0", "NaN") },
            { "kali.NumberElements", numberElements("\ua900", "NaN") },
            { "khmr.NumberElements", numberElements("\u17e0", "NaN") },
            { "knda.NumberElements", numberElements("\u0ce6", "NaN") },
            { "laoo.NumberElements", numberElements("\u0ed0", "NaN") },
            { "lana.NumberElements", numberElements("\u1a80", "NaN") },
            { "lanatham.NumberElements", numberElements("\u1a90", "NaN") },
            { "latn.NumberElements", numberElements("0", "\ufffd") },
            { "lepc.NumberElements", numberElements("\u1c40", "NaN") },
            { "limb.NumberElements", numberElements("\u1946", "NaN") },
            { "mlym.NumberElements", numberElements("\u0d66", "NaN") },
            { "mong.NumberElements", numberElements("\u1810", "NaN") },
            { "mtei.NumberElements", numberElements("\uabf0", "NaN") },
            { "mymr.NumberElements", numberElements("\u1040", "NaN") },
            { "mymrshan.NumberElements", numberElements("\u1090", "NaN") },
            { "nkoo.NumberElements", numberElements("\u07c0", "NaN") },
            { "olck.NumberElements", numberElements("\u1c50", "NaN") },
            { "orya.NumberElements", numberElements("\u0b66", "NaN") },
            { "saur.NumberElements", numberElements("\ua8d0", "NaN") },
            { "sund.NumberElements", numberElements("\u1bb0", "NaN") },
            { "talu.NumberElements", numberElements("\u19d0", "NaN") },
            { "tamldec.NumberElements", numberElements("\u0be6", "NaN") },
            { "telu.NumberElements", numberElements("\u0c66", "NaN") },
            { "thai.NumberElements", numberElements("\u0E50", "\ufffd") },
            { "tibt.NumberElements", numberElements("\u0f20", "NaN") },
            { "vaii.NumberElements", numberElements("\ua620", "NaN") },

            // ---- date/time patterns, listed as full, long, medium, short ----
            { "TimePatterns",
                new String[] {
                    "h:mm:ss a z",
                    "h:mm:ss a z",
                    "h:mm:ss a",
                    "h:mm a",
                }
            },
            { "DatePatterns",
                new String[] {
                    "EEEE, MMMM d, yyyy",
                    "MMMM d, yyyy",
                    "MMM d, yyyy",
                    "M/d/yy",
                }
            },
            { "DateTimePatterns",
                new String[] {
                    "{1} {0}" // date-time pattern
                }
            },
            { "buddhist.TimePatterns",
                new String[] {
                    "H:mm:ss z",
                    "H:mm:ss z",
                    "H:mm:ss",
                    "H:mm",
                }
            },
            { "buddhist.DatePatterns",
                new String[] {
                    "EEEE d MMMM G yyyy",
                    "d MMMM yyyy",
                    "d MMM yyyy",
                    "d/M/yyyy",
                }
            },
            { "buddhist.DateTimePatterns",
                new String[] {
                    "{1}, {0}" // date-time pattern
                }
            },
            { "japanese.TimePatterns",
                new String[] {
                    "h:mm:ss a z",
                    "h:mm:ss a z",
                    "h:mm:ss a",
                    "h:mm a",
                }
            },
            { "japanese.DatePatterns",
                new String[] {
                    "GGGG yyyy MMMM d (EEEE)",
                    "GGGG yyyy MMMM d",
                    "GGGG yyyy MMM d",
                    "Gy.MM.dd",
                }
            },
            { "japanese.DateTimePatterns",
                new String[] {
                    "{1} {0}" // date-time pattern
                }
            },
            { "DateTimePatternChars", "GyMdkHmsSEDFwWahKzZ" },
        };
    }
}

View file

@ -0,0 +1,35 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package sun.text.resources;
import java.util.spi.ResourceBundleProvider;
/**
 * An interface for the internal locale data provider for which {@code ResourceBundle}
 * searches.
 *
 * <p>This interface declares no members of its own; it only extends
 * {@link ResourceBundleProvider}.
 * NOTE(review): presumably the service type used to locate {@code FormatData}
 * bundles -- confirm against the module descriptor.
 */
public interface FormatDataProvider extends ResourceBundleProvider {
}

View file

@ -0,0 +1,83 @@
/*
* Copyright (c) 1997, 2014, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
* (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
* (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
*
* The original version of this source code and documentation
* is copyrighted and owned by Taligent, Inc., a wholly-owned
* subsidiary of IBM. These materials are provided under terms
* of a License Agreement between Taligent and Sun. This technology
* is protected by multiple US and International patents.
*
* This notice and attribution to Taligent may not be removed.
* Taligent is a registered trademark of Taligent, Inc.
*
*/
package sun.text.resources;
import sun.util.resources.ParallelListResourceBundle;
public class FormatData_en extends ParallelListResourceBundle {
/**
* Overrides ParallelListResourceBundle
*/
protected final Object[][] getContents() {
// This locale inherits almost everything from the root default locale. However,
// even if it inherited everything, we would still need this locale to exist
// to make the resource-bundle lookup mechanism work right. In that case, we'd
// define this method as follows:
// return new Object[][] { };
return new Object[][] {
{ "MonthNarrows",
new String[] {
"J",
"F",
"M",
"A",
"M",
"J",
"J",
"A",
"S",
"O",
"N",
"D",
"",
}
},
{ "NumberPatterns",
new String[] {
"#,##0.###;-#,##0.###", // decimal pattern
"\u00A4#,##0.00;-\u00A4#,##0.00", // currency pattern
"#,##0%" // percent pattern
}
},
{ "DateTimePatternChars", "GyMdkHmsSEDFwWahKzZ" },
};
}
}

View file

@ -0,0 +1,60 @@
/*
* Copyright (c) 1998, 2013, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
* (C) Copyright Taligent, Inc. 1996, 1997 - All Rights Reserved
* (C) Copyright IBM Corp. 1996 - 1998 - All Rights Reserved
*
* The original version of this source code and documentation
* is copyrighted and owned by Taligent, Inc., a wholly-owned
* subsidiary of IBM. These materials are provided under terms
* of a License Agreement between Taligent and Sun. This technology
* is protected by multiple US and International patents.
*
* This notice and attribution to Taligent may not be removed.
* Taligent is a registered trademark of Taligent, Inc.
*
*/
package sun.text.resources;
import sun.util.resources.ParallelListResourceBundle;
public class FormatData_en_US extends ParallelListResourceBundle {
/**
* Overrides ParallelListResourceBundle
*/
protected final Object[][] getContents() {
return new Object[][] {
{ "NumberPatterns",
new String[] {
"#,##0.###;-#,##0.###", // decimal pattern
"\u00a4#,##0.00;(\u00a4#,##0.00)", // currency pattern
"#,##0%" // percent pattern
}
},
};
}
}

View file

@ -0,0 +1,352 @@
/*
* Copyright (c) 2013, 2016, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
* COPYRIGHT AND PERMISSION NOTICE
*
* Copyright (C) 1991-2016 Unicode, Inc. All rights reserved.
* Distributed under the Terms of Use in
* http://www.unicode.org/copyright.html.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of the Unicode data files and any associated documentation
* (the "Data Files") or Unicode software and any associated documentation
* (the "Software") to deal in the Data Files or Software
* without restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, and/or sell copies of
* the Data Files or Software, and to permit persons to whom the Data Files
* or Software are furnished to do so, provided that
* (a) this copyright and permission notice appear with all copies
* of the Data Files or Software,
* (b) this copyright and permission notice appear in associated
* documentation, and
* (c) there is clear notice in each modified Data File or in the Software
* as well as in the documentation associated with the Data File(s) or
* Software that the data or software has been modified.
*
* THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
* ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
* WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT OF THIRD PARTY RIGHTS.
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
* NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
* DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
* DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
* TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
* PERFORMANCE OF THE DATA FILES OR SOFTWARE.
*
* Except as contained in this notice, the name of a copyright holder
* shall not be used in advertising or otherwise to promote the sale,
* use or other dealings in these Data Files or Software without prior
* written authorization of the copyright holder.
*/
// Note: this file has been generated by a tool.
package sun.text.resources;
import sun.util.resources.OpenListResourceBundle;
/**
 * Supplementary resources for this bundle: quarter names, field display names,
 * islamic and roc calendar data, {@code java.time.*} era names and date
 * patterns, and time-zone formats.
 */
public class JavaTimeSupplementary extends OpenListResourceBundle {
    /**
     * Builds the key/value table for this bundle. Arrays referenced by several
     * keys are created once and shared between those keys.
     *
     * @return the table of key/value pairs
     */
    @Override
    protected final Object[][] getContents() {
        // Arrays that more than one key points at.
        final String[] quarters = { "Q1", "Q2", "Q3", "Q4", };
        final String[] quarterDigits = { "1", "2", "3", "4", };
        final String[] eraFirstDatePatterns = {
            "GGGG y MMMM d, EEEE",
            "GGGG y MMMM d",
            "GGGG y MMM d",
            "G y-MM-dd",
        };
        final String[] dayAbbrs = { "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", };
        final String[] dayLetters = { "S", "M", "T", "W", "T", "F", "S", };
        final String[] hijriEras = { "", "AH", };
        final String[] monthNumbers = {
            "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "",
        };
        final String[] timePatterns24h = {
            "HH:mm:ss zzzz",
            "HH:mm:ss z",
            "HH:mm:ss",
            "HH:mm",
        };
        final String[] amPm = { "AM", "PM", };
        final String[] javaTimeDatePatterns = {
            "G y MMMM d, EEEE",
            "G y MMMM d",
            "G y MMM d",
            "GGGGG y-MM-dd",
        };
        final String[] japaneseEraNames = { "", "Meiji", "Taisho", "Showa", "Heisei", };
        final String[] rocEras = { "Before R.O.C.", "R.O.C.", };
        final String[] monthAbbrs = {
            "Jan", "Feb", "Mar", "Apr", "May", "Jun",
            "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "",
        };

        return new Object[][] {
            // quarters
            { "QuarterAbbreviations", quarters },
            { "QuarterNames", quarters },
            { "QuarterNarrows", quarterDigits },

            // field display names
            { "field.dayperiod", "Dayperiod" },
            { "field.era", "Era" },
            { "field.hour", "Hour" },
            { "field.minute", "Minute" },
            { "field.month", "Month" },
            { "field.second", "Second" },
            { "field.week", "Week" },
            { "field.weekday", "Day of the Week" },
            { "field.year", "Year" },
            { "field.zone", "Zone" },

            // islamic calendar
            { "islamic.DatePatterns", eraFirstDatePatterns },
            { "islamic.DayAbbreviations", dayAbbrs },
            { "islamic.DayNames", dayAbbrs },
            { "islamic.DayNarrows", dayLetters },
            { "islamic.Eras", hijriEras },
            { "islamic.MonthAbbreviations",
                new String[] {
                    "Muh.",
                    "Saf.",
                    "Rab. I",
                    "Rab. II",
                    "Jum. I",
                    "Jum. II",
                    "Raj.",
                    "Sha.",
                    "Ram.",
                    "Shaw.",
                    "Dhu\u02bbl-Q.",
                    "Dhu\u02bbl-H.",
                    "",
                }
            },
            { "islamic.MonthNames",
                new String[] {
                    "Muharram",
                    "Safar",
                    "Rabi\u02bb I",
                    "Rabi\u02bb II",
                    "Jumada I",
                    "Jumada II",
                    "Rajab",
                    "Sha\u02bbban",
                    "Ramadan",
                    "Shawwal",
                    "Dhu\u02bbl-Qi\u02bbdah",
                    "Dhu\u02bbl-Hijjah",
                    "",
                }
            },
            { "islamic.MonthNarrows", monthNumbers },
            { "islamic.QuarterNames", quarters },
            { "islamic.QuarterNarrows", quarterDigits },
            { "islamic.TimePatterns", timePatterns24h },
            { "islamic.abbreviated.AmPmMarkers", amPm },
            { "islamic.long.Eras", hijriEras },
            { "islamic.narrow.Eras", hijriEras },
            { "islamic.short.Eras", hijriEras },

            // java.time.* entries
            { "java.time.buddhist.DatePatterns", javaTimeDatePatterns },
            { "java.time.buddhist.long.Eras", new String[] { "BC", "BE", } },
            { "java.time.buddhist.short.Eras", new String[] { "BC", "B.E.", } },
            { "java.time.islamic.DatePatterns", javaTimeDatePatterns },
            { "java.time.japanese.DatePatterns",
                new String[] {
                    "G y MMMM d (EEEE)",
                    "G y MMMM d",
                    "G y MMM d",
                    "GGGGGy.MM.dd",
                }
            },
            { "java.time.japanese.long.Eras", japaneseEraNames },
            { "java.time.japanese.short.Eras", japaneseEraNames },
            { "java.time.long.Eras", new String[] { "BCE", "CE", } },
            { "java.time.roc.DatePatterns", javaTimeDatePatterns },
            { "java.time.short.Eras", new String[] { "BC", "AD", } },

            // roc calendar
            { "roc.AmPmMarkers", amPm },
            { "roc.DatePatterns", eraFirstDatePatterns },
            { "roc.DayNames", dayAbbrs },
            { "roc.DayNarrows", dayLetters },
            { "roc.Eras", rocEras },
            { "roc.MonthAbbreviations", monthAbbrs },
            { "roc.MonthNames", monthAbbrs },
            { "roc.MonthNarrows", monthNumbers },
            { "roc.QuarterNames", quarters },
            { "roc.QuarterNarrows", quarterDigits },
            { "roc.TimePatterns", timePatterns24h },
            { "roc.abbreviated.AmPmMarkers", amPm },
            { "roc.long.Eras", rocEras },
            { "roc.narrow.AmPmMarkers", amPm },
            { "roc.narrow.Eras", rocEras },
            { "roc.short.Eras", rocEras },

            // time-zone formats
            { "timezone.gmtFormat", "GMT{0}" },
            { "timezone.hourFormat", "+HH:mm;-HH:mm" },
        };
    }
}

View file

@ -0,0 +1,35 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package sun.text.resources;
import java.util.spi.ResourceBundleProvider;
/**
* Service provider interface for the internal locale data provider that
* {@code ResourceBundle} searches for.
* <p>
* This interface declares no members of its own; it only extends
* {@link ResourceBundleProvider}, giving the provider a distinct type.
* NOTE(review): presumably this is the type under which the
* {@code JavaTimeSupplementary} bundles are located - confirm against the
* module's provides/uses declarations.
*/
public interface JavaTimeSupplementaryProvider extends ResourceBundleProvider {
}

View file

@ -0,0 +1,274 @@
/*
* Copyright (c) 2013, 2016, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
/*
* COPYRIGHT AND PERMISSION NOTICE
*
* Copyright (C) 1991-2016 Unicode, Inc. All rights reserved.
* Distributed under the Terms of Use in
* http://www.unicode.org/copyright.html.
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of the Unicode data files and any associated documentation
* (the "Data Files") or Unicode software and any associated documentation
* (the "Software") to deal in the Data Files or Software
* without restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, and/or sell copies of
* the Data Files or Software, and to permit persons to whom the Data Files
* or Software are furnished to do so, provided that
* (a) this copyright and permission notice appear with all copies
* of the Data Files or Software,
* (b) this copyright and permission notice appear in associated
* documentation, and
* (c) there is clear notice in each modified Data File or in the Software
* as well as in the documentation associated with the Data File(s) or
* Software that the data or software has been modified.
*
* THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
* ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
* WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT OF THIRD PARTY RIGHTS.
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
* NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
* DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
* DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
* TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
* PERFORMANCE OF THE DATA FILES OR SOFTWARE.
*
* Except as contained in this notice, the name of a copyright holder
* shall not be used in advertising or otherwise to promote the sale,
* use or other dealings in these Data Files or Software without prior
* written authorization of the copyright holder.
*/
// Note: this file has been generated by a tool.
package sun.text.resources;
import sun.util.resources.OpenListResourceBundle;
/**
 * Supplementary java.time resource data with English-language text (per the
 * {@code _en} class-name suffix): calendar names, field names, and the
 * date/time patterns, day, month, quarter and era names used for the
 * "islamic", "roc", "buddhist" and "japanese" calendar keys.
 *
 * <p>The data is exposed as a key/value table through {@link #getContents()}.
 */
public class JavaTimeSupplementary_en extends OpenListResourceBundle {

    /**
     * Builds the key/value table for this bundle.
     *
     * <p>Arrays that are referenced by more than one key are created once and
     * the same instance is stored under each of those keys.
     *
     * @return a freshly built table; each row is a {@code {key, value}} pair
     *         where the value is either a {@code String} or a {@code String[]}
     */
    @Override
    protected final Object[][] getContents() {
        // ---- arrays referenced by several keys ----
        final String[] quarterNames = {
            "1st quarter",
            "2nd quarter",
            "3rd quarter",
            "4th quarter",
        };
        final String[] datePatterns = {
            "EEEE, MMMM d, y GGGG",
            "MMMM d, y GGGG",
            "MMM d, y GGGG",
            "M/d/y G",
        };
        final String[] dayNames = {
            "Sunday",
            "Monday",
            "Tuesday",
            "Wednesday",
            "Thursday",
            "Friday",
            "Saturday",
        };
        final String[] quarterAbbreviations = {
            "Q1",
            "Q2",
            "Q3",
            "Q4",
        };
        final String[] timePatterns = {
            "h:mm:ss a zzzz",
            "h:mm:ss a z",
            "h:mm:ss a",
            "h:mm a",
        };
        final String[] narrowAmPmMarkers = {
            "a",
            "p",
        };
        final String[] javaTimeDatePatterns = {
            "EEEE, MMMM d, y G",
            "MMMM d, y G",
            "MMM d, y G",
            "M/d/y GGGGG",
        };
        final String[] rocEras = {
            "Before R.O.C.",
            "Minguo",
        };

        // ---- arrays referenced by exactly one key ----
        final String[] islamicAmPmMarkers = {
            "AM",
            "PM",
        };
        final String[] buddhistShortEras = {
            "BC",
            "BE",
        };
        final String[] longEras = {
            "Before Christ",
            "Anno Domini",
        };
        final String[] rocDayAbbreviations = {
            "Sun",
            "Mon",
            "Tue",
            "Wed",
            "Thu",
            "Fri",
            "Sat",
        };
        // Month arrays hold twelve names followed by an empty 13th element.
        final String[] rocMonthNames = {
            "January",
            "February",
            "March",
            "April",
            "May",
            "June",
            "July",
            "August",
            "September",
            "October",
            "November",
            "December",
            "",
        };
        final String[] rocMonthNarrows = {
            "J",
            "F",
            "M",
            "A",
            "M",
            "J",
            "J",
            "A",
            "S",
            "O",
            "N",
            "D",
            "",
        };

        return new Object[][] {
            { "QuarterNames", quarterNames },

            // calendar display names
            { "calendarname.buddhist", "Buddhist Calendar" },
            { "calendarname.gregorian", "Gregorian Calendar" },
            { "calendarname.gregory", "Gregorian Calendar" },
            { "calendarname.islamic", "Islamic Calendar" },
            { "calendarname.islamic-civil", "Islamic Calendar (tabular, civil epoch)" },
            { "calendarname.islamic-umalqura", "Islamic Calendar (Umm al-Qura)" },
            { "calendarname.japanese", "Japanese Calendar" },
            { "calendarname.roc", "Minguo Calendar" },

            // field display names
            { "field.dayperiod", "AM/PM" },
            { "field.era", "era" },
            { "field.hour", "hour" },
            { "field.minute", "minute" },
            { "field.month", "month" },
            { "field.second", "second" },
            { "field.week", "week" },
            { "field.weekday", "day of the week" },
            { "field.year", "year" },
            { "field.zone", "time zone" },

            // "islamic" keys
            { "islamic.AmPmMarkers", islamicAmPmMarkers },
            { "islamic.DatePatterns", datePatterns },
            { "islamic.DayNames", dayNames },
            { "islamic.QuarterAbbreviations", quarterAbbreviations },
            { "islamic.QuarterNames", quarterNames },
            { "islamic.TimePatterns", timePatterns },
            { "islamic.narrow.AmPmMarkers", narrowAmPmMarkers },

            // "java.time" keys
            { "java.time.buddhist.DatePatterns", javaTimeDatePatterns },
            { "java.time.buddhist.short.Eras", buddhistShortEras },
            { "java.time.islamic.DatePatterns", javaTimeDatePatterns },
            { "java.time.japanese.DatePatterns", javaTimeDatePatterns },
            { "java.time.long.Eras", longEras },
            { "java.time.roc.DatePatterns", javaTimeDatePatterns },

            // "roc" keys
            { "roc.DatePatterns", datePatterns },
            { "roc.DayAbbreviations", rocDayAbbreviations },
            { "roc.DayNames", dayNames },
            { "roc.Eras", rocEras },
            { "roc.MonthNames", rocMonthNames },
            { "roc.MonthNarrows", rocMonthNarrows },
            { "roc.QuarterAbbreviations", quarterAbbreviations },
            { "roc.QuarterNames", quarterNames },
            { "roc.TimePatterns", timePatterns },
            { "roc.long.Eras", rocEras },
            { "roc.narrow.AmPmMarkers", narrowAmPmMarkers },
            { "roc.narrow.Eras", rocEras },
            { "roc.short.Eras", rocEras },
        };
    }
}

View file

@ -0,0 +1,35 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package sun.text.resources.cldr;
import java.util.spi.ResourceBundleProvider;
/**
* Service provider interface for the internal locale data provider that
* {@code ResourceBundle} searches for.
* <p>
* This interface declares no members of its own; it only extends
* {@link ResourceBundleProvider}, giving the provider a distinct type.
* NOTE(review): presumably this is the type under which the CLDR
* {@code FormatData} bundles of this package are located - confirm against
* the module's provides/uses declarations.
*/
public interface FormatDataProvider extends ResourceBundleProvider {
}

Binary file not shown.

View file

@ -0,0 +1,61 @@
/*
* Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package sun.text.spi;
import java.util.Locale;
import java.util.spi.LocaleServiceProvider;
/**
* Service Provider Interface for retrieving date-time patterns for java.time
* from a specified locale provider.
* <p>
* Concrete subclasses supply the patterns by implementing
* {@link #getJavaTimeDateTimePattern}.
*/
public abstract class JavaTimeDateTimePatternProvider extends LocaleServiceProvider {
/**
* Sole constructor. It is {@code protected}, so it is reachable only from
* subclasses and from code in this package.
*/
protected JavaTimeDateTimePatternProvider() {
}
/**
* Gets the formatting pattern for the given time style, date style,
* calendar type and locale.
* A concrete implementation of this method retrieves a java.time specific
* date-time pattern from the selected locale provider.
*
* @param timeStyle an {@code int} value representing a FormatStyle constant, or -1
* for a date-only pattern
* @param dateStyle an {@code int} value representing a FormatStyle constant, or -1
* for a time-only pattern
* @param calType a non-null {@code String} representing the calendar type, such as
* "japanese" or "iso8601"
* @param locale the {@code Locale}, non-null
* @return formatting pattern {@code String}
* @see java.time.format.DateTimeFormatterBuilder#convertStyle(java.time.format.FormatStyle)
* @since 9
*/
public abstract String getJavaTimeDateTimePattern(int timeStyle, int dateStyle, String calType, Locale locale);
}