mirror of
https://github.com/openjdk/jdk.git
synced 2025-08-27 06:45:07 +02:00
8311906: Improve robustness of String constructors with mutable array inputs
Co-authored-by: Damon Fenacci <dfenacci@openjdk.org> Co-authored-by: Claes Redestad <redestad@openjdk.org> Co-authored-by: Amit Kumar <amitkumar@openjdk.org> Co-authored-by: Martin Doerr <mdoerr@openjdk.org> Reviewed-by: rgiulietti, thartmann, redestad, dfenacci
This commit is contained in:
parent
316b78336c
commit
155abc576a
15 changed files with 1300 additions and 248 deletions
|
@ -130,6 +130,9 @@ abstract sealed class AbstractStringBuilder implements Appendable, CharSequence
|
|||
* as the specified {@code CharSequence}. The initial capacity of
|
||||
* the string builder is {@code 16} plus the length of the
|
||||
* {@code CharSequence} argument.
|
||||
* <p>
|
||||
* The contents are unspecified if the {@code CharSequence}
|
||||
* is modified during string construction.
|
||||
*
|
||||
* @param seq the sequence to copy.
|
||||
*/
|
||||
|
@ -666,6 +669,10 @@ abstract sealed class AbstractStringBuilder implements Appendable, CharSequence
|
|||
* If {@code s} is {@code null}, then this method appends
|
||||
* characters as if the s parameter was a sequence containing the four
|
||||
* characters {@code "null"}.
|
||||
* <p>
|
||||
* The contents are unspecified if the {@code CharSequence}
|
||||
* is modified during the method call or an exception is thrown
|
||||
* when accessing the {@code CharSequence}.
|
||||
*
|
||||
* @param s the sequence to append.
|
||||
* @param start the starting index of the subsequence to be appended.
|
||||
|
@ -1241,6 +1248,10 @@ abstract sealed class AbstractStringBuilder implements Appendable, CharSequence
|
|||
* invocation of this object's
|
||||
* {@link #insert(int,CharSequence,int,int) insert}(dstOffset, s, 0, s.length())
|
||||
* method.
|
||||
* <p>
|
||||
* The contents are unspecified if the {@code CharSequence}
|
||||
* is modified during the method call or an exception is thrown
|
||||
* when accessing the {@code CharSequence}.
|
||||
*
|
||||
* <p>If {@code s} is {@code null}, then the four characters
|
||||
* {@code "null"} are inserted into this sequence.
|
||||
|
@ -1289,6 +1300,10 @@ abstract sealed class AbstractStringBuilder implements Appendable, CharSequence
|
|||
* <p>If {@code s} is {@code null}, then this method inserts
|
||||
* characters as if the s parameter was a sequence containing the four
|
||||
* characters {@code "null"}.
|
||||
* <p>
|
||||
* The contents are unspecified if the {@code CharSequence}
|
||||
* is modified during the method call or an exception is thrown
|
||||
* when accessing the {@code CharSequence}.
|
||||
*
|
||||
* @param dstOffset the offset in this sequence.
|
||||
* @param s the sequence to be inserted.
|
||||
|
@ -1675,11 +1690,10 @@ abstract sealed class AbstractStringBuilder implements Appendable, CharSequence
|
|||
/* for readObject() */
|
||||
void initBytes(char[] value, int off, int len) {
|
||||
if (String.COMPACT_STRINGS) {
|
||||
this.value = StringUTF16.compress(value, off, len);
|
||||
if (this.value != null) {
|
||||
this.coder = LATIN1;
|
||||
return;
|
||||
}
|
||||
byte[] val = StringUTF16.compress(value, off, len);
|
||||
this.coder = StringUTF16.coderFromArrayLen(val, len);
|
||||
this.value = val;
|
||||
return;
|
||||
}
|
||||
this.coder = UTF16;
|
||||
this.value = StringUTF16.toBytes(value, off, len);
|
||||
|
@ -1720,6 +1734,9 @@ abstract sealed class AbstractStringBuilder implements Appendable, CharSequence
|
|||
val[j++] = (byte)c;
|
||||
} else {
|
||||
inflate();
|
||||
// store c to make sure it has a UTF16 char
|
||||
StringUTF16.putChar(this.value, j++, c);
|
||||
i++;
|
||||
StringUTF16.putCharsSB(this.value, j, s, i, end);
|
||||
return;
|
||||
}
|
||||
|
@ -1812,6 +1829,10 @@ abstract sealed class AbstractStringBuilder implements Appendable, CharSequence
|
|||
} else {
|
||||
count = j;
|
||||
inflate();
|
||||
// Store c to make sure sb has a UTF16 char
|
||||
StringUTF16.putChar(this.value, j++, c);
|
||||
count = j;
|
||||
i++;
|
||||
StringUTF16.putCharsSB(this.value, j, s, i, end);
|
||||
count += end - i;
|
||||
return;
|
||||
|
@ -1923,6 +1944,10 @@ abstract sealed class AbstractStringBuilder implements Appendable, CharSequence
|
|||
* <p>
|
||||
* If {@code cs} is {@code null}, then the four characters
|
||||
* {@code "null"} are repeated into this sequence.
|
||||
* <p>
|
||||
* The contents are unspecified if the {@code CharSequence}
|
||||
* is modified during the method call or an exception is thrown
|
||||
* when accessing the {@code CharSequence}.
|
||||
*
|
||||
* @param cs a {@code CharSequence}
|
||||
* @param count number of times to copy
|
||||
|
|
|
@ -57,6 +57,10 @@ public interface Appendable {
|
|||
* {@code csq}, the entire sequence may not be appended. For
|
||||
* instance, if {@code csq} is a {@link java.nio.CharBuffer} then
|
||||
* the subsequence to append is defined by the buffer's position and limit.
|
||||
* <p>
|
||||
* The contents of this {@code Appendable} are unspecified if the {@code CharSequence}
|
||||
* is modified during the method call or an exception is thrown
|
||||
* when accessing the {@code CharSequence}.
|
||||
*
|
||||
* @param csq
|
||||
* The character sequence to append. If {@code csq} is
|
||||
|
@ -81,6 +85,10 @@ public interface Appendable {
|
|||
* <pre>
|
||||
* out.append(csq.subSequence(start, end)) </pre>
|
||||
*
|
||||
* <p>
|
||||
* The contents of this {@code Appendable} are unspecified if the {@code CharSequence}
|
||||
* is modified during the method call or an exception is thrown
|
||||
* when accessing the {@code CharSequence}.
|
||||
* @param csq
|
||||
* The character sequence from which a subsequence will be
|
||||
* appended. If {@code csq} is {@code null}, then characters
|
||||
|
|
|
@ -273,6 +273,9 @@ public final class String
|
|||
* contents of the character array are copied; subsequent modification of
|
||||
* the character array does not affect the newly created string.
|
||||
*
|
||||
* <p> The contents of the string are unspecified if the character array
|
||||
* is modified during string construction.
|
||||
*
|
||||
* @param value
|
||||
* The initial value of the string
|
||||
*/
|
||||
|
@ -288,6 +291,9 @@ public final class String
|
|||
* subarray are copied; subsequent modification of the character array does
|
||||
* not affect the newly created string.
|
||||
*
|
||||
* <p> The contents of the string are unspecified if the character array
|
||||
* is modified during string construction.
|
||||
*
|
||||
* @param value
|
||||
* Array that is the source of characters
|
||||
*
|
||||
|
@ -319,6 +325,9 @@ public final class String
|
|||
* {@code char}s; subsequent modification of the {@code int} array does not
|
||||
* affect the newly created string.
|
||||
*
|
||||
* <p> The contents of the string are unspecified if the codepoints array
|
||||
* is modified during string construction.
|
||||
*
|
||||
* @param codePoints
|
||||
* Array that is the source of Unicode code points
|
||||
*
|
||||
|
@ -346,12 +355,10 @@ public final class String
|
|||
return;
|
||||
}
|
||||
if (COMPACT_STRINGS) {
|
||||
byte[] val = StringLatin1.toBytes(codePoints, offset, count);
|
||||
if (val != null) {
|
||||
this.coder = LATIN1;
|
||||
this.value = val;
|
||||
return;
|
||||
}
|
||||
byte[] val = StringUTF16.compress(codePoints, offset, count);
|
||||
this.coder = StringUTF16.coderFromArrayLen(val, count);
|
||||
this.value = val;
|
||||
return;
|
||||
}
|
||||
this.coder = UTF16;
|
||||
this.value = StringUTF16.toBytes(codePoints, offset, count);
|
||||
|
@ -368,6 +375,9 @@ public final class String
|
|||
* <p> Each {@code byte} in the subarray is converted to a {@code char} as
|
||||
* specified in the {@link #String(byte[],int) String(byte[],int)} constructor.
|
||||
*
|
||||
* <p> The contents of the string are unspecified if the byte array
|
||||
* is modified during string construction.
|
||||
*
|
||||
* @deprecated This method does not properly convert bytes into characters.
|
||||
* As of JDK 1.1, the preferred way to do this is via the
|
||||
* {@code String} constructors that take a {@link Charset}, charset name,
|
||||
|
@ -429,6 +439,9 @@ public final class String
|
|||
* | (<b><i>b</i></b> & 0xff))
|
||||
* </pre></blockquote>
|
||||
*
|
||||
* <p> The contents of the string are unspecified if the byte array
|
||||
* is modified during string construction.
|
||||
*
|
||||
* @deprecated This method does not properly convert bytes into
|
||||
* characters. As of JDK 1.1, the preferred way to do this is via the
|
||||
* {@code String} constructors that take a {@link Charset}, charset name,
|
||||
|
@ -463,6 +476,9 @@ public final class String
|
|||
* java.nio.charset.CharsetDecoder} class should be used when more control
|
||||
* over the decoding process is required.
|
||||
*
|
||||
* <p> The contents of the string are unspecified if the byte array
|
||||
* is modified during string construction.
|
||||
*
|
||||
* @param bytes
|
||||
* The bytes to be decoded into characters
|
||||
*
|
||||
|
@ -501,6 +517,9 @@ public final class String
|
|||
* java.nio.charset.CharsetDecoder} class should be used when more control
|
||||
* over the decoding process is required.
|
||||
*
|
||||
* <p> The contents of the string are unspecified if the byte array
|
||||
* is modified during string construction.
|
||||
*
|
||||
* @param bytes
|
||||
* The bytes to be decoded into characters
|
||||
*
|
||||
|
@ -543,47 +562,43 @@ public final class String
|
|||
this.coder = LATIN1;
|
||||
return;
|
||||
}
|
||||
int sl = offset + length;
|
||||
byte[] dst = new byte[length];
|
||||
if (dp > 0) {
|
||||
System.arraycopy(bytes, offset, dst, 0, dp);
|
||||
offset += dp;
|
||||
}
|
||||
while (offset < sl) {
|
||||
int b1 = bytes[offset++];
|
||||
// Decode with a stable copy, to be the result if the decoded length is the same
|
||||
byte[] latin1 = Arrays.copyOfRange(bytes, offset, offset + length);
|
||||
int sp = dp; // first dp bytes are already in the copy
|
||||
while (sp < length) {
|
||||
int b1 = latin1[sp++];
|
||||
if (b1 >= 0) {
|
||||
dst[dp++] = (byte)b1;
|
||||
latin1[dp++] = (byte)b1;
|
||||
continue;
|
||||
}
|
||||
if ((b1 & 0xfe) == 0xc2 && offset < sl) { // b1 either 0xc2 or 0xc3
|
||||
int b2 = bytes[offset];
|
||||
if ((b1 & 0xfe) == 0xc2 && sp < length) { // b1 either 0xc2 or 0xc3
|
||||
int b2 = latin1[sp];
|
||||
if (b2 < -64) { // continuation bytes are always negative values in the range -128 to -65
|
||||
dst[dp++] = (byte)decode2(b1, b2);
|
||||
offset++;
|
||||
latin1[dp++] = (byte)decode2(b1, b2);
|
||||
sp++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
// anything not a latin1, including the REPL
|
||||
// we have to go with the utf16
|
||||
offset--;
|
||||
sp--;
|
||||
break;
|
||||
}
|
||||
if (offset == sl) {
|
||||
if (dp != dst.length) {
|
||||
dst = Arrays.copyOf(dst, dp);
|
||||
if (sp == length) {
|
||||
if (dp != latin1.length) {
|
||||
latin1 = Arrays.copyOf(latin1, dp);
|
||||
}
|
||||
this.value = dst;
|
||||
this.value = latin1;
|
||||
this.coder = LATIN1;
|
||||
return;
|
||||
}
|
||||
byte[] buf = new byte[length << 1];
|
||||
StringLatin1.inflate(dst, 0, buf, 0, dp);
|
||||
dst = buf;
|
||||
dp = decodeUTF8_UTF16(bytes, offset, sl, dst, dp, true);
|
||||
byte[] utf16 = new byte[length << 1];
|
||||
StringLatin1.inflate(latin1, 0, utf16, 0, dp);
|
||||
dp = decodeUTF8_UTF16(latin1, sp, length, utf16, dp, true);
|
||||
if (dp != length) {
|
||||
dst = Arrays.copyOf(dst, dp << 1);
|
||||
utf16 = Arrays.copyOf(utf16, dp << 1);
|
||||
}
|
||||
this.value = dst;
|
||||
this.value = utf16;
|
||||
this.coder = UTF16;
|
||||
} else { // !COMPACT_STRINGS
|
||||
byte[] dst = new byte[length << 1];
|
||||
|
@ -655,12 +670,10 @@ public final class String
|
|||
char[] ca = new char[en];
|
||||
int clen = ad.decode(bytes, offset, length, ca);
|
||||
if (COMPACT_STRINGS) {
|
||||
byte[] bs = StringUTF16.compress(ca, 0, clen);
|
||||
if (bs != null) {
|
||||
value = bs;
|
||||
coder = LATIN1;
|
||||
return;
|
||||
}
|
||||
byte[] val = StringUTF16.compress(ca, 0, clen);;
|
||||
this.coder = StringUTF16.coderFromArrayLen(val, clen);
|
||||
this.value = val;
|
||||
return;
|
||||
}
|
||||
coder = UTF16;
|
||||
value = StringUTF16.toBytes(ca, 0, clen);
|
||||
|
@ -686,12 +699,10 @@ public final class String
|
|||
throw new Error(x);
|
||||
}
|
||||
if (COMPACT_STRINGS) {
|
||||
byte[] bs = StringUTF16.compress(ca, 0, caLen);
|
||||
if (bs != null) {
|
||||
value = bs;
|
||||
coder = LATIN1;
|
||||
return;
|
||||
}
|
||||
byte[] val = StringUTF16.compress(ca, 0, caLen);
|
||||
this.coder = StringUTF16.coderFromArrayLen(val, caLen);
|
||||
this.value = val;
|
||||
return;
|
||||
}
|
||||
coder = UTF16;
|
||||
value = StringUTF16.toBytes(ca, 0, caLen);
|
||||
|
@ -829,10 +840,9 @@ public final class String
|
|||
throw new IllegalArgumentException(x);
|
||||
}
|
||||
if (COMPACT_STRINGS) {
|
||||
byte[] bs = StringUTF16.compress(ca, 0, caLen);
|
||||
if (bs != null) {
|
||||
return new String(bs, LATIN1);
|
||||
}
|
||||
byte[] val = StringUTF16.compress(ca, 0, caLen);
|
||||
int coder = StringUTF16.coderFromArrayLen(val, len);
|
||||
return new String(val, coder);
|
||||
}
|
||||
return new String(StringUTF16.toBytes(ca, 0, caLen), UTF16);
|
||||
}
|
||||
|
@ -1386,6 +1396,9 @@ public final class String
|
|||
* java.nio.charset.CharsetDecoder} class should be used when more control
|
||||
* over the decoding process is required.
|
||||
*
|
||||
* <p> The contents of the string are unspecified if the byte array
|
||||
* is modified during string construction.
|
||||
*
|
||||
* @param bytes
|
||||
* The bytes to be decoded into characters
|
||||
*
|
||||
|
@ -1414,6 +1427,9 @@ public final class String
|
|||
* java.nio.charset.CharsetDecoder} class should be used when more control
|
||||
* over the decoding process is required.
|
||||
*
|
||||
* <p> The contents of the string are unspecified if the byte array
|
||||
* is modified during string construction.
|
||||
*
|
||||
* @param bytes
|
||||
* The bytes to be decoded into characters
|
||||
*
|
||||
|
@ -1438,6 +1454,9 @@ public final class String
|
|||
* java.nio.charset.CharsetDecoder} class should be used when more control
|
||||
* over the decoding process is required.
|
||||
*
|
||||
* <p> The contents of the string are unspecified if the byte array
|
||||
* is modified during string construction.
|
||||
*
|
||||
* @param bytes
|
||||
* The bytes to be decoded into characters
|
||||
*
|
||||
|
@ -1468,6 +1487,9 @@ public final class String
|
|||
* java.nio.charset.CharsetDecoder} class should be used when more control
|
||||
* over the decoding process is required.
|
||||
*
|
||||
* <p> The contents of the string are unspecified if the byte array
|
||||
* is modified during string construction.
|
||||
*
|
||||
* @param bytes
|
||||
* The bytes to be decoded into characters
|
||||
*
|
||||
|
@ -1496,6 +1518,9 @@ public final class String
|
|||
* string builder are copied; subsequent modification of the string builder
|
||||
* does not affect the newly created string.
|
||||
*
|
||||
* <p> The contents of the string are unspecified if the {@code StringBuilder}
|
||||
* is modified during string construction.
|
||||
*
|
||||
* <p> This constructor is provided to ease migration to {@code
|
||||
* StringBuilder}. Obtaining a string from a string builder via the {@code
|
||||
* toString} method is likely to run faster and is generally preferred.
|
||||
|
@ -4488,6 +4513,9 @@ public final class String
|
|||
* modification of the character array does not affect the returned
|
||||
* string.
|
||||
*
|
||||
* <p> The contents of the string are unspecified if the character array
|
||||
* is modified during string construction.
|
||||
*
|
||||
* @param data the character array.
|
||||
* @return a {@code String} that contains the characters of the
|
||||
* character array.
|
||||
|
@ -4506,6 +4534,9 @@ public final class String
|
|||
* are copied; subsequent modification of the character array does not
|
||||
* affect the returned string.
|
||||
*
|
||||
* <p> The contents of the string are unspecified if the character array
|
||||
* is modified during string construction.
|
||||
*
|
||||
* @param data the character array.
|
||||
* @param offset initial offset of the subarray.
|
||||
* @param count length of the subarray.
|
||||
|
@ -4767,15 +4798,18 @@ public final class String
|
|||
}
|
||||
|
||||
/*
|
||||
* Package private constructor. Trailing Void argument is there for
|
||||
* Private constructor. Trailing Void argument is there for
|
||||
* disambiguating it against other (public) constructors.
|
||||
*
|
||||
* Stores the char[] value into a byte[] that each byte represents
|
||||
* the 8 low-order bits of the corresponding character, if the char[]
|
||||
* contains only latin1 characters. Or a byte[] that stores all
|
||||
* characters in their byte sequences defined by the {@code StringUTF16}.
|
||||
*
|
||||
* <p> The contents of the string are unspecified if the character array
|
||||
* is modified during string construction.
|
||||
*/
|
||||
String(char[] value, int off, int len, Void sig) {
|
||||
private String(char[] value, int off, int len, Void sig) {
|
||||
if (len == 0) {
|
||||
this.value = "".value;
|
||||
this.coder = "".coder;
|
||||
|
@ -4783,11 +4817,9 @@ public final class String
|
|||
}
|
||||
if (COMPACT_STRINGS) {
|
||||
byte[] val = StringUTF16.compress(value, off, len);
|
||||
if (val != null) {
|
||||
this.value = val;
|
||||
this.coder = LATIN1;
|
||||
return;
|
||||
}
|
||||
this.coder = StringUTF16.coderFromArrayLen(val, len);
|
||||
this.value = val;
|
||||
return;
|
||||
}
|
||||
this.coder = UTF16;
|
||||
this.value = StringUTF16.toBytes(value, off, len);
|
||||
|
@ -4796,6 +4828,9 @@ public final class String
|
|||
/*
|
||||
* Package private constructor. Trailing Void argument is there for
|
||||
* disambiguating it against other (public) constructors.
|
||||
*
|
||||
* <p> The contents of the string are unspecified if the {@code StringBuilder}
|
||||
* is modified during string construction.
|
||||
*/
|
||||
String(AbstractStringBuilder asb, Void sig) {
|
||||
byte[] val = asb.getValue();
|
||||
|
@ -4806,12 +4841,9 @@ public final class String
|
|||
} else {
|
||||
// only try to compress val if some characters were deleted.
|
||||
if (COMPACT_STRINGS && asb.maybeLatin1) {
|
||||
byte[] buf = StringUTF16.compress(val, 0, length);
|
||||
if (buf != null) {
|
||||
this.coder = LATIN1;
|
||||
this.value = buf;
|
||||
return;
|
||||
}
|
||||
this.value = StringUTF16.compress(val, 0, length);
|
||||
this.coder = StringUTF16.coderFromArrayLen(this.value, length);
|
||||
return;
|
||||
}
|
||||
this.coder = UTF16;
|
||||
this.value = Arrays.copyOfRange(val, 0, length << 1);
|
||||
|
|
|
@ -47,8 +47,12 @@ final class StringLatin1 {
|
|||
return (char)(value[index] & 0xff);
|
||||
}
|
||||
|
||||
public static boolean canEncode(char cp) {
|
||||
return cp <= 0xff;
|
||||
}
|
||||
|
||||
public static boolean canEncode(int cp) {
|
||||
return cp >>> 8 == 0;
|
||||
return cp >=0 && cp <= 0xff;
|
||||
}
|
||||
|
||||
public static int length(byte[] value) {
|
||||
|
|
|
@ -34,7 +34,6 @@ import java.util.stream.Stream;
|
|||
import java.util.stream.StreamSupport;
|
||||
import jdk.internal.util.ArraysSupport;
|
||||
import jdk.internal.util.DecimalDigits;
|
||||
import jdk.internal.vm.annotation.DontInline;
|
||||
import jdk.internal.vm.annotation.ForceInline;
|
||||
import jdk.internal.vm.annotation.IntrinsicCandidate;
|
||||
|
||||
|
@ -54,6 +53,19 @@ final class StringUTF16 {
|
|||
return new byte[len << 1];
|
||||
}
|
||||
|
||||
// Check the size of a UTF16-coded string
|
||||
// Throw an exception if out of range
|
||||
public static int newBytesLength(int len) {
|
||||
if (len < 0) {
|
||||
throw new NegativeArraySizeException();
|
||||
}
|
||||
if (len > MAX_LENGTH) {
|
||||
throw new OutOfMemoryError("UTF16 String size is " + len +
|
||||
", should be less than " + MAX_LENGTH);
|
||||
}
|
||||
return len << 1;
|
||||
}
|
||||
|
||||
@IntrinsicCandidate
|
||||
// intrinsic performs no bounds checks
|
||||
static void putChar(byte[] val, int index, int c) {
|
||||
|
@ -148,6 +160,13 @@ final class StringUTF16 {
|
|||
return dst;
|
||||
}
|
||||
|
||||
/**
|
||||
* {@return an encoded byte[] for the UTF16 characters in char[]}
|
||||
* No checking is done on the characters, some may or may not be latin1.
|
||||
* @param value a char array
|
||||
* @param off an offset
|
||||
* @param len a length
|
||||
*/
|
||||
@IntrinsicCandidate
|
||||
public static byte[] toBytes(char[] value, int off, int len) {
|
||||
byte[] val = newBytesFor(len);
|
||||
|
@ -158,20 +177,209 @@ final class StringUTF16 {
|
|||
return val;
|
||||
}
|
||||
|
||||
public static byte[] compress(char[] val, int off, int len) {
|
||||
byte[] ret = new byte[len];
|
||||
if (compress(val, off, ret, 0, len) == len) {
|
||||
return ret;
|
||||
}
|
||||
return null;
|
||||
// Clever way to get the coder from a byte array returned from compress
|
||||
// that may be either latin1 or UTF16-coded
|
||||
// Equivalent to (len == val.length) ? LATIN1 : UTF16
|
||||
@ForceInline
|
||||
static byte coderFromArrayLen(byte[] value, int len) {
|
||||
return (byte) ((len - value.length) >>> Integer.SIZE - 1);
|
||||
}
|
||||
|
||||
public static byte[] compress(byte[] val, int off, int len) {
|
||||
byte[] ret = new byte[len];
|
||||
if (compress(val, off, ret, 0, len) == len) {
|
||||
return ret;
|
||||
/**
|
||||
* {@return a compact strings byte array compressed from the char array (containing UTF16)}
|
||||
* If all the chars are LATIN1, it returns an array with len == count,
|
||||
* otherwise, it contains UTF16 characters.
|
||||
* <p>
|
||||
* A UTF16 array is returned *only* if at least 1 non-latin1 character is present.
|
||||
* This must be true even if the input array is modified while this method is executing.
|
||||
* This is assured by copying the characters while checking for latin1.
|
||||
* If all characters are latin1, a byte array with length equal to count is returned,
|
||||
* indicating all latin1 chars. The scan may be implemented as an intrinsic,
|
||||
* which returns the index of the first non-latin1 character.
|
||||
* When the first non-latin1 character is found, it switches to creating a new
|
||||
* buffer; the saved prefix of latin1 characters is copied to the new buffer;
|
||||
* and the remaining input characters are copied to the buffer.
|
||||
* The index of the known non-latin1 character is checked, if it is latin1,
|
||||
* the input has been changed. In this case, a second attempt is made to compress to
|
||||
* latin1 from the copy made in the first pass to the originally allocated latin1 buffer.
|
||||
* If it succeeds the return value is latin1, otherwise, the utf16 value is returned.
|
||||
* In this unusual case, the result is correct for the snapshot of the value.
|
||||
* The resulting string contents are unspecified if the input array is modified during this
|
||||
* operation, but it is ensured that at least 1 non-latin1 character is present in
|
||||
* the non-latin1 buffer.
|
||||
*
|
||||
* @param val a char array
|
||||
* @param off starting offset
|
||||
* @param count count of chars to be compressed, {@code count} > 0
|
||||
*/
|
||||
@ForceInline
|
||||
public static byte[] compress(final char[] val, final int off, final int count) {
|
||||
byte[] latin1 = new byte[count];
|
||||
int ndx = compress(val, off, latin1, 0, count);
|
||||
if (ndx != count) {
|
||||
// Switch to UTF16
|
||||
byte[] utf16 = toBytes(val, off, count);
|
||||
// If the original character that was found to be non-latin1 is latin1 in the copy
|
||||
// try to make a latin1 string from the copy
|
||||
if (getChar(utf16, ndx) > 0xff
|
||||
|| compress(utf16, 0, latin1, 0, count) != count) {
|
||||
return utf16;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
return latin1; // latin1 success
|
||||
}
|
||||
|
||||
/**
|
||||
* {@return a compact strings byte array compressed from the internal byte array (containing UTF16)}
|
||||
* If all the chars are LATIN1, it returns an array with len == count,
|
||||
* otherwise, it contains UTF16 characters.
|
||||
* <p>
|
||||
* Refer to the description of the algorithm in {@link #compress(char[], int, int)}.
|
||||
*
|
||||
* @param val a byte array with UTF16 coding
|
||||
* @param off starting offset
|
||||
* @param count count of chars to be compressed, {@code count} > 0
|
||||
*/
|
||||
public static byte[] compress(final byte[] val, final int off, final int count) {
|
||||
byte[] latin1 = new byte[count];
|
||||
int ndx = compress(val, off, latin1, 0, count);
|
||||
if (ndx != count) {// Switch to UTF16
|
||||
byte[] utf16 = Arrays.copyOfRange(val, off << 1, newBytesLength(off + count));
|
||||
// If the original character that was found to be non-latin1 is latin1 in the copy
|
||||
// try to make a latin1 string from the copy
|
||||
if (getChar(utf16, ndx) > 0xff
|
||||
|| compress(utf16, 0, latin1, 0, count) != count) {
|
||||
return utf16;
|
||||
}
|
||||
}
|
||||
return latin1; // latin1 success
|
||||
}
|
||||
|
||||
/**
|
||||
* {@return a compact strings byte array compressed from the code points}
|
||||
* If all the chars are LATIN1, returns an array with len == count.
|
||||
* If not, a new byte array is allocated and code points converted to UTF16.
|
||||
* The algorithm is similar to that of {@link #compress(char[], int, int)}.
|
||||
* <p>
|
||||
* The resulting encoding is attempted in several steps:
|
||||
* <UL>
|
||||
* <LI>If no non-latin1 characters are found, the encoding is latin1</LI>
|
||||
* <LI>If an estimate of the number of characters needed to represent the codepoints is
|
||||
* equal to the string length, they are all BMP with at least 1 UTF16 character
|
||||
* and are copied to the result. </LI>
|
||||
* <LI>The extractCodePoints method is called to carefully expand surrogates. </LI>
|
||||
* </UL>
|
||||
*
|
||||
* @param val an int array of code points
|
||||
* @param off starting offset
|
||||
* @param count length of code points to be compressed, length > 0
|
||||
*/
|
||||
public static byte[] compress(final int[] val, int off, final int count) {
|
||||
// Optimistically copy all latin1 code points to the destination
|
||||
byte[] latin1 = new byte[count];
|
||||
final int end = off + count;
|
||||
for (int ndx = 0; ndx < count; ndx++, off++) {
|
||||
int cp = val[off];
|
||||
if (cp >= 0 && cp <= 0xff) {
|
||||
latin1[ndx] = (byte)cp;
|
||||
} else {
|
||||
// Pass 1: Compute precise size of char[]; see extractCodepoints for caveat
|
||||
int estSize = ndx + computeCodePointSize(val, off, end);
|
||||
|
||||
// Pass 2: Switch to UTF16
|
||||
// cp (read from val[off]) is a code point known to be non-latin1, so UTF16 is needed
|
||||
byte[] utf16 = newBytesFor(estSize);
|
||||
if (ndx > 0) {
|
||||
StringLatin1.inflate(latin1, 0, utf16, 0, ndx); // inflate latin1 bytes
|
||||
}
|
||||
|
||||
if (estSize == count) {
|
||||
// Based on the computed size, all remaining code points are BMP and
|
||||
// can be copied without checking again
|
||||
putChar(utf16, ndx, cp); // ensure utf16 has a UTF16 char
|
||||
off++;
|
||||
for (int i = ndx + 1; i < count; i++, off++) {
|
||||
putChar(utf16, i, val[off]);
|
||||
}
|
||||
} else {
|
||||
// Some codepoint is a surrogate pair
|
||||
utf16 = extractCodepoints(val, off, end, utf16, ndx);
|
||||
|
||||
// The original character that was found to be UTF16 is not UTF16 in the copy
|
||||
// Try to make a latin1 string from the copy
|
||||
if (getChar(utf16, ndx) <= 0xff &&
|
||||
compress(utf16, 0, latin1, 0, count) == count) {
|
||||
return latin1; // latin1 success
|
||||
}
|
||||
}
|
||||
return utf16;
|
||||
}
|
||||
}
|
||||
return latin1; // Latin1 success
|
||||
}
|
||||
|
||||
// Extract code points into chars in the byte array
|
||||
//
|
||||
// Guard against possible races with the input array changing between the previous
|
||||
// computation of the required output size and storing the bmp or surrogates.
|
||||
// If a BMP code point is changed to a supplementary code point it would require 2 chars
|
||||
// in the output. Changing a supplementary char to BMP would reduce the size.
|
||||
// If the utf16 destination is not large enough, it is resized to fit the
|
||||
// remaining codepoints assuming they occupy 2 characters.
|
||||
// The destination may be copied to return exactly the final length.
|
||||
// The additional allocations and compression only occur if the input array is modified.
|
||||
private static byte[] extractCodepoints(int[] val, int off, int end, byte[] dst, int dstOff) {
|
||||
while (off < end) {
|
||||
// Compute a minimum estimate of the number of characters that can be put into the dst
|
||||
// given the current codepoint and the number of remaining codepoints
|
||||
int codePoint = val[off]; // read each codepoint from val only once
|
||||
int dstLimit = dstOff
|
||||
+ Character.charCount(codePoint)
|
||||
+ (end - off - 1);
|
||||
if (dstLimit > (dst.length >> 1)) {
|
||||
// Resize to hold the remaining codepoints assuming they are all surrogates.
|
||||
// By resizing to the maximum that might be needed, only a single resize will occur.
|
||||
// dstLimit includes only a single char per codepoint, pad with an additional for each.
|
||||
int maxRemaining = dstLimit + (end - off - 1);
|
||||
dst = Arrays.copyOf(dst, newBytesLength(maxRemaining));
|
||||
}
|
||||
// Efficiently copy as many codepoints as fit within the current estimated limit
|
||||
// The dst has at least enough space for the current codepoint.
|
||||
while (true) {
|
||||
if (Character.isBmpCodePoint(codePoint)) {
|
||||
putChar(dst, dstOff++, codePoint);
|
||||
} else {
|
||||
putChar(dst, dstOff++, Character.highSurrogate(codePoint));
|
||||
putChar(dst, dstOff++, Character.lowSurrogate(codePoint));
|
||||
}
|
||||
off++;
|
||||
if (dstOff + 2 > dstLimit)
|
||||
break; // no space for another surrogate; recompute limit
|
||||
codePoint = val[off];
|
||||
}
|
||||
}
|
||||
if (dstOff != (dst.length >> 1)) {
|
||||
// Truncate to actual length; should only occur if a codepoint was racily
|
||||
// changed from a surrogate to a BMP character.
|
||||
return Arrays.copyOf(dst, newBytesLength(dstOff));
|
||||
}
|
||||
return dst;
|
||||
}
|
||||
|
||||
// Compute the number of chars needed to represent the code points from off to end-1
|
||||
private static int computeCodePointSize(int[] val, int off, int end) {
|
||||
int n = end - off;
|
||||
while (off < end) {
|
||||
int codePoint = val[off++];
|
||||
if (Character.isBmpCodePoint(codePoint)) {
|
||||
continue;
|
||||
} else if (Character.isValidCodePoint(codePoint)) {
|
||||
n++;
|
||||
} else {
|
||||
throw new IllegalArgumentException(Integer.toString(codePoint));
|
||||
}
|
||||
}
|
||||
return n;
|
||||
}
|
||||
|
||||
// compressedCopy char[] -> byte[]
|
||||
|
@ -179,9 +387,8 @@ final class StringUTF16 {
|
|||
public static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
|
||||
for (int i = 0; i < len; i++) {
|
||||
char c = src[srcOff];
|
||||
if (c > 0xFF) {
|
||||
len = 0;
|
||||
break;
|
||||
if (c > 0xff) {
|
||||
return i; // return index of non-latin1 char
|
||||
}
|
||||
dst[dstOff] = (byte)c;
|
||||
srcOff++;
|
||||
|
@ -197,9 +404,8 @@ final class StringUTF16 {
|
|||
checkBoundsOffCount(srcOff, len, src);
|
||||
for (int i = 0; i < len; i++) {
|
||||
char c = getChar(src, srcOff);
|
||||
if (c > 0xFF) {
|
||||
len = 0;
|
||||
break;
|
||||
if (c > 0xff) {
|
||||
return i; // return index of non-latin1 char
|
||||
}
|
||||
dst[dstOff] = (byte)c;
|
||||
srcOff++;
|
||||
|
@ -208,31 +414,14 @@ final class StringUTF16 {
|
|||
return len;
|
||||
}
|
||||
|
||||
// Create the UTF16 buffer for !COMPACT_STRINGS
|
||||
public static byte[] toBytes(int[] val, int index, int len) {
|
||||
final int end = index + len;
|
||||
// Pass 1: Compute precise size of char[]
|
||||
int n = len;
|
||||
for (int i = index; i < end; i++) {
|
||||
int cp = val[i];
|
||||
if (Character.isBmpCodePoint(cp))
|
||||
continue;
|
||||
else if (Character.isValidCodePoint(cp))
|
||||
n++;
|
||||
else throw new IllegalArgumentException(Integer.toString(cp));
|
||||
}
|
||||
// Pass 2: Allocate and fill in <high, low> pair
|
||||
int n = computeCodePointSize(val, index, end);
|
||||
|
||||
byte[] buf = newBytesFor(n);
|
||||
for (int i = index, j = 0; i < end; i++, j++) {
|
||||
int cp = val[i];
|
||||
if (Character.isBmpCodePoint(cp)) {
|
||||
putChar(buf, j, cp);
|
||||
} else {
|
||||
putChar(buf, j++, Character.highSurrogate(cp));
|
||||
putChar(buf, j, Character.lowSurrogate(cp));
|
||||
}
|
||||
}
|
||||
return buf;
|
||||
}
|
||||
return extractCodepoints(val, index, len, buf, 0);
|
||||
}
|
||||
|
||||
public static byte[] toBytes(char c) {
|
||||
byte[] result = new byte[2];
|
||||
|
@ -653,10 +842,9 @@ final class StringUTF16 {
|
|||
if (String.COMPACT_STRINGS &&
|
||||
!StringLatin1.canEncode(oldChar) &&
|
||||
StringLatin1.canEncode(newChar)) {
|
||||
byte[] val = compress(buf, 0, len);
|
||||
if (val != null) {
|
||||
return new String(val, LATIN1);
|
||||
}
|
||||
byte[] res = StringUTF16.compress(buf, 0, len);
|
||||
byte coder = StringUTF16.coderFromArrayLen(res, len);
|
||||
return new String(res, coder);
|
||||
}
|
||||
return new String(buf, UTF16);
|
||||
}
|
||||
|
@ -771,10 +959,9 @@ final class StringUTF16 {
|
|||
|
||||
if (String.COMPACT_STRINGS && replLat1 && !targLat1) {
|
||||
// combination 6
|
||||
byte[] lat1Result = compress(result, 0, resultLen);
|
||||
if (lat1Result != null) {
|
||||
return new String(lat1Result, LATIN1);
|
||||
}
|
||||
byte[] res = StringUTF16.compress(result, 0, resultLen);
|
||||
byte coder = StringUTF16.coderFromArrayLen(res, resultLen);
|
||||
return new String(res, coder); // combination 6
|
||||
}
|
||||
return new String(result, UTF16);
|
||||
}
|
||||
|
@ -838,7 +1025,7 @@ final class StringUTF16 {
|
|||
bits |= cp;
|
||||
putChar(result, i, cp);
|
||||
}
|
||||
if (bits > 0xFF) {
|
||||
if (bits < 0 || bits > 0xff) {
|
||||
return new String(result, UTF16);
|
||||
} else {
|
||||
return newString(result, 0, len);
|
||||
|
@ -939,7 +1126,7 @@ final class StringUTF16 {
|
|||
bits |= cp;
|
||||
putChar(result, i, cp);
|
||||
}
|
||||
if (bits > 0xFF) {
|
||||
if (bits < 0 || bits > 0xff) {
|
||||
return new String(result, UTF16);
|
||||
} else {
|
||||
return newString(result, 0, len);
|
||||
|
@ -1168,10 +1355,9 @@ final class StringUTF16 {
|
|||
return "";
|
||||
}
|
||||
if (String.COMPACT_STRINGS) {
|
||||
byte[] buf = compress(val, index, len);
|
||||
if (buf != null) {
|
||||
return new String(buf, LATIN1);
|
||||
}
|
||||
byte[] res = StringUTF16.compress(val, index, len);
|
||||
byte coder = StringUTF16.coderFromArrayLen(res, len);
|
||||
return new String(res, coder);
|
||||
}
|
||||
int last = index + len;
|
||||
return new String(Arrays.copyOfRange(val, index << 1, last << 1), UTF16);
|
||||
|
@ -1502,8 +1688,8 @@ final class StringUTF16 {
|
|||
|
||||
private static native boolean isBigEndian();
|
||||
|
||||
static final int HI_BYTE_SHIFT;
|
||||
static final int LO_BYTE_SHIFT;
|
||||
private static final int HI_BYTE_SHIFT;
|
||||
private static final int LO_BYTE_SHIFT;
|
||||
static {
|
||||
if (isBigEndian()) {
|
||||
HI_BYTE_SHIFT = 8;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue