8141132: JEP 254: Compact Strings

Adopt a more space-efficient internal representation for strings.

Co-authored-by: Brent Christian <brent.christian@oracle.com>
Co-authored-by: Vivek Deshpande <vivek.r.deshpande@intel.com>
Co-authored-by: Charlie Hunt <charlie.hunt@oracle.com>
Co-authored-by: Vladimir Kozlov <vladimir.kozlov@oracle.com>
Co-authored-by: Roger Riggs <roger.riggs@oracle.com>
Co-authored-by: Xueming Shen <xueming.shen@oracle.com>
Co-authored-by: Aleksey Shipilev <aleksey.shipilev@oracle.com>
Co-authored-by: Sandhya Viswanathan <sandhya.viswanathan@intel.com>
Reviewed-by: alanb, bdelsart, coleenp, iklam, jiangli, jrose, kevinw, naoto, pliden, roland, smarks, twisti
This commit is contained in:
Tobias Hartmann 2015-11-03 09:42:11 +01:00
parent b7dca6971d
commit 4ed5b73f3d
76 changed files with 8755 additions and 1288 deletions

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -38,11 +38,19 @@ import java.nio.charset.CodingErrorAction;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.Arrays;
import jdk.internal.HotSpotIntrinsicCandidate;
import sun.misc.MessageUtils;
import sun.nio.cs.HistoricallyNamedCharset;
import sun.nio.cs.ArrayDecoder;
import sun.nio.cs.ArrayEncoder;
import static java.lang.String.LATIN1;
import static java.lang.String.UTF16;
import static java.lang.String.COMPACT_STRINGS;
import static java.nio.charset.StandardCharsets.ISO_8859_1;
import static java.nio.charset.StandardCharsets.US_ASCII;
import static java.nio.charset.StandardCharsets.UTF_8;
/**
* Utility class for string encoding and decoding.
*/
@ -72,23 +80,13 @@ class StringCoding {
// Trim the given byte array to the given length
//
private static byte[] safeTrim(byte[] ba, int len, Charset cs, boolean isTrusted) {
private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) {
if (len == ba.length && (isTrusted || System.getSecurityManager() == null))
return ba;
else
return Arrays.copyOf(ba, len);
}
// Trim the given char array to the given length
//
private static char[] safeTrim(char[] ca, int len,
Charset cs, boolean isTrusted) {
if (len == ca.length && (isTrusted || System.getSecurityManager() == null))
return ca;
else
return Arrays.copyOf(ca, len);
}
private static int scale(int len, float expansionFactor) {
// We need to perform double, not float, arithmetic; otherwise
// we lose low order bits when len is larger than 2**24.
@ -117,21 +115,64 @@ class StringCoding {
}
}
static class Result {
byte[] value;
byte coder;
Result with() {
coder = COMPACT_STRINGS ? LATIN1 : UTF16;
value = new byte[0];
return this;
}
Result with(char[] val, int off, int len) {
if (String.COMPACT_STRINGS) {
byte[] bs = StringUTF16.compress(val, off, len);
if (bs != null) {
value = bs;
coder = LATIN1;
return this;
}
}
coder = UTF16;
value = StringUTF16.toBytes(val, off, len);
return this;
}
Result with(byte[] val, byte coder) {
this.coder = coder;
value = val;
return this;
}
}
@HotSpotIntrinsicCandidate
private static boolean hasNegatives(byte[] ba, int off, int len) {
for (int i = off; i < off + len; i++) {
if (ba[i] < 0) {
return true;
}
}
return false;
}
// -- Decoding --
private static class StringDecoder {
static class StringDecoder {
private final String requestedCharsetName;
private final Charset cs;
private final boolean isASCIICompatible;
private final CharsetDecoder cd;
private final boolean isTrusted;
protected final Result result;
private StringDecoder(Charset cs, String rcn) {
StringDecoder(Charset cs, String rcn) {
this.requestedCharsetName = rcn;
this.cs = cs;
this.cd = cs.newDecoder()
.onMalformedInput(CodingErrorAction.REPLACE)
.onUnmappableCharacter(CodingErrorAction.REPLACE);
this.isTrusted = (cs.getClass().getClassLoader0() == null);
this.result = new Result();
this.isASCIICompatible = (cd instanceof ArrayDecoder) &&
((ArrayDecoder)cd).isASCIICompatible();
}
String charsetName() {
@ -144,36 +185,58 @@ class StringCoding {
return requestedCharsetName;
}
char[] decode(byte[] ba, int off, int len) {
Result decode(byte[] ba, int off, int len) {
if (len == 0) {
return result.with();
}
// fastpath for ascii compatible
if (isASCIICompatible && !hasNegatives(ba, off, len)) {
if (COMPACT_STRINGS) {
return result.with(Arrays.copyOfRange(ba, off, off + len),
LATIN1);
} else {
return result.with(StringLatin1.inflate(ba, off, len), UTF16);
}
}
int en = scale(len, cd.maxCharsPerByte());
char[] ca = new char[en];
if (len == 0)
return ca;
if (cd instanceof ArrayDecoder) {
int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca);
return safeTrim(ca, clen, cs, isTrusted);
return result.with(ca, 0, clen);
}
cd.reset();
ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
CharBuffer cb = CharBuffer.wrap(ca);
try {
CoderResult cr = cd.decode(bb, cb, true);
if (!cr.isUnderflow())
cr.throwException();
cr = cd.flush(cb);
if (!cr.isUnderflow())
cr.throwException();
} catch (CharacterCodingException x) {
// Substitution is always enabled,
// so this shouldn't happen
throw new Error(x);
}
return result.with(ca, 0, cb.position());
}
}
private static class StringDecoder8859_1 extends StringDecoder {
StringDecoder8859_1(Charset cs, String rcn) {
super(cs, rcn);
}
Result decode(byte[] ba, int off, int len) {
if (COMPACT_STRINGS) {
return result.with(Arrays.copyOfRange(ba, off, off + len), LATIN1);
} else {
cd.reset();
ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
CharBuffer cb = CharBuffer.wrap(ca);
try {
CoderResult cr = cd.decode(bb, cb, true);
if (!cr.isUnderflow())
cr.throwException();
cr = cd.flush(cb);
if (!cr.isUnderflow())
cr.throwException();
} catch (CharacterCodingException x) {
// Substitution is always enabled,
// so this shouldn't happen
throw new Error(x);
}
return safeTrim(ca, cb.position(), cs, isTrusted);
return result.with(StringLatin1.inflate(ba, off, len), UTF16);
}
}
}
static char[] decode(String charsetName, byte[] ba, int off, int len)
static Result decode(String charsetName, byte[] ba, int off, int len)
throws UnsupportedEncodingException
{
StringDecoder sd = deref(decoder);
@ -183,8 +246,15 @@ class StringCoding {
sd = null;
try {
Charset cs = lookupCharset(csn);
if (cs != null)
sd = new StringDecoder(cs, csn);
if (cs != null) {
if (cs == UTF_8) {
sd = new StringDecoderUTF8(cs, csn);
} else if (cs == ISO_8859_1) {
sd = new StringDecoder8859_1(cs, csn);
} else {
sd = new StringDecoder(cs, csn);
}
}
} catch (IllegalCharsetNameException x) {}
if (sd == null)
throw new UnsupportedEncodingException(csn);
@ -193,7 +263,7 @@ class StringCoding {
return sd.decode(ba, off, len);
}
static char[] decode(Charset cs, byte[] ba, int off, int len) {
static Result decode(Charset cs, byte[] ba, int off, int len) {
// (1)We never cache the "external" cs, the only benefit of creating
// an additional StringDe/Encoder object to wrap it is to share the
// de/encode() method. These SD/E objects are short-lived, the young-gen
@ -210,44 +280,57 @@ class StringCoding {
// check (... && (isTrusted || SM == null || getClassLoader0())) in trim
// but it then can be argued that the SM is null when the operation
// is started...
if (cs == UTF_8) {
return StringDecoderUTF8.decode(ba, off, len, new Result());
}
CharsetDecoder cd = cs.newDecoder();
// ascii fastpath
if (cs == ISO_8859_1 || ((cd instanceof ArrayDecoder) &&
((ArrayDecoder)cd).isASCIICompatible() &&
!hasNegatives(ba, off, len))) {
if (COMPACT_STRINGS) {
return new Result().with(Arrays.copyOfRange(ba, off, off + len),
LATIN1);
} else {
return new Result().with(StringLatin1.inflate(ba, off, len), UTF16);
}
}
int en = scale(len, cd.maxCharsPerByte());
char[] ca = new char[en];
if (len == 0)
return ca;
boolean isTrusted = false;
if (System.getSecurityManager() != null) {
if (!(isTrusted = (cs.getClass().getClassLoader0() == null))) {
ba = Arrays.copyOfRange(ba, off, off + len);
off = 0;
}
if (len == 0) {
return new Result().with();
}
if (System.getSecurityManager() != null &&
cs.getClass().getClassLoader0() != null) {
ba = Arrays.copyOfRange(ba, off, off + len);
off = 0;
}
cd.onMalformedInput(CodingErrorAction.REPLACE)
.onUnmappableCharacter(CodingErrorAction.REPLACE)
.reset();
char[] ca = new char[en];
if (cd instanceof ArrayDecoder) {
int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca);
return safeTrim(ca, clen, cs, isTrusted);
} else {
ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
CharBuffer cb = CharBuffer.wrap(ca);
try {
CoderResult cr = cd.decode(bb, cb, true);
if (!cr.isUnderflow())
cr.throwException();
cr = cd.flush(cb);
if (!cr.isUnderflow())
cr.throwException();
} catch (CharacterCodingException x) {
// Substitution is always enabled,
// so this shouldn't happen
throw new Error(x);
}
return safeTrim(ca, cb.position(), cs, isTrusted);
return new Result().with(ca, 0, clen);
}
ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
CharBuffer cb = CharBuffer.wrap(ca);
try {
CoderResult cr = cd.decode(bb, cb, true);
if (!cr.isUnderflow())
cr.throwException();
cr = cd.flush(cb);
if (!cr.isUnderflow())
cr.throwException();
} catch (CharacterCodingException x) {
// Substitution is always enabled,
// so this shouldn't happen
throw new Error(x);
}
return new Result().with(ca, 0, cb.position());
}
static char[] decode(byte[] ba, int off, int len) {
static Result decode(byte[] ba, int off, int len) {
String csn = Charset.defaultCharset().name();
try {
// use charset name decode() variant which provides caching.
@ -273,6 +356,7 @@ class StringCoding {
private static class StringEncoder {
private Charset cs;
private CharsetEncoder ce;
private final boolean isASCIICompatible;
private final String requestedCharsetName;
private final boolean isTrusted;
@ -283,6 +367,8 @@ class StringCoding {
.onMalformedInput(CodingErrorAction.REPLACE)
.onUnmappableCharacter(CodingErrorAction.REPLACE);
this.isTrusted = (cs.getClass().getClassLoader0() == null);
this.isASCIICompatible = (ce instanceof ArrayEncoder) &&
((ArrayEncoder)ce).isASCIICompatible();
}
String charsetName() {
@ -295,36 +381,186 @@ class StringCoding {
return requestedCharsetName;
}
byte[] encode(char[] ca, int off, int len) {
byte[] encode(byte coder, byte[] val) {
// fastpath for ascii compatible
if (coder == LATIN1 && isASCIICompatible &&
!hasNegatives(val, 0, val.length)) {
return Arrays.copyOf(val, val.length);
}
int len = val.length >> coder; // assume LATIN1=0/UTF16=1;
int en = scale(len, ce.maxBytesPerChar());
byte[] ba = new byte[en];
if (len == 0)
if (len == 0) {
return ba;
if (ce instanceof ArrayEncoder) {
int blen = ((ArrayEncoder)ce).encode(ca, off, len, ba);
return safeTrim(ba, blen, cs, isTrusted);
} else {
ce.reset();
ByteBuffer bb = ByteBuffer.wrap(ba);
CharBuffer cb = CharBuffer.wrap(ca, off, len);
try {
CoderResult cr = ce.encode(cb, bb, true);
if (!cr.isUnderflow())
cr.throwException();
cr = ce.flush(bb);
if (!cr.isUnderflow())
cr.throwException();
} catch (CharacterCodingException x) {
// Substitution is always enabled,
// so this shouldn't happen
throw new Error(x);
}
return safeTrim(ba, bb.position(), cs, isTrusted);
}
if (ce instanceof ArrayEncoder) {
if (!isTrusted) {
val = Arrays.copyOf(val, val.length);
}
int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
: ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
if (blen != -1) {
return safeTrim(ba, blen, isTrusted);
}
}
char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
: StringUTF16.toChars(val);
ce.reset();
ByteBuffer bb = ByteBuffer.wrap(ba);
CharBuffer cb = CharBuffer.wrap(ca, 0, len);
try {
CoderResult cr = ce.encode(cb, bb, true);
if (!cr.isUnderflow())
cr.throwException();
cr = ce.flush(bb);
if (!cr.isUnderflow())
cr.throwException();
} catch (CharacterCodingException x) {
// Substitution is always enabled,
// so this shouldn't happen
throw new Error(x);
}
return safeTrim(ba, bb.position(), isTrusted);
}
}
static byte[] encode(String charsetName, char[] ca, int off, int len)
@HotSpotIntrinsicCandidate
private static int implEncodeISOArray(byte[] sa, int sp,
byte[] da, int dp, int len) {
int i = 0;
for (; i < len; i++) {
char c = StringUTF16.getChar(sa, sp++);
if (c > '\u00FF')
break;
da[dp++] = (byte)c;
}
return i;
}
static byte[] encode8859_1(byte coder, byte[] val) {
if (coder == LATIN1) {
return Arrays.copyOf(val, val.length);
}
int len = val.length >> 1;
byte[] dst = new byte[len];
int dp = 0;
int sp = 0;
int sl = len;
while (sp < sl) {
int ret = implEncodeISOArray(val, sp, dst, dp, len);
sp = sp + ret;
dp = dp + ret;
if (ret != len) {
char c = StringUTF16.getChar(val, sp++);
if (Character.isHighSurrogate(c) && sp < sl &&
Character.isLowSurrogate(StringUTF16.getChar(val, sp))) {
sp++;
}
dst[dp++] = '?';
len = sl - sp;
}
}
if (dp == dst.length) {
return dst;
}
return Arrays.copyOf(dst, dp);
}
static byte[] encodeASCII(byte coder, byte[] val) {
if (coder == LATIN1) {
byte[] dst = new byte[val.length];
for (int i = 0; i < val.length; i++) {
if (val[i] < 0) {
dst[i] = '?';
} else {
dst[i] = val[i];
}
}
return dst;
}
int len = val.length >> 1;
byte[] dst = new byte[len];
int dp = 0;
for (int i = 0; i < len; i++) {
char c = StringUTF16.getChar(val, i);
if (c < 0x80) {
dst[dp++] = (byte)c;
continue;
}
if (Character.isHighSurrogate(c) && i + 1 < len &&
Character.isLowSurrogate(StringUTF16.getChar(val, i + 1))) {
i++;
}
dst[dp++] = '?';
}
if (len == dp) {
return dst;
}
return Arrays.copyOf(dst, dp);
}
static byte[] encodeUTF8(byte coder, byte[] val) {
int dp = 0;
byte[] dst;
if (coder == LATIN1) {
dst = new byte[val.length << 1];
for (int sp = 0; sp < val.length; sp++) {
byte c = val[sp];
if (c < 0) {
dst[dp++] = (byte)(0xc0 | ((c & 0xff) >> 6));
dst[dp++] = (byte)(0x80 | (c & 0x3f));
} else {
dst[dp++] = c;
}
}
} else {
int sp = 0;
int sl = val.length >> 1;
dst = new byte[sl * 3];
char c;
while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') {
// ascii fast loop;
dst[dp++] = (byte)c;
sp++;
}
while (sp < sl) {
c = StringUTF16.getChar(val, sp++);
if (c < 0x80) {
dst[dp++] = (byte)c;
} else if (c < 0x800) {
dst[dp++] = (byte)(0xc0 | (c >> 6));
dst[dp++] = (byte)(0x80 | (c & 0x3f));
} else if (Character.isSurrogate(c)) {
int uc = -1;
char c2;
if (Character.isHighSurrogate(c) && sp < sl &&
Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) {
uc = Character.toCodePoint(c, c2);
}
if (uc < 0) {
dst[dp++] = '?';
} else {
dst[dp++] = (byte)(0xf0 | ((uc >> 18)));
dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f));
dst[dp++] = (byte)(0x80 | ((uc >> 6) & 0x3f));
dst[dp++] = (byte)(0x80 | (uc & 0x3f));
sp++; // 2 chars
}
} else {
// 3 bytes, 16 bits
dst[dp++] = (byte)(0xe0 | ((c >> 12)));
dst[dp++] = (byte)(0x80 | ((c >> 6) & 0x3f));
dst[dp++] = (byte)(0x80 | (c & 0x3f));
}
}
}
if (dp == dst.length) {
return dst;
}
return Arrays.copyOf(dst, dp);
}
static byte[] encode(String charsetName, byte coder, byte[] val)
throws UnsupportedEncodingException
{
StringEncoder se = deref(encoder);
@ -334,62 +570,88 @@ class StringCoding {
se = null;
try {
Charset cs = lookupCharset(csn);
if (cs != null)
if (cs != null) {
if (cs == UTF_8) {
return encodeUTF8(coder, val);
} else if (cs == ISO_8859_1) {
return encode8859_1(coder, val);
} else if (cs == US_ASCII) {
return encodeASCII(coder, val);
}
se = new StringEncoder(cs, csn);
}
} catch (IllegalCharsetNameException x) {}
if (se == null)
if (se == null) {
throw new UnsupportedEncodingException (csn);
}
set(encoder, se);
}
return se.encode(ca, off, len);
return se.encode(coder, val);
}
static byte[] encode(Charset cs, char[] ca, int off, int len) {
static byte[] encode(Charset cs, byte coder, byte[] val) {
if (cs == UTF_8) {
return encodeUTF8(coder, val);
} else if (cs == ISO_8859_1) {
return encode8859_1(coder, val);
} else if (cs == US_ASCII) {
return encodeASCII(coder, val);
}
CharsetEncoder ce = cs.newEncoder();
// fastpath for ascii compatible
if (coder == LATIN1 && (((ce instanceof ArrayEncoder) &&
((ArrayEncoder)ce).isASCIICompatible() &&
!hasNegatives(val, 0, val.length)))) {
return Arrays.copyOf(val, val.length);
}
int len = val.length >> coder; // assume LATIN1=0/UTF16=1;
int en = scale(len, ce.maxBytesPerChar());
byte[] ba = new byte[en];
if (len == 0)
if (len == 0) {
return ba;
boolean isTrusted = false;
if (System.getSecurityManager() != null) {
if (!(isTrusted = (cs.getClass().getClassLoader0() == null))) {
ca = Arrays.copyOfRange(ca, off, off + len);
off = 0;
}
}
boolean isTrusted = System.getSecurityManager() == null ||
cs.getClass().getClassLoader0() == null;
ce.onMalformedInput(CodingErrorAction.REPLACE)
.onUnmappableCharacter(CodingErrorAction.REPLACE)
.reset();
if (ce instanceof ArrayEncoder) {
int blen = ((ArrayEncoder)ce).encode(ca, off, len, ba);
return safeTrim(ba, blen, cs, isTrusted);
} else {
ByteBuffer bb = ByteBuffer.wrap(ba);
CharBuffer cb = CharBuffer.wrap(ca, off, len);
try {
CoderResult cr = ce.encode(cb, bb, true);
if (!cr.isUnderflow())
cr.throwException();
cr = ce.flush(bb);
if (!cr.isUnderflow())
cr.throwException();
} catch (CharacterCodingException x) {
throw new Error(x);
if (!isTrusted) {
val = Arrays.copyOf(val, val.length);
}
int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
: ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
if (blen != -1) {
return safeTrim(ba, blen, isTrusted);
}
return safeTrim(ba, bb.position(), cs, isTrusted);
}
char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
: StringUTF16.toChars(val);
ByteBuffer bb = ByteBuffer.wrap(ba);
CharBuffer cb = CharBuffer.wrap(ca, 0, len);
try {
CoderResult cr = ce.encode(cb, bb, true);
if (!cr.isUnderflow())
cr.throwException();
cr = ce.flush(bb);
if (!cr.isUnderflow())
cr.throwException();
} catch (CharacterCodingException x) {
throw new Error(x);
}
return safeTrim(ba, bb.position(), isTrusted);
}
static byte[] encode(char[] ca, int off, int len) {
static byte[] encode(byte coder, byte[] val) {
String csn = Charset.defaultCharset().name();
try {
// use charset name encode() variant which provides caching.
return encode(csn, ca, off, len);
return encode(csn, coder, val);
} catch (UnsupportedEncodingException x) {
warnUnsupportedCharset(csn);
}
try {
return encode("ISO-8859-1", ca, off, len);
return encode("ISO-8859-1", coder, val);
} catch (UnsupportedEncodingException x) {
// If this code is hit during VM initialization, MessageUtils is
// the only way we will be able to get any kind of error message.