8184947: ZipCoder performance improvements

Reviewed-by: martin, redestad
This commit is contained in:
Xueming Shen 2017-12-13 07:51:57 -08:00
parent 1ae8f54835
commit 4f0ea9242f
9 changed files with 620 additions and 881 deletions

View file

@ -3046,6 +3046,10 @@ public final class String
return COMPACT_STRINGS ? coder : UTF16;
}
// Package-private accessor that returns the 'value' field itself;
// no copy is made here, so the caller sees the same array instance.
byte[] value() {
return value;
}
// Reports whether this string is stored with the LATIN1 coder.
// Always false when compact strings are disabled.
private boolean isLatin1() {
    if (!COMPACT_STRINGS) {
        return false;
    }
    return coder == LATIN1;
}

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2000, 2016, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2000, 2017, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -47,6 +47,11 @@ import sun.nio.cs.StandardCharsets;
import static java.lang.String.LATIN1;
import static java.lang.String.UTF16;
import static java.lang.String.COMPACT_STRINGS;
import static java.lang.Character.isSurrogate;
import static java.lang.Character.highSurrogate;
import static java.lang.Character.lowSurrogate;
import static java.lang.Character.isSupplementaryCodePoint;
import static java.lang.StringUTF16.putChar;
/**
* Utility class for string encoding and decoding.
@ -66,8 +71,6 @@ class StringCoding {
private static final Charset US_ASCII = sun.nio.cs.US_ASCII.INSTANCE;
private static final Charset UTF_8 = sun.nio.cs.UTF_8.INSTANCE;
private static boolean warnUnsupportedCharset = true;
private static <T> T deref(ThreadLocal<SoftReference<T>> tl) {
SoftReference<T> sr = tl.get();
if (sr == null)
@ -80,7 +83,6 @@ class StringCoding {
}
// Trim the given byte array to the given length
//
private static byte[] safeTrim(byte[] ba, int len, boolean isTrusted) {
if (len == ba.length && (isTrusted || System.getSecurityManager() == null))
return ba;
@ -105,17 +107,6 @@ class StringCoding {
return null;
}
// Emits a one-time warning that the default charset 'csn' is unsupported
// and that ISO-8859-1 is used instead. Later calls are no-ops because the
// warnUnsupportedCharset flag is cleared after the first message.
private static void warnUnsupportedCharset(String csn) {
    if (!warnUnsupportedCharset) {
        return;
    }
    // Use err(String) rather than the Logging API or System.err
    // since this method may be called during VM initialization
    // before either is available.
    String message = "WARNING: Default charset " + csn +
                     " not supported, using ISO-8859-1 instead\n";
    err(message);
    warnUnsupportedCharset = false;
}
static class Result {
byte[] value;
byte coder;
@ -224,19 +215,6 @@ class StringCoding {
}
}
// StringDecoder specialization for ISO-8859-1 input: every byte maps
// one-to-one to a char, so no CharsetDecoder is involved.
private static class StringDecoder8859_1 extends StringDecoder {
    StringDecoder8859_1(Charset cs, String rcn) {
        super(cs, rcn);
    }

    // With compact strings the requested range is copied and tagged LATIN1;
    // otherwise the bytes are inflated and tagged UTF16.
    Result decode(byte[] ba, int off, int len) {
        if (!COMPACT_STRINGS) {
            byte[] inflated = StringLatin1.inflate(ba, off, len);
            return result.with(inflated, UTF16);
        }
        byte[] latin1 = Arrays.copyOfRange(ba, off, off + len);
        return result.with(latin1, LATIN1);
    }
}
static Result decode(String charsetName, byte[] ba, int off, int len)
throws UnsupportedEncodingException
{
@ -249,12 +227,15 @@ class StringCoding {
Charset cs = lookupCharset(csn);
if (cs != null) {
if (cs == UTF_8) {
sd = new StringDecoderUTF8(cs, csn);
} else if (cs == ISO_8859_1) {
sd = new StringDecoder8859_1(cs, csn);
} else {
sd = new StringDecoder(cs, csn);
return decodeUTF8(ba, off, len, true);
}
if (cs == ISO_8859_1) {
return decodeLatin1(ba, off, len);
}
if (cs == US_ASCII) {
return decodeASCII(ba, off, len);
}
sd = new StringDecoder(cs, csn);
}
} catch (IllegalCharsetNameException x) {}
if (sd == null)
@ -265,6 +246,16 @@ class StringCoding {
}
static Result decode(Charset cs, byte[] ba, int off, int len) {
if (cs == UTF_8) {
return decodeUTF8(ba, off, len, true);
}
if (cs == ISO_8859_1) {
return decodeLatin1(ba, off, len);
}
if (cs == US_ASCII) {
return decodeASCII(ba, off, len);
}
// (1)We never cache the "external" cs, the only benefit of creating
// an additional StringDe/Encoder object to wrap it is to share the
// de/encode() method. These SD/E objects are short-lived, the young-gen
@ -280,39 +271,29 @@ class StringCoding {
// check (... && (isTrusted || SM == null || getClassLoader0())) in trim
// but it then can be argued that the SM is null when the operation
// is started...
if (cs == UTF_8) {
return StringDecoderUTF8.decode(ba, off, len, new Result());
}
CharsetDecoder cd = cs.newDecoder();
// ascii fastpath
if (cs == ISO_8859_1 || ((cd instanceof ArrayDecoder) &&
((ArrayDecoder)cd).isASCIICompatible() &&
!hasNegatives(ba, off, len))) {
if (COMPACT_STRINGS) {
return new Result().with(Arrays.copyOfRange(ba, off, off + len),
LATIN1);
} else {
return new Result().with(StringLatin1.inflate(ba, off, len), UTF16);
}
if ((cd instanceof ArrayDecoder) &&
((ArrayDecoder)cd).isASCIICompatible() && !hasNegatives(ba, off, len)) {
return decodeLatin1(ba, off, len);
}
int en = scale(len, cd.maxCharsPerByte());
if (len == 0) {
return new Result().with();
}
if (cs.getClass().getClassLoader0() != null &&
System.getSecurityManager() != null) {
ba = Arrays.copyOfRange(ba, off, off + len);
off = 0;
}
cd.onMalformedInput(CodingErrorAction.REPLACE)
.onUnmappableCharacter(CodingErrorAction.REPLACE)
.reset();
char[] ca = new char[en];
if (cd instanceof ArrayDecoder) {
int clen = ((ArrayDecoder)cd).decode(ba, off, len, ca);
return new Result().with(ca, 0, clen);
}
if (cs.getClass().getClassLoader0() != null &&
System.getSecurityManager() != null) {
ba = Arrays.copyOfRange(ba, off, off + len);
off = 0;
}
ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
CharBuffer cb = CharBuffer.wrap(ca);
try {
@ -331,24 +312,22 @@ class StringCoding {
}
static Result decode(byte[] ba, int off, int len) {
String csn = Charset.defaultCharset().name();
try {
// use charset name decode() variant which provides caching.
return decode(csn, ba, off, len);
} catch (UnsupportedEncodingException x) {
warnUnsupportedCharset(csn);
Charset cs = Charset.defaultCharset();
if (cs == UTF_8) {
return decodeUTF8(ba, off, len, true);
}
try {
return decode("ISO-8859-1", ba, off, len);
} catch (UnsupportedEncodingException x) {
// If this code is hit during VM initialization, err(String) is
// the only way we will be able to get any kind of error message.
err("ISO-8859-1 charset not available: " + x.toString() + "\n");
// If we can not find ISO-8859-1 (a required encoding) then things
// are seriously wrong with the installation.
System.exit(1);
return null;
if (cs == ISO_8859_1) {
return decodeLatin1(ba, off, len);
}
if (cs == US_ASCII) {
return decodeASCII(ba, off, len);
}
StringDecoder sd = deref(decoder);
if (sd == null || !cs.name().equals(sd.cs.name())) {
sd = new StringDecoder(cs, cs.name());
set(decoder, sd);
}
return sd.decode(ba, off, len);
}
// -- Encoding --
@ -393,9 +372,6 @@ class StringCoding {
return ba;
}
if (ce instanceof ArrayEncoder) {
if (!isTrusted) {
val = Arrays.copyOf(val, val.length);
}
int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
: ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
if (blen != -1) {
@ -423,49 +399,140 @@ class StringCoding {
}
}
@HotSpotIntrinsicCandidate
private static int implEncodeISOArray(byte[] sa, int sp,
byte[] da, int dp, int len) {
int i = 0;
for (; i < len; i++) {
char c = StringUTF16.getChar(sa, sp++);
if (c > '\u00FF')
break;
da[dp++] = (byte)c;
static byte[] encode(String charsetName, byte coder, byte[] val)
throws UnsupportedEncodingException
{
StringEncoder se = deref(encoder);
String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
if ((se == null) || !(csn.equals(se.requestedCharsetName())
|| csn.equals(se.charsetName()))) {
se = null;
try {
Charset cs = lookupCharset(csn);
if (cs != null) {
if (cs == UTF_8) {
return encodeUTF8(coder, val, true);
}
if (cs == ISO_8859_1) {
return encode8859_1(coder, val);
}
if (cs == US_ASCII) {
return encodeASCII(coder, val);
}
se = new StringEncoder(cs, csn);
}
} catch (IllegalCharsetNameException x) {}
if (se == null) {
throw new UnsupportedEncodingException (csn);
}
set(encoder, se);
}
return i;
return se.encode(coder, val);
}
static byte[] encode8859_1(byte coder, byte[] val) {
if (coder == LATIN1) {
static byte[] encode(Charset cs, byte coder, byte[] val) {
if (cs == UTF_8) {
return encodeUTF8(coder, val, true);
}
if (cs == ISO_8859_1) {
return encode8859_1(coder, val);
}
if (cs == US_ASCII) {
return encodeASCII(coder, val);
}
CharsetEncoder ce = cs.newEncoder();
// fastpath for ascii compatible
if (coder == LATIN1 && (((ce instanceof ArrayEncoder) &&
((ArrayEncoder)ce).isASCIICompatible() &&
!hasNegatives(val, 0, val.length)))) {
return Arrays.copyOf(val, val.length);
}
int len = val.length >> 1;
byte[] dst = new byte[len];
int dp = 0;
int sp = 0;
int sl = len;
while (sp < sl) {
int ret = implEncodeISOArray(val, sp, dst, dp, len);
sp = sp + ret;
dp = dp + ret;
if (ret != len) {
char c = StringUTF16.getChar(val, sp++);
if (Character.isHighSurrogate(c) && sp < sl &&
Character.isLowSurrogate(StringUTF16.getChar(val, sp))) {
sp++;
}
dst[dp++] = '?';
len = sl - sp;
int len = val.length >> coder; // assume LATIN1=0/UTF16=1;
int en = scale(len, ce.maxBytesPerChar());
byte[] ba = new byte[en];
if (len == 0) {
return ba;
}
ce.onMalformedInput(CodingErrorAction.REPLACE)
.onUnmappableCharacter(CodingErrorAction.REPLACE)
.reset();
if (ce instanceof ArrayEncoder) {
int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
: ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
if (blen != -1) {
return safeTrim(ba, blen, true);
}
}
if (dp == dst.length) {
return dst;
boolean isTrusted = cs.getClass().getClassLoader0() == null ||
System.getSecurityManager() == null;
char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
: StringUTF16.toChars(val);
ByteBuffer bb = ByteBuffer.wrap(ba);
CharBuffer cb = CharBuffer.wrap(ca, 0, len);
try {
CoderResult cr = ce.encode(cb, bb, true);
if (!cr.isUnderflow())
cr.throwException();
cr = ce.flush(bb);
if (!cr.isUnderflow())
cr.throwException();
} catch (CharacterCodingException x) {
throw new Error(x);
}
return Arrays.copyOf(dst, dp);
return safeTrim(ba, bb.position(), isTrusted);
}
static byte[] encodeASCII(byte coder, byte[] val) {
static byte[] encode(byte coder, byte[] val) {
Charset cs = Charset.defaultCharset();
if (cs == UTF_8) {
return encodeUTF8(coder, val, true);
}
if (cs == ISO_8859_1) {
return encode8859_1(coder, val);
}
if (cs == US_ASCII) {
return encodeASCII(coder, val);
}
StringEncoder se = deref(encoder);
if (se == null || !cs.name().equals(se.cs.name())) {
se = new StringEncoder(cs, cs.name());
set(encoder, se);
}
return se.encode(coder, val);
}
/**
* Print a message directly to stderr, bypassing all character conversion
* methods.
* @param msg message to print
*/
private static native void err(String msg);
/*
 * Per-thread cached Result. initialValue() creates one Result per thread
 * on first access; the decodeASCII/decodeLatin1/decodeUTF8 helpers obtain
 * it with resultCached.get() and return it after calling with(...).
 * NOTE(review): because the same instance is handed out again on the next
 * call from the same thread, callers presumably must consume the returned
 * Result before decoding again - confirm against Result.with().
 */
private static final ThreadLocal<StringCoding.Result>
resultCached = new ThreadLocal<>() {
protected StringCoding.Result initialValue() {
return new StringCoding.Result();
}};
////////////////////////// ascii //////////////////////////////
// Decodes len bytes of ba, starting at off, as US-ASCII.
// With compact strings and no negative bytes in the range, the range is
// copied and returned with the LATIN1 coder. Otherwise a UTF16 array is
// built, with each negative (non-ASCII) byte replaced by repl.
private static Result decodeASCII(byte[] ba, int off, int len) {
    Result cached = resultCached.get();
    boolean asciiOnly = COMPACT_STRINGS && !hasNegatives(ba, off, len);
    if (asciiOnly) {
        byte[] latin1 = Arrays.copyOfRange(ba, off, off + len);
        return cached.with(latin1, LATIN1);
    }
    // two bytes per char in the UTF16 representation
    byte[] utf16 = new byte[len << 1];
    for (int i = 0; i < len; i++) {
        byte b = ba[off + i];
        char c = (b < 0) ? repl : (char) b;
        putChar(utf16, i, c);
    }
    return cached.with(utf16, UTF16);
}
private static byte[] encodeASCII(byte coder, byte[] val) {
if (coder == LATIN1) {
byte[] dst = new byte[val.length];
for (int i = 0; i < val.length; i++) {
@ -498,59 +565,51 @@ class StringCoding {
return Arrays.copyOf(dst, dp);
}
static byte[] encodeUTF8(byte coder, byte[] val) {
int dp = 0;
byte[] dst;
////////////////////////// latin1/8859_1 ///////////////////////////
// Decodes len bytes of ba, starting at off, as ISO-8859-1.
// With compact strings the range is copied and tagged LATIN1;
// otherwise it is inflated and tagged UTF16.
private static Result decodeLatin1(byte[] ba, int off, int len) {
    Result cached = resultCached.get();
    if (!COMPACT_STRINGS) {
        byte[] inflated = StringLatin1.inflate(ba, off, len);
        return cached.with(inflated, UTF16);
    }
    byte[] latin1 = Arrays.copyOfRange(ba, off, off + len);
    return cached.with(latin1, LATIN1);
}
/*
 * Copies up to len chars, read from sa via StringUTF16.getChar starting at
 * index sp, into da starting at byte index dp, one byte per char.
 * Stops at the first char greater than U+00FF (not representable in
 * ISO-8859-1) without writing it.
 *
 * Returns the number of chars actually copied; a value smaller than len
 * tells the caller that the char at sp + (returned count) was not encodable.
 *
 * Annotated as an intrinsic candidate, so the loop shape is left untouched.
 */
@HotSpotIntrinsicCandidate
private static int implEncodeISOArray(byte[] sa, int sp,
byte[] da, int dp, int len) {
int i = 0;
for (; i < len; i++) {
char c = StringUTF16.getChar(sa, sp++);
// first char that does not fit in a single byte ends the copy
if (c > '\u00FF')
break;
da[dp++] = (byte)c;
}
return i;
}
private static byte[] encode8859_1(byte coder, byte[] val) {
if (coder == LATIN1) {
dst = new byte[val.length << 1];
for (int sp = 0; sp < val.length; sp++) {
byte c = val[sp];
if (c < 0) {
dst[dp++] = (byte)(0xc0 | ((c & 0xff) >> 6));
dst[dp++] = (byte)(0x80 | (c & 0x3f));
} else {
dst[dp++] = c;
}
}
} else {
int sp = 0;
int sl = val.length >> 1;
dst = new byte[sl * 3];
char c;
while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') {
// ascii fast loop;
dst[dp++] = (byte)c;
sp++;
}
while (sp < sl) {
c = StringUTF16.getChar(val, sp++);
if (c < 0x80) {
dst[dp++] = (byte)c;
} else if (c < 0x800) {
dst[dp++] = (byte)(0xc0 | (c >> 6));
dst[dp++] = (byte)(0x80 | (c & 0x3f));
} else if (Character.isSurrogate(c)) {
int uc = -1;
char c2;
if (Character.isHighSurrogate(c) && sp < sl &&
Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) {
uc = Character.toCodePoint(c, c2);
}
if (uc < 0) {
dst[dp++] = '?';
} else {
dst[dp++] = (byte)(0xf0 | ((uc >> 18)));
dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f));
dst[dp++] = (byte)(0x80 | ((uc >> 6) & 0x3f));
dst[dp++] = (byte)(0x80 | (uc & 0x3f));
sp++; // 2 chars
}
} else {
// 3 bytes, 16 bits
dst[dp++] = (byte)(0xe0 | ((c >> 12)));
dst[dp++] = (byte)(0x80 | ((c >> 6) & 0x3f));
dst[dp++] = (byte)(0x80 | (c & 0x3f));
return Arrays.copyOf(val, val.length);
}
int len = val.length >> 1;
byte[] dst = new byte[len];
int dp = 0;
int sp = 0;
int sl = len;
while (sp < sl) {
int ret = implEncodeISOArray(val, sp, dst, dp, len);
sp = sp + ret;
dp = dp + ret;
if (ret != len) {
char c = StringUTF16.getChar(val, sp++);
if (Character.isHighSurrogate(c) && sp < sl &&
Character.isLowSurrogate(StringUTF16.getChar(val, sp))) {
sp++;
}
dst[dp++] = '?';
len = sl - sp;
}
}
if (dp == dst.length) {
@ -559,113 +618,333 @@ class StringCoding {
return Arrays.copyOf(dst, dp);
}
static byte[] encode(String charsetName, byte coder, byte[] val)
throws UnsupportedEncodingException
{
StringEncoder se = deref(encoder);
String csn = (charsetName == null) ? "ISO-8859-1" : charsetName;
if ((se == null) || !(csn.equals(se.requestedCharsetName())
|| csn.equals(se.charsetName()))) {
se = null;
try {
Charset cs = lookupCharset(csn);
if (cs != null) {
if (cs == UTF_8) {
return encodeUTF8(coder, val);
} else if (cs == ISO_8859_1) {
return encode8859_1(coder, val);
} else if (cs == US_ASCII) {
return encodeASCII(coder, val);
}
se = new StringEncoder(cs, csn);
//////////////////////////////// utf8 ////////////////////////////////////
// True when b does not carry the 10xxxxxx bit pattern of a UTF-8
// continuation byte. Only bits 6 and 7 are inspected, so both the
// sign-extended and the unsigned form of a byte give the same answer.
private static boolean isNotContinuation(int b) {
    int topTwoBits = b & 0xc0;
    return topTwoBits != 0x80;
}
// True when (b1, b2, b3) cannot be a well-formed 3-byte UTF-8 sequence:
// either the lead byte is 0xE0 followed by a second byte in 0x80..0x9F
// (an overlong form), or b2 or b3 is not a continuation byte.
// b1 is compared against (byte)0xe0, i.e. the sign-extended byte value.
private static boolean isMalformed3(int b1, int b2, int b3) {
    if (b1 == (byte) 0xe0 && (b2 & 0xe0) == 0x80) {
        return true;
    }
    if ((b2 & 0xc0) != 0x80) {
        return true;
    }
    return (b3 & 0xc0) != 0x80;
}
// Two-byte prefix variant of the 3-byte check: true when the lead byte is
// 0xE0 followed by a second byte in 0x80..0x9F (overlong), or when b2 is
// not a continuation byte. b1 is expected as a sign-extended byte value.
private static boolean isMalformed3_2(int b1, int b2) {
    boolean overlong = b1 == (byte) 0xe0 && (b2 & 0xe0) == 0x80;
    if (overlong) {
        return true;
    }
    return (b2 & 0xc0) != 0x80;
}
// True when any of the three trailing bytes of a 4-byte UTF-8 sequence
// is not a continuation byte (10xxxxxx).
private static boolean isMalformed4(int b2, int b3, int b4) {
    boolean allContinuation = (b2 & 0xc0) == 0x80
            && (b3 & 0xc0) == 0x80
            && (b4 & 0xc0) == 0x80;
    return !allContinuation;
}
// Checks the first two bytes of a 4-byte UTF-8 sequence. Both arguments
// are compared against unsigned constants, so they are expected in the
// range 0..255. Malformed when:
//   - lead 0xF0 is followed by a byte outside 0x90..0xBF,
//   - lead 0xF4 is followed by a byte outside 0x80..0x8F, or
//   - b2 is not a continuation byte.
private static boolean isMalformed4_2(int b1, int b2) {
    if (b1 == 0xf0) {
        if (b2 < 0x90 || b2 > 0xbf) {
            return true;
        }
    } else if (b1 == 0xf4) {
        if ((b2 & 0xf0) != 0x80) {
            return true;
        }
    }
    return (b2 & 0xc0) != 0x80;
}
// True when the third byte of a 4-byte UTF-8 sequence is not a
// continuation byte (10xxxxxx).
private static boolean isMalformed4_3(int b3) {
    int marker = b3 & 0xc0;
    return marker != 0x80;
}
// For nb == 3 or 4 only: given a malformed nb-byte sequence starting at
// src[sp], returns how many leading bytes belong to the malformed part.
// The decoder rewinds to the start of the sequence and advances sp by
// this amount. Result is 1..2 for nb == 3 and 1..3 for nb == 4.
// Any other nb trips the assertion and yields -1.
private static int malformedN(byte[] src, int sp, int nb) {
    switch (nb) {
        case 3: {
            int lead = src[sp];
            int second = src[sp + 1];   // the third byte never needs a look
            boolean overlong = lead == (byte) 0xe0 && (second & 0xe0) == 0x80;
            if (overlong || isNotContinuation(second)) {
                return 1;
            }
            return 2;
        }
        case 4: {
            // not performance critical, so work on unsigned values
            int lead = src[sp] & 0xff;
            int second = src[sp + 1] & 0xff;
            if (lead > 0xf4) {
                return 1;
            }
            if (lead == 0xf0 && (second < 0x90 || second > 0xbf)) {
                return 1;
            }
            if (lead == 0xf4 && (second & 0xf0) != 0x80) {
                return 1;
            }
            if (isNotContinuation(second)) {
                return 1;
            }
            return isNotContinuation(src[sp + 2]) ? 2 : 3;
        }
        default:
            assert false;
            return -1;
    }
}
// Always throws IllegalArgumentException, reporting the offset and the
// length of the malformed input in the message.
private static void throwMalformed(int off, int nb) {
    String message = "malformed input off : " + off + ", length : " + nb;
    throw new IllegalArgumentException(message);
}
// Replacement character U+FFFD, written to the output in place of
// malformed or non-ASCII input by the decode helpers.
// NOTE(review): no assignment to this field is visible here; it could
// probably be declared final - confirm against the rest of the file.
private static char repl = '\ufffd';
// Decodes len bytes of src, starting at sp, as UTF-8.
// ASCII-biased fast path: with compact strings and no negative bytes in
// the range, the bytes are copied and returned with the LATIN1 coder.
// Everything else goes to decodeUTF8_0, with doReplace passed through
// unchanged.
private static Result decodeUTF8(byte[] src, int sp, int len, boolean doReplace) {
    boolean needsFullDecode = !COMPACT_STRINGS || hasNegatives(src, sp, len);
    if (needsFullDecode) {
        return decodeUTF8_0(src, sp, len, doReplace);
    }
    Result cached = resultCached.get();
    byte[] ascii = Arrays.copyOfRange(src, sp, sp + len);
    return cached.with(ascii, LATIN1);
}
private static Result decodeUTF8_0(byte[] src, int sp, int len, boolean doReplace) {
Result ret = resultCached.get();
int sl = sp + len;
int dp = 0;
byte[] dst = new byte[len];
if (COMPACT_STRINGS) {
while (sp < sl) {
int b1 = src[sp];
if (b1 >= 0) {
dst[dp++] = (byte)b1;
sp++;
continue;
}
} catch (IllegalCharsetNameException x) {}
if (se == null) {
throw new UnsupportedEncodingException (csn);
if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) &&
sp + 1 < sl) {
int b2 = src[sp + 1];
if (!isNotContinuation(b2)) {
dst[dp++] = (byte)(((b1 << 6) ^ b2)^
(((byte) 0xC0 << 6) ^
((byte) 0x80 << 0)));
sp += 2;
continue;
}
}
// anything not a latin1, including the repl
// we have to go with the utf16
break;
}
if (sp == sl) {
if (dp != dst.length) {
dst = Arrays.copyOf(dst, dp);
}
return ret.with(dst, LATIN1);
}
set(encoder, se);
}
return se.encode(coder, val);
if (dp == 0) {
dst = new byte[len << 1];
} else {
byte[] buf = new byte[len << 1];
StringLatin1.inflate(dst, 0, buf, 0, dp);
dst = buf;
}
while (sp < sl) {
int b1 = src[sp++];
if (b1 >= 0) {
putChar(dst, dp++, (char) b1);
} else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
if (sp < sl) {
int b2 = src[sp++];
if (isNotContinuation(b2)) {
if (!doReplace) {
throwMalformed(sp - 1, 1);
}
putChar(dst, dp++, repl);
sp--;
} else {
putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^
(((byte) 0xC0 << 6) ^
((byte) 0x80 << 0))));
}
continue;
}
if (!doReplace) {
throwMalformed(sp, 1); // underflow()
}
putChar(dst, dp++, repl);
break;
} else if ((b1 >> 4) == -2) {
if (sp + 1 < sl) {
int b2 = src[sp++];
int b3 = src[sp++];
if (isMalformed3(b1, b2, b3)) {
if (!doReplace) {
throwMalformed(sp - 3, 3);
}
putChar(dst, dp++, repl);
sp -= 3;
sp += malformedN(src, sp, 3);
} else {
char c = (char)((b1 << 12) ^
(b2 << 6) ^
(b3 ^
(((byte) 0xE0 << 12) ^
((byte) 0x80 << 6) ^
((byte) 0x80 << 0))));
if (isSurrogate(c)) {
if (!doReplace) {
throwMalformed(sp - 3, 3);
}
putChar(dst, dp++, repl);
} else {
putChar(dst, dp++, c);
}
}
continue;
}
if (sp < sl && isMalformed3_2(b1, src[sp])) {
if (!doReplace) {
throwMalformed(sp - 1, 2);
}
putChar(dst, dp++, repl);
continue;
}
if (!doReplace){
throwMalformed(sp, 1);
}
putChar(dst, dp++, repl);
break;
} else if ((b1 >> 3) == -2) {
if (sp + 2 < sl) {
int b2 = src[sp++];
int b3 = src[sp++];
int b4 = src[sp++];
int uc = ((b1 << 18) ^
(b2 << 12) ^
(b3 << 6) ^
(b4 ^
(((byte) 0xF0 << 18) ^
((byte) 0x80 << 12) ^
((byte) 0x80 << 6) ^
((byte) 0x80 << 0))));
if (isMalformed4(b2, b3, b4) ||
!isSupplementaryCodePoint(uc)) { // shortest form check
if (!doReplace) {
throwMalformed(sp - 4, 4);
}
putChar(dst, dp++, repl);
sp -= 4;
sp += malformedN(src, sp, 4);
} else {
putChar(dst, dp++, highSurrogate(uc));
putChar(dst, dp++, lowSurrogate(uc));
}
continue;
}
b1 &= 0xff;
if (b1 > 0xf4 ||
sp < sl && isMalformed4_2(b1, src[sp] & 0xff)) {
if (!doReplace) {
throwMalformed(sp - 1, 1); // or 2
}
putChar(dst, dp++, repl);
continue;
}
if (!doReplace) {
throwMalformed(sp - 1, 1);
}
sp++;
putChar(dst, dp++, repl);
if (sp < sl && isMalformed4_3(src[sp])) {
continue;
}
break;
} else {
if (!doReplace) {
throwMalformed(sp - 1, 1);
}
putChar(dst, dp++, repl);
}
}
if (dp != len) {
dst = Arrays.copyOf(dst, dp << 1);
}
return ret.with(dst, UTF16);
}
static byte[] encode(Charset cs, byte coder, byte[] val) {
if (cs == UTF_8) {
return encodeUTF8(coder, val);
} else if (cs == ISO_8859_1) {
return encode8859_1(coder, val);
} else if (cs == US_ASCII) {
return encodeASCII(coder, val);
}
CharsetEncoder ce = cs.newEncoder();
// fastpath for ascii compatible
if (coder == LATIN1 && (((ce instanceof ArrayEncoder) &&
((ArrayEncoder)ce).isASCIICompatible() &&
!hasNegatives(val, 0, val.length)))) {
private static byte[] encodeUTF8(byte coder, byte[] val, boolean doReplace) {
if (coder == UTF16)
return encodeUTF8_UTF16(val, doReplace);
if (!hasNegatives(val, 0, val.length))
return Arrays.copyOf(val, val.length);
}
int len = val.length >> coder; // assume LATIN1=0/UTF16=1;
int en = scale(len, ce.maxBytesPerChar());
byte[] ba = new byte[en];
if (len == 0) {
return ba;
}
boolean isTrusted = cs.getClass().getClassLoader0() == null ||
System.getSecurityManager() == null;
ce.onMalformedInput(CodingErrorAction.REPLACE)
.onUnmappableCharacter(CodingErrorAction.REPLACE)
.reset();
if (ce instanceof ArrayEncoder) {
if (!isTrusted) {
val = Arrays.copyOf(val, val.length);
}
int blen = (coder == LATIN1 ) ? ((ArrayEncoder)ce).encodeFromLatin1(val, 0, len, ba)
: ((ArrayEncoder)ce).encodeFromUTF16(val, 0, len, ba);
if (blen != -1) {
return safeTrim(ba, blen, isTrusted);
int dp = 0;
byte[] dst = new byte[val.length << 1];
for (int sp = 0; sp < val.length; sp++) {
byte c = val[sp];
if (c < 0) {
dst[dp++] = (byte)(0xc0 | ((c & 0xff) >> 6));
dst[dp++] = (byte)(0x80 | (c & 0x3f));
} else {
dst[dp++] = c;
}
}
char[] ca = (coder == LATIN1 ) ? StringLatin1.toChars(val)
: StringUTF16.toChars(val);
ByteBuffer bb = ByteBuffer.wrap(ba);
CharBuffer cb = CharBuffer.wrap(ca, 0, len);
try {
CoderResult cr = ce.encode(cb, bb, true);
if (!cr.isUnderflow())
cr.throwException();
cr = ce.flush(bb);
if (!cr.isUnderflow())
cr.throwException();
} catch (CharacterCodingException x) {
throw new Error(x);
}
return safeTrim(ba, bb.position(), isTrusted);
if (dp == dst.length)
return dst;
return Arrays.copyOf(dst, dp);
}
static byte[] encode(byte coder, byte[] val) {
String csn = Charset.defaultCharset().name();
try {
// use charset name encode() variant which provides caching.
return encode(csn, coder, val);
} catch (UnsupportedEncodingException x) {
warnUnsupportedCharset(csn);
private static byte[] encodeUTF8_UTF16(byte[] val, boolean doReplace) {
int dp = 0;
int sp = 0;
int sl = val.length >> 1;
byte[] dst = new byte[sl * 3];
char c;
while (sp < sl && (c = StringUTF16.getChar(val, sp)) < '\u0080') {
// ascii fast loop;
dst[dp++] = (byte)c;
sp++;
}
try {
return encode("ISO-8859-1", coder, val);
} catch (UnsupportedEncodingException x) {
// If this code is hit during VM initialization, err(String) is
// the only way we will be able to get any kind of error message.
err("ISO-8859-1 charset not available: " + x.toString() + "\n");
// If we can not find ISO-8859-1 (a required encoding) then things
// are seriously wrong with the installation.
System.exit(1);
return null;
while (sp < sl) {
c = StringUTF16.getChar(val, sp++);
if (c < 0x80) {
dst[dp++] = (byte)c;
} else if (c < 0x800) {
dst[dp++] = (byte)(0xc0 | (c >> 6));
dst[dp++] = (byte)(0x80 | (c & 0x3f));
} else if (Character.isSurrogate(c)) {
int uc = -1;
char c2;
if (Character.isHighSurrogate(c) && sp < sl &&
Character.isLowSurrogate(c2 = StringUTF16.getChar(val, sp))) {
uc = Character.toCodePoint(c, c2);
}
if (uc < 0) {
if (doReplace) {
dst[dp++] = '?';
} else {
throwMalformed(sp - 1, 1); // or 2, does not matter here
}
} else {
dst[dp++] = (byte)(0xf0 | ((uc >> 18)));
dst[dp++] = (byte)(0x80 | ((uc >> 12) & 0x3f));
dst[dp++] = (byte)(0x80 | ((uc >> 6) & 0x3f));
dst[dp++] = (byte)(0x80 | (uc & 0x3f));
sp++; // 2 chars
}
} else {
// 3 bytes, 16 bits
dst[dp++] = (byte)(0xe0 | ((c >> 12)));
dst[dp++] = (byte)(0x80 | ((c >> 6) & 0x3f));
dst[dp++] = (byte)(0x80 | (c & 0x3f));
}
}
if (dp == dst.length) {
return dst;
}
return Arrays.copyOf(dst, dp);
}
/**
* Print a message directly to stderr, bypassing all character conversion
* methods.
* @param msg message to print
////////////////////// for j.u.z.ZipCoder //////////////////////////
/*
* Throws IllegalArgumentException, instead of replacing, if malformed or unmappable.
*/
private static native void err(String msg);
// Builds a String from len UTF-8 bytes of src starting at off, without
// replacement: decodeUTF8_0 is called with doReplace == false.
// With compact strings and no negative bytes in the range, the bytes are
// copied straight into a LATIN1 string.
static String newStringUTF8NoRepl(byte[] src, int off, int len) {
    byte[] bytes;
    byte coder;
    if (COMPACT_STRINGS && !hasNegatives(src, off, len)) {
        bytes = Arrays.copyOfRange(src, off, off + len);
        coder = LATIN1;
    } else {
        Result decoded = decodeUTF8_0(src, off, len, false);
        bytes = decoded.value;
        coder = decoded.coder;
    }
    return new String(bytes, coder);
}
/*
* Encodes s as UTF-8 by passing its coder and value array to encodeUTF8
* with doReplace == false, so unmappable input (e.g. an unpaired
* surrogate) results in an IllegalArgumentException instead of being
* replaced.
*/
static byte[] getBytesUTF8NoRepl(String s) {
return encodeUTF8(s.coder(), s.value(), false);
}
}

View file

@ -1,235 +0,0 @@
/*
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation. Oracle designates this
* particular file as subject to the "Classpath" exception as provided
* by Oracle in the LICENSE file that accompanied this code.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package java.lang;
import java.nio.charset.Charset;
import java.util.Arrays;
import static java.lang.String.LATIN1;
import static java.lang.String.UTF16;
import static java.lang.String.COMPACT_STRINGS;
import static java.lang.Character.isSurrogate;
import static java.lang.Character.highSurrogate;
import static java.lang.Character.lowSurrogate;
import static java.lang.Character.isSupplementaryCodePoint;
import static java.lang.StringUTF16.putChar;
/*
 * UTF-8 specialization of StringCoding.StringDecoder.
 *
 * decode() turns a range of UTF-8 bytes into a StringCoding.Result holding
 * either a LATIN1 byte array (only when COMPACT_STRINGS is set and every
 * decoded char fits in one byte) or a UTF16 byte array. Malformed input is
 * never reported: each malformed sequence is replaced by repl (U+FFFD).
 */
class StringDecoderUTF8 extends StringCoding.StringDecoder {
StringDecoderUTF8(Charset cs, String rcn) {
super(cs, rcn);
}
// True when b does not have the 10xxxxxx bit pattern of a continuation byte.
private static boolean isNotContinuation(int b) {
return (b & 0xc0) != 0x80;
}
// True when a 3-byte sequence is invalid: lead 0xE0 followed by a second
// byte in 0x80..0x9F (overlong form), or b2/b3 not a continuation byte.
// b1 is compared against (byte)0xe0, i.e. the sign-extended byte value.
private static boolean isMalformed3(int b1, int b2, int b3) {
return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
(b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80;
}
// Same check as isMalformed3 restricted to the first two bytes; used when
// the input ends before the third byte.
private static boolean isMalformed3_2(int b1, int b2) {
return (b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
(b2 & 0xc0) != 0x80;
}
// True when any trailing byte of a 4-byte sequence is not a continuation byte.
private static boolean isMalformed4(int b2, int b3, int b4) {
return (b2 & 0xc0) != 0x80 || (b3 & 0xc0) != 0x80 ||
(b4 & 0xc0) != 0x80;
}
// Checks the first two bytes of a 4-byte sequence. Both arguments are
// compared with unsigned constants; the caller masks them with 0xff.
// Lead 0xF0 requires b2 in 0x90..0xBF, lead 0xF4 requires b2 in 0x80..0x8F,
// and in every case b2 must be a continuation byte.
private static boolean isMalformed4_2(int b1, int b2) {
return (b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
(b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
(b2 & 0xc0) != 0x80;
}
// True when the third byte of a 4-byte sequence is not a continuation byte.
private static boolean isMalformed4_3(int b3) {
return (b3 & 0xc0) != 0x80;
}
// for nb == 3/4
// Returns how many bytes of the malformed nb-byte sequence at src[sp] the
// caller should skip (it rewinds sp to the sequence start and adds this
// value). Any other nb trips the assertion and returns -1.
private static int malformedN(byte[] src, int sp, int nb) {
if (nb == 3) {
int b1 = src[sp++];
int b2 = src[sp++]; // no need to lookup b3
return ((b1 == (byte)0xe0 && (b2 & 0xe0) == 0x80) ||
isNotContinuation(b2)) ? 1 : 2;
} else if (nb == 4) { // we don't care the speed here
int b1 = src[sp++] & 0xff;
int b2 = src[sp++] & 0xff;
if (b1 > 0xf4 ||
(b1 == 0xf0 && (b2 < 0x90 || b2 > 0xbf)) ||
(b1 == 0xf4 && (b2 & 0xf0) != 0x80) ||
isNotContinuation(b2))
return 1;
if (isNotContinuation(src[sp++]))
return 2;
return 3;
}
assert false;
return -1;
}
// Replacement character U+FFFD emitted for every malformed sequence.
private static char repl = '\ufffd';
// Instance entry point: decodes into the 'result' object, which is not
// declared in this class (presumably inherited from StringDecoder).
StringCoding.Result decode(byte[] src, int sp, int len) {
return decode(src, sp, len, result);
}
// Decodes len bytes of src starting at sp and stores the outcome in ret
// through ret.with(bytes, coder); the value returned by with() is returned.
static StringCoding.Result decode(byte[] src, int sp, int len,
StringCoding.Result ret) {
int sl = sp + len;
// Sized for the Latin1 attempt: one output byte per input byte at most.
byte[] dst = new byte[len];
int dp = 0;
if (COMPACT_STRINGS) { // Latin1 only loop
while (sp < sl) {
int b1 = src[sp];
if (b1 >= 0) {
dst[dp++] = (byte)b1;
sp++;
continue;
}
// Leads 0xC2/0xC3 encode U+0080..U+00FF, which still fit in one byte.
if ((b1 == (byte)0xc2 || b1 == (byte)0xc3) &&
sp + 1 < sl) {
int b2 = src[sp + 1];
if (!isNotContinuation(b2)) {
// XOR with the shifted marker bits strips the 110/10 prefixes.
dst[dp++] = (byte)(((b1 << 6) ^ b2)^
(((byte) 0xC0 << 6) ^
((byte) 0x80 << 0)));
sp += 2;
continue;
}
}
// anything not a latin1, including the repl
// we have to go with the utf16
break;
}
// Whole input consumed by the Latin1 loop: trim and return as LATIN1.
if (sp == sl) {
if (dp != dst.length) {
dst = Arrays.copyOf(dst, dp);
}
return ret.with(dst, LATIN1);
}
}
// Switch to UTF16 (two bytes per char). Chars already produced by the
// Latin1 loop are inflated into the new buffer; from here on dp counts
// chars, not bytes.
if (dp == 0) {
dst = new byte[len << 1];
} else {
byte[] buf = new byte[len << 1];
StringLatin1.inflate(dst, 0, buf, 0, dp);
dst = buf;
}
while (sp < sl) {
int b1 = src[sp++];
if (b1 >= 0) {
// single-byte (ASCII) char
putChar(dst, dp++, (char) b1);
} else if ((b1 >> 5) == -2 && (b1 & 0x1e) != 0) {
// 2-byte lead 110xxxxx; the second test excludes 0xC0/0xC1
if (sp < sl) {
int b2 = src[sp++];
if (isNotContinuation(b2)) {
// replace the lead only and re-examine b2 as a new sequence
putChar(dst, dp++, repl);
sp--;
} else {
putChar(dst, dp++, (char)(((b1 << 6) ^ b2)^
(((byte) 0xC0 << 6) ^
((byte) 0x80 << 0))));
}
continue;
}
// input ends after the lead byte
putChar(dst, dp++, repl);
break;
} else if ((b1 >> 4) == -2) {
// 3-byte lead 1110xxxx
if (sp + 1 < sl) {
int b2 = src[sp++];
int b3 = src[sp++];
if (isMalformed3(b1, b2, b3)) {
// rewind to the lead and skip only the malformed prefix
putChar(dst, dp++, repl);
sp -= 3;
sp += malformedN(src, sp, 3);
} else {
char c = (char)((b1 << 12) ^
(b2 << 6) ^
(b3 ^
(((byte) 0xE0 << 12) ^
((byte) 0x80 << 6) ^
((byte) 0x80 << 0))));
// a surrogate encoded as three bytes is replaced as well
putChar(dst, dp++, isSurrogate(c) ? repl : c);
}
continue;
}
// fewer than three bytes left: a bad second byte is re-examined on
// the next iteration, otherwise decoding stops after the replacement
if (sp < sl && isMalformed3_2(b1, src[sp])) {
putChar(dst, dp++, repl);
continue;
}
putChar(dst, dp++, repl);
break;
} else if ((b1 >> 3) == -2) {
// 4-byte lead 11110xxx
if (sp + 2 < sl) {
int b2 = src[sp++];
int b3 = src[sp++];
int b4 = src[sp++];
int uc = ((b1 << 18) ^
(b2 << 12) ^
(b3 << 6) ^
(b4 ^
(((byte) 0xF0 << 18) ^
((byte) 0x80 << 12) ^
((byte) 0x80 << 6) ^
((byte) 0x80 << 0))));
if (isMalformed4(b2, b3, b4) ||
!isSupplementaryCodePoint(uc)) { // shortest form check
putChar(dst, dp++, repl);
sp -= 4;
sp += malformedN(src, sp, 4);
} else {
// supplementary code point: emitted as a surrogate pair
putChar(dst, dp++, highSurrogate(uc));
putChar(dst, dp++, lowSurrogate(uc));
}
continue;
}
// fewer than four bytes left; compare as unsigned from here on
b1 &= 0xff;
if (b1 > 0xf4 ||
sp < sl && isMalformed4_2(b1, src[sp] & 0xff)) {
putChar(dst, dp++, repl);
continue;
}
sp++;
putChar(dst, dp++, repl);
if (sp < sl && isMalformed4_3(src[sp])) {
continue;
}
break;
} else {
// stray continuation byte or otherwise invalid lead byte
putChar(dst, dp++, repl);
}
}
// dp is the number of chars produced; trim to dp * 2 bytes when fewer
// than len chars were written.
if (dp != len) {
dst = Arrays.copyOf(dst, dp << 1);
}
return ret.with(dst, UTF16);
}
}

View file

@ -2184,6 +2184,15 @@ public final class System {
public Stream<ModuleLayer> layers(ClassLoader loader) {
return ModuleLayer.layers(loader);
}
// Access shim: forwards unchanged to StringCoding.newStringUTF8NoRepl.
public String newStringUTF8NoRepl(byte[] bytes, int off, int len) {
return StringCoding.newStringUTF8NoRepl(bytes, off, len);
}
// Access shim: forwards unchanged to StringCoding.getBytesUTF8NoRepl.
public byte[] getBytesUTF8NoRepl(String s) {
return StringCoding.getBytesUTF8NoRepl(s);
}
});
}
}

View file

@ -28,72 +28,60 @@ package java.util.zip;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CodingErrorAction;
import java.util.Arrays;
import sun.nio.cs.ArrayDecoder;
import sun.nio.cs.ArrayEncoder;
import static java.nio.charset.StandardCharsets.UTF_8;
/**
* Utility class for zipfile name and comment decoding and encoding
*/
final class ZipCoder {
class ZipCoder {
private static boolean isASCII(byte[] ba, int off, int len) {
for (int i = off; i < off + len; i++) {
if (ba[i] < 0)
return false;
private static final jdk.internal.misc.JavaLangAccess JLA =
jdk.internal.misc.SharedSecrets.getJavaLangAccess();
static final class UTF8 extends ZipCoder {
UTF8(Charset utf8) {
super(utf8);
}
@Override
boolean isUTF8() {
return true;
}
@Override
String toString(byte[] ba, int off, int length) {
return JLA.newStringUTF8NoRepl(ba, off, length);
}
@Override
byte[] getBytes(String s) {
return JLA.getBytesUTF8NoRepl(s);
}
return true;
}
private static boolean hasReplaceChar(byte[] ba) {
for (int i = 0; i < ba.length; i++) {
if (ba[i] == (byte)'?')
return true;
}
return false;
// UTF_8.ArrayEn/Decoder is stateless, so make it singleton.
private static ZipCoder utf8 = new UTF8(UTF_8);
// Factory for coders. The shared utf8 instance is handed out only when
// the argument is the very same object as the UTF_8 constant (identity
// comparison); every other charset gets a freshly constructed ZipCoder.
public static ZipCoder get(Charset charset) {
    return (charset == UTF_8) ? utf8 : new ZipCoder(charset);
}
String toString(byte[] ba, int off, int length) {
try {
return decoder().decode(ByteBuffer.wrap(ba, off, length)).toString();
// Fast path for a UTF-8 charset and an ASCII-only name: leverage the
// compact string implementation to avoid the unnecessary char[]
// copy/paste. This is a temporary workaround until we have a better
// approach, such as a String constructor that throws an exception for
// malformed and/or unmappable characters instead of silently
// replacing them with the replacement character.
if (isUTF8 && isASCII(ba, off, length)) {
return new String(ba, off, length, cs);
} catch (CharacterCodingException x) {
throw new IllegalArgumentException(x);
}
CharsetDecoder cd = decoder().reset();
int len = (int)(length * cd.maxCharsPerByte());
char[] ca = new char[len];
if (len == 0)
return new String(ca);
// UTF-8 only for now. Other ArrayDecoders only handle
// CodingErrorAction.REPLACE mode, whereas ZipCoder uses
// REPORT mode.
if (isUTF8 && cd instanceof ArrayDecoder) {
int clen = ((ArrayDecoder)cd).decode(ba, off, length, ca);
if (clen == -1) // malformed
throw new IllegalArgumentException("MALFORMED");
return new String(ca, 0, clen);
}
ByteBuffer bb = ByteBuffer.wrap(ba, off, length);
CharBuffer cb = CharBuffer.wrap(ca);
CoderResult cr = cd.decode(bb, cb, true);
if (!cr.isUnderflow())
throw new IllegalArgumentException(cr.toString());
cr = cd.flush(cb);
if (!cr.isUnderflow())
throw new IllegalArgumentException(cr.toString());
return new String(ca, 0, cb.position());
}
String toString(byte[] ba, int length) {
@ -105,84 +93,47 @@ final class ZipCoder {
}
byte[] getBytes(String s) {
if (isUTF8) {
// fastpath for UTF8. should only occur when the string
// has malformed surrogates. A postscan should still be
// faster and use less memory.
byte[] ba = s.getBytes(cs);
if (!hasReplaceChar(ba)) {
return ba;
try {
ByteBuffer bb = encoder().encode(CharBuffer.wrap(s));
int pos = bb.position();
int limit = bb.limit();
if (bb.hasArray() && pos == 0 && limit == bb.capacity()) {
return bb.array();
}
byte[] bytes = new byte[bb.limit() - bb.position()];
bb.get(bytes);
return bytes;
} catch (CharacterCodingException x) {
throw new IllegalArgumentException(x);
}
CharsetEncoder ce = encoder().reset();
char[] ca = s.toCharArray();
int len = (int)(ca.length * ce.maxBytesPerChar());
byte[] ba = new byte[len];
if (len == 0)
return ba;
// UTF-8 only for now. Other ArrayEncoders only handle
// CodingErrorAction.REPLACE mode.
if (isUTF8 && ce instanceof ArrayEncoder) {
int blen = ((ArrayEncoder)ce).encode(ca, 0, ca.length, ba);
if (blen == -1) // malformed
throw new IllegalArgumentException("MALFORMED");
return Arrays.copyOf(ba, blen);
}
ByteBuffer bb = ByteBuffer.wrap(ba);
CharBuffer cb = CharBuffer.wrap(ca);
CoderResult cr = ce.encode(cb, bb, true);
if (!cr.isUnderflow())
throw new IllegalArgumentException(cr.toString());
cr = ce.flush(bb);
if (!cr.isUnderflow())
throw new IllegalArgumentException(cr.toString());
if (bb.position() == ba.length) // defensive copy?
return ba;
else
return Arrays.copyOf(ba, bb.position());
}
// assume invoked only if "this" is not utf8
byte[] getBytesUTF8(String s) {
// If this coder is itself flagged as UTF-8, encode with it directly.
if (isUTF8)
return getBytes(s);
// Otherwise create a UTF-8 coder on first use, keep it in the utf8
// field, and encode through it.
if (utf8 == null)
utf8 = new ZipCoder(StandardCharsets.UTF_8);
return utf8.getBytes(s);
}
String toStringUTF8(byte[] ba, int len) {
return toStringUTF8(ba, 0, len);
return utf8.toString(ba, 0, len);
}
// Decodes the given bytes as UTF-8 regardless of this coder's own charset.
String toStringUTF8(byte[] ba, int off, int len) {
// If this coder is itself flagged as UTF-8, decode with it directly.
if (isUTF8)
return toString(ba, off, len);
// Otherwise create a UTF-8 coder on first use, keep it in the utf8
// field, and decode through it.
if (utf8 == null)
utf8 = new ZipCoder(StandardCharsets.UTF_8);
return utf8.toString(ba, off, len);
}
boolean isUTF8() {
return isUTF8;
return false;
}
private Charset cs;
private CharsetDecoder dec;
private CharsetEncoder enc;
private boolean isUTF8;
private ZipCoder utf8;
// Stores the charset and records whether it is UTF-8.
private ZipCoder(Charset cs) {
this.cs = cs;
// The UTF-8 check compares charset names, not object identity.
this.isUTF8 = cs.name().equals(StandardCharsets.UTF_8.name());
}
// Creates a new ZipCoder for the given charset on every call.
static ZipCoder get(Charset charset) {
return new ZipCoder(charset);
}
private CharsetDecoder decoder() {
protected CharsetDecoder decoder() {
if (dec == null) {
dec = cs.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
@ -191,7 +142,7 @@ final class ZipCoder {
return dec;
}
private CharsetEncoder encoder() {
protected CharsetEncoder encoder() {
if (enc == null) {
enc = cs.newEncoder()
.onMalformedInput(CodingErrorAction.REPORT)