8243469: Lazily encode name in ZipFile.getEntryPos

Co-authored-by: Eirik Bjørsnøs <eirbjo@gmail.com>
Reviewed-by: lancea, simonis
This commit is contained in:
Claes Redestad 2020-04-27 17:26:05 +02:00
parent c55e7d5a4a
commit d2e0d0e06a
3 changed files with 337 additions and 136 deletions

View file

@ -32,6 +32,7 @@ import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.CodingErrorAction;
import java.util.Arrays;
import sun.nio.cs.UTF_8;
@ -43,51 +44,13 @@ class ZipCoder {
// Handle to java.lang internals obtained through SharedSecrets; the UTF-8
// coder in this file uses it for newStringUTF8NoRepl and getBytesUTF8NoRepl.
private static final jdk.internal.access.JavaLangAccess JLA =
jdk.internal.access.SharedSecrets.getJavaLangAccess();
static final class UTF8ZipCoder extends ZipCoder {
    // Encoding and decoding keep no state, so one shared instance suffices.
    static final ZipCoder INSTANCE = new UTF8ZipCoder(UTF_8.INSTANCE);

    private UTF8ZipCoder(Charset utf8) {
        super(utf8);
    }

    // This coder is the UTF-8 one.
    @Override
    boolean isUTF8() {
        return true;
    }

    // Builds a String from length bytes of ba starting at off.
    @Override
    String toString(byte[] ba, int off, int length) {
        return JLA.newStringUTF8NoRepl(ba, off, length);
    }

    // Encodes s into a byte array.
    @Override
    byte[] getBytes(String s) {
        return JLA.getBytesUTF8NoRepl(s);
    }

    // Hashes len bytes of a starting at off, leaving out one trailing '/'
    // if present. ZipFile.getEntryPos relies on a name hashing the same
    // with or without that trailing '/', so lookups need not rehash.
    @Override
    int hashN(byte[] a, int off, int len) {
        int limit = off + len;
        if (len > 0 && a[limit - 1] == '/') {
            // Exclude the trailing slash from the hashed range.
            limit--;
        }
        int hash = 1;
        int idx = off;
        while (idx < limit) {
            hash = 31 * hash + a[idx];
            idx++;
        }
        return hash;
    }
}
// Encoding/decoding is stateless, so a single UTF-8 coder instance is created
// once here and shared.
static final UTF8ZipCoder UTF8 = new UTF8ZipCoder(UTF_8.INSTANCE);
// Returns the coder for the given charset: a shared UTF-8 coder when charset
// is the UTF_8.INSTANCE object (compared by identity), otherwise a newly
// constructed ZipCoder for that charset.
// NOTE(review): the same UTF_8 check appears twice in a row, returning
// UTF8ZipCoder.INSTANCE first and UTF8 second, so the second branch can never
// be taken. This looks like the old and new lines of a change shown together;
// confirm which one is intended.
public static ZipCoder get(Charset charset) {
if (charset == UTF_8.INSTANCE)
return UTF8ZipCoder.INSTANCE;
if (charset == UTF_8.INSTANCE) {
return UTF8;
}
return new ZipCoder(charset);
}
@ -123,40 +86,74 @@ class ZipCoder {
}
}
// Convenience helpers that go through the UTF-8 coder regardless of the
// charset this coder was created with.
// assume invoked only if "this" is not utf8
byte[] getBytesUTF8(String s) {
return UTF8ZipCoder.INSTANCE.getBytes(s);
}
// Builds a String from the first len bytes of ba using the UTF-8 coder.
String toStringUTF8(byte[] ba, int len) {
return UTF8ZipCoder.INSTANCE.toString(ba, 0, len);
}
// Builds a String from len bytes of ba starting at off using the UTF-8 coder.
// NOTE(review): this body contains two consecutive return statements, and the
// second one ignores off and starts at index 0. This looks like the old and
// new lines of a change shown together; confirm against the real file.
String toStringUTF8(byte[] ba, int off, int len) {
return UTF8ZipCoder.INSTANCE.toString(ba, off, len);
return UTF8.toString(ba, 0, len);
}
// Reports whether this coder is the UTF-8 coder. The base implementation
// answers false; UTF8ZipCoder overrides it to answer true.
boolean isUTF8() {
return false;
}
int hashN(byte[] a, int off, int len) {
int h = 1;
while (len-- > 0) {
h = 31 * h + a[off++];
// Hash code functions for ZipFile entry names. We generate the hash as-if
// we first decoded the byte sequence to a String, then appended '/' if no
// trailing slash was found, then called String.hashCode(). This
// normalization ensures we can simplify and speed up lookups.
// Normalized hash of the len bytes of a starting at off. An empty range
// hashes to 0; any other range is decoded and hashed by
// normalizedHashDecode, starting from a seed of 0.
int normalizedHash(byte[] a, int off, int len) {
    return (len == 0) ? 0 : normalizedHashDecode(0, a, off, off + len);
}
// String counterpart of the byte-based normalized hash: the result is
// String.hashCode() of the name with a '/' appended when the name is
// non-empty and does not already end with one. The empty name hashes to
// its plain hashCode().
static int normalizedHash(String name) {
    final int plain = name.hashCode();
    if (name.isEmpty() || name.endsWith("/")) {
        // Nothing to append: either no characters at all, or the slash is
        // already there.
        return plain;
    }
    return 31 * plain + '/';
}
// Tells whether the bytes of a that end just before index end finish with
// this coder's encoding of '/', as produced by slashBytes().
boolean hasTrailingSlash(byte[] a, int end) {
    final byte[] slash = slashBytes();
    final int start = end - slash.length;
    if (start < 0) {
        // Fewer bytes available than an encoded slash needs.
        return false;
    }
    int firstDifference = Arrays.mismatch(a, start, end, slash, 0, slash.length);
    return firstDifference == -1;
}
// Slow path of normalizedHash: decodes bytes [off, end) of a into chars,
// folds each char into h, and then folds in a '/' when the decoded text does
// not already end with one. It serves non-UTF8 charsets and also the UTF8
// implementation when that abandons its ASCII fast path, which is why
// {@code h} may arrive as a partially computed hash code. If decoding fails,
// h is returned as it was passed in.
int normalizedHashDecode(int h, byte[] a, int off, int end) {
    final CharBuffer chars;
    try {
        // decode() yields a newly allocated, array-backed CharBuffer with
        // pos == 0 and arrayOffset == 0.
        chars = decoder().decode(ByteBuffer.wrap(a, off, end - off));
    } catch (CharacterCodingException ignored) {
        // Ignore - return the hash code generated so far.
        return h;
    }
    final int count = chars.limit();
    final char[] text = chars.array();
    int hash = h;
    for (int i = 0; i < count; i++) {
        hash = 31 * hash + text[i];
    }
    if (count > 0 && text[count - 1] != '/') {
        hash = 31 * hash + '/';
    }
    return hash;
}
// Coder state: the charset, plus helpers that are created on first use
// (dec by decoder(), slashBytes by slashBytes()).
// NOTE(review): cs and dec are each declared twice with different modifiers.
// This looks like the old and new lines of a change shown together; confirm
// which declarations belong to the real file.
private Charset cs;
private CharsetDecoder dec;
private byte[] slashBytes;
private final Charset cs;
protected CharsetDecoder dec;
private CharsetEncoder enc;
// Creates a coder bound to the given charset. The constructor is private;
// within this file it is reached from get(Charset) and from the nested
// UTF8ZipCoder subclass.
private ZipCoder(Charset cs) {
this.cs = cs;
}
private CharsetDecoder decoder() {
protected CharsetDecoder decoder() {
if (dec == null) {
dec = cs.newDecoder()
.onMalformedInput(CodingErrorAction.REPORT)
@ -173,4 +170,73 @@ class ZipCoder {
}
return enc;
}
// Returns the bytes that represent a trailing '/' in this coder's charset,
// computing them on first use and keeping them in the slashBytes field.
//
// Most charsets encode '/' as the single byte value of '/', but that is not
// true in general: UTF-16 and UTF-32, for example, use a sequence of 2 or 4
// bytes, respectively.
private byte[] slashBytes() {
    if (slashBytes != null) {
        return slashBytes;
    }
    // Some charsets (e.g., UTF-16) put a BOM in front of the output. Encoding
    // "//" and dropping as many leading bytes as "/" alone occupies leaves
    // just the bytes of the second, trailing slash.
    final int singleLength = "/".getBytes(cs).length;
    final byte[] twoSlashes = "//".getBytes(cs);
    slashBytes = Arrays.copyOfRange(twoSlashes, singleLength, twoSlashes.length);
    return slashBytes;
}
// ZipCoder specialization for UTF-8 that replaces the generic decoding paths
// with direct byte-level handling where it can.
static final class UTF8ZipCoder extends ZipCoder {

    private UTF8ZipCoder(Charset utf8) {
        super(utf8);
    }

    // This coder is the UTF-8 one.
    @Override
    boolean isUTF8() {
        return true;
    }

    // Builds a String from length bytes of ba starting at off.
    @Override
    String toString(byte[] ba, int off, int length) {
        return JLA.newStringUTF8NoRepl(ba, off, length);
    }

    // Encodes s into a byte array.
    @Override
    byte[] getBytes(String s) {
        return JLA.getBytesUTF8NoRepl(s);
    }

    // Normalized hash of len bytes of a starting at off. Bytes with
    // non-negative values are folded in directly; at the first negative
    // (non-ASCII) byte the remainder is handed, together with the hash so
    // far, to normalizedHashDecode. An empty range hashes to 0.
    @Override
    int normalizedHash(byte[] a, int off, int len) {
        if (len == 0) {
            return 0;
        }
        final int end = off + len;
        int hash = 0;
        for (int i = off; i < end; i++) {
            final byte current = a[i];
            if (current < 0) {
                // Non-ASCII, fall back to decoder loop
                return normalizedHashDecode(hash, a, i, end);
            }
            hash = 31 * hash + current;
        }
        // All bytes were ASCII: fold in a '/' unless the name already ends
        // with one.
        return (a[end - 1] == '/') ? hash : 31 * hash + '/';
    }

    // A trailing slash is the single byte '/' immediately before index end.
    @Override
    boolean hasTrailingSlash(byte[] a, int end) {
        if (end <= 0) {
            return false;
        }
        return a[end - 1] == '/';
    }
}
}