8243469: Lazily encode name in ZipFile.getEntryPos

Co-authored-by: Eirik Bjørsnøs <eirbjo@gmail.com> Reviewed-by: lancea, simonis
2025-08-28 15:24:43 +02:00 · 2020-04-27 17:26:05 +02:00 · 2020-04-27 17:26:05 +02:00 · d2e0d0e06a
commit d2e0d0e06a
parent c55e7d5a4a
3 changed files with 337 additions and 136 deletions
--- a/src/java.base/share/classes/java/util/zip/ZipCoder.java
+++ b/src/java.base/share/classes/java/util/zip/ZipCoder.java
@ -32,6 +32,7 @@ import java.nio.charset.CharsetDecoder;
 import java.nio.charset.CharsetEncoder;
 import java.nio.charset.CharacterCodingException;
 import java.nio.charset.CodingErrorAction;
+import java.util.Arrays;

 import sun.nio.cs.UTF_8;

@ -43,51 +44,13 @@ class ZipCoder {
    private static final jdk.internal.access.JavaLangAccess JLA =
        jdk.internal.access.SharedSecrets.getJavaLangAccess();

-    static final class UTF8ZipCoder extends ZipCoder {
-
-        // Encoding/decoding is stateless, so make it singleton.
-        static final ZipCoder INSTANCE = new UTF8ZipCoder(UTF_8.INSTANCE);
-
-        private UTF8ZipCoder(Charset utf8) {
-            super(utf8);
-        }
-
-        @Override
-        boolean isUTF8() {
-            return true;
-        }
-
-        @Override
-        String toString(byte[] ba, int off, int length) {
-            return JLA.newStringUTF8NoRepl(ba, off, length);
-        }
-
-        @Override
-        byte[] getBytes(String s) {
-            return JLA.getBytesUTF8NoRepl(s);
-        }
-
-        @Override
-        int hashN(byte[] a, int off, int len) {
-            // Performance optimization: when UTF8-encoded, ZipFile.getEntryPos
-            // assume that the hash of a name remains unchanged when appending a
-            // trailing '/', which allows lookups to avoid rehashing
-            int end = off + len;
-            if (len > 0 && a[end - 1] == '/') {
-                end--;
-            }
-
-            int h = 1;
-            for (int i = off; i < end; i++) {
-                h = 31 * h + a[i];
-            }
-            return h;
-        }
-    }
+    // Encoding/decoding is stateless, so make it singleton.
+    static final UTF8ZipCoder UTF8 = new UTF8ZipCoder(UTF_8.INSTANCE);

    public static ZipCoder get(Charset charset) {
-        if (charset == UTF_8.INSTANCE)
-            return UTF8ZipCoder.INSTANCE;
+        if (charset == UTF_8.INSTANCE) {
+            return UTF8;
+        }
        return new ZipCoder(charset);
    }

@ -123,40 +86,74 @@ class ZipCoder {
        }
    }

-    // assume invoked only if "this" is not utf8
-    byte[] getBytesUTF8(String s) {
-        return UTF8ZipCoder.INSTANCE.getBytes(s);
-    }
-
    String toStringUTF8(byte[] ba, int len) {
-        return UTF8ZipCoder.INSTANCE.toString(ba, 0, len);
-    }
-
-    String toStringUTF8(byte[] ba, int off, int len) {
-        return UTF8ZipCoder.INSTANCE.toString(ba, off, len);
+        return UTF8.toString(ba, 0, len);
    }

    boolean isUTF8() {
        return false;
    }

-    int hashN(byte[] a, int off, int len) {
-        int h = 1;
-        while (len-- > 0) {
-            h = 31 * h + a[off++];
+    // Hash code functions for ZipFile entry names. We generate the hash as-if
+    // we first decoded the byte sequence to a String, then appended '/' if no
+    // trailing slash was found, then called String.hashCode(). This
+    // normalization ensures we can simplify and speed up lookups.
+    int normalizedHash(byte[] a, int off, int len) {
+        if (len == 0) {
+            return 0;
+        }
+        return normalizedHashDecode(0, a, off, off + len);
+    }
+
+    // Matching normalized hash code function for Strings
+    static int normalizedHash(String name) {
+        int hsh = name.hashCode();
+        int len = name.length();
+        if (len > 0 && name.charAt(len - 1) != '/') {
+            hsh = hsh * 31 + '/';
+        }
+        return hsh;
+    }
+
+    boolean hasTrailingSlash(byte[] a, int end) {
+        byte[] slashBytes = slashBytes();
+        return end >= slashBytes.length &&
+            Arrays.mismatch(a, end - slashBytes.length, end, slashBytes, 0, slashBytes.length) == -1;
+    }
+
+    // Implements normalizedHash by decoding byte[] to char[] and then computing
+    // the hash. This is a slow-path used for non-UTF8 charsets and also when
+    // aborting the ASCII fast-path in the UTF8 implementation, so {@code h}
+    // might be a partially calculated hash code
+    int normalizedHashDecode(int h, byte[] a, int off, int end) {
+        try {
+            // cb will be a newly allocated CharBuffer with pos == 0,
+            // arrayOffset == 0, backed by an array.
+            CharBuffer cb = decoder().decode(ByteBuffer.wrap(a, off, end - off));
+            int limit = cb.limit();
+            char[] decoded = cb.array();
+            for (int i = 0; i < limit; i++) {
+                h = 31 * h + decoded[i];
+            }
+            if (limit > 0 && decoded[limit - 1] != '/') {
+                h = 31 * h + '/';
+            }
+        } catch (CharacterCodingException cce) {
+            // Ignore - return the hash code generated so far.
        }
        return h;
    }

-    private Charset cs;
-    private CharsetDecoder dec;
+    private byte[] slashBytes;
+    private final Charset cs;
+    protected CharsetDecoder dec;
    private CharsetEncoder enc;

    private ZipCoder(Charset cs) {
        this.cs = cs;
    }

-    private CharsetDecoder decoder() {
+    protected CharsetDecoder decoder() {
        if (dec == null) {
            dec = cs.newDecoder()
              .onMalformedInput(CodingErrorAction.REPORT)
@ -173,4 +170,73 @@ class ZipCoder {
        }
        return enc;
    }
+
+    // This method produces an array with the bytes that will correspond to a
+    // trailing '/' in the chosen character encoding.
+    //
+    // While in most charsets a trailing slash will be encoded as the byte
+    // value of '/', this does not hold in the general case. E.g., in charsets
+    // such as UTF-16 and UTF-32 it will be represented by a sequence of 2 or 4
+    // bytes, respectively.
+    private byte[] slashBytes() {
+        if (slashBytes == null) {
+            // Take into account charsets that produce a BOM, e.g., UTF-16
+            byte[] slash = "/".getBytes(cs);
+            byte[] doubleSlash = "//".getBytes(cs);
+            slashBytes = Arrays.copyOfRange(doubleSlash, slash.length, doubleSlash.length);
+        }
+        return slashBytes;
+    }
+
+    static final class UTF8ZipCoder extends ZipCoder {
+
+        private UTF8ZipCoder(Charset utf8) {
+            super(utf8);
+        }
+
+        @Override
+        boolean isUTF8() {
+            return true;
+        }
+
+        @Override
+        String toString(byte[] ba, int off, int length) {
+            return JLA.newStringUTF8NoRepl(ba, off, length);
+        }
+
+        @Override
+        byte[] getBytes(String s) {
+            return JLA.getBytesUTF8NoRepl(s);
+        }
+
+        @Override
+        int normalizedHash(byte[] a, int off, int len) {
+            if (len == 0) {
+                return 0;
+            }
+
+            int end = off + len;
+            int h = 0;
+            while (off < end) {
+                byte b = a[off];
+                if (b < 0) {
+                    // Non-ASCII, fall back to decoder loop
+                    return normalizedHashDecode(h, a, off, end);
+                } else {
+                    h = 31 * h + b;
+                    off++;
+                }
+            }
+
+            if (a[end - 1] != '/') {
+                h = 31 * h + '/';
+            }
+            return h;
+        }
+
+        @Override
+        boolean hasTrailingSlash(byte[] a, int end) {
+            return end > 0 && a[end - 1] == '/';
+        }
+    }
 }