8311906: Improve robustness of String constructors with mutable array inputs

Co-authored-by: Damon Fenacci <dfenacci@openjdk.org>
Co-authored-by: Claes Redestad <redestad@openjdk.org>
Co-authored-by: Amit Kumar <amitkumar@openjdk.org>
Co-authored-by: Martin Doerr <mdoerr@openjdk.org>
Reviewed-by: rgiulietti, thartmann, redestad, dfenacci
2025-08-27 06:45:07 +02:00 · 2023-12-04 18:28:59 +00:00 · 2023-12-04 18:28:59 +00:00 · 155abc576a
commit 155abc576a
parent 316b78336c
15 changed files with 1300 additions and 248 deletions
--- a/src/java.base/share/classes/java/lang/String.java
+++ b/src/java.base/share/classes/java/lang/String.java
@ -273,6 +273,9 @@ public final class String
     * contents of the character array are copied; subsequent modification of
     * the character array does not affect the newly created string.
     *
+     * <p> The contents of the string are unspecified if the character array
+     * is modified during string construction.
+     *
     * @param  value
     *         The initial value of the string
     */
@ -288,6 +291,9 @@ public final class String
     * subarray are copied; subsequent modification of the character array does
     * not affect the newly created string.
     *
+     * <p> The contents of the string are unspecified if the character array
+     * is modified during string construction.
+     *
     * @param  value
     *         Array that is the source of characters
     *
@ -319,6 +325,9 @@ public final class String
     * {@code char}s; subsequent modification of the {@code int} array does not
     * affect the newly created string.
     *
+     * <p> The contents of the string are unspecified if the codepoints array
+     * is modified during string construction.
+     *
     * @param  codePoints
     *         Array that is the source of Unicode code points
     *
@ -346,12 +355,10 @@ public final class String
            return;
        }
        if (COMPACT_STRINGS) {
-            byte[] val = StringLatin1.toBytes(codePoints, offset, count);
-            if (val != null) {
-                this.coder = LATIN1;
-                this.value = val;
-                return;
-            }
+            byte[] val = StringUTF16.compress(codePoints, offset, count);
+            this.coder = StringUTF16.coderFromArrayLen(val, count);
+            this.value = val;
+            return;
        }
        this.coder = UTF16;
        this.value = StringUTF16.toBytes(codePoints, offset, count);
@ -368,6 +375,9 @@ public final class String
     * <p> Each {@code byte} in the subarray is converted to a {@code char} as
     * specified in the {@link #String(byte[],int) String(byte[],int)} constructor.
     *
+     * <p> The contents of the string are unspecified if the byte array
+     * is modified during string construction.
+     *
     * @deprecated This method does not properly convert bytes into characters.
     * As of JDK&nbsp;1.1, the preferred way to do this is via the
     * {@code String} constructors that take a {@link Charset}, charset name,
@ -429,6 +439,9 @@ public final class String
     *                         | (<b><i>b</i></b> &amp; 0xff))
     * </pre></blockquote>
     *
+     * <p> The contents of the string are unspecified if the byte array
+     * is modified during string construction.
+     *
     * @deprecated  This method does not properly convert bytes into
     * characters.  As of JDK&nbsp;1.1, the preferred way to do this is via the
     * {@code String} constructors that take a {@link Charset}, charset name,
@ -463,6 +476,9 @@ public final class String
     * java.nio.charset.CharsetDecoder} class should be used when more control
     * over the decoding process is required.
     *
+     * <p> The contents of the string are unspecified if the byte array
+     * is modified during string construction.
+     *
     * @param  bytes
     *         The bytes to be decoded into characters
     *
@ -501,6 +517,9 @@ public final class String
     * java.nio.charset.CharsetDecoder} class should be used when more control
     * over the decoding process is required.
     *
+     * <p> The contents of the string are unspecified if the byte array
+     * is modified during string construction.
+     *
     * @param  bytes
     *         The bytes to be decoded into characters
     *
@ -543,47 +562,43 @@ public final class String
                    this.coder = LATIN1;
                    return;
                }
-                int sl = offset + length;
-                byte[] dst = new byte[length];
-                if (dp > 0) {
-                    System.arraycopy(bytes, offset, dst, 0, dp);
-                    offset += dp;
-                }
-                while (offset < sl) {
-                    int b1 = bytes[offset++];
+                // Decode with a stable copy, to be the result if the decoded length is the same
+                byte[] latin1 = Arrays.copyOfRange(bytes, offset, offset + length);
+                int sp = dp;            // first dp bytes are already in the copy
+                while (sp < length) {
+                    int b1 = latin1[sp++];
                    if (b1 >= 0) {
-                        dst[dp++] = (byte)b1;
+                        latin1[dp++] = (byte)b1;
                        continue;
                    }
-                    if ((b1 & 0xfe) == 0xc2 && offset < sl) { // b1 either 0xc2 or 0xc3
-                        int b2 = bytes[offset];
+                    if ((b1 & 0xfe) == 0xc2 && sp < length) { // b1 either 0xc2 or 0xc3
+                        int b2 = latin1[sp];
                        if (b2 < -64) { // continuation bytes are always negative values in the range -128 to -65
-                            dst[dp++] = (byte)decode2(b1, b2);
-                            offset++;
+                            latin1[dp++] = (byte)decode2(b1, b2);
+                            sp++;
                            continue;
                        }
                    }
                    // anything not a latin1, including the REPL
                    // we have to go with the utf16
-                    offset--;
+                    sp--;
                    break;
                }
-                if (offset == sl) {
-                    if (dp != dst.length) {
-                        dst = Arrays.copyOf(dst, dp);
+                if (sp == length) {
+                    if (dp != latin1.length) {
+                        latin1 = Arrays.copyOf(latin1, dp);
                    }
-                    this.value = dst;
+                    this.value = latin1;
                    this.coder = LATIN1;
                    return;
                }
-                byte[] buf = new byte[length << 1];
-                StringLatin1.inflate(dst, 0, buf, 0, dp);
-                dst = buf;
-                dp = decodeUTF8_UTF16(bytes, offset, sl, dst, dp, true);
+                byte[] utf16 = new byte[length << 1];
+                StringLatin1.inflate(latin1, 0, utf16, 0, dp);
+                dp = decodeUTF8_UTF16(latin1, sp, length, utf16, dp, true);
                if (dp != length) {
-                    dst = Arrays.copyOf(dst, dp << 1);
+                    utf16 = Arrays.copyOf(utf16, dp << 1);
                }
-                this.value = dst;
+                this.value = utf16;
                this.coder = UTF16;
            } else { // !COMPACT_STRINGS
                byte[] dst = new byte[length << 1];
@ -655,12 +670,10 @@ public final class String
                char[] ca = new char[en];
                int clen = ad.decode(bytes, offset, length, ca);
                if (COMPACT_STRINGS) {
-                    byte[] bs = StringUTF16.compress(ca, 0, clen);
-                    if (bs != null) {
-                        value = bs;
-                        coder = LATIN1;
-                        return;
-                    }
+                    byte[] val = StringUTF16.compress(ca, 0, clen);
+                    this.coder = StringUTF16.coderFromArrayLen(val, clen);
+                    this.value = val;
+                    return;
                }
                coder = UTF16;
                value = StringUTF16.toBytes(ca, 0, clen);
@ -686,12 +699,10 @@ public final class String
                throw new Error(x);
            }
            if (COMPACT_STRINGS) {
-                byte[] bs = StringUTF16.compress(ca, 0, caLen);
-                if (bs != null) {
-                    value = bs;
-                    coder = LATIN1;
-                    return;
-                }
+                byte[] val = StringUTF16.compress(ca, 0, caLen);
+                this.coder = StringUTF16.coderFromArrayLen(val, caLen);
+                this.value = val;
+                return;
            }
            coder = UTF16;
            value = StringUTF16.toBytes(ca, 0, caLen);
@ -829,10 +840,9 @@ public final class String
            throw new IllegalArgumentException(x);
        }
        if (COMPACT_STRINGS) {
-            byte[] bs = StringUTF16.compress(ca, 0, caLen);
-            if (bs != null) {
-                return new String(bs, LATIN1);
-            }
+            byte[] val = StringUTF16.compress(ca, 0, caLen);
+            byte coder = StringUTF16.coderFromArrayLen(val, caLen);
+            return new String(val, coder);
        }
        return new String(StringUTF16.toBytes(ca, 0, caLen), UTF16);
    }
@ -1386,6 +1396,9 @@ public final class String
     * java.nio.charset.CharsetDecoder} class should be used when more control
     * over the decoding process is required.
     *
+     * <p> The contents of the string are unspecified if the byte array
+     * is modified during string construction.
+     *
     * @param  bytes
     *         The bytes to be decoded into characters
     *
@ -1414,6 +1427,9 @@ public final class String
     * java.nio.charset.CharsetDecoder} class should be used when more control
     * over the decoding process is required.
     *
+     * <p> The contents of the string are unspecified if the byte array
+     * is modified during string construction.
+     *
     * @param  bytes
     *         The bytes to be decoded into characters
     *
@ -1438,6 +1454,9 @@ public final class String
     * java.nio.charset.CharsetDecoder} class should be used when more control
     * over the decoding process is required.
     *
+     * <p> The contents of the string are unspecified if the byte array
+     * is modified during string construction.
+     *
     * @param  bytes
     *         The bytes to be decoded into characters
     *
@ -1468,6 +1487,9 @@ public final class String
     * java.nio.charset.CharsetDecoder} class should be used when more control
     * over the decoding process is required.
     *
+     * <p> The contents of the string are unspecified if the byte array
+     * is modified during string construction.
+     *
     * @param  bytes
     *         The bytes to be decoded into characters
     *
@ -1496,6 +1518,9 @@ public final class String
     * string builder are copied; subsequent modification of the string builder
     * does not affect the newly created string.
     *
+     * <p> The contents of the string are unspecified if the {@code StringBuilder}
+     * is modified during string construction.
+     *
     * <p> This constructor is provided to ease migration to {@code
     * StringBuilder}. Obtaining a string from a string builder via the {@code
     * toString} method is likely to run faster and is generally preferred.
@ -4488,6 +4513,9 @@ public final class String
     * modification of the character array does not affect the returned
     * string.
     *
+     * <p> The contents of the string are unspecified if the character array
+     * is modified during string construction.
+     *
     * @param   data     the character array.
     * @return  a {@code String} that contains the characters of the
     *          character array.
@ -4506,6 +4534,9 @@ public final class String
     * are copied; subsequent modification of the character array does not
     * affect the returned string.
     *
+     * <p> The contents of the string are unspecified if the character array
+     * is modified during string construction.
+     *
     * @param   data     the character array.
     * @param   offset   initial offset of the subarray.
     * @param   count    length of the subarray.
@ -4767,15 +4798,18 @@ public final class String
    }

    /*
-     * Package private constructor. Trailing Void argument is there for
+     * Private constructor. Trailing Void argument is there for
     * disambiguating it against other (public) constructors.
     *
     * Stores the char[] value into a byte[] that each byte represents
     * the8 low-order bits of the corresponding character, if the char[]
     * contains only latin1 character. Or a byte[] that stores all
     * characters in their byte sequences defined by the {@code StringUTF16}.
+     *
+     * <p> The contents of the string are unspecified if the character array
+     * is modified during string construction.
     */
-    String(char[] value, int off, int len, Void sig) {
+    private String(char[] value, int off, int len, Void sig) {
        if (len == 0) {
            this.value = "".value;
            this.coder = "".coder;
@ -4783,11 +4817,9 @@ public final class String
        }
        if (COMPACT_STRINGS) {
            byte[] val = StringUTF16.compress(value, off, len);
-            if (val != null) {
-                this.value = val;
-                this.coder = LATIN1;
-                return;
-            }
+            this.coder = StringUTF16.coderFromArrayLen(val, len);
+            this.value = val;
+            return;
        }
        this.coder = UTF16;
        this.value = StringUTF16.toBytes(value, off, len);
@ -4796,6 +4828,9 @@ public final class String
    /*
     * Package private constructor. Trailing Void argument is there for
     * disambiguating it against other (public) constructors.
+     *
+     * <p> The contents of the string are unspecified if the {@code StringBuilder}
+     * is modified during string construction.
     */
    String(AbstractStringBuilder asb, Void sig) {
        byte[] val = asb.getValue();
@ -4806,12 +4841,9 @@ public final class String
        } else {
            // only try to compress val if some characters were deleted.
            if (COMPACT_STRINGS && asb.maybeLatin1) {
-                byte[] buf = StringUTF16.compress(val, 0, length);
-                if (buf != null) {
-                    this.coder = LATIN1;
-                    this.value = buf;
-                    return;
-                }
+                this.value = StringUTF16.compress(val, 0, length);
+                this.coder = StringUTF16.coderFromArrayLen(this.value, length);
+                return;
            }
            this.coder = UTF16;
            this.value = Arrays.copyOfRange(val, 0, length << 1);