8241055: Regex Grapheme Matcher Performance Depends too much on Total Input Sequence Size

Reviewed-by: naoto
2025-08-27 14:54:52 +02:00 · 2020-04-16 16:09:47 -07:00 · 2020-04-16 16:09:47 -07:00 · 455eaca215
commit 455eaca215
parent d0f5c5c6bb
4 changed files with 116 additions and 83 deletions
--- a/src/java.base/share/classes/java/util/regex/Grapheme.java
+++ b/src/java.base/share/classes/java/util/regex/Grapheme.java
@@ -30,21 +30,8 @@ import java.util.Objects;
 final class Grapheme {

    /**
-     * Determines if there is an extended  grapheme cluster boundary between two
-     * continuing characters {@code cp1} and {@code cp2}.
-     * <p>
-     * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
-     * for the extended grapheme cluster boundary rules
-     * <p>
-     * Note: this method does not take care of stateful breaking.
-     */
-    static boolean isBoundary(int cp1, int cp2) {
-        return rules[getType(cp1)][getType(cp2)];
-    }
-
-    /**
-     * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
-     * the start of the char sequence is a boundary.
+     * Look for the next extended grapheme cluster boundary in a CharSequence.
+     * It assumes the start of the char sequence at offset {@code off} is a boundary.
     * <p>
     * See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
     * for the extended grapheme cluster boundary rules. The following implementation
@@ -54,21 +41,20 @@ final class Grapheme {
     * @param src the {@code CharSequence} to be scanned
     * @param off offset to start looking for the next boundary in the src
     * @param limit limit offset in the src (exclusive)
-     * @return the next possible boundary
+     * @return the next grapheme boundary
     */
    static int nextBoundary(CharSequence src, int off, int limit) {
        Objects.checkFromToIndex(off, limit, src.length());

-        int ch0 = Character.codePointAt(src, 0);
-        int ret = Character.charCount(ch0);
-        int ch1;
+        int ch0 = Character.codePointAt(src, off);
+        int ret = off + Character.charCount(ch0);
        // indicates whether gb11 or gb12 is underway
-        int t0 = getGraphemeType(ch0);
+        int t0 = getType(ch0);
        int riCount = t0 == RI ? 1 : 0;
        boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC;
        while (ret < limit) {
-            ch1 = Character.codePointAt(src, ret);
-            int t1 = getGraphemeType(ch1);
+            int ch1 = Character.codePointAt(src, ret);
+            int t1 = getType(ch1);

            if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
                // continue for gb11
@@ -177,7 +163,8 @@ final class Grapheme {
               cp == 0xAA7B || cp == 0xAA7D;
    }

-    private static int getGraphemeType(int cp) {
+    @SuppressWarnings("fallthrough")
+    private static int getType(int cp) {
        if (cp < 0x007F) { // ASCII
            if (cp < 32) { // Control characters
                if (cp == 0x000D)
@@ -188,11 +175,7 @@ final class Grapheme {
            }
            return OTHER;
        }
-        return getType(cp);
-    }

-    @SuppressWarnings("fallthrough")
-    private static int getType(int cp) {
        if (EmojiData.isExtendedPictographic(cp)) {
            return EXTENDED_PICTOGRAPHIC;
        }
--- a/src/java.base/share/classes/java/util/regex/Pattern.java
+++ b/src/java.base/share/classes/java/util/regex/Pattern.java
@@ -4035,17 +4035,8 @@ loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
            if (i < matcher.to) {
                int ch0 = Character.codePointAt(seq, i);
                int n = Character.charCount(ch0);
-                int j = i + n;
-                // Fast check if it's necessary to call Normalizer;
-                // testing Grapheme.isBoundary is enough for this case
-                while (j < matcher.to) {
-                    int ch1 = Character.codePointAt(seq, j);
-                    if (Grapheme.isBoundary(ch0, ch1))
-                        break;
-                    ch0 = ch1;
-                    j += Character.charCount(ch1);
-                }
-                if (i + n == j) {    // single, assume nfc cp
+                int j = Grapheme.nextBoundary(seq, i, matcher.to);
+                if (i + n == j) { // single cp grapheme, assume nfc
                    if (predicate.is(ch0))
                        return next.match(matcher, j, seq);
                } else {
@@ -4109,13 +4100,12 @@ loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
                endIndex = matcher.getTextLength();
            }
            if (i == startIndex) {
-                return next.match(matcher, i, seq);
-            }
-            if (i < endIndex) {
-                if (Character.isSurrogatePair(seq.charAt(i-1), seq.charAt(i)) ||
-                    Grapheme.nextBoundary(seq,
-                        i - Character.charCount(Character.codePointBefore(seq, i)),
-                        i + Character.charCount(Character.codePointAt(seq, i))) > i) {
+                // continue with return below
+            } else if (i < endIndex) {
+                if (Character.isSurrogatePair(seq.charAt(i - 1), seq.charAt(i))) {
+                    return false;
+                }
+                if (Grapheme.nextBoundary(seq, matcher.last, endIndex) > i) {
                    return false;
                }
            } else {