8241055: Regex Grapheme Matcher Performance Depends too much on Total Input Sequence Size

Reviewed-by: naoto
2025-08-28 15:24:43 +02:00 · 2020-04-16 16:09:47 -07:00 · 2020-04-16 16:09:47 -07:00 · 455eaca215
commit 455eaca215
parent d0f5c5c6bb
4 changed files with 116 additions and 83 deletions
--- a/src/java.base/share/classes/java/util/regex/Pattern.java
+++ b/src/java.base/share/classes/java/util/regex/Pattern.java
@@ -4035,17 +4035,8 @@ loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
            if (i < matcher.to) {
                int ch0 = Character.codePointAt(seq, i);
                int n = Character.charCount(ch0);
-                int j = i + n;
-                // Fast check if it's necessary to call Normalizer;
-                // testing Grapheme.isBoundary is enough for this case
-                while (j < matcher.to) {
-                    int ch1 = Character.codePointAt(seq, j);
-                    if (Grapheme.isBoundary(ch0, ch1))
-                        break;
-                    ch0 = ch1;
-                    j += Character.charCount(ch1);
-                }
-                if (i + n == j) {    // single, assume nfc cp
+                int j = Grapheme.nextBoundary(seq, i, matcher.to);
+                if (i + n == j) { // single cp grapheme, assume nfc
                    if (predicate.is(ch0))
                        return next.match(matcher, j, seq);
                } else {
@@ -4109,13 +4100,12 @@ loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
                endIndex = matcher.getTextLength();
            }
            if (i == startIndex) {
-                return next.match(matcher, i, seq);
-            }
-            if (i < endIndex) {
-                if (Character.isSurrogatePair(seq.charAt(i-1), seq.charAt(i)) ||
-                    Grapheme.nextBoundary(seq,
-                        i - Character.charCount(Character.codePointBefore(seq, i)),
-                        i + Character.charCount(Character.codePointAt(seq, i))) > i) {
+                // continue with return below
+            } else if (i < endIndex) {
+                if (Character.isSurrogatePair(seq.charAt(i - 1), seq.charAt(i))) {
+                    return false;
+                }
+                if (Grapheme.nextBoundary(seq, matcher.last, endIndex) > i) {
                    return false;
                }
            } else {