8225061: Performance regression in Regex

Co-authored-by: Naoto Sato <naoto.sato@oracle.com> Reviewed-by: naoto, alanb
2025-08-27 14:54:52 +02:00 · 2019-06-01 03:18:23 +02:00 · 2019-06-01 03:18:23 +02:00 · 1813ce706a
commit 1813ce706a
parent d2ad9dabdf
5 changed files with 170 additions and 30 deletions
--- a/src/java.base/share/classes/java/util/regex/EmojiData.java.template
+++ b/src/java.base/share/classes/java/util/regex/EmojiData.java.template
@ -40,7 +40,16 @@ final class EmojiData {
     * @return true if {@code cp} is an extended pictographic
     */
    static boolean isExtendedPictographic(int cp) {
+        if (cp < 0x2000) { // low range: expression comes from the EXTPICT_LOW template placeholder
+            return
+%%%EXTPICT_LOW%%%
+        } else { // cp >= 0x2000: delegated to isHigh
+            return isHigh(cp);
+        }
+    }
+
+    private static boolean isHigh(int cp) { // called by isExtendedPictographic when cp >= 0x2000
        return
-%%%EXTPICT%%%
+%%%EXTPICT_HIGH%%%
    }
 }
--- a/src/java.base/share/classes/java/util/regex/Grapheme.java
+++ b/src/java.base/share/classes/java/util/regex/Grapheme.java
@ -29,6 +29,19 @@ import java.util.Objects;

 final class Grapheme {

+    /**
+     * Determines if there is an extended grapheme cluster boundary between two
+     * adjacent code points {@code cp1} and {@code cp2}.
+     * <p>
+     * See Unicode Standard Annex #29, Unicode Text Segmentation, for the
+     * specification of the extended grapheme cluster boundary rules.
+     * <p>
+     * Note: this method does not take care of stateful breaking.
+     */
+    static boolean isBoundary(int cp1, int cp2) {
+        return rules[getType(cp1)][getType(cp2)]; // NOTE(review): classifies via getType, not getGraphemeType; confirm CR/LF handling is intended
+    }
+
    /**
     * Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
     * the start of the char sequence is a boundary.
@ -50,12 +63,12 @@ final class Grapheme {
        int ret = Character.charCount(ch0);
        int ch1;
        // indicates whether gb11 or gb12 is underway
-        boolean gb11 = EmojiData.isExtendedPictographic(ch0);
-        int riCount = getType(ch0) == RI ? 1 : 0;
+        int t0 = getGraphemeType(ch0);
+        int riCount = t0 == RI ? 1 : 0;
+        boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC;
        while (ret < limit) {
            ch1 = Character.codePointAt(src, ret);
-            int t0 = getType(ch0);
-            int t1 = getType(ch1);
+            int t1 = getGraphemeType(ch1);

            if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
                gb11 = false;
@ -65,13 +78,14 @@ final class Grapheme {
                if (ret > off) {
                    break;
                } else {
-                    gb11 = EmojiData.isExtendedPictographic(ch1);
+                    gb11 = t1 == EXTENDED_PICTOGRAPHIC;
                    riCount = 0;
                }
            }

-            riCount += getType(ch1) == RI ? 1 : 0;
-            ch0 = ch1;
+            riCount += (t1 == RI) ? 1 : 0;
+            t0 = t1;
+
            ret += Character.charCount(ch1);
        }
        return ret;
@ -163,6 +177,20 @@ final class Grapheme {
               cp == 0xAA7B || cp == 0xAA7D;
    }

+    private static int getGraphemeType(int cp) { // classifies cp < 0x7F locally; every other code point goes to getType
+        if (cp < 0x007F) { // ASCII below DEL; 0x7F itself falls through to getType
+            if (cp < 32) { // Control characters
+                if (cp == 0x000D)
+                    return CR;
+                if (cp == 0x000A)
+                    return LF;
+                return CONTROL;
+            }
+            return OTHER; // remaining values here are 0x20..0x7E
+        }
+        return getType(cp);
+    }
+
    @SuppressWarnings("fallthrough")
    private static int getType(int cp) {
        if (EmojiData.isExtendedPictographic(cp)) {
@ -171,12 +199,6 @@ final class Grapheme {

        int type = Character.getType(cp);
        switch(type) {
-        case Character.CONTROL:
-            if (cp == 0x000D)
-                return CR;
-            if (cp == 0x000A)
-                return LF;
-            return CONTROL;
        case Character.UNASSIGNED:
            // NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control
            // but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other"
@ -184,6 +206,7 @@ final class Grapheme {
            if (cp == 0x0378)
                return OTHER;

+        case Character.CONTROL:
        case Character.LINE_SEPARATOR:
        case Character.PARAGRAPH_SEPARATOR:
        case Character.SURROGATE:
--- a/src/java.base/share/classes/java/util/regex/Pattern.java
+++ b/src/java.base/share/classes/java/util/regex/Pattern.java
@ -3973,7 +3973,16 @@ loop:   for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
            if (i < matcher.to) {
                int ch0 = Character.codePointAt(seq, i);
                int n = Character.charCount(ch0);
-                int j = Grapheme.nextBoundary(seq, i, matcher.to);
+                int j = i + n;
+                // Fast check if it's necessary to call Normalizer;
+                // testing Grapheme.isBoundary is enough for this case
+                while (j < matcher.to) {
+                    int ch1 = Character.codePointAt(seq, j);
+                    if (Grapheme.isBoundary(ch0, ch1))
+                        break;
+                    ch0 = ch1;
+                    j += Character.charCount(ch1);
+                }
                if (i + n == j) {    // single, assume nfc cp
                    if (predicate.is(ch0))
                        return next.match(matcher, j, seq);