8241055: Regex Grapheme Matcher Performance Depends too much on Total Input Sequence Size

Reviewed-by: naoto
This commit is contained in:
Philipp Kunz 2020-04-16 16:09:47 -07:00 committed by Naoto Sato
parent d0f5c5c6bb
commit 455eaca215
4 changed files with 116 additions and 83 deletions

View file

@ -30,21 +30,8 @@ import java.util.Objects;
final class Grapheme {
/**
 * Reports whether an extended grapheme cluster boundary lies between two
 * adjacent code points, {@code cp1} followed by {@code cp2}.
 * <p>
 * Each code point is first classified with {@code getType}; the answer is
 * then read from the {@code rules} table, indexed by the type of the
 * preceding code point and then by the type of the following one.
 * <p>
 * The extended grapheme cluster boundary rules are specified in Unicode
 * Standard Annex #29, Unicode Text Segmentation.
 * <p>
 * Note: only the two supplied code points are consulted, so this method
 * does not take care of stateful breaking.
 *
 * @param cp1 the code point preceding the candidate boundary
 * @param cp2 the code point following the candidate boundary
 * @return the {@code rules} table entry for the pair of types
 */
static boolean isBoundary(int cp1, int cp2) {
    // Classify in argument order: the preceding code point selects the
    // row, the following code point selects the column.
    final int precedingType = getType(cp1);
    final int followingType = getType(cp2);
    return rules[precedingType][followingType];
}
/**
* Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
* the start of the char sequence is a boundary.
* Look for the next extended grapheme cluster boundary in a CharSequence.
* It assumes the start of the char sequence at offset {@code off} is a boundary.
* <p>
* See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
* for the extended grapheme cluster boundary rules. The following implementation
@ -54,21 +41,20 @@ final class Grapheme {
* @param src the {@code CharSequence} to be scanned
* @param off offset to start looking for the next boundary in the src
* @param limit limit offset in the src (exclusive)
* @return the next possible boundary
* @return the next grapheme boundary
*/
static int nextBoundary(CharSequence src, int off, int limit) {
Objects.checkFromToIndex(off, limit, src.length());
int ch0 = Character.codePointAt(src, 0);
int ret = Character.charCount(ch0);
int ch1;
int ch0 = Character.codePointAt(src, off);
int ret = off + Character.charCount(ch0);
// indicates whether gb11 or gb12 is underway
int t0 = getGraphemeType(ch0);
int t0 = getType(ch0);
int riCount = t0 == RI ? 1 : 0;
boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC;
while (ret < limit) {
ch1 = Character.codePointAt(src, ret);
int t1 = getGraphemeType(ch1);
int ch1 = Character.codePointAt(src, ret);
int t1 = getType(ch1);
if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
// continue for gb11
@ -177,7 +163,8 @@ final class Grapheme {
cp == 0xAA7B || cp == 0xAA7D;
}
private static int getGraphemeType(int cp) {
@SuppressWarnings("fallthrough")
private static int getType(int cp) {
if (cp < 0x007F) { // ASCII
if (cp < 32) { // Control characters
if (cp == 0x000D)
@ -188,11 +175,7 @@ final class Grapheme {
}
return OTHER;
}
return getType(cp);
}
@SuppressWarnings("fallthrough")
private static int getType(int cp) {
if (EmojiData.isExtendedPictographic(cp)) {
return EXTENDED_PICTOGRAPHIC;
}

View file

@ -4035,17 +4035,8 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
if (i < matcher.to) {
int ch0 = Character.codePointAt(seq, i);
int n = Character.charCount(ch0);
int j = i + n;
// Fast check if it's necessary to call Normalizer;
// testing Grapheme.isBoundary is enough for this case
while (j < matcher.to) {
int ch1 = Character.codePointAt(seq, j);
if (Grapheme.isBoundary(ch0, ch1))
break;
ch0 = ch1;
j += Character.charCount(ch1);
}
if (i + n == j) { // single, assume nfc cp
int j = Grapheme.nextBoundary(seq, i, matcher.to);
if (i + n == j) { // single cp grapheme, assume nfc
if (predicate.is(ch0))
return next.match(matcher, j, seq);
} else {
@ -4109,13 +4100,12 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
endIndex = matcher.getTextLength();
}
if (i == startIndex) {
return next.match(matcher, i, seq);
}
if (i < endIndex) {
if (Character.isSurrogatePair(seq.charAt(i-1), seq.charAt(i)) ||
Grapheme.nextBoundary(seq,
i - Character.charCount(Character.codePointBefore(seq, i)),
i + Character.charCount(Character.codePointAt(seq, i))) > i) {
// continue with return below
} else if (i < endIndex) {
if (Character.isSurrogatePair(seq.charAt(i - 1), seq.charAt(i))) {
return false;
}
if (Grapheme.nextBoundary(seq, matcher.last, endIndex) > i) {
return false;
}
} else {