mirror of
https://github.com/openjdk/jdk.git
synced 2025-08-27 14:54:52 +02:00
8225061: Performance regression in Regex
Co-authored-by: Naoto Sato <naoto.sato@oracle.com> Reviewed-by: naoto, alanb
This commit is contained in:
parent
d2ad9dabdf
commit
1813ce706a
5 changed files with 170 additions and 30 deletions
|
@ -40,7 +40,16 @@ final class EmojiData {
|
|||
* @return true if {@code cp} is an extended pictographic
|
||||
*/
|
||||
// Splits the lookup at U+2000: smaller code points are tested inline here,
// everything else is delegated to isHigh(cp).
static boolean isExtendedPictographic(int cp) {
|
||||
if (cp < 0x2000) {
|
||||
// NOTE(review): the line after `return` is the EXTPICT_LOW template
// placeholder; presumably a generator replaces it with a boolean expression
// (including the terminating semicolon) at build time -- confirm against the
// build tool before moving or reformatting that line.
return
|
||||
%%%EXTPICT_LOW%%%
|
||||
} else {
|
||||
// Code points at or above U+2000 are checked by the separate helper.
return isHigh(cp);
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean isHigh(int cp) {
|
||||
return
|
||||
%%%EXTPICT%%%
|
||||
%%%EXTPICT_HIGH%%%
|
||||
}
|
||||
}
|
||||
|
|
|
@ -29,6 +29,19 @@ import java.util.Objects;
|
|||
|
||||
final class Grapheme {
|
||||
|
||||
/**
|
||||
* Determines if there is an extended grapheme cluster boundary between two
|
||||
* continuing characters {@code cp1} and {@code cp2}.
|
||||
* <p>
|
||||
* See Unicode Standard Annex #29 Unicode Text Segmentation for the specification
|
||||
* for the extended grapheme cluster boundary rules
|
||||
* <p>
|
||||
* Note: this method does not take care of stateful breaking.
|
||||
*/
|
||||
static boolean isBoundary(int cp1, int cp2) {
|
||||
return rules[getType(cp1)][getType(cp2)];
|
||||
}
|
||||
|
||||
/**
|
||||
* Look for the next extended grapheme cluster boundary in a CharSequence. It assumes
|
||||
* the start of the char sequence is a boundary.
|
||||
|
@ -50,12 +63,12 @@ final class Grapheme {
|
|||
int ret = Character.charCount(ch0);
|
||||
int ch1;
|
||||
// indicates whether gb11 or gb12 is underway
|
||||
boolean gb11 = EmojiData.isExtendedPictographic(ch0);
|
||||
int riCount = getType(ch0) == RI ? 1 : 0;
|
||||
int t0 = getGraphemeType(ch0);
|
||||
int riCount = t0 == RI ? 1 : 0;
|
||||
boolean gb11 = t0 == EXTENDED_PICTOGRAPHIC;
|
||||
while (ret < limit) {
|
||||
ch1 = Character.codePointAt(src, ret);
|
||||
int t0 = getType(ch0);
|
||||
int t1 = getType(ch1);
|
||||
int t1 = getGraphemeType(ch1);
|
||||
|
||||
if (gb11 && t0 == ZWJ && t1 == EXTENDED_PICTOGRAPHIC) {
|
||||
gb11 = false;
|
||||
|
@ -65,13 +78,14 @@ final class Grapheme {
|
|||
if (ret > off) {
|
||||
break;
|
||||
} else {
|
||||
gb11 = EmojiData.isExtendedPictographic(ch1);
|
||||
gb11 = t1 == EXTENDED_PICTOGRAPHIC;
|
||||
riCount = 0;
|
||||
}
|
||||
}
|
||||
|
||||
riCount += getType(ch1) == RI ? 1 : 0;
|
||||
ch0 = ch1;
|
||||
riCount += (t1 == RI) ? 1 : 0;
|
||||
t0 = t1;
|
||||
|
||||
ret += Character.charCount(ch1);
|
||||
}
|
||||
return ret;
|
||||
|
@ -163,6 +177,20 @@ final class Grapheme {
|
|||
cp == 0xAA7B || cp == 0xAA7D;
|
||||
}
|
||||
|
||||
private static int getGraphemeType(int cp) {
|
||||
if (cp < 0x007F) { // ASCII
|
||||
if (cp < 32) { // Control characters
|
||||
if (cp == 0x000D)
|
||||
return CR;
|
||||
if (cp == 0x000A)
|
||||
return LF;
|
||||
return CONTROL;
|
||||
}
|
||||
return OTHER;
|
||||
}
|
||||
return getType(cp);
|
||||
}
|
||||
|
||||
@SuppressWarnings("fallthrough")
|
||||
private static int getType(int cp) {
|
||||
if (EmojiData.isExtendedPictographic(cp)) {
|
||||
|
@ -171,12 +199,6 @@ final class Grapheme {
|
|||
|
||||
int type = Character.getType(cp);
|
||||
switch(type) {
|
||||
case Character.CONTROL:
|
||||
if (cp == 0x000D)
|
||||
return CR;
|
||||
if (cp == 0x000A)
|
||||
return LF;
|
||||
return CONTROL;
|
||||
case Character.UNASSIGNED:
|
||||
// NOTE: #tr29 lists "Unassigned and Default_Ignorable_Code_Point" as Control
|
||||
// but GraphemeBreakTest.txt lists u+0378/reserved-0378 as "Other"
|
||||
|
@ -184,6 +206,7 @@ final class Grapheme {
|
|||
if (cp == 0x0378)
|
||||
return OTHER;
|
||||
|
||||
case Character.CONTROL:
|
||||
case Character.LINE_SEPARATOR:
|
||||
case Character.PARAGRAPH_SEPARATOR:
|
||||
case Character.SURROGATE:
|
||||
|
|
|
@ -3973,7 +3973,16 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
|
|||
if (i < matcher.to) {
|
||||
int ch0 = Character.codePointAt(seq, i);
|
||||
int n = Character.charCount(ch0);
|
||||
int j = Grapheme.nextBoundary(seq, i, matcher.to);
|
||||
int j = i + n;
|
||||
// Fast check if it's necessary to call Normalizer;
|
||||
// testing Grapheme.isBoundary is enough for this case
|
||||
while (j < matcher.to) {
|
||||
int ch1 = Character.codePointAt(seq, j);
|
||||
if (Grapheme.isBoundary(ch0, ch1))
|
||||
break;
|
||||
ch0 = ch1;
|
||||
j += Character.charCount(ch1);
|
||||
}
|
||||
if (i + n == j) { // single, assume nfc cp
|
||||
if (predicate.is(ch0))
|
||||
return next.match(matcher, j, seq);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue