8237599: Greedy matching against supplementary chars fails to respect the region

Reviewed-by: rriggs
This commit is contained in:
Ivan Gerasimov 2020-03-25 08:46:31 -07:00
parent c01e986cc9
commit d1b506597f
2 changed files with 56 additions and 5 deletions

View file

@ -4340,14 +4340,22 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
this.cmin = cmin; this.cmin = cmin;
} }
boolean match(Matcher matcher, int i, CharSequence seq) { boolean match(Matcher matcher, int i, CharSequence seq) {
int starti = i;
int n = 0; int n = 0;
int to = matcher.to; int to = matcher.to;
// greedy, all the way down // greedy, all the way down
while (i < to) { while (i < to) {
int ch = Character.codePointAt(seq, i); int ch = Character.codePointAt(seq, i);
int len = Character.charCount(ch);
if (i + len > to) {
// the region cut off the high half of a surrogate pair
matcher.hitEnd = true;
ch = seq.charAt(i);
len = 1;
}
if (!predicate.is(ch)) if (!predicate.is(ch))
break; break;
i += Character.charCount(ch); i += len;
n++; n++;
} }
if (i >= to) { if (i >= to) {
@ -4358,9 +4366,10 @@ loop: for(int x=0, offset=0; x<nCodePoints; x++, offset+=len) {
return true; return true;
if (n == cmin) if (n == cmin)
return false; return false;
// backing off if match fails // backing off if match fails
int ch = Character.codePointBefore(seq, i); int ch = Character.codePointBefore(seq, i);
i -= Character.charCount(ch); // check if the region cut off the low half of a surrogate pair
i = Math.max(starti, i - Character.charCount(ch));
n--; n--;
} }
return false; return false;

View file

@ -36,7 +36,7 @@
* 8151481 4867170 7080302 6728861 6995635 6736245 4916384 6328855 6192895 * 8151481 4867170 7080302 6728861 6995635 6736245 4916384 6328855 6192895
* 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706 * 6345469 6988218 6693451 7006761 8140212 8143282 8158482 8176029 8184706
* 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 8235812 * 8194667 8197462 8184692 8221431 8224789 8228352 8230829 8236034 8235812
* 8216332 8214245 * 8216332 8214245 8237599
* *
* @library /test/lib * @library /test/lib
* @library /lib/testlibrary/java/lang * @library /lib/testlibrary/java/lang
@ -195,6 +195,7 @@ public class RegExTest {
surrogatePairWithCanonEq(); surrogatePairWithCanonEq();
lineBreakWithQuantifier(); lineBreakWithQuantifier();
caseInsensitivePMatch(); caseInsensitivePMatch();
surrogatePairOverlapRegion();
if (failure) { if (failure) {
throw new throw new
@ -5155,4 +5156,45 @@ public class RegExTest {
} }
report("caseInsensitivePMatch"); report("caseInsensitivePMatch");
} }
// This test is for 8237599
// Checks greedy matching when the matcher's region boundary falls between
// the two chars of a surrogate pair.
private static void surrogatePairOverlapRegion() {
    final String pair = "\ud801\udc37";
    final String highHalf = pair.substring(0, 1);
    final String lowHalf = pair.substring(1, 2);

    // Case 1: region [0, 1) holds only the first char of the pair.
    // The greedy ".+" must match exactly that char and report hitEnd.
    Pattern pattern = Pattern.compile(".+");
    Matcher matcher = pattern.matcher(pair);
    matcher.region(0, 1);

    boolean found = matcher.find();
    boolean matchedHigh = found && matcher.group(0).equals(highHalf);
    if (matchedHigh) {
        if (!matcher.hitEnd()) {
            failCount++;
            System.out.println("Expected m.hitEnd() == true");
        }
    } else {
        failCount++;
        System.out.println("Input \"" + pair + "\".substr(0, 1)"
                + " expected to match pattern \"" + pattern + "\"");
        if (found) {
            System.out.println("group(0): \"" + matcher.group(0) + "\"");
        }
    }

    // Case 2: region [1, 2) holds only the second char of the pair.
    // ".*" has to give that char back so the capturing group can take it.
    pattern = Pattern.compile(".*(.)");
    matcher = pattern.matcher(pair);
    matcher.region(1, 2);

    found = matcher.find();
    boolean matchedLow = found
            && matcher.group(0).equals(lowHalf)
            && matcher.group(1).equals(lowHalf);
    if (!matchedLow) {
        failCount++;
        System.out.println("Input \"" + pair + "\".substr(1, 2)"
                + " expected to match pattern \"" + pattern + "\"");
        if (found) {
            System.out.println("group(0): \"" + matcher.group(0) + "\"");
            System.out.println("group(1): \"" + matcher.group(1) + "\"");
        }
    }
    report("surrogatePairOverlapRegion");
}
} }