8248655: Support supplementary characters in String case insensitive operations

8248434: some newly added locale cannot parse uppercased date string

Reviewed-by: jlaskey, joehw, rriggs, bchristi
This commit is contained in:
Naoto Sato 2020-07-23 08:46:31 -07:00
parent dc80e63811
commit 1f63603288
7 changed files with 272 additions and 87 deletions

View file

@ -43,7 +43,6 @@ import java.util.Optional;
import java.util.Spliterator;
import java.util.StringJoiner;
import java.util.function.Function;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import java.util.stream.Collectors;
@ -1134,16 +1133,16 @@ public final class String
/**
* Compares this {@code String} to another {@code String}, ignoring case
* considerations. Two strings are considered equal ignoring case if they
* are of the same length and corresponding characters in the two strings
* are equal ignoring case.
* are of the same length and corresponding Unicode code points in the two
* strings are equal ignoring case.
*
* <p> Two characters {@code c1} and {@code c2} are considered the same
* <p> Two Unicode code points are considered the same
* ignoring case if at least one of the following is true:
* <ul>
* <li> The two characters are the same (as compared by the
* <li> The two Unicode code points are the same (as compared by the
* {@code ==} operator)
* <li> Calling {@code Character.toLowerCase(Character.toUpperCase(char))}
* on each character produces the same result
* <li> Calling {@code Character.toLowerCase(Character.toUpperCase(int))}
* on each Unicode code point produces the same result
* </ul>
*
* <p>Note that this method does <em>not</em> take locale into account, and
@ -1158,6 +1157,7 @@ public final class String
* false} otherwise
*
* @see #equals(Object)
* @see #codePoints()
*/
public boolean equalsIgnoreCase(String anotherString) {
return (this == anotherString) ? true
@ -1224,7 +1224,8 @@ public final class String
/**
* A Comparator that orders {@code String} objects as by
* {@code compareToIgnoreCase}. This comparator is serializable.
* {@link #compareToIgnoreCase(String) compareToIgnoreCase}.
* This comparator is serializable.
* <p>
* Note that this Comparator does <em>not</em> take locale into account,
* and will result in an unsatisfactory ordering for certain locales.
@ -1261,10 +1262,10 @@ public final class String
/**
* Compares two strings lexicographically, ignoring case
* differences. This method returns an integer whose sign is that of
* calling {@code compareTo} with normalized versions of the strings
* calling {@code compareTo} with case folded versions of the strings
* where case differences have been eliminated by calling
* {@code Character.toLowerCase(Character.toUpperCase(character))} on
* each character.
* {@code Character.toLowerCase(Character.toUpperCase(int))} on
* each Unicode code point.
* <p>
* Note that this method does <em>not</em> take locale into account,
* and will result in an unsatisfactory ordering for certain locales.
@ -1275,6 +1276,7 @@ public final class String
* specified String is greater than, equal to, or less
* than this String, ignoring case considerations.
* @see java.text.Collator
* @see #codePoints()
* @since 1.2
*/
public int compareToIgnoreCase(String str) {
@ -1362,30 +1364,26 @@ public final class String
* <p>
* A substring of this {@code String} object is compared to a substring
* of the argument {@code other}. The result is {@code true} if these
* substrings represent character sequences that are the same, ignoring
* case if and only if {@code ignoreCase} is true. The substring of
* this {@code String} object to be compared begins at index
* {@code toffset} and has length {@code len}. The substring of
* {@code other} to be compared begins at index {@code ooffset} and
* has length {@code len}. The result is {@code false} if and only if
* at least one of the following is true:
* <ul><li>{@code toffset} is negative.
* <li>{@code ooffset} is negative.
* <li>{@code toffset+len} is greater than the length of this
* substrings represent Unicode code point sequences that are the same,
* ignoring case if and only if {@code ignoreCase} is true.
* The sequences {@code tsequence} and {@code osequence} are compared,
* where {@code tsequence} is the sequence produced as if by calling
* {@code this.substring(toffset, toffset + len).codePoints()} and
* {@code osequence} is the sequence produced as if by calling
* {@code other.substring(ooffset, ooffset + len).codePoints()}.
* The result is {@code true} if and only if all of the following
* are true:
* <ul><li>{@code toffset} is non-negative.
* <li>{@code ooffset} is non-negative.
* <li>{@code toffset+len} is less than or equal to the length of this
* {@code String} object.
* <li>{@code ooffset+len} is greater than the length of the other
* <li>{@code ooffset+len} is less than or equal to the length of the other
* argument.
* <li>{@code ignoreCase} is {@code false} and there is some nonnegative
* integer <i>k</i> less than {@code len} such that:
* <blockquote><pre>
* this.charAt(toffset+k) != other.charAt(ooffset+k)
* </pre></blockquote>
* <li>{@code ignoreCase} is {@code true} and there is some nonnegative
* integer <i>k</i> less than {@code len} such that:
* <blockquote><pre>
* Character.toLowerCase(Character.toUpperCase(this.charAt(toffset+k))) !=
* Character.toLowerCase(Character.toUpperCase(other.charAt(ooffset+k)))
* </pre></blockquote>
* <li>if {@code ignoreCase} is {@code false}, all pairs of corresponding Unicode
* code points are equal integer values; or if {@code ignoreCase} is {@code true},
* {@link Character#toLowerCase(int) Character.toLowerCase(}
* {@link Character#toUpperCase(int)}{@code )} on all pairs of Unicode code points
* results in equal integer values.
* </ul>
*
* <p>Note that this method does <em>not</em> take locale into account,
@ -1400,12 +1398,14 @@ public final class String
* @param other the string argument.
* @param ooffset the starting offset of the subregion in the string
* argument.
* @param len the number of characters to compare.
* @param len the number of characters (Unicode code units,
* i.e. 16-bit {@code char} values) to compare.
* @return {@code true} if the specified subregion of this string
* matches the specified subregion of the string argument;
* {@code false} otherwise. Whether the matching is exact
* or case insensitive depends on the {@code ignoreCase}
* argument.
* @see #codePoints()
*/
public boolean regionMatches(boolean ignoreCase, int toffset,
String other, int ooffset, int len) {

View file

@ -319,25 +319,92 @@ final class StringUTF16 {
}
public static int compareToCI(byte[] value, byte[] other) {
int len1 = length(value);
int len2 = length(other);
int lim = Math.min(len1, len2);
for (int k = 0; k < lim; k++) {
char c1 = getChar(value, k);
char c2 = getChar(other, k);
if (c1 != c2) {
c1 = Character.toUpperCase(c1);
c2 = Character.toUpperCase(c2);
if (c1 != c2) {
c1 = Character.toLowerCase(c1);
c2 = Character.toLowerCase(c2);
if (c1 != c2) {
return c1 - c2;
}
}
return compareToCIImpl(value, 0, length(value), other, 0, length(other));
}
private static int compareToCIImpl(byte[] value, int toffset, int tlen,
byte[] other, int ooffset, int olen) {
int tlast = toffset + tlen;
int olast = ooffset + olen;
assert toffset >= 0 && ooffset >= 0;
assert tlast <= length(value);
assert olast <= length(other);
for (int k1 = toffset, k2 = ooffset; k1 < tlast && k2 < olast; k1++, k2++) {
int cp1 = (int)getChar(value, k1);
int cp2 = (int)getChar(other, k2);
if (cp1 == cp2 || compareCodePointCI(cp1, cp2) == 0) {
continue;
}
// Check for supplementary characters case
cp1 = codePointIncluding(value, cp1, k1, toffset, tlast);
if (cp1 < 0) {
k1++;
cp1 -= cp1;
}
cp2 = codePointIncluding(other, cp2, k2, ooffset, olast);
if (cp2 < 0) {
k2++;
cp2 -= cp2;
}
int diff = compareCodePointCI(cp1, cp2);
if (diff != 0) {
return diff;
}
}
return len1 - len2;
return tlen - olen;
}
// Compares two code points ignoring case. Returns zero when they are equal
// after case folding, otherwise the difference of the folded values.
private static int compareCodePointCI(int cp1, int cp2) {
    // First fold both code points to uppercase; if that already makes
    // them identical there is nothing more to do.
    final int upper1 = Character.toUpperCase(cp1);
    final int upper2 = Character.toUpperCase(cp2);
    if (upper1 == upper2) {
        return 0;
    }
    // Uppercasing alone is not sufficient for some scripts (the Georgian
    // alphabet has unusual case conversion rules), so fold the uppercased
    // values to lowercase as a last step. Equal values give zero here.
    return Character.toLowerCase(upper1) - Character.toLowerCase(upper2);
}
// Returns the code point that includes the code unit "cp" located at
// "index", looking only at code units within [start, end).
//  - If the code unit is not a surrogate, or is a surrogate with no
//    matching partner next to it, the code unit is returned as is.
//  - If it is a low surrogate preceded by a high surrogate, the combined
//    supplementary code point is returned.
//  - If it is a high surrogate followed by a low surrogate, the combined
//    supplementary code point is returned NEGATED, which tells the caller
//    that the code unit at index + 1 is part of this code point.
private static int codePointIncluding(byte[] ba, int cp, int index, int start, int end) {
    final char unit = (char)cp;

    // fast check: the common, non-surrogate case
    if (!Character.isSurrogate(unit)) {
        return cp;
    }

    if (Character.isHighSurrogate(unit)) {
        // Pair up with the following code unit, if there is one in range.
        if (index + 1 < end) {
            char next = getChar(ba, index + 1);
            if (Character.isLowSurrogate(next)) {
                // negate the code point
                return -Character.toCodePoint(unit, next);
            }
        }
        return cp;
    }

    // Low surrogate: pair up with the preceding code unit, if in range.
    if (index > start) {
        char prev = getChar(ba, index - 1);
        if (Character.isHighSurrogate(prev)) {
            return Character.toCodePoint(prev, unit);
        }
    }
    return cp;
}
public static int compareToCI_Latin1(byte[] value, byte[] other) {
@ -716,34 +783,7 @@ final class StringUTF16 {
/**
 * Tests whether two regions of UTF16-encoded strings are equal ignoring
 * case. Both regions are {@code len} code units long; the comparison is
 * delegated to {@code compareToCIImpl} so that supplementary characters
 * are case folded as whole code points rather than as separate surrogate
 * {@code char}s.
 *
 * @param value   the UTF16 bytes of the first string
 * @param toffset the starting offset of the region in {@code value}
 * @param other   the UTF16 bytes of the second string
 * @param ooffset the starting offset of the region in {@code other}
 * @param len     the number of {@code char} code units to compare
 * @return {@code true} if the two regions match ignoring case
 */
public static boolean regionMatchesCI(byte[] value, int toffset,
                                      byte[] other, int ooffset, int len) {
    // The range checks (non-negative offsets, regions within bounds) are
    // asserted by compareToCIImpl.
    return compareToCIImpl(value, toffset, len, other, ooffset, len) == 0;
}
public static boolean regionMatchesCI_Latin1(byte[] value, int toffset,