8302871: Speed up StringLatin1.regionMatchesCI

Reviewed-by: redestad, martin, alanb
This commit is contained in:
Eirik Bjorsnos 2023-02-25 07:48:03 +00:00 committed by Alan Bateman
parent b4ea80731c
commit 17e3769ed7
4 changed files with 161 additions and 18 deletions

View file

@ -138,10 +138,11 @@ class CharacterDataLatin1 extends CharacterData {
if (ch < 'A') { // Fast path for low code points
return ch;
}
int l = ch | 0x20; // Lowercase using 'oldest ASCII trick in the book'
if (l <= 'z' // In range a-z
|| (l >= 0xE0 && l <= 0xFE && l != 0xF7)) { // ..or agrave-thorn, excluding division
return l;
// ASCII and Latin-1 were designed to optimize case-twiddling operations
int lower = ch | 0x20;
if (lower <= 'z' // In range a-z
|| (lower >= 0xE0 && lower <= 0xFE && lower != 0xF7)) { // ..or agrave-thorn, excluding division
return lower;
}
return ch;
}
@ -150,10 +151,11 @@ class CharacterDataLatin1 extends CharacterData {
if (ch < 'a') { // Fast path for low code points
return ch;
}
int U = ch & 0xDF; // Uppercase using 'oldest ASCII trick in the book'
if (U <= 'Z' // In range A-Z
|| (U >= 0xC0 && U <= 0xDE && U != 0xD7)) { // ..or Agrave-Thorn, excluding multiplication
return U;
// ASCII and Latin-1 were designed to optimize case-twiddling operations
int upper = ch & 0xDF;
if (upper <= 'Z' // In range A-Z
|| (upper >= 0xC0 && upper <= 0xDE && upper != 0xD7)) { // ..or Agrave-Thorn, not multiplication
return upper;
}
// Special-case for 'y with Diaeresis' which uppercases out of latin1
@ -167,6 +169,27 @@ class CharacterDataLatin1 extends CharacterData {
return ch;
}
/**
 * Decides whether two latin1 code points denote the same character once
 * case is disregarded.
 *
 * @param b1 a latin1 code point, held in a byte
 * @param b2 the latin1 code point to compare against, held in a byte
 * @return true if both bytes are identical, or if they are the upper- and
 *         lowercase forms of the same latin1 letter; false otherwise
 */
static boolean equalsIgnoreCase(byte b1, byte b2) {
    if (b1 != b2) {
        // Masking with 0xDF clears bit 0x20, the only bit in which a latin1
        // lowercase letter differs from its uppercase counterpart
        final int folded = b1 & 0xDF;
        if (folded < 'A') {
            return false; // Below the first letter: distinct bytes cannot match
        }
        if (folded != (b2 & 0xDF)) {
            return false; // The bytes differ in more than the case bit
        }
        if (folded <= 'Z') {
            return true; // A-Z
        }
        // Agrave through Thorn, leaving out the multiplication sign (0xD7)
        return folded >= 0xC0 && folded <= 0xDE && folded != 0xD7;
    }
    return true;
}
// Title-casing is delegated to toUpperCase: the result for ch is exactly
// what toUpperCase(ch) returns.
int toTitleCase(int ch) {
return toUpperCase(ch);
}

View file

@ -384,14 +384,9 @@ final class StringLatin1 {
byte[] other, int ooffset, int len) {
int last = toffset + len;
while (toffset < last) {
char c1 = (char)(value[toffset++] & 0xff);
char c2 = (char)(other[ooffset++] & 0xff);
if (c1 == c2) {
continue;
}
int u1 = CharacterDataLatin1.instance.toUpperCase(c1);
int u2 = CharacterDataLatin1.instance.toUpperCase(c2);
if (u1 == u2) {
byte b1 = value[toffset++];
byte b2 = other[ooffset++];
if (CharacterDataLatin1.equalsIgnoreCase(b1, b2)) {
continue;
}
return false;

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2015, 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -25,10 +25,12 @@ import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
import static org.testng.Assert.assertEquals;
import static org.testng.Assert.assertFalse;
import static org.testng.Assert.assertTrue;
/*
* @test
* @bug 8077559 8248655
* @bug 8077559 8248655 8302871
* @summary Tests Compact String. This one is for String.equalsIgnoreCase.
* @run testng/othervm -XX:+CompactStrings EqualsIgnoreCase
* @run testng/othervm -XX:-CompactStrings EqualsIgnoreCase
@ -75,4 +77,31 @@ public class EqualsIgnoreCase extends CompactString {
source));
});
}
/**
 * Walks every one of the 256x256 pairs of latin1 code points and verifies that
 * the two single-character strings compare equal under String.equalsIgnoreCase
 * exactly when Character.toLowerCase(Character.toUpperCase(c)) produces the
 * same value for both characters.
 */
@Test
public void checkConsistencyWithCharacterUppercaseLowerCase() {
    for (char first = 0; first < 256; first++) {
        // Everything derived from the outer character is invariant in the inner loop
        String firstStr = Character.toString(first);
        int firstFold = Character.toLowerCase(Character.toUpperCase(first));

        for (char second = 0; second < 256; second++) {
            String secondStr = Character.toString(second);
            int secondFold = Character.toLowerCase(Character.toUpperCase(second));

            if (firstFold != secondFold) {
                // Different case folds: the strings must not match
                assertFalse(firstStr.equalsIgnoreCase(secondStr),
                        "Expected %s to not equalsIgnoreCase %s".formatted(firstStr, secondStr));
                continue;
            }
            // Identical case folds: the strings must match
            assertTrue(firstStr.equalsIgnoreCase(secondStr),
                    "Expected %s to equalsIgnoreCase %s".formatted(firstStr, secondStr));
        }
    }
}
}

View file

@ -0,0 +1,96 @@
/*
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/
package org.openjdk.bench.java.lang;
import org.openjdk.jmh.annotations.*;
import java.util.concurrent.TimeUnit;
/*
 * A deliberately simple benchmark of the case-insensitive variant of
 * String::regionMatches.
 */
public class RegionMatchesIC {

    /**
     * Compares two strings built by repeating a single latin1 character
     * {@code size} times. The {@code codePoints} parameter picks which pair
     * of characters is used.
     */
    @BenchmarkMode(Mode.AverageTime)
    @OutputTimeUnit(TimeUnit.NANOSECONDS)
    @State(Scope.Benchmark)
    @Warmup(iterations = 5, time = 1)
    @Measurement(iterations = 5, time = 1)
    @Fork(value = 3)
    public static class Latin1 {

        /** Number of repeated characters in each string, and length of the compared region. */
        @Param({"1024"})
        public int size;

        /** Name of the character pair to benchmark. */
        @Param({"ascii-match",
                "ascii-mismatch",
                "number-match",
                "number-mismatch",
                "lat1-match",
                "lat1-mismatch"})
        String codePoints;

        private String leftString;
        private String rightString;

        /**
         * Builds both operands for the selected scenario.
         *
         * @throws IllegalArgumentException if {@code codePoints} names no known scenario
         */
        @Setup
        public void setup() {
            // Pick the single character each side is made of
            final String leftUnit;
            final String rightUnit;
            switch (codePoints) {
                case "ascii-match" -> {
                    leftUnit = "a";
                    rightUnit = "A";
                }
                case "ascii-mismatch" -> {
                    leftUnit = "a";
                    rightUnit = "b";
                }
                case "number-match" -> {
                    leftUnit = "7";
                    rightUnit = "7";
                }
                case "number-mismatch" -> {
                    leftUnit = "7";
                    rightUnit = "9";
                }
                case "lat1-match" -> {
                    leftUnit = "\u00e5";
                    rightUnit = "\u00c5";
                }
                case "lat1-mismatch" -> {
                    leftUnit = "\u00e5";
                    rightUnit = "\u00c6";
                }
                default -> throw new IllegalArgumentException("Unsupported coding: " + codePoints);
            }
            String leftBody = leftUnit.repeat(size);
            String rightBody = rightUnit.repeat(size);
            // Distinct one-character prefixes guarantee the strings never String.equals
            leftString = "l" + leftBody;
            rightString = "r" + rightBody;
        }

        /**
         * Compares the {@code size} characters after the prefix of each string, ignoring case.
         */
        @Benchmark
        public boolean regionMatchesIC() {
            return leftString.regionMatches(true, 1, rightString, 1, size);
        }
    }
}