8302871: Speed up StringLatin1.regionMatchesCI

Reviewed-by: redestad, martin, alanb
2025-08-27 14:54:52 +02:00 · 2023-02-25 07:48:03 +00:00 · 2023-02-25 07:48:03 +00:00 · 17e3769ed7
commit 17e3769ed7
parent b4ea80731c
4 changed files with 161 additions and 18 deletions
--- a/src/java.base/share/classes/java/lang/CharacterDataLatin1.java.template
+++ b/src/java.base/share/classes/java/lang/CharacterDataLatin1.java.template
@ -138,10 +138,11 @@ class CharacterDataLatin1 extends CharacterData {
        if (ch < 'A') { // Fast path for low code points
            return ch;
        }
-        int l = ch | 0x20; // Lowercase using 'oldest ASCII trick in the book'
-        if (l <= 'z' // In range a-z
-                || (l >= 0xE0 && l <= 0xFE && l != 0xF7)) { // ..or agrave-thorn, excluding division
-            return l;
+        // ASCII and Latin-1 were designed to optimize case-twiddling operations
+        int lower = ch | 0x20;
+        if (lower <= 'z' // In range a-z
+                || (lower >= 0xE0 && lower <= 0xFE && lower != 0xF7)) { // ..or agrave-thorn, excluding division
+            return lower;
        }
        return ch;
    }
@ -150,10 +151,11 @@ class CharacterDataLatin1 extends CharacterData {
        if (ch < 'a') { // Fast path for low code points
            return ch;
        }
-        int U = ch & 0xDF; // Uppercase using 'oldest ASCII trick in the book'
-        if (U <= 'Z' // In range A-Z
-                || (U >= 0xC0 && U <= 0xDE && U != 0xD7)) { // ..or Agrave-Thorn, excluding multiplication
-            return U;
+        // ASCII and Latin-1 were designed to optimize case-twiddling operations
+        int upper = ch & 0xDF;
+        if (upper <= 'Z' // In range A-Z
+                || (upper >= 0xC0 && upper <= 0xDE && upper != 0xD7)) { // ..or Agrave-Thorn, not multiplication
+            return upper;
        }

        // Special-case for 'y with Diaeresis' which uppercases out of latin1
@ -167,6 +169,27 @@ class CharacterDataLatin1 extends CharacterData {
        return ch;
    }

+    /**
+     * Compares two latin1 code points for equality, ignoring case.
+     *
+     * @param b1 byte representing a latin1 code point
+     * @param b2 another byte representing a latin1 code point
+     * @return true if the two bytes are considered equal ignoring case in latin1
+     */
+     static boolean equalsIgnoreCase(byte b1, byte b2) {
+         if (b1 == b2) { // Identical bytes match without any case folding
+             return true;
+         }
+         // Clearing bit 0x20 maps a-z and agrave-thorn onto their uppercase counterparts
+         int upper = b1 & 0xDF;
+         if (upper < 'A') {
+             return false;  // Folds below 'A': b1 is not a letter and differs from b2
+         }
+         return (upper <= 'Z' // In range A-Z
+                 || (upper >= 0xC0 && upper <= 0XDE && upper != 0xD7)) // ..or Agrave-Thorn, not multiplication
+                 && upper == (b2 & 0xDF); // ..and b2 folds to the same uppercase
+    }
+
    int toTitleCase(int ch) {
        return toUpperCase(ch);
    }
--- a/src/java.base/share/classes/java/lang/StringLatin1.java
+++ b/src/java.base/share/classes/java/lang/StringLatin1.java
@ -384,14 +384,9 @@ final class StringLatin1 {
                                          byte[] other, int ooffset, int len) {
        int last = toffset + len;
        while (toffset < last) {
-            char c1 = (char)(value[toffset++] & 0xff);
-            char c2 = (char)(other[ooffset++] & 0xff);
-            if (c1 == c2) {
-                continue;
-            }
-            int u1 = CharacterDataLatin1.instance.toUpperCase(c1);
-            int u2 = CharacterDataLatin1.instance.toUpperCase(c2);
-            if (u1 == u2) {
+            byte b1 = value[toffset++];
+            byte b2 = other[ooffset++];
+            if (CharacterDataLatin1.equalsIgnoreCase(b1, b2)) {
                continue;
            }
            return false;
--- a/test/jdk/java/lang/String/CompactString/EqualsIgnoreCase.java
+++ b/test/jdk/java/lang/String/CompactString/EqualsIgnoreCase.java
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2015, 2020, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2023, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -25,10 +25,12 @@ import org.testng.annotations.DataProvider;
 import org.testng.annotations.Test;

 import static org.testng.Assert.assertEquals;
+import static org.testng.Assert.assertFalse;
+import static org.testng.Assert.assertTrue;

 /*
 * @test
- * @bug 8077559 8248655
+ * @bug 8077559 8248655 8302871
 * @summary Tests Compact String. This one is for String.equalsIgnoreCase.
 * @run testng/othervm -XX:+CompactStrings EqualsIgnoreCase
 * @run testng/othervm -XX:-CompactStrings EqualsIgnoreCase
@ -75,4 +77,31 @@ public class EqualsIgnoreCase extends CompactString {
                                            source));
                        });
    }
+
+    /**
+     * Exhaustively checks that String.equalsIgnoreCase treats each of the 256x256 latin1 code
+     * point pairs consistently with Character.toLowerCase(Character.toUpperCase(c)).
+     */
+    @Test
+    public void checkConsistencyWithCharacterUppercaseLowerCase() {
+        for (char a = 0; a < 256; a++) {
+            for (char b = 0; b < 256; b++) {
+
+                int caseFoldA = Character.toLowerCase(Character.toUpperCase(a));
+                int caseFoldB = Character.toLowerCase(Character.toUpperCase(b));
+
+                String astr = Character.toString(a);
+                String bstr = Character.toString(b);
+
+                // Code points with the same case fold must match ignoring case; all others must not:
+                if (caseFoldA == caseFoldB) {
+                    assertTrue(astr.equalsIgnoreCase(bstr),
+                            "Expected %s to equalsIgnoreCase %s".formatted(astr, bstr));
+                } else {
+                    assertFalse(astr.equalsIgnoreCase(bstr),
+                            "Expected %s to not equalsIgnoreCase %s".formatted(astr, bstr));
+                }
+            }
+        }
+    }
 }
--- a/test/micro/org/openjdk/bench/java/lang/RegionMatchesIC.java
+++ b/test/micro/org/openjdk/bench/java/lang/RegionMatchesIC.java
@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package org.openjdk.bench.java.lang;
+
+import org.openjdk.jmh.annotations.*;
+
+import java.util.concurrent.TimeUnit;
+
+/*
+ * Naive benchmark of String::regionMatches with ignoreCase set to true
+ */
+
+public class RegionMatchesIC {
+
+    @BenchmarkMode(Mode.AverageTime)
+    @OutputTimeUnit(TimeUnit.NANOSECONDS)
+    @State(Scope.Benchmark)
+    @Warmup(iterations = 5, time = 1)
+    @Measurement(iterations = 5, time = 1)
+    @Fork(value = 3)
+    public static class Latin1 {
+
+        @Param({"1024"})
+        public int size; // number of characters compared by regionMatches
+
+        @Param({"ascii-match",
+                "ascii-mismatch",
+                "number-match",
+                "number-mismatch",
+                "lat1-match",
+                "lat1-mismatch"})
+        String codePoints; // selects which pair of repeated characters setup() builds
+        private String leftString;
+        private String rightString;
+
+        @Setup
+        public void setup() {
+
+            switch (codePoints) {
+                case "ascii-match" -> { // same ASCII letter, different case
+                    leftString  = "a".repeat(size);
+                    rightString = "A".repeat(size);
+                }
+                case "ascii-mismatch" -> { // different ASCII letters
+                    leftString  = "a".repeat(size);
+                    rightString = "b".repeat(size);
+                }
+                case "number-match" -> { // identical digits
+                    leftString  = "7".repeat(size);
+                    rightString = "7".repeat(size);
+                }
+                case "number-mismatch" -> { // different digits
+                    leftString  = "7".repeat(size);
+                    rightString = "9".repeat(size);
+                }
+                case "lat1-match" -> { // a-ring vs A-ring: differ only by case
+                    leftString  = "\u00e5".repeat(size);
+                    rightString = "\u00c5".repeat(size);
+                }
+                case "lat1-mismatch" -> { // a-ring vs AE ligature: different letters
+                    leftString  = "\u00e5".repeat(size);
+                    rightString = "\u00c6".repeat(size);
+                }
+                default -> throw new IllegalArgumentException("Unsupported coding: " + codePoints);
+            }
+            // Prepend differing prefixes so the two strings are never String.equals
+            leftString = "l" + leftString;
+            rightString = "r" + rightString;
+        }
+
+        @Benchmark
+        public boolean regionMatchesIC() {
+            return leftString.regionMatches(true, 1, rightString, 1, size); // offset 1 skips the prefix
+        }
+    }
+}