8291660: Grapheme support in BreakIterator

Reviewed-by: smarks
2025-08-28 15:24:43 +02:00 · 2022-09-09 17:13:51 +00:00 · 2022-09-09 17:13:51 +00:00 · b8598b0297
commit b8598b0297
parent a14c3a493a
15 changed files with 245 additions and 149 deletions
--- a/src/java.base/share/classes/sun/text/resources/BreakIteratorInfo.java
+++ b/src/java.base/share/classes/sun/text/resources/BreakIteratorInfo.java
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003, 2005, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -50,7 +50,6 @@ public class BreakIteratorInfo extends ListResourceBundle {
            // built-in type of BreakIterator
            {"BreakIteratorClasses",
                new String[] {
-                    "RuleBasedBreakIterator",  // character-break iterator class
                    "RuleBasedBreakIterator",  // word-break iterator class
                    "RuleBasedBreakIterator",  // line-break iterator class
                    "RuleBasedBreakIterator"   // sentence-break iterator class
@ -58,7 +57,6 @@ public class BreakIteratorInfo extends ListResourceBundle {
            },

            // Rules filename for each break-iterator
-            {"CharacterData", "CharacterBreakIteratorData"},
            {"WordData",      "WordBreakIteratorData"},
            {"LineData",      "LineBreakIteratorData"},
            {"SentenceData",  "SentenceBreakIteratorData"},
--- a/src/java.base/share/classes/sun/text/resources/BreakIteratorRules.java
+++ b/src/java.base/share/classes/sun/text/resources/BreakIteratorRules.java
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999, 2021, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -67,53 +67,6 @@ import java.util.ListResourceBundle;
 public class BreakIteratorRules extends ListResourceBundle {
    protected final Object[][] getContents() {
        return new Object[][] {
-            // rules describing how to break between logical characters
-            { "CharacterBreakRules",
-
-              // ignore non-spacing marks and enclosing marks (since we never
-              // put a break before ignore characters, this keeps combining
-              // accents with the base characters they modify)
-              "<enclosing>=[:Mn::Me:];"
-
-              // other category definitions
-              + "<choseong>=[\u1100-\u115f];"
-              + "<jungseong>=[\u1160-\u11a7];"
-              + "<jongseong>=[\u11a8-\u11ff];"
-              + "<surr-hi>=[\ud800-\udbff];"
-              + "<surr-lo>=[\udc00-\udfff];"
-
-              // break after every character, except as follows:
-              + ".;"
-
-              // keep base and combining characters togethers
-              + "<base>=[^<enclosing>^[:Cc::Cf::Zl::Zp:]];"
-              + "<base><enclosing><enclosing>*;"
-
-              // keep CRLF sequences together
-              + "\r\n;"
-
-              // keep surrogate pairs together
-              + "<surr-hi><surr-lo>;"
-
-              // keep Hangul syllables spelled out using conjoining jamo together
-              + "<choseong>*<jungseong>*<jongseong>*;"
-
-              // various additions for Hindi support
-              + "<nukta>=[\u093c];"
-              + "<danda>=[\u0964\u0965];"
-              + "<virama>=[\u094d];"
-              + "<devVowelSign>=[\u093e-\u094c\u0962\u0963];"
-              + "<devConsonant>=[\u0915-\u0939];"
-              + "<devNuktaConsonant>=[\u0958-\u095f];"
-              + "<devCharEnd>=[\u0902\u0903\u0951-\u0954];"
-              + "<devCAMN>=(<devConsonant>{<nukta>});"
-              + "<devConsonant1>=(<devNuktaConsonant>|<devCAMN>);"
-              + "<zwj>=[\u200d];"
-              + "<devConjunct>=({<devConsonant1><virama>{<zwj>}}<devConsonant1>);"
-              + "<devConjunct>{<devVowelSign>}{<devCharEnd>};"
-              + "<danda><nukta>;"
-            },
-
            // default rules for finding word boundaries
            { "WordBreakRules",
              // ignore non-spacing marks, enclosing marks, and format characters,
--- a/src/java.base/share/classes/sun/util/locale/provider/BreakIteratorProviderImpl.java
+++ b/src/java.base/share/classes/sun/util/locale/provider/BreakIteratorProviderImpl.java
@ -1,5 +1,5 @@
 /*
- * Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 1999, 2022, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
@ -25,13 +25,18 @@

 package sun.util.locale.provider;

-import java.io.IOException;
 import java.text.BreakIterator;
+import java.text.CharacterIterator;
 import java.text.spi.BreakIteratorProvider;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
 import java.util.Locale;
 import java.util.MissingResourceException;
 import java.util.Objects;
 import java.util.Set;
+
+import jdk.internal.util.regex.Grapheme;
 import sun.text.DictionaryBasedBreakIterator;
 import sun.text.RuleBasedBreakIterator;

@ -45,10 +50,9 @@ import sun.text.RuleBasedBreakIterator;
 public class BreakIteratorProviderImpl extends BreakIteratorProvider
                                       implements AvailableLanguageTags {

-    private static final int CHARACTER_INDEX = 0;
-    private static final int WORD_INDEX = 1;
-    private static final int LINE_INDEX = 2;
-    private static final int SENTENCE_INDEX = 3;
+    private static final int WORD_INDEX = 0;
+    private static final int LINE_INDEX = 1;
+    private static final int SENTENCE_INDEX = 2;

    private final LocaleProviderAdapter.Type type;
    private final Set<String> langtags;
@ -127,10 +131,7 @@ public class BreakIteratorProviderImpl extends BreakIteratorProvider
     */
    @Override
    public BreakIterator getCharacterInstance(Locale locale) {
-        return getBreakInstance(locale,
-                                CHARACTER_INDEX,
-                                "CharacterData",
-                                "CharacterDictionary");
+        // The grapheme-based iterator is constructed without any locale data,
+        // so the locale is only validated here: the BreakIteratorProvider
+        // contract requires a NullPointerException for a null locale.
+        Objects.requireNonNull(locale);
+        return new GraphemeBreakIterator();
    }

    /**
@ -193,4 +194,151 @@ public class BreakIteratorProviderImpl extends BreakIteratorProvider
    public boolean isSupportedLocale(Locale locale) {
        return LocaleProviderAdapter.forType(type).isSupportedProviderLocale(locale, langtags);
    }
+
+    /**
+     * A {@code BreakIterator} whose boundaries are the positions reported by
+     * {@code Grapheme.nextBoundary()}.
+     *
+     * All boundaries of the text are computed once in
+     * {@link #setText(CharacterIterator)} and stored in ascending order; the
+     * navigation methods only move an index into that list.
+     */
+    static final class GraphemeBreakIterator extends BreakIterator {
+        // Text being scanned, as handed to setText().
+        CharacterIterator ci;
+        // Current boundary; kept equal to boundaries.get(boundaryIndex).
+        int offset;
+        // Ascending boundary offsets, from the begin index to the end index
+        // of the text (both inclusive), so the list is never empty.
+        List<Integer> boundaries;
+        // Index of the current boundary within "boundaries".
+        int boundaryIndex;
+
+        GraphemeBreakIterator() {
+            setText("");
+        }
+
+        /** Moves to the first boundary and returns it. */
+        @Override
+        public int first() {
+            boundaryIndex = 0;
+            return current();
+        }
+
+        /** Moves to the last boundary and returns it. */
+        @Override
+        public int last() {
+            boundaryIndex = boundaries.size() - 1;
+            return current();
+        }
+
+        /**
+         * Moves {@code n} boundaries forward (or backward when {@code n} is
+         * negative).
+         *
+         * @return the boundary reached, or {@code DONE} when the move would
+         *         run past either end; in that case the position is left on
+         *         the first or last boundary respectively
+         */
+        @Override
+        public int next(int n) {
+            if (n == 0) {
+                return current();
+            }
+
+            // Widen to long: boundaryIndex + n can overflow int for a large n,
+            // which would otherwise clamp to the wrong end of the text.
+            long target = (long) boundaryIndex + n;
+            if (target < 0) {
+                boundaryIndex = 0;
+                current();
+                return DONE;
+            } else if (target >= boundaries.size()) {
+                boundaryIndex = boundaries.size() - 1;
+                current();
+                return DONE;
+            } else {
+                boundaryIndex = (int) target;
+                return current();
+            }
+        }
+
+        @Override
+        public int next() {
+            return next(1);
+        }
+
+        @Override
+        public int previous() {
+            return next(-1);
+        }
+
+        /**
+         * Moves to the first boundary strictly greater than {@code offset}.
+         *
+         * @return that boundary, or {@code DONE} (position unchanged) when
+         *         {@code offset} is the last boundary
+         * @throws IllegalArgumentException if {@code offset} lies outside the
+         *         range spanned by the first and last boundary
+         */
+        @Override
+        public int following(int offset) {
+            int lastBoundary = boundaries.get(boundaries.size() - 1);
+
+            if (offset < boundaries.get(0) || offset > lastBoundary) {
+                throw new IllegalArgumentException("offset is out of bounds: " + offset);
+            } else if (offset == lastBoundary) {
+                // Nothing follows the last boundary, wherever we currently are.
+                return DONE;
+            }
+
+            // offset < lastBoundary here, so offset + 1 neither overflows nor
+            // exceeds the last boundary and the insertion point is in range.
+            int found = Collections.binarySearch(boundaries, offset + 1);
+            boundaryIndex = found >= 0 ? found : -found - 1;
+
+            return current();
+        }
+
+        /**
+         * Moves to the last boundary strictly less than {@code offset}.
+         *
+         * Overridden because the inherited implementation is built on
+         * {@code following()} and cannot step back from the last boundary
+         * once {@code following()} reports {@code DONE} there.
+         *
+         * @return that boundary, or {@code DONE} (position unchanged) when
+         *         {@code offset} is the first boundary
+         * @throws IllegalArgumentException if {@code offset} lies outside the
+         *         range spanned by the first and last boundary
+         */
+        @Override
+        public int preceding(int offset) {
+            int firstBoundary = boundaries.get(0);
+
+            if (offset < firstBoundary || offset > boundaries.get(boundaries.size() - 1)) {
+                throw new IllegalArgumentException("offset is out of bounds: " + offset);
+            } else if (offset == firstBoundary) {
+                return DONE;
+            }
+
+            // Exact hit: step to the boundary just before it. Miss: the entry
+            // just before the insertion point. offset > firstBoundary keeps
+            // the resulting index non-negative in both cases.
+            int found = Collections.binarySearch(boundaries, offset);
+            boundaryIndex = (found >= 0 ? found : -found - 1) - 1;
+
+            return current();
+        }
+
+        /** Returns the current boundary, refreshing the cached offset. */
+        @Override
+        public int current() {
+            offset = boundaries.get(boundaryIndex);
+            return offset;
+        }
+
+        @Override
+        public CharacterIterator getText() {
+            return ci;
+        }
+
+        /**
+         * Replaces the text, recomputes every boundary and resets the current
+         * position to the first boundary.
+         *
+         * Scanning reads characters through CharacterIteratorCharSequence,
+         * whose charAt() repositions {@code newText}; the iterator's own index
+         * is therefore not used to derive the current position.
+         *
+         * @throws NullPointerException if {@code newText} is null; the
+         *         previously set text is kept in that case
+         */
+        @Override
+        public void setText(CharacterIterator newText) {
+            // Build into locals first so a failure leaves this object intact.
+            var text = new CharacterIteratorCharSequence(newText);
+            int begin = newText.getBeginIndex();
+            int end = newText.getEndIndex();
+            List<Integer> found = new ArrayList<>();
+
+            for (int b = begin; b < end;) {
+                found.add(b);
+                b = Grapheme.nextBoundary(text, b, end);
+            }
+            found.add(end);
+
+            ci = newText;
+            boundaries = found;
+            boundaryIndex = 0;
+            offset = begin;
+        }
+
+        // Had to override to suppress the bug in the BreakIterator's default impl.
+        // See the comments in the default impl.
+        @Override
+        public boolean isBoundary(int offset) {
+            if (offset < boundaries.get(0) || offset > boundaries.get(boundaries.size() - 1)) {
+                throw new IllegalArgumentException("offset is out of bounds: " + offset);
+            }
+            return Collections.binarySearch(boundaries, offset) >= 0;
+        }
+
+        @Override
+        public int hashCode() {
+            return Objects.hash(ci, offset, boundaries, boundaryIndex);
+        }
+
+        @Override
+        public boolean equals(Object o) {
+            return o instanceof GraphemeBreakIterator that &&
+                    ci.equals(that.ci) &&
+                    offset == that.offset &&
+                    boundaries.equals(that.boundaries) &&
+                    boundaryIndex == that.boundaryIndex;
+        }
+    }
+
+    // Implementation only for calling Grapheme.nextBoundary().
+    // Indices of this CharSequence are the absolute indices of the wrapped
+    // CharacterIterator (charAt() passes them straight to setIndex()), so the
+    // sequence spans [0, endIndex) even when the iterator's begin index is
+    // not zero. Callers must only read indices that are valid for the iterator.
+    static final class CharacterIteratorCharSequence implements CharSequence {
+        CharacterIterator src;
+        CharacterIteratorCharSequence(CharacterIterator ci) {
+            src = ci;
+        }
+
+        @Override
+        public int length() {
+            // Must be the end index, not (end - begin): charAt() is addressed
+            // with absolute indices, and consumers such as
+            // Character.codePointAt() compare those indices against length().
+            return src.getEndIndex();
+        }
+
+        @Override
+        public char charAt(int index) {
+            // Note: this moves the wrapped iterator's current index.
+            src.setIndex(index);
+            return src.current();
+        }
+
+        @Override
+        public CharSequence subSequence(int start, int end) {
+            // not expected to be called
+            throw new UnsupportedOperationException();
+        }
+    }
 }