8291660: Grapheme support in BreakIterator

Reviewed-by: smarks
This commit is contained in:
Naoto Sato 2022-09-09 17:13:51 +00:00
parent a14c3a493a
commit b8598b0297
15 changed files with 245 additions and 149 deletions

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2005, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -50,7 +50,6 @@ public class BreakIteratorInfo extends ListResourceBundle {
// built-in type of BreakIterator
{"BreakIteratorClasses",
new String[] {
"RuleBasedBreakIterator", // character-break iterator class
"RuleBasedBreakIterator", // word-break iterator class
"RuleBasedBreakIterator", // line-break iterator class
"RuleBasedBreakIterator" // sentence-break iterator class
@ -58,7 +57,6 @@ public class BreakIteratorInfo extends ListResourceBundle {
},
// Rules filename for each break-iterator
{"CharacterData", "CharacterBreakIteratorData"},
{"WordData", "WordBreakIteratorData"},
{"LineData", "LineBreakIteratorData"},
{"SentenceData", "SentenceBreakIteratorData"},

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 1999, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1999, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -67,53 +67,6 @@ import java.util.ListResourceBundle;
public class BreakIteratorRules extends ListResourceBundle {
protected final Object[][] getContents() {
return new Object[][] {
// rules describing how to break between logical characters
{ "CharacterBreakRules",
// ignore non-spacing marks and enclosing marks (since we never
// put a break before ignore characters, this keeps combining
// accents with the base characters they modify)
"<enclosing>=[:Mn::Me:];"
// other category definitions
+ "<choseong>=[\u1100-\u115f];"
+ "<jungseong>=[\u1160-\u11a7];"
+ "<jongseong>=[\u11a8-\u11ff];"
+ "<surr-hi>=[\ud800-\udbff];"
+ "<surr-lo>=[\udc00-\udfff];"
// break after every character, except as follows:
+ ".;"
// keep base and combining characters togethers
+ "<base>=[^<enclosing>^[:Cc::Cf::Zl::Zp:]];"
+ "<base><enclosing><enclosing>*;"
// keep CRLF sequences together
+ "\r\n;"
// keep surrogate pairs together
+ "<surr-hi><surr-lo>;"
// keep Hangul syllables spelled out using conjoining jamo together
+ "<choseong>*<jungseong>*<jongseong>*;"
// various additions for Hindi support
+ "<nukta>=[\u093c];"
+ "<danda>=[\u0964\u0965];"
+ "<virama>=[\u094d];"
+ "<devVowelSign>=[\u093e-\u094c\u0962\u0963];"
+ "<devConsonant>=[\u0915-\u0939];"
+ "<devNuktaConsonant>=[\u0958-\u095f];"
+ "<devCharEnd>=[\u0902\u0903\u0951-\u0954];"
+ "<devCAMN>=(<devConsonant>{<nukta>});"
+ "<devConsonant1>=(<devNuktaConsonant>|<devCAMN>);"
+ "<zwj>=[\u200d];"
+ "<devConjunct>=({<devConsonant1><virama>{<zwj>}}<devConsonant1>);"
+ "<devConjunct>{<devVowelSign>}{<devCharEnd>};"
+ "<danda><nukta>;"
},
// default rules for finding word boundaries
{ "WordBreakRules",
// ignore non-spacing marks, enclosing marks, and format characters,

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 1999, 2016, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1999, 2022, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@ -25,13 +25,18 @@
package sun.util.locale.provider;
import java.io.IOException;
import java.text.BreakIterator;
import java.text.CharacterIterator;
import java.text.spi.BreakIteratorProvider;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
import java.util.MissingResourceException;
import java.util.Objects;
import java.util.Set;
import jdk.internal.util.regex.Grapheme;
import sun.text.DictionaryBasedBreakIterator;
import sun.text.RuleBasedBreakIterator;
@ -45,10 +50,9 @@ import sun.text.RuleBasedBreakIterator;
public class BreakIteratorProviderImpl extends BreakIteratorProvider
implements AvailableLanguageTags {
private static final int CHARACTER_INDEX = 0;
private static final int WORD_INDEX = 1;
private static final int LINE_INDEX = 2;
private static final int SENTENCE_INDEX = 3;
private static final int WORD_INDEX = 0;
private static final int LINE_INDEX = 1;
private static final int SENTENCE_INDEX = 2;
private final LocaleProviderAdapter.Type type;
private final Set<String> langtags;
@ -127,10 +131,7 @@ public class BreakIteratorProviderImpl extends BreakIteratorProvider
*/
@Override
public BreakIterator getCharacterInstance(Locale locale) {
return getBreakInstance(locale,
CHARACTER_INDEX,
"CharacterData",
"CharacterDictionary");
return new GraphemeBreakIterator();
}
/**
@ -193,4 +194,151 @@ public class BreakIteratorProviderImpl extends BreakIteratorProvider
public boolean isSupportedLocale(Locale locale) {
    // Look up the adapter registered for this provider's type and let it
    // decide, based on this provider's language tags, whether the locale
    // is supported.
    var adapter = LocaleProviderAdapter.forType(type);
    return adapter.isSupportedProviderLocale(locale, langtags);
}
/**
 * A {@link BreakIterator} whose boundaries are the positions reported by
 * {@code Grapheme.nextBoundary()}.
 *
 * <p>All boundaries of the text are computed eagerly in
 * {@link #setText(CharacterIterator)} and kept in a sorted list; the other
 * methods only move an index within that list.
 */
static final class GraphemeBreakIterator extends BreakIterator {
    // The text being scanned, as handed to setText().
    CharacterIterator ci;
    // Cached value of boundaries.get(boundaryIndex).
    int offset;
    // Ascending boundary offsets; always holds at least the end index.
    List<Integer> boundaries;
    // Position of the current boundary within the boundaries list.
    int boundaryIndex;

    /** Creates an iterator over the empty string. */
    GraphemeBreakIterator() {
        setText("");
    }

    /** Moves to the first boundary and returns its offset. */
    @Override
    public int first() {
        boundaryIndex = 0;
        return current();
    }

    /** Moves to the last boundary and returns its offset. */
    @Override
    public int last() {
        boundaryIndex = boundaries.size() - 1;
        return current();
    }

    /**
     * Moves {@code n} boundaries forward (or backward when {@code n} is
     * negative).
     *
     * @param n number of boundaries to move; zero leaves the position as is
     * @return the offset of the boundary reached, or {@code DONE} when the
     *         move would run off either end, in which case the position is
     *         clamped to the first or last boundary
     */
    @Override
    public int next(int n) {
        if (n == 0) {
            return offset;
        }

        boundaryIndex = boundaryIndex + n;
        if (boundaryIndex < 0) {
            boundaryIndex = 0;
            current();      // re-synchronize the cached offset
            return DONE;
        } else if (boundaryIndex >= boundaries.size()) {
            boundaryIndex = boundaries.size() - 1;
            current();      // re-synchronize the cached offset
            return DONE;
        } else {
            return current();
        }
    }

    /** Equivalent to {@code next(1)}. */
    @Override
    public int next() {
        return next(1);
    }

    /** Equivalent to {@code next(-1)}. */
    @Override
    public int previous() {
        return next(-1);
    }

    /**
     * Moves to the first boundary that is not before {@code offset + 1},
     * capped at the last boundary.
     *
     * @param offset the offset to start searching from
     * @return the offset of the boundary reached, or {@code DONE} when the
     *         iterator is already positioned at the last boundary and that
     *         same offset is passed in
     * @throws IllegalArgumentException if {@code offset} lies outside the
     *         first and last boundaries
     */
    @Override
    public int following(int offset) {
        var lastBoundary = boundaries.get(boundaries.size() - 1);

        if (offset < boundaries.get(0) || offset > lastBoundary) {
            throw new IllegalArgumentException("offset is out of bounds: " + offset);
        } else if (offset == this.offset && this.offset == lastBoundary) {
            return DONE;
        }

        boundaryIndex = Collections.binarySearch(boundaries, Math.min(offset + 1, lastBoundary));
        if (boundaryIndex < 0) {
            // Not an exact hit: binarySearch encodes the insertion point,
            // which is the index of the next larger boundary.
            boundaryIndex = -boundaryIndex - 1;
        }

        return current();
    }

    /** Returns the offset of the current boundary, refreshing the cache. */
    @Override
    public int current() {
        offset = boundaries.get(boundaryIndex);
        return offset;
    }

    /** Returns the iterator passed to the most recent {@code setText} call. */
    @Override
    public CharacterIterator getText() {
        return ci;
    }

    /**
     * Replaces the text, recomputes every boundary between its begin and
     * end index, and resets the position to the first boundary.
     *
     * @param newText the text to scan
     * @throws NullPointerException if {@code newText} is {@code null}
     */
    @Override
    public void setText(CharacterIterator newText) {
        // Reject null before touching any state so a failed call leaves
        // this iterator usable.
        ci = Objects.requireNonNull(newText, "newText");
        var text = new CharacterIteratorCharSequence(ci);
        var end = ci.getEndIndex();
        boundaries = new ArrayList<>();

        for (int b = ci.getBeginIndex(); b < end;) {
            boundaries.add(b);
            b = Grapheme.nextBoundary(text, b, end);
        }
        boundaries.add(end);

        // The scan above repositions ci through charAt(), so ci.getIndex()
        // no longer says anything about the current boundary. Derive the
        // cached offset from the boundary list instead, so that it always
        // agrees with boundaryIndex.
        boundaryIndex = 0;
        current();
    }

    /**
     * Tells whether {@code offset} is one of the computed boundaries.
     *
     * <p>Overridden because the default implementation in
     * {@code BreakIterator} is documented there as problematic; see the
     * comments in that implementation.
     *
     * @throws IllegalArgumentException if {@code offset} lies outside the
     *         first and last boundaries
     */
    @Override
    public boolean isBoundary(int offset) {
        if (offset < boundaries.get(0) || offset > boundaries.get(boundaries.size() - 1)) {
            throw new IllegalArgumentException("offset is out of bounds: " + offset);
        }
        return Collections.binarySearch(boundaries, offset) >= 0;
    }

    @Override
    public int hashCode() {
        return Objects.hash(ci, offset, boundaries, boundaryIndex);
    }

    /** Two iterators are equal when their text, boundaries and position match. */
    @Override
    public boolean equals(Object o) {
        return o instanceof GraphemeBreakIterator that &&
                ci.equals(that.ci) &&
                offset == that.offset &&
                boundaries.equals(that.boundaries) &&
                boundaryIndex == that.boundaryIndex;
    }
}
/**
 * Minimal {@link CharSequence} view of a {@link CharacterIterator}, intended
 * only for passing the text to {@code Grapheme.nextBoundary()}.
 *
 * <p>Indexes are the iterator's own (absolute) indexes: {@code charAt(i)}
 * reads the character at iterator index {@code i}, so the sequence spans
 * {@code [0, getEndIndex())} even when the iterator covers only a sub-range
 * of its underlying text. Reading a character moves the iterator's position.
 */
static final class CharacterIteratorCharSequence implements CharSequence {
    CharacterIterator src;

    CharacterIteratorCharSequence(CharacterIterator ci) {
        src = ci;
    }

    /**
     * Returns the end index of the underlying iterator.
     *
     * <p>This is deliberately not {@code endIndex - beginIndex}: because
     * {@link #charAt(int)} takes absolute indexes, the length has to be
     * expressed in the same coordinate system, otherwise callers that
     * compare an index against {@code length()} stop short of the end of
     * the text whenever the begin index is non-zero.
     */
    @Override
    public int length() {
        return src.getEndIndex();
    }

    /**
     * Returns the character at the given iterator index, leaving the
     * iterator positioned there.
     */
    @Override
    public char charAt(int index) {
        src.setIndex(index);
        return src.current();
    }

    @Override
    public CharSequence subSequence(int start, int end) {
        // not expected to be called
        throw new UnsupportedOperationException();
    }
}
}