8305486: Add split() variants that keep the delimiters to String and j.u.r.Pattern

Reviewed-by: jpai, rriggs
This commit is contained in:
Raffaello Giulietti 2023-05-08 14:51:35 +00:00
parent ad90fb6da3
commit 93ee19f58a
3 changed files with 359 additions and 25 deletions

View file

@ -3222,6 +3222,109 @@ public final class String
* @since 1.4
*/
public String[] split(String regex, int limit) {
    // Plain split: the shared routine is asked not to keep the delimiters.
    final boolean keepDelimiters = false;
    return split(regex, limit, keepDelimiters);
}
/**
* Splits this string around matches of the given regular expression and
* returns both the strings and the matching delimiters.
*
* <p> The array returned by this method contains each substring of this
* string that is terminated by another substring that matches the given
* expression or is terminated by the end of the string.
* Each substring is immediately followed by the subsequence (the delimiter)
* that matches the given expression, <em>except</em> for the last
* substring, which is not followed by anything.
* The substrings in the array and the delimiters are in the order in which
* they occur in the input.
* If the expression does not match any part of the input then the resulting
* array has just one element, namely this string.
*
* <p> When there is a positive-width match at the beginning of this
* string then an empty leading substring is included at the beginning
* of the resulting array. A zero-width match at the beginning, however,
* never produces such an empty leading substring, nor the empty delimiter.
*
* <p> The {@code limit} parameter controls the number of times the
* pattern is applied and therefore affects the length of the resulting
* array.
* <ul>
* <li> If the <i>limit</i> is positive then the pattern will be applied
* at most <i>limit</i>&nbsp;-&nbsp;1 times, the array's length will be
* no greater than 2&nbsp;&times;&nbsp;<i>limit</i>&nbsp;-&nbsp;1, and the array's last
* entry will contain all input beyond the last matched delimiter.</li>
*
* <li> If the <i>limit</i> is zero then the pattern will be applied as
* many times as possible, the array can have any length, and trailing
* empty strings will be discarded.</li>
*
* <li> If the <i>limit</i> is negative then the pattern will be applied
* as many times as possible and the array can have any length.</li>
* </ul>
*
* <p> The input {@code "boo:::and::foo"}, for example, yields the following
* results with these parameters:
*
* <table class="plain" style="margin-left:2em;">
* <caption style="display:none">Split example showing regex, limit, and result</caption>
* <thead>
* <tr>
* <th scope="col">Regex</th>
* <th scope="col">Limit</th>
* <th scope="col">Result</th>
* </tr>
* </thead>
* <tbody>
* <tr><th scope="row" rowspan="3" style="font-weight:normal">:+</th>
* <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">2</th>
* <td>{@code { "boo", ":::", "and::foo" }}</td></tr>
* <tr><!-- : -->
* <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">5</th>
* <td>{@code { "boo", ":::", "and", "::", "foo" }}</td></tr>
* <tr><!-- : -->
* <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">-1</th>
* <td>{@code { "boo", ":::", "and", "::", "foo" }}</td></tr>
* <tr><th scope="row" rowspan="3" style="font-weight:normal">o</th>
* <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">5</th>
* <td>{@code { "b", "o", "", "o", ":::and::f", "o", "", "o", "" }}</td></tr>
* <tr><!-- o -->
* <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">-1</th>
* <td>{@code { "b", "o", "", "o", ":::and::f", "o", "", "o", "" }}</td></tr>
* <tr><!-- o -->
* <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">0</th>
* <td>{@code { "b", "o", "", "o", ":::and::f", "o", "", "o" }}</td></tr>
* </tbody>
* </table>
*
* @apiNote An invocation of this method of the form
* <i>str.</i>{@code splitWithDelimiters(}<i>regex</i>{@code ,}&nbsp;<i>n</i>{@code )}
* yields the same result as the expression
*
* <blockquote>
* <code>
* {@link java.util.regex.Pattern}.{@link
* java.util.regex.Pattern#compile(String) compile}(<i>regex</i>).{@link
* java.util.regex.Pattern#splitWithDelimiters(CharSequence,int) splitWithDelimiters}(<i>str</i>,&nbsp;<i>n</i>)
* </code>
* </blockquote>
*
* @param regex
* the delimiting regular expression
*
* @param limit
* the result threshold, as described above
*
* @return the array of strings computed by splitting this string
* around matches of the given regular expression, alternating
* substrings and matching delimiters
*
* @since 21
*/
public String[] splitWithDelimiters(String regex, int limit) {
    // Same shared routine as split(String, int), but asking it to keep
    // the matched delimiters in the result.
    final boolean keepDelimiters = true;
    return split(regex, limit, keepDelimiters);
}
// Shared implementation behind the public split variants. It first tries a
// single-character fast path and otherwise compiles the regex and delegates
// to Pattern. withDelimiters selects whether matched delimiters are kept.
// NOTE(review): in this listing the condition lines and the return statements
// appear twice (before/after versions shown together); the comments below
// describe the logic they have in common.
private String[] split(String regex, int limit, boolean withDelimiters) {
/* fastpath if the regex is a
* (1) one-char String and this character is not one of the
* RegEx's meta characters ".$|()[{^?*+\\", or
@ -3230,48 +3333,57 @@ public final class String
*/
// ch is assigned inside the condition below and holds the literal
// delimiter character; it is only meaningful when the condition is true.
//
// Each "((ch - lo) | (hi - ch)) < 0" test is true exactly when ch lies
// outside [lo, hi]: one of the two differences is then negative, so the
// OR of both has its sign bit set. The two-char form therefore accepts a
// backslash followed by a char that is neither an ASCII digit nor an
// ASCII letter. In either form ch must also fall outside the range
// MIN_HIGH_SURROGATE..MAX_LOW_SURROGATE.
char ch = 0;
if (((regex.length() == 1 &&
".$|()[{^?*+\\".indexOf(ch = regex.charAt(0)) == -1) ||
(regex.length() == 2 &&
regex.charAt(0) == '\\' &&
(((ch = regex.charAt(1))-'0')|('9'-ch)) < 0 &&
((ch-'a')|('z'-ch)) < 0 &&
((ch-'A')|('Z'-ch)) < 0)) &&
(ch < Character.MIN_HIGH_SURROGATE ||
ch > Character.MAX_LOW_SURROGATE))
".$|()[{^?*+\\".indexOf(ch = regex.charAt(0)) == -1) ||
(regex.length() == 2 &&
regex.charAt(0) == '\\' &&
(((ch = regex.charAt(1))-'0')|('9'-ch)) < 0 &&
((ch-'a')|('z'-ch)) < 0 &&
((ch-'A')|('Z'-ch)) < 0)) &&
(ch < Character.MIN_HIGH_SURROGATE ||
ch > Character.MAX_LOW_SURROGATE))
{
// All the checks above can potentially be constant folded by
// a JIT/AOT compiler when the regex is a constant string.
// That requires method inlining of the checks, which is only
// possible when the actual split logic is in a separate method
// because the large split loop can usually not be inlined.
return split(ch, limit);
return split(ch, limit, withDelimiters);
}
// General case: compile the regex and let Pattern do the splitting,
// using its delimiter-keeping variant when withDelimiters is set.
return Pattern.compile(regex).split(this, limit);
Pattern pattern = Pattern.compile(regex);
return withDelimiters
? pattern.splitWithDelimiters(this, limit)
: pattern.split(this, limit);
}
private String[] split(char ch, int limit) {
private String[] split(char ch, int limit, boolean withDelimiters) {
int matchCount = 0;
int off = 0;
int next = 0;
int next;
boolean limited = limit > 0;
ArrayList<String> list = new ArrayList<>();
String del = withDelimiters ? String.valueOf(ch) : null;
while ((next = indexOf(ch, off)) != -1) {
if (!limited || list.size() < limit - 1) {
if (!limited || matchCount < limit - 1) {
list.add(substring(off, next));
if (withDelimiters) {
list.add(del);
}
off = next + 1;
++matchCount;
} else { // last one
//assert (list.size() == limit - 1);
int last = length();
list.add(substring(off, last));
off = last;
++matchCount;
break;
}
}
// If no match was found, return this
if (off == 0)
return new String[]{this};
return new String[] {this};
// Add remaining segment
if (!limited || list.size() < limit)
if (!limited || matchCount < limit)
list.add(substring(off, length()));
// Construct result
@ -3328,7 +3440,7 @@ public final class String
* @since 1.4
*/
public String[] split(String regex) {
// Convenience overload: delegates with a limit of 0; the three-argument
// form is additionally told not to keep the delimiters.
return split(regex, 0);
return split(regex, 0, false);
}
/**