8305486: Add split() variants that keep the delimiters to String and j.u.r.Pattern

Reviewed-by: jpai, rriggs
This commit is contained in:
Raffaello Giulietti 2023-05-08 14:51:35 +00:00
parent ad90fb6da3
commit 93ee19f58a
3 changed files with 359 additions and 25 deletions

View file

@ -1302,6 +1302,100 @@ public final class Pattern
* around matches of this pattern
*/
public String[] split(CharSequence input, int limit) {
return split(input, limit, false);
}
/**
* Splits the given input sequence around matches of this pattern and
* returns both the strings and the matching delimiters.
*
* <p> The array returned by this method contains each substring of the
* input sequence that is terminated by another subsequence that matches
* this pattern or is terminated by the end of the input sequence.
* Each substring is immediately followed by the subsequence (the delimiter)
* that matches this pattern, <em>except</em> for the last substring, which
* is not followed by anything.
* The substrings in the array and the delimiters are in the order in which
* they occur in the input.
* If this pattern does not match any subsequence of the input then the
* resulting array has just one element, namely the input sequence in string
* form.
*
* <p> When there is a positive-width match at the beginning of the input
* sequence then an empty leading substring is included at the beginning
* of the resulting array.
* A zero-width match at the beginning however never produces such empty
* leading substring nor the empty delimiter.
*
* <p> The {@code limit} parameter controls the number of times the
* pattern is applied and therefore affects the length of the resulting
* array.
* <ul>
* <li> If the <i>limit</i> is positive then the pattern will be applied
* at most <i>limit</i> - 1 times, the array's length will be
* no greater than 2 &times; <i>limit</i> - 1, and the array's last
* entry will contain all input beyond the last matched delimiter.</li>
*
* <li> If the <i>limit</i> is zero then the pattern will be applied as
* many times as possible, the array can have any length, and trailing
* empty strings, whether substrings or delimiters, will be discarded.</li>
*
* <li> If the <i>limit</i> is negative then the pattern will be applied
* as many times as possible and the array can have any length.</li>
* </ul>
*
* <p> The input {@code "boo:::and::foo"}, for example, yields the following
* results with these parameters:
*
* <table class="plain" style="margin-left:2em;">
* <caption style="display:none">Split example showing regex, limit, and result</caption>
* <thead>
* <tr>
* <th scope="col">Regex</th>
* <th scope="col">Limit</th>
* <th scope="col">Result</th>
* </tr>
* </thead>
* <tbody>
* <tr><th scope="row" rowspan="3" style="font-weight:normal">:+</th>
* <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">2</th>
* <td>{@code { "boo", ":::", "and::foo" }}</td></tr>
* <tr><!-- : -->
* <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">5</th>
* <td>{@code { "boo", ":::", "and", "::", "foo" }}</td></tr>
* <tr><!-- : -->
* <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">-1</th>
* <td>{@code { "boo", ":::", "and", "::", "foo" }}</td></tr>
* <tr><th scope="row" rowspan="3" style="font-weight:normal">o</th>
* <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">5</th>
* <td>{@code { "b", "o", "", "o", ":::and::f", "o", "", "o", "" }}</td></tr>
* <tr><!-- o -->
* <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">-1</th>
* <td>{@code { "b", "o", "", "o", ":::and::f", "o", "", "o", "" }}</td></tr>
* <tr><!-- o -->
* <th scope="row" style="font-weight:normal; text-align:right; padding-right:1em">0</th>
* <td>{@code { "b", "o", "", "o", ":::and::f", "o", "", "o" }}</td></tr>
* </tbody>
* </table>
*
* @param input
* The character sequence to be split
*
* @param limit
* The result threshold, as described above
*
* @return The array of strings computed by splitting the input
* around matches of this pattern, alternating
* substrings and matching delimiters
*
* @since 21
*/
public String[] splitWithDelimiters(CharSequence input, int limit) {
return split(input, limit, true);
}
private String[] split(CharSequence input, int limit, boolean withDelimiters) {
int matchCount = 0;
int index = 0;
boolean matchLimited = limit > 0;
ArrayList<String> matchList = new ArrayList<>();
@ -1309,7 +1403,7 @@ public final class Pattern
// Add segments before each match found
while(m.find()) {
if (!matchLimited || matchList.size() < limit - 1) {
if (!matchLimited || matchCount < limit - 1) {
if (index == 0 && index == m.start() && m.start() == m.end()) {
// no empty leading substring included for zero-width match
// at the beginning of the input char sequence.
@ -1318,11 +1412,15 @@ public final class Pattern
String match = input.subSequence(index, m.start()).toString();
matchList.add(match);
index = m.end();
} else if (matchList.size() == limit - 1) { // last one
String match = input.subSequence(index,
input.length()).toString();
if (withDelimiters) {
matchList.add(input.subSequence(m.start(), index).toString());
}
++matchCount;
} else if (matchCount == limit - 1) { // last one
String match = input.subSequence(index, input.length()).toString();
matchList.add(match);
index = m.end();
++matchCount;
}
}
@ -1331,14 +1429,16 @@ public final class Pattern
return new String[] {input.toString()};
// Add remaining segment
if (!matchLimited || matchList.size() < limit)
if (!matchLimited || matchCount < limit)
matchList.add(input.subSequence(index, input.length()).toString());
// Construct result
int resultSize = matchList.size();
if (limit == 0)
while (resultSize > 0 && matchList.get(resultSize-1).isEmpty())
if (limit == 0) {
while (resultSize > 0 && matchList.get(resultSize-1).isEmpty()) {
resultSize--;
}
}
String[] result = new String[resultSize];
return matchList.subList(0, resultSize).toArray(result);
}
@ -1378,7 +1478,7 @@ public final class Pattern
* around matches of this pattern
*/
public String[] split(CharSequence input) {
return split(input, 0);
return split(input, 0, false);
}
/**