diff --git a/src/java.base/share/classes/java/util/Locale.java b/src/java.base/share/classes/java/util/Locale.java
index 1215f966531..368ba3854fb 100644
--- a/src/java.base/share/classes/java/util/Locale.java
+++ b/src/java.base/share/classes/java/util/Locale.java
@@ -1689,6 +1689,58 @@ public final class Locale implements Cloneable, Serializable {
         return langTag;
     }
 
+    /**
+     * {@return a case folded IETF BCP 47 language tag}
+     *
+     * <p>This method formats a language tag into one with case convention
+     * that adheres to section 2.1.1. Formatting of Language Tags of RFC5646.
+     * This format is defined as: <i>All subtags, including extension and private
+     * use subtags, use lowercase letters with two exceptions: two-letter
+     * and four-letter subtags that neither appear at the start of the tag
+     * nor occur after singletons. Such two-letter subtags are all
+     * uppercase (as in the tags "en-CA-x-ca" or "sgn-BE-FR") and four-
+     * letter subtags are titlecase (as in the tag "az-Latn-x-latn").</i> As
+     * legacy tags (defined as "grandfathered" in RFC5646) are not always well-formed, this method
+     * will simply case fold a legacy tag to match the exact case convention
+     * for the particular tag specified in the respective
+     * {@link ##legacy_tags Legacy tags} table.
+     *
+     * <h4>Special Exceptions</h4>
+     * <p>To maintain consistency with {@link ##def_variant variant}
+     * which is case-sensitive, this method will neither case fold variant
+     * subtags nor case fold private use subtags prefixed by {@code lvariant}.
+     *
+     * <p>For example,
+     * {@snippet lang=java :
+     * String tag = "ja-kana-jp-x-lvariant-Oracle-JDK-Standard-Edition";
+     * Locale.caseFoldLanguageTag(tag); // returns "ja-Kana-JP-x-lvariant-Oracle-JDK-Standard-Edition"
+     * String tag2 = "ja-kana-jp-x-Oracle-JDK-Standard-Edition";
+     * Locale.caseFoldLanguageTag(tag2); // returns "ja-Kana-JP-x-oracle-jdk-standard-edition"
+     * }
+     *
+     * <p>Excluding case folding, this method makes no modifications to the tag itself.
+     * Case convention of language tags does not carry meaning, and is simply
+     * recommended as it corresponds with various ISO standards, including:
+     * ISO639-1, ISO15924, and ISO3166-1.
+     *
+     * <p>As the formatting of the case convention is dependent on the
+     * positioning of certain subtags, callers of this method should ensure
+     * that the language tag is well-formed (conforming to section 2.1. Syntax
+     * of RFC5646).
+     *
+     * @param languageTag the IETF BCP 47 language tag.
+     * @throws IllformedLocaleException if {@code languageTag} is not well-formed
+     * @throws NullPointerException if {@code languageTag} is {@code null}
+     * @spec https://www.rfc-editor.org/rfc/rfc5646.html#section-2.1
+     *       RFC5646 2.1. Syntax
+     * @spec https://www.rfc-editor.org/rfc/rfc5646.html#section-2.1.1
+     *       RFC5646 2.1.1. Formatting of Language Tags
+     * @since 21
+     */
+    public static String caseFoldLanguageTag(String languageTag) {
+        return LanguageTag.caseFoldTag(languageTag);
+    }
+
     /**
      * Returns a locale for the specified IETF BCP 47 language tag string.
      *
@@ -1748,7 +1800,7 @@ public final class Locale implements Cloneable, Serializable { * // returns "th-TH-u-nu-thai-x-lvariant-TH" * * - *

This implements the 'Language-Tag' production of BCP47, and + *

This implements the 'Language-Tag' production of BCP47, and * so supports legacy (regular and irregular, referred to as * "Type: grandfathered" in BCP47) as well as * private use language tags. Stand alone private use tags are diff --git a/src/java.base/share/classes/sun/util/locale/LanguageTag.java b/src/java.base/share/classes/sun/util/locale/LanguageTag.java index 1ba11d9dca6..0c38290c131 100644 --- a/src/java.base/share/classes/sun/util/locale/LanguageTag.java +++ b/src/java.base/share/classes/sun/util/locale/LanguageTag.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010, 2020, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2010, 2023, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -34,7 +34,9 @@ package sun.util.locale; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; +import java.util.IllformedLocaleException; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Set; import java.util.StringJoiner; @@ -59,7 +61,6 @@ public class LanguageTag { private List extlangs = Collections.emptyList(); // extlang subtags private List variants = Collections.emptyList(); // variant subtags private List extensions = Collections.emptyList(); // extensions - // Map contains legacy language tags and its preferred mappings from // http://www.ietf.org/rfc/rfc5646.txt // Keys are lower-case strings. 
@@ -208,7 +209,6 @@ public class LanguageTag { tag.parseExtensions(itr, sts); } tag.parsePrivateuse(itr, sts); - if (!itr.isDone() && !sts.isError()) { String s = itr.current(); sts.errorIndex = itr.currentStart(); @@ -218,7 +218,6 @@ public class LanguageTag { sts.errorMsg = "Invalid subtag: " + s; } } - return tag; }
@@ -414,6 +413,73 @@ public class LanguageTag {
         return found;
     }
 
+    /**
+     * Case folds a language tag following the convention of RFC 5646 section 2.1.1.
+     *
+     * <p>A tag found in {@code LEGACY} is returned as the first element of its
+     * entry. Otherwise each subtag is lower-cased, with these exceptions: a
+     * subtag that is not first and precedes any extension singleton or private
+     * use prefix is passed through {@code canonicalizeRegion} or
+     * {@code canonicalizeScript} when it is a region or script, and is copied
+     * unchanged when it is a variant; private use subtags that follow
+     * {@code PRIVUSE_VARIANT_PREFIX} are also copied unchanged.
+     *
+     * @param tag the language tag to case fold
+     * @return the case folded language tag
+     * @throws IllformedLocaleException if parsing {@code tag} reports an error
+     */
+    public static String caseFoldTag(String tag) {
+        ParseStatus sts = new ParseStatus();
+        parse(tag, sts);
+        // Ill-formed tags: parse() reports problems through the status object
+        if (sts.errorMsg != null) {
+            throw new IllformedLocaleException("Ill formed tag: " + sts.errorMsg);
+        }
+        // Legacy tags: LEGACY is keyed by lower-case strings, so one lookup suffices
+        String[] legacy = LEGACY.get(tag.toLowerCase(Locale.ROOT));
+        if (legacy != null) {
+            return legacy[0];
+        }
+        // Non-legacy tags
+        String[] subtags = tag.split("-");
+        StringBuilder bldr = new StringBuilder(tag.length());
+        boolean privateFound = false;
+        boolean singletonFound = false;
+        boolean privUseVarFound = false;
+        for (int i = 0; i < subtags.length; i++) {
+            String subtag = subtags[i];
+            if (i > 0) {
+                bldr.append('-');
+            }
+            // Region, script and variant rules apply only to subtags that are
+            // neither first nor located after a singleton or private use prefix
+            boolean positional = i > 0 && !singletonFound && !privateFound;
+            if (privUseVarFound) {
+                bldr.append(subtag);
+            } else if (positional && isVariant(subtag)) {
+                bldr.append(subtag);
+            } else if (positional && isRegion(subtag)) {
+                bldr.append(canonicalizeRegion(subtag));
+            } else if (positional && isScript(subtag)) {
+                bldr.append(canonicalizeScript(subtag));
+            } else {
+                // Everything else is lower-cased
+                if (isPrivateusePrefix(subtag)) {
+                    privateFound = true;
+                } else if (isExtensionSingleton(subtag)) {
+                    singletonFound = true;
+                } else if (privateFound && subtag.equals(PRIVUSE_VARIANT_PREFIX)) {
+                    // Only a private use variant prefix starts the case-preserved
+                    // section; the same text inside an extension is an ordinary
+                    // extension subtag and must not stop case folding
+                    privUseVarFound = true;
+                }
+                bldr.append(subtag.toLowerCase(Locale.ROOT));
+            }
+        }
+        return bldr.toString();
+    }
+
     public static LanguageTag
parseLocale(BaseLocale baseLocale, LocaleExtensions localeExtensions) { LanguageTag tag = new LanguageTag(); diff --git a/test/jdk/java/util/Locale/CaseFoldLanguageTagTest.java b/test/jdk/java/util/Locale/CaseFoldLanguageTagTest.java new file mode 100644 index 00000000000..fdee5075229 --- /dev/null +++ b/test/jdk/java/util/Locale/CaseFoldLanguageTagTest.java @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */
+
+/*
+ * @test
+ * @bug 8159337
+ * @summary Test Locale.caseFoldLanguageTag(String languageTag)
+ * @run junit CaseFoldLanguageTagTest
+ */
+
+import org.junit.jupiter.api.AfterAll;
+import org.junit.jupiter.api.Test;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+import java.util.IllformedLocaleException;
+import java.util.Locale;
+import java.util.stream.Stream;
+
+/**
+ * Test the implementation of Locale.caseFoldLanguageTag(String languageTag).
+ * A variety of well-formed tags are tested, composed of the following subtags:
+ * language, extlang, script, region, variant, extension, singleton, privateuse,
+ * grandfathered, and irregular. For more info, see RFC 5646 section 2.1,
+ * <a href="https://www.rfc-editor.org/rfc/rfc5646.html#section-2.1">Syntax</a>.
+ * In addition, the method is tested to ensure that IllformedLocaleException and
+ * NullPointerException are thrown given the right circumstances.
+ */
+public class CaseFoldLanguageTagTest {
+
+    @ParameterizedTest
+    @MethodSource("wellFormedTags")
+    public void wellFormedTags(String tag, String foldedTag) {
+        assertEquals(foldedTag, Locale.caseFoldLanguageTag(tag), String.format("Folded %s", tag));
+    }
+
+    @ParameterizedTest
+    @MethodSource("illFormedTags")
+    public void illFormedTags(String tag) {
+        assertThrows(IllformedLocaleException.class, () ->
+                Locale.caseFoldLanguageTag(tag));
+    }
+
+    @Test
+    public void throwNPE() {
+        assertThrows(NullPointerException.class, () ->
+                Locale.caseFoldLanguageTag(null));
+    }
+
+    private static Stream<Arguments> wellFormedTags() {
+        return Stream.of(
+                // langtag tests
+                // language
+                Arguments.of("AB", "ab"),
+                // language - ext
+                Arguments.of("AB-ABC", "ab-abc"),
+                // language - ext - script
+                Arguments.of("AB-ABC-ABCD", "ab-abc-Abcd"),
+                // language - ext - script - region
+                Arguments.of("AB-ABC-ABCD-ab", "ab-abc-Abcd-AB"),
+                // language - region
+                Arguments.of("AB-ab", "ab-AB"),
+                // language - script
+                Arguments.of("AB-aBCD", "ab-Abcd"),
+                // language - private use
+                Arguments.of("AB-X-AB-ABCD", "ab-x-ab-abcd"),
+                // language - ext - script - region - variant
+                Arguments.of("AB-ABC-ABCD-ab-ABCDE", "ab-abc-Abcd-AB-ABCDE"),
+                // language - ext - script - region - variant x 2
+                Arguments.of("AB-ABC-ABCD-ab-ABCDE-fghij",
+                        "ab-abc-Abcd-AB-ABCDE-fghij"),
+                // language - ext - script - region - variant - extension
+                Arguments.of("AB-ABC-ABCD-ab-ABCDE-A-ABCD",
+                        "ab-abc-Abcd-AB-ABCDE-a-abcd"),
+                // language - ext - script - region - variant - private
+                Arguments.of("AB-ABC-ABCD-ab-ABCDE-X-ABCD",
+                        "ab-abc-Abcd-AB-ABCDE-x-abcd"),
+                // language - ext - script - region - variant - extension x2
+                Arguments.of("AB-ABC-ABCD-ab-ABCDE-A-ABCD-B-EFGHI",
+                        "ab-abc-Abcd-AB-ABCDE-a-abcd-b-efghi"),
+                // language - ext - script - region - variant - extension - private
+                Arguments.of("AB-ABC-ABCD-ab-ABCDE-A-ABCD-X-ABCD",
+                        "ab-abc-Abcd-AB-ABCDE-a-abcd-x-abcd"),
+                // language - ext - script - region - variant - extension - private (x2 subtags)
+                Arguments.of("AB-ABC-ABCD-ab-ABCDE-A-ABCD-X-ABCD-EFGHI",
+                        "ab-abc-Abcd-AB-ABCDE-a-abcd-x-abcd-efghi"),
+                // language - variant x2 - extension x3 - private
+                Arguments.of("AB-aBcDeF-GhIjKl-a-ABC-DEFGH-B-ABC-C-ABC-X-A-ABC-DEF",
+                        "ab-aBcDeF-GhIjKl-a-abc-defgh-b-abc-c-abc-x-a-abc-def"),
+                // language - ext - script - region - variant - extension x2 - private (x2 ext)
+                Arguments.of("AB-ABC-ABCD-ab-abCDe12-A-AB-B-ABCD-X-AB-ABCD",
+                        "ab-abc-Abcd-AB-abCDe12-a-ab-b-abcd-x-ab-abcd"),
+
+                // Multiple singleton extensions
+                Arguments.of("AB-ABC-ABCD-ab-ABCDE-A-ABCD-GGG-ZZZ-B-EFGHI",
+                        "ab-abc-Abcd-AB-ABCDE-a-abcd-ggg-zzz-b-efghi"),
+
+                // private use tests
+                Arguments.of("X-Abc", "x-abc"), // regular private
+                Arguments.of("X-A-ABC", "x-a-abc"), // private w/ extended (incl. 1)
+                Arguments.of("X-A-AB-Abcd", "x-a-ab-abcd"), // private w/ extended (incl. 1, 2, 4)
+
+                // Legacy tests
+                // irregular
+                Arguments.of("I-AMI", "i-ami"),
+                Arguments.of("EN-gb-OED", "en-GB-oed"),
+                Arguments.of("SGN-be-fr", "sgn-BE-FR"),
+                // regular
+                Arguments.of("NO-BOK", "no-bok"),
+                Arguments.of("CEL-GAULISH", "cel-gaulish"),
+                Arguments.of("ZH-MIN-NAN", "zh-min-nan"),
+
+                // Special JDK Cases (Variant and x-lvariant)
+                Arguments.of("de-POSIX-x-URP-lvariant-Abc-Def", "de-POSIX-x-urp-lvariant-Abc-Def"),
+                Arguments.of("JA-JPAN-JP-U-CA-JAPANESE-x-RANDOM-lvariant-JP",
+                        "ja-Jpan-JP-u-ca-japanese-x-random-lvariant-JP"),
+                Arguments.of("ja-JP-u-ca-japanese-x-lvariant-JP", "ja-JP-u-ca-japanese-x-lvariant-JP"),
+                Arguments.of("XX-ABCD-yy-VARIANT-x-TEST-lvariant-JDK",
+                        "xx-Abcd-YY-VARIANT-x-test-lvariant-JDK"),
+                Arguments.of("ja-kana-jp-x-lvariant-Oracle-JDK-Standard-Edition",
+                        "ja-Kana-JP-x-lvariant-Oracle-JDK-Standard-Edition"),
+                Arguments.of("ja-kana-jp-x-Oracle-JDK-Standard-Edition",
+                        "ja-Kana-JP-x-oracle-jdk-standard-edition"),
+                Arguments.of("ja-kana-jp-a-ABC-EFG-ZZZ-b-aaa-x-Oracle-JDK-Standard-Edition",
+                        "ja-Kana-JP-a-abc-efg-zzz-b-aaa-x-oracle-jdk-standard-edition")
+        );
+    }
+
+    private static Stream<Arguments> illFormedTags() {
+        return Stream.of(
+                // Starts with non-language
+                Arguments.of("xabadadoo-me"),
+                // Starts with singleton
+                Arguments.of("a-abc"),
+                Arguments.of("a-singleton-en-us"),
+                // Hanging dash
+                Arguments.of("en-"),
+                // Double dash
+                Arguments.of("en--US"),
+                // Script before ext lang
+                Arguments.of("ab-Script-ext"),
+                // Region before ext lang
+                Arguments.of("ab-AB-ext"),
+                // Variants at start
+                Arguments.of("variant-first-ab")
+        );
+    }
+}