diff --git a/NEWS b/NEWS index 75d2b9562e5..b3039bb92c1 100644 --- a/NEWS +++ b/NEWS @@ -47,6 +47,10 @@ PHP NEWS . mb_detect_encoding's "non-strict" mode now behaves as described in the documentation. Previously, it would return false if the very first byte of the input string was invalid in all candidate encodings. (Alex Dowad) + . mb_strtolower and mb_convert_case implement conditional + casing rules for the Greek letter sigma. For mb_convert_case, conditional + casing only applies to MB_CASE_LOWER and MB_CASE_TITLE modes, not to + MB_CASE_LOWER_SIMPLE and MB_CASE_TITLE_SIMPLE. (Alex Dowad) - Opcache: . Added start, restart and force restart time to opcache's diff --git a/UPGRADING b/UPGRADING index f3254ab6781..e5e4e7c210d 100644 --- a/UPGRADING +++ b/UPGRADING @@ -56,6 +56,12 @@ PHP 8.3 UPGRADE NOTES "buffer_size" => int See GH-9336 +- MBString: + . mb_strtolower and mb_convert_case implement conditional + casing rules for the Greek letter sigma. For mb_convert_case, conditional + casing only applies to MB_CASE_LOWER and MB_CASE_TITLE modes, not to + MB_CASE_LOWER_SIMPLE and MB_CASE_TITLE_SIMPLE. (Alex Dowad) + - Standard: . E_NOTICEs emitted by unserialized() have been promoted to E_WARNING. 
RFC: https://wiki.php.net/rfc/improve_unserialize_error_handling diff --git a/ext/mbstring/php_unicode.c b/ext/mbstring/php_unicode.c index b1ffb06c31f..3e4b683924b 100644 --- a/ext/mbstring/php_unicode.c +++ b/ext/mbstring/php_unicode.c @@ -238,6 +238,45 @@ static uint32_t *emit_special_casing_sequence(uint32_t w, uint32_t *out) return out; } +/* Used when determining whether special casing rules should be applied to Greek letter sigma. Decodes the remaining input (`in`, `in_len` and `state` are by-value copies, so the caller's own position is untouched) and returns true if a cased codepoint is reached after skipping only case-ignorable codepoints; returns false if any other codepoint, or the end of the input, is reached first */ +static bool scan_ahead_for_cased_letter(unsigned char *in, size_t in_len, unsigned int state, const mbfl_encoding *encoding) +{ + uint32_t wchar_buf[64]; + + while (in_len) { + size_t out_len = encoding->to_wchar(&in, &in_len, wchar_buf, 64, &state); + ZEND_ASSERT(out_len <= 64); + for (unsigned int i = 0; i < out_len; i++) { + uint32_t w = wchar_buf[i]; + if (php_unicode_is_cased(w)) { + return true; + } + if (!php_unicode_is_case_ignorable(w)) { + return false; + } + } + } + + return false; +} + +/* Used when determining whether special casing rules should be applied to Greek letter sigma. Walks backwards over the codepoints in [begin, end) and returns true if a cased codepoint is reached after skipping only case-ignorable codepoints; returns false if any other codepoint, or `begin`, is reached first, or if `end` is NULL */ +static bool scan_back_for_cased_letter(uint32_t *begin, uint32_t *end) +{ + if (end != NULL) { + while (end > begin) { /* test before decrementing, so that a pointer before `begin` is never formed */ + uint32_t w = *--end; + if (php_unicode_is_cased(w)) { + return true; + } + if (!php_unicode_is_case_ignorable(w)) { + return false; + } + } + } + return false; +} + MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, const char *srcstr, size_t in_len, const mbfl_encoding *src_encoding, const mbfl_encoding *dst_encoding, int illegal_mode, uint32_t illegal_substchar) { /* A Unicode codepoint can expand out to up to 3 codepoints when uppercased, lowercased, or title cased @@ -246,6 +285,9 @@ MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, cons unsigned int state = 0, title_mode = 0; unsigned char *in = (unsigned char*)srcstr; enum mbfl_no_encoding enc = src_encoding->no_encoding; + /* In rare cases, we need to scan backwards through the previously converted codepoints to 
see + * if special conversion rules should be used for the Greek letter sigma */ + uint32_t *converted_end = NULL; mb_convert_buf buf; mb_convert_buf_init(&buf, in_len + 1, illegal_substchar, illegal_mode); @@ -315,6 +357,43 @@ MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, cons *p++ = w; continue; } + if (w == 0x3A3) { + /* For Greek capital letter sigma, there is a special casing rule; + * if it is the last letter in a word, it should be downcased to U+03C2 + * (GREEK SMALL LETTER FINAL SIGMA) + * Specifically, we need to check if this codepoint is preceded by any + * number of case-ignorable codepoints, preceded by a cased letter, AND + * is NOT followed by any number of case-ignorable codepoints followed + * by a cased letter. + * Ref: http://www.unicode.org/reports/tr21/tr21-5.html + * Ref: https://unicode.org/Public/UNIDATA/SpecialCasing.txt + * + * While the special casing rules say we should scan backwards through "any number" + * of case-ignorable codepoints, that is a great implementation burden + * It would basically mean we need to keep all the codepoints in a big buffer + * during this conversion operation, but we don't want to do that (to reduce the + * amount of temporary scratch memory used) + * Hence, we only scan back through the codepoints in wchar_buf, and if we hit the + * beginning of the buffer, whatever codepoints have not yet been overwritten in + * the latter part of converted_buf */ + int j = i - 1; + while (j >= 0 && php_unicode_is_case_ignorable(wchar_buf[j])) { + j--; + } + if (j >= 0 ? php_unicode_is_cased(wchar_buf[j]) : scan_back_for_cased_letter(p, converted_end)) { + /* Now scan ahead to look for a cased letter */ + j = i + 1; + while (j < out_len && php_unicode_is_case_ignorable(wchar_buf[j])) { + j++; + } + /* If we hit the end of wchar_buf, convert more of the input string into + * codepoints and continue scanning */ + if (j >= out_len ? 
!scan_ahead_for_cased_letter(in, in_len, state, src_encoding) : !php_unicode_is_cased(wchar_buf[j])) { + *p++ = 0x3C2; + continue; + } + } + } w = php_unicode_tolower_raw(w, enc); if (UNEXPECTED(w > 0xFFFFFF)) { p = emit_special_casing_sequence(w, p); @@ -362,6 +441,7 @@ MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, cons EMPTY_SWITCH_DEFAULT_CASE() } + converted_end = p; ZEND_ASSERT(p - converted_buf <= 192); dst_encoding->from_wchar(converted_buf, p - converted_buf, &buf, !in_len); } diff --git a/ext/mbstring/tests/casemapping.phpt b/ext/mbstring/tests/casemapping.phpt index 85f5140e2bf..85a21494a6f 100644 --- a/ext/mbstring/tests/casemapping.phpt +++ b/ext/mbstring/tests/casemapping.phpt @@ -58,6 +58,12 @@ echo bin2hex(mb_convert_case($str, MB_CASE_UPPER_SIMPLE)), "\n"; echo bin2hex(mb_convert_case($str, MB_CASE_FOLD)), "\n"; echo bin2hex(mb_convert_case($str, MB_CASE_FOLD_SIMPLE)), "\n"; +// Check handling of Greek letter capital sigma +echo mb_convert_case("ΚΑΛΗΣΠΕΡΑ ΣΑΣ", MB_CASE_TITLE, "UTF-8"), "\n"; +echo mb_convert_case("ΚΑΛΗΣΠΕΡΑ ΣΑΣ", MB_CASE_TITLE_SIMPLE, "UTF-8"), "\n"; +echo mb_convert_case("ΚΑΛΗΣΠΕΡΑ ΣΑΣ", MB_CASE_LOWER, "UTF-8"), "\n"; +echo mb_convert_case("ΚΑΛΗΣΠΕΡΑ ΣΑΣ", MB_CASE_LOWER_SIMPLE, "UTF-8"), "\n"; + ?> --EXPECT-- String: ß @@ -109,3 +115,7 @@ dd dd 69 69 +Καλησπερα Σασ +Καλησπερα Σασ +καλησπερα σας +καλησπερα σασ diff --git a/ext/mbstring/tests/mb_strtolower_basic.phpt b/ext/mbstring/tests/mb_strtolower_basic.phpt index debb42cd9ed..10b3282d33c 100644 --- a/ext/mbstring/tests/mb_strtolower_basic.phpt +++ b/ext/mbstring/tests/mb_strtolower_basic.phpt @@ -35,6 +35,33 @@ if ($mb == $greek_lower) { echo "Incorrectly converted\n"; } +echo "\n-- Greek letter sigma --\n"; +var_dump(mb_strtolower("Σ", 'UTF-8')); +var_dump(mb_strtolower("aΣ", 'UTF-8')); +var_dump(mb_strtolower("aΣb", 'UTF-8')); +var_dump(mb_strtolower("aΣ b", 'UTF-8')); +var_dump(mb_strtolower(" ΣΣΣΣ ", 'UTF-8')); + +// Apostrophe, full stop, 
colon, etc. are "case-ignorable" +// When checking whether capital sigma is at the end of a word or not, we skip over +// any number of case-ignorable characters, both when scanning back and when scanning forward +var_dump(mb_strtolower("'Σ", 'UTF-8')); +var_dump(mb_strtolower("ab'Σ", 'UTF-8')); +var_dump(mb_strtolower("Σ'", 'UTF-8')); +var_dump(mb_strtolower("Σ'a", 'UTF-8')); +var_dump(mb_strtolower("a'Σ'a", 'UTF-8')); + +// We scan back by at least 63 characters when necessary, +// but there is no guarantee that we will scan back further than that +var_dump(mb_strtolower('a' . str_repeat('.', 63) . "Σ", 'UTF-8')); +var_dump(mb_strtolower('a' . str_repeat('.', 64) . "Σ", 'UTF-8')); // Context-sensitive casing doesn't work here! + +// When scanning forward to confirm if capital sigma is at the end of a word or not, +// there is no limit as to how far we will scan +var_dump(mb_strtolower("abcΣ" . str_repeat('.', 64) . ' abc', 'UTF-8')); +var_dump(mb_strtolower("abcΣ" . str_repeat('.', 64) . 'a abc', 'UTF-8')); +var_dump(mb_strtolower("abcΣ" . str_repeat('.', 256) . ' abc', 'UTF-8')); + echo "Done"; ?> --EXPECT-- @@ -47,4 +74,21 @@ Correctly converted -- Multibyte String -- string(64) "zrHOss6zzrTOtc62zrfOuM65zrrOu868zr3Ovs6/z4DPgc+Dz4TPhc+Gz4fPiM+J" Correctly converted + +-- Greek letter sigma -- +string(2) "σ" +string(3) "aς" +string(4) "aσb" +string(5) "aς b" +string(10) " σσσς " +string(3) "'σ" +string(5) "ab'ς" +string(3) "σ'" +string(4) "σ'a" +string(6) "a'σ'a" +string(66) "a...............................................................ς" +string(67) "a................................................................σ" +string(73) "abcς................................................................ 
abc" +string(74) "abcσ................................................................a abc" +string(265) "abcς................................................................................................................................................................................................................................................................ abc" Done