diff --git a/ext/mbstring/php_unicode.c b/ext/mbstring/php_unicode.c index 546096bf326..af48ba4287f 100644 --- a/ext/mbstring/php_unicode.c +++ b/ext/mbstring/php_unicode.c @@ -427,12 +427,34 @@ MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, cons *p++ = w; continue; } - uint32_t w2 = title_mode ? php_unicode_tolower_raw(w, src_encoding) : php_unicode_totitle_raw(w, src_encoding); + uint32_t w2; + if (title_mode) { + if (w == 0x3A3) { + int j = i - 1; + while (j >= 0 && php_unicode_is_case_ignorable(wchar_buf[j])) { + j--; + } + if (j >= 0 ? php_unicode_is_cased(wchar_buf[j]) : scan_back_for_cased_letter(p, converted_end)) { + j = i + 1; + while (j < out_len && php_unicode_is_case_ignorable(wchar_buf[j])) { + j++; + } + if (j >= out_len ? !scan_ahead_for_cased_letter(in, in_len, state, src_encoding) : !php_unicode_is_cased(wchar_buf[j])) { + *p++ = 0x3C2; + goto set_title_mode; + } + } + } + w2 = php_unicode_tolower_raw(w, src_encoding); + } else { + w2 = php_unicode_totitle_raw(w, src_encoding); + } if (UNEXPECTED(w2 > 0xFFFFFF)) { p = emit_special_casing_sequence(w2, p); } else { *p++ = w2; } +set_title_mode: if (!php_unicode_is_case_ignorable(w)) { title_mode = php_unicode_is_cased(w); } diff --git a/ext/mbstring/tests/casemapping.phpt b/ext/mbstring/tests/casemapping.phpt index 85a21494a6f..050ebc94e25 100644 --- a/ext/mbstring/tests/casemapping.phpt +++ b/ext/mbstring/tests/casemapping.phpt @@ -115,7 +115,7 @@ dd dd 69 69 -Καλησπερα Σασ +Καλησπερα Σας Καλησπερα Σασ καλησπερα σας καλησπερα σασ diff --git a/ext/mbstring/tests/mb_convert_case_various_mode.phpt b/ext/mbstring/tests/mb_convert_case_various_mode.phpt index 10c5bdb3aa6..70dcbfcd66a 100644 --- a/ext/mbstring/tests/mb_convert_case_various_mode.phpt +++ b/ext/mbstring/tests/mb_convert_case_various_mode.phpt @@ -21,6 +21,33 @@ try { echo $e->getMessage() . \PHP_EOL; } +echo "\n-- Greek letter sigma --\n"; +var_dump(mb_convert_case("Σ", MB_CASE_TITLE, 'UTF-8')); +var_dump(mb_convert_case("aΣ", MB_CASE_TITLE, 'UTF-8')); +var_dump(mb_convert_case("aΣb", MB_CASE_TITLE, 'UTF-8')); +var_dump(mb_convert_case("aΣ b", MB_CASE_TITLE, 'UTF-8')); +var_dump(mb_convert_case(" ΣΣΣΣ ", MB_CASE_TITLE, 'UTF-8')); + +// Apostrophe, full stop, colon, etc. are "case-ignorable" +// When checking whether capital sigma is at the end of a word or not, we skip over +// any number of case-ignorable characters, both when scanning back and when scanning forward +var_dump(mb_convert_case("'Σ", MB_CASE_TITLE, 'UTF-8')); +var_dump(mb_convert_case("ab'Σ", MB_CASE_TITLE, 'UTF-8')); +var_dump(mb_convert_case("Σ'", MB_CASE_TITLE, 'UTF-8')); +var_dump(mb_convert_case("Σ'a", MB_CASE_TITLE, 'UTF-8')); +var_dump(mb_convert_case("a'Σ'a", MB_CASE_TITLE, 'UTF-8')); + +// We scan back by at least 63 characters when necessary, +// but there is no guarantee that we will scan back further than that +var_dump(mb_convert_case('a' . str_repeat('.', 63) . "Σ", MB_CASE_TITLE, 'UTF-8')); +var_dump(mb_convert_case('a' . str_repeat('.', 64) . "Σ", MB_CASE_TITLE, 'UTF-8')); // Context-sensitive casing doesn't work here! + +// When scanning forward to confirm if capital sigma is at the end of a word or not, +// there is no limit as to how far we will scan +var_dump(mb_convert_case("abcΣ" . str_repeat('.', 64) . ' abc', MB_CASE_TITLE, 'UTF-8')); +var_dump(mb_convert_case("abcΣ" . str_repeat('.', 64) . 'a abc', MB_CASE_TITLE, 'UTF-8')); +var_dump(mb_convert_case("abcΣ" . str_repeat('.', 256) . ' abc', MB_CASE_TITLE, 'UTF-8')); + /* Regression test for new implementation; * When converting a codepoint, if we overwrite it with the converted version before * checking whether we should shift in/out of 'title mode', then the conversion will be incorrect */ @@ -38,5 +65,22 @@ string(13) "foo bar spaß" string(13) "Foo Bar Spaß" string(13) "foo bar spaß" mb_convert_case(): Argument #2 ($mode) must be one of the MB_CASE_* constants + +-- Greek letter sigma -- +string(2) "Σ" +string(3) "Aς" +string(4) "Aσb" +string(5) "Aς B" +string(10) " Σσσς " +string(3) "'Σ" +string(5) "Ab'ς" +string(3) "Σ'" +string(4) "Σ'a" +string(6) "A'σ'a" +string(66) "A...............................................................ς" +string(67) "A................................................................σ" +string(73) "Abcς................................................................ Abc" +string(74) "Abcσ................................................................a Abc" +string(265) "Abcς................................................................................................................................................................................................................................................................ Abc" string(12) "02bc004e012d" string(8) "0149012d"