mirror of
https://github.com/php/php-src.git
synced 2025-08-16 05:58:45 +02:00
Implement conditional casing for Greek letter sigma when title-casing text
This commit is contained in:
parent
290efe842d
commit
a90358639d
3 changed files with 68 additions and 2 deletions
|
@ -427,12 +427,34 @@ MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, cons
|
|||
*p++ = w;
|
||||
continue;
|
||||
}
|
||||
uint32_t w2 = title_mode ? php_unicode_tolower_raw(w, src_encoding) : php_unicode_totitle_raw(w, src_encoding);
|
||||
uint32_t w2;
|
||||
if (title_mode) {
|
||||
if (w == 0x3A3) {
|
||||
int j = i - 1;
|
||||
while (j >= 0 && php_unicode_is_case_ignorable(wchar_buf[j])) {
|
||||
j--;
|
||||
}
|
||||
if (j >= 0 ? php_unicode_is_cased(wchar_buf[j]) : scan_back_for_cased_letter(p, converted_end)) {
|
||||
j = i + 1;
|
||||
while (j < out_len && php_unicode_is_case_ignorable(wchar_buf[j])) {
|
||||
j++;
|
||||
}
|
||||
if (j >= out_len ? !scan_ahead_for_cased_letter(in, in_len, state, src_encoding) : !php_unicode_is_cased(wchar_buf[j])) {
|
||||
*p++ = 0x3C2;
|
||||
goto set_title_mode;
|
||||
}
|
||||
}
|
||||
}
|
||||
w2 = php_unicode_tolower_raw(w, src_encoding);
|
||||
} else {
|
||||
w2 = php_unicode_totitle_raw(w, src_encoding);
|
||||
}
|
||||
if (UNEXPECTED(w2 > 0xFFFFFF)) {
|
||||
p = emit_special_casing_sequence(w2, p);
|
||||
} else {
|
||||
*p++ = w2;
|
||||
}
|
||||
set_title_mode:
|
||||
if (!php_unicode_is_case_ignorable(w)) {
|
||||
title_mode = php_unicode_is_cased(w);
|
||||
}
|
||||
|
|
|
@ -115,7 +115,7 @@ dd
|
|||
dd
|
||||
69
|
||||
69
|
||||
Καλησπερα Σασ
|
||||
Καλησπερα Σας
|
||||
Καλησπερα Σασ
|
||||
καλησπερα σας
|
||||
καλησπερα σασ
|
||||
|
|
|
@ -21,6 +21,33 @@ try {
|
|||
echo $e->getMessage() . \PHP_EOL;
|
||||
}
|
||||
|
||||
echo "\n-- Greek letter sigma --\n";
|
||||
var_dump(mb_convert_case("Σ", MB_CASE_TITLE, 'UTF-8'));
|
||||
var_dump(mb_convert_case("aΣ", MB_CASE_TITLE, 'UTF-8'));
|
||||
var_dump(mb_convert_case("aΣb", MB_CASE_TITLE, 'UTF-8'));
|
||||
var_dump(mb_convert_case("aΣ b", MB_CASE_TITLE, 'UTF-8'));
|
||||
var_dump(mb_convert_case(" ΣΣΣΣ ", MB_CASE_TITLE, 'UTF-8'));
|
||||
|
||||
// Apostrophe, full stop, colon, etc. are "case-ignorable"
|
||||
// When checking whether capital sigma is at the end of a word or not, we skip over
|
||||
// any number of case-ignorable characters, both when scanning back and when scanning forward
|
||||
var_dump(mb_convert_case("'Σ", MB_CASE_TITLE, 'UTF-8'));
|
||||
var_dump(mb_convert_case("ab'Σ", MB_CASE_TITLE, 'UTF-8'));
|
||||
var_dump(mb_convert_case("Σ'", MB_CASE_TITLE, 'UTF-8'));
|
||||
var_dump(mb_convert_case("Σ'a", MB_CASE_TITLE, 'UTF-8'));
|
||||
var_dump(mb_convert_case("a'Σ'a", MB_CASE_TITLE, 'UTF-8'));
|
||||
|
||||
// We scan back by at least 63 characters when necessary,
|
||||
// but there is no guarantee that we will scan back further than that
|
||||
var_dump(mb_convert_case('a' . str_repeat('.', 63) . "Σ", MB_CASE_TITLE, 'UTF-8'));
|
||||
var_dump(mb_convert_case('a' . str_repeat('.', 64) . "Σ", MB_CASE_TITLE, 'UTF-8')); // Context-sensitive casing doesn't work here!
|
||||
|
||||
// When scanning forward to confirm if capital sigma is at the end of a word or not,
|
||||
// there is no limit as to how far we will scan
|
||||
var_dump(mb_convert_case("abcΣ" . str_repeat('.', 64) . ' abc', MB_CASE_TITLE, 'UTF-8'));
|
||||
var_dump(mb_convert_case("abcΣ" . str_repeat('.', 64) . 'a abc', MB_CASE_TITLE, 'UTF-8'));
|
||||
var_dump(mb_convert_case("abcΣ" . str_repeat('.', 256) . ' abc', MB_CASE_TITLE, 'UTF-8'));
|
||||
|
||||
/* Regression test for new implementation;
|
||||
* When converting a codepoint, if we overwrite it with the converted version before
|
||||
* checking whether we should shift in/out of 'title mode', then the conversion will be incorrect */
|
||||
|
@ -38,5 +65,22 @@ string(13) "foo bar spaß"
|
|||
string(13) "Foo Bar Spaß"
|
||||
string(13) "foo bar spaß"
|
||||
mb_convert_case(): Argument #2 ($mode) must be one of the MB_CASE_* constants
|
||||
|
||||
-- Greek letter sigma --
|
||||
string(2) "Σ"
|
||||
string(3) "Aς"
|
||||
string(4) "Aσb"
|
||||
string(5) "Aς B"
|
||||
string(10) " Σσσς "
|
||||
string(3) "'Σ"
|
||||
string(5) "Ab'ς"
|
||||
string(3) "Σ'"
|
||||
string(4) "Σ'a"
|
||||
string(6) "A'σ'a"
|
||||
string(66) "A...............................................................ς"
|
||||
string(67) "A................................................................σ"
|
||||
string(73) "Abcς................................................................ Abc"
|
||||
string(74) "Abcσ................................................................a Abc"
|
||||
string(265) "Abcς................................................................................................................................................................................................................................................................ Abc"
|
||||
string(12) "02bc004e012d"
|
||||
string(8) "0149012d"
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue