mirror of
https://github.com/php/php-src.git
synced 2025-08-16 05:58:45 +02:00
Implement conditional casing for Greek letter sigma when title-casing text
This commit is contained in:
parent
290efe842d
commit
a90358639d
3 changed files with 68 additions and 2 deletions
|
@ -427,12 +427,34 @@ MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, cons
|
||||||
*p++ = w;
|
*p++ = w;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
uint32_t w2 = title_mode ? php_unicode_tolower_raw(w, src_encoding) : php_unicode_totitle_raw(w, src_encoding);
|
uint32_t w2;
|
||||||
|
if (title_mode) {
|
||||||
|
if (w == 0x3A3) {
|
||||||
|
int j = i - 1;
|
||||||
|
while (j >= 0 && php_unicode_is_case_ignorable(wchar_buf[j])) {
|
||||||
|
j--;
|
||||||
|
}
|
||||||
|
if (j >= 0 ? php_unicode_is_cased(wchar_buf[j]) : scan_back_for_cased_letter(p, converted_end)) {
|
||||||
|
j = i + 1;
|
||||||
|
while (j < out_len && php_unicode_is_case_ignorable(wchar_buf[j])) {
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
if (j >= out_len ? !scan_ahead_for_cased_letter(in, in_len, state, src_encoding) : !php_unicode_is_cased(wchar_buf[j])) {
|
||||||
|
*p++ = 0x3C2;
|
||||||
|
goto set_title_mode;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
w2 = php_unicode_tolower_raw(w, src_encoding);
|
||||||
|
} else {
|
||||||
|
w2 = php_unicode_totitle_raw(w, src_encoding);
|
||||||
|
}
|
||||||
if (UNEXPECTED(w2 > 0xFFFFFF)) {
|
if (UNEXPECTED(w2 > 0xFFFFFF)) {
|
||||||
p = emit_special_casing_sequence(w2, p);
|
p = emit_special_casing_sequence(w2, p);
|
||||||
} else {
|
} else {
|
||||||
*p++ = w2;
|
*p++ = w2;
|
||||||
}
|
}
|
||||||
|
set_title_mode:
|
||||||
if (!php_unicode_is_case_ignorable(w)) {
|
if (!php_unicode_is_case_ignorable(w)) {
|
||||||
title_mode = php_unicode_is_cased(w);
|
title_mode = php_unicode_is_cased(w);
|
||||||
}
|
}
|
||||||
|
|
|
@ -115,7 +115,7 @@ dd
|
||||||
dd
|
dd
|
||||||
69
|
69
|
||||||
69
|
69
|
||||||
Καλησπερα Σασ
|
Καλησπερα Σας
|
||||||
Καλησπερα Σασ
|
Καλησπερα Σασ
|
||||||
καλησπερα σας
|
καλησπερα σας
|
||||||
καλησπερα σασ
|
καλησπερα σασ
|
||||||
|
|
|
@ -21,6 +21,33 @@ try {
|
||||||
echo $e->getMessage() . \PHP_EOL;
|
echo $e->getMessage() . \PHP_EOL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
echo "\n-- Greek letter sigma --\n";
|
||||||
|
var_dump(mb_convert_case("Σ", MB_CASE_TITLE, 'UTF-8'));
|
||||||
|
var_dump(mb_convert_case("aΣ", MB_CASE_TITLE, 'UTF-8'));
|
||||||
|
var_dump(mb_convert_case("aΣb", MB_CASE_TITLE, 'UTF-8'));
|
||||||
|
var_dump(mb_convert_case("aΣ b", MB_CASE_TITLE, 'UTF-8'));
|
||||||
|
var_dump(mb_convert_case(" ΣΣΣΣ ", MB_CASE_TITLE, 'UTF-8'));
|
||||||
|
|
||||||
|
// Apostrophe, full stop, colon, etc. are "case-ignorable"
|
||||||
|
// When checking whether capital sigma is at the end of a word or not, we skip over
|
||||||
|
// any number of case-ignorable characters, both when scanning back and when scanning forward
|
||||||
|
var_dump(mb_convert_case("'Σ", MB_CASE_TITLE, 'UTF-8'));
|
||||||
|
var_dump(mb_convert_case("ab'Σ", MB_CASE_TITLE, 'UTF-8'));
|
||||||
|
var_dump(mb_convert_case("Σ'", MB_CASE_TITLE, 'UTF-8'));
|
||||||
|
var_dump(mb_convert_case("Σ'a", MB_CASE_TITLE, 'UTF-8'));
|
||||||
|
var_dump(mb_convert_case("a'Σ'a", MB_CASE_TITLE, 'UTF-8'));
|
||||||
|
|
||||||
|
// We scan back by at least 63 characters when necessary,
|
||||||
|
// but there is no guarantee that we will scan back further than that
|
||||||
|
var_dump(mb_convert_case('a' . str_repeat('.', 63) . "Σ", MB_CASE_TITLE, 'UTF-8'));
|
||||||
|
var_dump(mb_convert_case('a' . str_repeat('.', 64) . "Σ", MB_CASE_TITLE, 'UTF-8')); // Context-sensitive casing doesn't work here!
|
||||||
|
|
||||||
|
// When scanning forward to confirm if capital sigma is at the end of a word or not,
|
||||||
|
// there is no limit as to how far we will scan
|
||||||
|
var_dump(mb_convert_case("abcΣ" . str_repeat('.', 64) . ' abc', MB_CASE_TITLE, 'UTF-8'));
|
||||||
|
var_dump(mb_convert_case("abcΣ" . str_repeat('.', 64) . 'a abc', MB_CASE_TITLE, 'UTF-8'));
|
||||||
|
var_dump(mb_convert_case("abcΣ" . str_repeat('.', 256) . ' abc', MB_CASE_TITLE, 'UTF-8'));
|
||||||
|
|
||||||
/* Regression test for new implementation;
|
/* Regression test for new implementation;
|
||||||
* When converting a codepoint, if we overwrite it with the converted version before
|
* When converting a codepoint, if we overwrite it with the converted version before
|
||||||
* checking whether we should shift in/out of 'title mode', then the conversion will be incorrect */
|
* checking whether we should shift in/out of 'title mode', then the conversion will be incorrect */
|
||||||
|
@ -38,5 +65,22 @@ string(13) "foo bar spaß"
|
||||||
string(13) "Foo Bar Spaß"
|
string(13) "Foo Bar Spaß"
|
||||||
string(13) "foo bar spaß"
|
string(13) "foo bar spaß"
|
||||||
mb_convert_case(): Argument #2 ($mode) must be one of the MB_CASE_* constants
|
mb_convert_case(): Argument #2 ($mode) must be one of the MB_CASE_* constants
|
||||||
|
|
||||||
|
-- Greek letter sigma --
|
||||||
|
string(2) "Σ"
|
||||||
|
string(3) "Aς"
|
||||||
|
string(4) "Aσb"
|
||||||
|
string(5) "Aς B"
|
||||||
|
string(10) " Σσσς "
|
||||||
|
string(3) "'Σ"
|
||||||
|
string(5) "Ab'ς"
|
||||||
|
string(3) "Σ'"
|
||||||
|
string(4) "Σ'a"
|
||||||
|
string(6) "A'σ'a"
|
||||||
|
string(66) "A...............................................................ς"
|
||||||
|
string(67) "A................................................................σ"
|
||||||
|
string(73) "Abcς................................................................ Abc"
|
||||||
|
string(74) "Abcσ................................................................a Abc"
|
||||||
|
string(265) "Abcς................................................................................................................................................................................................................................................................ Abc"
|
||||||
string(12) "02bc004e012d"
|
string(12) "02bc004e012d"
|
||||||
string(8) "0149012d"
|
string(8) "0149012d"
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue