Implement conditional casing for Greek letter sigma when title-casing text

This commit is contained in:
Alex Dowad 2023-01-09 13:21:31 +02:00
parent 290efe842d
commit a90358639d
3 changed files with 68 additions and 2 deletions

View file

@ -427,12 +427,34 @@ MBSTRING_API zend_string *php_unicode_convert_case(php_case_mode case_mode, cons
*p++ = w;
continue;
}
uint32_t w2 = title_mode ? php_unicode_tolower_raw(w, src_encoding) : php_unicode_totitle_raw(w, src_encoding);
uint32_t w2;
if (title_mode) {
if (w == 0x3A3) {
int j = i - 1;
while (j >= 0 && php_unicode_is_case_ignorable(wchar_buf[j])) {
j--;
}
if (j >= 0 ? php_unicode_is_cased(wchar_buf[j]) : scan_back_for_cased_letter(p, converted_end)) {
j = i + 1;
while (j < out_len && php_unicode_is_case_ignorable(wchar_buf[j])) {
j++;
}
if (j >= out_len ? !scan_ahead_for_cased_letter(in, in_len, state, src_encoding) : !php_unicode_is_cased(wchar_buf[j])) {
*p++ = 0x3C2;
goto set_title_mode;
}
}
}
w2 = php_unicode_tolower_raw(w, src_encoding);
} else {
w2 = php_unicode_totitle_raw(w, src_encoding);
}
if (UNEXPECTED(w2 > 0xFFFFFF)) {
p = emit_special_casing_sequence(w2, p);
} else {
*p++ = w2;
}
set_title_mode:
if (!php_unicode_is_case_ignorable(w)) {
title_mode = php_unicode_is_cased(w);
}

View file

@ -115,7 +115,7 @@ dd
dd
69
69
Καλησπερα Σασ
Καλησπερα Σας
Καλησπερα Σασ
καλησπερα σας
καλησπερα σασ

View file

@ -21,6 +21,33 @@ try {
echo $e->getMessage() . \PHP_EOL;
}
echo "\n-- Greek letter sigma --\n";
var_dump(mb_convert_case("Σ", MB_CASE_TITLE, 'UTF-8'));
var_dump(mb_convert_case("aΣ", MB_CASE_TITLE, 'UTF-8'));
var_dump(mb_convert_case("aΣb", MB_CASE_TITLE, 'UTF-8'));
var_dump(mb_convert_case("aΣ b", MB_CASE_TITLE, 'UTF-8'));
var_dump(mb_convert_case(" ΣΣΣΣ ", MB_CASE_TITLE, 'UTF-8'));
// Apostrophe, full stop, colon, etc. are "case-ignorable".
// When checking whether capital sigma is at the end of a word or not, we skip over
// any number of case-ignorable characters, both when scanning back and when scanning forward.
var_dump(mb_convert_case("'Σ", MB_CASE_TITLE, 'UTF-8'));
var_dump(mb_convert_case("ab'Σ", MB_CASE_TITLE, 'UTF-8'));
var_dump(mb_convert_case("Σ'", MB_CASE_TITLE, 'UTF-8'));
var_dump(mb_convert_case("Σ'a", MB_CASE_TITLE, 'UTF-8'));
var_dump(mb_convert_case("a'Σ'a", MB_CASE_TITLE, 'UTF-8'));
// We scan back by at least 63 characters when necessary,
// but there is no guarantee that we will scan back further than that
var_dump(mb_convert_case('a' . str_repeat('.', 63) . "Σ", MB_CASE_TITLE, 'UTF-8'));
var_dump(mb_convert_case('a' . str_repeat('.', 64) . "Σ", MB_CASE_TITLE, 'UTF-8')); // Context-sensitive casing doesn't work here!
// When scanning forward to confirm if capital sigma is at the end of a word or not,
// there is no limit as to how far we will scan
var_dump(mb_convert_case("abcΣ" . str_repeat('.', 64) . ' abc', MB_CASE_TITLE, 'UTF-8'));
var_dump(mb_convert_case("abcΣ" . str_repeat('.', 64) . 'a abc', MB_CASE_TITLE, 'UTF-8'));
var_dump(mb_convert_case("abcΣ" . str_repeat('.', 256) . ' abc', MB_CASE_TITLE, 'UTF-8'));
/* Regression test for new implementation:
* When converting a codepoint, if we overwrite it with the converted version before
* checking whether we should shift in/out of 'title mode', then the conversion will be incorrect. */
@ -38,5 +65,22 @@ string(13) "foo bar spaß"
string(13) "Foo Bar Spaß"
string(13) "foo bar spaß"
mb_convert_case(): Argument #2 ($mode) must be one of the MB_CASE_* constants
-- Greek letter sigma --
string(2) "Σ"
string(3) "Aς"
string(4) "Aσb"
string(5) "Aς B"
string(10) " Σσσς "
string(3) "'Σ"
string(5) "Ab'ς"
string(3) "Σ'"
string(4) "Σ'a"
string(6) "A'σ'a"
string(66) "A...............................................................ς"
string(67) "A................................................................σ"
string(73) "Abcς................................................................ Abc"
string(74) "Abcσ................................................................a Abc"
string(265) "Abcς................................................................................................................................................................................................................................................................ Abc"
string(12) "02bc004e012d"
string(8) "0149012d"