mirror of
https://github.com/php/php-src.git
synced 2025-08-15 21:48:51 +02:00
Add unit tests for mb_str_split/mb_substr on MacJapanese encoding
MacJapanese has a somewhat unusual feature: when mapped to Unicode, many of its characters map to sequences of several codepoints. Add test cases demonstrating how mb_str_split and mb_substr behave in this situation. While adding these tests, I found that the behavior of mb_substr was wrong, due to an inconsistency between the string "length" as measured by mb_strlen and the number of native MacJapanese characters which mb_substr counts when iterating over the string using the mblen_table. This has been fixed. I believe that mb_strstr will also return wrong results in some cases for MacJapanese; I still need to come up with unit tests which demonstrate the problem and figure out how to fix it.
This commit is contained in:
parent
cca4ca6d3d
commit
d8b5b9fa55
3 changed files with 56 additions and 2 deletions
|
@ -2041,7 +2041,10 @@ static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, c
|
|||
len = in_len;
|
||||
}
|
||||
return zend_string_init_fast((const char*)in, len);
|
||||
} else if (enc->mblen_table != NULL) {
|
||||
} else if (enc->mblen_table) {
|
||||
/* The use of the `mblen_table` means that for encodings like MacJapanese,
|
||||
* we treat each character in its native charset as "1 character", even if it
|
||||
* maps to a sequence of several codepoints */
|
||||
const unsigned char *mbtab = enc->mblen_table;
|
||||
unsigned char *limit = in + in_len;
|
||||
while (from && in < limit) {
|
||||
|
@ -2254,7 +2257,21 @@ PHP_FUNCTION(mb_substr)
|
|||
|
||||
size_t mblen = 0;
|
||||
if (from < 0 || (!len_is_null && len < 0)) {
|
||||
mblen = mb_get_strlen(str, enc);
|
||||
if (enc->mblen_table) {
|
||||
/* Because we use the `mblen_table` when iterating over the string and
|
||||
* extracting the requested part, we also need to use it here for counting
|
||||
* the "length" of the string.
|
||||
* Otherwise, we can get wrong results for text encodings like MacJapanese,
|
||||
* where one native 'character' can map to a sequence of several codepoints */
|
||||
const unsigned char *mbtab = enc->mblen_table;
|
||||
unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str);
|
||||
while (p < e) {
|
||||
p += mbtab[*p];
|
||||
mblen++;
|
||||
}
|
||||
} else {
|
||||
mblen = mb_get_strlen(str, enc);
|
||||
}
|
||||
}
|
||||
|
||||
/* if "from" position is negative, count start position from the end
|
||||
|
|
|
@ -80,6 +80,23 @@ foreach (['SJIS', 'SJIS-2004', 'MacJapanese', 'SJIS-Mobile#DOCOMO', 'SJIS-Mobile
|
|||
echo "$encoding: [" . implode(', ', array_map('bin2hex', $array)) . "]\n";
|
||||
}
|
||||
|
||||
/*
|
||||
Some MacJapanese characters map to a sequence of several Unicode codepoints. Examples:
|
||||
|
||||
0x85AB 0xF862+0x0058+0x0049+0x0049+0x0049 # roman numeral thirteen
|
||||
0x85AC 0xF861+0x0058+0x0049+0x0056 # roman numeral fourteen
|
||||
0x85AD 0xF860+0x0058+0x0056 # roman numeral fifteen
|
||||
0x85BF 0xF862+0x0078+0x0069+0x0069+0x0069 # small roman numeral thirteen
|
||||
0x85C0 0xF861+0x0078+0x0069+0x0076 # small roman numeral fourteen
|
||||
0x85C1 0xF860+0x0078+0x0076 # small roman numeral fifteen
|
||||
|
||||
Even though they map to multiple codepoints, mb_str_split treats these as ONE character each
|
||||
*/
|
||||
|
||||
echo "== MacJapanese characters which map to 3-5 codepoints each ==\n";
|
||||
echo "[", implode(', ', array_map('bin2hex', mb_str_split("abc\x85\xAB\x85\xAC\x85\xAD", 1, 'MacJapanese'))), "]\n";
|
||||
echo "[", implode(', ', array_map('bin2hex', mb_str_split("abc\x85\xBF\x85\xC0\x85\xC1", 2, 'MacJapanese'))), "]\n";
|
||||
|
||||
?>
|
||||
--EXPECT--
|
||||
BIG-5: a4e9 a5bb
|
||||
|
@ -104,3 +121,6 @@ SJIS-Mobile#KDDI: [80a1, 6162, 6380, a1]
|
|||
SJIS-Mobile#KDDI: [6162, 63fd, feff, 6162, fdfe, ff]
|
||||
SJIS-Mobile#SoftBank: [80a1, 6162, 6380, a1]
|
||||
SJIS-Mobile#SoftBank: [6162, 63fd, feff, 6162, fdfe, ff]
|
||||
== MacJapanese characters which map to 3-5 codepoints each ==
|
||||
[61, 62, 63, 85ab, 85ac, 85ad]
|
||||
[6162, 6385bf, 85c085c1]
|
||||
|
|
|
@ -64,6 +64,16 @@ echo "SJIS-Mobile#SoftBank:\n";
|
|||
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-Mobile#SoftBank')) . "\n";
|
||||
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-Mobile#SoftBank')) . "\n";
|
||||
|
||||
echo "-- Testing MacJapanese characters which map to 3-5 codepoints each --\n";
|
||||
|
||||
/* There are many characters in MacJapanese which map to sequences of several codepoints */
|
||||
print bin2hex(mb_substr("abc\x85\xAB\x85\xAC\x85\xAD", 0, 3, 'MacJapanese')) . "\n";
|
||||
print bin2hex(mb_substr("abc\x85\xAB\x85\xAC\x85\xAD", 3, 2, 'MacJapanese')) . "\n";
|
||||
print bin2hex(mb_substr("abc\x85\xAB\x85\xAC\x85\xAD", -2, 1, 'MacJapanese')) . "\n";
|
||||
print bin2hex(mb_substr("abc\x85\xBF\x85\xC0\x85\xC1", 0, 3, 'MacJapanese')) . "\n";
|
||||
print bin2hex(mb_substr("abc\x85\xBF\x85\xC0\x85\xC1", 3, 2, 'MacJapanese')) . "\n";
|
||||
print bin2hex(mb_substr("abc\x85\xBF\x85\xC0\x85\xC1", -2, 1, 'MacJapanese')) . "\n";
|
||||
|
||||
echo "ISO-2022-JP:\n";
|
||||
print "1: " . bin2hex(mb_substr($iso2022jp, 0, 3, 'ISO-2022-JP')) . "\n";
|
||||
print "2: " . bin2hex(mb_substr($iso2022jp, -1, null, 'ISO-2022-JP')) . "\n";
|
||||
|
@ -145,6 +155,13 @@ SJIS-Mobile#KDDI:
|
|||
SJIS-Mobile#SoftBank:
|
||||
6380
|
||||
806162
|
||||
-- Testing MacJapanese characters which map to 3-5 codepoints each --
|
||||
616263
|
||||
85ab85ac
|
||||
85ac
|
||||
616263
|
||||
85bf85c0
|
||||
85c0
|
||||
ISO-2022-JP:
|
||||
1: 1b2442212121721b284241
|
||||
2: 43
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue