mirror of
https://github.com/php/php-src.git
synced 2025-08-16 14:08:47 +02:00
Add unit tests for mb_str_split/mb_substr on MacJapanese encoding
MacJapanese has a somewhat unusual feature: when mapped to Unicode, many of its characters map to sequences of several codepoints. Add test cases demonstrating how mb_str_split and mb_substr behave in this situation. When adding these tests, I found that the behavior of mb_substr was wrong, due to an inconsistency between the string "length" as measured by mb_strlen and the number of native MacJapanese characters which mb_substr counts when iterating over the string using the mblen_table. This has been fixed. I believe that mb_strstr will also return wrong results in some cases for MacJapanese; I still need to come up with unit tests which demonstrate the problem and figure out how to fix it.
This commit is contained in:
parent
cca4ca6d3d
commit
d8b5b9fa55
3 changed files with 56 additions and 2 deletions
|
@ -2041,7 +2041,10 @@ static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, c
|
||||||
len = in_len;
|
len = in_len;
|
||||||
}
|
}
|
||||||
return zend_string_init_fast((const char*)in, len);
|
return zend_string_init_fast((const char*)in, len);
|
||||||
} else if (enc->mblen_table != NULL) {
|
} else if (enc->mblen_table) {
|
||||||
|
/* The use of the `mblen_table` means that for encodings like MacJapanese,
|
||||||
|
* we treat each character in its native charset as "1 character", even if it
|
||||||
|
* maps to a sequence of several codepoints */
|
||||||
const unsigned char *mbtab = enc->mblen_table;
|
const unsigned char *mbtab = enc->mblen_table;
|
||||||
unsigned char *limit = in + in_len;
|
unsigned char *limit = in + in_len;
|
||||||
while (from && in < limit) {
|
while (from && in < limit) {
|
||||||
|
@ -2254,7 +2257,21 @@ PHP_FUNCTION(mb_substr)
|
||||||
|
|
||||||
size_t mblen = 0;
|
size_t mblen = 0;
|
||||||
if (from < 0 || (!len_is_null && len < 0)) {
|
if (from < 0 || (!len_is_null && len < 0)) {
|
||||||
mblen = mb_get_strlen(str, enc);
|
if (enc->mblen_table) {
|
||||||
|
/* Because we use the `mblen_table` when iterating over the string and
|
||||||
|
* extracting the requested part, we also need to use it here for counting
|
||||||
|
* the "length" of the string.
|
||||||
|
* Otherwise, we can get wrong results for text encodings like MacJapanese,
|
||||||
|
* where one native 'character' can map to a sequence of several codepoints */
|
||||||
|
const unsigned char *mbtab = enc->mblen_table;
|
||||||
|
unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str);
|
||||||
|
while (p < e) {
|
||||||
|
p += mbtab[*p];
|
||||||
|
mblen++;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
mblen = mb_get_strlen(str, enc);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* if "from" position is negative, count start position from the end
|
/* if "from" position is negative, count start position from the end
|
||||||
|
|
|
@ -80,6 +80,23 @@ foreach (['SJIS', 'SJIS-2004', 'MacJapanese', 'SJIS-Mobile#DOCOMO', 'SJIS-Mobile
|
||||||
echo "$encoding: [" . implode(', ', array_map('bin2hex', $array)) . "]\n";
|
echo "$encoding: [" . implode(', ', array_map('bin2hex', $array)) . "]\n";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
Some MacJapanese characters map to a sequence of several Unicode codepoints. Examples:
|
||||||
|
|
||||||
|
0x85AB 0xF862+0x0058+0x0049+0x0049+0x0049 # roman numeral thirteen
|
||||||
|
0x85AC 0xF861+0x0058+0x0049+0x0056 # roman numeral fourteen
|
||||||
|
0x85AD 0xF860+0x0058+0x0056 # roman numeral fifteen
|
||||||
|
0x85BF 0xF862+0x0078+0x0069+0x0069+0x0069 # small roman numeral thirteen
|
||||||
|
0x85C0 0xF861+0x0078+0x0069+0x0076 # small roman numeral fourteen
|
||||||
|
0x85C1 0xF860+0x0078+0x0076 # small roman numeral fifteen
|
||||||
|
|
||||||
|
Even though they map to multiple codepoints, mb_str_split treats these as ONE character each
|
||||||
|
*/
|
||||||
|
|
||||||
|
echo "== MacJapanese characters which map to 3-5 codepoints each ==\n";
|
||||||
|
echo "[", implode(', ', array_map('bin2hex', mb_str_split("abc\x85\xAB\x85\xAC\x85\xAD", 1, 'MacJapanese'))), "]\n";
|
||||||
|
echo "[", implode(', ', array_map('bin2hex', mb_str_split("abc\x85\xBF\x85\xC0\x85\xC1", 2, 'MacJapanese'))), "]\n";
|
||||||
|
|
||||||
?>
|
?>
|
||||||
--EXPECT--
|
--EXPECT--
|
||||||
BIG-5: a4e9 a5bb
|
BIG-5: a4e9 a5bb
|
||||||
|
@ -104,3 +121,6 @@ SJIS-Mobile#KDDI: [80a1, 6162, 6380, a1]
|
||||||
SJIS-Mobile#KDDI: [6162, 63fd, feff, 6162, fdfe, ff]
|
SJIS-Mobile#KDDI: [6162, 63fd, feff, 6162, fdfe, ff]
|
||||||
SJIS-Mobile#SoftBank: [80a1, 6162, 6380, a1]
|
SJIS-Mobile#SoftBank: [80a1, 6162, 6380, a1]
|
||||||
SJIS-Mobile#SoftBank: [6162, 63fd, feff, 6162, fdfe, ff]
|
SJIS-Mobile#SoftBank: [6162, 63fd, feff, 6162, fdfe, ff]
|
||||||
|
== MacJapanese characters which map to 3-5 codepoints each ==
|
||||||
|
[61, 62, 63, 85ab, 85ac, 85ad]
|
||||||
|
[6162, 6385bf, 85c085c1]
|
||||||
|
|
|
@ -64,6 +64,16 @@ echo "SJIS-Mobile#SoftBank:\n";
|
||||||
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-Mobile#SoftBank')) . "\n";
|
print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-Mobile#SoftBank')) . "\n";
|
||||||
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-Mobile#SoftBank')) . "\n";
|
print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-Mobile#SoftBank')) . "\n";
|
||||||
|
|
||||||
|
echo "-- Testing MacJapanese characters which map to 3-5 codepoints each --\n";
|
||||||
|
|
||||||
|
/* There are many characters in MacJapanese which map to sequences of several codepoints */
|
||||||
|
print bin2hex(mb_substr("abc\x85\xAB\x85\xAC\x85\xAD", 0, 3, 'MacJapanese')) . "\n";
|
||||||
|
print bin2hex(mb_substr("abc\x85\xAB\x85\xAC\x85\xAD", 3, 2, 'MacJapanese')) . "\n";
|
||||||
|
print bin2hex(mb_substr("abc\x85\xAB\x85\xAC\x85\xAD", -2, 1, 'MacJapanese')) . "\n";
|
||||||
|
print bin2hex(mb_substr("abc\x85\xBF\x85\xC0\x85\xC1", 0, 3, 'MacJapanese')) . "\n";
|
||||||
|
print bin2hex(mb_substr("abc\x85\xBF\x85\xC0\x85\xC1", 3, 2, 'MacJapanese')) . "\n";
|
||||||
|
print bin2hex(mb_substr("abc\x85\xBF\x85\xC0\x85\xC1", -2, 1, 'MacJapanese')) . "\n";
|
||||||
|
|
||||||
echo "ISO-2022-JP:\n";
|
echo "ISO-2022-JP:\n";
|
||||||
print "1: " . bin2hex(mb_substr($iso2022jp, 0, 3, 'ISO-2022-JP')) . "\n";
|
print "1: " . bin2hex(mb_substr($iso2022jp, 0, 3, 'ISO-2022-JP')) . "\n";
|
||||||
print "2: " . bin2hex(mb_substr($iso2022jp, -1, null, 'ISO-2022-JP')) . "\n";
|
print "2: " . bin2hex(mb_substr($iso2022jp, -1, null, 'ISO-2022-JP')) . "\n";
|
||||||
|
@ -145,6 +155,13 @@ SJIS-Mobile#KDDI:
|
||||||
SJIS-Mobile#SoftBank:
|
SJIS-Mobile#SoftBank:
|
||||||
6380
|
6380
|
||||||
806162
|
806162
|
||||||
|
-- Testing MacJapanese characters which map to 3-5 codepoints each --
|
||||||
|
616263
|
||||||
|
85ab85ac
|
||||||
|
85ac
|
||||||
|
616263
|
||||||
|
85bf85c0
|
||||||
|
85c0
|
||||||
ISO-2022-JP:
|
ISO-2022-JP:
|
||||||
1: 1b2442212121721b284241
|
1: 1b2442212121721b284241
|
||||||
2: 43
|
2: 43
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue