diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c index 1f52e0aa977..f23a8b08ace 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c @@ -40,7 +40,7 @@ static int mbfl_filt_conv_sjis_wchar_flush(mbfl_convert_filter *filter); static size_t mb_sjis_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); static void mb_wchar_to_sjis(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); -const unsigned char mblen_table_sjis[] = { /* 0x80-0x9f,0xE0-0xFF */ +const unsigned char mblen_table_sjis[] = { /* 0x81-0x9F,0xE0-0xEF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, @@ -49,14 +49,14 @@ const unsigned char mblen_table_sjis[] = { /* 0x80-0x9f,0xE0-0xFF */ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; static const char *mbfl_encoding_sjis_aliases[] = {"x-sjis", "SHIFT-JIS", NULL}; diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c index 78d4bd43107..737871eda8a 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c @@ -41,7 +41,7 @@ #include "unicode_table_jis2004.h" #include "unicode_table_jis.h" -extern const 
unsigned char mblen_table_sjis[]; +extern const unsigned char mblen_table_sjis_mobile[]; extern const unsigned char mblen_table_eucjp[]; static size_t mb_sjis2004_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); @@ -62,7 +62,7 @@ const mbfl_encoding mbfl_encoding_sjis2004 = { "SJIS-2004", "Shift_JIS", mbfl_encoding_sjis2004_aliases, - mblen_table_sjis, + mblen_table_sjis_mobile, /* Leading byte values used for SJIS-2004 are the same as mobile SJIS variants */ MBFL_ENCTYPE_GL_UNSAFE, &vtbl_sjis2004_wchar, &vtbl_wchar_sjis2004, diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c index 0cb93cc38e8..0ff2a198d36 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c @@ -35,7 +35,24 @@ #include "sjis_mac2uni.h" -extern const unsigned char mblen_table_sjis[]; +const unsigned char mblen_table_sjismac[] = { /* 0x81-0x9F,0xE0-0xED */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 +}; static int mbfl_filt_conv_wchar_sjis_mac_flush(mbfl_convert_filter *filter); static int mbfl_filt_conv_sjis_mac_wchar_flush(mbfl_convert_filter *filter); @@ -49,7 +66,7 @@ const 
mbfl_encoding mbfl_encoding_sjis_mac = { "SJIS-mac", "Shift_JIS", mbfl_encoding_sjis_mac_aliases, - mblen_table_sjis, + mblen_table_sjismac, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_sjis_mac_wchar, &vtbl_wchar_sjis_mac, diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c index 31a1e3a4d77..448e0a74ca3 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c @@ -35,8 +35,26 @@ #include "emoji2uni.h" +const unsigned char mblen_table_sjis_mobile[] = { /* 0x81-0x9F,0xE0-0xFC */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1 +}; + extern int mbfl_bisec_srch2(int w, const unsigned short tbl[], int n); -extern const unsigned char mblen_table_sjis[]; static int mbfl_filt_conv_sjis_wchar_flush(mbfl_convert_filter *filter); static size_t mb_sjis_docomo_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); @@ -55,7 +73,7 @@ const mbfl_encoding mbfl_encoding_sjis_docomo = { "SJIS-Mobile#DOCOMO", "Shift_JIS", mbfl_encoding_sjis_docomo_aliases, - mblen_table_sjis, + mblen_table_sjis_mobile, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_sjis_docomo_wchar, &vtbl_wchar_sjis_docomo, @@ -68,7 +86,7 
@@ const mbfl_encoding mbfl_encoding_sjis_kddi = { "SJIS-Mobile#KDDI", "Shift_JIS", mbfl_encoding_sjis_kddi_aliases, - mblen_table_sjis, + mblen_table_sjis_mobile, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_sjis_kddi_wchar, &vtbl_wchar_sjis_kddi, @@ -81,7 +99,7 @@ const mbfl_encoding mbfl_encoding_sjis_sb = { "SJIS-Mobile#SOFTBANK", "Shift_JIS", mbfl_encoding_sjis_sb_aliases, - mblen_table_sjis, + mblen_table_sjis_mobile, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_sjis_sb_wchar, &vtbl_wchar_sjis_sb, diff --git a/ext/mbstring/tests/mb_str_split_jp.phpt b/ext/mbstring/tests/mb_str_split_jp.phpt index 22f39539608..41a123d5897 100644 --- a/ext/mbstring/tests/mb_str_split_jp.phpt +++ b/ext/mbstring/tests/mb_str_split_jp.phpt @@ -69,6 +69,17 @@ if(end($array) !== $enc){ last array element: %s expected: %s\n", unpack("H*", end($array))[1],unpack("H*", $enc)[1]); } +/* SJIS byte 0x80 was previously wrongly treated as the starting byte for a 2-byte character */ +echo "== Regression test for SJIS byte 0x80 ==\n"; +foreach (['SJIS', 'SJIS-2004', 'MacJapanese', 'SJIS-Mobile#DOCOMO', 'SJIS-Mobile#KDDI', 'SJIS-Mobile#SoftBank'] as $encoding) { + $array = mb_str_split("\x80\xA1abc\x80\xA1", 2, $encoding); + echo "$encoding: [" . implode(', ', array_map('bin2hex', $array)) . "]\n"; + + // Also try bytes 0xFD, 0xFE, and 0xFF + $array = mb_str_split("abc\xFD\xFE\xFFab\xFD\xFE\xFF", 2, $encoding); + echo "$encoding: [" . implode(', ', array_map('bin2hex', $array)) .
"]\n"; +} + ?> --EXPECT-- BIG-5: a4e9 a5bb @@ -80,3 +91,16 @@ UTF-16LE: e565 2c67 UTF-32BE: 000065e5 0000672c UTF-32LE: e5650000 2c670000 UTF-8: e697a5 e69cac +== Regression test for SJIS byte 0x80 == +SJIS: [80a1, 6162, 6380, a1] +SJIS: [6162, 63fd, feff, 6162, fdfe, ff] +SJIS-2004: [80a1, 6162, 6380, a1] +SJIS-2004: [6162, 63fd, feff, 6162, fdfe, ff] +MacJapanese: [80a1, 6162, 6380, a1] +MacJapanese: [6162, 63fd, feff, 6162, fdfe, ff] +SJIS-Mobile#DOCOMO: [80a1, 6162, 6380, a1] +SJIS-Mobile#DOCOMO: [6162, 63fd, feff, 6162, fdfe, ff] +SJIS-Mobile#KDDI: [80a1, 6162, 6380, a1] +SJIS-Mobile#KDDI: [6162, 63fd, feff, 6162, fdfe, ff] +SJIS-Mobile#SoftBank: [80a1, 6162, 6380, a1] +SJIS-Mobile#SoftBank: [6162, 63fd, feff, 6162, fdfe, ff] diff --git a/ext/mbstring/tests/mb_strlen.phpt b/ext/mbstring/tests/mb_strlen.phpt index 11225917140..5ebfcd1aec0 100644 --- a/ext/mbstring/tests/mb_strlen.phpt +++ b/ext/mbstring/tests/mb_strlen.phpt @@ -13,43 +13,59 @@ include_once('common.inc'); mb_detect_order('auto'); // Test string -$euc_jp = '0123この文字列は日本語です。EUC-JPを使っています。0123日本語は面倒臭い。'; +$euc_jp = mb_convert_encoding("0123この文字列は日本語です。EUC-JPを使っています。0123日本語は面倒臭い。", 'EUC-JP', 'UTF-8'); $ascii = 'abcdefghijklmnopqrstuvwxyz;]=#0123456789'; -// ASCII echo "== ASCII ==\n"; -print mb_strlen($ascii,'ASCII') . "\n"; -print strlen($ascii) . "\n"; +print mb_strlen($ascii,'ASCII') . "\n"; +print strlen($ascii) . "\n"; -// EUC-JP echo "== EUC-JP ==\n"; -print mb_strlen($euc_jp,'EUC-JP') . "\n"; +print mb_strlen($euc_jp,'EUC-JP') . "\n"; mb_internal_encoding('EUC-JP') or print("mb_internal_encoding() failed\n"); -print strlen($euc_jp) . "\n"; +print strlen($euc_jp) . "\n"; -// SJIS echo "== SJIS ==\n"; $sjis = mb_convert_encoding($euc_jp, 'SJIS','EUC-JP'); -print mb_strlen($sjis,'SJIS') . "\n"; +print mb_strlen($sjis,'SJIS') . "\n"; mb_internal_encoding('SJIS') or print("mb_internal_encoding() failed\n"); -print strlen($sjis) . "\n"; +print strlen($sjis) .
"\n"; +print "-- Testing illegal bytes 0x80,0xFD-FF --\n"; +// mb_strlen used to wrongly treat 0x80 as the starting byte of a 2-byte SJIS character +print mb_strlen("\x80\xA1", 'SJIS') . "\n"; +print mb_strlen("abc\xFD\xFE\xFF", 'SJIS') . "\n"; + +echo "== MacJapanese ==\n"; +print mb_strlen("\x80\xA1", 'MacJapanese') . "\n"; +print mb_strlen("abc\xFD\xFE\xFF", 'MacJapanese') . "\n"; + +echo "== SJIS-2004 ==\n"; +print mb_strlen("\x80\xA1", 'SJIS-2004') . "\n"; +print mb_strlen("abc\xFD\xFE\xFF", 'SJIS-2004') . "\n"; + +echo "== SJIS-Mobile#DOCOMO ==\n"; +print mb_strlen("\x80\xA1", 'SJIS-Mobile#DOCOMO') . "\n"; +print mb_strlen("abc\xFD\xFE\xFF", 'SJIS-Mobile#DOCOMO') . "\n"; + +echo "== SJIS-Mobile#KDDI ==\n"; +print mb_strlen("\x80\xA1", 'SJIS-Mobile#KDDI') . "\n"; +print mb_strlen("abc\xFD\xFE\xFF", 'SJIS-Mobile#KDDI') . "\n"; + +echo "== SJIS-Mobile#SoftBank ==\n"; +print mb_strlen("\x80\xA1", 'SJIS-Mobile#SoftBank') . "\n"; +print mb_strlen("abc\xFD\xFE\xFF", 'SJIS-Mobile#SoftBank') . "\n"; -// JIS -// Note: either convert_encoding or strlen has problem echo "== JIS ==\n"; $jis = mb_convert_encoding($euc_jp, 'JIS','EUC-JP'); -print mb_strlen($jis,'JIS') . "\n"; +print mb_strlen($jis,'JIS') . "\n"; mb_internal_encoding('JIS') or print("mb_internal_encoding() failed\n"); -print strlen($jis) . "\n"; +print strlen($jis) . "\n"; -// UTF-8 -// Note: either convert_encoding or strlen has problem echo "== UTF-8 ==\n"; $utf8 = mb_convert_encoding($euc_jp, 'UTF-8','EUC-JP'); -print mb_strlen($utf8,'UTF-8') . "\n"; +print mb_strlen($utf8,'UTF-8') . "\n"; mb_internal_encoding('UTF-8') or print("mb_internal_encoding() failed\n"); -print strlen($utf8) . "\n"; - +print strlen($utf8) .
"\n"; // Wrong Parameters echo "== WRONG PARAMETERS ==\n"; @@ -72,6 +88,24 @@ try { == SJIS == 43 72 +-- Testing illegal bytes 0x80,0xFD-FF -- +2 +6 +== MacJapanese == +2 +6 +== SJIS-2004 == +2 +6 +== SJIS-Mobile#DOCOMO == +2 +6 +== SJIS-Mobile#KDDI == +2 +6 +== SJIS-Mobile#SoftBank == +2 +6 == JIS == 43 90 diff --git a/ext/mbstring/tests/mb_substr.phpt b/ext/mbstring/tests/mb_substr.phpt index 6d5e9d42ac0..5bd4a5e67f4 100644 --- a/ext/mbstring/tests/mb_substr.phpt +++ b/ext/mbstring/tests/mb_substr.phpt @@ -8,13 +8,13 @@ ini_set('include_path','.'); include_once('common.inc'); // EUC-JP -$euc_jp = "0123\xA4\xB3\xA4\xCE\xCA\xB8\xBB\xFA\xCE\xF3\xA4\xCF\xC6\xFC\xCB\xDC\xB8\xEC\xA4\xC7\xA4\xB9\xA1\xA3EUC-JP\xA4\xF2\xBB\xC8\xA4\xC3\xA4\xC6\xA4\xA4\xA4\xDE\xA4\xB9\xA1\xA3\xC6\xFC\xCB\xDC\xB8\xEC\xA4\xCF\xCC\xCC\xC5\xDD\xBD\xAD\xA4\xA4\xA1\xA3"; +$euc_jp = mb_convert_encoding('0123この文字列は日本語です。EUC-JPを使っています。日本語は面倒臭い。', 'EUC-JP', 'UTF-8'); // SJIS -$sjis = "\x93\xFA\x96{\x8C\xEA\x83e\x83L\x83X\x83g\x82\xC5\x82\xB7\x81B01234\x82T\x82U\x82V\x82W\x82X\x81B"; +$sjis = mb_convert_encoding('日本語テキストです。01234５６７８９。', 'SJIS', 'UTF-8'); // ISO-2022-JP $iso2022jp = "\x1B\$B\x21\x21!r\x1B(BABC"; // GB-18030 -$gb18030 = "\xC3\xDC\xC2\xEB\xD3\xC3\xBB\xA7\xC3\xFB\xC3\xDC\xC2\xEB\xC3\xFB\xB3\xC6\xC3\xFB\xB3\xC6"; +$gb18030 = mb_convert_encoding('密码用户名密码名称名称', 'GB18030', 'UTF-8'); // HZ $hz = "The next sentence is in GB.~{<:Ky2;S{#,NpJ)l6HK!#~}Bye."; // UTF-8 @@ -40,6 +40,29 @@ print "2: " . bin2hex(mb_substr($sjis, -1, null, 'SJIS')) . "\n"; print "3: " . bin2hex(mb_substr($sjis, -5, 3, 'SJIS')) . "\n"; print "4: " . bin2hex(mb_substr($sjis, 1, null, 'SJIS')) . "\n"; print "5:" . bin2hex(mb_substr($sjis, 10, 0, 'SJIS')) . "\n"; +echo "-- Testing illegal SJIS byte 0x80 --\n"; +print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS')) . "\n"; +print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS')) . "\n"; + +echo "SJIS-2004:\n"; +print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-2004')) .
"\n"; +print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-2004')) . "\n"; + +echo "MacJapanese:\n"; +print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'MacJapanese')) . "\n"; +print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'MacJapanese')) . "\n"; + +echo "SJIS-Mobile#DOCOMO:\n"; +print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-Mobile#DOCOMO')) . "\n"; +print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-Mobile#DOCOMO')) . "\n"; + +echo "SJIS-Mobile#KDDI:\n"; +print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-Mobile#KDDI')) . "\n"; +print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-Mobile#KDDI')) . "\n"; + +echo "SJIS-Mobile#SoftBank:\n"; +print bin2hex(mb_substr("\x80abc\x80\xA1", 3, 2, 'SJIS-Mobile#SoftBank')) . "\n"; +print bin2hex(mb_substr("\x80abc\x80\xA1", 0, 3, 'SJIS-Mobile#SoftBank')) . "\n"; echo "ISO-2022-JP:\n"; print "1: " . bin2hex(mb_substr($iso2022jp, 0, 3, 'ISO-2022-JP')) . "\n"; @@ -98,6 +121,24 @@ SJIS: 3: 825582568257 4: 967b8cea8365834c8358836782c582b781423031323334825482558256825782588142 5: +-- Testing illegal SJIS byte 0x80 -- +6380 +806162 +SJIS-2004: +6380 +806162 +MacJapanese: +6380 +806162 +SJIS-Mobile#DOCOMO: +6380 +806162 +SJIS-Mobile#KDDI: +6380 +806162 +SJIS-Mobile#SoftBank: +6380 +806162 ISO-2022-JP: 1: 1b2442212121721b284241 2: 43