From 6fc8d014dfce03121bb04b0e5ae1eea6c7e1f801 Mon Sep 17 00:00:00 2001 From: pakutoma Date: Wed, 22 Mar 2023 02:09:14 +0900 Subject: [PATCH] Fix phpGH-10648: add check function pointer into mbfl_encoding Previously, mbstring used the same logic for encoding validation as for encoding conversion. However, there are cases where we want to use different logic for validation and conversion. For example, if a string ends before all the input required by the encoding has been supplied, or if the input contains a character that is invalid in the encoding but can still be converted, the conversion should succeed and the validation should fail. To achieve this, a function pointer mb_check_fn has been added to struct mbfl_encoding to implement the logic used for validation. Validation logic has also been implemented for UTF-7, UTF7-IMAP, ISO-2022-JP and JIS. --- UPGRADING | 10 + ext/mbstring/libmbfl/filters/mbfilter_7bit.c | 3 +- .../libmbfl/filters/mbfilter_base64.c | 3 +- ext/mbstring/libmbfl/filters/mbfilter_big5.c | 6 +- .../libmbfl/filters/mbfilter_cp5022x.c | 9 +- .../libmbfl/filters/mbfilter_cp51932.c | 3 +- ext/mbstring/libmbfl/filters/mbfilter_cp932.c | 6 +- ext/mbstring/libmbfl/filters/mbfilter_cp936.c | 3 +- .../libmbfl/filters/mbfilter_euc_cn.c | 3 +- .../libmbfl/filters/mbfilter_euc_jp.c | 3 +- .../libmbfl/filters/mbfilter_euc_jp_win.c | 3 +- .../libmbfl/filters/mbfilter_euc_kr.c | 3 +- .../libmbfl/filters/mbfilter_euc_tw.c | 3 +- .../libmbfl/filters/mbfilter_gb18030.c | 3 +- .../libmbfl/filters/mbfilter_htmlent.c | 3 +- ext/mbstring/libmbfl/filters/mbfilter_hz.c | 3 +- .../libmbfl/filters/mbfilter_iso2022_jp_ms.c | 3 +- .../libmbfl/filters/mbfilter_iso2022_kr.c | 3 +- .../filters/mbfilter_iso2022jp_mobile.c | 3 +- ext/mbstring/libmbfl/filters/mbfilter_jis.c | 164 +++++- .../libmbfl/filters/mbfilter_qprint.c | 3 +- .../libmbfl/filters/mbfilter_singlebyte.c | 3 +- ext/mbstring/libmbfl/filters/mbfilter_sjis.c | 3 +- .../libmbfl/filters/mbfilter_sjis_2004.c | 9 +- 
.../libmbfl/filters/mbfilter_sjis_mac.c | 3 +- .../libmbfl/filters/mbfilter_sjis_mobile.c | 9 +- ext/mbstring/libmbfl/filters/mbfilter_ucs2.c | 9 +- ext/mbstring/libmbfl/filters/mbfilter_ucs4.c | 9 +- ext/mbstring/libmbfl/filters/mbfilter_uhc.c | 3 +- ext/mbstring/libmbfl/filters/mbfilter_utf16.c | 9 +- ext/mbstring/libmbfl/filters/mbfilter_utf32.c | 9 +- ext/mbstring/libmbfl/filters/mbfilter_utf7.c | 158 ++++- .../libmbfl/filters/mbfilter_utf7imap.c | 130 ++++- ext/mbstring/libmbfl/filters/mbfilter_utf8.c | 3 +- .../libmbfl/filters/mbfilter_utf8_mobile.c | 12 +- .../libmbfl/filters/mbfilter_uuencode.c | 3 +- ext/mbstring/libmbfl/filters/utf7_helper.h | 22 + ext/mbstring/libmbfl/mbfl/mbfilter.c | 10 + ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c | 3 +- ext/mbstring/libmbfl/mbfl/mbfilter_pass.c | 1 + ext/mbstring/libmbfl/mbfl/mbfilter_wchar.c | 1 + ext/mbstring/libmbfl/mbfl/mbfl_encoding.h | 2 + ext/mbstring/mbstring.c | 4 + ext/mbstring/tests/gh10192_utf7.phpt | 542 ++++++++++++++++++ ext/mbstring/tests/gh10192_utf7imap.phpt | 423 ++++++++++++++ ext/mbstring/tests/gh10648.phpt | 155 +++++ ext/mbstring/tests/iso2022jp_encoding.phpt | 68 +-- ext/mbstring/tests/utf_encodings.phpt | 13 +- 48 files changed, 1745 insertions(+), 114 deletions(-) create mode 100644 ext/mbstring/libmbfl/filters/utf7_helper.h create mode 100644 ext/mbstring/tests/gh10192_utf7.phpt create mode 100644 ext/mbstring/tests/gh10192_utf7imap.phpt create mode 100644 ext/mbstring/tests/gh10648.phpt diff --git a/UPGRADING b/UPGRADING index 744fa57c1c8..05b33b231ee 100644 --- a/UPGRADING +++ b/UPGRADING @@ -218,6 +218,16 @@ PHP 8.2 UPGRADE NOTES dba_fetch(string|array $key, $skip, $dba): string|false is still accepted, but it is recommended to use the new standard variant. +- MBString + . mb_check_encoding() now checks input encoding more strictly. + . mb_detect_encoding() now checks input encoding more strictly + when strict detection is enabled. + . 
mb_convert_encoding() checks the input encoding more strictly + if multiple encodings are passed to from_encoding + and the mbstring.strict_detection INI directive is set to 1. + This change only affects the encoding selection, + not the result of the conversion. + - Random . random_bytes() and random_int() now throw \Random\RandomException on CSPRNG failure. Previously a plain \Exception was thrown. diff --git a/ext/mbstring/libmbfl/filters/mbfilter_7bit.c b/ext/mbstring/libmbfl/filters/mbfilter_7bit.c index a611a4e09b1..54744aa4b8e 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_7bit.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_7bit.c @@ -64,7 +64,8 @@ const mbfl_encoding mbfl_encoding_7bit = { &vtbl_7bit_wchar, &vtbl_wchar_7bit, mb_7bit_to_wchar, - mb_wchar_to_7bit + mb_wchar_to_7bit, + NULL }; #define CK(statement) do { if ((statement) < 0) return (-1); } while (0) diff --git a/ext/mbstring/libmbfl/filters/mbfilter_base64.c b/ext/mbstring/libmbfl/filters/mbfilter_base64.c index ede3eef18ce..162e9b1bda8 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_base64.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_base64.c @@ -44,7 +44,8 @@ const mbfl_encoding mbfl_encoding_base64 = { NULL, NULL, mb_base64_to_wchar, - mb_wchar_to_base64 + mb_wchar_to_base64, + NULL }; const struct mbfl_convert_vtbl vtbl_8bit_b64 = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_big5.c b/ext/mbstring/libmbfl/filters/mbfilter_big5.c index 58f89d1b575..7618130aac8 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_big5.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_big5.c @@ -69,7 +69,8 @@ const mbfl_encoding mbfl_encoding_big5 = { &vtbl_big5_wchar, &vtbl_wchar_big5, mb_big5_to_wchar, - mb_wchar_to_big5 + mb_wchar_to_big5, + NULL }; const mbfl_encoding mbfl_encoding_cp950 = { @@ -82,7 +83,8 @@ const mbfl_encoding mbfl_encoding_cp950 = { &vtbl_cp950_wchar, &vtbl_wchar_cp950, mb_cp950_to_wchar, - mb_wchar_to_cp950 + mb_wchar_to_cp950, + NULL }; const struct mbfl_convert_vtbl 
vtbl_big5_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c b/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c index 32a8bdf15f5..93c33da9543 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c @@ -61,7 +61,8 @@ const mbfl_encoding mbfl_encoding_cp50220 = { &vtbl_cp50220_wchar, &vtbl_wchar_cp50220, mb_cp5022x_to_wchar, - mb_wchar_to_cp50220 + mb_wchar_to_cp50220, + NULL }; const mbfl_encoding mbfl_encoding_cp50221 = { @@ -74,7 +75,8 @@ const mbfl_encoding mbfl_encoding_cp50221 = { &vtbl_cp50221_wchar, &vtbl_wchar_cp50221, mb_cp5022x_to_wchar, - mb_wchar_to_cp50221 + mb_wchar_to_cp50221, + NULL }; const mbfl_encoding mbfl_encoding_cp50222 = { @@ -87,7 +89,8 @@ const mbfl_encoding mbfl_encoding_cp50222 = { &vtbl_cp50222_wchar, &vtbl_wchar_cp50222, mb_cp5022x_to_wchar, - mb_wchar_to_cp50222 + mb_wchar_to_cp50222, + NULL }; const struct mbfl_convert_vtbl vtbl_cp50220_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c b/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c index 6311f9b7213..d3aae8b10f5 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c @@ -69,7 +69,8 @@ const mbfl_encoding mbfl_encoding_cp51932 = { &vtbl_cp51932_wchar, &vtbl_wchar_cp51932, mb_cp51932_to_wchar, - mb_wchar_to_cp51932 + mb_wchar_to_cp51932, + NULL }; const struct mbfl_convert_vtbl vtbl_cp51932_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp932.c b/ext/mbstring/libmbfl/filters/mbfilter_cp932.c index cf8e461e1d9..506c2439390 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp932.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp932.c @@ -100,7 +100,8 @@ const mbfl_encoding mbfl_encoding_cp932 = { &vtbl_cp932_wchar, &vtbl_wchar_cp932, mb_cp932_to_wchar, - mb_wchar_to_cp932 + mb_wchar_to_cp932, + NULL }; const struct mbfl_convert_vtbl vtbl_cp932_wchar = { @@ -133,7 +134,8 @@ const mbfl_encoding mbfl_encoding_sjiswin = 
{ &vtbl_sjiswin_wchar, &vtbl_wchar_sjiswin, mb_cp932_to_wchar, - mb_wchar_to_sjiswin + mb_wchar_to_sjiswin, + NULL }; const struct mbfl_convert_vtbl vtbl_sjiswin_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp936.c b/ext/mbstring/libmbfl/filters/mbfilter_cp936.c index 40ae8c86f91..02e808ce928 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp936.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp936.c @@ -68,7 +68,8 @@ const mbfl_encoding mbfl_encoding_cp936 = { &vtbl_cp936_wchar, &vtbl_wchar_cp936, mb_cp936_to_wchar, - mb_wchar_to_cp936 + mb_wchar_to_cp936, + NULL }; const struct mbfl_convert_vtbl vtbl_cp936_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_cn.c b/ext/mbstring/libmbfl/filters/mbfilter_euc_cn.c index 50a0368a923..cec5f5d41d5 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_euc_cn.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_euc_cn.c @@ -67,7 +67,8 @@ const mbfl_encoding mbfl_encoding_euc_cn = { &vtbl_euccn_wchar, &vtbl_wchar_euccn, mb_euccn_to_wchar, - mb_wchar_to_euccn + mb_wchar_to_euccn, + NULL }; const struct mbfl_convert_vtbl vtbl_euccn_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c b/ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c index 2b0ae77534d..aa5f323db6f 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c @@ -68,7 +68,8 @@ const mbfl_encoding mbfl_encoding_euc_jp = { &vtbl_eucjp_wchar, &vtbl_wchar_eucjp, mb_eucjp_to_wchar, - mb_wchar_to_eucjp + mb_wchar_to_eucjp, + NULL }; const struct mbfl_convert_vtbl vtbl_eucjp_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_win.c b/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_win.c index 09287a9d8f6..d35cec95410 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_win.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_win.c @@ -69,7 +69,8 @@ const mbfl_encoding mbfl_encoding_eucjp_win = { &vtbl_eucjpwin_wchar, &vtbl_wchar_eucjpwin, mb_eucjpwin_to_wchar, - 
mb_wchar_to_eucjpwin + mb_wchar_to_eucjpwin, + NULL }; const struct mbfl_convert_vtbl vtbl_eucjpwin_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_kr.c b/ext/mbstring/libmbfl/filters/mbfilter_euc_kr.c index 69e6811922e..b0cb1954739 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_euc_kr.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_euc_kr.c @@ -66,7 +66,8 @@ const mbfl_encoding mbfl_encoding_euc_kr = { &vtbl_euckr_wchar, &vtbl_wchar_euckr, mb_euckr_to_wchar, - mb_wchar_to_euckr + mb_wchar_to_euckr, + NULL }; const struct mbfl_convert_vtbl vtbl_euckr_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_tw.c b/ext/mbstring/libmbfl/filters/mbfilter_euc_tw.c index de1deb47705..522f5f4a05a 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_euc_tw.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_euc_tw.c @@ -68,7 +68,8 @@ const mbfl_encoding mbfl_encoding_euc_tw = { &vtbl_euctw_wchar, &vtbl_wchar_euctw, mb_euctw_to_wchar, - mb_wchar_to_euctw + mb_wchar_to_euctw, + NULL }; const struct mbfl_convert_vtbl vtbl_euctw_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_gb18030.c b/ext/mbstring/libmbfl/filters/mbfilter_gb18030.c index 492df604624..d607aafef49 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_gb18030.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_gb18030.c @@ -49,7 +49,8 @@ const mbfl_encoding mbfl_encoding_gb18030 = { &vtbl_gb18030_wchar, &vtbl_wchar_gb18030, mb_gb18030_to_wchar, - mb_wchar_to_gb18030 + mb_wchar_to_gb18030, + NULL }; const struct mbfl_convert_vtbl vtbl_gb18030_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_htmlent.c b/ext/mbstring/libmbfl/filters/mbfilter_htmlent.c index afebdfd0081..a75a9c757cb 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_htmlent.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_htmlent.c @@ -66,7 +66,8 @@ const mbfl_encoding mbfl_encoding_html_ent = { &vtbl_html_wchar, &vtbl_wchar_html, mb_htmlent_to_wchar, - mb_wchar_to_htmlent + mb_wchar_to_htmlent, + NULL }; const 
struct mbfl_convert_vtbl vtbl_wchar_html = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_hz.c b/ext/mbstring/libmbfl/filters/mbfilter_hz.c index 72e5963acfc..b047bfc8b7b 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_hz.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_hz.c @@ -47,7 +47,8 @@ const mbfl_encoding mbfl_encoding_hz = { &vtbl_hz_wchar, &vtbl_wchar_hz, mb_hz_to_wchar, - mb_wchar_to_hz + mb_wchar_to_hz, + NULL }; const struct mbfl_convert_vtbl vtbl_hz_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_iso2022_jp_ms.c b/ext/mbstring/libmbfl/filters/mbfilter_iso2022_jp_ms.c index 65b6d66d2ec..e3676d30e29 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_iso2022_jp_ms.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_iso2022_jp_ms.c @@ -51,7 +51,8 @@ const mbfl_encoding mbfl_encoding_2022jpms = { &vtbl_2022jpms_wchar, &vtbl_wchar_2022jpms, mb_iso2022jpms_to_wchar, - mb_wchar_to_iso2022jpms + mb_wchar_to_iso2022jpms, + NULL }; const struct mbfl_convert_vtbl vtbl_2022jpms_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c b/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c index c4b2bf0b9f1..d51fd720e97 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c @@ -54,7 +54,8 @@ const mbfl_encoding mbfl_encoding_2022kr = { &vtbl_2022kr_wchar, &vtbl_wchar_2022kr, mb_iso2022kr_to_wchar, - mb_wchar_to_iso2022kr + mb_wchar_to_iso2022kr, + NULL }; const struct mbfl_convert_vtbl vtbl_wchar_2022kr = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_mobile.c b/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_mobile.c index 6792498b2c8..63d7c7b7f29 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_mobile.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_mobile.c @@ -71,7 +71,8 @@ const mbfl_encoding mbfl_encoding_2022jp_kddi = { &vtbl_2022jp_kddi_wchar, &vtbl_wchar_2022jp_kddi, mb_iso2022jp_kddi_to_wchar, - mb_wchar_to_iso2022jp_kddi + 
mb_wchar_to_iso2022jp_kddi, + NULL }; const struct mbfl_convert_vtbl vtbl_2022jp_kddi_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_jis.c b/ext/mbstring/libmbfl/filters/mbfilter_jis.c index fc5f18aeb5d..80af0e69564 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_jis.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_jis.c @@ -37,6 +37,8 @@ static int mbfl_filt_conv_jis_wchar_flush(mbfl_convert_filter *filter); static size_t mb_iso2022jp_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); static void mb_wchar_to_iso2022jp(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); static void mb_wchar_to_jis(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); +static bool mb_check_iso2022jp(unsigned char *in, size_t in_len); +static bool mb_check_jis(unsigned char *in, size_t in_len); const mbfl_encoding mbfl_encoding_jis = { mbfl_no_encoding_jis, @@ -49,6 +51,7 @@ const mbfl_encoding mbfl_encoding_jis = { &vtbl_wchar_jis, mb_iso2022jp_to_wchar, mb_wchar_to_jis, + mb_check_jis }; const mbfl_encoding mbfl_encoding_2022jp = { @@ -61,7 +64,8 @@ const mbfl_encoding mbfl_encoding_2022jp = { &vtbl_2022jp_wchar, &vtbl_wchar_2022jp, mb_iso2022jp_to_wchar, - mb_wchar_to_iso2022jp + mb_wchar_to_iso2022jp, + mb_check_iso2022jp }; const struct mbfl_convert_vtbl vtbl_jis_wchar = { @@ -780,3 +784,161 @@ static void mb_wchar_to_jis(uint32_t *in, size_t len, mb_convert_buf *buf, bool MB_CONVERT_BUF_STORE(buf, out, limit); } + +#define JISX_0201_KANA_SO 5 + +static bool mb_check_jis(unsigned char *in, size_t in_len) +{ + unsigned char *p = in, *e = p + in_len; + unsigned int state = ASCII; + + while (p < e) { + unsigned char c = *p++; + if (c == 0x1B) { + /* ESC seen; this is an escape sequence */ + if (state == JISX_0201_KANA_SO) { + return false; + } + if ((e - p) < 2) { + return false; + } + unsigned char c2 = *p++; + if (c2 == '$') { + unsigned char c3 = *p++; + if (c3 == '@' || c3 == 'B') { + state = JISX_0208; + } 
else if (c3 == '(') { + if (p == e) { + return false; + } + unsigned char c4 = *p++; + if (c4 == '@' || c4 == 'B') { + state = JISX_0208; + } else if (c4 == 'D') { + state = JISX_0212; + } else { + return false; + } + } else { + return false; + } + } else if (c2 == '(') { + unsigned char c3 = *p++; + /* ESC ( H is treated as a sequence transitioning to ASCII for historical reasons. + * see https://github.com/php/php-src/pull/10828#issuecomment-1478342432. */ + if (c3 == 'B' || c3 == 'H') { + state = ASCII; + } else if (c3 == 'J') { + state = JISX_0201_LATIN; + } else if (c3 == 'I') { + state = JISX_0201_KANA; + } else { + return false; + } + } else { + return false; + } + } else if (c == 0xE) { + /* "Kana In" marker */ + if (state != ASCII) { + return false; + } + state = JISX_0201_KANA_SO; + } else if (c == 0xF) { + /* "Kana Out" marker */ + if (state != JISX_0201_KANA_SO) { + return false; + } + state = ASCII; + } else if ((state == JISX_0208 || state == JISX_0212) && (c > 0x20 && c < 0x7F)) { + if (p == e) { + return false; + } + unsigned char c2 = *p++; + if (c2 > 0x20 && c2 < 0x7F) { + unsigned int s = (c - 0x21)*94 + c2 - 0x21; + if (state == JISX_0208) { + if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) { + continue; + } + } else { + if (s < jisx0212_ucs_table_size && jisx0212_ucs_table[s]) { + continue; + } + } + return false; + } else { + return false; + } + } else if (c < 0x80) { + continue; + } else if (c >= 0xA1 && c <= 0xDF) { + /* GR-invoked Kana */ + continue; + } else { + return false; + } + } + + return state == ASCII; +} + + +static bool mb_check_iso2022jp(unsigned char *in, size_t in_len) +{ + unsigned char *p = in, *e = p + in_len; + unsigned int state = ASCII; + + while (p < e) { + unsigned char c = *p++; + if (c == 0x1B) { + /* ESC seen; this is an escape sequence */ + if ((e - p) < 2) { + return false; + } + unsigned char c2 = *p++; + if (c2 == '$') { + unsigned char c3 = *p++; + if (c3 == '@' || c3 == 'B') { + state = JISX_0208; + 
} else { + return false; + } + } else if (c2 == '(') { + unsigned char c3 = *p++; + if (c3 == 'B') { + state = ASCII; + } else if (c3 == 'J') { + state = JISX_0201_LATIN; + } else { + return false; + } + } else { + return false; + } + } else if (c == 0xE || c == 0xF) { + /* "Kana In" or "Kana Out" marker; ISO-2022-JP is not accepted. */ + return false; + } else if (state == JISX_0208 && (c > 0x20 && c < 0x7F)) { + if (p == e) { + return false; + } + unsigned char c2 = *p++; + if (c2 > 0x20 && c2 < 0x7F) { + unsigned int s = (c - 0x21)*94 + c2 - 0x21; + if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) { + continue; + } + return false; + } else { + return false; + } + } else if (c < 0x80) { + continue; + } else { + return false; + } + } + + return state == ASCII; +} diff --git a/ext/mbstring/libmbfl/filters/mbfilter_qprint.c b/ext/mbstring/libmbfl/filters/mbfilter_qprint.c index 5fde30ee809..2bcddedede3 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_qprint.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_qprint.c @@ -46,7 +46,8 @@ const mbfl_encoding mbfl_encoding_qprint = { NULL, NULL, mb_qprint_to_wchar, - mb_wchar_to_qprint + mb_wchar_to_qprint, + NULL }; const struct mbfl_convert_vtbl vtbl_8bit_qprint = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c b/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c index 56c9b2dbc85..c5872335a85 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c @@ -86,7 +86,8 @@ static int mbfl_conv_reverselookup_table(int c, mbfl_convert_filter *filter, int &vtbl_##id##_wchar, \ &vtbl_wchar_##id, \ mb_##id##_to_wchar, \ - mb_wchar_to_##id \ + mb_wchar_to_##id, \ + NULL \ } /* For single-byte encodings which use a conversion table */ diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c index f23a8b08ace..59399bf7217 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c +++ 
b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c @@ -71,7 +71,8 @@ const mbfl_encoding mbfl_encoding_sjis = { &vtbl_sjis_wchar, &vtbl_wchar_sjis, mb_sjis_to_wchar, - mb_wchar_to_sjis + mb_wchar_to_sjis, + NULL }; const struct mbfl_convert_vtbl vtbl_sjis_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c index 737871eda8a..bc4d9321870 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c @@ -67,7 +67,8 @@ const mbfl_encoding mbfl_encoding_sjis2004 = { &vtbl_sjis2004_wchar, &vtbl_wchar_sjis2004, mb_sjis2004_to_wchar, - mb_wchar_to_sjis2004 + mb_wchar_to_sjis2004, + NULL }; const struct mbfl_convert_vtbl vtbl_sjis2004_wchar = { @@ -100,7 +101,8 @@ const mbfl_encoding mbfl_encoding_eucjp2004 = { &vtbl_eucjp2004_wchar, &vtbl_wchar_eucjp2004, mb_eucjp2004_to_wchar, - mb_wchar_to_eucjp2004 + mb_wchar_to_eucjp2004, + NULL }; const struct mbfl_convert_vtbl vtbl_eucjp2004_wchar = { @@ -133,7 +135,8 @@ const mbfl_encoding mbfl_encoding_2022jp_2004 = { &vtbl_2022jp_2004_wchar, &vtbl_wchar_2022jp_2004, mb_iso2022jp2004_to_wchar, - mb_wchar_to_iso2022jp2004 + mb_wchar_to_iso2022jp2004, + NULL }; const struct mbfl_convert_vtbl vtbl_2022jp_2004_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c index 0ff2a198d36..8fb569b36c4 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c @@ -71,7 +71,8 @@ const mbfl_encoding mbfl_encoding_sjis_mac = { &vtbl_sjis_mac_wchar, &vtbl_wchar_sjis_mac, mb_sjismac_to_wchar, - mb_wchar_to_sjismac + mb_wchar_to_sjismac, + NULL }; const struct mbfl_convert_vtbl vtbl_sjis_mac_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c index a13b56e1d15..f7140a9a6ce 100644 --- 
a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c @@ -78,7 +78,8 @@ const mbfl_encoding mbfl_encoding_sjis_docomo = { &vtbl_sjis_docomo_wchar, &vtbl_wchar_sjis_docomo, mb_sjis_docomo_to_wchar, - mb_wchar_to_sjis_docomo + mb_wchar_to_sjis_docomo, + NULL }; const mbfl_encoding mbfl_encoding_sjis_kddi = { @@ -91,7 +92,8 @@ const mbfl_encoding mbfl_encoding_sjis_kddi = { &vtbl_sjis_kddi_wchar, &vtbl_wchar_sjis_kddi, mb_sjis_kddi_to_wchar, - mb_wchar_to_sjis_kddi + mb_wchar_to_sjis_kddi, + NULL }; const mbfl_encoding mbfl_encoding_sjis_sb = { @@ -104,7 +106,8 @@ const mbfl_encoding mbfl_encoding_sjis_sb = { &vtbl_sjis_sb_wchar, &vtbl_wchar_sjis_sb, mb_sjis_sb_to_wchar, - mb_wchar_to_sjis_sb + mb_wchar_to_sjis_sb, + NULL }; const struct mbfl_convert_vtbl vtbl_sjis_docomo_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_ucs2.c b/ext/mbstring/libmbfl/filters/mbfilter_ucs2.c index 3e0d0828cfa..e6711d82f8a 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_ucs2.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_ucs2.c @@ -56,7 +56,8 @@ const mbfl_encoding mbfl_encoding_ucs2 = { &vtbl_ucs2_wchar, &vtbl_wchar_ucs2, mb_ucs2_to_wchar, - mb_wchar_to_ucs2be + mb_wchar_to_ucs2be, + NULL }; const mbfl_encoding mbfl_encoding_ucs2be = { @@ -69,7 +70,8 @@ const mbfl_encoding mbfl_encoding_ucs2be = { &vtbl_ucs2be_wchar, &vtbl_wchar_ucs2be, mb_ucs2be_to_wchar, - mb_wchar_to_ucs2be + mb_wchar_to_ucs2be, + NULL }; const mbfl_encoding mbfl_encoding_ucs2le = { @@ -82,7 +84,8 @@ const mbfl_encoding mbfl_encoding_ucs2le = { &vtbl_ucs2le_wchar, &vtbl_wchar_ucs2le, mb_ucs2le_to_wchar, - mb_wchar_to_ucs2le + mb_wchar_to_ucs2le, + NULL }; const struct mbfl_convert_vtbl vtbl_ucs2_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_ucs4.c b/ext/mbstring/libmbfl/filters/mbfilter_ucs4.c index 90312b8d501..410be0ace74 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_ucs4.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_ucs4.c 
@@ -56,7 +56,8 @@ const mbfl_encoding mbfl_encoding_ucs4 = { &vtbl_ucs4_wchar, &vtbl_wchar_ucs4, mb_ucs4_to_wchar, - mb_wchar_to_ucs4be + mb_wchar_to_ucs4be, + NULL }; const mbfl_encoding mbfl_encoding_ucs4be = { @@ -69,7 +70,8 @@ const mbfl_encoding mbfl_encoding_ucs4be = { &vtbl_ucs4be_wchar, &vtbl_wchar_ucs4be, mb_ucs4be_to_wchar, - mb_wchar_to_ucs4be + mb_wchar_to_ucs4be, + NULL }; const mbfl_encoding mbfl_encoding_ucs4le = { @@ -82,7 +84,8 @@ const mbfl_encoding mbfl_encoding_ucs4le = { &vtbl_ucs4le_wchar, &vtbl_wchar_ucs4le, mb_ucs4le_to_wchar, - mb_wchar_to_ucs4le + mb_wchar_to_ucs4le, + NULL }; const struct mbfl_convert_vtbl vtbl_ucs4_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_uhc.c b/ext/mbstring/libmbfl/filters/mbfilter_uhc.c index 2ac351d644c..644e0b063d9 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_uhc.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_uhc.c @@ -71,7 +71,8 @@ const mbfl_encoding mbfl_encoding_uhc = { &vtbl_uhc_wchar, &vtbl_wchar_uhc, mb_uhc_to_wchar, - mb_wchar_to_uhc + mb_wchar_to_uhc, + NULL }; const struct mbfl_convert_vtbl vtbl_uhc_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c index eddd56f3627..2a7d98721df 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c @@ -49,7 +49,8 @@ const mbfl_encoding mbfl_encoding_utf16 = { &vtbl_utf16_wchar, &vtbl_wchar_utf16, mb_utf16_to_wchar, - mb_wchar_to_utf16be + mb_wchar_to_utf16be, + NULL }; const mbfl_encoding mbfl_encoding_utf16be = { @@ -62,7 +63,8 @@ const mbfl_encoding mbfl_encoding_utf16be = { &vtbl_utf16be_wchar, &vtbl_wchar_utf16be, mb_utf16be_to_wchar, - mb_wchar_to_utf16be + mb_wchar_to_utf16be, + NULL }; const mbfl_encoding mbfl_encoding_utf16le = { @@ -75,7 +77,8 @@ const mbfl_encoding mbfl_encoding_utf16le = { &vtbl_utf16le_wchar, &vtbl_wchar_utf16le, mb_utf16le_to_wchar, - mb_wchar_to_utf16le + mb_wchar_to_utf16le, + NULL }; const 
struct mbfl_convert_vtbl vtbl_utf16_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf32.c b/ext/mbstring/libmbfl/filters/mbfilter_utf32.c index e8cd4ad454f..58551c8b393 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf32.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf32.c @@ -49,7 +49,8 @@ const mbfl_encoding mbfl_encoding_utf32 = { &vtbl_utf32_wchar, &vtbl_wchar_utf32, mb_utf32_to_wchar, - mb_wchar_to_utf32be + mb_wchar_to_utf32be, + NULL }; const mbfl_encoding mbfl_encoding_utf32be = { @@ -62,7 +63,8 @@ const mbfl_encoding mbfl_encoding_utf32be = { &vtbl_utf32be_wchar, &vtbl_wchar_utf32be, mb_utf32be_to_wchar, - mb_wchar_to_utf32be + mb_wchar_to_utf32be, + NULL }; const mbfl_encoding mbfl_encoding_utf32le = { @@ -75,7 +77,8 @@ const mbfl_encoding mbfl_encoding_utf32le = { &vtbl_utf32le_wchar, &vtbl_wchar_utf32le, mb_utf32le_to_wchar, - mb_wchar_to_utf32le + mb_wchar_to_utf32le, + NULL }; const struct mbfl_convert_vtbl vtbl_utf32_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf7.c b/ext/mbstring/libmbfl/filters/mbfilter_utf7.c index f5fe261f69d..57641a4bbe4 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf7.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf7.c @@ -29,10 +29,12 @@ #include "mbfilter.h" #include "mbfilter_utf7.h" +#include "utf7_helper.h" static int mbfl_filt_conv_utf7_wchar_flush(mbfl_convert_filter *filter); static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); static void mb_wchar_to_utf7(uint32_t *in, size_t len, mb_convert_buf *buf, bool end); +static bool mb_check_utf7(unsigned char *in, size_t in_len); static const unsigned char mbfl_base64_table[] = { /* 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', */ @@ -59,7 +61,8 @@ const mbfl_encoding mbfl_encoding_utf7 = { &vtbl_utf7_wchar, &vtbl_wchar_utf7, mb_utf7_to_wchar, - mb_wchar_to_utf7 + mb_wchar_to_utf7, + mb_check_utf7 }; const struct mbfl_convert_vtbl 
vtbl_utf7_wchar = { @@ -408,16 +411,24 @@ int mbfl_filt_conv_wchar_utf7_flush(mbfl_convert_filter *filter) return 0; } -/* Ways which a Base64-encoded section can end: */ -#define DASH 0xFD -#define ASCII 0xFE -#define ILLEGAL 0xFF - static inline bool is_base64_end(unsigned char c) { return c >= DASH; } +static bool is_optional_direct(unsigned char c) +{ + /* Characters that are allowed to be encoded by Base64 or directly encoded */ + return c == '!' || c == '"' || c == '#' || c == '$' || c == '%' || c == '&' || c == '*' || c == ';' || c == '<' || + c == '=' || c == '>' || c == '@' || c == '[' || c == ']' || c == '^' || c == '_' || c == '`' || c == '{' || + c == '|' || c == '}'; +} + +static bool can_end_base64(uint32_t c) +{ + return c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\'' || c == '(' || c == ')' || c == ',' || c == '.' || c == ':' || c == '?'; +} + static unsigned char decode_base64(unsigned char c) { if (c >= 'A' && c <= 'Z') { @@ -432,6 +443,8 @@ static unsigned char decode_base64(unsigned char c) return 63; } else if (c == '-') { return DASH; + } else if (can_end_base64(c) || is_optional_direct(c) || c == '\0') { + return DIRECT; } else if (c <= 0x7F) { return ASCII; } @@ -470,7 +483,7 @@ static uint32_t* handle_base64_end(unsigned char n, unsigned char **p, uint32_t if (n == ILLEGAL) { *out++ = MBFL_BAD_INPUT; - } else if (n == ASCII) { + } else if (n == DIRECT || n == ASCII) { (*p)--; /* Unconsume byte */ } @@ -596,11 +609,6 @@ static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf return out - buf; } -static bool can_end_base64(uint32_t c) -{ - return c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\'' || c == '(' || c == ')' || c == ',' || c == '.' 
|| c == ':' || c == '?'; -} - static bool should_direct_encode(uint32_t c) { return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '\0' || c == '/' || c == '-' || can_end_base64(c); @@ -700,3 +708,129 @@ static void mb_wchar_to_utf7(uint32_t *in, size_t len, mb_convert_buf *buf, bool MB_CONVERT_BUF_STORE(buf, out, limit); } + +static bool is_utf16_cp_valid(uint16_t cp, bool is_surrogate) +{ + if (is_surrogate) { + return cp >= 0xDC00 && cp <= 0xDFFF; + } else { + /* 2nd part of surrogate pair came unexpectedly */ + return !(cp >= 0xDC00 && cp <= 0xDFFF); + } +} + +static bool can_encode_directly(unsigned char c) +{ + return should_direct_encode(c) || is_optional_direct(c) || c == '\0'; +} + +static bool mb_check_utf7(unsigned char *in, size_t in_len) +{ + unsigned char *p = in, *e = p + in_len; + bool base64 = false; + bool is_surrogate = false; + + while (p < e) { + if (base64) { + unsigned char n1 = decode_base64(*p++); + if (is_base64_end(n1)) { + if (!is_base64_end_valid(n1, false, is_surrogate)) { + return false; + } + base64 = false; + continue; + } else if (p == e) { + return false; + } + unsigned char n2 = decode_base64(*p++); + if (is_base64_end(n2) || p == e) { + return false; + } + unsigned char n3 = decode_base64(*p++); + if (is_base64_end(n3)) { + return false; + } + uint16_t cp1 = (n1 << 10) | (n2 << 4) | ((n3 & 0x3C) >> 2); + if (!is_utf16_cp_valid(cp1, is_surrogate)) { + return false; + } + is_surrogate = has_surrogate(cp1, is_surrogate); + if (p == e) { + /* It is an error if trailing padding bits are not zeroes or if we were + * expecting the 2nd part of a surrogate pair when Base64 section ends */ + return !((n3 & 0x3) || is_surrogate); + } + + unsigned char n4 = decode_base64(*p++); + if (is_base64_end(n4)) { + if (!is_base64_end_valid(n4, n3 & 0x3, is_surrogate)) { + return false; + } + base64 = false; + continue; + } else if (p == e) { + return false; + } + unsigned char n5 = decode_base64(*p++); + if 
(is_base64_end(n5) || p == e) { + return false; + } + unsigned char n6 = decode_base64(*p++); + if (is_base64_end(n6)) { + return false; + } + uint16_t cp2 = (n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4); + if (!is_utf16_cp_valid(cp2, is_surrogate)) { + return false; + } + is_surrogate = has_surrogate(cp2, is_surrogate); + if (p == e) { + return !((n6 & 0xF) || is_surrogate); + } + + unsigned char n7 = decode_base64(*p++); + if (is_base64_end(n7)) { + if (!is_base64_end_valid(n7, n6 & 0xF, is_surrogate)) { + return false; + } + base64 = false; + continue; + } else if (p == e) { + return false; + } + unsigned char n8 = decode_base64(*p++); + if (is_base64_end(n8)) { + return false; + } + uint16_t cp3 = (n6 << 12) | (n7 << 6) | n8; + if (!is_utf16_cp_valid(cp3, is_surrogate)) { + return false; + } + is_surrogate = has_surrogate(cp3, is_surrogate); + } else { + /* ASCII text section */ + unsigned char c = *p++; + + if (c == '+') { + if (p == e) { + base64 = true; + return !is_surrogate; + } + unsigned char n = decode_base64(*p); + if (n == DASH) { + p++; + } else if (n > DASH) { + /* If a "+" character followed immediately by any character other than base64 or "-" */ + return false; + } else { + base64 = true; + } + } else if (can_encode_directly(c)) { + continue; + } else { + return false; + } + } + } + return !is_surrogate; +} diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c b/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c index 850edfbd63a..77b65bbeee8 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c @@ -77,11 +77,13 @@ #include "mbfilter.h" #include "mbfilter_utf7imap.h" +#include "utf7_helper.h" static int mbfl_filt_conv_wchar_utf7imap_flush(mbfl_convert_filter *filter); static int mbfl_filt_conv_utf7imap_wchar_flush(mbfl_convert_filter *filter); static size_t mb_utf7imap_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state); 
static void mb_wchar_to_utf7imap(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
+static bool mb_check_utf7imap(unsigned char *in, size_t in_len);
 
 static const char *mbfl_encoding_utf7imap_aliases[] = {"mUTF-7", NULL};
 
@@ -95,7 +97,8 @@ const mbfl_encoding mbfl_encoding_utf7imap = {
 	&vtbl_utf7imap_wchar,
 	&vtbl_wchar_utf7imap,
 	mb_utf7imap_to_wchar,
-	mb_wchar_to_utf7imap
+	mb_wchar_to_utf7imap,
+	mb_check_utf7imap
 };
 
 const struct mbfl_convert_vtbl vtbl_utf7imap_wchar = {
@@ -444,10 +447,6 @@ static int mbfl_filt_conv_wchar_utf7imap_flush(mbfl_convert_filter *filter)
 	return 0;
 }
 
-/* Ways which a Base64-encoded section can end: */
-#define DASH 0xFE
-#define ILLEGAL 0xFF
-
 static inline bool is_base64_end(unsigned char c)
 {
 	return c >= DASH;
@@ -732,3 +731,139 @@ static void mb_wchar_to_utf7imap(uint32_t *in, size_t len, mb_convert_buf *buf,
 
 	MB_CONVERT_BUF_STORE(buf, out, limit);
 }
+
+/* Check a UTF-16 code unit decoded from a Base64 section. After a high
+ * surrogate only a low surrogate (0xDC00-0xDFFF) is accepted. Elsewhere a
+ * low surrogate is rejected, and so is every code unit in 0x20-0x7E except
+ * '&' (the ASCII text section accepts those bytes literally). */
+static bool is_utf16_cp_valid(uint16_t cp, bool is_surrogate)
+{
+	if (is_surrogate) {
+		return cp >= 0xDC00 && cp <= 0xDFFF;
+	} else if (cp >= 0xDC00 && cp <= 0xDFFF) {
+		/* 2nd part of surrogate pair came unexpectedly */
+		return false;
+	} else if (cp >= 0x20 && cp <= 0x7E && cp != '&') {
+		/* Printable ASCII other than '&' is not accepted from Base64 */
+		return false;
+	}
+	return true;
+}
+
+/* Strict validator for UTF7-IMAP: returns true only if all `in_len` bytes at
+ * `in` are well formed. Outside Base64 only bytes 0x20-0x7E are accepted; an
+ * '&' must not be the last byte nor be followed by a byte that
+ * decode_base64() reports as ILLEGAL. The input must not end while a Base64
+ * section is still open. */
+static bool mb_check_utf7imap(unsigned char *in, size_t in_len)
+{
+	unsigned char *p = in, *e = p + in_len;
+	bool base64 = false;
+	bool is_surrogate = false;
+
+	while (p < e) {
+		if (base64) {
+			/* Base64 section */
+			/* 1st code unit of the group: 6 + 6 + 4 bits from n1, n2, n3 */
+			unsigned char n1 = decode_base64(*p++);
+			if (is_base64_end(n1)) {
+				if (!is_base64_end_valid(n1, false, is_surrogate)) {
+					return false;
+				}
+				base64 = false;
+				continue;
+			} else if (p == e) {
+				return false;
+			}
+			unsigned char n2 = decode_base64(*p++);
+			if (is_base64_end(n2) || p == e) {
+				return false;
+			}
+			unsigned char n3 = decode_base64(*p++);
+			if (is_base64_end(n3)) {
+				return false;
+			}
+			uint16_t cp1 = (n1 << 10) | (n2 << 4) | ((n3 & 0x3C) >> 2);
+			if (!is_utf16_cp_valid(cp1, is_surrogate)) {
+				return false;
+			}
+			is_surrogate = has_surrogate(cp1, is_surrogate);
+			if (p == e) {
+				/* Input must not end inside a Base64 section */
+				return false;
+			}
+
+			/* 2nd code unit: 2 + 6 + 6 + 2 bits from n3, n4, n5, n6 */
+			unsigned char n4 = decode_base64(*p++);
+			if (is_base64_end(n4)) {
+				if (!is_base64_end_valid(n4, n3 & 0x3, is_surrogate)) {
+					return false;
+				}
+				base64 = false;
+				continue;
+			} else if (p == e) {
+				return false;
+			}
+			unsigned char n5 = decode_base64(*p++);
+			if (is_base64_end(n5) || p == e) {
+				return false;
+			}
+			unsigned char n6 = decode_base64(*p++);
+			if (is_base64_end(n6)) {
+				return false;
+			}
+			uint16_t cp2 = (n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4);
+			if (!is_utf16_cp_valid(cp2, is_surrogate)) {
+				return false;
+			}
+			is_surrogate = has_surrogate(cp2, is_surrogate);
+			if (p == e) {
+				return false;
+			}
+
+			/* 3rd code unit: 4 + 6 + 6 bits from n6, n7, n8 */
+			unsigned char n7 = decode_base64(*p++);
+			if (is_base64_end(n7)) {
+				if (!is_base64_end_valid(n7, n6 & 0xF, is_surrogate)) {
+					return false;
+				}
+				base64 = false;
+				continue;
+			} else if (p == e) {
+				return false;
+			}
+			unsigned char n8 = decode_base64(*p++);
+			if (is_base64_end(n8)) {
+				return false;
+			}
+			uint16_t cp3 = (n6 << 12) | (n7 << 6) | n8;
+			if (!is_utf16_cp_valid(cp3, is_surrogate)) {
+				return false;
+			}
+			is_surrogate = has_surrogate(cp3, is_surrogate);
+		} else {
+			/* ASCII text section */
+			unsigned char c = *p++;
+
+			if (c == '&') {
+				if (p == e) {
+					return false;
+				}
+				unsigned char n = decode_base64(*p);
+				if (n == DASH) {
+					/* DASH right after '&': consume it and stay in ASCII text */
+					p++;
+				} else if (n == ILLEGAL) {
+					return false;
+				} else {
+					base64 = true;
+				}
+			} else if (c >= 0x20 && c <= 0x7E) {
+				continue;
+			} else {
+				return false;
+			}
+		}
+	}
+	return !base64;
+}
diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c
index 6c7bad0e805..44df3ab4257 100644
--- a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c
+++ b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c
@@ -64,7 +64,8 @@ const mbfl_encoding mbfl_encoding_utf8 = {
 	&vtbl_utf8_wchar,
 	&vtbl_wchar_utf8,
 	mb_utf8_to_wchar,
-	mb_wchar_to_utf8
+	mb_wchar_to_utf8,
+	NULL
 };
 
 const struct
mbfl_convert_vtbl vtbl_utf8_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf8_mobile.c b/ext/mbstring/libmbfl/filters/mbfilter_utf8_mobile.c index c573ec70f3b..59e2676208b 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf8_mobile.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf8_mobile.c @@ -63,7 +63,8 @@ const mbfl_encoding mbfl_encoding_utf8_docomo = { &vtbl_utf8_docomo_wchar, &vtbl_wchar_utf8_docomo, mb_utf8_docomo_to_wchar, - mb_wchar_to_utf8_docomo + mb_wchar_to_utf8_docomo, + NULL }; const mbfl_encoding mbfl_encoding_utf8_kddi_a = { @@ -76,7 +77,8 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_a = { &vtbl_utf8_kddi_a_wchar, &vtbl_wchar_utf8_kddi_a, mb_utf8_kddi_a_to_wchar, - mb_wchar_to_utf8_kddi_a + mb_wchar_to_utf8_kddi_a, + NULL }; const mbfl_encoding mbfl_encoding_utf8_kddi_b = { @@ -89,7 +91,8 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_b = { &vtbl_utf8_kddi_b_wchar, &vtbl_wchar_utf8_kddi_b, mb_utf8_kddi_b_to_wchar, - mb_wchar_to_utf8_kddi_b + mb_wchar_to_utf8_kddi_b, + NULL }; const mbfl_encoding mbfl_encoding_utf8_sb = { @@ -102,7 +105,8 @@ const mbfl_encoding mbfl_encoding_utf8_sb = { &vtbl_utf8_sb_wchar, &vtbl_wchar_utf8_sb, mb_utf8_sb_to_wchar, - mb_wchar_to_utf8_sb + mb_wchar_to_utf8_sb, + NULL }; const struct mbfl_convert_vtbl vtbl_utf8_docomo_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_uuencode.c b/ext/mbstring/libmbfl/filters/mbfilter_uuencode.c index cc90997c2fc..83a56977d3e 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_uuencode.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_uuencode.c @@ -43,7 +43,8 @@ const mbfl_encoding mbfl_encoding_uuencode = { NULL, NULL, mb_uuencode_to_wchar, - mb_wchar_to_uuencode + mb_wchar_to_uuencode, + NULL }; const struct mbfl_convert_vtbl vtbl_uuencode_8bit = { diff --git a/ext/mbstring/libmbfl/filters/utf7_helper.h b/ext/mbstring/libmbfl/filters/utf7_helper.h new file mode 100644 index 00000000000..0e71a5a4490 --- /dev/null +++ 
b/ext/mbstring/libmbfl/filters/utf7_helper.h
@@ -0,0 +1,33 @@
+#ifndef MBFL_UTF7_HELPER_H
+#define MBFL_UTF7_HELPER_H
+
+#include "mbfilter.h"
+
+/* Ways which a Base64-encoded section can end: */
+#define DASH 0xFC
+#define DIRECT 0xFD
+#define ASCII 0xFE
+#define ILLEGAL 0xFF
+
+/* Decide whether a Base64 section may end at terminator code `n`.
+ * `gap` is true when left-over padding bits were non-zero; `is_surrogate`
+ * is true when a high surrogate still lacks its low surrogate. */
+static inline bool is_base64_end_valid(unsigned char n, bool gap, bool is_surrogate)
+{
+	if (gap || is_surrogate) {
+		return false;
+	}
+	return n != ASCII && n != ILLEGAL;
+}
+
+/* True when `cp` is a high surrogate (0xD800-0xDBFF) that opens a new pair,
+ * i.e. it does not itself follow a high surrogate. */
+static inline bool has_surrogate(uint16_t cp, bool is_surrogate)
+{
+	if (is_surrogate) {
+		return false;
+	}
+	return cp >= 0xD800 && cp <= 0xDBFF;
+}
+
+#endif /* MBFL_UTF7_HELPER_H */
diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter.c b/ext/mbstring/libmbfl/mbfl/mbfilter.c
index e0cfa13e0b4..5f5ce07ce6f 100644
--- a/ext/mbstring/libmbfl/mbfl/mbfilter.c
+++ b/ext/mbstring/libmbfl/mbfl/mbfilter.c
@@ -312,6 +312,16 @@ int mbfl_encoding_detector_feed(mbfl_encoding_detector *identd, mbfl_string *str
 	unsigned char *p = string->val;
 	int bad = 0;
 
+	if (identd->strict) {
+		for (int i = 0; i < num; i++) {
+			mbfl_convert_filter *filter = identd->filter_list[i];
+			mbfl_encoding_detector_data *data = &identd->filter_data[i];
+			if (filter->from->check != NULL && !(filter->from->check)(p, n)) {
+				data->num_illegalchars++;
+			}
+		}
+	}
+
 	while (n--) {
 		for (int i = 0; i < num; i++) {
 			mbfl_convert_filter *filter = identd->filter_list[i];
diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c b/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c
index 8fe51c9fd4c..43db2f7f5b2 100644
--- a/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c
+++ b/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c
@@ -51,7 +51,8 @@ const mbfl_encoding mbfl_encoding_8bit = {
 	&vtbl_8bit_wchar,
 	&vtbl_wchar_8bit,
 	mb_8bit_to_wchar,
-	mb_wchar_to_8bit
+	mb_wchar_to_8bit,
+	NULL
 };
 
 const struct mbfl_convert_vtbl vtbl_8bit_wchar = {
diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter_pass.c b/ext/mbstring/libmbfl/mbfl/mbfilter_pass.c
index 3fb7e991141..b932603e1c5 100644
--- a/ext/mbstring/libmbfl/mbfl/mbfilter_pass.c
+++
b/ext/mbstring/libmbfl/mbfl/mbfilter_pass.c @@ -44,6 +44,7 @@ const mbfl_encoding mbfl_encoding_pass = { NULL, NULL, NULL, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter_wchar.c b/ext/mbstring/libmbfl/mbfl/mbfilter_wchar.c index 5472d792a83..2bd9cca7b5b 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter_wchar.c +++ b/ext/mbstring/libmbfl/mbfl/mbfilter_wchar.c @@ -42,5 +42,6 @@ const mbfl_encoding mbfl_encoding_wchar = { NULL, NULL, NULL, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h index e5ae285098e..f66e85acd8a 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h +++ b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h @@ -143,6 +143,7 @@ typedef struct { typedef size_t (*mb_to_wchar_fn)(unsigned char **in, size_t *in_len, uint32_t *out, size_t out_len, unsigned int *state); typedef void (*mb_from_wchar_fn)(uint32_t *in, size_t in_len, mb_convert_buf *out, bool end); +typedef bool (*mb_check_fn)(unsigned char *in, size_t in_len); /* When converting encoded text to a buffer of wchars (Unicode codepoints) using `mb_to_wchar_fn`, * the buffer must be at least this size (to work with all supported text encodings) */ @@ -232,6 +233,7 @@ typedef struct { const struct mbfl_convert_vtbl *output_filter; mb_to_wchar_fn to_wchar; mb_from_wchar_fn from_wchar; + mb_check_fn check; } mbfl_encoding; MBFLAPI extern const mbfl_encoding *mbfl_name2encoding(const char *name); diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index e7b056d87c8..5aa25b57f01 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -4415,6 +4415,10 @@ MBSTRING_API int php_mb_check_encoding(const char *input, size_t length, const m unsigned char *in = (unsigned char*)input; unsigned int state = 0; + if (encoding->check != NULL) { + return encoding->check(in, length); + } + /* If the input string is not encoded in the given encoding, there is a significant chance * that this will be seen in the first 
bytes. Therefore, rather than converting an entire * buffer of 128 codepoints, convert and check just a few codepoints first */ diff --git a/ext/mbstring/tests/gh10192_utf7.phpt b/ext/mbstring/tests/gh10192_utf7.phpt new file mode 100644 index 00000000000..2930942c12c --- /dev/null +++ b/ext/mbstring/tests/gh10192_utf7.phpt @@ -0,0 +1,542 @@ +--TEST-- +GH-10192 (mb_detect_encoding() results for UTF-7 differ between PHP 8.0 and 8.1) +--EXTENSIONS-- +mbstring +--FILE-- +<?php + +$testcases = [ + 'non-base64 character after +' => 'A + B', + 'non-base64 character after -' => 'A - B', + 'base64 character before +' => 'A 1+ B', + 'base64 character before -' => 'A 1- B', + 'base64 character after +' => 'A +1 B', + 'base64 character after -' => 'A -1 B', + 'base64 character before and after +' => 'A 1+1 B', + 'base64 character before and after -' => 'A 1-1 B', + 'string ends with +' => 'A +', + 'string ends with -' => 'A -', + '+ and -' => 'A +- B', + '- and +' => 'A -+ B', + 'valid direct encoding character =' => 'A = B', + 'invalid direct encoding character ~' => 'A ~ B', + 'invalid direct encoding character \\' => 'A \\ B', + 'invalid direct encoding character ESC' => "A \x1b B", + 'valid direct encoding character = after +' => 'A += B', + 'invalid direct encoding character ~ after +' => 'A +~ B', + 'invalid direct encoding character \\ after +' => 'A +\\ B', + 'invalid direct encoding character ESC after +' => "A +\x1b B", + 'valid base64 character between + and -' => 'A +ZeVnLIqe- B', // 日本語 in UTF-16BE + 'invalid base64 character between + and -' => 'A +ZeVnLIq- B', // 日本語 in UTF-16BE without the last character + 'valid base64 character between + and non-base64 character' => 'A +ZeVnLIqe B', + 'invalid base64 character between + and non-base64 character' => 'A +ZeVnLIq B', + 'valid base64 character between + and base64 character' => 'A +ZeVnLIqe1 B', + 'invalid base64 character between + and base64 character' => 'A +ZeVnLIq1 B', + 'valid base64 character between + and end of string' => 'A +ZeVnLIqe', + 'invalid base64
character between + and end of string' => 'A +ZeVnLIq', + 'valid base64 character consisting only of + between + and -' => 'A +++++++++- B', + 'invalid base64 character consisting only of + between + and -' => 'A +++++++++- B', + 'valid base64 character consisting only of + between + and non-base64 character' => 'A +++++++++ B', + 'invalid base64 character consisting only of + between + and non-base64 character' => 'A +++++++++ B', + 'valid base64 character consisting only of + between + and base64 character' => 'A +++++++++1 B', + 'invalid base64 character consisting only of + between + and base64 character' => 'A +++++++++1 B', + 'valid base64 character consisting only of + between + and end of string' => 'A +++++++++', + 'invalid base64 character consisting only of + between + and end of string' => 'A +++++++++', + 'valid base64 character using surrogate pair between + and -' => 'A +2GfePQ- B', // 𩸽 in UTF-16BE + 'first 16 bits of base64 character using surrogate pair between + and -' => 'A +2Gc- B', // first 16 bits of 𩸽 in UTF-16BE + 'valid base64 character using surrogate pair between + and non-base64 character' => 'A +2GfePQ B', + 'first 16 bits of base64 character using surrogate pair between + and non-base64 character' => 'A +2Gc B', + 'valid base64 character using surrogate pair between + and base64 character' => 'A +2GfePQ1 B', + 'first 16 bits of base64 character using surrogate pair between + and base64 character' => 'A +2Gc1 B', + 'valid base64 character using surrogate pair between + and end of string' => 'A +2GfePQ', + 'first 16 bits of base64 character using surrogate pair between + and end of string' => 'A +2Gc', + 'invalid base64 character using surrogate pair in reverse order between + and -' => 'A +3j3YZw- B', // 𩸽 in reverse order in UTF-16BE + 'last 16 bits of base64 character using surrogate pair in reverse order between + and -' => 'A +3j0- B', // last 16 bits of 𩸽 in UTF-16BE + 'invalid base64 character using surrogate pair in reverse 
order between + and non-base64 character' => 'A +3j3YZw B', + 'last 16 bits of base64 character using surrogate pair in reverse order between + and non-base64 character' => 'A +3j0 B', + 'invalid base64 character using surrogate pair in reverse order between + and base64 character' => 'A +3j3YZw1 B', + 'last 16 bits of base64 character using surrogate pair in reverse order between + and base64 character' => 'A +3j01 B', + 'invalid base64 character using surrogate pair in reverse order between + and end of string' => 'A +3j3YZw', + 'last 16 bits of base64 character using surrogate pair in reverse order between + and end of string' => 'A +3j0' +]; + +foreach ($testcases as $title => $case) { + echo $title . PHP_EOL; + var_dump(mb_detect_encoding($case, 'UTF-8, UTF-7', true)); + var_dump(mb_detect_encoding($case, 'UTF-8, UTF-7', false)); + var_dump(mb_detect_encoding($case, 'UTF-7', true)); + var_dump(mb_detect_encoding($case, 'UTF-7', false)); + var_dump(mb_check_encoding($case, 'UTF-7')); + var_dump(addcslashes(mb_convert_encoding($case, 'UTF-8', 'UTF-7'), "\0..\37\177")); + var_dump(mb_get_info('illegal_chars')); + echo PHP_EOL; +} +?> +--EXPECT-- +non-base64 character after + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(4) "A B" +int(0) + +non-base64 character after - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(5) "A - B" +int(0) + +base64 character before + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A 1 B" +int(0) + +base64 character before - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(6) "A 1- B" +int(0) + +base64 character after + +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ? 
B" +int(1) + +base64 character after - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(6) "A -1 B" +int(1) + +base64 character before and after + +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(6) "A 1? B" +int(2) + +base64 character before and after - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(7) "A 1-1 B" +int(2) + +string ends with + +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(2) "A " +int(2) + +string ends with - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(3) "A -" +int(2) + ++ and - +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(5) "A + B" +int(2) + +- and + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A - B" +int(2) + +valid direct encoding character = +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(5) "A = B" +int(2) + +invalid direct encoding character ~ +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ~ B" +int(2) + +invalid direct encoding character \ +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A \ B" +int(2) + +invalid direct encoding character ESC +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(8) "A \033 B" +int(2) + +valid direct encoding character = after + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A = B" +int(2) + +invalid direct encoding character ~ after + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ~ B" +int(2) + +invalid direct encoding character \ after + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) 
+string(5) "UTF-7" +bool(false) +string(5) "A \ B" +int(2) + +invalid direct encoding character ESC after + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(8) "A \033 B" +int(2) + +valid base64 character between + and - +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A 日本語 B" +int(2) + +invalid base64 character between + and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(11) "A 日本? B" +int(3) + +valid base64 character between + and non-base64 character +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A 日本語 B" +int(3) + +invalid base64 character between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(11) "A 日本? B" +int(4) + +valid base64 character between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(14) "A 日本語? B" +int(5) + +invalid base64 character between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A 日本誵 B" +int(5) + +valid base64 character between + and end of string +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(11) "A 日本語" +int(5) + +invalid base64 character between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(9) "A 日本?" 
+int(6) + +valid base64 character consisting only of + between + and - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A ﯯ뻻 B" +int(6) + +invalid base64 character consisting only of + between + and - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A ﯯ뻻 B" +int(6) + +valid base64 character consisting only of + between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A ﯯ뻻 B" +int(6) + +invalid base64 character consisting only of + between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A ﯯ뻻 B" +int(6) + +valid base64 character consisting only of + between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(14) "A ﯯ뻻? B" +int(7) + +invalid base64 character consisting only of + between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(14) "A ﯯ뻻? B" +int(8) + +valid base64 character consisting only of + between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(11) "A ﯯ뻻" +int(8) + +invalid base64 character consisting only of + between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(11) "A ﯯ뻻" +int(8) + +valid base64 character using surrogate pair between + and - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(8) "A 𩸽 B" +int(8) + +first 16 bits of base64 character using surrogate pair between + and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ? 
B" +int(9) + +valid base64 character using surrogate pair between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(8) "A 𩸽 B" +int(9) + +first 16 bits of base64 character using surrogate pair between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ? B" +int(10) + +valid base64 character using surrogate pair between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(9) "A 𩸽? B" +int(11) + +first 16 bits of base64 character using surrogate pair between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ? B" +int(12) + +valid base64 character using surrogate pair between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(6) "A 𩸽" +int(12) + +first 16 bits of base64 character using surrogate pair between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(3) "A ?" +int(13) + +invalid base64 character using surrogate pair in reverse order between + and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(6) "A ?? B" +int(15) + +last 16 bits of base64 character using surrogate pair in reverse order between + and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ? B" +int(16) + +invalid base64 character using surrogate pair in reverse order between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(6) "A ?? 
B" +int(18) + +last 16 bits of base64 character using surrogate pair in reverse order between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ? B" +int(19) + +invalid base64 character using surrogate pair in reverse order between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(6) "A ?? B" +int(21) + +last 16 bits of base64 character using surrogate pair in reverse order between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(6) "A ?? B" +int(23) + +invalid base64 character using surrogate pair in reverse order between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(4) "A ??" +int(25) + +last 16 bits of base64 character using surrogate pair in reverse order between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(3) "A ?" 
+int(26) diff --git a/ext/mbstring/tests/gh10192_utf7imap.phpt b/ext/mbstring/tests/gh10192_utf7imap.phpt new file mode 100644 index 00000000000..c4f50884f6d --- /dev/null +++ b/ext/mbstring/tests/gh10192_utf7imap.phpt @@ -0,0 +1,423 @@ +--TEST-- +GH-10192 (mb_detect_encoding() results for UTF-7 differ between PHP 8.0 and 8.1) +--EXTENSIONS-- +mbstring +--FILE-- +<?php + +$testcases = [ + 'non-base64 character after &' => 'A & B', + 'non-base64 character after -' => 'A - B', + 'base64 character before &' => 'A 1& B', + 'base64 character before -' => 'A 1- B', + 'base64 character after &' => 'A &1 B', + 'base64 character after -' => 'A -1 B', + 'base64 character before and after &' => 'A 1&1 B', + 'base64 character before and after -' => 'A 1-1 B', + 'string ends with &' => 'A &', + 'string ends with -' => 'A -', + '& and -' => 'A &- B', + '- and &' => 'A -& B', + 'valid direct encoding character ~' => 'A ~ B', + 'invalid direct encoding character ESC' => "A \x1b B", + 'valid direct encoding character ~ after &' => 'A &~ B', + 'invalid direct encoding character ESC after &' => "A &\x1b B", + 'valid base64 character between & and -' => 'A &ZeVnLIqe- B', // 日本語 in UTF-16BE + 'invalid base64 character between & and -' => 'A &ZeVnLIq- B', // 日本語 in UTF-16BE without the last character + 'valid base64 character between & and non-base64 character' => 'A &ZeVnLIqe B', + 'invalid base64 character between & and non-base64 character' => 'A &ZeVnLIq B', + 'valid base64 character between & and base64 character' => 'A &ZeVnLIqe1 B', + 'invalid base64 character between & and base64 character' => 'A &ZeVnLIq1 B', + 'valid base64 character between & and end of string' => 'A &ZeVnLIqe', + 'invalid base64 character between & and end of string' => 'A &ZeVnLIq', + 'valid base64 character using surrogate pair between & and -' => 'A &2GfePQ- B', // 𩸽 in UTF-16BE + 'first 16 bits of base64 character using surrogate pair between & and -' => 'A &2Gc- B', // first 16 bits of 𩸽 in UTF-16BE + 'valid base64 character using surrogate pair between & and
non-base64 character' => 'A &2GfePQ B', + 'first 16 bits of base64 character using surrogate pair between & and non-base64 character' => 'A &2Gc B', + 'valid base64 character using surrogate pair between & and base64 character' => 'A &2GfePQ1 B', + 'first 16 bits of base64 character using surrogate pair between & and base64 character' => 'A &2Gc1 B', + 'valid base64 character using surrogate pair between & and end of string' => 'A &2GfePQ', + 'first 16 bits of base64 character using surrogate pair between & and end of string' => 'A &2Gc', + 'invalid base64 character using surrogate pair in reverse order between & and -' => 'A &3j3YZw- B', // 𩸽 in reverse order in UTF-16BE + 'last 16 bits of base64 character using surrogate pair in reverse order between & and -' => 'A &3j0- B', // last 16 bits of 𩸽 in UTF-16BE + 'invalid base64 character using surrogate pair in reverse order between & and non-base64 character' => 'A &3j3YZw B', + 'last 16 bits of base64 character using surrogate pair in reverse order between & and non-base64 character' => 'A &3j0 B', + 'invalid base64 character using surrogate pair in reverse order between & and base64 character' => 'A &3j3YZw1 B', + 'last 16 bits of base64 character using surrogate pair in reverse order between & and base64 character' => 'A &3j01 B', + 'invalid base64 character using surrogate pair in reverse order between & and end of string' => 'A &3j3YZw', + 'last 16 bits of base64 character using surrogate pair in reverse order between & and end of string' => 'A &3j0' +]; + +foreach ($testcases as $title => $case) { + echo $title . 
PHP_EOL; + var_dump(mb_detect_encoding($case, 'UTF-8, UTF7-IMAP', true)); + var_dump(mb_detect_encoding($case, 'UTF-8, UTF7-IMAP', false)); + var_dump(mb_detect_encoding($case, 'UTF7-IMAP', true)); + var_dump(mb_detect_encoding($case, 'UTF7-IMAP', false)); + var_dump(mb_check_encoding($case, 'UTF7-IMAP')); + var_dump(addcslashes(mb_convert_encoding($case, 'UTF-8', 'UTF7-IMAP'), "\0..\37\177")); + var_dump(mb_get_info('illegal_chars')); + echo PHP_EOL; +} + +?> +--EXPECT-- +non-base64 character after & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(4) "A ?B" +int(1) + +non-base64 character after - +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(5) "A - B" +int(1) + +base64 character before & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A 1?B" +int(2) + +base64 character before - +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(6) "A 1- B" +int(2) + +base64 character after & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(4) "A ?B" +int(3) + +base64 character after - +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(6) "A -1 B" +int(3) + +base64 character before and after & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A 1?B" +int(4) + +base64 character before and after - +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(7) "A 1-1 B" +int(4) + +string ends with & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(3) "A ?" 
+int(5) + +string ends with - +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(3) "A -" +int(5) + +& and - +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(5) "A & B" +int(5) + +- and & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A -?B" +int(6) + +valid direct encoding character ~ +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(5) "A ~ B" +int(6) + +invalid direct encoding character ESC +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ? B" +int(7) + +valid direct encoding character ~ after & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ? B" +int(8) + +invalid direct encoding character ESC after & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ? B" +int(9) + +valid base64 character between & and - +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(13) "A 日本語 B" +int(9) + +invalid base64 character between & and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(11) "A 日本? 
B" +int(10) + +valid base64 character between & and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(13) "A 日本語?B" +int(11) + +invalid base64 character between & and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(10) "A 日本?B" +int(12) + +valid base64 character between & and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(13) "A 日本語?B" +int(13) + +invalid base64 character between & and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(13) "A 日本誵?B" +int(14) + +valid base64 character between & and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(12) "A 日本語?" +int(15) + +invalid base64 character between & and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(9) "A 日本?" +int(16) + +valid base64 character using surrogate pair between & and - +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(8) "A 𩸽 B" +int(16) + +first 16 bits of base64 character using surrogate pair between & and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ? 
B" +int(17) + +valid base64 character using surrogate pair between & and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(8) "A 𩸽?B" +int(18) + +first 16 bits of base64 character using surrogate pair between & and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(4) "A ?B" +int(19) + +valid base64 character using surrogate pair between & and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(8) "A 𩸽?B" +int(20) + +first 16 bits of base64 character using surrogate pair between & and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(4) "A ?B" +int(21) + +valid base64 character using surrogate pair between & and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(7) "A 𩸽?" +int(22) + +first 16 bits of base64 character using surrogate pair between & and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(4) "A ??" +int(24) + +invalid base64 character using surrogate pair in reverse order between & and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(6) "A ?? B" +int(26) + +last 16 bits of base64 character using surrogate pair in reverse order between & and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ? 
B" +int(27) + +invalid base64 character using surrogate pair in reverse order between & and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ??B" +int(29) + +last 16 bits of base64 character using surrogate pair in reverse order between & and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ??B" +int(31) + +invalid base64 character using surrogate pair in reverse order between & and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ??B" +int(33) + +last 16 bits of base64 character using surrogate pair in reverse order between & and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ??B" +int(35) + +invalid base64 character using surrogate pair in reverse order between & and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ???" +int(38) + +last 16 bits of base64 character using surrogate pair in reverse order between & and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(4) "A ??" 
+int(40) diff --git a/ext/mbstring/tests/gh10648.phpt b/ext/mbstring/tests/gh10648.phpt new file mode 100644 index 00000000000..9f0b4b4db15 --- /dev/null +++ b/ext/mbstring/tests/gh10648.phpt @@ -0,0 +1,155 @@ +--TEST-- +GH-10648 (mb_check_encoding() returns true for incorrect but interpretable ISO-2022-JP byte sequences) +--EXTENSIONS-- +mbstring +--FILE-- +<?php +$testcases = [ + 'ISO-2022-JP bytes' => '1b244224221b2842', // 'あ' in ISO-2022-JP + 'ISO-2022-JP bytes without escape sequence' => '1b24422422', // 'あ' in ISO-2022-JP, but without the trailing ESC ( B that returns to ASCII + 'JIS X 0201 7bit kana with escape sequence' => '1b2849311b2842', // 'ア' in JIS + 'JIS X 0201 7bit kana with SO/SI' => '0e310f', // 'ア' in JIS + 'JIS X 0201 8bit kana' => 'b1', // 'ア' in JIS + 'JIS X 0201 7bit kana with SO and ESC' => '0e311b2842', // 'ア' in JIS + 'JIS X 0201 7bit kana with ESC and SI' => '1b2849310f', // 'ア' in JIS + 'JIS X 0208 character' => '1b244242641b2842', // '鯛' in JIS and ISO-2022-JP, included in JIS X 0208 + 'JIS X 0212 character' => '1b2428446a591b2842', // '鮋' in JIS, included in JIS X 0212 + 'JIS X 0213 character' => '1b2428507d4c1b2842', // '𩸽' in ISO-2022-JP-2004, included in JIS X 0213 + 'JIS C 6220-1969 ESC ( H' => '1b284a1b2848', // an escape sequence transitioning to ASCII + 'SO/SI when not in ASCII mode' => '1b284a0e0f1b2842', // SO and SI issued after ESC ( J (JIS X 0201 Roman mode), then ESC ( B back to ASCII +]; + +foreach ($testcases as $title => $case) { + echo $title . PHP_EOL; + echo 'JIS:' . PHP_EOL; + var_dump(mb_check_encoding(hex2bin($case), 'JIS')); + echo mb_convert_encoding(hex2bin($case), 'UTF-8', 'JIS'). PHP_EOL; + var_dump(mb_get_info('illegal_chars')); + echo 'ISO-2022-JP:' . PHP_EOL; + var_dump(mb_check_encoding(hex2bin($case), 'ISO-2022-JP')); + echo mb_convert_encoding(hex2bin($case), 'UTF-8', 'ISO-2022-JP'). 
PHP_EOL; + var_dump(mb_get_info('illegal_chars')); + echo PHP_EOL; +} +?> +--EXPECT-- +ISO-2022-JP bytes +JIS: +bool(true) +あ +int(0) +ISO-2022-JP: +bool(true) +あ +int(0) + +ISO-2022-JP bytes without escape sequence +JIS: +bool(false) +あ +int(0) +ISO-2022-JP: +bool(false) +あ +int(0) + +JIS X 0201 7bit kana with escape sequence +JIS: +bool(true) +ア +int(0) +ISO-2022-JP: +bool(false) +ア +int(0) + +JIS X 0201 7bit kana with SO/SI +JIS: +bool(true) +ア +int(0) +ISO-2022-JP: +bool(false) +ア +int(0) + +JIS X 0201 8bit kana +JIS: +bool(true) +ア +int(0) +ISO-2022-JP: +bool(false) +ア +int(0) + +JIS X 0201 7bit kana with SO and ESC +JIS: +bool(false) +ア +int(0) +ISO-2022-JP: +bool(false) +ア +int(0) + +JIS X 0201 7bit kana with ESC and SI +JIS: +bool(false) +ア +int(0) +ISO-2022-JP: +bool(false) +ア +int(0) + +JIS X 0208 character +JIS: +bool(true) +鯛 +int(0) +ISO-2022-JP: +bool(true) +鯛 +int(0) + +JIS X 0212 character +JIS: +bool(true) +鮋 +int(0) +ISO-2022-JP: +bool(false) +鮋 +int(0) + +JIS X 0213 character +JIS: +bool(false) +?$(P}L +int(1) +ISO-2022-JP: +bool(false) +?$(P}L +int(2) + +JIS C 6220-1969 ESC ( H +JIS: +bool(true) + +int(2) +ISO-2022-JP: +bool(false) + +int(2) + +SO/SI when not in ASCII mode +JIS: +bool(false) + +int(2) +ISO-2022-JP: +bool(false) + +int(2) diff --git a/ext/mbstring/tests/iso2022jp_encoding.phpt b/ext/mbstring/tests/iso2022jp_encoding.phpt index 634f0976994..5da1899c855 100644 --- a/ext/mbstring/tests/iso2022jp_encoding.phpt +++ b/ext/mbstring/tests/iso2022jp_encoding.phpt @@ -50,11 +50,6 @@ function testValid($from, $to, $encoding, $bothWays = true) { /* ESC ( B at the beginning is redundant, since ASCII mode is the default */ if (substr($from, 0, 3) == "\x1B(B") $from = substr($from, 3, strlen($from) - 3); - /* If the string switches to a different charset, it should switch back to - * ASCII at the end */ - if (strpos($from, "\x1B\$B") !== false || strpos($from, "\x1B(J") !== false) - $from .= "\x1B(B"; - convertValidString($to, $from, 
'UTF-16BE', $encoding, false); } } @@ -66,11 +61,11 @@ function testInvalid($from, $to, $encoding) { for ($i = 0; $i < 0x80; $i++) { if ($i == 0xE || $i == 0xF || $i == 0x1B) continue; - testValid(chr($i), "\x00" . chr($i), 'JIS'); - testValid("\x0F" . chr($i), "\x00" . chr($i), 'JIS'); /* 0xF is 'Shift Out' code */ - testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'JIS'); - testValid(chr($i), "\x00" . chr($i), 'ISO-2022-JP'); - testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'ISO-2022-JP'); + testValid(chr($i), "\x00" . chr($i), 'JIS'); + convertValidString("\x0F" . chr($i), "\x00" . chr($i), 'JIS', 'UTF-16BE', false); /* 0xF is 'Shift In' code */ + testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'JIS'); + testValid(chr($i), "\x00" . chr($i), 'ISO-2022-JP'); + testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'ISO-2022-JP'); } for ($i = 0x80; $i < 256; $i++) { @@ -92,27 +87,27 @@ echo "ASCII support OK\n"; foreach ($jisx0201Chars as $jisx0201 => $utf16BE) { if (ord($jisx0201) >= 128) { $kana = chr(ord($jisx0201) - 128); - testValid("\x1B(I" . $kana, $utf16BE, 'JIS', false); - testValid("\x0E" . $kana, $utf16BE, 'JIS', false); /* 0xE is 'Shift In' code */ + testValid("\x1B(I" . $kana . "\x1B(B", $utf16BE, 'JIS', false); + testValid("\x0E" . $kana . "\x0F", $utf16BE, 'JIS', false); /* 0xE is 'Shift Out' code */ testValid($jisx0201, $utf16BE, 'JIS', false); } else { - testValid("\x1B(J" . $jisx0201, $utf16BE, 'JIS', $utf16BE > "\x00\x80"); + testValid("\x1B(J" . $jisx0201 . "\x1B(B", $utf16BE, 'JIS', $utf16BE > "\x00\x80"); } } for ($i = 0x80; $i < 256; $i++) { if ($i >= 0xA1 && $i <= 0xDF) continue; - testInvalid("\x1B(I" . chr($i), "\x00%", 'JIS'); - testInvalid("\x1B(J" . chr($i), "\x00%", 'JIS'); + testInvalid("\x1B(I" . chr($i) . "\x1B(B", "\x00%", 'JIS'); + testInvalid("\x1B(J" . chr($i) . "\x1B(B", "\x00%", 'JIS'); } echo "JIS X 0201 support OK\n"; /* All valid JISX0208 characters */ foreach ($jisx0208Chars as $jisx0208 => $utf16BE) { - testValid("\x1B\$B" . 
$jisx0208, $utf16BE, 'JIS'); - testValid("\x1B\$B" . $jisx0208, $utf16BE, 'ISO-2022-JP'); + testValid("\x1B\$B" . $jisx0208 . "\x1B(B", $utf16BE, 'JIS'); + testValid("\x1B\$B" . $jisx0208 . "\x1B(B", $utf16BE, 'ISO-2022-JP'); } /* All invalid 2-byte JISX0208 characters */ @@ -120,8 +115,8 @@ for ($i = 0x21; $i <= 0x7E; $i++) { for ($j = 0; $j < 256; $j++) { $testString = chr($i) . chr($j); if (!isset($jisx0208Chars[$testString])) { - testInvalid("\x1B\$B" . $testString, "\x00%", 'JIS'); - testInvalid("\x1B\$B" . $testString, "\x00%", 'ISO-2022-JP'); + testInvalid("\x1B\$B" . $testString . "\x1B(B", "\x00%", 'JIS'); + testInvalid("\x1B\$B" . $testString . "\x1B(B", "\x00%", 'ISO-2022-JP'); } } } @@ -142,7 +137,7 @@ echo "JIS X 0208 support OK\n"; /* All valid JISX0212 characters */ foreach ($jisx0212Chars as $jisx0212 => $utf16BE) { - testValid("\x1B\$(D" . $jisx0212, $utf16BE, 'JIS', false); + testValid("\x1B\$(D" . $jisx0212 . "\x1B(B", $utf16BE, 'JIS', false); } /* All invalid 2-byte JISX0212 characters */ @@ -150,14 +145,14 @@ for ($i = 0x21; $i <= 0x7E; $i++) { for ($j = 0; $j < 256; $j++) { $testString = chr($i) . chr($j); if (!isset($jisx0212Chars[$testString])) { - testInvalid("\x1B\$(D" . $testString, "\x00%", 'JIS'); + testInvalid("\x1B\$(D" . $testString . "\x1B(B", "\x00%", 'JIS'); } } } /* Try truncated JISX0212 characters */ for ($i = 0x21; $i <= 0x7E; $i++) { - testInvalid("\x1B\$(D" . chr($i), "\x00%", 'JIS'); + testInvalid("\x1B\$(D" . chr($i) . 
"\x1B(B", "\x00%\x00%", 'JIS'); } testValidString("\x00\xA1", "\x1B\$(D\x22\x42\x1B(B", "UTF-16BE", "JIS", false); @@ -167,29 +162,36 @@ convertInvalidString("\x00\xA1", "%", "UTF-16BE", "ISO-2022-JP", false); echo "JIS X 0212 support OK\n"; /* All possible escape sequences */ -$validEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B\$(@" => true, "\x1B\$(B" => true, "\x1B\$(D" => true, "\x1B(B" => true, "\x1B(H" => true, "\x1B(J" => true, "\x1B(I" => true]; +$validJisEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B\$(@" => true, "\x1B\$(B" => true, "\x1B\$(D" => true, "\x1B(B" => true, "\x1B(H" => true, "\x1B(J" => true, "\x1B(I" => true]; +$validIso2022jpEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B(B" => true, "\x1B(J" => true]; for ($i = 0; $i <= 0xFF; $i++) { for ($j = 0; $j <= 0xFF; $j++) { $escapeSequence = "\x1B" . chr($i) . chr($j); if ($escapeSequence === "\x1B\$(") continue; - if (isset($validEscapes[$escapeSequence])) { - testValid($escapeSequence, "", 'JIS', false); - testValid($escapeSequence, "", 'ISO-2022-JP', false); + if (isset($validJisEscapes[$escapeSequence])) { + testValid($escapeSequence . "\x1B(B", "", 'JIS', false); } else { - identifyInvalidString($escapeSequence, 'JIS'); - identifyInvalidString($escapeSequence, 'ISO-2022-JP'); + identifyInvalidString($escapeSequence . "\x1B(B", 'JIS'); + } + if (isset($validIso2022jpEscapes[$escapeSequence])) { + testValid($escapeSequence . "\x1B(B", "", 'ISO-2022-JP', false); + } else { + identifyInvalidString($escapeSequence . "\x1B(B", 'ISO-2022-JP'); } } } for ($i = 0; $i <= 0xFF; $i++) { $escapeSequence = "\x1B\$(" . chr($i); - if (isset($validEscapes[$escapeSequence])) { - testValid($escapeSequence, "", 'JIS', false); - testValid($escapeSequence, "", 'ISO-2022-JP', false); + if (isset($validJisEscapes[$escapeSequence])) { + testValid($escapeSequence . 
"\x1B(B", "", 'JIS', false); } else { - identifyInvalidString($escapeSequence, 'JIS'); - identifyInvalidString($escapeSequence, 'ISO-2022-JP'); + identifyInvalidString($escapeSequence . "\x1B(B", 'JIS'); + } + if (isset($validIso2022jpEscapes[$escapeSequence])) { + testValid($escapeSequence . "\x1B(B", "", 'ISO-2022-JP', false); + } else { + identifyInvalidString($escapeSequence . "\x1B(B", 'ISO-2022-JP'); } } /* Also try a bare ESC */ diff --git a/ext/mbstring/tests/utf_encodings.phpt b/ext/mbstring/tests/utf_encodings.phpt index 2f050c657c2..23166142088 100644 --- a/ext/mbstring/tests/utf_encodings.phpt +++ b/ext/mbstring/tests/utf_encodings.phpt @@ -1011,17 +1011,8 @@ testValidString('+' . encode('AB', 'ASCII') . '-+' . encode('CD', 'ASCII') . '-' // (Just trying to be exhaustive here) testValidString('+' . encode('AB', 'ASCII') . '-!+' . encode('CD', 'ASCII') . '-', "\x00A\x00B\x00!\x00C\x00D", 'UTF-7', 'UTF-16BE', false); -// + section terminated by a non-Base64 ASCII character which is NOT - -for ($i = 0; $i < 128; $i++) { - if ($i >= ord('A') && $i <= ord('Z')) - continue; - if ($i >= ord('a') && $i <= ord('z')) - continue; - if ($i >= ord('0') && $i <= ord('9')) - continue; - if ($i == ord('+') || $i == ord('/') || $i == ord('-') || $i == ord('\\') || $i == ord('~')) - continue; - $char = chr($i); +// + section terminated by a non-Base64 direct character which is NOT - +foreach (str_split(" \t\r\n'(),.:?!\"#$%&*;<=>@[]^_`{|}\x00") as $char) { testValidString('+' . encode("\x12\x34", 'UTF-16BE') . $char, "\x00\x00\x12\x34\x00\x00\x00" . $char, 'UTF-7', 'UTF-32BE', false); }