diff --git a/UPGRADING b/UPGRADING index d96509c485b..6a1ad31f21b 100644 --- a/UPGRADING +++ b/UPGRADING @@ -489,6 +489,16 @@ PHP 8.1 UPGRADE NOTES . All GMP function now accept octal string with the leading octal prefix ("0o"/"0O") RFC: https://wiki.php.net/rfc/explicit_octal_notation +- MBString + . mb_check_encoding() now checks input encoding more strictly. + . mb_detect_encoding() now checks input encoding more strictly + when strict detection is enabled. + . mb_convert_encoding() checks the input encoding more strictly + if multiple encodings are passed to from_encoding + and the mbstring.strict_detection INI directive is set to 1. + This change only affects the encoding selection, + not the result of the conversion. + - PDO ODBC: . PDO::getAttributes() with PDO::ATTR_SERVER_INFO and PDO::ATTR_SERVER_VERSION now return values instead of throwing PDOException. diff --git a/ext/mbstring/libmbfl/filters/mbfilter_7bit.c b/ext/mbstring/libmbfl/filters/mbfilter_7bit.c index 2885ca1fbd6..4dd513430c3 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_7bit.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_7bit.c @@ -39,6 +39,7 @@ const mbfl_encoding mbfl_encoding_7bit = { NULL, MBFL_ENCTYPE_SBCS, NULL, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/filters/mbfilter_base64.c b/ext/mbstring/libmbfl/filters/mbfilter_base64.c index 04161c94ab3..427211c8f8c 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_base64.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_base64.c @@ -39,6 +39,7 @@ const mbfl_encoding mbfl_encoding_base64 = { NULL, MBFL_ENCTYPE_GL_UNSAFE, NULL, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/filters/mbfilter_big5.c b/ext/mbstring/libmbfl/filters/mbfilter_big5.c index a3c23b0ec89..8e59343de91 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_big5.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_big5.c @@ -63,7 +63,8 @@ const mbfl_encoding mbfl_encoding_big5 = { mblen_table_big5, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_big5_wchar, - &vtbl_wchar_big5 + 
&vtbl_wchar_big5, + NULL }; const mbfl_encoding mbfl_encoding_cp950 = { @@ -74,7 +75,8 @@ const mbfl_encoding mbfl_encoding_cp950 = { mblen_table_big5, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_cp950_wchar, - &vtbl_wchar_cp950 + &vtbl_wchar_cp950, + NULL }; const struct mbfl_convert_vtbl vtbl_big5_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c b/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c index ba324eb3016..557ee7f446f 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp5022x.c @@ -54,7 +54,8 @@ const mbfl_encoding mbfl_encoding_cp50220 = { NULL, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_cp50220_wchar, - &vtbl_wchar_cp50220 + &vtbl_wchar_cp50220, + NULL }; const mbfl_encoding mbfl_encoding_cp50221 = { @@ -65,7 +66,8 @@ const mbfl_encoding mbfl_encoding_cp50221 = { NULL, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_cp50221_wchar, - &vtbl_wchar_cp50221 + &vtbl_wchar_cp50221, + NULL }; const mbfl_encoding mbfl_encoding_cp50222 = { @@ -76,7 +78,8 @@ const mbfl_encoding mbfl_encoding_cp50222 = { NULL, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_cp50222_wchar, - &vtbl_wchar_cp50222 + &vtbl_wchar_cp50222, + NULL }; const struct mbfl_convert_vtbl vtbl_cp50220_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c b/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c index 475e01e970d..1741601d23f 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp51932.c @@ -65,7 +65,8 @@ const mbfl_encoding mbfl_encoding_cp51932 = { mblen_table_eucjp, 0, &vtbl_cp51932_wchar, - &vtbl_wchar_cp51932 + &vtbl_wchar_cp51932, + NULL }; const struct mbfl_convert_vtbl vtbl_cp51932_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp932.c b/ext/mbstring/libmbfl/filters/mbfilter_cp932.c index 9130cea1163..54f93f91fe2 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp932.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp932.c @@ -95,7 +95,8 @@ const mbfl_encoding mbfl_encoding_cp932 = { 
mblen_table_sjis, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_cp932_wchar, - &vtbl_wchar_cp932 + &vtbl_wchar_cp932, + NULL }; const struct mbfl_convert_vtbl vtbl_cp932_wchar = { @@ -126,7 +127,8 @@ const mbfl_encoding mbfl_encoding_sjiswin = { mblen_table_sjis, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_sjiswin_wchar, - &vtbl_wchar_sjiswin + &vtbl_wchar_sjiswin, + NULL }; const struct mbfl_convert_vtbl vtbl_sjiswin_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_cp936.c b/ext/mbstring/libmbfl/filters/mbfilter_cp936.c index e51514aad72..cd689513e28 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_cp936.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_cp936.c @@ -63,7 +63,8 @@ const mbfl_encoding mbfl_encoding_cp936 = { mblen_table_cp936, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_cp936_wchar, - &vtbl_wchar_cp936 + &vtbl_wchar_cp936, + NULL }; const struct mbfl_convert_vtbl vtbl_cp936_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_cn.c b/ext/mbstring/libmbfl/filters/mbfilter_euc_cn.c index a1e51bbd814..52b2ee863ba 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_euc_cn.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_euc_cn.c @@ -63,7 +63,8 @@ const mbfl_encoding mbfl_encoding_euc_cn = { mblen_table_euccn, 0, &vtbl_euccn_wchar, - &vtbl_wchar_euccn + &vtbl_wchar_euccn, + NULL }; const struct mbfl_convert_vtbl vtbl_euccn_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c b/ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c index b113537b56a..0e1b8bec2e9 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_euc_jp.c @@ -64,7 +64,8 @@ const mbfl_encoding mbfl_encoding_euc_jp = { mblen_table_eucjp, 0, &vtbl_eucjp_wchar, - &vtbl_wchar_eucjp + &vtbl_wchar_eucjp, + NULL }; const struct mbfl_convert_vtbl vtbl_eucjp_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_2004.c b/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_2004.c index 0709a9f12d0..ef2f8154088 100644 --- 
a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_2004.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_2004.c @@ -43,7 +43,8 @@ const mbfl_encoding mbfl_encoding_eucjp2004 = { mblen_table_eucjp, 0, &vtbl_eucjp2004_wchar, - &vtbl_wchar_eucjp2004 + &vtbl_wchar_eucjp2004, + NULL }; const struct mbfl_convert_vtbl vtbl_eucjp2004_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_win.c b/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_win.c index b7d0705391e..76650b5dd53 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_win.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_euc_jp_win.c @@ -65,7 +65,8 @@ const mbfl_encoding mbfl_encoding_eucjp_win = { mblen_table_eucjp, 0, &vtbl_eucjpwin_wchar, - &vtbl_wchar_eucjpwin + &vtbl_wchar_eucjpwin, + NULL }; const struct mbfl_convert_vtbl vtbl_eucjpwin_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_kr.c b/ext/mbstring/libmbfl/filters/mbfilter_euc_kr.c index 6bc52b744b7..9a98fc21d43 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_euc_kr.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_euc_kr.c @@ -62,7 +62,8 @@ const mbfl_encoding mbfl_encoding_euc_kr = { mblen_table_euckr, 0, &vtbl_euckr_wchar, - &vtbl_wchar_euckr + &vtbl_wchar_euckr, + NULL }; const struct mbfl_convert_vtbl vtbl_euckr_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_euc_tw.c b/ext/mbstring/libmbfl/filters/mbfilter_euc_tw.c index 1cc740dd62e..6075514b3c7 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_euc_tw.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_euc_tw.c @@ -64,7 +64,8 @@ const mbfl_encoding mbfl_encoding_euc_tw = { mblen_table_euctw, 0, &vtbl_euctw_wchar, - &vtbl_wchar_euctw + &vtbl_wchar_euctw, + NULL }; const struct mbfl_convert_vtbl vtbl_euctw_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_gb18030.c b/ext/mbstring/libmbfl/filters/mbfilter_gb18030.c index 7889ad348af..c68dc0b557f 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_gb18030.c +++ 
b/ext/mbstring/libmbfl/filters/mbfilter_gb18030.c @@ -45,7 +45,8 @@ const mbfl_encoding mbfl_encoding_gb18030 = { NULL, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_gb18030_wchar, - &vtbl_wchar_gb18030 + &vtbl_wchar_gb18030, + NULL }; const struct mbfl_convert_vtbl vtbl_gb18030_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_htmlent.c b/ext/mbstring/libmbfl/filters/mbfilter_htmlent.c index 3a7e879ceaa..eef10fef108 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_htmlent.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_htmlent.c @@ -61,7 +61,8 @@ const mbfl_encoding mbfl_encoding_html_ent = { NULL, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_html_wchar, - &vtbl_wchar_html + &vtbl_wchar_html, + NULL }; const struct mbfl_convert_vtbl vtbl_wchar_html = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_hz.c b/ext/mbstring/libmbfl/filters/mbfilter_hz.c index a9249788aa6..2eb16bd0161 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_hz.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_hz.c @@ -43,7 +43,8 @@ const mbfl_encoding mbfl_encoding_hz = { NULL, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_hz_wchar, - &vtbl_wchar_hz + &vtbl_wchar_hz, + NULL }; const struct mbfl_convert_vtbl vtbl_hz_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_iso2022_jp_ms.c b/ext/mbstring/libmbfl/filters/mbfilter_iso2022_jp_ms.c index 87ffc37bc3c..c493c8a6253 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_iso2022_jp_ms.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_iso2022_jp_ms.c @@ -46,7 +46,8 @@ const mbfl_encoding mbfl_encoding_2022jpms = { NULL, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_2022jpms_wchar, - &vtbl_wchar_2022jpms + &vtbl_wchar_2022jpms, + NULL }; const struct mbfl_convert_vtbl vtbl_2022jpms_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c b/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c index e01931e397a..8c7b854f46c 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_iso2022_kr.c @@ -47,7 +47,8 @@ const mbfl_encoding 
mbfl_encoding_2022kr = { NULL, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_2022kr_wchar, - &vtbl_wchar_2022kr + &vtbl_wchar_2022kr, + NULL }; const struct mbfl_convert_vtbl vtbl_wchar_2022kr = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_2004.c b/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_2004.c index c54941f23dc..547177f3dee 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_2004.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_2004.c @@ -43,7 +43,8 @@ const mbfl_encoding mbfl_encoding_2022jp_2004 = { NULL, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_2022jp_2004_wchar, - &vtbl_wchar_2022jp_2004 + &vtbl_wchar_2022jp_2004, + NULL }; const struct mbfl_convert_vtbl vtbl_2022jp_2004_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_mobile.c b/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_mobile.c index 8308e0e901b..c824a534655 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_mobile.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_iso2022jp_mobile.c @@ -48,7 +48,8 @@ const mbfl_encoding mbfl_encoding_2022jp_kddi = { NULL, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_2022jp_kddi_wchar, - &vtbl_wchar_2022jp_kddi + &vtbl_wchar_2022jp_kddi, + NULL }; const struct mbfl_convert_vtbl vtbl_2022jp_kddi_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_jis.c b/ext/mbstring/libmbfl/filters/mbfilter_jis.c index d63d972042d..a0a15a23adc 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_jis.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_jis.c @@ -34,6 +34,8 @@ #include "unicode_table_jis.h" static int mbfl_filt_conv_jis_wchar_flush(mbfl_convert_filter *filter); +static bool mb_check_iso2022jp(unsigned char *in, size_t in_len); +static bool mb_check_jis(unsigned char *in, size_t in_len); const mbfl_encoding mbfl_encoding_jis = { mbfl_no_encoding_jis, @@ -43,7 +45,8 @@ const mbfl_encoding mbfl_encoding_jis = { NULL, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_jis_wchar, - &vtbl_wchar_jis + &vtbl_wchar_jis, + mb_check_jis }; const mbfl_encoding 
mbfl_encoding_2022jp = { @@ -54,7 +57,8 @@ const mbfl_encoding mbfl_encoding_2022jp = { NULL, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_2022jp_wchar, - &vtbl_wchar_2022jp + &vtbl_wchar_2022jp, + mb_check_iso2022jp }; const struct mbfl_convert_vtbl vtbl_jis_wchar = { @@ -463,3 +467,166 @@ mbfl_filt_conv_any_jis_flush(mbfl_convert_filter *filter) return 0; } + +#define ASCII 0 +#define JISX_0201_LATIN 1 +#define JISX_0201_KANA 2 +#define JISX_0208 3 +#define JISX_0212 4 +#define JISX_0201_KANA_SO 5 + +static bool mb_check_jis(unsigned char *in, size_t in_len) +{ + unsigned char *p = in, *e = p + in_len; + unsigned int state = ASCII; + + while (p < e) { + unsigned char c = *p++; + if (c == 0x1B) { + /* ESC seen; this is an escape sequence */ + if (state == JISX_0201_KANA_SO) { + return false; + } + if ((e - p) < 2) { + return false; + } + unsigned char c2 = *p++; + if (c2 == '$') { + unsigned char c3 = *p++; + if (c3 == '@' || c3 == 'B') { + state = JISX_0208; + } else if (c3 == '(') { + if (p == e) { + return false; + } + unsigned char c4 = *p++; + if (c4 == '@' || c4 == 'B') { + state = JISX_0208; + } else if (c4 == 'D') { + state = JISX_0212; + } else { + return false; + } + } else { + return false; + } + } else if (c2 == '(') { + unsigned char c3 = *p++; + /* ESC ( H is treated as a sequence transitioning to ASCII for historical reasons. + * see https://github.com/php/php-src/pull/10828#issuecomment-1478342432. 
*/ + if (c3 == 'B' || c3 == 'H') { + state = ASCII; + } else if (c3 == 'J') { + state = JISX_0201_LATIN; + } else if (c3 == 'I') { + state = JISX_0201_KANA; + } else { + return false; + } + } else { + return false; + } + } else if (c == 0xE) { + /* "Kana In" marker */ + if (state != ASCII) { + return false; + } + state = JISX_0201_KANA_SO; + } else if (c == 0xF) { + /* "Kana Out" marker */ + if (state != JISX_0201_KANA_SO) { + return false; + } + state = ASCII; + } else if ((state == JISX_0208 || state == JISX_0212) && (c > 0x20 && c < 0x7F)) { + if (p == e) { + return false; + } + unsigned char c2 = *p++; + if (c2 > 0x20 && c2 < 0x7F) { + unsigned int s = (c - 0x21)*94 + c2 - 0x21; + if (state == JISX_0208) { + if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) { + continue; + } + } else { + if (s < jisx0212_ucs_table_size && jisx0212_ucs_table[s]) { + continue; + } + } + return false; + } else { + return false; + } + } else if (c < 0x80) { + continue; + } else if (c >= 0xA1 && c <= 0xDF) { + /* GR-invoked Kana */ + continue; + } else { + return false; + } + } + + return state == ASCII; +} + + +static bool mb_check_iso2022jp(unsigned char *in, size_t in_len) +{ + unsigned char *p = in, *e = p + in_len; + unsigned int state = ASCII; + + while (p < e) { + unsigned char c = *p++; + if (c == 0x1B) { + /* ESC seen; this is an escape sequence */ + if ((e - p) < 2) { + return false; + } + unsigned char c2 = *p++; + if (c2 == '$') { + unsigned char c3 = *p++; + if (c3 == '@' || c3 == 'B') { + state = JISX_0208; + } else { + return false; + } + } else if (c2 == '(') { + unsigned char c3 = *p++; + if (c3 == 'B') { + state = ASCII; + } else if (c3 == 'J') { + state = JISX_0201_LATIN; + } else { + return false; + } + } else { + return false; + } + } else if (c == 0xE || c == 0xF) { + /* "Kana In" or "Kana Out" marker; ISO-2022-JP is not accepted. 
*/ + return false; + } else if (state == JISX_0208 && (c > 0x20 && c < 0x7F)) { + if (p == e) { + return false; + } + unsigned char c2 = *p++; + if (c2 > 0x20 && c2 < 0x7F) { + unsigned int s = (c - 0x21)*94 + c2 - 0x21; + if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) { + continue; + } + return false; + } else { + return false; + } + } else if (c < 0x80) { + continue; + } else { + return false; + } + } + + return state == ASCII; +} diff --git a/ext/mbstring/libmbfl/filters/mbfilter_qprint.c b/ext/mbstring/libmbfl/filters/mbfilter_qprint.c index 2c9e8234872..521d20adf29 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_qprint.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_qprint.c @@ -41,6 +41,7 @@ const mbfl_encoding mbfl_encoding_qprint = { NULL, MBFL_ENCTYPE_GL_UNSAFE, NULL, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c b/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c index e9b2b040865..8a0b920bff8 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_singlebyte.c @@ -78,7 +78,8 @@ static int mbfl_conv_reverselookup_table(int c, mbfl_convert_filter *filter, int NULL, \ MBFL_ENCTYPE_SBCS, \ &vtbl_##id##_wchar, \ - &vtbl_wchar_##id \ + &vtbl_wchar_##id, \ + NULL \ } /* For single-byte encodings which use a conversion table */ diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c index 3406cc0cf4b..c797b6d794b 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis.c @@ -67,7 +67,8 @@ const mbfl_encoding mbfl_encoding_sjis = { mblen_table_sjis, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_sjis_wchar, - &vtbl_wchar_sjis + &vtbl_wchar_sjis, + NULL }; const struct mbfl_convert_vtbl vtbl_sjis_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c index 6a6fe92f497..f9964275b27 100644 --- 
a/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis_2004.c @@ -54,7 +54,8 @@ const mbfl_encoding mbfl_encoding_sjis2004 = { mblen_table_sjis_mobile, /* Leading byte values used for SJIS-2004 are the same as mobile SJIS variants */ MBFL_ENCTYPE_GL_UNSAFE, &vtbl_sjis2004_wchar, - &vtbl_wchar_sjis2004 + &vtbl_wchar_sjis2004, + NULL }; const struct mbfl_convert_vtbl vtbl_sjis2004_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c index 3f85cb5eecb..e2a08351b8e 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mac.c @@ -67,7 +67,8 @@ const mbfl_encoding mbfl_encoding_sjis_mac = { mblen_table_sjismac, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_sjis_mac_wchar, - &vtbl_wchar_sjis_mac + &vtbl_wchar_sjis_mac, + NULL }; const struct mbfl_convert_vtbl vtbl_sjis_mac_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c index cc04335fcc4..aff91b786d7 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_sjis_mobile.c @@ -70,7 +70,8 @@ const mbfl_encoding mbfl_encoding_sjis_docomo = { mblen_table_sjis_mobile, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_sjis_docomo_wchar, - &vtbl_wchar_sjis_docomo + &vtbl_wchar_sjis_docomo, + NULL }; const mbfl_encoding mbfl_encoding_sjis_kddi = { @@ -81,7 +82,8 @@ const mbfl_encoding mbfl_encoding_sjis_kddi = { mblen_table_sjis_mobile, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_sjis_kddi_wchar, - &vtbl_wchar_sjis_kddi + &vtbl_wchar_sjis_kddi, + NULL }; const mbfl_encoding mbfl_encoding_sjis_sb = { @@ -92,7 +94,8 @@ const mbfl_encoding mbfl_encoding_sjis_sb = { mblen_table_sjis_mobile, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_sjis_sb_wchar, - &vtbl_wchar_sjis_sb + &vtbl_wchar_sjis_sb, + NULL }; const struct mbfl_convert_vtbl vtbl_sjis_docomo_wchar = { diff --git 
a/ext/mbstring/libmbfl/filters/mbfilter_ucs2.c b/ext/mbstring/libmbfl/filters/mbfilter_ucs2.c index 3301146a1e9..fcad8349cae 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_ucs2.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_ucs2.c @@ -49,7 +49,8 @@ const mbfl_encoding mbfl_encoding_ucs2 = { NULL, MBFL_ENCTYPE_WCS2, &vtbl_ucs2_wchar, - &vtbl_wchar_ucs2 + &vtbl_wchar_ucs2, + NULL }; const mbfl_encoding mbfl_encoding_ucs2be = { @@ -60,7 +61,8 @@ const mbfl_encoding mbfl_encoding_ucs2be = { NULL, MBFL_ENCTYPE_WCS2, &vtbl_ucs2be_wchar, - &vtbl_wchar_ucs2be + &vtbl_wchar_ucs2be, + NULL }; const mbfl_encoding mbfl_encoding_ucs2le = { @@ -71,7 +73,8 @@ const mbfl_encoding mbfl_encoding_ucs2le = { NULL, MBFL_ENCTYPE_WCS2, &vtbl_ucs2le_wchar, - &vtbl_wchar_ucs2le + &vtbl_wchar_ucs2le, + NULL }; const struct mbfl_convert_vtbl vtbl_ucs2_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_ucs4.c b/ext/mbstring/libmbfl/filters/mbfilter_ucs4.c index f0ac949ffda..70ee71d52a1 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_ucs4.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_ucs4.c @@ -48,7 +48,8 @@ const mbfl_encoding mbfl_encoding_ucs4 = { NULL, MBFL_ENCTYPE_WCS4, &vtbl_ucs4_wchar, - &vtbl_wchar_ucs4 + &vtbl_wchar_ucs4, + NULL }; const mbfl_encoding mbfl_encoding_ucs4be = { @@ -59,7 +60,8 @@ const mbfl_encoding mbfl_encoding_ucs4be = { NULL, MBFL_ENCTYPE_WCS4, &vtbl_ucs4be_wchar, - &vtbl_wchar_ucs4be + &vtbl_wchar_ucs4be, + NULL }; const mbfl_encoding mbfl_encoding_ucs4le = { @@ -70,7 +72,8 @@ const mbfl_encoding mbfl_encoding_ucs4le = { NULL, MBFL_ENCTYPE_WCS4, &vtbl_ucs4le_wchar, - &vtbl_wchar_ucs4le + &vtbl_wchar_ucs4le, + NULL }; const struct mbfl_convert_vtbl vtbl_ucs4_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_uhc.c b/ext/mbstring/libmbfl/filters/mbfilter_uhc.c index 60fef4d3952..62b010cc3b0 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_uhc.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_uhc.c @@ -67,7 +67,8 @@ const mbfl_encoding 
mbfl_encoding_uhc = { mblen_table_uhc, 0, &vtbl_uhc_wchar, - &vtbl_wchar_uhc + &vtbl_wchar_uhc, + NULL }; const struct mbfl_convert_vtbl vtbl_uhc_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c index eb7d9fa2593..7470dbda551 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf16.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf16.c @@ -42,7 +42,8 @@ const mbfl_encoding mbfl_encoding_utf16 = { NULL, MBFL_ENCTYPE_MWC2, &vtbl_utf16_wchar, - &vtbl_wchar_utf16 + &vtbl_wchar_utf16, + NULL }; const mbfl_encoding mbfl_encoding_utf16be = { @@ -53,7 +54,8 @@ const mbfl_encoding mbfl_encoding_utf16be = { NULL, MBFL_ENCTYPE_MWC2, &vtbl_utf16be_wchar, - &vtbl_wchar_utf16be + &vtbl_wchar_utf16be, + NULL }; const mbfl_encoding mbfl_encoding_utf16le = { @@ -64,7 +66,8 @@ const mbfl_encoding mbfl_encoding_utf16le = { NULL, MBFL_ENCTYPE_MWC2, &vtbl_utf16le_wchar, - &vtbl_wchar_utf16le + &vtbl_wchar_utf16le, + NULL }; const struct mbfl_convert_vtbl vtbl_utf16_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf32.c b/ext/mbstring/libmbfl/filters/mbfilter_utf32.c index 1269e515004..016b4e46a94 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf32.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf32.c @@ -42,7 +42,8 @@ const mbfl_encoding mbfl_encoding_utf32 = { NULL, MBFL_ENCTYPE_WCS4, &vtbl_utf32_wchar, - &vtbl_wchar_utf32 + &vtbl_wchar_utf32, + NULL }; const mbfl_encoding mbfl_encoding_utf32be = { @@ -53,7 +54,8 @@ const mbfl_encoding mbfl_encoding_utf32be = { NULL, MBFL_ENCTYPE_WCS4, &vtbl_utf32be_wchar, - &vtbl_wchar_utf32be + &vtbl_wchar_utf32be, + NULL }; const mbfl_encoding mbfl_encoding_utf32le = { @@ -64,7 +66,8 @@ const mbfl_encoding mbfl_encoding_utf32le = { NULL, MBFL_ENCTYPE_WCS4, &vtbl_utf32le_wchar, - &vtbl_wchar_utf32le + &vtbl_wchar_utf32le, + NULL }; const struct mbfl_convert_vtbl vtbl_utf32_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf7.c 
b/ext/mbstring/libmbfl/filters/mbfilter_utf7.c index bc34a394915..fc2bc36420b 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf7.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf7.c @@ -29,8 +29,10 @@ #include "mbfilter.h" #include "mbfilter_utf7.h" +#include "utf7_helper.h" static int mbfl_filt_conv_utf7_wchar_flush(mbfl_convert_filter *filter); +static bool mb_check_utf7(unsigned char *in, size_t in_len); static const unsigned char mbfl_base64_table[] = { /* 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', */ @@ -55,7 +57,8 @@ const mbfl_encoding mbfl_encoding_utf7 = { NULL, MBFL_ENCTYPE_GL_UNSAFE, &vtbl_utf7_wchar, - &vtbl_wchar_utf7 + &vtbl_wchar_utf7, + mb_check_utf7 }; const struct mbfl_convert_vtbl vtbl_utf7_wchar = { @@ -419,3 +422,169 @@ int mbfl_filt_conv_wchar_utf7_flush(mbfl_convert_filter *filter) return 0; } + +static bool is_optional_direct(unsigned char c) +{ + /* Characters that are allowed to be encoded by Base64 or directly encoded */ + return c == '!' || c == '"' || c == '#' || c == '$' || c == '%' || c == '&' || c == '*' || c == ';' || c == '<' || + c == '=' || c == '>' || c == '@' || c == '[' || c == ']' || c == '^' || c == '_' || c == '`' || c == '{' || + c == '|' || c == '}'; +} + +static bool can_end_base64(uint32_t c) +{ + return c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\'' || c == '(' || c == ')' || c == ',' || c == '.' 
|| c == ':' || c == '?'; +} + +static unsigned char decode_base64(unsigned char c) +{ + if (c >= 'A' && c <= 'Z') { + return c - 65; + } else if (c >= 'a' && c <= 'z') { + return c - 71; + } else if (c >= '0' && c <= '9') { + return c + 4; + } else if (c == '+') { + return 62; + } else if (c == '/') { + return 63; + } else if (c == '-') { + return DASH; + } else if (can_end_base64(c) || is_optional_direct(c) || c == '\0') { + return DIRECT; + } else if (c <= 0x7F) { + return ASCII; + } + return ILLEGAL; +} + +static bool is_utf16_cp_valid(uint16_t cp, bool is_surrogate) +{ + if (is_surrogate) { + return cp >= 0xDC00 && cp <= 0xDFFF; + } else { + /* 2nd part of surrogate pair came unexpectedly */ + return !(cp >= 0xDC00 && cp <= 0xDFFF); + } +} + +static bool should_direct_encode(uint32_t c) +{ + return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '\0' || c == '/' || c == '-' || can_end_base64(c); +} + +static bool can_encode_directly(unsigned char c) +{ + return should_direct_encode(c) || is_optional_direct(c) || c == '\0'; +} + +static bool mb_check_utf7(unsigned char *in, size_t in_len) +{ + unsigned char *p = in, *e = p + in_len; + bool base64 = false; + bool is_surrogate = false; + + while (p < e) { + if (base64) { + unsigned char n1 = decode_base64(*p++); + if (is_base64_end(n1)) { + if (!is_base64_end_valid(n1, false, is_surrogate)) { + return false; + } + base64 = false; + continue; + } else if (p == e) { + return false; + } + unsigned char n2 = decode_base64(*p++); + if (is_base64_end(n2) || p == e) { + return false; + } + unsigned char n3 = decode_base64(*p++); + if (is_base64_end(n3)) { + return false; + } + uint16_t cp1 = (n1 << 10) | (n2 << 4) | ((n3 & 0x3C) >> 2); + if (!is_utf16_cp_valid(cp1, is_surrogate)) { + return false; + } + is_surrogate = has_surrogate(cp1, is_surrogate); + if (p == e) { + /* It is an error if trailing padding bits are not zeroes or if we were + * expecting the 2nd part of a surrogate pair 
when Base64 section ends */ + return !((n3 & 0x3) || is_surrogate); + } + + unsigned char n4 = decode_base64(*p++); + if (is_base64_end(n4)) { + if (!is_base64_end_valid(n4, n3 & 0x3, is_surrogate)) { + return false; + } + base64 = false; + continue; + } else if (p == e) { + return false; + } + unsigned char n5 = decode_base64(*p++); + if (is_base64_end(n5) || p == e) { + return false; + } + unsigned char n6 = decode_base64(*p++); + if (is_base64_end(n6)) { + return false; + } + uint16_t cp2 = (n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4); + if (!is_utf16_cp_valid(cp2, is_surrogate)) { + return false; + } + is_surrogate = has_surrogate(cp2, is_surrogate); + if (p == e) { + return !((n6 & 0xF) || is_surrogate); + } + + unsigned char n7 = decode_base64(*p++); + if (is_base64_end(n7)) { + if (!is_base64_end_valid(n7, n6 & 0xF, is_surrogate)) { + return false; + } + base64 = false; + continue; + } else if (p == e) { + return false; + } + unsigned char n8 = decode_base64(*p++); + if (is_base64_end(n8)) { + return false; + } + uint16_t cp3 = (n6 << 12) | (n7 << 6) | n8; + if (!is_utf16_cp_valid(cp3, is_surrogate)) { + return false; + } + is_surrogate = has_surrogate(cp3, is_surrogate); + } else { + /* ASCII text section */ + unsigned char c = *p++; + + if (c == '+') { + if (p == e) { + base64 = true; + return !is_surrogate; + } + unsigned char n = decode_base64(*p); + if (n == DASH) { + p++; + } else if (n > DASH) { + /* If a "+" character followed immediately by any character other than base64 or "-" */ + return false; + } else { + base64 = true; + } + } else if (can_encode_directly(c)) { + continue; + } else { + return false; + } + } + } + return !is_surrogate; +} diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c b/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c index 821155f1b71..012904a287e 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf7imap.c @@ -77,9 +77,11 @@ #include 
"mbfilter.h" #include "mbfilter_utf7imap.h" +#include "utf7_helper.h" static int mbfl_filt_conv_wchar_utf7imap_flush(mbfl_convert_filter *filter); static int mbfl_filt_conv_utf7imap_wchar_flush(mbfl_convert_filter *filter); +static bool mb_check_utf7imap(unsigned char *in, size_t in_len); static const char *mbfl_encoding_utf7imap_aliases[] = {"mUTF-7", NULL}; @@ -91,7 +93,8 @@ const mbfl_encoding mbfl_encoding_utf7imap = { NULL, 0, &vtbl_utf7imap_wchar, - &vtbl_wchar_utf7imap + &vtbl_wchar_utf7imap, + mb_check_utf7imap }; const struct mbfl_convert_vtbl vtbl_utf7imap_wchar = { @@ -437,3 +440,142 @@ static int mbfl_filt_conv_wchar_utf7imap_flush(mbfl_convert_filter *filter) } return 0; } + +static unsigned char decode_base64(unsigned char c) +{ + if (c >= 'A' && c <= 'Z') { + return c - 65; + } else if (c >= 'a' && c <= 'z') { + return c - 71; + } else if (c >= '0' && c <= '9') { + return c + 4; + } else if (c == '+') { + return 62; + } else if (c == ',') { + return 63; + } else if (c == '-') { + return DASH; + } + return ILLEGAL; +} + +static bool is_utf16_cp_valid(uint16_t cp, bool is_surrogate) +{ + if (is_surrogate) { + return cp >= 0xDC00 && cp <= 0xDFFF; + } else if (cp >= 0xDC00 && cp <= 0xDFFF) { + /* 2nd part of surrogate pair came unexpectedly */ + return false; + } else if (cp >= 0x20 && cp <= 0x7E && cp != '&') { + return false; + } + return true; +} + +static bool mb_check_utf7imap(unsigned char *in, size_t in_len) +{ + unsigned char *p = in, *e = p + in_len; + bool base64 = false; + bool is_surrogate = false; + + while (p < e) { + if (base64) { + /* Base64 section */ + unsigned char n1 = decode_base64(*p++); + if (is_base64_end(n1)) { + if (!is_base64_end_valid(n1, false, is_surrogate)) { + return false; + } + base64 = false; + continue; + } else if (p == e) { + return false; + } + unsigned char n2 = decode_base64(*p++); + if (is_base64_end(n2) || p == e) { + return false; + } + unsigned char n3 = decode_base64(*p++); + if (is_base64_end(n3)) { + return 
false; + } + uint16_t cp1 = (n1 << 10) | (n2 << 4) | ((n3 & 0x3C) >> 2); + if (!is_utf16_cp_valid(cp1, is_surrogate)) { + return false; + } + is_surrogate = has_surrogate(cp1, is_surrogate); + if (p == e) { + return false; + } + + unsigned char n4 = decode_base64(*p++); + if (is_base64_end(n4)) { + if (!is_base64_end_valid(n4, n3 & 0x3, is_surrogate)) { + return false; + } + base64 = false; + continue; + } else if (p == e) { + return false; + } + unsigned char n5 = decode_base64(*p++); + if (is_base64_end(n5) || p == e) { + return false; + } + unsigned char n6 = decode_base64(*p++); + if (is_base64_end(n6)) { + return false; + } + uint16_t cp2 = (n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4); + if (!is_utf16_cp_valid(cp2, is_surrogate)) { + return false; + } + is_surrogate = has_surrogate(cp2, is_surrogate); + if (p == e) { + return false; + } + + unsigned char n7 = decode_base64(*p++); + if (is_base64_end(n7)) { + if (!is_base64_end_valid(n7, n6 & 0xF, is_surrogate)) { + return false; + } + base64 = false; + continue; + } else if (p == e) { + return false; + } + unsigned char n8 = decode_base64(*p++); + if (is_base64_end(n8)) { + return false; + } + uint16_t cp3 = (n6 << 12) | (n7 << 6) | n8; + if (!is_utf16_cp_valid(cp3, is_surrogate)) { + return false; + } + is_surrogate = has_surrogate(cp3, is_surrogate); + } else { + /* ASCII text section */ + unsigned char c = *p++; + + if (c == '&') { + if (p == e) { + return false; + } + unsigned char n = decode_base64(*p); + if (n == DASH) { + p++; + } else if (n == ILLEGAL) { + return false; + } else { + base64 = true; + } + } else if (c >= 0x20 && c <= 0x7E) { + continue; + } else { + return false; + } + } + } + return !base64; +} diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c index 7ab4c5f96b8..2e71888327b 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf8.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf8.c @@ -59,7 +59,8 @@ const mbfl_encoding 
mbfl_encoding_utf8 = { mblen_table_utf8, 0, &vtbl_utf8_wchar, - &vtbl_wchar_utf8 + &vtbl_wchar_utf8, + NULL }; const struct mbfl_convert_vtbl vtbl_utf8_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_utf8_mobile.c b/ext/mbstring/libmbfl/filters/mbfilter_utf8_mobile.c index c52459dc3ab..a3dc2f8249a 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_utf8_mobile.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_utf8_mobile.c @@ -49,7 +49,8 @@ const mbfl_encoding mbfl_encoding_utf8_docomo = { mblen_table_utf8, 0, &vtbl_utf8_docomo_wchar, - &vtbl_wchar_utf8_docomo + &vtbl_wchar_utf8_docomo, + NULL }; const mbfl_encoding mbfl_encoding_utf8_kddi_a = { @@ -60,7 +61,8 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_a = { mblen_table_utf8, 0, &vtbl_utf8_kddi_a_wchar, - &vtbl_wchar_utf8_kddi_a + &vtbl_wchar_utf8_kddi_a, + NULL }; const mbfl_encoding mbfl_encoding_utf8_kddi_b = { @@ -71,7 +73,8 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_b = { mblen_table_utf8, 0, &vtbl_utf8_kddi_b_wchar, - &vtbl_wchar_utf8_kddi_b + &vtbl_wchar_utf8_kddi_b, + NULL }; const mbfl_encoding mbfl_encoding_utf8_sb = { @@ -82,7 +85,8 @@ const mbfl_encoding mbfl_encoding_utf8_sb = { mblen_table_utf8, 0, &vtbl_utf8_sb_wchar, - &vtbl_wchar_utf8_sb + &vtbl_wchar_utf8_sb, + NULL }; const struct mbfl_convert_vtbl vtbl_utf8_docomo_wchar = { diff --git a/ext/mbstring/libmbfl/filters/mbfilter_uuencode.c b/ext/mbstring/libmbfl/filters/mbfilter_uuencode.c index d602274db62..51e0e8777c1 100644 --- a/ext/mbstring/libmbfl/filters/mbfilter_uuencode.c +++ b/ext/mbstring/libmbfl/filters/mbfilter_uuencode.c @@ -38,6 +38,7 @@ const mbfl_encoding mbfl_encoding_uuencode = { NULL, MBFL_ENCTYPE_SBCS, NULL, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/filters/utf7_helper.h b/ext/mbstring/libmbfl/filters/utf7_helper.h new file mode 100644 index 00000000000..4e13bf94da0 --- /dev/null +++ b/ext/mbstring/libmbfl/filters/utf7_helper.h @@ -0,0 +1,27 @@ +#ifndef MBFL_UTF7_HELPER_H +#define MBFL_UTF7_HELPER_H + 
+#include "mbfilter.h" + +/* Ways which a Base64-encoded section can end: */ +#define DASH 0xFC +#define DIRECT 0xFD +#define ASCII 0xFE +#define ILLEGAL 0xFF +/* True when c is one of the four sentinel codes above (all of them are >= 0xFC, i.e. >= DASH). */ +static inline bool is_base64_end(unsigned char c) +{ + return c >= DASH; +} +/* Decide whether a Base64 section may end on terminator code n. It may not when gap is non-zero (callers pass the unused low bits of the last Base64 character), when is_surrogate is still set, or when n is ASCII or ILLEGAL. */ +static inline bool is_base64_end_valid(unsigned char n, bool gap, bool is_surrogate) +{ + return !(gap || is_surrogate || n == ASCII || n == ILLEGAL); +} +/* Compute the is_surrogate state to carry to the next 16-bit unit: true only when is_surrogate was not already set and cp is a high surrogate (0xD800-0xDBFF). */ +static inline bool has_surrogate(uint16_t cp, bool is_surrogate) +{ + return !is_surrogate && cp >= 0xD800 && cp <= 0xDBFF; +} + +#endif /* MBFL_UTF7_HELPER_H */ diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter.c b/ext/mbstring/libmbfl/mbfl/mbfilter.c index 4820f15e214..4fffb90f8e7 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter.c +++ b/ext/mbstring/libmbfl/mbfl/mbfilter.c @@ -376,6 +376,16 @@ int mbfl_encoding_detector_feed(mbfl_encoding_detector *identd, mbfl_string *str unsigned char *p = string->val; int bad = 0; + if (identd->strict) { /* strict detection: before the per-byte loop, call each candidate encoding's check callback (when it has one) on (p, n) and count a failed check as one illegal character */ + for (int i = 0; i < num; i++) { + mbfl_convert_filter *filter = identd->filter_list[i]; + mbfl_encoding_detector_data *data = &identd->filter_data[i]; + if (filter->from->check != NULL && !(filter->from->check)(p, n)) { + data->num_illegalchars++; + } + } + } + while (n--) { for (int i = 0; i < num; i++) { mbfl_convert_filter *filter = identd->filter_list[i]; diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c b/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c index e348ddb69cc..4745a5848de 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c +++ b/ext/mbstring/libmbfl/mbfl/mbfilter_8bit.c @@ -47,7 +47,8 @@ const mbfl_encoding mbfl_encoding_8bit = { NULL, MBFL_ENCTYPE_SBCS, &vtbl_8bit_wchar, - &vtbl_wchar_8bit + &vtbl_wchar_8bit, + NULL }; const struct mbfl_convert_vtbl vtbl_8bit_wchar = { diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter_pass.c b/ext/mbstring/libmbfl/mbfl/mbfilter_pass.c index e43f746ecca..63525cc4960 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter_pass.c +++ b/ext/mbstring/libmbfl/mbfl/mbfilter_pass.c @@ -42,6 +42,7 @@
const mbfl_encoding mbfl_encoding_pass = { NULL, 0, NULL, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/mbfl/mbfilter_wchar.c b/ext/mbstring/libmbfl/mbfl/mbfilter_wchar.c index a2b22c9105a..3b20e3ffdab 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfilter_wchar.c +++ b/ext/mbstring/libmbfl/mbfl/mbfilter_wchar.c @@ -40,5 +40,6 @@ const mbfl_encoding mbfl_encoding_wchar = { NULL, MBFL_ENCTYPE_WCS4, NULL, + NULL, NULL }; diff --git a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h index 09505d72382..f024e437814 100644 --- a/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h +++ b/ext/mbstring/libmbfl/mbfl/mbfl_encoding.h @@ -32,6 +32,7 @@ #define MBFL_ENCODING_H #include "mbfl_defs.h" +#include "zend.h" enum mbfl_no_encoding { mbfl_no_encoding_invalid = -1, @@ -132,6 +133,8 @@ struct mbfl_convert_vtbl { void (*filter_copy)(struct _mbfl_convert_filter *src, struct _mbfl_convert_filter *dest); }; +typedef bool (*mb_check_fn)(unsigned char *in, size_t in_len); + /* * encoding */ @@ -144,6 +147,7 @@ typedef struct _mbfl_encoding { unsigned int flag; const struct mbfl_convert_vtbl *input_filter; const struct mbfl_convert_vtbl *output_filter; + mb_check_fn check; } mbfl_encoding; MBFLAPI extern const mbfl_encoding *mbfl_name2encoding(const char *name); diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 9e0248e29e8..238d74e4838 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -3917,6 +3917,11 @@ MBSTRING_API int php_mb_check_encoding(const char *input, size_t length, const m { mbfl_convert_filter *filter = mbfl_convert_filter_new(encoding, &mbfl_encoding_wchar, mbfl_filt_check_errors, NULL, &filter); + if (encoding->check != NULL) { + mbfl_convert_filter_delete(filter); + return encoding->check((unsigned char*)input, length); + } + while (length--) { unsigned char c = *input++; (filter->filter_function)(c, filter); diff --git a/ext/mbstring/tests/gh10192_utf7.phpt b/ext/mbstring/tests/gh10192_utf7.phpt new 
file mode 100644 index 00000000000..eac0c9c00f1 --- /dev/null +++ b/ext/mbstring/tests/gh10192_utf7.phpt @@ -0,0 +1,462 @@ +--TEST-- +GH-10192 (mb_detect_encoding() results for UTF-7 differ between PHP 8.0 and 8.1) +--EXTENSIONS-- +mbstring +--FILE-- + 'A + B', + 'non-base64 character after -' => 'A - B', + 'base64 character before +' => 'A 1+ B', + 'base64 character before -' => 'A 1- B', + 'base64 character after +' => 'A +1 B', + 'base64 character after -' => 'A -1 B', + 'base64 character before and after +' => 'A 1+1 B', + 'base64 character before and after -' => 'A 1-1 B', + 'string ends with +' => 'A +', + 'string ends with -' => 'A -', + '+ and -' => 'A +- B', + '- and +' => 'A -+ B', + 'valid direct encoding character =' => 'A = B', + 'invalid direct encoding character ~' => 'A ~ B', + 'invalid direct encoding character \\' => 'A \\ B', + 'invalid direct encoding character ESC' => "A \x1b B", + 'valid direct encoding character = after +' => 'A += B', + 'invalid direct encoding character ~ after +' => 'A +~ B', + 'invalid direct encoding character \\ after +' => 'A +\\ B', + 'invalid direct encoding character ESC after +' => "A +\x1b B", + 'valid base64 character between + and -' => 'A +ZeVnLIqe- B', // 日本語 in UTF-16BE + 'invalid base64 character between + and -' => 'A +ZeVnLIq- B', // 日本語 in UTF-16BE without the last character + 'valid base64 character between + and non-base64 character' => 'A +ZeVnLIqe B', + 'invalid base64 character between + and non-base64 character' => 'A +ZeVnLIq B', + 'valid base64 character between + and base64 character' => 'A +ZeVnLIqe1 B', + 'invalid base64 character between + and base64 character' => 'A +ZeVnLIq1 B', + 'valid base64 character between + and end of string' => 'A +ZeVnLIqe', + 'invalid base64 character between + and end of string' => 'A +ZeVnLIq', + 'valid base64 character consisting only of + between + and -' => 'A +++++++++- B', + 'invalid base64 character consisting only of + between + and -' => 'A +++++++++- B', 
+ 'valid base64 character consisting only of + between + and non-base64 character' => 'A +++++++++ B', + 'invalid base64 character consisting only of + between + and non-base64 character' => 'A +++++++++ B', + 'valid base64 character consisting only of + between + and base64 character' => 'A +++++++++1 B', + 'invalid base64 character consisting only of + between + and base64 character' => 'A +++++++++1 B', + 'valid base64 character consisting only of + between + and end of string' => 'A +++++++++', + 'invalid base64 character consisting only of + between + and end of string' => 'A +++++++++', + 'valid base64 character using surrogate pair between + and -' => 'A +2GfePQ- B', // 𩸽 in UTF-16BE + 'invalid base64 character using surrogate pair between + and -' => 'A +2Gc- B', // first 16 bits of 𩸽 in UTF-16BE + 'valid base64 character using surrogate pair between + and non-base64 character' => 'A +2GfePQ B', + 'invalid base64 character using surrogate pair between + and non-base64 character' => 'A +2Gc B', + 'valid base64 character using surrogate pair between + and base64 character' => 'A +2GfePQ1 B', + 'invalid base64 character using surrogate pair between + and base64 character' => 'A +2Gc1 B', + 'valid base64 character using surrogate pair between + and end of string' => 'A +2GfePQ', + 'invalid base64 character using surrogate pair between + and end of string' => 'A +2Gc' +]; + +foreach ($testcases as $title => $case) { + echo $title . 
PHP_EOL; + var_dump(mb_detect_encoding($case, 'UTF-8, UTF-7', true)); + var_dump(mb_detect_encoding($case, 'UTF-8, UTF-7', false)); + var_dump(mb_detect_encoding($case, 'UTF-7', true)); + var_dump(mb_detect_encoding($case, 'UTF-7', false)); + var_dump(mb_check_encoding($case, 'UTF-7')); + var_dump(addcslashes(mb_convert_encoding($case, 'UTF-8', 'UTF-7'), "\0..\37\177")); + var_dump(mb_get_info('illegal_chars')); + echo PHP_EOL; +} +?> +--EXPECT-- +non-base64 character after + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(4) "A B" +int(0) + +non-base64 character after - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(5) "A - B" +int(0) + +base64 character before + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A 1 B" +int(0) + +base64 character before - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(6) "A 1- B" +int(0) + +base64 character after + +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ? B" +int(1) + +base64 character after - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(6) "A -1 B" +int(1) + +base64 character before and after + +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(6) "A 1? 
B" +int(2) + +base64 character before and after - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(7) "A 1-1 B" +int(2) + +string ends with + +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(2) "A " +int(2) + +string ends with - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(3) "A -" +int(2) + ++ and - +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(5) "A + B" +int(2) + +- and + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A - B" +int(2) + +valid direct encoding character = +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(5) "A = B" +int(2) + +invalid direct encoding character ~ +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ~ B" +int(2) + +invalid direct encoding character \ +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A \ B" +int(2) + +invalid direct encoding character ESC +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(8) "A \033 B" +int(2) + +valid direct encoding character = after + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A = B" +int(2) + +invalid direct encoding character ~ after + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ~ B" +int(2) + +invalid direct encoding character \ after + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A \ B" +int(2) + +invalid direct encoding character ESC after + +string(5) "UTF-8" +string(5) "UTF-7" +bool(false) +string(5) "UTF-7" +bool(false) +string(8) "A \033 B" +int(2) + +valid base64 character between + and - +string(5) "UTF-7" +string(5) "UTF-7" 
+string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A 日本語 B" +int(2) + +invalid base64 character between + and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(11) "A 日本? B" +int(3) + +valid base64 character between + and non-base64 character +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A 日本語 B" +int(3) + +invalid base64 character between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(11) "A 日本? B" +int(4) + +valid base64 character between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(14) "A 日本語? B" +int(5) + +invalid base64 character between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A 日本誵 B" +int(5) + +valid base64 character between + and end of string +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(11) "A 日本語" +int(5) + +invalid base64 character between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(9) "A 日本?" 
+int(6) + +valid base64 character consisting only of + between + and - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A ﯯ뻻 B" +int(6) + +invalid base64 character consisting only of + between + and - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A ﯯ뻻 B" +int(6) + +valid base64 character consisting only of + between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A ﯯ뻻 B" +int(6) + +invalid base64 character consisting only of + between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(13) "A ﯯ뻻 B" +int(6) + +valid base64 character consisting only of + between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(14) "A ﯯ뻻? B" +int(7) + +invalid base64 character consisting only of + between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(14) "A ﯯ뻻? B" +int(8) + +valid base64 character consisting only of + between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(11) "A ﯯ뻻" +int(8) + +invalid base64 character consisting only of + between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(11) "A ﯯ뻻" +int(8) + +valid base64 character using surrogate pair between + and - +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(8) "A 𩸽 B" +int(8) + +invalid base64 character using surrogate pair between + and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ? 
B" +int(9) + +valid base64 character using surrogate pair between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(8) "A 𩸽 B" +int(9) + +invalid base64 character using surrogate pair between + and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ? B" +int(10) + +valid base64 character using surrogate pair between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(9) "A 𩸽? B" +int(11) + +invalid base64 character using surrogate pair between + and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(5) "A ? B" +int(12) + +valid base64 character using surrogate pair between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +string(5) "UTF-7" +string(5) "UTF-7" +bool(true) +string(6) "A 𩸽" +int(12) + +invalid base64 character using surrogate pair between + and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(5) "UTF-7" +bool(false) +string(3) "A ?" 
+int(13) diff --git a/ext/mbstring/tests/gh10192_utf7imap.phpt b/ext/mbstring/tests/gh10192_utf7imap.phpt new file mode 100644 index 00000000000..574d994653c --- /dev/null +++ b/ext/mbstring/tests/gh10192_utf7imap.phpt @@ -0,0 +1,343 @@ +--TEST-- +GH-10192 (mb_detect_encoding() results for UTF-7 differ between PHP 8.0 and 8.1) +--EXTENSIONS-- +mbstring +--FILE-- + 'A & B', + 'non-base64 character after -' => 'A - B', + 'base64 character before &' => 'A 1& B', + 'base64 character before -' => 'A 1- B', + 'base64 character after &' => 'A &1 B', + 'base64 character after -' => 'A -1 B', + 'base64 character before and after &' => 'A 1&1 B', + 'base64 character before and after -' => 'A 1-1 B', + 'string ends with &' => 'A &', + 'string ends with -' => 'A -', + '& and -' => 'A &- B', + '- and &' => 'A -& B', + 'valid direct encoding character ~' => 'A ~ B', + 'invalid direct encoding character ESC' => "A \x1b B", + 'valid direct encoding character ~ after &' => 'A &~ B', + 'invalid direct encoding character ESC after &' => "A &\x1b B", + 'valid base64 character between & and -' => 'A &ZeVnLIqe- B', // 日本語 in UTF-16BE + 'invalid base64 character between & and -' => 'A &ZeVnLIq- B', // 日本語 in UTF-16BE without the last character + 'valid base64 character between & and non-base64 character' => 'A &ZeVnLIqe B', + 'invalid base64 character between & and non-base64 character' => 'A &ZeVnLIq B', + 'valid base64 character between & and base64 character' => 'A &ZeVnLIqe1 B', + 'invalid base64 character between & and base64 character' => 'A &ZeVnLIq1 B', + 'valid base64 character between & and end of string' => 'A &ZeVnLIqe', + 'invalid base64 character between & and end of string' => 'A &ZeVnLIq', + 'valid base64 character using surrogate pair between & and -' => 'A &2GfePQ- B', // 𩸽 in UTF-16BE + 'invalid base64 character using surrogate pair between & and -' => 'A &2Gc- B', // first 16 bits of 𩸽 in UTF-16BE + 'valid base64 character using surrogate pair between & and non-base64 
character' => 'A &2GfePQ B', + 'invalid base64 character using surrogate pair between & and non-base64 character' => 'A &2Gc B', + 'valid base64 character using surrogate pair between & and base64 character' => 'A &2GfePQ1 B', + 'invalid base64 character using surrogate pair between & and base64 character' => 'A &2Gc1 B', + 'valid base64 character using surrogate pair between & and end of string' => 'A &2GfePQ', + 'invalid base64 character using surrogate pair between & and end of string' => 'A &2Gc' +]; + +foreach ($testcases as $title => $case) { + echo $title . PHP_EOL; + var_dump(mb_detect_encoding($case, 'UTF-8, UTF7-IMAP', true)); + var_dump(mb_detect_encoding($case, 'UTF-8, UTF7-IMAP', false)); + var_dump(mb_detect_encoding($case, 'UTF7-IMAP', true)); + var_dump(mb_detect_encoding($case, 'UTF7-IMAP', false)); + var_dump(mb_check_encoding($case, 'UTF7-IMAP')); + var_dump(addcslashes(mb_convert_encoding($case, 'UTF-8', 'UTF7-IMAP'), "\0..\37\177")); + var_dump(mb_get_info('illegal_chars')); + echo PHP_EOL; +} + +?> +--EXPECT-- +non-base64 character after & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(4) "A ?B" +int(1) + +non-base64 character after - +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(5) "A - B" +int(1) + +base64 character before & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A 1?B" +int(2) + +base64 character before - +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(6) "A 1- B" +int(2) + +base64 character after & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(4) "A ?B" +int(3) + +base64 character after - +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(6) "A -1 B" +int(3) + +base64 character before and after & +string(5) "UTF-8" 
+string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A 1?B" +int(4) + +base64 character before and after - +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(7) "A 1-1 B" +int(4) + +string ends with & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(3) "A ?" +int(5) + +string ends with - +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(3) "A -" +int(5) + +& and - +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(5) "A & B" +int(5) + +- and & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A -?B" +int(6) + +valid direct encoding character ~ +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(5) "A ~ B" +int(6) + +invalid direct encoding character ESC +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ? B" +int(7) + +valid direct encoding character ~ after & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ? B" +int(8) + +invalid direct encoding character ESC after & +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ? B" +int(9) + +valid base64 character between & and - +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(13) "A 日本語 B" +int(9) + +invalid base64 character between & and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(11) "A 日本? 
B" +int(10) + +valid base64 character between & and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(13) "A 日本語?B" +int(11) + +invalid base64 character between & and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(10) "A 日本?B" +int(12) + +valid base64 character between & and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(13) "A 日本語?B" +int(13) + +invalid base64 character between & and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(13) "A 日本誵?B" +int(14) + +valid base64 character between & and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(12) "A 日本語?" +int(15) + +invalid base64 character between & and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(9) "A 日本?" +int(16) + +valid base64 character using surrogate pair between & and - +string(5) "UTF-8" +string(5) "UTF-8" +string(9) "UTF7-IMAP" +string(9) "UTF7-IMAP" +bool(true) +string(8) "A 𩸽 B" +int(16) + +invalid base64 character using surrogate pair between & and - +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(5) "A ? 
B" +int(17) + +valid base64 character using surrogate pair between & and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(8) "A 𩸽?B" +int(18) + +invalid base64 character using surrogate pair between & and non-base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(4) "A ?B" +int(19) + +valid base64 character using surrogate pair between & and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(8) "A 𩸽?B" +int(20) + +invalid base64 character using surrogate pair between & and base64 character +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(4) "A ?B" +int(21) + +valid base64 character using surrogate pair between & and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(7) "A 𩸽?" +int(22) + +invalid base64 character using surrogate pair between & and end of string +string(5) "UTF-8" +string(5) "UTF-8" +bool(false) +string(9) "UTF7-IMAP" +bool(false) +string(3) "A ?" 
+int(23) diff --git a/ext/mbstring/tests/gh10648.phpt b/ext/mbstring/tests/gh10648.phpt new file mode 100644 index 00000000000..9f0b4b4db15 --- /dev/null +++ b/ext/mbstring/tests/gh10648.phpt @@ -0,0 +1,155 @@ +--TEST-- +GH-10648 (mb_check_encoding() returns true for incorrect but interpretable ISO-2022-JP byte sequences) +--EXTENSIONS-- +mbstring +--FILE-- +<?php +$testcases = [ + 'ISO-2022-JP bytes' => '1b244224221b2842', // 'あ' in ISO-2022-JP + 'ISO-2022-JP bytes without escape sequence' => '1b24422422', // 'あ' in ISO-2022-JP, without the trailing ESC ( B + 'JIS X 0201 7bit kana with escape sequence' => '1b2849311b2842', // 'ア' in JIS + 'JIS X 0201 7bit kana with SO/SI' => '0e310f', // 'ア' in JIS + 'JIS X 0201 8bit kana' => 'b1', // 'ア' in JIS + 'JIS X 0201 7bit kana with SO and ESC' => '0e311b2842', // 'ア' in JIS + 'JIS X 0201 7bit kana with ESC and SI' => '1b2849310f', // 'ア' in JIS + 'JIS X 0208 character' => '1b244242641b2842', // '鯛' in JIS and ISO-2022-JP, included in JIS X 0208 + 'JIS X 0212 character' => '1b2428446a591b2842', // '鮋' in JIS, included in JIS X 0212 + 'JIS X 0213 character' => '1b2428507d4c1b2842', // '𩸽' in ISO-2022-JP-2004, included in JIS X 0213 + 'JIS C 6220-1969 ESC ( H' => '1b284a1b2848', // an escape sequence transitioning to ASCII + 'SO/SI when not in ASCII mode' => '1b284a0e0f1b2842', // SO and SI issued after ESC ( J, i.e. while not in ASCII mode +]; + +foreach ($testcases as $title => $case) { + echo $title . PHP_EOL; + echo 'JIS:' . PHP_EOL; + var_dump(mb_check_encoding(hex2bin($case), 'JIS')); + echo mb_convert_encoding(hex2bin($case), 'UTF-8', 'JIS'). PHP_EOL; + var_dump(mb_get_info('illegal_chars')); + echo 'ISO-2022-JP:' . PHP_EOL; + var_dump(mb_check_encoding(hex2bin($case), 'ISO-2022-JP')); + echo mb_convert_encoding(hex2bin($case), 'UTF-8', 'ISO-2022-JP').
PHP_EOL; + var_dump(mb_get_info('illegal_chars')); + echo PHP_EOL; +} +?> +--EXPECT-- +ISO-2022-JP bytes +JIS: +bool(true) +あ +int(0) +ISO-2022-JP: +bool(true) +あ +int(0) + +ISO-2022-JP bytes without escape sequence +JIS: +bool(false) +あ +int(0) +ISO-2022-JP: +bool(false) +あ +int(0) + +JIS X 0201 7bit kana with escape sequence +JIS: +bool(true) +ア +int(0) +ISO-2022-JP: +bool(false) +ア +int(0) + +JIS X 0201 7bit kana with SO/SI +JIS: +bool(true) +ア +int(0) +ISO-2022-JP: +bool(false) +ア +int(0) + +JIS X 0201 8bit kana +JIS: +bool(true) +ア +int(0) +ISO-2022-JP: +bool(false) +ア +int(0) + +JIS X 0201 7bit kana with SO and ESC +JIS: +bool(false) +ア +int(0) +ISO-2022-JP: +bool(false) +ア +int(0) + +JIS X 0201 7bit kana with ESC and SI +JIS: +bool(false) +ア +int(0) +ISO-2022-JP: +bool(false) +ア +int(0) + +JIS X 0208 character +JIS: +bool(true) +鯛 +int(0) +ISO-2022-JP: +bool(true) +鯛 +int(0) + +JIS X 0212 character +JIS: +bool(true) +鮋 +int(0) +ISO-2022-JP: +bool(false) +鮋 +int(0) + +JIS X 0213 character +JIS: +bool(false) +?$(P}L +int(1) +ISO-2022-JP: +bool(false) +?$(P}L +int(2) + +JIS C 6220-1969 ESC ( H +JIS: +bool(true) + +int(2) +ISO-2022-JP: +bool(false) + +int(2) + +SO/SI when not in ASCII mode +JIS: +bool(false) + +int(2) +ISO-2022-JP: +bool(false) + +int(2) diff --git a/ext/mbstring/tests/iso2022jp_encoding.phpt b/ext/mbstring/tests/iso2022jp_encoding.phpt index 54653d1bbf7..d47ad1406da 100644 --- a/ext/mbstring/tests/iso2022jp_encoding.phpt +++ b/ext/mbstring/tests/iso2022jp_encoding.phpt @@ -50,11 +50,6 @@ function testValid($from, $to, $encoding, $bothWays = true) { /* ESC ( B at the beginning is redundant, since ASCII mode is the default */ if (substr($from, 0, 3) == "\x1B(B") $from = substr($from, 3, strlen($from) - 3); - /* If the string switches to a different charset, it should switch back to - * ASCII at the end */ - if (strpos($from, "\x1B\$B") !== false || strpos($from, "\x1B(J") !== false) - $from .= "\x1B(B"; - convertValidString($to, $from, 
'UTF-16BE', $encoding, false); } } @@ -66,11 +61,11 @@ function testInvalid($from, $to, $encoding) { for ($i = 0; $i < 0x80; $i++) { if ($i == 0xE || $i == 0xF || $i == 0x1B) continue; - testValid(chr($i), "\x00" . chr($i), 'JIS'); - testValid("\x0F" . chr($i), "\x00" . chr($i), 'JIS'); /* 0xF is 'Shift Out' code */ - testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'JIS'); - testValid(chr($i), "\x00" . chr($i), 'ISO-2022-JP'); - testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'ISO-2022-JP'); + testValid(chr($i), "\x00" . chr($i), 'JIS'); + convertValidString("\x0F" . chr($i), "\x00" . chr($i), 'JIS', 'UTF-16BE', false); /* 0xF is 'Shift In' code */ + testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'JIS'); + testValid(chr($i), "\x00" . chr($i), 'ISO-2022-JP'); + testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'ISO-2022-JP'); } for ($i = 0x80; $i < 256; $i++) { @@ -92,27 +87,27 @@ echo "ASCII support OK\n"; foreach ($jisx0201Chars as $jisx0201 => $utf16BE) { if (ord($jisx0201) >= 128) { $kana = chr(ord($jisx0201) - 128); - testValid("\x1B(I" . $kana, $utf16BE, 'JIS', false); - testValid("\x0E" . $kana, $utf16BE, 'JIS', false); /* 0xE is 'Shift In' code */ + testValid("\x1B(I" . $kana . "\x1B(B", $utf16BE, 'JIS', false); + testValid("\x0E" . $kana . "\x0F", $utf16BE, 'JIS', false); /* 0xE is 'Shift Out' code */ testValid($jisx0201, $utf16BE, 'JIS', false); } else { - testValid("\x1B(J" . $jisx0201, $utf16BE, 'JIS', $utf16BE > "\x00\x80"); + testValid("\x1B(J" . $jisx0201 . "\x1B(B", $utf16BE, 'JIS', $utf16BE > "\x00\x80"); } } for ($i = 0x80; $i < 256; $i++) { if ($i >= 0xA1 && $i <= 0xDF) continue; - testInvalid("\x1B(I" . chr($i), "\x00%", 'JIS'); - testInvalid("\x1B(J" . chr($i), "\x00%", 'JIS'); + testInvalid("\x1B(I" . chr($i) . "\x1B(B", "\x00%", 'JIS'); + testInvalid("\x1B(J" . chr($i) . "\x1B(B", "\x00%", 'JIS'); } echo "JIS X 0201 support OK\n"; /* All valid JISX0208 characters */ foreach ($jisx0208Chars as $jisx0208 => $utf16BE) { - testValid("\x1B\$B" . 
$jisx0208, $utf16BE, 'JIS'); - testValid("\x1B\$B" . $jisx0208, $utf16BE, 'ISO-2022-JP'); + testValid("\x1B\$B" . $jisx0208 . "\x1B(B", $utf16BE, 'JIS'); + testValid("\x1B\$B" . $jisx0208 . "\x1B(B", $utf16BE, 'ISO-2022-JP'); } /* All invalid 2-byte JISX0208 characters */ @@ -120,8 +115,8 @@ for ($i = 0x21; $i <= 0x7E; $i++) { for ($j = 0; $j < 256; $j++) { $testString = chr($i) . chr($j); if (!isset($jisx0208Chars[$testString])) { - testInvalid("\x1B\$B" . $testString, "\x00%", 'JIS'); - testInvalid("\x1B\$B" . $testString, "\x00%", 'ISO-2022-JP'); + testInvalid("\x1B\$B" . $testString . "\x1B(B", "\x00%", 'JIS'); + testInvalid("\x1B\$B" . $testString . "\x1B(B", "\x00%", 'ISO-2022-JP'); } } } @@ -138,7 +133,7 @@ echo "JIS X 0208 support OK\n"; /* All valid JISX0212 characters */ foreach ($jisx0212Chars as $jisx0212 => $utf16BE) { - testValid("\x1B\$(D" . $jisx0212, $utf16BE, 'JIS', false); + testValid("\x1B\$(D" . $jisx0212 . "\x1B(B", $utf16BE, 'JIS', false); } /* All invalid 2-byte JISX0212 characters */ @@ -146,42 +141,49 @@ for ($i = 0x21; $i <= 0x7E; $i++) { for ($j = 0; $j < 256; $j++) { $testString = chr($i) . chr($j); if (!isset($jisx0212Chars[$testString])) { - testInvalid("\x1B\$(D" . $testString, "\x00%", 'JIS'); + testInvalid("\x1B\$(D" . $testString . "\x1B(B", "\x00%", 'JIS'); } } } /* Try truncated JISX0212 characters */ for ($i = 0x21; $i <= 0x7E; $i++) { - testInvalid("\x1B\$(D" . chr($i), "\x00%", 'JIS'); + testInvalid("\x1B\$(D" . chr($i) . 
"\x1B(B", "\x00%\x00%", 'JIS'); } echo "JIS X 0212 support OK\n"; /* All possible escape sequences */ -$validEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B\$(@" => true, "\x1B\$(B" => true, "\x1B\$(D" => true, "\x1B(B" => true, "\x1B(H" => true, "\x1B(J" => true, "\x1B(I" => true]; +$validJisEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B\$(@" => true, "\x1B\$(B" => true, "\x1B\$(D" => true, "\x1B(B" => true, "\x1B(H" => true, "\x1B(J" => true, "\x1B(I" => true]; +$validIso2022jpEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B(B" => true, "\x1B(J" => true]; for ($i = 0; $i <= 0xFF; $i++) { for ($j = 0; $j <= 0xFF; $j++) { $escapeSequence = "\x1B" . chr($i) . chr($j); if ($escapeSequence === "\x1B\$(") continue; - if (isset($validEscapes[$escapeSequence])) { - testValid($escapeSequence, "", 'JIS', false); - testValid($escapeSequence, "", 'ISO-2022-JP', false); + if (isset($validJisEscapes[$escapeSequence])) { + testValid($escapeSequence . "\x1B(B", "", 'JIS', false); } else { - identifyInvalidString($escapeSequence, 'JIS'); - identifyInvalidString($escapeSequence, 'ISO-2022-JP'); + identifyInvalidString($escapeSequence . "\x1B(B", 'JIS'); + } + if (isset($validIso2022jpEscapes[$escapeSequence])) { + testValid($escapeSequence . "\x1B(B", "", 'ISO-2022-JP', false); + } else { + identifyInvalidString($escapeSequence . "\x1B(B", 'ISO-2022-JP'); } } } for ($i = 0; $i <= 0xFF; $i++) { $escapeSequence = "\x1B\$(" . chr($i); - if (isset($validEscapes[$escapeSequence])) { - testValid($escapeSequence, "", 'JIS', false); - testValid($escapeSequence, "", 'ISO-2022-JP', false); + if (isset($validJisEscapes[$escapeSequence])) { + testValid($escapeSequence . "\x1B(B", "", 'JIS', false); } else { - identifyInvalidString($escapeSequence, 'JIS'); - identifyInvalidString($escapeSequence, 'ISO-2022-JP'); + identifyInvalidString($escapeSequence . "\x1B(B", 'JIS'); + } + if (isset($validIso2022jpEscapes[$escapeSequence])) { + testValid($escapeSequence . 
"\x1B(B", "", 'ISO-2022-JP', false); + } else { + identifyInvalidString($escapeSequence . "\x1B(B", 'ISO-2022-JP'); } } diff --git a/ext/mbstring/tests/utf_encodings.phpt b/ext/mbstring/tests/utf_encodings.phpt index b0973e527e7..c07a27419b0 100644 --- a/ext/mbstring/tests/utf_encodings.phpt +++ b/ext/mbstring/tests/utf_encodings.phpt @@ -980,17 +980,8 @@ testValidString('+' . encode('AB', 'ASCII') . '-+' . encode('CD', 'ASCII') . '-' // (Just trying to be exhaustive here) testValidString('+' . encode('AB', 'ASCII') . '-!+' . encode('CD', 'ASCII') . '-', "\x00A\x00B\x00!\x00C\x00D", 'UTF-7', 'UTF-16BE', false); -// + section terminated by a non-Base64 ASCII character which is NOT - -for ($i = 0; $i < 128; $i++) { - if ($i >= ord('A') && $i <= ord('Z')) - continue; - if ($i >= ord('a') && $i <= ord('z')) - continue; - if ($i >= ord('0') && $i <= ord('9')) - continue; - if ($i == ord('+') || $i == ord('/') || $i == ord('-') || $i == ord('\\') || $i == ord('~')) - continue; - $char = chr($i); +// + section terminated by a non-Base64 direct character which is NOT - +foreach (str_split(" \t\r\n'(),.:?!\"#$%&*;<=>@[]^_`{|}\x00") as $char) { testValidString('+' . encode("\x12\x34", 'UTF-16BE') . $char, "\x00\x00\x12\x34\x00\x00\x00" . $char, 'UTF-7', 'UTF-32BE', false); }