mirror of
https://github.com/php/php-src.git
synced 2025-08-15 13:38:49 +02:00
Fix phpGH-10648: add check function pointer into mbfl_encoding
Previously, mbstring used the same logic for encoding validation as for
encoding conversion.
However, there are cases where we want to use different logic for validation
and conversion. For example, if a string ends before all of the input
required by the encoding has been supplied, or if it contains a character
that is invalid in the encoding but can still be converted, the conversion
should succeed and the validation should fail.
To achieve this, a function pointer mb_check_fn has been added to
struct mbfl_encoding to implement the logic used for validation.
Also added implementations of the validation logic for UTF-7, UTF7-IMAP,
ISO-2022-JP and JIS.
(The same change has already been made to PHP 8.2 and 8.3; see
6fc8d014df. This commit backports the change to PHP 8.1.)
This commit is contained in:
parent
8930bf8c33
commit
b721d0f71e
50 changed files with 1637 additions and 96 deletions
10
UPGRADING
10
UPGRADING
|
@ -489,6 +489,16 @@ PHP 8.1 UPGRADE NOTES
|
|||
. All GMP functions now accept octal strings with the leading octal prefix ("0o"/"0O")
|
||||
RFC: https://wiki.php.net/rfc/explicit_octal_notation
|
||||
|
||||
- MBString
|
||||
. mb_check_encoding() now checks input encoding more strictly.
|
||||
. mb_detect_encoding() now checks input encoding more strictly
|
||||
when strict detection is enabled.
|
||||
. mb_convert_encoding() checks the input encoding more strictly
|
||||
if multiple encodings are passed to from_encoding
|
||||
and the mbstring.strict_detection INI directive is set to 1.
|
||||
This change only affects the encoding selection,
|
||||
not the result of the conversion.
|
||||
|
||||
- PDO ODBC:
|
||||
. PDO::getAttribute() with PDO::ATTR_SERVER_INFO and PDO::ATTR_SERVER_VERSION
|
||||
now returns values instead of throwing PDOException.
|
||||
|
|
|
@ -39,6 +39,7 @@ const mbfl_encoding mbfl_encoding_7bit = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_SBCS,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL
|
||||
};
|
||||
|
||||
|
|
|
@ -39,6 +39,7 @@ const mbfl_encoding mbfl_encoding_base64 = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL
|
||||
};
|
||||
|
||||
|
|
|
@ -63,7 +63,8 @@ const mbfl_encoding mbfl_encoding_big5 = {
|
|||
mblen_table_big5,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
&vtbl_big5_wchar,
|
||||
&vtbl_wchar_big5
|
||||
&vtbl_wchar_big5,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_cp950 = {
|
||||
|
@ -74,7 +75,8 @@ const mbfl_encoding mbfl_encoding_cp950 = {
|
|||
mblen_table_big5,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
&vtbl_cp950_wchar,
|
||||
&vtbl_wchar_cp950
|
||||
&vtbl_wchar_cp950,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_big5_wchar = {
|
||||
|
|
|
@ -54,7 +54,8 @@ const mbfl_encoding mbfl_encoding_cp50220 = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
&vtbl_cp50220_wchar,
|
||||
&vtbl_wchar_cp50220
|
||||
&vtbl_wchar_cp50220,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_cp50221 = {
|
||||
|
@ -65,7 +66,8 @@ const mbfl_encoding mbfl_encoding_cp50221 = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
&vtbl_cp50221_wchar,
|
||||
&vtbl_wchar_cp50221
|
||||
&vtbl_wchar_cp50221,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_cp50222 = {
|
||||
|
@ -76,7 +78,8 @@ const mbfl_encoding mbfl_encoding_cp50222 = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
&vtbl_cp50222_wchar,
|
||||
&vtbl_wchar_cp50222
|
||||
&vtbl_wchar_cp50222,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_cp50220_wchar = {
|
||||
|
|
|
@ -65,7 +65,8 @@ const mbfl_encoding mbfl_encoding_cp51932 = {
|
|||
mblen_table_eucjp,
|
||||
0,
|
||||
&vtbl_cp51932_wchar,
|
||||
&vtbl_wchar_cp51932
|
||||
&vtbl_wchar_cp51932,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_cp51932_wchar = {
|
||||
|
|
|
@ -95,7 +95,8 @@ const mbfl_encoding mbfl_encoding_cp932 = {
|
|||
mblen_table_sjis,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
&vtbl_cp932_wchar,
|
||||
&vtbl_wchar_cp932
|
||||
&vtbl_wchar_cp932,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_cp932_wchar = {
|
||||
|
@ -126,7 +127,8 @@ const mbfl_encoding mbfl_encoding_sjiswin = {
|
|||
mblen_table_sjis,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
&vtbl_sjiswin_wchar,
|
||||
&vtbl_wchar_sjiswin
|
||||
&vtbl_wchar_sjiswin,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_sjiswin_wchar = {
|
||||
|
|
|
@ -63,7 +63,8 @@ const mbfl_encoding mbfl_encoding_cp936 = {
|
|||
mblen_table_cp936,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
&vtbl_cp936_wchar,
|
||||
&vtbl_wchar_cp936
|
||||
&vtbl_wchar_cp936,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_cp936_wchar = {
|
||||
|
|
|
@ -63,7 +63,8 @@ const mbfl_encoding mbfl_encoding_euc_cn = {
|
|||
mblen_table_euccn,
|
||||
0,
|
||||
&vtbl_euccn_wchar,
|
||||
&vtbl_wchar_euccn
|
||||
&vtbl_wchar_euccn,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_euccn_wchar = {
|
||||
|
|
|
@ -64,7 +64,8 @@ const mbfl_encoding mbfl_encoding_euc_jp = {
|
|||
mblen_table_eucjp,
|
||||
0,
|
||||
&vtbl_eucjp_wchar,
|
||||
&vtbl_wchar_eucjp
|
||||
&vtbl_wchar_eucjp,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_eucjp_wchar = {
|
||||
|
|
|
@ -43,7 +43,8 @@ const mbfl_encoding mbfl_encoding_eucjp2004 = {
|
|||
mblen_table_eucjp,
|
||||
0,
|
||||
&vtbl_eucjp2004_wchar,
|
||||
&vtbl_wchar_eucjp2004
|
||||
&vtbl_wchar_eucjp2004,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_eucjp2004_wchar = {
|
||||
|
|
|
@ -65,7 +65,8 @@ const mbfl_encoding mbfl_encoding_eucjp_win = {
|
|||
mblen_table_eucjp,
|
||||
0,
|
||||
&vtbl_eucjpwin_wchar,
|
||||
&vtbl_wchar_eucjpwin
|
||||
&vtbl_wchar_eucjpwin,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_eucjpwin_wchar = {
|
||||
|
|
|
@ -62,7 +62,8 @@ const mbfl_encoding mbfl_encoding_euc_kr = {
|
|||
mblen_table_euckr,
|
||||
0,
|
||||
&vtbl_euckr_wchar,
|
||||
&vtbl_wchar_euckr
|
||||
&vtbl_wchar_euckr,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_euckr_wchar = {
|
||||
|
|
|
@ -64,7 +64,8 @@ const mbfl_encoding mbfl_encoding_euc_tw = {
|
|||
mblen_table_euctw,
|
||||
0,
|
||||
&vtbl_euctw_wchar,
|
||||
&vtbl_wchar_euctw
|
||||
&vtbl_wchar_euctw,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_euctw_wchar = {
|
||||
|
|
|
@ -45,7 +45,8 @@ const mbfl_encoding mbfl_encoding_gb18030 = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
&vtbl_gb18030_wchar,
|
||||
&vtbl_wchar_gb18030
|
||||
&vtbl_wchar_gb18030,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_gb18030_wchar = {
|
||||
|
|
|
@ -61,7 +61,8 @@ const mbfl_encoding mbfl_encoding_html_ent = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
&vtbl_html_wchar,
|
||||
&vtbl_wchar_html
|
||||
&vtbl_wchar_html,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_wchar_html = {
|
||||
|
|
|
@ -43,7 +43,8 @@ const mbfl_encoding mbfl_encoding_hz = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
&vtbl_hz_wchar,
|
||||
&vtbl_wchar_hz
|
||||
&vtbl_wchar_hz,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_hz_wchar = {
|
||||
|
|
|
@ -46,7 +46,8 @@ const mbfl_encoding mbfl_encoding_2022jpms = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
&vtbl_2022jpms_wchar,
|
||||
&vtbl_wchar_2022jpms
|
||||
&vtbl_wchar_2022jpms,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_2022jpms_wchar = {
|
||||
|
|
|
@ -47,7 +47,8 @@ const mbfl_encoding mbfl_encoding_2022kr = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
&vtbl_2022kr_wchar,
|
||||
&vtbl_wchar_2022kr
|
||||
&vtbl_wchar_2022kr,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_wchar_2022kr = {
|
||||
|
|
|
@ -43,7 +43,8 @@ const mbfl_encoding mbfl_encoding_2022jp_2004 = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
&vtbl_2022jp_2004_wchar,
|
||||
&vtbl_wchar_2022jp_2004
|
||||
&vtbl_wchar_2022jp_2004,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_2022jp_2004_wchar = {
|
||||
|
|
|
@ -48,7 +48,8 @@ const mbfl_encoding mbfl_encoding_2022jp_kddi = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
&vtbl_2022jp_kddi_wchar,
|
||||
&vtbl_wchar_2022jp_kddi
|
||||
&vtbl_wchar_2022jp_kddi,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_2022jp_kddi_wchar = {
|
||||
|
|
|
@ -34,6 +34,8 @@
|
|||
#include "unicode_table_jis.h"
|
||||
|
||||
static int mbfl_filt_conv_jis_wchar_flush(mbfl_convert_filter *filter);
|
||||
static bool mb_check_iso2022jp(unsigned char *in, size_t in_len);
|
||||
static bool mb_check_jis(unsigned char *in, size_t in_len);
|
||||
|
||||
const mbfl_encoding mbfl_encoding_jis = {
|
||||
mbfl_no_encoding_jis,
|
||||
|
@ -43,7 +45,8 @@ const mbfl_encoding mbfl_encoding_jis = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
&vtbl_jis_wchar,
|
||||
&vtbl_wchar_jis
|
||||
&vtbl_wchar_jis,
|
||||
mb_check_jis
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_2022jp = {
|
||||
|
@ -54,7 +57,8 @@ const mbfl_encoding mbfl_encoding_2022jp = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
&vtbl_2022jp_wchar,
|
||||
&vtbl_wchar_2022jp
|
||||
&vtbl_wchar_2022jp,
|
||||
mb_check_iso2022jp
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_jis_wchar = {
|
||||
|
@ -463,3 +467,166 @@ mbfl_filt_conv_any_jis_flush(mbfl_convert_filter *filter)
|
|||
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define ASCII 0
|
||||
#define JISX_0201_LATIN 1
|
||||
#define JISX_0201_KANA 2
|
||||
#define JISX_0208 3
|
||||
#define JISX_0212 4
|
||||
#define JISX_0201_KANA_SO 5
|
||||
|
||||
static bool mb_check_jis(unsigned char *in, size_t in_len)
|
||||
{
|
||||
unsigned char *p = in, *e = p + in_len;
|
||||
unsigned int state = ASCII;
|
||||
|
||||
while (p < e) {
|
||||
unsigned char c = *p++;
|
||||
if (c == 0x1B) {
|
||||
/* ESC seen; this is an escape sequence */
|
||||
if (state == JISX_0201_KANA_SO) {
|
||||
return false;
|
||||
}
|
||||
if ((e - p) < 2) {
|
||||
return false;
|
||||
}
|
||||
unsigned char c2 = *p++;
|
||||
if (c2 == '$') {
|
||||
unsigned char c3 = *p++;
|
||||
if (c3 == '@' || c3 == 'B') {
|
||||
state = JISX_0208;
|
||||
} else if (c3 == '(') {
|
||||
if (p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char c4 = *p++;
|
||||
if (c4 == '@' || c4 == 'B') {
|
||||
state = JISX_0208;
|
||||
} else if (c4 == 'D') {
|
||||
state = JISX_0212;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else if (c2 == '(') {
|
||||
unsigned char c3 = *p++;
|
||||
/* ESC ( H is treated as a sequence transitioning to ASCII for historical reasons.
|
||||
* see https://github.com/php/php-src/pull/10828#issuecomment-1478342432. */
|
||||
if (c3 == 'B' || c3 == 'H') {
|
||||
state = ASCII;
|
||||
} else if (c3 == 'J') {
|
||||
state = JISX_0201_LATIN;
|
||||
} else if (c3 == 'I') {
|
||||
state = JISX_0201_KANA;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else if (c == 0xE) {
|
||||
/* "Kana In" marker */
|
||||
if (state != ASCII) {
|
||||
return false;
|
||||
}
|
||||
state = JISX_0201_KANA_SO;
|
||||
} else if (c == 0xF) {
|
||||
/* "Kana Out" marker */
|
||||
if (state != JISX_0201_KANA_SO) {
|
||||
return false;
|
||||
}
|
||||
state = ASCII;
|
||||
} else if ((state == JISX_0208 || state == JISX_0212) && (c > 0x20 && c < 0x7F)) {
|
||||
if (p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char c2 = *p++;
|
||||
if (c2 > 0x20 && c2 < 0x7F) {
|
||||
unsigned int s = (c - 0x21)*94 + c2 - 0x21;
|
||||
if (state == JISX_0208) {
|
||||
if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) {
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
if (s < jisx0212_ucs_table_size && jisx0212_ucs_table[s]) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else if (c < 0x80) {
|
||||
continue;
|
||||
} else if (c >= 0xA1 && c <= 0xDF) {
|
||||
/* GR-invoked Kana */
|
||||
continue;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return state == ASCII;
|
||||
}
|
||||
|
||||
|
||||
static bool mb_check_iso2022jp(unsigned char *in, size_t in_len)
|
||||
{
|
||||
unsigned char *p = in, *e = p + in_len;
|
||||
unsigned int state = ASCII;
|
||||
|
||||
while (p < e) {
|
||||
unsigned char c = *p++;
|
||||
if (c == 0x1B) {
|
||||
/* ESC seen; this is an escape sequence */
|
||||
if ((e - p) < 2) {
|
||||
return false;
|
||||
}
|
||||
unsigned char c2 = *p++;
|
||||
if (c2 == '$') {
|
||||
unsigned char c3 = *p++;
|
||||
if (c3 == '@' || c3 == 'B') {
|
||||
state = JISX_0208;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else if (c2 == '(') {
|
||||
unsigned char c3 = *p++;
|
||||
if (c3 == 'B') {
|
||||
state = ASCII;
|
||||
} else if (c3 == 'J') {
|
||||
state = JISX_0201_LATIN;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else if (c == 0xE || c == 0xF) {
|
||||
/* "Kana In" or "Kana Out" marker; ISO-2022-JP is not accepted. */
|
||||
return false;
|
||||
} else if (state == JISX_0208 && (c > 0x20 && c < 0x7F)) {
|
||||
if (p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char c2 = *p++;
|
||||
if (c2 > 0x20 && c2 < 0x7F) {
|
||||
unsigned int s = (c - 0x21)*94 + c2 - 0x21;
|
||||
if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) {
|
||||
continue;
|
||||
}
|
||||
return false;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else if (c < 0x80) {
|
||||
continue;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return state == ASCII;
|
||||
}
|
||||
|
|
|
@ -41,6 +41,7 @@ const mbfl_encoding mbfl_encoding_qprint = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL
|
||||
};
|
||||
|
||||
|
|
|
@ -78,7 +78,8 @@ static int mbfl_conv_reverselookup_table(int c, mbfl_convert_filter *filter, int
|
|||
NULL, \
|
||||
MBFL_ENCTYPE_SBCS, \
|
||||
&vtbl_##id##_wchar, \
|
||||
&vtbl_wchar_##id \
|
||||
&vtbl_wchar_##id, \
|
||||
NULL \
|
||||
}
|
||||
|
||||
/* For single-byte encodings which use a conversion table */
|
||||
|
|
|
@ -67,7 +67,8 @@ const mbfl_encoding mbfl_encoding_sjis = {
|
|||
mblen_table_sjis,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
&vtbl_sjis_wchar,
|
||||
&vtbl_wchar_sjis
|
||||
&vtbl_wchar_sjis,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_sjis_wchar = {
|
||||
|
|
|
@ -54,7 +54,8 @@ const mbfl_encoding mbfl_encoding_sjis2004 = {
|
|||
mblen_table_sjis_mobile, /* Leading byte values used for SJIS-2004 are the same as mobile SJIS variants */
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
&vtbl_sjis2004_wchar,
|
||||
&vtbl_wchar_sjis2004
|
||||
&vtbl_wchar_sjis2004,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_sjis2004_wchar = {
|
||||
|
|
|
@ -67,7 +67,8 @@ const mbfl_encoding mbfl_encoding_sjis_mac = {
|
|||
mblen_table_sjismac,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
&vtbl_sjis_mac_wchar,
|
||||
&vtbl_wchar_sjis_mac
|
||||
&vtbl_wchar_sjis_mac,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_sjis_mac_wchar = {
|
||||
|
|
|
@ -70,7 +70,8 @@ const mbfl_encoding mbfl_encoding_sjis_docomo = {
|
|||
mblen_table_sjis_mobile,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
&vtbl_sjis_docomo_wchar,
|
||||
&vtbl_wchar_sjis_docomo
|
||||
&vtbl_wchar_sjis_docomo,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_sjis_kddi = {
|
||||
|
@ -81,7 +82,8 @@ const mbfl_encoding mbfl_encoding_sjis_kddi = {
|
|||
mblen_table_sjis_mobile,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
&vtbl_sjis_kddi_wchar,
|
||||
&vtbl_wchar_sjis_kddi
|
||||
&vtbl_wchar_sjis_kddi,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_sjis_sb = {
|
||||
|
@ -92,7 +94,8 @@ const mbfl_encoding mbfl_encoding_sjis_sb = {
|
|||
mblen_table_sjis_mobile,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
&vtbl_sjis_sb_wchar,
|
||||
&vtbl_wchar_sjis_sb
|
||||
&vtbl_wchar_sjis_sb,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_sjis_docomo_wchar = {
|
||||
|
|
|
@ -49,7 +49,8 @@ const mbfl_encoding mbfl_encoding_ucs2 = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_WCS2,
|
||||
&vtbl_ucs2_wchar,
|
||||
&vtbl_wchar_ucs2
|
||||
&vtbl_wchar_ucs2,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_ucs2be = {
|
||||
|
@ -60,7 +61,8 @@ const mbfl_encoding mbfl_encoding_ucs2be = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_WCS2,
|
||||
&vtbl_ucs2be_wchar,
|
||||
&vtbl_wchar_ucs2be
|
||||
&vtbl_wchar_ucs2be,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_ucs2le = {
|
||||
|
@ -71,7 +73,8 @@ const mbfl_encoding mbfl_encoding_ucs2le = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_WCS2,
|
||||
&vtbl_ucs2le_wchar,
|
||||
&vtbl_wchar_ucs2le
|
||||
&vtbl_wchar_ucs2le,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_ucs2_wchar = {
|
||||
|
|
|
@ -48,7 +48,8 @@ const mbfl_encoding mbfl_encoding_ucs4 = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_WCS4,
|
||||
&vtbl_ucs4_wchar,
|
||||
&vtbl_wchar_ucs4
|
||||
&vtbl_wchar_ucs4,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_ucs4be = {
|
||||
|
@ -59,7 +60,8 @@ const mbfl_encoding mbfl_encoding_ucs4be = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_WCS4,
|
||||
&vtbl_ucs4be_wchar,
|
||||
&vtbl_wchar_ucs4be
|
||||
&vtbl_wchar_ucs4be,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_ucs4le = {
|
||||
|
@ -70,7 +72,8 @@ const mbfl_encoding mbfl_encoding_ucs4le = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_WCS4,
|
||||
&vtbl_ucs4le_wchar,
|
||||
&vtbl_wchar_ucs4le
|
||||
&vtbl_wchar_ucs4le,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_ucs4_wchar = {
|
||||
|
|
|
@ -67,7 +67,8 @@ const mbfl_encoding mbfl_encoding_uhc = {
|
|||
mblen_table_uhc,
|
||||
0,
|
||||
&vtbl_uhc_wchar,
|
||||
&vtbl_wchar_uhc
|
||||
&vtbl_wchar_uhc,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_uhc_wchar = {
|
||||
|
|
|
@ -42,7 +42,8 @@ const mbfl_encoding mbfl_encoding_utf16 = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_MWC2,
|
||||
&vtbl_utf16_wchar,
|
||||
&vtbl_wchar_utf16
|
||||
&vtbl_wchar_utf16,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_utf16be = {
|
||||
|
@ -53,7 +54,8 @@ const mbfl_encoding mbfl_encoding_utf16be = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_MWC2,
|
||||
&vtbl_utf16be_wchar,
|
||||
&vtbl_wchar_utf16be
|
||||
&vtbl_wchar_utf16be,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_utf16le = {
|
||||
|
@ -64,7 +66,8 @@ const mbfl_encoding mbfl_encoding_utf16le = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_MWC2,
|
||||
&vtbl_utf16le_wchar,
|
||||
&vtbl_wchar_utf16le
|
||||
&vtbl_wchar_utf16le,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_utf16_wchar = {
|
||||
|
|
|
@ -42,7 +42,8 @@ const mbfl_encoding mbfl_encoding_utf32 = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_WCS4,
|
||||
&vtbl_utf32_wchar,
|
||||
&vtbl_wchar_utf32
|
||||
&vtbl_wchar_utf32,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_utf32be = {
|
||||
|
@ -53,7 +54,8 @@ const mbfl_encoding mbfl_encoding_utf32be = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_WCS4,
|
||||
&vtbl_utf32be_wchar,
|
||||
&vtbl_wchar_utf32be
|
||||
&vtbl_wchar_utf32be,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_utf32le = {
|
||||
|
@ -64,7 +66,8 @@ const mbfl_encoding mbfl_encoding_utf32le = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_WCS4,
|
||||
&vtbl_utf32le_wchar,
|
||||
&vtbl_wchar_utf32le
|
||||
&vtbl_wchar_utf32le,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_utf32_wchar = {
|
||||
|
|
|
@ -29,8 +29,10 @@
|
|||
|
||||
#include "mbfilter.h"
|
||||
#include "mbfilter_utf7.h"
|
||||
#include "utf7_helper.h"
|
||||
|
||||
static int mbfl_filt_conv_utf7_wchar_flush(mbfl_convert_filter *filter);
|
||||
static bool mb_check_utf7(unsigned char *in, size_t in_len);
|
||||
|
||||
static const unsigned char mbfl_base64_table[] = {
|
||||
/* 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', */
|
||||
|
@ -55,7 +57,8 @@ const mbfl_encoding mbfl_encoding_utf7 = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_GL_UNSAFE,
|
||||
&vtbl_utf7_wchar,
|
||||
&vtbl_wchar_utf7
|
||||
&vtbl_wchar_utf7,
|
||||
mb_check_utf7
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_utf7_wchar = {
|
||||
|
@ -419,3 +422,169 @@ int mbfl_filt_conv_wchar_utf7_flush(mbfl_convert_filter *filter)
|
|||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Report whether `c` is one of the characters that are allowed to be
 * either Base64-encoded or written directly.
 */
static bool is_optional_direct(unsigned char c)
{
	switch (c) {
	case '!': case '"': case '#': case '$': case '%':
	case '&': case '*': case ';': case '<': case '=':
	case '>': case '@': case '[': case ']': case '^':
	case '_': case '`': case '{': case '|': case '}':
		return true;
	default:
		return false;
	}
}
|
||||
|
||||
/*
 * Report whether `c` is whitespace (space, tab, CR, LF) or one of the
 * punctuation characters ' ( ) , . : ?
 */
static bool can_end_base64(uint32_t c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\r':
	case '\n':
	case '\'':
	case '(':
	case ')':
	case ',':
	case '.':
	case ':':
	case '?':
		return true;
	default:
		return false;
	}
}
|
||||
|
||||
static unsigned char decode_base64(unsigned char c)
|
||||
{
|
||||
if (c >= 'A' && c <= 'Z') {
|
||||
return c - 65;
|
||||
} else if (c >= 'a' && c <= 'z') {
|
||||
return c - 71;
|
||||
} else if (c >= '0' && c <= '9') {
|
||||
return c + 4;
|
||||
} else if (c == '+') {
|
||||
return 62;
|
||||
} else if (c == '/') {
|
||||
return 63;
|
||||
} else if (c == '-') {
|
||||
return DASH;
|
||||
} else if (can_end_base64(c) || is_optional_direct(c) || c == '\0') {
|
||||
return DIRECT;
|
||||
} else if (c <= 0x7F) {
|
||||
return ASCII;
|
||||
}
|
||||
return ILLEGAL;
|
||||
}
|
||||
|
||||
/*
 * Check a UTF-16 code unit against the surrogate state.
 *
 * cp            - the code unit just decoded
 * is_surrogate  - true when the 2nd half of a surrogate pair is expected
 *
 * A unit in 0xDC00..0xDFFF is valid exactly when one is expected; any
 * other unit is valid exactly when one is not expected.
 */
static bool is_utf16_cp_valid(uint16_t cp, bool is_surrogate)
{
	bool is_low_surrogate = cp >= 0xDC00 && cp <= 0xDFFF;

	return is_low_surrogate == is_surrogate;
}
|
||||
|
||||
/*
 * Report whether `c` is an ASCII letter or digit, NUL, '/', '-', or a
 * character accepted by can_end_base64().
 */
static bool should_direct_encode(uint32_t c)
{
	if ((c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9')) {
		return true;
	}
	if (c == '\0' || c == '/' || c == '-') {
		return true;
	}
	return can_end_base64(c);
}
|
||||
|
||||
/*
 * Report whether byte `c` may appear outside a Base64 section: NUL, or
 * anything accepted by should_direct_encode() or is_optional_direct().
 */
static bool can_encode_directly(unsigned char c)
{
	if (should_direct_encode(c)) {
		return true;
	}
	if (is_optional_direct(c)) {
		return true;
	}
	return c == '\0';
}
|
||||
|
||||
static bool mb_check_utf7(unsigned char *in, size_t in_len)
|
||||
{
|
||||
unsigned char *p = in, *e = p + in_len;
|
||||
bool base64 = false;
|
||||
bool is_surrogate = false;
|
||||
|
||||
while (p < e) {
|
||||
if (base64) {
|
||||
unsigned char n1 = decode_base64(*p++);
|
||||
if (is_base64_end(n1)) {
|
||||
if (!is_base64_end_valid(n1, false, is_surrogate)) {
|
||||
return false;
|
||||
}
|
||||
base64 = false;
|
||||
continue;
|
||||
} else if (p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char n2 = decode_base64(*p++);
|
||||
if (is_base64_end(n2) || p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char n3 = decode_base64(*p++);
|
||||
if (is_base64_end(n3)) {
|
||||
return false;
|
||||
}
|
||||
uint16_t cp1 = (n1 << 10) | (n2 << 4) | ((n3 & 0x3C) >> 2);
|
||||
if (!is_utf16_cp_valid(cp1, is_surrogate)) {
|
||||
return false;
|
||||
}
|
||||
is_surrogate = has_surrogate(cp1, is_surrogate);
|
||||
if (p == e) {
|
||||
/* It is an error if trailing padding bits are not zeroes or if we were
|
||||
* expecting the 2nd part of a surrogate pair when Base64 section ends */
|
||||
return !((n3 & 0x3) || is_surrogate);
|
||||
}
|
||||
|
||||
unsigned char n4 = decode_base64(*p++);
|
||||
if (is_base64_end(n4)) {
|
||||
if (!is_base64_end_valid(n4, n3 & 0x3, is_surrogate)) {
|
||||
return false;
|
||||
}
|
||||
base64 = false;
|
||||
continue;
|
||||
} else if (p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char n5 = decode_base64(*p++);
|
||||
if (is_base64_end(n5) || p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char n6 = decode_base64(*p++);
|
||||
if (is_base64_end(n6)) {
|
||||
return false;
|
||||
}
|
||||
uint16_t cp2 = (n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4);
|
||||
if (!is_utf16_cp_valid(cp2, is_surrogate)) {
|
||||
return false;
|
||||
}
|
||||
is_surrogate = has_surrogate(cp2, is_surrogate);
|
||||
if (p == e) {
|
||||
return !((n6 & 0xF) || is_surrogate);
|
||||
}
|
||||
|
||||
unsigned char n7 = decode_base64(*p++);
|
||||
if (is_base64_end(n7)) {
|
||||
if (!is_base64_end_valid(n7, n6 & 0xF, is_surrogate)) {
|
||||
return false;
|
||||
}
|
||||
base64 = false;
|
||||
continue;
|
||||
} else if (p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char n8 = decode_base64(*p++);
|
||||
if (is_base64_end(n8)) {
|
||||
return false;
|
||||
}
|
||||
uint16_t cp3 = (n6 << 12) | (n7 << 6) | n8;
|
||||
if (!is_utf16_cp_valid(cp3, is_surrogate)) {
|
||||
return false;
|
||||
}
|
||||
is_surrogate = has_surrogate(cp3, is_surrogate);
|
||||
} else {
|
||||
/* ASCII text section */
|
||||
unsigned char c = *p++;
|
||||
|
||||
if (c == '+') {
|
||||
if (p == e) {
|
||||
base64 = true;
|
||||
return !is_surrogate;
|
||||
}
|
||||
unsigned char n = decode_base64(*p);
|
||||
if (n == DASH) {
|
||||
p++;
|
||||
} else if (n > DASH) {
|
||||
/* If a "+" character followed immediately by any character other than base64 or "-" */
|
||||
return false;
|
||||
} else {
|
||||
base64 = true;
|
||||
}
|
||||
} else if (can_encode_directly(c)) {
|
||||
continue;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return !is_surrogate;
|
||||
}
|
||||
|
|
|
@ -77,9 +77,11 @@
|
|||
|
||||
#include "mbfilter.h"
|
||||
#include "mbfilter_utf7imap.h"
|
||||
#include "utf7_helper.h"
|
||||
|
||||
static int mbfl_filt_conv_wchar_utf7imap_flush(mbfl_convert_filter *filter);
|
||||
static int mbfl_filt_conv_utf7imap_wchar_flush(mbfl_convert_filter *filter);
|
||||
static bool mb_check_utf7imap(unsigned char *in, size_t in_len);
|
||||
|
||||
static const char *mbfl_encoding_utf7imap_aliases[] = {"mUTF-7", NULL};
|
||||
|
||||
|
@ -91,7 +93,8 @@ const mbfl_encoding mbfl_encoding_utf7imap = {
|
|||
NULL,
|
||||
0,
|
||||
&vtbl_utf7imap_wchar,
|
||||
&vtbl_wchar_utf7imap
|
||||
&vtbl_wchar_utf7imap,
|
||||
mb_check_utf7imap
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_utf7imap_wchar = {
|
||||
|
@ -437,3 +440,142 @@ static int mbfl_filt_conv_wchar_utf7imap_flush(mbfl_convert_filter *filter)
|
|||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static unsigned char decode_base64(unsigned char c)
|
||||
{
|
||||
if (c >= 'A' && c <= 'Z') {
|
||||
return c - 65;
|
||||
} else if (c >= 'a' && c <= 'z') {
|
||||
return c - 71;
|
||||
} else if (c >= '0' && c <= '9') {
|
||||
return c + 4;
|
||||
} else if (c == '+') {
|
||||
return 62;
|
||||
} else if (c == ',') {
|
||||
return 63;
|
||||
} else if (c == '-') {
|
||||
return DASH;
|
||||
}
|
||||
return ILLEGAL;
|
||||
}
|
||||
|
||||
/*
 * Check a UTF-16 code unit decoded from a UTF7-IMAP Base64 section.
 *
 * cp            - the code unit just decoded
 * is_surrogate  - true when the 2nd half of a surrogate pair is expected
 *
 * When a 2nd half is expected, only 0xDC00..0xDFFF is valid. Otherwise a
 * unit in that range is invalid, and so is any unit in 0x20..0x7E other
 * than '&'.
 */
static bool is_utf16_cp_valid(uint16_t cp, bool is_surrogate)
{
	bool is_low_surrogate = cp >= 0xDC00 && cp <= 0xDFFF;

	if (is_surrogate || is_low_surrogate) {
		/* Valid only when a 2nd half was expected and this is one */
		return is_surrogate && is_low_surrogate;
	}

	bool is_printable_ascii = cp >= 0x20 && cp <= 0x7E;
	return !is_printable_ascii || cp == '&';
}
|
||||
|
||||
static bool mb_check_utf7imap(unsigned char *in, size_t in_len)
|
||||
{
|
||||
unsigned char *p = in, *e = p + in_len;
|
||||
bool base64 = false;
|
||||
bool is_surrogate = false;
|
||||
|
||||
while (p < e) {
|
||||
if (base64) {
|
||||
/* Base64 section */
|
||||
unsigned char n1 = decode_base64(*p++);
|
||||
if (is_base64_end(n1)) {
|
||||
if (!is_base64_end_valid(n1, false, is_surrogate)) {
|
||||
return false;
|
||||
}
|
||||
base64 = false;
|
||||
continue;
|
||||
} else if (p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char n2 = decode_base64(*p++);
|
||||
if (is_base64_end(n2) || p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char n3 = decode_base64(*p++);
|
||||
if (is_base64_end(n3)) {
|
||||
return false;
|
||||
}
|
||||
uint16_t cp1 = (n1 << 10) | (n2 << 4) | ((n3 & 0x3C) >> 2);
|
||||
if (!is_utf16_cp_valid(cp1, is_surrogate)) {
|
||||
return false;
|
||||
}
|
||||
is_surrogate = has_surrogate(cp1, is_surrogate);
|
||||
if (p == e) {
|
||||
return false;
|
||||
}
|
||||
|
||||
unsigned char n4 = decode_base64(*p++);
|
||||
if (is_base64_end(n4)) {
|
||||
if (!is_base64_end_valid(n4, n3 & 0x3, is_surrogate)) {
|
||||
return false;
|
||||
}
|
||||
base64 = false;
|
||||
continue;
|
||||
} else if (p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char n5 = decode_base64(*p++);
|
||||
if (is_base64_end(n5) || p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char n6 = decode_base64(*p++);
|
||||
if (is_base64_end(n6)) {
|
||||
return false;
|
||||
}
|
||||
uint16_t cp2 = (n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4);
|
||||
if (!is_utf16_cp_valid(cp2, is_surrogate)) {
|
||||
return false;
|
||||
}
|
||||
is_surrogate = has_surrogate(cp2, is_surrogate);
|
||||
if (p == e) {
|
||||
return false;
|
||||
}
|
||||
|
||||
unsigned char n7 = decode_base64(*p++);
|
||||
if (is_base64_end(n7)) {
|
||||
if (!is_base64_end_valid(n7, n6 & 0xF, is_surrogate)) {
|
||||
return false;
|
||||
}
|
||||
base64 = false;
|
||||
continue;
|
||||
} else if (p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char n8 = decode_base64(*p++);
|
||||
if (is_base64_end(n8)) {
|
||||
return false;
|
||||
}
|
||||
uint16_t cp3 = (n6 << 12) | (n7 << 6) | n8;
|
||||
if (!is_utf16_cp_valid(cp3, is_surrogate)) {
|
||||
return false;
|
||||
}
|
||||
is_surrogate = has_surrogate(cp3, is_surrogate);
|
||||
} else {
|
||||
/* ASCII text section */
|
||||
unsigned char c = *p++;
|
||||
|
||||
if (c == '&') {
|
||||
if (p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char n = decode_base64(*p);
|
||||
if (n == DASH) {
|
||||
p++;
|
||||
} else if (n == ILLEGAL) {
|
||||
return false;
|
||||
} else {
|
||||
base64 = true;
|
||||
}
|
||||
} else if (c >= 0x20 && c <= 0x7E) {
|
||||
continue;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return !base64;
|
||||
}
|
||||
|
|
|
@ -59,7 +59,8 @@ const mbfl_encoding mbfl_encoding_utf8 = {
|
|||
mblen_table_utf8,
|
||||
0,
|
||||
&vtbl_utf8_wchar,
|
||||
&vtbl_wchar_utf8
|
||||
&vtbl_wchar_utf8,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_utf8_wchar = {
|
||||
|
|
|
@ -49,7 +49,8 @@ const mbfl_encoding mbfl_encoding_utf8_docomo = {
|
|||
mblen_table_utf8,
|
||||
0,
|
||||
&vtbl_utf8_docomo_wchar,
|
||||
&vtbl_wchar_utf8_docomo
|
||||
&vtbl_wchar_utf8_docomo,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_utf8_kddi_a = {
|
||||
|
@ -60,7 +61,8 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_a = {
|
|||
mblen_table_utf8,
|
||||
0,
|
||||
&vtbl_utf8_kddi_a_wchar,
|
||||
&vtbl_wchar_utf8_kddi_a
|
||||
&vtbl_wchar_utf8_kddi_a,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_utf8_kddi_b = {
|
||||
|
@ -71,7 +73,8 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_b = {
|
|||
mblen_table_utf8,
|
||||
0,
|
||||
&vtbl_utf8_kddi_b_wchar,
|
||||
&vtbl_wchar_utf8_kddi_b
|
||||
&vtbl_wchar_utf8_kddi_b,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_utf8_sb = {
|
||||
|
@ -82,7 +85,8 @@ const mbfl_encoding mbfl_encoding_utf8_sb = {
|
|||
mblen_table_utf8,
|
||||
0,
|
||||
&vtbl_utf8_sb_wchar,
|
||||
&vtbl_wchar_utf8_sb
|
||||
&vtbl_wchar_utf8_sb,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_utf8_docomo_wchar = {
|
||||
|
|
|
@ -38,6 +38,7 @@ const mbfl_encoding mbfl_encoding_uuencode = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_SBCS,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL
|
||||
};
|
||||
|
||||
|
|
27
ext/mbstring/libmbfl/filters/utf7_helper.h
Normal file
27
ext/mbstring/libmbfl/filters/utf7_helper.h
Normal file
|
@ -0,0 +1,27 @@
|
|||
#ifndef MBFL_UTF7_HELPER_H
|
||||
#define MBFL_UTF7_HELPER_H
|
||||
|
||||
#include "mbfilter.h"
|
||||
|
||||
/* Ways in which a Base64-encoded section can end: */
|
||||
#define DASH 0xFC
|
||||
#define DIRECT 0xFD
|
||||
#define ASCII 0xFE
|
||||
#define ILLEGAL 0xFF
|
||||
|
||||
static inline bool is_base64_end(unsigned char c)
|
||||
{
|
||||
return c >= DASH;
|
||||
}
|
||||
|
||||
static inline bool is_base64_end_valid(unsigned char n, bool gap, bool is_surrogate)
|
||||
{
|
||||
return !(gap || is_surrogate || n == ASCII || n == ILLEGAL);
|
||||
}
|
||||
|
||||
/* Return true when cp falls in 0xD800..0xDBFF (the UTF-16 high-surrogate range)
 * and the is_surrogate flag is not already set; return false otherwise. */
static inline bool has_surrogate(uint16_t cp, bool is_surrogate)
{
	if (is_surrogate) {
		return false;
	}
	return 0xD800 <= cp && cp <= 0xDBFF;
}
|
||||
|
||||
#endif /* MBFL_UTF7_HELPER_H */
|
|
@ -376,6 +376,16 @@ int mbfl_encoding_detector_feed(mbfl_encoding_detector *identd, mbfl_string *str
|
|||
unsigned char *p = string->val;
|
||||
int bad = 0;
|
||||
|
||||
if (identd->strict) {
|
||||
for (int i = 0; i < num; i++) {
|
||||
mbfl_convert_filter *filter = identd->filter_list[i];
|
||||
mbfl_encoding_detector_data *data = &identd->filter_data[i];
|
||||
if (filter->from->check != NULL && !(filter->from->check)(p, n)) {
|
||||
data->num_illegalchars++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
while (n--) {
|
||||
for (int i = 0; i < num; i++) {
|
||||
mbfl_convert_filter *filter = identd->filter_list[i];
|
||||
|
|
|
@ -47,7 +47,8 @@ const mbfl_encoding mbfl_encoding_8bit = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_SBCS,
|
||||
&vtbl_8bit_wchar,
|
||||
&vtbl_wchar_8bit
|
||||
&vtbl_wchar_8bit,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_8bit_wchar = {
|
||||
|
|
|
@ -42,6 +42,7 @@ const mbfl_encoding mbfl_encoding_pass = {
|
|||
NULL,
|
||||
0,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL
|
||||
};
|
||||
|
||||
|
|
|
@ -40,5 +40,6 @@ const mbfl_encoding mbfl_encoding_wchar = {
|
|||
NULL,
|
||||
MBFL_ENCTYPE_WCS4,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL
|
||||
};
|
||||
|
|
|
@ -32,6 +32,7 @@
|
|||
#define MBFL_ENCODING_H
|
||||
|
||||
#include "mbfl_defs.h"
|
||||
#include "zend.h"
|
||||
|
||||
enum mbfl_no_encoding {
|
||||
mbfl_no_encoding_invalid = -1,
|
||||
|
@ -132,6 +133,8 @@ struct mbfl_convert_vtbl {
|
|||
void (*filter_copy)(struct _mbfl_convert_filter *src, struct _mbfl_convert_filter *dest);
|
||||
};
|
||||
|
||||
/* Validation callback for an encoding: given the input buffer `in` and its
 * length `in_len`, returns true if the input passes the check and false if not.
 * Callers test the `check` member of mbfl_encoding for NULL before invoking it. */
typedef bool (*mb_check_fn)(unsigned char *in, size_t in_len);
|
||||
|
||||
/*
|
||||
* encoding
|
||||
*/
|
||||
|
@ -144,6 +147,7 @@ typedef struct _mbfl_encoding {
|
|||
unsigned int flag;
|
||||
const struct mbfl_convert_vtbl *input_filter;
|
||||
const struct mbfl_convert_vtbl *output_filter;
|
||||
mb_check_fn check;
|
||||
} mbfl_encoding;
|
||||
|
||||
MBFLAPI extern const mbfl_encoding *mbfl_name2encoding(const char *name);
|
||||
|
|
|
@ -3917,6 +3917,11 @@ MBSTRING_API int php_mb_check_encoding(const char *input, size_t length, const m
|
|||
{
|
||||
mbfl_convert_filter *filter = mbfl_convert_filter_new(encoding, &mbfl_encoding_wchar, mbfl_filt_check_errors, NULL, &filter);
|
||||
|
||||
if (encoding->check != NULL) {
|
||||
mbfl_convert_filter_delete(filter);
|
||||
return encoding->check((unsigned char*)input, length);
|
||||
}
|
||||
|
||||
while (length--) {
|
||||
unsigned char c = *input++;
|
||||
(filter->filter_function)(c, filter);
|
||||
|
|
462
ext/mbstring/tests/gh10192_utf7.phpt
Normal file
462
ext/mbstring/tests/gh10192_utf7.phpt
Normal file
|
@ -0,0 +1,462 @@
|
|||
--TEST--
|
||||
GH-10192 (mb_detect_encoding() results for UTF-7 differ between PHP 8.0 and 8.1)
|
||||
--EXTENSIONS--
|
||||
mbstring
|
||||
--FILE--
|
||||
<?php
|
||||
|
||||
$testcases = [
|
||||
'non-base64 character after +' => 'A + B',
|
||||
'non-base64 character after -' => 'A - B',
|
||||
'base64 character before +' => 'A 1+ B',
|
||||
'base64 character before -' => 'A 1- B',
|
||||
'base64 character after +' => 'A +1 B',
|
||||
'base64 character after -' => 'A -1 B',
|
||||
'base64 character before and after +' => 'A 1+1 B',
|
||||
'base64 character before and after -' => 'A 1-1 B',
|
||||
'string ends with +' => 'A +',
|
||||
'string ends with -' => 'A -',
|
||||
'+ and -' => 'A +- B',
|
||||
'- and +' => 'A -+ B',
|
||||
'valid direct encoding character =' => 'A = B',
|
||||
'invalid direct encoding character ~' => 'A ~ B',
|
||||
'invalid direct encoding character \\' => 'A \\ B',
|
||||
'invalid direct encoding character ESC' => "A \x1b B",
|
||||
'valid direct encoding character = after +' => 'A += B',
|
||||
'invalid direct encoding character ~ after +' => 'A +~ B',
|
||||
'invalid direct encoding character \\ after +' => 'A +\\ B',
|
||||
'invalid direct encoding character ESC after +' => "A +\x1b B",
|
||||
'valid base64 character between + and -' => 'A +ZeVnLIqe- B', // 日本語 in UTF-16BE
|
||||
'invalid base64 character between + and -' => 'A +ZeVnLIq- B', // 日本語 in UTF-16BE without the last character
|
||||
'valid base64 character between + and non-base64 character' => 'A +ZeVnLIqe B',
|
||||
'invalid base64 character between + and non-base64 character' => 'A +ZeVnLIq B',
|
||||
'valid base64 character between + and base64 character' => 'A +ZeVnLIqe1 B',
|
||||
'invalid base64 character between + and base64 character' => 'A +ZeVnLIq1 B',
|
||||
'valid base64 character between + and end of string' => 'A +ZeVnLIqe',
|
||||
'invalid base64 character between + and end of string' => 'A +ZeVnLIq',
|
||||
'valid base64 character consisting only of + between + and -' => 'A +++++++++- B',
|
||||
'invalid base64 character consisting only of + between + and -' => 'A +++++++++- B',
|
||||
'valid base64 character consisting only of + between + and non-base64 character' => 'A +++++++++ B',
|
||||
'invalid base64 character consisting only of + between + and non-base64 character' => 'A +++++++++ B',
|
||||
'valid base64 character consisting only of + between + and base64 character' => 'A +++++++++1 B',
|
||||
'invalid base64 character consisting only of + between + and base64 character' => 'A +++++++++1 B',
|
||||
'valid base64 character consisting only of + between + and end of string' => 'A +++++++++',
|
||||
'invalid base64 character consisting only of + between + and end of string' => 'A +++++++++',
|
||||
'valid base64 character using surrogate pair between + and -' => 'A +2GfePQ- B', // 𩸽 in UTF-16BE
|
||||
'invalid base64 character using surrogate pair between + and -' => 'A +2Gc- B', // first 16 bits of 𩸽 in UTF-16BE
|
||||
'valid base64 character using surrogate pair between + and non-base64 character' => 'A +2GfePQ B',
|
||||
'invalid base64 character using surrogate pair between + and non-base64 character' => 'A +2Gc B',
|
||||
'valid base64 character using surrogate pair between + and base64 character' => 'A +2GfePQ1 B',
|
||||
'invalid base64 character using surrogate pair between + and base64 character' => 'A +2Gc1 B',
|
||||
'valid base64 character using surrogate pair between + and end of string' => 'A +2GfePQ',
|
||||
'invalid base64 character using surrogate pair between + and end of string' => 'A +2Gc'
|
||||
];
|
||||
|
||||
foreach ($testcases as $title => $case) {
|
||||
echo $title . PHP_EOL;
|
||||
var_dump(mb_detect_encoding($case, 'UTF-8, UTF-7', true));
|
||||
var_dump(mb_detect_encoding($case, 'UTF-8, UTF-7', false));
|
||||
var_dump(mb_detect_encoding($case, 'UTF-7', true));
|
||||
var_dump(mb_detect_encoding($case, 'UTF-7', false));
|
||||
var_dump(mb_check_encoding($case, 'UTF-7'));
|
||||
var_dump(addcslashes(mb_convert_encoding($case, 'UTF-8', 'UTF-7'), "\0..\37\177"));
|
||||
var_dump(mb_get_info('illegal_chars'));
|
||||
echo PHP_EOL;
|
||||
}
|
||||
?>
|
||||
--EXPECT--
|
||||
non-base64 character after +
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(4) "A B"
|
||||
int(0)
|
||||
|
||||
non-base64 character after -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(5) "A - B"
|
||||
int(0)
|
||||
|
||||
base64 character before +
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "A 1 B"
|
||||
int(0)
|
||||
|
||||
base64 character before -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(6) "A 1- B"
|
||||
int(0)
|
||||
|
||||
base64 character after +
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "A ? B"
|
||||
int(1)
|
||||
|
||||
base64 character after -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(6) "A -1 B"
|
||||
int(1)
|
||||
|
||||
base64 character before and after +
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(6) "A 1? B"
|
||||
int(2)
|
||||
|
||||
base64 character before and after -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(7) "A 1-1 B"
|
||||
int(2)
|
||||
|
||||
string ends with +
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(2) "A "
|
||||
int(2)
|
||||
|
||||
string ends with -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(3) "A -"
|
||||
int(2)
|
||||
|
||||
+ and -
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(5) "A + B"
|
||||
int(2)
|
||||
|
||||
- and +
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "A - B"
|
||||
int(2)
|
||||
|
||||
valid direct encoding character =
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(5) "A = B"
|
||||
int(2)
|
||||
|
||||
invalid direct encoding character ~
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "A ~ B"
|
||||
int(2)
|
||||
|
||||
invalid direct encoding character \
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "A \ B"
|
||||
int(2)
|
||||
|
||||
invalid direct encoding character ESC
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(8) "A \033 B"
|
||||
int(2)
|
||||
|
||||
valid direct encoding character = after +
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "A = B"
|
||||
int(2)
|
||||
|
||||
invalid direct encoding character ~ after +
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "A ~ B"
|
||||
int(2)
|
||||
|
||||
invalid direct encoding character \ after +
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "A \ B"
|
||||
int(2)
|
||||
|
||||
invalid direct encoding character ESC after +
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(8) "A \033 B"
|
||||
int(2)
|
||||
|
||||
valid base64 character between + and -
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(13) "A 日本語 B"
|
||||
int(2)
|
||||
|
||||
invalid base64 character between + and -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(11) "A 日本? B"
|
||||
int(3)
|
||||
|
||||
valid base64 character between + and non-base64 character
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(13) "A 日本語 B"
|
||||
int(3)
|
||||
|
||||
invalid base64 character between + and non-base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(11) "A 日本? B"
|
||||
int(4)
|
||||
|
||||
valid base64 character between + and base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(14) "A 日本語? B"
|
||||
int(5)
|
||||
|
||||
invalid base64 character between + and base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(13) "A 日本誵 B"
|
||||
int(5)
|
||||
|
||||
valid base64 character between + and end of string
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(11) "A 日本語"
|
||||
int(5)
|
||||
|
||||
invalid base64 character between + and end of string
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(9) "A 日本?"
|
||||
int(6)
|
||||
|
||||
valid base64 character consisting only of + between + and -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(13) "A ﯯ뻻 B"
|
||||
int(6)
|
||||
|
||||
invalid base64 character consisting only of + between + and -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(13) "A ﯯ뻻 B"
|
||||
int(6)
|
||||
|
||||
valid base64 character consisting only of + between + and non-base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(13) "A ﯯ뻻 B"
|
||||
int(6)
|
||||
|
||||
invalid base64 character consisting only of + between + and non-base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(13) "A ﯯ뻻 B"
|
||||
int(6)
|
||||
|
||||
valid base64 character consisting only of + between + and base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(14) "A ﯯ뻻? B"
|
||||
int(7)
|
||||
|
||||
invalid base64 character consisting only of + between + and base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(14) "A ﯯ뻻? B"
|
||||
int(8)
|
||||
|
||||
valid base64 character consisting only of + between + and end of string
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(11) "A ﯯ뻻"
|
||||
int(8)
|
||||
|
||||
invalid base64 character consisting only of + between + and end of string
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(11) "A ﯯ뻻"
|
||||
int(8)
|
||||
|
||||
valid base64 character using surrogate pair between + and -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(8) "A 𩸽 B"
|
||||
int(8)
|
||||
|
||||
invalid base64 character using surrogate pair between + and -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "A ? B"
|
||||
int(9)
|
||||
|
||||
valid base64 character using surrogate pair between + and non-base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(8) "A 𩸽 B"
|
||||
int(9)
|
||||
|
||||
invalid base64 character using surrogate pair between + and non-base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "A ? B"
|
||||
int(10)
|
||||
|
||||
valid base64 character using surrogate pair between + and base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(9) "A 𩸽? B"
|
||||
int(11)
|
||||
|
||||
invalid base64 character using surrogate pair between + and base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "A ? B"
|
||||
int(12)
|
||||
|
||||
valid base64 character using surrogate pair between + and end of string
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(6) "A 𩸽"
|
||||
int(12)
|
||||
|
||||
invalid base64 character using surrogate pair between + and end of string
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(3) "A ?"
|
||||
int(13)
|
343
ext/mbstring/tests/gh10192_utf7imap.phpt
Normal file
343
ext/mbstring/tests/gh10192_utf7imap.phpt
Normal file
|
@ -0,0 +1,343 @@
|
|||
--TEST--
|
||||
GH-10192 (mb_detect_encoding() results for UTF-7 differ between PHP 8.0 and 8.1)
|
||||
--EXTENSIONS--
|
||||
mbstring
|
||||
--FILE--
|
||||
<?php
|
||||
|
||||
$testcases = [
|
||||
'non-base64 character after &' => 'A & B',
|
||||
'non-base64 character after -' => 'A - B',
|
||||
'base64 character before &' => 'A 1& B',
|
||||
'base64 character before -' => 'A 1- B',
|
||||
'base64 character after &' => 'A &1 B',
|
||||
'base64 character after -' => 'A -1 B',
|
||||
'base64 character before and after &' => 'A 1&1 B',
|
||||
'base64 character before and after -' => 'A 1-1 B',
|
||||
'string ends with &' => 'A &',
|
||||
'string ends with -' => 'A -',
|
||||
'& and -' => 'A &- B',
|
||||
'- and &' => 'A -& B',
|
||||
'valid direct encoding character ~' => 'A ~ B',
|
||||
'invalid direct encoding character ESC' => "A \x1b B",
|
||||
'valid direct encoding character ~ after &' => 'A &~ B',
|
||||
'invalid direct encoding character ESC after &' => "A &\x1b B",
|
||||
'valid base64 character between & and -' => 'A &ZeVnLIqe- B', // 日本語 in UTF-16BE
|
||||
'invalid base64 character between & and -' => 'A &ZeVnLIq- B', // 日本語 in UTF-16BE without the last character
|
||||
'valid base64 character between & and non-base64 character' => 'A &ZeVnLIqe B',
|
||||
'invalid base64 character between & and non-base64 character' => 'A &ZeVnLIq B',
|
||||
'valid base64 character between & and base64 character' => 'A &ZeVnLIqe1 B',
|
||||
'invalid base64 character between & and base64 character' => 'A &ZeVnLIq1 B',
|
||||
'valid base64 character between & and end of string' => 'A &ZeVnLIqe',
|
||||
'invalid base64 character between & and end of string' => 'A &ZeVnLIq',
|
||||
'valid base64 character using surrogate pair between & and -' => 'A &2GfePQ- B', // 𩸽 in UTF-16BE
|
||||
'invalid base64 character using surrogate pair between & and -' => 'A &2Gc- B', // first 16 bits of 𩸽 in UTF-16BE
|
||||
'valid base64 character using surrogate pair between & and non-base64 character' => 'A &2GfePQ B',
|
||||
'invalid base64 character using surrogate pair between & and non-base64 character' => 'A &2Gc B',
|
||||
'valid base64 character using surrogate pair between & and base64 character' => 'A &2GfePQ1 B',
|
||||
'invalid base64 character using surrogate pair between & and base64 character' => 'A &2Gc1 B',
|
||||
'valid base64 character using surrogate pair between & and end of string' => 'A &2GfePQ',
|
||||
'invalid base64 character using surrogate pair between & and end of string' => 'A &2Gc'
|
||||
];
|
||||
|
||||
foreach ($testcases as $title => $case) {
|
||||
echo $title . PHP_EOL;
|
||||
var_dump(mb_detect_encoding($case, 'UTF-8, UTF7-IMAP', true));
|
||||
var_dump(mb_detect_encoding($case, 'UTF-8, UTF7-IMAP', false));
|
||||
var_dump(mb_detect_encoding($case, 'UTF7-IMAP', true));
|
||||
var_dump(mb_detect_encoding($case, 'UTF7-IMAP', false));
|
||||
var_dump(mb_check_encoding($case, 'UTF7-IMAP'));
|
||||
var_dump(addcslashes(mb_convert_encoding($case, 'UTF-8', 'UTF7-IMAP'), "\0..\37\177"));
|
||||
var_dump(mb_get_info('illegal_chars'));
|
||||
echo PHP_EOL;
|
||||
}
|
||||
|
||||
?>
|
||||
--EXPECT--
|
||||
non-base64 character after &
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(4) "A ?B"
|
||||
int(1)
|
||||
|
||||
non-base64 character after -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(true)
|
||||
string(5) "A - B"
|
||||
int(1)
|
||||
|
||||
base64 character before &
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(5) "A 1?B"
|
||||
int(2)
|
||||
|
||||
base64 character before -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(true)
|
||||
string(6) "A 1- B"
|
||||
int(2)
|
||||
|
||||
base64 character after &
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(4) "A ?B"
|
||||
int(3)
|
||||
|
||||
base64 character after -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(true)
|
||||
string(6) "A -1 B"
|
||||
int(3)
|
||||
|
||||
base64 character before and after &
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(5) "A 1?B"
|
||||
int(4)
|
||||
|
||||
base64 character before and after -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(true)
|
||||
string(7) "A 1-1 B"
|
||||
int(4)
|
||||
|
||||
string ends with &
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(3) "A ?"
|
||||
int(5)
|
||||
|
||||
string ends with -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(true)
|
||||
string(3) "A -"
|
||||
int(5)
|
||||
|
||||
& and -
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(true)
|
||||
string(5) "A & B"
|
||||
int(5)
|
||||
|
||||
- and &
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(5) "A -?B"
|
||||
int(6)
|
||||
|
||||
valid direct encoding character ~
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(true)
|
||||
string(5) "A ~ B"
|
||||
int(6)
|
||||
|
||||
invalid direct encoding character ESC
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(5) "A ? B"
|
||||
int(7)
|
||||
|
||||
valid direct encoding character ~ after &
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(5) "A ? B"
|
||||
int(8)
|
||||
|
||||
invalid direct encoding character ESC after &
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(5) "A ? B"
|
||||
int(9)
|
||||
|
||||
valid base64 character between & and -
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(true)
|
||||
string(13) "A 日本語 B"
|
||||
int(9)
|
||||
|
||||
invalid base64 character between & and -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(11) "A 日本? B"
|
||||
int(10)
|
||||
|
||||
valid base64 character between & and non-base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(13) "A 日本語?B"
|
||||
int(11)
|
||||
|
||||
invalid base64 character between & and non-base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(10) "A 日本?B"
|
||||
int(12)
|
||||
|
||||
valid base64 character between & and base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(13) "A 日本語?B"
|
||||
int(13)
|
||||
|
||||
invalid base64 character between & and base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(13) "A 日本誵?B"
|
||||
int(14)
|
||||
|
||||
valid base64 character between & and end of string
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(12) "A 日本語?"
|
||||
int(15)
|
||||
|
||||
invalid base64 character between & and end of string
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(9) "A 日本?"
|
||||
int(16)
|
||||
|
||||
valid base64 character using surrogate pair between & and -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(true)
|
||||
string(8) "A 𩸽 B"
|
||||
int(16)
|
||||
|
||||
invalid base64 character using surrogate pair between & and -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(5) "A ? B"
|
||||
int(17)
|
||||
|
||||
valid base64 character using surrogate pair between & and non-base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(8) "A 𩸽?B"
|
||||
int(18)
|
||||
|
||||
invalid base64 character using surrogate pair between & and non-base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(4) "A ?B"
|
||||
int(19)
|
||||
|
||||
valid base64 character using surrogate pair between & and base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(8) "A 𩸽?B"
|
||||
int(20)
|
||||
|
||||
invalid base64 character using surrogate pair between & and base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(4) "A ?B"
|
||||
int(21)
|
||||
|
||||
valid base64 character using surrogate pair between & and end of string
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(7) "A 𩸽?"
|
||||
int(22)
|
||||
|
||||
invalid base64 character using surrogate pair between & and end of string
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(3) "A ?"
|
||||
int(23)
|
155
ext/mbstring/tests/gh10648.phpt
Normal file
155
ext/mbstring/tests/gh10648.phpt
Normal file
|
@ -0,0 +1,155 @@
|
|||
--TEST--
|
||||
GH-10648 (mb_check_encoding() returns true for incorrect but interpretable ISO-2022-JP byte sequences)
|
||||
--EXTENSIONS--
|
||||
mbstring
|
||||
--FILE--
|
||||
<?php
|
||||
|
||||
// Hex-encoded byte sequences; each one is decoded with hex2bin() and checked as both JIS and ISO-2022-JP below
$testcases = [
|
||||
'ISO-2022-JP bytes' => '1b244224221b2842', // 'あ' in ISO-2022-JP
|
||||
'ISO-2022-JP bytes without escape sequence' => '1b24422422', // 'あ' as in the previous case, but without the closing ESC ( B
|
||||
'JIS X 0201 7bit kana with escape sequence' => '1b2849311b2842', // 'ア' in JIS
|
||||
'JIS X 0201 7bit kana with SO/SI' => '0e310f', // 'ア' in JIS
|
||||
'JIS X 0201 8bit kana' => 'b1', // 'ア' in JIS
|
||||
'JIS X 0201 7bit kana with SO and ESC' => '0e311b2842', // 'ア' in JIS
|
||||
'JIS X 0201 7bit kana with ESC and SI' => '1b2849310f', // 'ア' in JIS
|
||||
'JIS X 0208 character' => '1b244242641b2842', // '鯛' in JIS and ISO-2022-JP, included in JIS X 0208
|
||||
'JIS X 0212 character' => '1b2428446a591b2842', // '鮋' in JIS, included in JIS X 0212
|
||||
'JIS X 0213 character' => '1b2428507d4c1b2842', // '𩸽' in ISO-2022-JP-2004, included in JIS X 0213
|
||||
'JIS C 6220-1969 ESC ( H' => '1b284a1b2848', // ESC ( J followed by ESC ( H; the latter is an escape sequence transitioning to ASCII
|
||||
'SO/SI when not in ASCII mode' => '1b284a0e0f1b2842', // ESC ( J, then SO and SI, then ESC ( B back to ASCII
|
||||
];
|
||||
|
||||
foreach ($testcases as $title => $case) {
|
||||
echo $title . PHP_EOL;
|
||||
echo 'JIS:' . PHP_EOL;
|
||||
var_dump(mb_check_encoding(hex2bin($case), 'JIS'));
|
||||
echo mb_convert_encoding(hex2bin($case), 'UTF-8', 'JIS'). PHP_EOL;
|
||||
var_dump(mb_get_info('illegal_chars'));
|
||||
echo 'ISO-2022-JP:' . PHP_EOL;
|
||||
var_dump(mb_check_encoding(hex2bin($case), 'ISO-2022-JP'));
|
||||
echo mb_convert_encoding(hex2bin($case), 'UTF-8', 'ISO-2022-JP'). PHP_EOL;
|
||||
var_dump(mb_get_info('illegal_chars'));
|
||||
echo PHP_EOL;
|
||||
}
|
||||
?>
|
||||
--EXPECT--
|
||||
ISO-2022-JP bytes
|
||||
JIS:
|
||||
bool(true)
|
||||
あ
|
||||
int(0)
|
||||
ISO-2022-JP:
|
||||
bool(true)
|
||||
あ
|
||||
int(0)
|
||||
|
||||
ISO-2022-JP bytes without escape sequence
|
||||
JIS:
|
||||
bool(false)
|
||||
あ
|
||||
int(0)
|
||||
ISO-2022-JP:
|
||||
bool(false)
|
||||
あ
|
||||
int(0)
|
||||
|
||||
JIS X 0201 7bit kana with escape sequence
|
||||
JIS:
|
||||
bool(true)
|
||||
ア
|
||||
int(0)
|
||||
ISO-2022-JP:
|
||||
bool(false)
|
||||
ア
|
||||
int(0)
|
||||
|
||||
JIS X 0201 7bit kana with SO/SI
|
||||
JIS:
|
||||
bool(true)
|
||||
ア
|
||||
int(0)
|
||||
ISO-2022-JP:
|
||||
bool(false)
|
||||
ア
|
||||
int(0)
|
||||
|
||||
JIS X 0201 8bit kana
|
||||
JIS:
|
||||
bool(true)
|
||||
ア
|
||||
int(0)
|
||||
ISO-2022-JP:
|
||||
bool(false)
|
||||
ア
|
||||
int(0)
|
||||
|
||||
JIS X 0201 7bit kana with SO and ESC
|
||||
JIS:
|
||||
bool(false)
|
||||
ア
|
||||
int(0)
|
||||
ISO-2022-JP:
|
||||
bool(false)
|
||||
ア
|
||||
int(0)
|
||||
|
||||
JIS X 0201 7bit kana with ESC and SI
|
||||
JIS:
|
||||
bool(false)
|
||||
ア
|
||||
int(0)
|
||||
ISO-2022-JP:
|
||||
bool(false)
|
||||
ア
|
||||
int(0)
|
||||
|
||||
JIS X 0208 character
|
||||
JIS:
|
||||
bool(true)
|
||||
鯛
|
||||
int(0)
|
||||
ISO-2022-JP:
|
||||
bool(true)
|
||||
鯛
|
||||
int(0)
|
||||
|
||||
JIS X 0212 character
|
||||
JIS:
|
||||
bool(true)
|
||||
鮋
|
||||
int(0)
|
||||
ISO-2022-JP:
|
||||
bool(false)
|
||||
鮋
|
||||
int(0)
|
||||
|
||||
JIS X 0213 character
|
||||
JIS:
|
||||
bool(false)
|
||||
?$(P}L
|
||||
int(1)
|
||||
ISO-2022-JP:
|
||||
bool(false)
|
||||
?$(P}L
|
||||
int(2)
|
||||
|
||||
JIS C 6220-1969 ESC ( H
|
||||
JIS:
|
||||
bool(true)
|
||||
|
||||
int(2)
|
||||
ISO-2022-JP:
|
||||
bool(false)
|
||||
|
||||
int(2)
|
||||
|
||||
SO/SI when not in ASCII mode
|
||||
JIS:
|
||||
bool(false)
|
||||
|
||||
int(2)
|
||||
ISO-2022-JP:
|
||||
bool(false)
|
||||
|
||||
int(2)
|
|
@ -50,11 +50,6 @@ function testValid($from, $to, $encoding, $bothWays = true) {
|
|||
/* ESC ( B at the beginning is redundant, since ASCII mode is the default */
|
||||
if (substr($from, 0, 3) == "\x1B(B")
|
||||
$from = substr($from, 3, strlen($from) - 3);
|
||||
/* If the string switches to a different charset, it should switch back to
|
||||
* ASCII at the end */
|
||||
if (strpos($from, "\x1B\$B") !== false || strpos($from, "\x1B(J") !== false)
|
||||
$from .= "\x1B(B";
|
||||
|
||||
convertValidString($to, $from, 'UTF-16BE', $encoding, false);
|
||||
}
|
||||
}
|
||||
|
@ -66,11 +61,11 @@ function testInvalid($from, $to, $encoding) {
|
|||
for ($i = 0; $i < 0x80; $i++) {
|
||||
if ($i == 0xE || $i == 0xF || $i == 0x1B)
|
||||
continue;
|
||||
testValid(chr($i), "\x00" . chr($i), 'JIS');
|
||||
testValid("\x0F" . chr($i), "\x00" . chr($i), 'JIS'); /* 0xF is 'Shift In' code */
|
||||
testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'JIS');
|
||||
testValid(chr($i), "\x00" . chr($i), 'ISO-2022-JP');
|
||||
testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'ISO-2022-JP');
|
||||
testValid(chr($i), "\x00" . chr($i), 'JIS');
|
||||
convertValidString("\x0F" . chr($i), "\x00" . chr($i), 'JIS', 'UTF-16BE', false); /* 0xF is 'Shift In' code */
|
||||
testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'JIS');
|
||||
testValid(chr($i), "\x00" . chr($i), 'ISO-2022-JP');
|
||||
testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'ISO-2022-JP');
|
||||
}
|
||||
|
||||
for ($i = 0x80; $i < 256; $i++) {
|
||||
|
@ -92,27 +87,27 @@ echo "ASCII support OK\n";
|
|||
foreach ($jisx0201Chars as $jisx0201 => $utf16BE) {
|
||||
if (ord($jisx0201) >= 128) {
|
||||
$kana = chr(ord($jisx0201) - 128);
|
||||
testValid("\x1B(I" . $kana, $utf16BE, 'JIS', false);
|
||||
testValid("\x0E" . $kana, $utf16BE, 'JIS', false); /* 0xE is 'Shift Out' code */
|
||||
testValid("\x1B(I" . $kana . "\x1B(B", $utf16BE, 'JIS', false);
|
||||
testValid("\x0E" . $kana . "\x0F", $utf16BE, 'JIS', false); /* 0xE is 'Shift Out' code */
|
||||
testValid($jisx0201, $utf16BE, 'JIS', false);
|
||||
} else {
|
||||
testValid("\x1B(J" . $jisx0201, $utf16BE, 'JIS', $utf16BE > "\x00\x80");
|
||||
testValid("\x1B(J" . $jisx0201 . "\x1B(B", $utf16BE, 'JIS', $utf16BE > "\x00\x80");
|
||||
}
|
||||
}
|
||||
|
||||
for ($i = 0x80; $i < 256; $i++) {
|
||||
if ($i >= 0xA1 && $i <= 0xDF)
|
||||
continue;
|
||||
testInvalid("\x1B(I" . chr($i), "\x00%", 'JIS');
|
||||
testInvalid("\x1B(J" . chr($i), "\x00%", 'JIS');
|
||||
testInvalid("\x1B(I" . chr($i) . "\x1B(B", "\x00%", 'JIS');
|
||||
testInvalid("\x1B(J" . chr($i) . "\x1B(B", "\x00%", 'JIS');
|
||||
}
|
||||
|
||||
echo "JIS X 0201 support OK\n";
|
||||
|
||||
/* All valid JISX0208 characters */
|
||||
foreach ($jisx0208Chars as $jisx0208 => $utf16BE) {
|
||||
testValid("\x1B\$B" . $jisx0208, $utf16BE, 'JIS');
|
||||
testValid("\x1B\$B" . $jisx0208, $utf16BE, 'ISO-2022-JP');
|
||||
testValid("\x1B\$B" . $jisx0208 . "\x1B(B", $utf16BE, 'JIS');
|
||||
testValid("\x1B\$B" . $jisx0208 . "\x1B(B", $utf16BE, 'ISO-2022-JP');
|
||||
}
|
||||
|
||||
/* All invalid 2-byte JISX0208 characters */
|
||||
|
@ -120,8 +115,8 @@ for ($i = 0x21; $i <= 0x7E; $i++) {
|
|||
for ($j = 0; $j < 256; $j++) {
|
||||
$testString = chr($i) . chr($j);
|
||||
if (!isset($jisx0208Chars[$testString])) {
|
||||
testInvalid("\x1B\$B" . $testString, "\x00%", 'JIS');
|
||||
testInvalid("\x1B\$B" . $testString, "\x00%", 'ISO-2022-JP');
|
||||
testInvalid("\x1B\$B" . $testString . "\x1B(B", "\x00%", 'JIS');
|
||||
testInvalid("\x1B\$B" . $testString . "\x1B(B", "\x00%", 'ISO-2022-JP');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -138,7 +133,7 @@ echo "JIS X 0208 support OK\n";
|
|||
|
||||
/* All valid JISX0212 characters */
|
||||
foreach ($jisx0212Chars as $jisx0212 => $utf16BE) {
|
||||
testValid("\x1B\$(D" . $jisx0212, $utf16BE, 'JIS', false);
|
||||
testValid("\x1B\$(D" . $jisx0212 . "\x1B(B", $utf16BE, 'JIS', false);
|
||||
}
|
||||
|
||||
/* All invalid 2-byte JISX0212 characters */
|
||||
|
@ -146,42 +141,49 @@ for ($i = 0x21; $i <= 0x7E; $i++) {
|
|||
for ($j = 0; $j < 256; $j++) {
|
||||
$testString = chr($i) . chr($j);
|
||||
if (!isset($jisx0212Chars[$testString])) {
|
||||
testInvalid("\x1B\$(D" . $testString, "\x00%", 'JIS');
|
||||
testInvalid("\x1B\$(D" . $testString . "\x1B(B", "\x00%", 'JIS');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Try truncated JISX0212 characters */
|
||||
for ($i = 0x21; $i <= 0x7E; $i++) {
|
||||
testInvalid("\x1B\$(D" . chr($i), "\x00%", 'JIS');
|
||||
testInvalid("\x1B\$(D" . chr($i) . "\x1B(B", "\x00%\x00%", 'JIS');
|
||||
}
|
||||
|
||||
echo "JIS X 0212 support OK\n";
|
||||
|
||||
/* All possible escape sequences */
|
||||
$validEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B\$(@" => true, "\x1B\$(B" => true, "\x1B\$(D" => true, "\x1B(B" => true, "\x1B(H" => true, "\x1B(J" => true, "\x1B(I" => true];
|
||||
$validJisEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B\$(@" => true, "\x1B\$(B" => true, "\x1B\$(D" => true, "\x1B(B" => true, "\x1B(H" => true, "\x1B(J" => true, "\x1B(I" => true];
|
||||
$validIso2022jpEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B(B" => true, "\x1B(J" => true];
|
||||
for ($i = 0; $i <= 0xFF; $i++) {
|
||||
for ($j = 0; $j <= 0xFF; $j++) {
|
||||
$escapeSequence = "\x1B" . chr($i) . chr($j);
|
||||
if ($escapeSequence === "\x1B\$(")
|
||||
continue;
|
||||
if (isset($validEscapes[$escapeSequence])) {
|
||||
testValid($escapeSequence, "", 'JIS', false);
|
||||
testValid($escapeSequence, "", 'ISO-2022-JP', false);
|
||||
if (isset($validJisEscapes[$escapeSequence])) {
|
||||
testValid($escapeSequence . "\x1B(B", "", 'JIS', false);
|
||||
} else {
|
||||
identifyInvalidString($escapeSequence, 'JIS');
|
||||
identifyInvalidString($escapeSequence, 'ISO-2022-JP');
|
||||
identifyInvalidString($escapeSequence . "\x1B(B", 'JIS');
|
||||
}
|
||||
if (isset($validIso2022jpEscapes[$escapeSequence])) {
|
||||
testValid($escapeSequence . "\x1B(B", "", 'ISO-2022-JP', false);
|
||||
} else {
|
||||
identifyInvalidString($escapeSequence . "\x1B(B", 'ISO-2022-JP');
|
||||
}
|
||||
}
|
||||
}
|
||||
for ($i = 0; $i <= 0xFF; $i++) {
|
||||
$escapeSequence = "\x1B\$(" . chr($i);
|
||||
if (isset($validEscapes[$escapeSequence])) {
|
||||
testValid($escapeSequence, "", 'JIS', false);
|
||||
testValid($escapeSequence, "", 'ISO-2022-JP', false);
|
||||
if (isset($validJisEscapes[$escapeSequence])) {
|
||||
testValid($escapeSequence . "\x1B(B", "", 'JIS', false);
|
||||
} else {
|
||||
identifyInvalidString($escapeSequence, 'JIS');
|
||||
identifyInvalidString($escapeSequence, 'ISO-2022-JP');
|
||||
identifyInvalidString($escapeSequence . "\x1B(B", 'JIS');
|
||||
}
|
||||
if (isset($validIso2022jpEscapes[$escapeSequence])) {
|
||||
testValid($escapeSequence . "\x1B(B", "", 'ISO-2022-JP', false);
|
||||
} else {
|
||||
identifyInvalidString($escapeSequence . "\x1B(B", 'ISO-2022-JP');
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -980,17 +980,8 @@ testValidString('+' . encode('AB', 'ASCII') . '-+' . encode('CD', 'ASCII') . '-'
|
|||
// (Just trying to be exhaustive here)
|
||||
testValidString('+' . encode('AB', 'ASCII') . '-!+' . encode('CD', 'ASCII') . '-', "\x00A\x00B\x00!\x00C\x00D", 'UTF-7', 'UTF-16BE', false);
|
||||
|
||||
// + section terminated by a non-Base64 ASCII character which is NOT -
|
||||
for ($i = 0; $i < 128; $i++) {
|
||||
if ($i >= ord('A') && $i <= ord('Z'))
|
||||
continue;
|
||||
if ($i >= ord('a') && $i <= ord('z'))
|
||||
continue;
|
||||
if ($i >= ord('0') && $i <= ord('9'))
|
||||
continue;
|
||||
if ($i == ord('+') || $i == ord('/') || $i == ord('-') || $i == ord('\\') || $i == ord('~'))
|
||||
continue;
|
||||
$char = chr($i);
|
||||
// + section terminated by a non-Base64 direct character which is NOT -
|
||||
foreach (str_split(" \t\r\n'(),.:?!\"#$%&*;<=>@[]^_`{|}\x00") as $char) {
|
||||
testValidString('+' . encode("\x12\x34", 'UTF-16BE') . $char, "\x00\x00\x12\x34\x00\x00\x00" . $char, 'UTF-7', 'UTF-32BE', false);
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue