mirror of
https://github.com/php/php-src.git
synced 2025-08-16 05:58:45 +02:00
Merge branch 'PHP-8.2'
* PHP-8.2: Fix GH-10648: add check function pointer into mbfl_encoding
This commit is contained in:
commit
0779950768
45 changed files with 1739 additions and 110 deletions
|
@ -64,7 +64,8 @@ const mbfl_encoding mbfl_encoding_7bit = {
|
|||
&vtbl_7bit_wchar,
|
||||
&vtbl_wchar_7bit,
|
||||
mb_7bit_to_wchar,
|
||||
mb_wchar_to_7bit
|
||||
mb_wchar_to_7bit,
|
||||
NULL
|
||||
};
|
||||
|
||||
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)
|
||||
|
|
|
@ -44,7 +44,8 @@ const mbfl_encoding mbfl_encoding_base64 = {
|
|||
NULL,
|
||||
NULL,
|
||||
mb_base64_to_wchar,
|
||||
mb_wchar_to_base64
|
||||
mb_wchar_to_base64,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_8bit_b64 = {
|
||||
|
|
|
@ -69,7 +69,8 @@ const mbfl_encoding mbfl_encoding_big5 = {
|
|||
&vtbl_big5_wchar,
|
||||
&vtbl_wchar_big5,
|
||||
mb_big5_to_wchar,
|
||||
mb_wchar_to_big5
|
||||
mb_wchar_to_big5,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_cp950 = {
|
||||
|
@ -82,7 +83,8 @@ const mbfl_encoding mbfl_encoding_cp950 = {
|
|||
&vtbl_cp950_wchar,
|
||||
&vtbl_wchar_cp950,
|
||||
mb_cp950_to_wchar,
|
||||
mb_wchar_to_cp950
|
||||
mb_wchar_to_cp950,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_big5_wchar = {
|
||||
|
|
|
@ -61,7 +61,8 @@ const mbfl_encoding mbfl_encoding_cp50220 = {
|
|||
&vtbl_cp50220_wchar,
|
||||
&vtbl_wchar_cp50220,
|
||||
mb_cp5022x_to_wchar,
|
||||
mb_wchar_to_cp50220
|
||||
mb_wchar_to_cp50220,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_cp50221 = {
|
||||
|
@ -74,7 +75,8 @@ const mbfl_encoding mbfl_encoding_cp50221 = {
|
|||
&vtbl_cp50221_wchar,
|
||||
&vtbl_wchar_cp50221,
|
||||
mb_cp5022x_to_wchar,
|
||||
mb_wchar_to_cp50221
|
||||
mb_wchar_to_cp50221,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_cp50222 = {
|
||||
|
@ -87,7 +89,8 @@ const mbfl_encoding mbfl_encoding_cp50222 = {
|
|||
&vtbl_cp50222_wchar,
|
||||
&vtbl_wchar_cp50222,
|
||||
mb_cp5022x_to_wchar,
|
||||
mb_wchar_to_cp50222
|
||||
mb_wchar_to_cp50222,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_cp50220_wchar = {
|
||||
|
|
|
@ -69,7 +69,8 @@ const mbfl_encoding mbfl_encoding_cp51932 = {
|
|||
&vtbl_cp51932_wchar,
|
||||
&vtbl_wchar_cp51932,
|
||||
mb_cp51932_to_wchar,
|
||||
mb_wchar_to_cp51932
|
||||
mb_wchar_to_cp51932,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_cp51932_wchar = {
|
||||
|
|
|
@ -100,7 +100,8 @@ const mbfl_encoding mbfl_encoding_cp932 = {
|
|||
&vtbl_cp932_wchar,
|
||||
&vtbl_wchar_cp932,
|
||||
mb_cp932_to_wchar,
|
||||
mb_wchar_to_cp932
|
||||
mb_wchar_to_cp932,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_cp932_wchar = {
|
||||
|
@ -133,7 +134,8 @@ const mbfl_encoding mbfl_encoding_sjiswin = {
|
|||
&vtbl_sjiswin_wchar,
|
||||
&vtbl_wchar_sjiswin,
|
||||
mb_cp932_to_wchar,
|
||||
mb_wchar_to_sjiswin
|
||||
mb_wchar_to_sjiswin,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_sjiswin_wchar = {
|
||||
|
|
|
@ -68,7 +68,8 @@ const mbfl_encoding mbfl_encoding_cp936 = {
|
|||
&vtbl_cp936_wchar,
|
||||
&vtbl_wchar_cp936,
|
||||
mb_cp936_to_wchar,
|
||||
mb_wchar_to_cp936
|
||||
mb_wchar_to_cp936,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_cp936_wchar = {
|
||||
|
|
|
@ -67,7 +67,8 @@ const mbfl_encoding mbfl_encoding_euc_cn = {
|
|||
&vtbl_euccn_wchar,
|
||||
&vtbl_wchar_euccn,
|
||||
mb_euccn_to_wchar,
|
||||
mb_wchar_to_euccn
|
||||
mb_wchar_to_euccn,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_euccn_wchar = {
|
||||
|
|
|
@ -68,7 +68,8 @@ const mbfl_encoding mbfl_encoding_euc_jp = {
|
|||
&vtbl_eucjp_wchar,
|
||||
&vtbl_wchar_eucjp,
|
||||
mb_eucjp_to_wchar,
|
||||
mb_wchar_to_eucjp
|
||||
mb_wchar_to_eucjp,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_eucjp_wchar = {
|
||||
|
|
|
@ -69,7 +69,8 @@ const mbfl_encoding mbfl_encoding_eucjp_win = {
|
|||
&vtbl_eucjpwin_wchar,
|
||||
&vtbl_wchar_eucjpwin,
|
||||
mb_eucjpwin_to_wchar,
|
||||
mb_wchar_to_eucjpwin
|
||||
mb_wchar_to_eucjpwin,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_eucjpwin_wchar = {
|
||||
|
|
|
@ -66,7 +66,8 @@ const mbfl_encoding mbfl_encoding_euc_kr = {
|
|||
&vtbl_euckr_wchar,
|
||||
&vtbl_wchar_euckr,
|
||||
mb_euckr_to_wchar,
|
||||
mb_wchar_to_euckr
|
||||
mb_wchar_to_euckr,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_euckr_wchar = {
|
||||
|
|
|
@ -68,7 +68,8 @@ const mbfl_encoding mbfl_encoding_euc_tw = {
|
|||
&vtbl_euctw_wchar,
|
||||
&vtbl_wchar_euctw,
|
||||
mb_euctw_to_wchar,
|
||||
mb_wchar_to_euctw
|
||||
mb_wchar_to_euctw,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_euctw_wchar = {
|
||||
|
|
|
@ -49,7 +49,8 @@ const mbfl_encoding mbfl_encoding_gb18030 = {
|
|||
&vtbl_gb18030_wchar,
|
||||
&vtbl_wchar_gb18030,
|
||||
mb_gb18030_to_wchar,
|
||||
mb_wchar_to_gb18030
|
||||
mb_wchar_to_gb18030,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_gb18030_wchar = {
|
||||
|
|
|
@ -66,7 +66,8 @@ const mbfl_encoding mbfl_encoding_html_ent = {
|
|||
&vtbl_html_wchar,
|
||||
&vtbl_wchar_html,
|
||||
mb_htmlent_to_wchar,
|
||||
mb_wchar_to_htmlent
|
||||
mb_wchar_to_htmlent,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_wchar_html = {
|
||||
|
|
|
@ -47,7 +47,8 @@ const mbfl_encoding mbfl_encoding_hz = {
|
|||
&vtbl_hz_wchar,
|
||||
&vtbl_wchar_hz,
|
||||
mb_hz_to_wchar,
|
||||
mb_wchar_to_hz
|
||||
mb_wchar_to_hz,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_hz_wchar = {
|
||||
|
|
|
@ -51,7 +51,8 @@ const mbfl_encoding mbfl_encoding_2022jpms = {
|
|||
&vtbl_2022jpms_wchar,
|
||||
&vtbl_wchar_2022jpms,
|
||||
mb_iso2022jpms_to_wchar,
|
||||
mb_wchar_to_iso2022jpms
|
||||
mb_wchar_to_iso2022jpms,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_2022jpms_wchar = {
|
||||
|
|
|
@ -54,7 +54,8 @@ const mbfl_encoding mbfl_encoding_2022kr = {
|
|||
&vtbl_2022kr_wchar,
|
||||
&vtbl_wchar_2022kr,
|
||||
mb_iso2022kr_to_wchar,
|
||||
mb_wchar_to_iso2022kr
|
||||
mb_wchar_to_iso2022kr,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_wchar_2022kr = {
|
||||
|
|
|
@ -71,7 +71,8 @@ const mbfl_encoding mbfl_encoding_2022jp_kddi = {
|
|||
&vtbl_2022jp_kddi_wchar,
|
||||
&vtbl_wchar_2022jp_kddi,
|
||||
mb_iso2022jp_kddi_to_wchar,
|
||||
mb_wchar_to_iso2022jp_kddi
|
||||
mb_wchar_to_iso2022jp_kddi,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_2022jp_kddi_wchar = {
|
||||
|
|
|
@ -37,6 +37,8 @@ static int mbfl_filt_conv_jis_wchar_flush(mbfl_convert_filter *filter);
|
|||
static size_t mb_iso2022jp_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
|
||||
static void mb_wchar_to_iso2022jp(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
|
||||
static void mb_wchar_to_jis(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
|
||||
static bool mb_check_iso2022jp(unsigned char *in, size_t in_len);
|
||||
static bool mb_check_jis(unsigned char *in, size_t in_len);
|
||||
|
||||
const mbfl_encoding mbfl_encoding_jis = {
|
||||
mbfl_no_encoding_jis,
|
||||
|
@ -49,6 +51,7 @@ const mbfl_encoding mbfl_encoding_jis = {
|
|||
&vtbl_wchar_jis,
|
||||
mb_iso2022jp_to_wchar,
|
||||
mb_wchar_to_jis,
|
||||
mb_check_jis
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_2022jp = {
|
||||
|
@ -61,7 +64,8 @@ const mbfl_encoding mbfl_encoding_2022jp = {
|
|||
&vtbl_2022jp_wchar,
|
||||
&vtbl_wchar_2022jp,
|
||||
mb_iso2022jp_to_wchar,
|
||||
mb_wchar_to_iso2022jp
|
||||
mb_wchar_to_iso2022jp,
|
||||
mb_check_iso2022jp
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_jis_wchar = {
|
||||
|
@ -780,3 +784,161 @@ static void mb_wchar_to_jis(uint32_t *in, size_t len, mb_convert_buf *buf, bool
|
|||
|
||||
MB_CONVERT_BUF_STORE(buf, out, limit);
|
||||
}
|
||||
|
||||
#define JISX_0201_KANA_SO 5
|
||||
|
||||
static bool mb_check_jis(unsigned char *in, size_t in_len)
|
||||
{
|
||||
unsigned char *p = in, *e = p + in_len;
|
||||
unsigned int state = ASCII;
|
||||
|
||||
while (p < e) {
|
||||
unsigned char c = *p++;
|
||||
if (c == 0x1B) {
|
||||
/* ESC seen; this is an escape sequence */
|
||||
if (state == JISX_0201_KANA_SO) {
|
||||
return false;
|
||||
}
|
||||
if ((e - p) < 2) {
|
||||
return false;
|
||||
}
|
||||
unsigned char c2 = *p++;
|
||||
if (c2 == '$') {
|
||||
unsigned char c3 = *p++;
|
||||
if (c3 == '@' || c3 == 'B') {
|
||||
state = JISX_0208;
|
||||
} else if (c3 == '(') {
|
||||
if (p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char c4 = *p++;
|
||||
if (c4 == '@' || c4 == 'B') {
|
||||
state = JISX_0208;
|
||||
} else if (c4 == 'D') {
|
||||
state = JISX_0212;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else if (c2 == '(') {
|
||||
unsigned char c3 = *p++;
|
||||
/* ESC ( H is treated as a sequence transitioning to ASCII for historical reasons.
|
||||
* see https://github.com/php/php-src/pull/10828#issuecomment-1478342432. */
|
||||
if (c3 == 'B' || c3 == 'H') {
|
||||
state = ASCII;
|
||||
} else if (c3 == 'J') {
|
||||
state = JISX_0201_LATIN;
|
||||
} else if (c3 == 'I') {
|
||||
state = JISX_0201_KANA;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else if (c == 0xE) {
|
||||
/* "Kana In" marker */
|
||||
if (state != ASCII) {
|
||||
return false;
|
||||
}
|
||||
state = JISX_0201_KANA_SO;
|
||||
} else if (c == 0xF) {
|
||||
/* "Kana Out" marker */
|
||||
if (state != JISX_0201_KANA_SO) {
|
||||
return false;
|
||||
}
|
||||
state = ASCII;
|
||||
} else if ((state == JISX_0208 || state == JISX_0212) && (c > 0x20 && c < 0x7F)) {
|
||||
if (p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char c2 = *p++;
|
||||
if (c2 > 0x20 && c2 < 0x7F) {
|
||||
unsigned int s = (c - 0x21)*94 + c2 - 0x21;
|
||||
if (state == JISX_0208) {
|
||||
if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) {
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
if (s < jisx0212_ucs_table_size && jisx0212_ucs_table[s]) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else if (c < 0x80) {
|
||||
continue;
|
||||
} else if (c >= 0xA1 && c <= 0xDF) {
|
||||
/* GR-invoked Kana */
|
||||
continue;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return state == ASCII;
|
||||
}
|
||||
|
||||
|
||||
static bool mb_check_iso2022jp(unsigned char *in, size_t in_len)
|
||||
{
|
||||
unsigned char *p = in, *e = p + in_len;
|
||||
unsigned int state = ASCII;
|
||||
|
||||
while (p < e) {
|
||||
unsigned char c = *p++;
|
||||
if (c == 0x1B) {
|
||||
/* ESC seen; this is an escape sequence */
|
||||
if ((e - p) < 2) {
|
||||
return false;
|
||||
}
|
||||
unsigned char c2 = *p++;
|
||||
if (c2 == '$') {
|
||||
unsigned char c3 = *p++;
|
||||
if (c3 == '@' || c3 == 'B') {
|
||||
state = JISX_0208;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else if (c2 == '(') {
|
||||
unsigned char c3 = *p++;
|
||||
if (c3 == 'B') {
|
||||
state = ASCII;
|
||||
} else if (c3 == 'J') {
|
||||
state = JISX_0201_LATIN;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else if (c == 0xE || c == 0xF) {
|
||||
/* "Kana In" or "Kana Out" marker; ISO-2022-JP is not accepted. */
|
||||
return false;
|
||||
} else if (state == JISX_0208 && (c > 0x20 && c < 0x7F)) {
|
||||
if (p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char c2 = *p++;
|
||||
if (c2 > 0x20 && c2 < 0x7F) {
|
||||
unsigned int s = (c - 0x21)*94 + c2 - 0x21;
|
||||
if (s < jisx0208_ucs_table_size && jisx0208_ucs_table[s]) {
|
||||
continue;
|
||||
}
|
||||
return false;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
} else if (c < 0x80) {
|
||||
continue;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
return state == ASCII;
|
||||
}
|
||||
|
|
|
@ -45,7 +45,8 @@ const mbfl_encoding mbfl_encoding_qprint = {
|
|||
NULL,
|
||||
NULL,
|
||||
mb_qprint_to_wchar,
|
||||
mb_wchar_to_qprint
|
||||
mb_wchar_to_qprint,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_8bit_qprint = {
|
||||
|
|
|
@ -86,7 +86,8 @@ static int mbfl_conv_reverselookup_table(int c, mbfl_convert_filter *filter, int
|
|||
&vtbl_##id##_wchar, \
|
||||
&vtbl_wchar_##id, \
|
||||
mb_##id##_to_wchar, \
|
||||
mb_wchar_to_##id \
|
||||
mb_wchar_to_##id, \
|
||||
NULL \
|
||||
}
|
||||
|
||||
/* For single-byte encodings which use a conversion table */
|
||||
|
|
|
@ -130,7 +130,8 @@ const mbfl_encoding mbfl_encoding_sjis = {
|
|||
&vtbl_sjis_wchar,
|
||||
&vtbl_wchar_sjis,
|
||||
mb_sjis_to_wchar,
|
||||
mb_wchar_to_sjis
|
||||
mb_wchar_to_sjis,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_sjis_wchar = {
|
||||
|
|
|
@ -67,7 +67,8 @@ const mbfl_encoding mbfl_encoding_sjis2004 = {
|
|||
&vtbl_sjis2004_wchar,
|
||||
&vtbl_wchar_sjis2004,
|
||||
mb_sjis2004_to_wchar,
|
||||
mb_wchar_to_sjis2004
|
||||
mb_wchar_to_sjis2004,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_sjis2004_wchar = {
|
||||
|
@ -100,7 +101,8 @@ const mbfl_encoding mbfl_encoding_eucjp2004 = {
|
|||
&vtbl_eucjp2004_wchar,
|
||||
&vtbl_wchar_eucjp2004,
|
||||
mb_eucjp2004_to_wchar,
|
||||
mb_wchar_to_eucjp2004
|
||||
mb_wchar_to_eucjp2004,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_eucjp2004_wchar = {
|
||||
|
@ -133,7 +135,8 @@ const mbfl_encoding mbfl_encoding_2022jp_2004 = {
|
|||
&vtbl_2022jp_2004_wchar,
|
||||
&vtbl_wchar_2022jp_2004,
|
||||
mb_iso2022jp2004_to_wchar,
|
||||
mb_wchar_to_iso2022jp2004
|
||||
mb_wchar_to_iso2022jp2004,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_2022jp_2004_wchar = {
|
||||
|
|
|
@ -56,7 +56,8 @@ const mbfl_encoding mbfl_encoding_ucs2 = {
|
|||
&vtbl_ucs2_wchar,
|
||||
&vtbl_wchar_ucs2,
|
||||
mb_ucs2_to_wchar,
|
||||
mb_wchar_to_ucs2be
|
||||
mb_wchar_to_ucs2be,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_ucs2be = {
|
||||
|
@ -69,7 +70,8 @@ const mbfl_encoding mbfl_encoding_ucs2be = {
|
|||
&vtbl_ucs2be_wchar,
|
||||
&vtbl_wchar_ucs2be,
|
||||
mb_ucs2be_to_wchar,
|
||||
mb_wchar_to_ucs2be
|
||||
mb_wchar_to_ucs2be,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_ucs2le = {
|
||||
|
@ -82,7 +84,8 @@ const mbfl_encoding mbfl_encoding_ucs2le = {
|
|||
&vtbl_ucs2le_wchar,
|
||||
&vtbl_wchar_ucs2le,
|
||||
mb_ucs2le_to_wchar,
|
||||
mb_wchar_to_ucs2le
|
||||
mb_wchar_to_ucs2le,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_ucs2_wchar = {
|
||||
|
|
|
@ -56,7 +56,8 @@ const mbfl_encoding mbfl_encoding_ucs4 = {
|
|||
&vtbl_ucs4_wchar,
|
||||
&vtbl_wchar_ucs4,
|
||||
mb_ucs4_to_wchar,
|
||||
mb_wchar_to_ucs4be
|
||||
mb_wchar_to_ucs4be,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_ucs4be = {
|
||||
|
@ -69,7 +70,8 @@ const mbfl_encoding mbfl_encoding_ucs4be = {
|
|||
&vtbl_ucs4be_wchar,
|
||||
&vtbl_wchar_ucs4be,
|
||||
mb_ucs4be_to_wchar,
|
||||
mb_wchar_to_ucs4be
|
||||
mb_wchar_to_ucs4be,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_ucs4le = {
|
||||
|
@ -82,7 +84,8 @@ const mbfl_encoding mbfl_encoding_ucs4le = {
|
|||
&vtbl_ucs4le_wchar,
|
||||
&vtbl_wchar_ucs4le,
|
||||
mb_ucs4le_to_wchar,
|
||||
mb_wchar_to_ucs4le
|
||||
mb_wchar_to_ucs4le,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_ucs4_wchar = {
|
||||
|
|
|
@ -71,7 +71,8 @@ const mbfl_encoding mbfl_encoding_uhc = {
|
|||
&vtbl_uhc_wchar,
|
||||
&vtbl_wchar_uhc,
|
||||
mb_uhc_to_wchar,
|
||||
mb_wchar_to_uhc
|
||||
mb_wchar_to_uhc,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_uhc_wchar = {
|
||||
|
|
|
@ -188,7 +188,8 @@ const mbfl_encoding mbfl_encoding_utf16 = {
|
|||
&vtbl_utf16_wchar,
|
||||
&vtbl_wchar_utf16,
|
||||
mb_utf16_to_wchar,
|
||||
mb_wchar_to_utf16be
|
||||
mb_wchar_to_utf16be,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_utf16be = {
|
||||
|
@ -201,7 +202,8 @@ const mbfl_encoding mbfl_encoding_utf16be = {
|
|||
&vtbl_utf16be_wchar,
|
||||
&vtbl_wchar_utf16be,
|
||||
mb_utf16be_to_wchar,
|
||||
mb_wchar_to_utf16be
|
||||
mb_wchar_to_utf16be,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_utf16le = {
|
||||
|
@ -214,7 +216,8 @@ const mbfl_encoding mbfl_encoding_utf16le = {
|
|||
&vtbl_utf16le_wchar,
|
||||
&vtbl_wchar_utf16le,
|
||||
mb_utf16le_to_wchar,
|
||||
mb_wchar_to_utf16le
|
||||
mb_wchar_to_utf16le,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_utf16_wchar = {
|
||||
|
|
|
@ -49,7 +49,8 @@ const mbfl_encoding mbfl_encoding_utf32 = {
|
|||
&vtbl_utf32_wchar,
|
||||
&vtbl_wchar_utf32,
|
||||
mb_utf32_to_wchar,
|
||||
mb_wchar_to_utf32be
|
||||
mb_wchar_to_utf32be,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_utf32be = {
|
||||
|
@ -62,7 +63,8 @@ const mbfl_encoding mbfl_encoding_utf32be = {
|
|||
&vtbl_utf32be_wchar,
|
||||
&vtbl_wchar_utf32be,
|
||||
mb_utf32be_to_wchar,
|
||||
mb_wchar_to_utf32be
|
||||
mb_wchar_to_utf32be,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_utf32le = {
|
||||
|
@ -75,7 +77,8 @@ const mbfl_encoding mbfl_encoding_utf32le = {
|
|||
&vtbl_utf32le_wchar,
|
||||
&vtbl_wchar_utf32le,
|
||||
mb_utf32le_to_wchar,
|
||||
mb_wchar_to_utf32le
|
||||
mb_wchar_to_utf32le,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_utf32_wchar = {
|
||||
|
|
|
@ -29,10 +29,12 @@
|
|||
|
||||
#include "mbfilter.h"
|
||||
#include "mbfilter_utf7.h"
|
||||
#include "utf7_helper.h"
|
||||
|
||||
static int mbfl_filt_conv_utf7_wchar_flush(mbfl_convert_filter *filter);
|
||||
static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
|
||||
static void mb_wchar_to_utf7(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
|
||||
static bool mb_check_utf7(unsigned char *in, size_t in_len);
|
||||
|
||||
static const unsigned char mbfl_base64_table[] = {
|
||||
/* 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', */
|
||||
|
@ -59,7 +61,8 @@ const mbfl_encoding mbfl_encoding_utf7 = {
|
|||
&vtbl_utf7_wchar,
|
||||
&vtbl_wchar_utf7,
|
||||
mb_utf7_to_wchar,
|
||||
mb_wchar_to_utf7
|
||||
mb_wchar_to_utf7,
|
||||
mb_check_utf7
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_utf7_wchar = {
|
||||
|
@ -408,16 +411,24 @@ int mbfl_filt_conv_wchar_utf7_flush(mbfl_convert_filter *filter)
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* Ways which a Base64-encoded section can end: */
|
||||
#define DASH 0xFD
|
||||
#define ASCII 0xFE
|
||||
#define ILLEGAL 0xFF
|
||||
|
||||
static inline bool is_base64_end(unsigned char c)
|
||||
{
|
||||
return c >= DASH;
|
||||
}
|
||||
|
||||
/* Returns true if `c` is one of the punctuation characters which UTF-7 allows
 * to appear either Base64-encoded or directly:
 *   ! " # $ % & * ; < = > @ [ ] ^ _ ` { | } */
static bool is_optional_direct(unsigned char c)
{
	/* Characters that are allowed to be encoded by Base64 or directly encoded */
	return c == '!' || c == '"' || c == '#' || c == '$' || c == '%' || c == '&' || c == '*' || c == ';' || c == '<' ||
		c == '=' || c == '>' || c == '@' || c == '[' || c == ']' || c == '^' || c == '_' || c == '`' || c == '{' ||
		c == '|' || c == '}';
}
|
||||
|
||||
/* Returns true if `c` is whitespace (space, tab, CR, LF) or one of the
 * punctuation characters ' ( ) , . : ?
 * The name indicates these are the directly-written characters that may
 * terminate a Base64 section. */
static bool can_end_base64(uint32_t c)
{
	return c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\'' || c == '(' || c == ')' || c == ',' || c == '.' || c == ':' || c == '?';
}
|
||||
|
||||
static unsigned char decode_base64(unsigned char c)
|
||||
{
|
||||
if (c >= 'A' && c <= 'Z') {
|
||||
|
@ -432,6 +443,8 @@ static unsigned char decode_base64(unsigned char c)
|
|||
return 63;
|
||||
} else if (c == '-') {
|
||||
return DASH;
|
||||
} else if (can_end_base64(c) || is_optional_direct(c) || c == '\0') {
|
||||
return DIRECT;
|
||||
} else if (c <= 0x7F) {
|
||||
return ASCII;
|
||||
}
|
||||
|
@ -470,7 +483,7 @@ static uint32_t* handle_base64_end(unsigned char n, unsigned char **p, uint32_t
|
|||
|
||||
if (n == ILLEGAL) {
|
||||
*out++ = MBFL_BAD_INPUT;
|
||||
} else if (n == ASCII) {
|
||||
} else if (n == DIRECT || n == ASCII) {
|
||||
(*p)--; /* Unconsume byte */
|
||||
}
|
||||
|
||||
|
@ -596,11 +609,6 @@ static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
|
|||
return out - buf;
|
||||
}
|
||||
|
||||
/* Returns true if `c` is whitespace (space, tab, CR, LF) or one of the
 * punctuation characters ' ( ) , . : ?
 * The name indicates these are the directly-written characters that may
 * terminate a Base64 section. */
static bool can_end_base64(uint32_t c)
{
	return c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\'' || c == '(' || c == ')' || c == ',' || c == '.' || c == ':' || c == '?';
}
|
||||
|
||||
static bool should_direct_encode(uint32_t c)
|
||||
{
|
||||
return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '\0' || c == '/' || c == '-' || can_end_base64(c);
|
||||
|
@ -700,3 +708,129 @@ static void mb_wchar_to_utf7(uint32_t *in, size_t len, mb_convert_buf *buf, bool
|
|||
|
||||
MB_CONVERT_BUF_STORE(buf, out, limit);
|
||||
}
|
||||
|
||||
/* Checks whether the UTF-16 code unit `cp` is acceptable at this position.
 *
 * is_surrogate: true if the previous code unit was the first half of a
 *               surrogate pair, so a second half (0xDC00-0xDFFF) is required.
 *
 * When no second half is pending, any code unit except a second half is
 * accepted. */
static bool is_utf16_cp_valid(uint16_t cp, bool is_surrogate)
{
	if (is_surrogate) {
		return cp >= 0xDC00 && cp <= 0xDFFF;
	} else {
		/* 2nd part of surrogate pair came unexpectedly */
		return !(cp >= 0xDC00 && cp <= 0xDFFF);
	}
}
|
||||
|
||||
/* Returns true if byte `c` may appear outside a Base64 section: it is either
 * accepted by should_direct_encode(), by is_optional_direct(), or is NUL. */
static bool can_encode_directly(unsigned char c)
{
	/* NOTE(review): should_direct_encode() appears to accept '\0' already, so the
	 * explicit NUL test looks redundant; kept so this does not depend on that. */
	return should_direct_encode(c) || is_optional_direct(c) || c == '\0';
}
|
||||
|
||||
static bool mb_check_utf7(unsigned char *in, size_t in_len)
|
||||
{
|
||||
unsigned char *p = in, *e = p + in_len;
|
||||
bool base64 = false;
|
||||
bool is_surrogate = false;
|
||||
|
||||
while (p < e) {
|
||||
if (base64) {
|
||||
unsigned char n1 = decode_base64(*p++);
|
||||
if (is_base64_end(n1)) {
|
||||
if (!is_base64_end_valid(n1, false, is_surrogate)) {
|
||||
return false;
|
||||
}
|
||||
base64 = false;
|
||||
continue;
|
||||
} else if (p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char n2 = decode_base64(*p++);
|
||||
if (is_base64_end(n2) || p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char n3 = decode_base64(*p++);
|
||||
if (is_base64_end(n3)) {
|
||||
return false;
|
||||
}
|
||||
uint16_t cp1 = (n1 << 10) | (n2 << 4) | ((n3 & 0x3C) >> 2);
|
||||
if (!is_utf16_cp_valid(cp1, is_surrogate)) {
|
||||
return false;
|
||||
}
|
||||
is_surrogate = has_surrogate(cp1, is_surrogate);
|
||||
if (p == e) {
|
||||
/* It is an error if trailing padding bits are not zeroes or if we were
|
||||
* expecting the 2nd part of a surrogate pair when Base64 section ends */
|
||||
return !((n3 & 0x3) || is_surrogate);
|
||||
}
|
||||
|
||||
unsigned char n4 = decode_base64(*p++);
|
||||
if (is_base64_end(n4)) {
|
||||
if (!is_base64_end_valid(n4, n3 & 0x3, is_surrogate)) {
|
||||
return false;
|
||||
}
|
||||
base64 = false;
|
||||
continue;
|
||||
} else if (p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char n5 = decode_base64(*p++);
|
||||
if (is_base64_end(n5) || p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char n6 = decode_base64(*p++);
|
||||
if (is_base64_end(n6)) {
|
||||
return false;
|
||||
}
|
||||
uint16_t cp2 = (n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4);
|
||||
if (!is_utf16_cp_valid(cp2, is_surrogate)) {
|
||||
return false;
|
||||
}
|
||||
is_surrogate = has_surrogate(cp2, is_surrogate);
|
||||
if (p == e) {
|
||||
return !((n6 & 0xF) || is_surrogate);
|
||||
}
|
||||
|
||||
unsigned char n7 = decode_base64(*p++);
|
||||
if (is_base64_end(n7)) {
|
||||
if (!is_base64_end_valid(n7, n6 & 0xF, is_surrogate)) {
|
||||
return false;
|
||||
}
|
||||
base64 = false;
|
||||
continue;
|
||||
} else if (p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char n8 = decode_base64(*p++);
|
||||
if (is_base64_end(n8)) {
|
||||
return false;
|
||||
}
|
||||
uint16_t cp3 = (n6 << 12) | (n7 << 6) | n8;
|
||||
if (!is_utf16_cp_valid(cp3, is_surrogate)) {
|
||||
return false;
|
||||
}
|
||||
is_surrogate = has_surrogate(cp3, is_surrogate);
|
||||
} else {
|
||||
/* ASCII text section */
|
||||
unsigned char c = *p++;
|
||||
|
||||
if (c == '+') {
|
||||
if (p == e) {
|
||||
base64 = true;
|
||||
return !is_surrogate;
|
||||
}
|
||||
unsigned char n = decode_base64(*p);
|
||||
if (n == DASH) {
|
||||
p++;
|
||||
} else if (n > DASH) {
|
||||
/* If a "+" character followed immediately by any character other than base64 or "-" */
|
||||
return false;
|
||||
} else {
|
||||
base64 = true;
|
||||
}
|
||||
} else if (can_encode_directly(c)) {
|
||||
continue;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return !is_surrogate;
|
||||
}
|
||||
|
|
|
@ -77,11 +77,13 @@
|
|||
|
||||
#include "mbfilter.h"
|
||||
#include "mbfilter_utf7imap.h"
|
||||
#include "utf7_helper.h"
|
||||
|
||||
static int mbfl_filt_conv_wchar_utf7imap_flush(mbfl_convert_filter *filter);
|
||||
static int mbfl_filt_conv_utf7imap_wchar_flush(mbfl_convert_filter *filter);
|
||||
static size_t mb_utf7imap_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
|
||||
static void mb_wchar_to_utf7imap(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
|
||||
static bool mb_check_utf7imap(unsigned char *in, size_t in_len);
|
||||
|
||||
static const char *mbfl_encoding_utf7imap_aliases[] = {"mUTF-7", NULL};
|
||||
|
||||
|
@ -95,7 +97,8 @@ const mbfl_encoding mbfl_encoding_utf7imap = {
|
|||
&vtbl_utf7imap_wchar,
|
||||
&vtbl_wchar_utf7imap,
|
||||
mb_utf7imap_to_wchar,
|
||||
mb_wchar_to_utf7imap
|
||||
mb_wchar_to_utf7imap,
|
||||
mb_check_utf7imap
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_utf7imap_wchar = {
|
||||
|
@ -444,10 +447,6 @@ static int mbfl_filt_conv_wchar_utf7imap_flush(mbfl_convert_filter *filter)
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* Ways which a Base64-encoded section can end: */
|
||||
#define DASH 0xFE
|
||||
#define ILLEGAL 0xFF
|
||||
|
||||
static inline bool is_base64_end(unsigned char c)
|
||||
{
|
||||
return c >= DASH;
|
||||
|
@ -732,3 +731,124 @@ static void mb_wchar_to_utf7imap(uint32_t *in, size_t len, mb_convert_buf *buf,
|
|||
|
||||
MB_CONVERT_BUF_STORE(buf, out, limit);
|
||||
}
|
||||
|
||||
/* Checks whether the UTF-16 code unit `cp`, decoded from a Base64 section of
 * UTF7-IMAP text, is acceptable at this position.
 *
 * is_surrogate: true if the previous code unit was the first half of a
 *               surrogate pair, so a second half (0xDC00-0xDFFF) is required.
 *
 * When no second half is pending, the code unit is rejected if it is a stray
 * second half, or if it lies in 0x20-0x7E and is not '&'. */
static bool is_utf16_cp_valid(uint16_t cp, bool is_surrogate)
{
	if (is_surrogate) {
		return cp >= 0xDC00 && cp <= 0xDFFF;
	} else if (cp >= 0xDC00 && cp <= 0xDFFF) {
		/* 2nd part of surrogate pair came unexpectedly */
		return false;
	} else if (cp >= 0x20 && cp <= 0x7E && cp != '&') {
		/* Rejected inside Base64; the checker accepts this byte range directly
		 * outside of Base64 sections */
		return false;
	}
	return true;
}
|
||||
|
||||
static bool mb_check_utf7imap(unsigned char *in, size_t in_len)
|
||||
{
|
||||
unsigned char *p = in, *e = p + in_len;
|
||||
bool base64 = false;
|
||||
bool is_surrogate = false;
|
||||
|
||||
while (p < e) {
|
||||
if (base64) {
|
||||
/* Base64 section */
|
||||
unsigned char n1 = decode_base64(*p++);
|
||||
if (is_base64_end(n1)) {
|
||||
if (!is_base64_end_valid(n1, false, is_surrogate)) {
|
||||
return false;
|
||||
}
|
||||
base64 = false;
|
||||
continue;
|
||||
} else if (p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char n2 = decode_base64(*p++);
|
||||
if (is_base64_end(n2) || p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char n3 = decode_base64(*p++);
|
||||
if (is_base64_end(n3)) {
|
||||
return false;
|
||||
}
|
||||
uint16_t cp1 = (n1 << 10) | (n2 << 4) | ((n3 & 0x3C) >> 2);
|
||||
if (!is_utf16_cp_valid(cp1, is_surrogate)) {
|
||||
return false;
|
||||
}
|
||||
is_surrogate = has_surrogate(cp1, is_surrogate);
|
||||
if (p == e) {
|
||||
return false;
|
||||
}
|
||||
|
||||
unsigned char n4 = decode_base64(*p++);
|
||||
if (is_base64_end(n4)) {
|
||||
if (!is_base64_end_valid(n4, n3 & 0x3, is_surrogate)) {
|
||||
return false;
|
||||
}
|
||||
base64 = false;
|
||||
continue;
|
||||
} else if (p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char n5 = decode_base64(*p++);
|
||||
if (is_base64_end(n5) || p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char n6 = decode_base64(*p++);
|
||||
if (is_base64_end(n6)) {
|
||||
return false;
|
||||
}
|
||||
uint16_t cp2 = (n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4);
|
||||
if (!is_utf16_cp_valid(cp2, is_surrogate)) {
|
||||
return false;
|
||||
}
|
||||
is_surrogate = has_surrogate(cp2, is_surrogate);
|
||||
if (p == e) {
|
||||
return false;
|
||||
}
|
||||
|
||||
unsigned char n7 = decode_base64(*p++);
|
||||
if (is_base64_end(n7)) {
|
||||
if (!is_base64_end_valid(n7, n6 & 0xF, is_surrogate)) {
|
||||
return false;
|
||||
}
|
||||
base64 = false;
|
||||
continue;
|
||||
} else if (p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char n8 = decode_base64(*p++);
|
||||
if (is_base64_end(n8)) {
|
||||
return false;
|
||||
}
|
||||
uint16_t cp3 = (n6 << 12) | (n7 << 6) | n8;
|
||||
if (!is_utf16_cp_valid(cp3, is_surrogate)) {
|
||||
return false;
|
||||
}
|
||||
is_surrogate = has_surrogate(cp3, is_surrogate);
|
||||
} else {
|
||||
/* ASCII text section */
|
||||
unsigned char c = *p++;
|
||||
|
||||
if (c == '&') {
|
||||
if (p == e) {
|
||||
return false;
|
||||
}
|
||||
unsigned char n = decode_base64(*p);
|
||||
if (n == DASH) {
|
||||
p++;
|
||||
} else if (n == ILLEGAL) {
|
||||
return false;
|
||||
} else {
|
||||
base64 = true;
|
||||
}
|
||||
} else if (c >= 0x20 && c <= 0x7E) {
|
||||
continue;
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
return !base64;
|
||||
}
|
||||
|
|
|
@ -64,7 +64,8 @@ const mbfl_encoding mbfl_encoding_utf8 = {
|
|||
&vtbl_utf8_wchar,
|
||||
&vtbl_wchar_utf8,
|
||||
mb_utf8_to_wchar,
|
||||
mb_wchar_to_utf8
|
||||
mb_wchar_to_utf8,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_utf8_wchar = {
|
||||
|
|
|
@ -63,7 +63,8 @@ const mbfl_encoding mbfl_encoding_utf8_docomo = {
|
|||
&vtbl_utf8_docomo_wchar,
|
||||
&vtbl_wchar_utf8_docomo,
|
||||
mb_utf8_docomo_to_wchar,
|
||||
mb_wchar_to_utf8_docomo
|
||||
mb_wchar_to_utf8_docomo,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_utf8_kddi_a = {
|
||||
|
@ -76,7 +77,8 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_a = {
|
|||
&vtbl_utf8_kddi_a_wchar,
|
||||
&vtbl_wchar_utf8_kddi_a,
|
||||
mb_utf8_kddi_a_to_wchar,
|
||||
mb_wchar_to_utf8_kddi_a
|
||||
mb_wchar_to_utf8_kddi_a,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_utf8_kddi_b = {
|
||||
|
@ -89,7 +91,8 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_b = {
|
|||
&vtbl_utf8_kddi_b_wchar,
|
||||
&vtbl_wchar_utf8_kddi_b,
|
||||
mb_utf8_kddi_b_to_wchar,
|
||||
mb_wchar_to_utf8_kddi_b
|
||||
mb_wchar_to_utf8_kddi_b,
|
||||
NULL
|
||||
};
|
||||
|
||||
const mbfl_encoding mbfl_encoding_utf8_sb = {
|
||||
|
@ -102,7 +105,8 @@ const mbfl_encoding mbfl_encoding_utf8_sb = {
|
|||
&vtbl_utf8_sb_wchar,
|
||||
&vtbl_wchar_utf8_sb,
|
||||
mb_utf8_sb_to_wchar,
|
||||
mb_wchar_to_utf8_sb
|
||||
mb_wchar_to_utf8_sb,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_utf8_docomo_wchar = {
|
||||
|
|
|
@ -43,7 +43,8 @@ const mbfl_encoding mbfl_encoding_uuencode = {
|
|||
NULL,
|
||||
NULL,
|
||||
mb_uuencode_to_wchar,
|
||||
mb_wchar_to_uuencode
|
||||
mb_wchar_to_uuencode,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_uuencode_8bit = {
|
||||
|
|
22
ext/mbstring/libmbfl/filters/utf7_helper.h
Normal file
22
ext/mbstring/libmbfl/filters/utf7_helper.h
Normal file
|
@ -0,0 +1,22 @@
|
|||
#ifndef MBFL_UTF7_HELPER_H
|
||||
#define MBFL_UTF7_HELPER_H
|
||||
|
||||
#include "mbfilter.h"
|
||||
|
||||
/* Ways which a Base64-encoded section can end.
 * All four values lie above the range of a real 6-bit Base64 digit (0-63), so
 * a decoder can return them in-band. */
#define DASH 0xFC    /* terminated by an explicit '-' */
#define DIRECT 0xFD  /* terminated by a character that may also be written directly */
#define ASCII 0xFE   /* terminated by some other 7-bit character */
#define ILLEGAL 0xFF /* NOTE(review): presumably any byte not covered above; confirm in each decode_base64() */

/* Decides whether a Base64 section may end here.
 *
 * n:            sentinel code of the byte which ended the section
 * gap:          true if the padding bits left over from the last digit are non-zero
 * is_surrogate: true if the 2nd half of a surrogate pair is still expected
 *
 * The end is valid only if there are no stray padding bits, no unfinished
 * surrogate pair, and the terminating byte was neither ASCII nor ILLEGAL. */
static inline bool is_base64_end_valid(unsigned char n, bool gap, bool is_surrogate)
{
	return !(gap || is_surrogate || n == ASCII || n == ILLEGAL);
}
|
||||
|
||||
/* Report whether cp begins a surrogate pair.
 *
 * Returns true when is_surrogate is false and cp lies in 0xD800..0xDBFF, the UTF-16
 * high-surrogate range. Returns false for every other code unit, and always returns
 * false when is_surrogate is already true. */
static inline bool has_surrogate(uint16_t cp, bool is_surrogate)
{
	return !is_surrogate && cp >= 0xD800 && cp <= 0xDBFF;
}
|
||||
|
||||
#endif /* MBFL_UTF7_HELPER_H */
|
|
@ -188,6 +188,16 @@ int mbfl_encoding_detector_feed(mbfl_encoding_detector *identd, mbfl_string *str
|
|||
unsigned char *p = string->val;
|
||||
int bad = 0;
|
||||
|
||||
if (identd->strict) {
|
||||
for (int i = 0; i < num; i++) {
|
||||
mbfl_convert_filter *filter = identd->filter_list[i];
|
||||
mbfl_encoding_detector_data *data = &identd->filter_data[i];
|
||||
if (filter->from->check != NULL && !(filter->from->check)(p, n)) {
|
||||
data->num_illegalchars++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
while (n--) {
|
||||
for (int i = 0; i < num; i++) {
|
||||
mbfl_convert_filter *filter = identd->filter_list[i];
|
||||
|
|
|
@ -51,7 +51,8 @@ const mbfl_encoding mbfl_encoding_8bit = {
|
|||
&vtbl_8bit_wchar,
|
||||
&vtbl_wchar_8bit,
|
||||
mb_8bit_to_wchar,
|
||||
mb_wchar_to_8bit
|
||||
mb_wchar_to_8bit,
|
||||
NULL
|
||||
};
|
||||
|
||||
const struct mbfl_convert_vtbl vtbl_8bit_wchar = {
|
||||
|
|
|
@ -44,6 +44,7 @@ const mbfl_encoding mbfl_encoding_pass = {
|
|||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL
|
||||
};
|
||||
|
||||
|
|
|
@ -42,5 +42,6 @@ const mbfl_encoding mbfl_encoding_wchar = {
|
|||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
NULL
|
||||
};
|
||||
|
|
|
@ -144,6 +144,7 @@ typedef struct {
|
|||
|
||||
/* Signature of a routine that converts encoded text into a buffer of wchars (Unicode codepoints).
 * NOTE(review): in/in_len are passed by pointer, presumably so the callee can advance past the
 * input it consumed, and state presumably carries decoder state between calls - confirm against
 * an implementation. */
typedef size_t (*mb_to_wchar_fn)(unsigned char **in, size_t *in_len, uint32_t *out, size_t out_len, unsigned int *state);
|
||||
/* Signature of a routine that takes in_len wchars from in and writes into an mb_convert_buf.
 * NOTE(review): presumably the reverse direction of mb_to_wchar_fn, with end marking the final
 * chunk of input - confirm against an implementation. */
typedef void (*mb_from_wchar_fn)(uint32_t *in, size_t in_len, mb_convert_buf *out, bool end);
|
||||
/* Signature of an optional per-encoding validator that looks at a whole byte string at once.
 * Callers test the pointer for NULL before calling it and treat a false return as "these in_len
 * bytes are not valid in this encoding"; php_mb_check_encoding() returns the result unchanged. */
typedef bool (*mb_check_fn)(unsigned char *in, size_t in_len);
|
||||
|
||||
/* When converting encoded text to a buffer of wchars (Unicode codepoints) using `mb_to_wchar_fn`,
|
||||
* the buffer must be at least this size (to work with all supported text encodings) */
|
||||
|
@ -249,6 +250,7 @@ typedef struct {
|
|||
const struct mbfl_convert_vtbl *output_filter;
|
||||
mb_to_wchar_fn to_wchar;
|
||||
mb_from_wchar_fn from_wchar;
|
||||
mb_check_fn check;
|
||||
} mbfl_encoding;
|
||||
|
||||
extern const mbfl_encoding mbfl_encoding_utf8;
|
||||
|
|
|
@ -3001,6 +3001,18 @@ static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len,
|
|||
return *elist;
|
||||
}
|
||||
|
||||
/* If any candidate encodings have specialized validation functions, use those first
|
||||
* to eliminate as many candidates as possible */
|
||||
if (strict) {
|
||||
for (unsigned int i = 0; i < elist_size; i++) {
|
||||
if (elist[i]->check != NULL && !elist[i]->check(in, in_len)) {
|
||||
elist_size--;
|
||||
memmove(&elist[i], &elist[i+1], (elist_size - i) * sizeof(mbfl_encoding*));
|
||||
i--;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uint32_t wchar_buf[128];
|
||||
struct conversion_data {
|
||||
const mbfl_encoding *enc;
|
||||
|
@ -4510,6 +4522,10 @@ MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const
|
|||
unsigned char *in = (unsigned char*)input;
|
||||
unsigned int state = 0;
|
||||
|
||||
if (encoding->check != NULL) {
|
||||
return encoding->check(in, length);
|
||||
}
|
||||
|
||||
/* If the input string is not encoded in the given encoding, there is a significant chance
|
||||
* that this will be seen in the first bytes. Therefore, rather than converting an entire
|
||||
* buffer of 128 codepoints, convert and check just a few codepoints first */
|
||||
|
|
542
ext/mbstring/tests/gh10192_utf7.phpt
Normal file
542
ext/mbstring/tests/gh10192_utf7.phpt
Normal file
|
@ -0,0 +1,542 @@
|
|||
--TEST--
|
||||
GH-10192 (mb_detect_encoding() results for UTF-7 differ between PHP 8.0 and 8.1)
|
||||
--EXTENSIONS--
|
||||
mbstring
|
||||
--FILE--
|
||||
<?php
|
||||
|
||||
$testcases = [
|
||||
'non-base64 character after +' => 'A + B',
|
||||
'non-base64 character after -' => 'A - B',
|
||||
'base64 character before +' => 'A 1+ B',
|
||||
'base64 character before -' => 'A 1- B',
|
||||
'base64 character after +' => 'A +1 B',
|
||||
'base64 character after -' => 'A -1 B',
|
||||
'base64 character before and after +' => 'A 1+1 B',
|
||||
'base64 character before and after -' => 'A 1-1 B',
|
||||
'string ends with +' => 'A +',
|
||||
'string ends with -' => 'A -',
|
||||
'+ and -' => 'A +- B',
|
||||
'- and +' => 'A -+ B',
|
||||
'valid direct encoding character =' => 'A = B',
|
||||
'invalid direct encoding character ~' => 'A ~ B',
|
||||
'invalid direct encoding character \\' => 'A \\ B',
|
||||
'invalid direct encoding character ESC' => "A \x1b B",
|
||||
'valid direct encoding character = after +' => 'A += B',
|
||||
'invalid direct encoding character ~ after +' => 'A +~ B',
|
||||
'invalid direct encoding character \\ after +' => 'A +\\ B',
|
||||
'invalid direct encoding character ESC after +' => "A +\x1b B",
|
||||
'valid base64 character between + and -' => 'A +ZeVnLIqe- B', // 日本語 in UTF-16BE
|
||||
'invalid base64 character between + and -' => 'A +ZeVnLIq- B', // 日本語 in UTF-16BE without the last character
|
||||
'valid base64 character between + and non-base64 character' => 'A +ZeVnLIqe B',
|
||||
'invalid base64 character between + and non-base64 character' => 'A +ZeVnLIq B',
|
||||
'valid base64 character between + and base64 character' => 'A +ZeVnLIqe1 B',
|
||||
'invalid base64 character between + and base64 character' => 'A +ZeVnLIq1 B',
|
||||
'valid base64 character between + and end of string' => 'A +ZeVnLIqe',
|
||||
'invalid base64 character between + and end of string' => 'A +ZeVnLIq',
|
||||
'valid base64 character consisting only of + between + and -' => 'A +++++++++- B',
|
||||
'invalid base64 character consisting only of + between + and -' => 'A +++++++++- B',
|
||||
'valid base64 character consisting only of + between + and non-base64 character' => 'A +++++++++ B',
|
||||
'invalid base64 character consisting only of + between + and non-base64 character' => 'A +++++++++ B',
|
||||
'valid base64 character consisting only of + between + and base64 character' => 'A +++++++++1 B',
|
||||
'invalid base64 character consisting only of + between + and base64 character' => 'A +++++++++1 B',
|
||||
'valid base64 character consisting only of + between + and end of string' => 'A +++++++++',
|
||||
'invalid base64 character consisting only of + between + and end of string' => 'A +++++++++',
|
||||
'valid base64 character using surrogate pair between + and -' => 'A +2GfePQ- B', // 𩸽 in UTF-16BE
|
||||
'first 16 bits of base64 character using surrogate pair between + and -' => 'A +2Gc- B', // first 16 bits of 𩸽 in UTF-16BE
|
||||
'valid base64 character using surrogate pair between + and non-base64 character' => 'A +2GfePQ B',
|
||||
'first 16 bits of base64 character using surrogate pair between + and non-base64 character' => 'A +2Gc B',
|
||||
'valid base64 character using surrogate pair between + and base64 character' => 'A +2GfePQ1 B',
|
||||
'first 16 bits of base64 character using surrogate pair between + and base64 character' => 'A +2Gc1 B',
|
||||
'valid base64 character using surrogate pair between + and end of string' => 'A +2GfePQ',
|
||||
'first 16 bits of base64 character using surrogate pair between + and end of string' => 'A +2Gc',
|
||||
'invalid base64 character using surrogate pair in reverse order between + and -' => 'A +3j3YZw- B', // 𩸽 in reverse order in UTF-16BE
|
||||
'last 16 bits of base64 character using surrogate pair in reverse order between + and -' => 'A +3j0- B', // last 16 bits of 𩸽 in UTF-16BE
|
||||
'invalid base64 character using surrogate pair in reverse order between + and non-base64 character' => 'A +3j3YZw B',
|
||||
'last 16 bits of base64 character using surrogate pair in reverse order between + and non-base64 character' => 'A +3j0 B',
|
||||
'invalid base64 character using surrogate pair in reverse order between + and base64 character' => 'A +3j3YZw1 B',
|
||||
'last 16 bits of base64 character using surrogate pair in reverse order between + and base64 character' => 'A +3j01 B',
|
||||
'invalid base64 character using surrogate pair in reverse order between + and end of string' => 'A +3j3YZw',
|
||||
'last 16 bits of base64 character using surrogate pair in reverse order between + and end of string' => 'A +3j0'
|
||||
];
|
||||
|
||||
// Each case prints its title and then seven values; the blocks in --EXPECT-- follow this order:
//   1-2. mb_detect_encoding() choosing between UTF-8 and UTF-7, strict first, then non-strict
//   3-4. mb_detect_encoding() with UTF-7 as the only candidate, strict first, then non-strict
//   5.   mb_check_encoding() against UTF-7
//   6.   the case converted from UTF-7 to UTF-8; addcslashes() escapes bytes \0-\37 and \177,
//        so control characters appear as octal escapes (e.g. ESC is printed as \033)
//   7.   mb_get_info('illegal_chars'); the expected values never decrease from one case to the
//        next, so read it as a running total rather than a per-case count
foreach ($testcases as $title => $case) {
    echo $title . PHP_EOL;
    var_dump(mb_detect_encoding($case, 'UTF-8, UTF-7', true));
    var_dump(mb_detect_encoding($case, 'UTF-8, UTF-7', false));
    var_dump(mb_detect_encoding($case, 'UTF-7', true));
    var_dump(mb_detect_encoding($case, 'UTF-7', false));
    var_dump(mb_check_encoding($case, 'UTF-7'));
    var_dump(addcslashes(mb_convert_encoding($case, 'UTF-8', 'UTF-7'), "\0..\37\177"));
    var_dump(mb_get_info('illegal_chars'));
    echo PHP_EOL;
}
|
||||
?>
|
||||
--EXPECT--
|
||||
non-base64 character after +
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(4) "A B"
|
||||
int(0)
|
||||
|
||||
non-base64 character after -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(5) "A - B"
|
||||
int(0)
|
||||
|
||||
base64 character before +
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "A 1 B"
|
||||
int(0)
|
||||
|
||||
base64 character before -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(6) "A 1- B"
|
||||
int(0)
|
||||
|
||||
base64 character after +
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "A ? B"
|
||||
int(1)
|
||||
|
||||
base64 character after -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(6) "A -1 B"
|
||||
int(1)
|
||||
|
||||
base64 character before and after +
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(6) "A 1? B"
|
||||
int(2)
|
||||
|
||||
base64 character before and after -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(7) "A 1-1 B"
|
||||
int(2)
|
||||
|
||||
string ends with +
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(2) "A "
|
||||
int(2)
|
||||
|
||||
string ends with -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(3) "A -"
|
||||
int(2)
|
||||
|
||||
+ and -
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(5) "A + B"
|
||||
int(2)
|
||||
|
||||
- and +
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "A - B"
|
||||
int(2)
|
||||
|
||||
valid direct encoding character =
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(5) "A = B"
|
||||
int(2)
|
||||
|
||||
invalid direct encoding character ~
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "A ~ B"
|
||||
int(2)
|
||||
|
||||
invalid direct encoding character \
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "A \ B"
|
||||
int(2)
|
||||
|
||||
invalid direct encoding character ESC
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(8) "A \033 B"
|
||||
int(2)
|
||||
|
||||
valid direct encoding character = after +
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "A = B"
|
||||
int(2)
|
||||
|
||||
invalid direct encoding character ~ after +
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "A ~ B"
|
||||
int(2)
|
||||
|
||||
invalid direct encoding character \ after +
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "A \ B"
|
||||
int(2)
|
||||
|
||||
invalid direct encoding character ESC after +
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(8) "A \033 B"
|
||||
int(2)
|
||||
|
||||
valid base64 character between + and -
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(13) "A 日本語 B"
|
||||
int(2)
|
||||
|
||||
invalid base64 character between + and -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(11) "A 日本? B"
|
||||
int(3)
|
||||
|
||||
valid base64 character between + and non-base64 character
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(13) "A 日本語 B"
|
||||
int(3)
|
||||
|
||||
invalid base64 character between + and non-base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(11) "A 日本? B"
|
||||
int(4)
|
||||
|
||||
valid base64 character between + and base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(14) "A 日本語? B"
|
||||
int(5)
|
||||
|
||||
invalid base64 character between + and base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(13) "A 日本誵 B"
|
||||
int(5)
|
||||
|
||||
valid base64 character between + and end of string
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(11) "A 日本語"
|
||||
int(5)
|
||||
|
||||
invalid base64 character between + and end of string
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(9) "A 日本?"
|
||||
int(6)
|
||||
|
||||
valid base64 character consisting only of + between + and -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(13) "A ﯯ뻻 B"
|
||||
int(6)
|
||||
|
||||
invalid base64 character consisting only of + between + and -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(13) "A ﯯ뻻 B"
|
||||
int(6)
|
||||
|
||||
valid base64 character consisting only of + between + and non-base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(13) "A ﯯ뻻 B"
|
||||
int(6)
|
||||
|
||||
invalid base64 character consisting only of + between + and non-base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(13) "A ﯯ뻻 B"
|
||||
int(6)
|
||||
|
||||
valid base64 character consisting only of + between + and base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(14) "A ﯯ뻻? B"
|
||||
int(7)
|
||||
|
||||
invalid base64 character consisting only of + between + and base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(14) "A ﯯ뻻? B"
|
||||
int(8)
|
||||
|
||||
valid base64 character consisting only of + between + and end of string
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(11) "A ﯯ뻻"
|
||||
int(8)
|
||||
|
||||
invalid base64 character consisting only of + between + and end of string
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(11) "A ﯯ뻻"
|
||||
int(8)
|
||||
|
||||
valid base64 character using surrogate pair between + and -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(8) "A 𩸽 B"
|
||||
int(8)
|
||||
|
||||
first 16 bits of base64 character using surrogate pair between + and -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "A ? B"
|
||||
int(9)
|
||||
|
||||
valid base64 character using surrogate pair between + and non-base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(8) "A 𩸽 B"
|
||||
int(9)
|
||||
|
||||
first 16 bits of base64 character using surrogate pair between + and non-base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "A ? B"
|
||||
int(10)
|
||||
|
||||
valid base64 character using surrogate pair between + and base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(9) "A 𩸽? B"
|
||||
int(11)
|
||||
|
||||
first 16 bits of base64 character using surrogate pair between + and base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "A ? B"
|
||||
int(12)
|
||||
|
||||
valid base64 character using surrogate pair between + and end of string
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-7"
|
||||
string(5) "UTF-7"
|
||||
bool(true)
|
||||
string(6) "A 𩸽"
|
||||
int(12)
|
||||
|
||||
first 16 bits of base64 character using surrogate pair between + and end of string
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(3) "A ?"
|
||||
int(13)
|
||||
|
||||
invalid base64 character using surrogate pair in reverse order between + and -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(6) "A ?? B"
|
||||
int(15)
|
||||
|
||||
last 16 bits of base64 character using surrogate pair in reverse order between + and -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "A ? B"
|
||||
int(16)
|
||||
|
||||
invalid base64 character using surrogate pair in reverse order between + and non-base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(6) "A ?? B"
|
||||
int(18)
|
||||
|
||||
last 16 bits of base64 character using surrogate pair in reverse order between + and non-base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(5) "A ? B"
|
||||
int(19)
|
||||
|
||||
invalid base64 character using surrogate pair in reverse order between + and base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(6) "A ?? B"
|
||||
int(21)
|
||||
|
||||
last 16 bits of base64 character using surrogate pair in reverse order between + and base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(6) "A ?? B"
|
||||
int(23)
|
||||
|
||||
invalid base64 character using surrogate pair in reverse order between + and end of string
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(4) "A ??"
|
||||
int(25)
|
||||
|
||||
last 16 bits of base64 character using surrogate pair in reverse order between + and end of string
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(5) "UTF-7"
|
||||
bool(false)
|
||||
string(3) "A ?"
|
||||
int(26)
|
423
ext/mbstring/tests/gh10192_utf7imap.phpt
Normal file
423
ext/mbstring/tests/gh10192_utf7imap.phpt
Normal file
|
@ -0,0 +1,423 @@
|
|||
--TEST--
|
||||
GH-10192 (mb_detect_encoding() results for UTF-7 differ between PHP 8.0 and 8.1)
|
||||
--EXTENSIONS--
|
||||
mbstring
|
||||
--FILE--
|
||||
<?php
|
||||
|
||||
$testcases = [
|
||||
'non-base64 character after &' => 'A & B',
|
||||
'non-base64 character after -' => 'A - B',
|
||||
'base64 character before &' => 'A 1& B',
|
||||
'base64 character before -' => 'A 1- B',
|
||||
'base64 character after &' => 'A &1 B',
|
||||
'base64 character after -' => 'A -1 B',
|
||||
'base64 character before and after &' => 'A 1&1 B',
|
||||
'base64 character before and after -' => 'A 1-1 B',
|
||||
'string ends with &' => 'A &',
|
||||
'string ends with -' => 'A -',
|
||||
'& and -' => 'A &- B',
|
||||
'- and &' => 'A -& B',
|
||||
'valid direct encoding character ~' => 'A ~ B',
|
||||
'invalid direct encoding character ESC' => "A \x1b B",
|
||||
'valid direct encoding character ~ after &' => 'A &~ B',
|
||||
'invalid direct encoding character ESC after &' => "A &\x1b B",
|
||||
'valid base64 character between & and -' => 'A &ZeVnLIqe- B', // 日本語 in UTF-16BE
|
||||
'invalid base64 character between & and -' => 'A &ZeVnLIq- B', // 日本語 in UTF-16BE without the last character
|
||||
'valid base64 character between & and non-base64 character' => 'A &ZeVnLIqe B',
|
||||
'invalid base64 character between & and non-base64 character' => 'A &ZeVnLIq B',
|
||||
'valid base64 character between & and base64 character' => 'A &ZeVnLIqe1 B',
|
||||
'invalid base64 character between & and base64 character' => 'A &ZeVnLIq1 B',
|
||||
'valid base64 character between & and end of string' => 'A &ZeVnLIqe',
|
||||
'invalid base64 character between & and end of string' => 'A &ZeVnLIq',
|
||||
'valid base64 character using surrogate pair between & and -' => 'A &2GfePQ- B', // 𩸽 in UTF-16BE
|
||||
'first 16 bits of base64 character using surrogate pair between & and -' => 'A &2Gc- B', // first 16 bits of 𩸽 in UTF-16BE
|
||||
'valid base64 character using surrogate pair between & and non-base64 character' => 'A &2GfePQ B',
|
||||
'first 16 bits of base64 character using surrogate pair between & and non-base64 character' => 'A &2Gc B',
|
||||
'valid base64 character using surrogate pair between & and base64 character' => 'A &2GfePQ1 B',
|
||||
'first 16 bits of base64 character using surrogate pair between & and base64 character' => 'A &2Gc1 B',
|
||||
'valid base64 character using surrogate pair between & and end of string' => 'A &2GfePQ',
|
||||
'first 16 bits of base64 character using surrogate pair between & and end of string' => 'A &2Gc',
|
||||
'invalid base64 character using surrogate pair in reverse order between & and -' => 'A &3j3YZw- B', // 𩸽 in reverse order in UTF-16BE
|
||||
'last 16 bits of base64 character using surrogate pair in reverse order between & and -' => 'A &3j0- B', // last 16 bits of 𩸽 in UTF-16BE
|
||||
'invalid base64 character using surrogate pair in reverse order between & and non-base64 character' => 'A &3j3YZw B',
|
||||
'last 16 bits of base64 character using surrogate pair in reverse order between & and non-base64 character' => 'A &3j0 B',
|
||||
'invalid base64 character using surrogate pair in reverse order between & and base64 character' => 'A &3j3YZw1 B',
|
||||
'last 16 bits of base64 character using surrogate pair in reverse order between & and base64 character' => 'A &3j01 B',
|
||||
'invalid base64 character using surrogate pair in reverse order between & and end of string' => 'A &3j3YZw',
|
||||
'last 16 bits of base64 character using surrogate pair in reverse order between & and end of string' => 'A &3j0'
|
||||
];
|
||||
|
||||
// Each case prints its title and then seven values; the blocks in --EXPECT-- follow this order:
//   1-2. mb_detect_encoding() choosing between UTF-8 and UTF7-IMAP, strict first, then non-strict
//   3-4. mb_detect_encoding() with UTF7-IMAP as the only candidate, strict first, then non-strict
//   5.   mb_check_encoding() against UTF7-IMAP
//   6.   the case converted from UTF7-IMAP to UTF-8; addcslashes() escapes bytes \0-\37 and \177
//   7.   mb_get_info('illegal_chars'); the expected values never decrease from one case to the
//        next, so read it as a running total rather than a per-case count
foreach ($testcases as $title => $case) {
    echo $title . PHP_EOL;
    var_dump(mb_detect_encoding($case, 'UTF-8, UTF7-IMAP', true));
    var_dump(mb_detect_encoding($case, 'UTF-8, UTF7-IMAP', false));
    var_dump(mb_detect_encoding($case, 'UTF7-IMAP', true));
    var_dump(mb_detect_encoding($case, 'UTF7-IMAP', false));
    var_dump(mb_check_encoding($case, 'UTF7-IMAP'));
    var_dump(addcslashes(mb_convert_encoding($case, 'UTF-8', 'UTF7-IMAP'), "\0..\37\177"));
    var_dump(mb_get_info('illegal_chars'));
    echo PHP_EOL;
}
|
||||
|
||||
?>
|
||||
--EXPECT--
|
||||
non-base64 character after &
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(4) "A ?B"
|
||||
int(1)
|
||||
|
||||
non-base64 character after -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(true)
|
||||
string(5) "A - B"
|
||||
int(1)
|
||||
|
||||
base64 character before &
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(5) "A 1?B"
|
||||
int(2)
|
||||
|
||||
base64 character before -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(true)
|
||||
string(6) "A 1- B"
|
||||
int(2)
|
||||
|
||||
base64 character after &
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(4) "A ?B"
|
||||
int(3)
|
||||
|
||||
base64 character after -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(true)
|
||||
string(6) "A -1 B"
|
||||
int(3)
|
||||
|
||||
base64 character before and after &
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(5) "A 1?B"
|
||||
int(4)
|
||||
|
||||
base64 character before and after -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(true)
|
||||
string(7) "A 1-1 B"
|
||||
int(4)
|
||||
|
||||
string ends with &
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(3) "A ?"
|
||||
int(5)
|
||||
|
||||
string ends with -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(true)
|
||||
string(3) "A -"
|
||||
int(5)
|
||||
|
||||
& and -
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(true)
|
||||
string(5) "A & B"
|
||||
int(5)
|
||||
|
||||
- and &
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(5) "A -?B"
|
||||
int(6)
|
||||
|
||||
valid direct encoding character ~
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(true)
|
||||
string(5) "A ~ B"
|
||||
int(6)
|
||||
|
||||
invalid direct encoding character ESC
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(5) "A ? B"
|
||||
int(7)
|
||||
|
||||
valid direct encoding character ~ after &
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(5) "A ? B"
|
||||
int(8)
|
||||
|
||||
invalid direct encoding character ESC after &
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(5) "A ? B"
|
||||
int(9)
|
||||
|
||||
valid base64 character between & and -
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(true)
|
||||
string(13) "A 日本語 B"
|
||||
int(9)
|
||||
|
||||
invalid base64 character between & and -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(11) "A 日本? B"
|
||||
int(10)
|
||||
|
||||
valid base64 character between & and non-base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(13) "A 日本語?B"
|
||||
int(11)
|
||||
|
||||
invalid base64 character between & and non-base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(10) "A 日本?B"
|
||||
int(12)
|
||||
|
||||
valid base64 character between & and base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(13) "A 日本語?B"
|
||||
int(13)
|
||||
|
||||
invalid base64 character between & and base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(13) "A 日本誵?B"
|
||||
int(14)
|
||||
|
||||
valid base64 character between & and end of string
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(12) "A 日本語?"
|
||||
int(15)
|
||||
|
||||
invalid base64 character between & and end of string
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(9) "A 日本?"
|
||||
int(16)
|
||||
|
||||
valid base64 character using surrogate pair between & and -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
string(9) "UTF7-IMAP"
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(true)
|
||||
string(8) "A 𩸽 B"
|
||||
int(16)
|
||||
|
||||
first 16 bits of base64 character using surrogate pair between & and -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(5) "A ? B"
|
||||
int(17)
|
||||
|
||||
valid base64 character using surrogate pair between & and non-base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(8) "A 𩸽?B"
|
||||
int(18)
|
||||
|
||||
first 16 bits of base64 character using surrogate pair between & and non-base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(4) "A ?B"
|
||||
int(19)
|
||||
|
||||
valid base64 character using surrogate pair between & and base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(8) "A 𩸽?B"
|
||||
int(20)
|
||||
|
||||
first 16 bits of base64 character using surrogate pair between & and base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(4) "A ?B"
|
||||
int(21)
|
||||
|
||||
valid base64 character using surrogate pair between & and end of string
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(7) "A 𩸽?"
|
||||
int(22)
|
||||
|
||||
first 16 bits of base64 character using surrogate pair between & and end of string
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(4) "A ??"
|
||||
int(24)
|
||||
|
||||
invalid base64 character using surrogate pair in reverse order between & and -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(6) "A ?? B"
|
||||
int(26)
|
||||
|
||||
last 16 bits of base64 character using surrogate pair in reverse order between & and -
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(5) "A ? B"
|
||||
int(27)
|
||||
|
||||
invalid base64 character using surrogate pair in reverse order between & and non-base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(5) "A ??B"
|
||||
int(29)
|
||||
|
||||
last 16 bits of base64 character using surrogate pair in reverse order between & and non-base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(5) "A ??B"
|
||||
int(31)
|
||||
|
||||
invalid base64 character using surrogate pair in reverse order between & and base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(5) "A ??B"
|
||||
int(33)
|
||||
|
||||
last 16 bits of base64 character using surrogate pair in reverse order between & and base64 character
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(5) "A ??B"
|
||||
int(35)
|
||||
|
||||
invalid base64 character using surrogate pair in reverse order between & and end of string
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(5) "A ???"
|
||||
int(38)
|
||||
|
||||
last 16 bits of base64 character using surrogate pair in reverse order between & and end of string
|
||||
string(5) "UTF-8"
|
||||
string(5) "UTF-8"
|
||||
bool(false)
|
||||
string(9) "UTF7-IMAP"
|
||||
bool(false)
|
||||
string(4) "A ??"
|
||||
int(40)
|
155
ext/mbstring/tests/gh10648.phpt
Normal file
155
ext/mbstring/tests/gh10648.phpt
Normal file
|
@ -0,0 +1,155 @@
|
|||
--TEST--
|
||||
GH-10648 (mb_check_encoding() returns true for incorrect but interpretable ISO-2022-JP byte sequences)
|
||||
--EXTENSIONS--
|
||||
mbstring
|
||||
--FILE--
|
||||
<?php
|
||||
|
||||
$testcases = [
|
||||
'ISO-2022-JP bytes' => '1b244224221b2842', // 'あ' in ISO-2022-JP
|
||||
'ISO-2022-JP bytes without escape sequence' => '1b24422422', // 'あ' in ISO-2022-JP, without the trailing ESC ( B
|
||||
'JIS X 0201 7bit kana with escape sequence' => '1b2849311b2842', // 'ア' in JIS
|
||||
'JIS X 0201 7bit kana with SO/SI' => '0e310f', // 'ア' in JIS
|
||||
'JIS X 0201 8bit kana' => 'b1', // 'ア' in JIS
|
||||
'JIS X 0201 7bit kana with SO and ESC' => '0e311b2842', // 'ア' in JIS
|
||||
'JIS X 0201 7bit kana with ESC and SI' => '1b2849310f', // 'ア' in JIS
|
||||
'JIS X 0208 character' => '1b244242641b2842', // '鯛' in JIS and ISO-2022-JP, included in JIS X 0208
|
||||
'JIS X 0212 character' => '1b2428446a591b2842', // '鮋' in JIS, included in JIS X 0212
|
||||
'JIS X 0213 character' => '1b2428507d4c1b2842', // '𩸽' in ISO-2022-JP-2004, included in JIS X 0213
|
||||
'JIS C 6220-1969 ESC ( H' => '1b284a1b2848', // an escape sequence transitioning to ASCII
|
||||
'SO/SI when not in ASCII mode' => '1b284a0e0f1b2842', // SO and SI sent after ESC ( J, followed by ESC ( B to return to ASCII
|
||||
];
|
||||
|
||||
foreach ($testcases as $title => $case) {
|
||||
echo $title . PHP_EOL;
|
||||
echo 'JIS:' . PHP_EOL;
|
||||
var_dump(mb_check_encoding(hex2bin($case), 'JIS'));
|
||||
echo mb_convert_encoding(hex2bin($case), 'UTF-8', 'JIS'). PHP_EOL;
|
||||
var_dump(mb_get_info('illegal_chars'));
|
||||
echo 'ISO-2022-JP:' . PHP_EOL;
|
||||
var_dump(mb_check_encoding(hex2bin($case), 'ISO-2022-JP'));
|
||||
echo mb_convert_encoding(hex2bin($case), 'UTF-8', 'ISO-2022-JP'). PHP_EOL;
|
||||
var_dump(mb_get_info('illegal_chars'));
|
||||
echo PHP_EOL;
|
||||
}
|
||||
?>
|
||||
--EXPECT--
|
||||
ISO-2022-JP bytes
|
||||
JIS:
|
||||
bool(true)
|
||||
あ
|
||||
int(0)
|
||||
ISO-2022-JP:
|
||||
bool(true)
|
||||
あ
|
||||
int(0)
|
||||
|
||||
ISO-2022-JP bytes without escape sequence
|
||||
JIS:
|
||||
bool(false)
|
||||
あ
|
||||
int(0)
|
||||
ISO-2022-JP:
|
||||
bool(false)
|
||||
あ
|
||||
int(0)
|
||||
|
||||
JIS X 0201 7bit kana with escape sequence
|
||||
JIS:
|
||||
bool(true)
|
||||
ア
|
||||
int(0)
|
||||
ISO-2022-JP:
|
||||
bool(false)
|
||||
ア
|
||||
int(0)
|
||||
|
||||
JIS X 0201 7bit kana with SO/SI
|
||||
JIS:
|
||||
bool(true)
|
||||
ア
|
||||
int(0)
|
||||
ISO-2022-JP:
|
||||
bool(false)
|
||||
ア
|
||||
int(0)
|
||||
|
||||
JIS X 0201 8bit kana
|
||||
JIS:
|
||||
bool(true)
|
||||
ア
|
||||
int(0)
|
||||
ISO-2022-JP:
|
||||
bool(false)
|
||||
ア
|
||||
int(0)
|
||||
|
||||
JIS X 0201 7bit kana with SO and ESC
|
||||
JIS:
|
||||
bool(false)
|
||||
ア
|
||||
int(0)
|
||||
ISO-2022-JP:
|
||||
bool(false)
|
||||
ア
|
||||
int(0)
|
||||
|
||||
JIS X 0201 7bit kana with ESC and SI
|
||||
JIS:
|
||||
bool(false)
|
||||
ア
|
||||
int(0)
|
||||
ISO-2022-JP:
|
||||
bool(false)
|
||||
ア
|
||||
int(0)
|
||||
|
||||
JIS X 0208 character
|
||||
JIS:
|
||||
bool(true)
|
||||
鯛
|
||||
int(0)
|
||||
ISO-2022-JP:
|
||||
bool(true)
|
||||
鯛
|
||||
int(0)
|
||||
|
||||
JIS X 0212 character
|
||||
JIS:
|
||||
bool(true)
|
||||
鮋
|
||||
int(0)
|
||||
ISO-2022-JP:
|
||||
bool(false)
|
||||
鮋
|
||||
int(0)
|
||||
|
||||
JIS X 0213 character
|
||||
JIS:
|
||||
bool(false)
|
||||
?$(P}L
|
||||
int(1)
|
||||
ISO-2022-JP:
|
||||
bool(false)
|
||||
?$(P}L
|
||||
int(2)
|
||||
|
||||
JIS C 6220-1969 ESC ( H
|
||||
JIS:
|
||||
bool(true)
|
||||
|
||||
int(2)
|
||||
ISO-2022-JP:
|
||||
bool(false)
|
||||
|
||||
int(2)
|
||||
|
||||
SO/SI when not in ASCII mode
|
||||
JIS:
|
||||
bool(false)
|
||||
|
||||
int(2)
|
||||
ISO-2022-JP:
|
||||
bool(false)
|
||||
|
||||
int(2)
|
|
@ -50,11 +50,6 @@ function testValid($from, $to, $encoding, $bothWays = true) {
|
|||
/* ESC ( B at the beginning is redundant, since ASCII mode is the default */
|
||||
if (substr($from, 0, 3) == "\x1B(B")
|
||||
$from = substr($from, 3, strlen($from) - 3);
|
||||
/* If the string switches to a different charset, it should switch back to
|
||||
* ASCII at the end */
|
||||
if (strpos($from, "\x1B\$B") !== false || strpos($from, "\x1B(J") !== false)
|
||||
$from .= "\x1B(B";
|
||||
|
||||
convertValidString($to, $from, 'UTF-16BE', $encoding, false);
|
||||
}
|
||||
}
|
||||
|
@ -66,11 +61,11 @@ function testInvalid($from, $to, $encoding) {
|
|||
for ($i = 0; $i < 0x80; $i++) {
|
||||
if ($i == 0xE || $i == 0xF || $i == 0x1B)
|
||||
continue;
|
||||
testValid(chr($i), "\x00" . chr($i), 'JIS');
|
||||
testValid("\x0F" . chr($i), "\x00" . chr($i), 'JIS'); /* 0xF is 'Shift Out' code */
|
||||
testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'JIS');
|
||||
testValid(chr($i), "\x00" . chr($i), 'ISO-2022-JP');
|
||||
testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'ISO-2022-JP');
|
||||
testValid(chr($i), "\x00" . chr($i), 'JIS');
|
||||
convertValidString("\x0F" . chr($i), "\x00" . chr($i), 'JIS', 'UTF-16BE', false); /* 0xF is 'Shift In' code */
|
||||
testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'JIS');
|
||||
testValid(chr($i), "\x00" . chr($i), 'ISO-2022-JP');
|
||||
testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'ISO-2022-JP');
|
||||
}
|
||||
|
||||
for ($i = 0x80; $i < 256; $i++) {
|
||||
|
@ -92,27 +87,27 @@ echo "ASCII support OK\n";
|
|||
foreach ($jisx0201Chars as $jisx0201 => $utf16BE) {
|
||||
if (ord($jisx0201) >= 128) {
|
||||
$kana = chr(ord($jisx0201) - 128);
|
||||
testValid("\x1B(I" . $kana, $utf16BE, 'JIS', false);
|
||||
testValid("\x0E" . $kana, $utf16BE, 'JIS', false); /* 0xE is 'Shift In' code */
|
||||
testValid("\x1B(I" . $kana . "\x1B(B", $utf16BE, 'JIS', false);
|
||||
testValid("\x0E" . $kana . "\x0F", $utf16BE, 'JIS', false); /* 0xE is 'Shift Out' code */
|
||||
testValid($jisx0201, $utf16BE, 'JIS', false);
|
||||
} else {
|
||||
testValid("\x1B(J" . $jisx0201, $utf16BE, 'JIS', $utf16BE > "\x00\x80");
|
||||
testValid("\x1B(J" . $jisx0201 . "\x1B(B", $utf16BE, 'JIS', $utf16BE > "\x00\x80");
|
||||
}
|
||||
}
|
||||
|
||||
for ($i = 0x80; $i < 256; $i++) {
|
||||
if ($i >= 0xA1 && $i <= 0xDF)
|
||||
continue;
|
||||
testInvalid("\x1B(I" . chr($i), "\x00%", 'JIS');
|
||||
testInvalid("\x1B(J" . chr($i), "\x00%", 'JIS');
|
||||
testInvalid("\x1B(I" . chr($i) . "\x1B(B", "\x00%", 'JIS');
|
||||
testInvalid("\x1B(J" . chr($i) . "\x1B(B", "\x00%", 'JIS');
|
||||
}
|
||||
|
||||
echo "JIS X 0201 support OK\n";
|
||||
|
||||
/* All valid JISX0208 characters */
|
||||
foreach ($jisx0208Chars as $jisx0208 => $utf16BE) {
|
||||
testValid("\x1B\$B" . $jisx0208, $utf16BE, 'JIS');
|
||||
testValid("\x1B\$B" . $jisx0208, $utf16BE, 'ISO-2022-JP');
|
||||
testValid("\x1B\$B" . $jisx0208 . "\x1B(B", $utf16BE, 'JIS');
|
||||
testValid("\x1B\$B" . $jisx0208 . "\x1B(B", $utf16BE, 'ISO-2022-JP');
|
||||
}
|
||||
|
||||
/* All invalid 2-byte JISX0208 characters */
|
||||
|
@ -120,8 +115,8 @@ for ($i = 0x21; $i <= 0x7E; $i++) {
|
|||
for ($j = 0; $j < 256; $j++) {
|
||||
$testString = chr($i) . chr($j);
|
||||
if (!isset($jisx0208Chars[$testString])) {
|
||||
testInvalid("\x1B\$B" . $testString, "\x00%", 'JIS');
|
||||
testInvalid("\x1B\$B" . $testString, "\x00%", 'ISO-2022-JP');
|
||||
testInvalid("\x1B\$B" . $testString . "\x1B(B", "\x00%", 'JIS');
|
||||
testInvalid("\x1B\$B" . $testString . "\x1B(B", "\x00%", 'ISO-2022-JP');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -142,7 +137,7 @@ echo "JIS X 0208 support OK\n";
|
|||
|
||||
/* All valid JISX0212 characters */
|
||||
foreach ($jisx0212Chars as $jisx0212 => $utf16BE) {
|
||||
testValid("\x1B\$(D" . $jisx0212, $utf16BE, 'JIS', false);
|
||||
testValid("\x1B\$(D" . $jisx0212 . "\x1B(B", $utf16BE, 'JIS', false);
|
||||
}
|
||||
|
||||
/* All invalid 2-byte JISX0212 characters */
|
||||
|
@ -150,14 +145,14 @@ for ($i = 0x21; $i <= 0x7E; $i++) {
|
|||
for ($j = 0; $j < 256; $j++) {
|
||||
$testString = chr($i) . chr($j);
|
||||
if (!isset($jisx0212Chars[$testString])) {
|
||||
testInvalid("\x1B\$(D" . $testString, "\x00%", 'JIS');
|
||||
testInvalid("\x1B\$(D" . $testString . "\x1B(B", "\x00%", 'JIS');
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Try truncated JISX0212 characters */
|
||||
for ($i = 0x21; $i <= 0x7E; $i++) {
|
||||
testInvalid("\x1B\$(D" . chr($i), "\x00%", 'JIS');
|
||||
testInvalid("\x1B\$(D" . chr($i) . "\x1B(B", "\x00%\x00%", 'JIS');
|
||||
}
|
||||
|
||||
testValidString("\x00\xA1", "\x1B\$(D\x22\x42\x1B(B", "UTF-16BE", "JIS", false);
|
||||
|
@ -167,29 +162,36 @@ convertInvalidString("\x00\xA1", "%", "UTF-16BE", "ISO-2022-JP", false);
|
|||
echo "JIS X 0212 support OK\n";
|
||||
|
||||
/* All possible escape sequences */
|
||||
$validEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B\$(@" => true, "\x1B\$(B" => true, "\x1B\$(D" => true, "\x1B(B" => true, "\x1B(H" => true, "\x1B(J" => true, "\x1B(I" => true];
|
||||
$validJisEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B\$(@" => true, "\x1B\$(B" => true, "\x1B\$(D" => true, "\x1B(B" => true, "\x1B(H" => true, "\x1B(J" => true, "\x1B(I" => true];
|
||||
$validIso2022jpEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B(B" => true, "\x1B(J" => true];
|
||||
for ($i = 0; $i <= 0xFF; $i++) {
|
||||
for ($j = 0; $j <= 0xFF; $j++) {
|
||||
$escapeSequence = "\x1B" . chr($i) . chr($j);
|
||||
if ($escapeSequence === "\x1B\$(")
|
||||
continue;
|
||||
if (isset($validEscapes[$escapeSequence])) {
|
||||
testValid($escapeSequence, "", 'JIS', false);
|
||||
testValid($escapeSequence, "", 'ISO-2022-JP', false);
|
||||
if (isset($validJisEscapes[$escapeSequence])) {
|
||||
testValid($escapeSequence . "\x1B(B", "", 'JIS', false);
|
||||
} else {
|
||||
identifyInvalidString($escapeSequence, 'JIS');
|
||||
identifyInvalidString($escapeSequence, 'ISO-2022-JP');
|
||||
identifyInvalidString($escapeSequence . "\x1B(B", 'JIS');
|
||||
}
|
||||
if (isset($validIso2022jpEscapes[$escapeSequence])) {
|
||||
testValid($escapeSequence . "\x1B(B", "", 'ISO-2022-JP', false);
|
||||
} else {
|
||||
identifyInvalidString($escapeSequence . "\x1B(B", 'ISO-2022-JP');
|
||||
}
|
||||
}
|
||||
}
|
||||
for ($i = 0; $i <= 0xFF; $i++) {
|
||||
$escapeSequence = "\x1B\$(" . chr($i);
|
||||
if (isset($validEscapes[$escapeSequence])) {
|
||||
testValid($escapeSequence, "", 'JIS', false);
|
||||
testValid($escapeSequence, "", 'ISO-2022-JP', false);
|
||||
if (isset($validJisEscapes[$escapeSequence])) {
|
||||
testValid($escapeSequence . "\x1B(B", "", 'JIS', false);
|
||||
} else {
|
||||
identifyInvalidString($escapeSequence, 'JIS');
|
||||
identifyInvalidString($escapeSequence, 'ISO-2022-JP');
|
||||
identifyInvalidString($escapeSequence . "\x1B(B", 'JIS');
|
||||
}
|
||||
if (isset($validIso2022jpEscapes[$escapeSequence])) {
|
||||
testValid($escapeSequence . "\x1B(B", "", 'ISO-2022-JP', false);
|
||||
} else {
|
||||
identifyInvalidString($escapeSequence . "\x1B(B", 'ISO-2022-JP');
|
||||
}
|
||||
}
|
||||
/* Also try a bare ESC */
|
||||
|
|
|
@ -1036,17 +1036,8 @@ testValidString('+' . encode('AB', 'ASCII') . '-+' . encode('CD', 'ASCII') . '-'
|
|||
// (Just trying to be exhaustive here)
|
||||
testValidString('+' . encode('AB', 'ASCII') . '-!+' . encode('CD', 'ASCII') . '-', "\x00A\x00B\x00!\x00C\x00D", 'UTF-7', 'UTF-16BE', false);
|
||||
|
||||
// + section terminated by a non-Base64 ASCII character which is NOT -
|
||||
for ($i = 0; $i < 128; $i++) {
|
||||
if ($i >= ord('A') && $i <= ord('Z'))
|
||||
continue;
|
||||
if ($i >= ord('a') && $i <= ord('z'))
|
||||
continue;
|
||||
if ($i >= ord('0') && $i <= ord('9'))
|
||||
continue;
|
||||
if ($i == ord('+') || $i == ord('/') || $i == ord('-') || $i == ord('\\') || $i == ord('~'))
|
||||
continue;
|
||||
$char = chr($i);
|
||||
// + section terminated by a non-Base64 direct character which is NOT -
|
||||
foreach (str_split(" \t\r\n'(),.:?!\"#$%&*;<=>@[]^_`{|}\x00") as $char) {
|
||||
testValidString('+' . encode("\x12\x34", 'UTF-16BE') . $char, "\x00\x00\x12\x34\x00\x00\x00" . $char, 'UTF-7', 'UTF-32BE', false);
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue