Merge branch 'PHP-8.2'

* PHP-8.2:
  Fix phpGH-10648: add check function pointer into mbfl_encoding
This commit is contained in:
Alex Dowad 2023-03-24 20:44:33 +02:00
commit 0779950768
45 changed files with 1739 additions and 110 deletions

View file

@ -64,7 +64,8 @@ const mbfl_encoding mbfl_encoding_7bit = {
&vtbl_7bit_wchar,
&vtbl_wchar_7bit,
mb_7bit_to_wchar,
mb_wchar_to_7bit
mb_wchar_to_7bit,
NULL
};
#define CK(statement) do { if ((statement) < 0) return (-1); } while (0)

View file

@ -44,7 +44,8 @@ const mbfl_encoding mbfl_encoding_base64 = {
NULL,
NULL,
mb_base64_to_wchar,
mb_wchar_to_base64
mb_wchar_to_base64,
NULL
};
const struct mbfl_convert_vtbl vtbl_8bit_b64 = {

View file

@ -69,7 +69,8 @@ const mbfl_encoding mbfl_encoding_big5 = {
&vtbl_big5_wchar,
&vtbl_wchar_big5,
mb_big5_to_wchar,
mb_wchar_to_big5
mb_wchar_to_big5,
NULL
};
const mbfl_encoding mbfl_encoding_cp950 = {
@ -82,7 +83,8 @@ const mbfl_encoding mbfl_encoding_cp950 = {
&vtbl_cp950_wchar,
&vtbl_wchar_cp950,
mb_cp950_to_wchar,
mb_wchar_to_cp950
mb_wchar_to_cp950,
NULL
};
const struct mbfl_convert_vtbl vtbl_big5_wchar = {

View file

@ -61,7 +61,8 @@ const mbfl_encoding mbfl_encoding_cp50220 = {
&vtbl_cp50220_wchar,
&vtbl_wchar_cp50220,
mb_cp5022x_to_wchar,
mb_wchar_to_cp50220
mb_wchar_to_cp50220,
NULL
};
const mbfl_encoding mbfl_encoding_cp50221 = {
@ -74,7 +75,8 @@ const mbfl_encoding mbfl_encoding_cp50221 = {
&vtbl_cp50221_wchar,
&vtbl_wchar_cp50221,
mb_cp5022x_to_wchar,
mb_wchar_to_cp50221
mb_wchar_to_cp50221,
NULL
};
const mbfl_encoding mbfl_encoding_cp50222 = {
@ -87,7 +89,8 @@ const mbfl_encoding mbfl_encoding_cp50222 = {
&vtbl_cp50222_wchar,
&vtbl_wchar_cp50222,
mb_cp5022x_to_wchar,
mb_wchar_to_cp50222
mb_wchar_to_cp50222,
NULL
};
const struct mbfl_convert_vtbl vtbl_cp50220_wchar = {

View file

@ -69,7 +69,8 @@ const mbfl_encoding mbfl_encoding_cp51932 = {
&vtbl_cp51932_wchar,
&vtbl_wchar_cp51932,
mb_cp51932_to_wchar,
mb_wchar_to_cp51932
mb_wchar_to_cp51932,
NULL
};
const struct mbfl_convert_vtbl vtbl_cp51932_wchar = {

View file

@ -100,7 +100,8 @@ const mbfl_encoding mbfl_encoding_cp932 = {
&vtbl_cp932_wchar,
&vtbl_wchar_cp932,
mb_cp932_to_wchar,
mb_wchar_to_cp932
mb_wchar_to_cp932,
NULL
};
const struct mbfl_convert_vtbl vtbl_cp932_wchar = {
@ -133,7 +134,8 @@ const mbfl_encoding mbfl_encoding_sjiswin = {
&vtbl_sjiswin_wchar,
&vtbl_wchar_sjiswin,
mb_cp932_to_wchar,
mb_wchar_to_sjiswin
mb_wchar_to_sjiswin,
NULL
};
const struct mbfl_convert_vtbl vtbl_sjiswin_wchar = {

View file

@ -68,7 +68,8 @@ const mbfl_encoding mbfl_encoding_cp936 = {
&vtbl_cp936_wchar,
&vtbl_wchar_cp936,
mb_cp936_to_wchar,
mb_wchar_to_cp936
mb_wchar_to_cp936,
NULL
};
const struct mbfl_convert_vtbl vtbl_cp936_wchar = {

View file

@ -67,7 +67,8 @@ const mbfl_encoding mbfl_encoding_euc_cn = {
&vtbl_euccn_wchar,
&vtbl_wchar_euccn,
mb_euccn_to_wchar,
mb_wchar_to_euccn
mb_wchar_to_euccn,
NULL
};
const struct mbfl_convert_vtbl vtbl_euccn_wchar = {

View file

@ -68,7 +68,8 @@ const mbfl_encoding mbfl_encoding_euc_jp = {
&vtbl_eucjp_wchar,
&vtbl_wchar_eucjp,
mb_eucjp_to_wchar,
mb_wchar_to_eucjp
mb_wchar_to_eucjp,
NULL
};
const struct mbfl_convert_vtbl vtbl_eucjp_wchar = {

View file

@ -69,7 +69,8 @@ const mbfl_encoding mbfl_encoding_eucjp_win = {
&vtbl_eucjpwin_wchar,
&vtbl_wchar_eucjpwin,
mb_eucjpwin_to_wchar,
mb_wchar_to_eucjpwin
mb_wchar_to_eucjpwin,
NULL
};
const struct mbfl_convert_vtbl vtbl_eucjpwin_wchar = {

View file

@ -66,7 +66,8 @@ const mbfl_encoding mbfl_encoding_euc_kr = {
&vtbl_euckr_wchar,
&vtbl_wchar_euckr,
mb_euckr_to_wchar,
mb_wchar_to_euckr
mb_wchar_to_euckr,
NULL
};
const struct mbfl_convert_vtbl vtbl_euckr_wchar = {

View file

@ -68,7 +68,8 @@ const mbfl_encoding mbfl_encoding_euc_tw = {
&vtbl_euctw_wchar,
&vtbl_wchar_euctw,
mb_euctw_to_wchar,
mb_wchar_to_euctw
mb_wchar_to_euctw,
NULL
};
const struct mbfl_convert_vtbl vtbl_euctw_wchar = {

View file

@ -49,7 +49,8 @@ const mbfl_encoding mbfl_encoding_gb18030 = {
&vtbl_gb18030_wchar,
&vtbl_wchar_gb18030,
mb_gb18030_to_wchar,
mb_wchar_to_gb18030
mb_wchar_to_gb18030,
NULL
};
const struct mbfl_convert_vtbl vtbl_gb18030_wchar = {

View file

@ -66,7 +66,8 @@ const mbfl_encoding mbfl_encoding_html_ent = {
&vtbl_html_wchar,
&vtbl_wchar_html,
mb_htmlent_to_wchar,
mb_wchar_to_htmlent
mb_wchar_to_htmlent,
NULL
};
const struct mbfl_convert_vtbl vtbl_wchar_html = {

View file

@ -47,7 +47,8 @@ const mbfl_encoding mbfl_encoding_hz = {
&vtbl_hz_wchar,
&vtbl_wchar_hz,
mb_hz_to_wchar,
mb_wchar_to_hz
mb_wchar_to_hz,
NULL
};
const struct mbfl_convert_vtbl vtbl_hz_wchar = {

View file

@ -51,7 +51,8 @@ const mbfl_encoding mbfl_encoding_2022jpms = {
&vtbl_2022jpms_wchar,
&vtbl_wchar_2022jpms,
mb_iso2022jpms_to_wchar,
mb_wchar_to_iso2022jpms
mb_wchar_to_iso2022jpms,
NULL
};
const struct mbfl_convert_vtbl vtbl_2022jpms_wchar = {

View file

@ -54,7 +54,8 @@ const mbfl_encoding mbfl_encoding_2022kr = {
&vtbl_2022kr_wchar,
&vtbl_wchar_2022kr,
mb_iso2022kr_to_wchar,
mb_wchar_to_iso2022kr
mb_wchar_to_iso2022kr,
NULL
};
const struct mbfl_convert_vtbl vtbl_wchar_2022kr = {

View file

@ -71,7 +71,8 @@ const mbfl_encoding mbfl_encoding_2022jp_kddi = {
&vtbl_2022jp_kddi_wchar,
&vtbl_wchar_2022jp_kddi,
mb_iso2022jp_kddi_to_wchar,
mb_wchar_to_iso2022jp_kddi
mb_wchar_to_iso2022jp_kddi,
NULL
};
const struct mbfl_convert_vtbl vtbl_2022jp_kddi_wchar = {

View file

@ -37,6 +37,8 @@ static int mbfl_filt_conv_jis_wchar_flush(mbfl_convert_filter *filter);
static size_t mb_iso2022jp_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_iso2022jp(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
static void mb_wchar_to_jis(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
static bool mb_check_iso2022jp(unsigned char *in, size_t in_len);
static bool mb_check_jis(unsigned char *in, size_t in_len);
const mbfl_encoding mbfl_encoding_jis = {
mbfl_no_encoding_jis,
@ -49,6 +51,7 @@ const mbfl_encoding mbfl_encoding_jis = {
&vtbl_wchar_jis,
mb_iso2022jp_to_wchar,
mb_wchar_to_jis,
mb_check_jis
};
const mbfl_encoding mbfl_encoding_2022jp = {
@ -61,7 +64,8 @@ const mbfl_encoding mbfl_encoding_2022jp = {
&vtbl_2022jp_wchar,
&vtbl_wchar_2022jp,
mb_iso2022jp_to_wchar,
mb_wchar_to_iso2022jp
mb_wchar_to_iso2022jp,
mb_check_iso2022jp
};
const struct mbfl_convert_vtbl vtbl_jis_wchar = {
@ -780,3 +784,161 @@ static void mb_wchar_to_jis(uint32_t *in, size_t len, mb_convert_buf *buf, bool
MB_CONVERT_BUF_STORE(buf, out, limit);
}
#define JISX_0201_KANA_SO 5
/* Validate `in` (of length `in_len`) as JIS-encoded text.
 *
 * The bytes are walked while tracking which character set is currently
 * selected. Returns false as soon as an unknown or truncated escape sequence,
 * a misplaced Kana In/Kana Out marker, an unmapped or truncated double-byte
 * character, or a byte outside the accepted ranges is found. Otherwise returns
 * true only if the text finishes with ASCII selected. */
static bool mb_check_jis(unsigned char *in, size_t in_len)
{
	unsigned char *cur = in, *end = in + in_len;
	unsigned int mode = ASCII;

	while (cur < end) {
		unsigned char lead = *cur++;

		switch (lead) {
		case 0x1B: {
			/* Escape sequence. It may not appear between Kana In and Kana Out,
			 * and at least two more bytes must follow the ESC. */
			if (mode == JISX_0201_KANA_SO || (end - cur) < 2) {
				return false;
			}
			unsigned char kind = *cur++;
			unsigned char final = *cur++;

			if (kind == '$') {
				/* ESC $ @, ESC $ B, ESC $ ( @ and ESC $ ( B select JIS X 0208;
				 * ESC $ ( D selects JIS X 0212 */
				bool four_byte_form = (final == '(');
				if (four_byte_form) {
					if (cur == end) {
						return false;
					}
					final = *cur++;
				}
				if (final == '@' || final == 'B') {
					mode = JISX_0208;
				} else if (four_byte_form && final == 'D') {
					mode = JISX_0212;
				} else {
					return false;
				}
			} else if (kind == '(') {
				switch (final) {
				case 'B':
				case 'H':
					/* ESC ( H is treated as a sequence transitioning to ASCII for historical reasons.
					 * see https://github.com/php/php-src/pull/10828#issuecomment-1478342432. */
					mode = ASCII;
					break;
				case 'J':
					mode = JISX_0201_LATIN;
					break;
				case 'I':
					mode = JISX_0201_KANA;
					break;
				default:
					return false;
				}
			} else {
				return false;
			}
			break;
		}

		case 0x0E:
			/* "Kana In" marker; only allowed while ASCII is selected */
			if (mode != ASCII) {
				return false;
			}
			mode = JISX_0201_KANA_SO;
			break;

		case 0x0F:
			/* "Kana Out" marker; must close a preceding "Kana In" */
			if (mode != JISX_0201_KANA_SO) {
				return false;
			}
			mode = ASCII;
			break;

		default:
			if ((mode == JISX_0208 || mode == JISX_0212) && lead > 0x20 && lead < 0x7F) {
				/* Double-byte character: both bytes must be in 0x21..0x7E and the
				 * pair must have an entry in the matching conversion table */
				if (cur == end) {
					return false;
				}
				unsigned char trail = *cur++;
				if (!(trail > 0x20 && trail < 0x7F)) {
					return false;
				}
				unsigned int idx = (lead - 0x21)*94 + trail - 0x21;
				bool mapped;
				if (mode == JISX_0208) {
					mapped = idx < jisx0208_ucs_table_size && jisx0208_ucs_table[idx];
				} else {
					mapped = idx < jisx0212_ucs_table_size && jisx0212_ucs_table[idx];
				}
				if (!mapped) {
					return false;
				}
			} else if (lead >= 0x80 && !(lead >= 0xA1 && lead <= 0xDF)) {
				/* Of the bytes with the high bit set, only 0xA1..0xDF
				 * (GR-invoked Kana) are accepted */
				return false;
			}
			break;
		}
	}

	return mode == ASCII;
}
/* Validate `in` (of length `in_len`) as ISO-2022-JP text.
 *
 * Only the escape sequences ESC $ @, ESC $ B (JIS X 0208), ESC ( B (ASCII) and
 * ESC ( J (JIS X 0201 Latin) are accepted. Kana In/Kana Out markers and bytes
 * of 0x80 and above are rejected. Returns true only if every byte is accepted
 * and the text finishes with ASCII selected. */
static bool mb_check_iso2022jp(unsigned char *in, size_t in_len)
{
	unsigned char *cur = in, *end = in + in_len;
	unsigned int mode = ASCII;

	while (cur < end) {
		unsigned char lead = *cur++;

		switch (lead) {
		case 0x1B: {
			/* Escape sequence: ESC must be followed by at least two more bytes */
			if ((end - cur) < 2) {
				return false;
			}
			unsigned char kind = *cur++;
			unsigned char final = *cur++;

			if (kind == '$' && (final == '@' || final == 'B')) {
				mode = JISX_0208;
			} else if (kind == '(' && final == 'B') {
				mode = ASCII;
			} else if (kind == '(' && final == 'J') {
				mode = JISX_0201_LATIN;
			} else {
				return false;
			}
			break;
		}

		case 0x0E:
		case 0x0F:
			/* "Kana In" and "Kana Out" markers are not accepted in ISO-2022-JP */
			return false;

		default:
			if (mode == JISX_0208 && lead > 0x20 && lead < 0x7F) {
				/* Double-byte character: both bytes must be in 0x21..0x7E and the
				 * pair must have an entry in the JIS X 0208 conversion table */
				if (cur == end) {
					return false;
				}
				unsigned char trail = *cur++;
				if (!(trail > 0x20 && trail < 0x7F)) {
					return false;
				}
				unsigned int idx = (lead - 0x21)*94 + trail - 0x21;
				if (!(idx < jisx0208_ucs_table_size && jisx0208_ucs_table[idx])) {
					return false;
				}
			} else if (lead >= 0x80) {
				return false;
			}
			break;
		}
	}

	return mode == ASCII;
}

View file

@ -45,7 +45,8 @@ const mbfl_encoding mbfl_encoding_qprint = {
NULL,
NULL,
mb_qprint_to_wchar,
mb_wchar_to_qprint
mb_wchar_to_qprint,
NULL
};
const struct mbfl_convert_vtbl vtbl_8bit_qprint = {

View file

@ -86,7 +86,8 @@ static int mbfl_conv_reverselookup_table(int c, mbfl_convert_filter *filter, int
&vtbl_##id##_wchar, \
&vtbl_wchar_##id, \
mb_##id##_to_wchar, \
mb_wchar_to_##id \
mb_wchar_to_##id, \
NULL \
}
/* For single-byte encodings which use a conversion table */

View file

@ -130,7 +130,8 @@ const mbfl_encoding mbfl_encoding_sjis = {
&vtbl_sjis_wchar,
&vtbl_wchar_sjis,
mb_sjis_to_wchar,
mb_wchar_to_sjis
mb_wchar_to_sjis,
NULL
};
const struct mbfl_convert_vtbl vtbl_sjis_wchar = {

View file

@ -67,7 +67,8 @@ const mbfl_encoding mbfl_encoding_sjis2004 = {
&vtbl_sjis2004_wchar,
&vtbl_wchar_sjis2004,
mb_sjis2004_to_wchar,
mb_wchar_to_sjis2004
mb_wchar_to_sjis2004,
NULL
};
const struct mbfl_convert_vtbl vtbl_sjis2004_wchar = {
@ -100,7 +101,8 @@ const mbfl_encoding mbfl_encoding_eucjp2004 = {
&vtbl_eucjp2004_wchar,
&vtbl_wchar_eucjp2004,
mb_eucjp2004_to_wchar,
mb_wchar_to_eucjp2004
mb_wchar_to_eucjp2004,
NULL
};
const struct mbfl_convert_vtbl vtbl_eucjp2004_wchar = {
@ -133,7 +135,8 @@ const mbfl_encoding mbfl_encoding_2022jp_2004 = {
&vtbl_2022jp_2004_wchar,
&vtbl_wchar_2022jp_2004,
mb_iso2022jp2004_to_wchar,
mb_wchar_to_iso2022jp2004
mb_wchar_to_iso2022jp2004,
NULL
};
const struct mbfl_convert_vtbl vtbl_2022jp_2004_wchar = {

View file

@ -56,7 +56,8 @@ const mbfl_encoding mbfl_encoding_ucs2 = {
&vtbl_ucs2_wchar,
&vtbl_wchar_ucs2,
mb_ucs2_to_wchar,
mb_wchar_to_ucs2be
mb_wchar_to_ucs2be,
NULL
};
const mbfl_encoding mbfl_encoding_ucs2be = {
@ -69,7 +70,8 @@ const mbfl_encoding mbfl_encoding_ucs2be = {
&vtbl_ucs2be_wchar,
&vtbl_wchar_ucs2be,
mb_ucs2be_to_wchar,
mb_wchar_to_ucs2be
mb_wchar_to_ucs2be,
NULL
};
const mbfl_encoding mbfl_encoding_ucs2le = {
@ -82,7 +84,8 @@ const mbfl_encoding mbfl_encoding_ucs2le = {
&vtbl_ucs2le_wchar,
&vtbl_wchar_ucs2le,
mb_ucs2le_to_wchar,
mb_wchar_to_ucs2le
mb_wchar_to_ucs2le,
NULL
};
const struct mbfl_convert_vtbl vtbl_ucs2_wchar = {

View file

@ -56,7 +56,8 @@ const mbfl_encoding mbfl_encoding_ucs4 = {
&vtbl_ucs4_wchar,
&vtbl_wchar_ucs4,
mb_ucs4_to_wchar,
mb_wchar_to_ucs4be
mb_wchar_to_ucs4be,
NULL
};
const mbfl_encoding mbfl_encoding_ucs4be = {
@ -69,7 +70,8 @@ const mbfl_encoding mbfl_encoding_ucs4be = {
&vtbl_ucs4be_wchar,
&vtbl_wchar_ucs4be,
mb_ucs4be_to_wchar,
mb_wchar_to_ucs4be
mb_wchar_to_ucs4be,
NULL
};
const mbfl_encoding mbfl_encoding_ucs4le = {
@ -82,7 +84,8 @@ const mbfl_encoding mbfl_encoding_ucs4le = {
&vtbl_ucs4le_wchar,
&vtbl_wchar_ucs4le,
mb_ucs4le_to_wchar,
mb_wchar_to_ucs4le
mb_wchar_to_ucs4le,
NULL
};
const struct mbfl_convert_vtbl vtbl_ucs4_wchar = {

View file

@ -71,7 +71,8 @@ const mbfl_encoding mbfl_encoding_uhc = {
&vtbl_uhc_wchar,
&vtbl_wchar_uhc,
mb_uhc_to_wchar,
mb_wchar_to_uhc
mb_wchar_to_uhc,
NULL
};
const struct mbfl_convert_vtbl vtbl_uhc_wchar = {

View file

@ -188,7 +188,8 @@ const mbfl_encoding mbfl_encoding_utf16 = {
&vtbl_utf16_wchar,
&vtbl_wchar_utf16,
mb_utf16_to_wchar,
mb_wchar_to_utf16be
mb_wchar_to_utf16be,
NULL
};
const mbfl_encoding mbfl_encoding_utf16be = {
@ -201,7 +202,8 @@ const mbfl_encoding mbfl_encoding_utf16be = {
&vtbl_utf16be_wchar,
&vtbl_wchar_utf16be,
mb_utf16be_to_wchar,
mb_wchar_to_utf16be
mb_wchar_to_utf16be,
NULL
};
const mbfl_encoding mbfl_encoding_utf16le = {
@ -214,7 +216,8 @@ const mbfl_encoding mbfl_encoding_utf16le = {
&vtbl_utf16le_wchar,
&vtbl_wchar_utf16le,
mb_utf16le_to_wchar,
mb_wchar_to_utf16le
mb_wchar_to_utf16le,
NULL
};
const struct mbfl_convert_vtbl vtbl_utf16_wchar = {

View file

@ -49,7 +49,8 @@ const mbfl_encoding mbfl_encoding_utf32 = {
&vtbl_utf32_wchar,
&vtbl_wchar_utf32,
mb_utf32_to_wchar,
mb_wchar_to_utf32be
mb_wchar_to_utf32be,
NULL
};
const mbfl_encoding mbfl_encoding_utf32be = {
@ -62,7 +63,8 @@ const mbfl_encoding mbfl_encoding_utf32be = {
&vtbl_utf32be_wchar,
&vtbl_wchar_utf32be,
mb_utf32be_to_wchar,
mb_wchar_to_utf32be
mb_wchar_to_utf32be,
NULL
};
const mbfl_encoding mbfl_encoding_utf32le = {
@ -75,7 +77,8 @@ const mbfl_encoding mbfl_encoding_utf32le = {
&vtbl_utf32le_wchar,
&vtbl_wchar_utf32le,
mb_utf32le_to_wchar,
mb_wchar_to_utf32le
mb_wchar_to_utf32le,
NULL
};
const struct mbfl_convert_vtbl vtbl_utf32_wchar = {

View file

@ -29,10 +29,12 @@
#include "mbfilter.h"
#include "mbfilter_utf7.h"
#include "utf7_helper.h"
static int mbfl_filt_conv_utf7_wchar_flush(mbfl_convert_filter *filter);
static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_utf7(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
static bool mb_check_utf7(unsigned char *in, size_t in_len);
static const unsigned char mbfl_base64_table[] = {
/* 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', */
@ -59,7 +61,8 @@ const mbfl_encoding mbfl_encoding_utf7 = {
&vtbl_utf7_wchar,
&vtbl_wchar_utf7,
mb_utf7_to_wchar,
mb_wchar_to_utf7
mb_wchar_to_utf7,
mb_check_utf7
};
const struct mbfl_convert_vtbl vtbl_utf7_wchar = {
@ -408,16 +411,24 @@ int mbfl_filt_conv_wchar_utf7_flush(mbfl_convert_filter *filter)
return 0;
}
/* Ways which a Base64-encoded section can end: */
#define DASH 0xFD
#define ASCII 0xFE
#define ILLEGAL 0xFF
/* Tell whether a decoded value is one of the codes which mark the end of a
 * Base64 section, rather than an ordinary 6-bit Base64 value. All such codes
 * are numbered from DASH upwards. */
static inline bool is_base64_end(unsigned char c)
{
	if (c < DASH) {
		return false;
	}
	return true;
}
/* Returns true for the punctuation characters which may appear in UTF-7 text
 * either Base64-encoded or as themselves. */
static bool is_optional_direct(unsigned char c)
{
	switch (c) {
	case '!': case '"': case '#': case '$': case '%':
	case '&': case '*': case ';': case '<': case '=':
	case '>': case '@': case '[': case ']': case '^':
	case '_': case '`': case '{': case '|': case '}':
		return true;
	default:
		return false;
	}
}
/* Returns true if `c` is one of the whitespace or punctuation characters
 * (space, tab, CR, LF, ' ( ) , . : ?) recognised here as able to end a
 * Base64 section. */
static bool can_end_base64(uint32_t c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\r':
	case '\n':
	case '\'':
	case '(':
	case ')':
	case ',':
	case '.':
	case ':':
	case '?':
		return true;
	default:
		return false;
	}
}
static unsigned char decode_base64(unsigned char c)
{
if (c >= 'A' && c <= 'Z') {
@ -432,6 +443,8 @@ static unsigned char decode_base64(unsigned char c)
return 63;
} else if (c == '-') {
return DASH;
} else if (can_end_base64(c) || is_optional_direct(c) || c == '\0') {
return DIRECT;
} else if (c <= 0x7F) {
return ASCII;
}
@ -470,7 +483,7 @@ static uint32_t* handle_base64_end(unsigned char n, unsigned char **p, uint32_t
if (n == ILLEGAL) {
*out++ = MBFL_BAD_INPUT;
} else if (n == ASCII) {
} else if (n == DIRECT || n == ASCII) {
(*p)--; /* Unconsume byte */
}
@ -596,11 +609,6 @@ static size_t mb_utf7_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf
return out - buf;
}
/* Returns true if `c` is one of the whitespace or punctuation characters
 * (space, tab, CR, LF, ' ( ) , . : ?) recognised here as able to end a
 * Base64 section. */
static bool can_end_base64(uint32_t c)
{
	switch (c) {
	case ' ':
	case '\t':
	case '\r':
	case '\n':
	case '\'':
	case '(':
	case ')':
	case ',':
	case '.':
	case ':':
	case '?':
		return true;
	default:
		return false;
	}
}
static bool should_direct_encode(uint32_t c)
{
return (c >= 'A' && c <= 'Z') || (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9') || c == '\0' || c == '/' || c == '-' || can_end_base64(c);
@ -700,3 +708,129 @@ static void mb_wchar_to_utf7(uint32_t *in, size_t len, mb_convert_buf *buf, bool
MB_CONVERT_BUF_STORE(buf, out, limit);
}
/* Check a UTF-16 code unit against the surrogate state.
 * `is_surrogate` is true when the second half of a surrogate pair is expected:
 * in that case `cp` must lie in 0xDC00..0xDFFF; otherwise it must not. */
static bool is_utf16_cp_valid(uint16_t cp, bool is_surrogate)
{
	/* 0xDC00..0xDFFF is exactly the set of values whose top six bits are 110111 */
	bool is_second_half = (cp & 0xFC00) == 0xDC00;
	return is_second_half == is_surrogate;
}
/* Returns true if byte `c` may appear as itself outside of a Base64 section:
 * that is, if `should_direct_encode` or `is_optional_direct` accepts it, or
 * it is a NUL byte. */
static bool can_encode_directly(unsigned char c)
{
	if (should_direct_encode(c)) {
		return true;
	}
	if (is_optional_direct(c)) {
		return true;
	}
	return c == '\0';
}
/* Check whether the `in_len` bytes at `in` are acceptable as UTF-7.
 *
 * The input alternates between directly encoded sections and Base64 sections
 * introduced by '+'. Inside a Base64 section, characters are consumed in
 * groups of up to eight (48 bits), which carry three 16-bit code units; each
 * code unit is checked for correct surrogate pairing, and when a section ends
 * part-way through a group, the leftover padding bits must be zero.
 *
 * Returns true if the whole input is acceptable, false otherwise. */
static bool mb_check_utf7(unsigned char *in, size_t in_len)
{
	unsigned char *p = in, *e = p + in_len;
	/* True while we are inside a Base64 section */
	bool base64 = false;
	/* True when the first half of a surrogate pair has been seen and the
	 * second half is still expected */
	bool is_surrogate = false;

	while (p < e) {
		if (base64) {
			/* Base64 section */
			unsigned char n1 = decode_base64(*p++);
			if (is_base64_end(n1)) {
				/* Section ends on a group boundary, so there are no padding bits */
				if (!is_base64_end_valid(n1, false, is_surrogate)) {
					return false;
				}
				base64 = false;
				continue;
			} else if (p == e) {
				/* A single Base64 character cannot hold a 16-bit code unit */
				return false;
			}
			unsigned char n2 = decode_base64(*p++);
			if (is_base64_end(n2) || p == e) {
				return false;
			}
			unsigned char n3 = decode_base64(*p++);
			if (is_base64_end(n3)) {
				return false;
			}

			/* 1st code unit: 6 bits of n1, 6 bits of n2, upper 4 bits of n3 */
			uint16_t cp1 = (n1 << 10) | (n2 << 4) | ((n3 & 0x3C) >> 2);
			if (!is_utf16_cp_valid(cp1, is_surrogate)) {
				return false;
			}
			is_surrogate = has_surrogate(cp1, is_surrogate);
			if (p == e) {
				/* It is an error if trailing padding bits are not zeroes or if we were
				 * expecting the 2nd part of a surrogate pair when Base64 section ends */
				return !((n3 & 0x3) || is_surrogate);
			}

			unsigned char n4 = decode_base64(*p++);
			if (is_base64_end(n4)) {
				/* The low 2 bits of n3 are padding here */
				if (!is_base64_end_valid(n4, n3 & 0x3, is_surrogate)) {
					return false;
				}
				base64 = false;
				continue;
			} else if (p == e) {
				return false;
			}
			unsigned char n5 = decode_base64(*p++);
			if (is_base64_end(n5) || p == e) {
				return false;
			}
			unsigned char n6 = decode_base64(*p++);
			if (is_base64_end(n6)) {
				return false;
			}

			/* 2nd code unit: lower 2 bits of n3, n4, n5, upper 2 bits of n6
			 * (the upper bits of `n3 << 14` are dropped by the conversion to uint16_t) */
			uint16_t cp2 = (n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4);
			if (!is_utf16_cp_valid(cp2, is_surrogate)) {
				return false;
			}
			is_surrogate = has_surrogate(cp2, is_surrogate);
			if (p == e) {
				/* Same rule as above; this time the low 4 bits of n6 are padding */
				return !((n6 & 0xF) || is_surrogate);
			}

			unsigned char n7 = decode_base64(*p++);
			if (is_base64_end(n7)) {
				if (!is_base64_end_valid(n7, n6 & 0xF, is_surrogate)) {
					return false;
				}
				base64 = false;
				continue;
			} else if (p == e) {
				return false;
			}
			unsigned char n8 = decode_base64(*p++);
			if (is_base64_end(n8)) {
				return false;
			}

			/* 3rd code unit: lower 4 bits of n6, n7, n8; the group is now used up */
			uint16_t cp3 = (n6 << 12) | (n7 << 6) | n8;
			if (!is_utf16_cp_valid(cp3, is_surrogate)) {
				return false;
			}
			is_surrogate = has_surrogate(cp3, is_surrogate);
		} else {
			/* ASCII text section */
			unsigned char c = *p++;
			if (c == '+') {
				if (p == e) {
					/* '+' is the very last byte; there is nothing after it to check */
					return !is_surrogate;
				}
				/* Peek at the byte following '+'; it is only consumed here if it is '-' */
				unsigned char n = decode_base64(*p);
				if (n == DASH) {
					p++;
				} else if (n > DASH) {
					/* A "+" character was followed immediately by a character which is
					 * neither a Base64 character nor "-" */
					return false;
				} else {
					base64 = true;
				}
			} else if (can_encode_directly(c)) {
				continue;
			} else {
				return false;
			}
		}
	}

	return !is_surrogate;
}

View file

@ -77,11 +77,13 @@
#include "mbfilter.h"
#include "mbfilter_utf7imap.h"
#include "utf7_helper.h"
static int mbfl_filt_conv_wchar_utf7imap_flush(mbfl_convert_filter *filter);
static int mbfl_filt_conv_utf7imap_wchar_flush(mbfl_convert_filter *filter);
static size_t mb_utf7imap_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_utf7imap(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
static bool mb_check_utf7imap(unsigned char *in, size_t in_len);
static const char *mbfl_encoding_utf7imap_aliases[] = {"mUTF-7", NULL};
@ -95,7 +97,8 @@ const mbfl_encoding mbfl_encoding_utf7imap = {
&vtbl_utf7imap_wchar,
&vtbl_wchar_utf7imap,
mb_utf7imap_to_wchar,
mb_wchar_to_utf7imap
mb_wchar_to_utf7imap,
mb_check_utf7imap
};
const struct mbfl_convert_vtbl vtbl_utf7imap_wchar = {
@ -444,10 +447,6 @@ static int mbfl_filt_conv_wchar_utf7imap_flush(mbfl_convert_filter *filter)
return 0;
}
/* Ways which a Base64-encoded section can end: */
#define DASH 0xFE
#define ILLEGAL 0xFF
static inline bool is_base64_end(unsigned char c)
{
return c >= DASH;
@ -732,3 +731,124 @@ static void mb_wchar_to_utf7imap(uint32_t *in, size_t len, mb_convert_buf *buf,
MB_CONVERT_BUF_STORE(buf, out, limit);
}
/* Check a UTF-16 code unit found in a Base64 section against the surrogate state.
 *
 * When `is_surrogate` is true (second half of a surrogate pair expected), `cp`
 * must lie in 0xDC00..0xDFFF. Otherwise `cp` must not lie in that range, and
 * must not be in 0x20..0x7E either, with '&' as the only exception. */
static bool is_utf16_cp_valid(uint16_t cp, bool is_surrogate)
{
	bool is_second_half = cp >= 0xDC00 && cp <= 0xDFFF;

	if (is_surrogate) {
		return is_second_half;
	}
	if (is_second_half) {
		/* 2nd part of surrogate pair came unexpectedly */
		return false;
	}

	bool in_printable_range = cp >= 0x20 && cp <= 0x7E;
	return !in_printable_range || cp == '&';
}
/* Check whether the `in_len` bytes at `in` are acceptable as UTF7-IMAP.
 *
 * The input alternates between ASCII sections (bytes 0x20..0x7E) and Base64
 * sections introduced by '&'. Inside a Base64 section, characters are consumed
 * in groups of up to eight (48 bits), which carry three 16-bit code units; each
 * code unit is checked with `is_utf16_cp_valid`, and when a section ends
 * part-way through a group, the leftover padding bits must be zero.
 *
 * Running out of input while inside a Base64 section is always an error.
 * Returns true if the whole input is acceptable, false otherwise. */
static bool mb_check_utf7imap(unsigned char *in, size_t in_len)
{
unsigned char *p = in, *e = p + in_len;
/* True while we are inside a Base64 section */
bool base64 = false;
/* True when the first half of a surrogate pair has been seen and the
 * second half is still expected */
bool is_surrogate = false;
while (p < e) {
if (base64) {
/* Base64 section */
unsigned char n1 = decode_base64(*p++);
if (is_base64_end(n1)) {
/* Section ends on a group boundary, so there are no padding bits */
if (!is_base64_end_valid(n1, false, is_surrogate)) {
return false;
}
base64 = false;
continue;
} else if (p == e) {
return false;
}
unsigned char n2 = decode_base64(*p++);
if (is_base64_end(n2) || p == e) {
return false;
}
unsigned char n3 = decode_base64(*p++);
if (is_base64_end(n3)) {
return false;
}
/* 1st code unit: 6 bits of n1, 6 bits of n2, upper 4 bits of n3 */
uint16_t cp1 = (n1 << 10) | (n2 << 4) | ((n3 & 0x3C) >> 2);
if (!is_utf16_cp_valid(cp1, is_surrogate)) {
return false;
}
is_surrogate = has_surrogate(cp1, is_surrogate);
if (p == e) {
/* Input ended without the Base64 section being terminated */
return false;
}
unsigned char n4 = decode_base64(*p++);
if (is_base64_end(n4)) {
/* The low 2 bits of n3 are padding here and must be zero */
if (!is_base64_end_valid(n4, n3 & 0x3, is_surrogate)) {
return false;
}
base64 = false;
continue;
} else if (p == e) {
return false;
}
unsigned char n5 = decode_base64(*p++);
if (is_base64_end(n5) || p == e) {
return false;
}
unsigned char n6 = decode_base64(*p++);
if (is_base64_end(n6)) {
return false;
}
/* 2nd code unit: lower 2 bits of n3, n4, n5, upper 2 bits of n6
 * (the upper bits of `n3 << 14` are dropped by the conversion to uint16_t) */
uint16_t cp2 = (n3 << 14) | (n4 << 8) | (n5 << 2) | ((n6 & 0x30) >> 4);
if (!is_utf16_cp_valid(cp2, is_surrogate)) {
return false;
}
is_surrogate = has_surrogate(cp2, is_surrogate);
if (p == e) {
/* Input ended without the Base64 section being terminated */
return false;
}
unsigned char n7 = decode_base64(*p++);
if (is_base64_end(n7)) {
/* The low 4 bits of n6 are padding here and must be zero */
if (!is_base64_end_valid(n7, n6 & 0xF, is_surrogate)) {
return false;
}
base64 = false;
continue;
} else if (p == e) {
return false;
}
unsigned char n8 = decode_base64(*p++);
if (is_base64_end(n8)) {
return false;
}
/* 3rd code unit: lower 4 bits of n6, n7, n8; the group is now used up */
uint16_t cp3 = (n6 << 12) | (n7 << 6) | n8;
if (!is_utf16_cp_valid(cp3, is_surrogate)) {
return false;
}
is_surrogate = has_surrogate(cp3, is_surrogate);
} else {
/* ASCII text section */
unsigned char c = *p++;
if (c == '&') {
/* '&' must be followed by at least one more byte */
if (p == e) {
return false;
}
/* Peek at the byte following '&'; it is only consumed here if it is '-' */
unsigned char n = decode_base64(*p);
if (n == DASH) {
/* "&-" is accepted as a unit and we remain in the ASCII section */
p++;
} else if (n == ILLEGAL) {
return false;
} else {
base64 = true;
}
} else if (c >= 0x20 && c <= 0x7E) {
continue;
} else {
return false;
}
}
}
/* The input must not end inside a Base64 section */
return !base64;
}

View file

@ -64,7 +64,8 @@ const mbfl_encoding mbfl_encoding_utf8 = {
&vtbl_utf8_wchar,
&vtbl_wchar_utf8,
mb_utf8_to_wchar,
mb_wchar_to_utf8
mb_wchar_to_utf8,
NULL
};
const struct mbfl_convert_vtbl vtbl_utf8_wchar = {

View file

@ -63,7 +63,8 @@ const mbfl_encoding mbfl_encoding_utf8_docomo = {
&vtbl_utf8_docomo_wchar,
&vtbl_wchar_utf8_docomo,
mb_utf8_docomo_to_wchar,
mb_wchar_to_utf8_docomo
mb_wchar_to_utf8_docomo,
NULL
};
const mbfl_encoding mbfl_encoding_utf8_kddi_a = {
@ -76,7 +77,8 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_a = {
&vtbl_utf8_kddi_a_wchar,
&vtbl_wchar_utf8_kddi_a,
mb_utf8_kddi_a_to_wchar,
mb_wchar_to_utf8_kddi_a
mb_wchar_to_utf8_kddi_a,
NULL
};
const mbfl_encoding mbfl_encoding_utf8_kddi_b = {
@ -89,7 +91,8 @@ const mbfl_encoding mbfl_encoding_utf8_kddi_b = {
&vtbl_utf8_kddi_b_wchar,
&vtbl_wchar_utf8_kddi_b,
mb_utf8_kddi_b_to_wchar,
mb_wchar_to_utf8_kddi_b
mb_wchar_to_utf8_kddi_b,
NULL
};
const mbfl_encoding mbfl_encoding_utf8_sb = {
@ -102,7 +105,8 @@ const mbfl_encoding mbfl_encoding_utf8_sb = {
&vtbl_utf8_sb_wchar,
&vtbl_wchar_utf8_sb,
mb_utf8_sb_to_wchar,
mb_wchar_to_utf8_sb
mb_wchar_to_utf8_sb,
NULL
};
const struct mbfl_convert_vtbl vtbl_utf8_docomo_wchar = {

View file

@ -43,7 +43,8 @@ const mbfl_encoding mbfl_encoding_uuencode = {
NULL,
NULL,
mb_uuencode_to_wchar,
mb_wchar_to_uuencode
mb_wchar_to_uuencode,
NULL
};
const struct mbfl_convert_vtbl vtbl_uuencode_8bit = {

View file

@ -0,0 +1,22 @@
#ifndef MBFL_UTF7_HELPER_H
#define MBFL_UTF7_HELPER_H
#include "mbfilter.h"
/* Codes which identify the different ways a Base64-encoded section can end.
 * They occupy the top of the byte range, with DASH as the lowest value. */
#define DASH 0xFC
#define DIRECT 0xFD
#define ASCII 0xFE
#define ILLEGAL 0xFF

/* Decide whether a Base64 section may legally end at this point.
 *
 * `n` is the code for the byte which ended the section, `gap` is true if
 * non-zero padding bits were left over, and `is_surrogate` is true if the
 * second half of a surrogate pair was still expected. The ending is valid
 * only when neither flag is set and `n` is neither ASCII nor ILLEGAL. */
static inline bool is_base64_end_valid(unsigned char n, bool gap, bool is_surrogate)
{
	if (gap || is_surrogate) {
		return false;
	}
	return n != ASCII && n != ILLEGAL;
}
/* Compute the surrogate state after reading code unit `cp`.
 * Returns true only if no second half was already pending (`is_surrogate` is
 * false) and `cp` lies in 0xD800..0xDBFF, the first half of a surrogate pair. */
static inline bool has_surrogate(uint16_t cp, bool is_surrogate)
{
	if (is_surrogate) {
		return false;
	}
	/* 0xD800..0xDBFF is exactly the set of values whose top six bits are 110110 */
	return (cp & 0xFC00) == 0xD800;
}
#endif /* MBFL_UTF7_HELPER_H */

View file

@ -188,6 +188,16 @@ int mbfl_encoding_detector_feed(mbfl_encoding_detector *identd, mbfl_string *str
unsigned char *p = string->val;
int bad = 0;
if (identd->strict) {
for (int i = 0; i < num; i++) {
mbfl_convert_filter *filter = identd->filter_list[i];
mbfl_encoding_detector_data *data = &identd->filter_data[i];
if (filter->from->check != NULL && !(filter->from->check)(p, n)) {
data->num_illegalchars++;
}
}
}
while (n--) {
for (int i = 0; i < num; i++) {
mbfl_convert_filter *filter = identd->filter_list[i];

View file

@ -51,7 +51,8 @@ const mbfl_encoding mbfl_encoding_8bit = {
&vtbl_8bit_wchar,
&vtbl_wchar_8bit,
mb_8bit_to_wchar,
mb_wchar_to_8bit
mb_wchar_to_8bit,
NULL
};
const struct mbfl_convert_vtbl vtbl_8bit_wchar = {

View file

@ -44,6 +44,7 @@ const mbfl_encoding mbfl_encoding_pass = {
NULL,
NULL,
NULL,
NULL,
NULL
};

View file

@ -42,5 +42,6 @@ const mbfl_encoding mbfl_encoding_wchar = {
NULL,
NULL,
NULL,
NULL,
NULL
};

View file

@ -144,6 +144,7 @@ typedef struct {
typedef size_t (*mb_to_wchar_fn)(unsigned char **in, size_t *in_len, uint32_t *out, size_t out_len, unsigned int *state);
typedef void (*mb_from_wchar_fn)(uint32_t *in, size_t in_len, mb_convert_buf *out, bool end);
typedef bool (*mb_check_fn)(unsigned char *in, size_t in_len);
/* When converting encoded text to a buffer of wchars (Unicode codepoints) using `mb_to_wchar_fn`,
* the buffer must be at least this size (to work with all supported text encodings) */
@ -249,6 +250,7 @@ typedef struct {
const struct mbfl_convert_vtbl *output_filter;
mb_to_wchar_fn to_wchar;
mb_from_wchar_fn from_wchar;
mb_check_fn check;
} mbfl_encoding;
extern const mbfl_encoding mbfl_encoding_utf8;

View file

@ -3001,6 +3001,18 @@ static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len,
return *elist;
}
/* If any candidate encodings have specialized validation functions, use those first
* to eliminate as many candidates as possible */
if (strict) {
for (unsigned int i = 0; i < elist_size; i++) {
if (elist[i]->check != NULL && !elist[i]->check(in, in_len)) {
elist_size--;
memmove(&elist[i], &elist[i+1], (elist_size - i) * sizeof(mbfl_encoding*));
i--;
}
}
}
uint32_t wchar_buf[128];
struct conversion_data {
const mbfl_encoding *enc;
@ -4510,6 +4522,10 @@ MBSTRING_API bool php_mb_check_encoding(const char *input, size_t length, const
unsigned char *in = (unsigned char*)input;
unsigned int state = 0;
if (encoding->check != NULL) {
return encoding->check(in, length);
}
/* If the input string is not encoded in the given encoding, there is a significant chance
* that this will be seen in the first bytes. Therefore, rather than converting an entire
* buffer of 128 codepoints, convert and check just a few codepoints first */

View file

@ -0,0 +1,542 @@
--TEST--
GH-10192 (mb_detect_encoding() results for UTF-7 differ between PHP 8.0 and 8.1)
--EXTENSIONS--
mbstring
--FILE--
<?php
// Map of human-readable label => raw input string. The loop that follows feeds every
// input to mb_detect_encoding(), mb_check_encoding() and mb_convert_encoding() as UTF-7.
$testcases = [
'non-base64 character after +' => 'A + B',
'non-base64 character after -' => 'A - B',
'base64 character before +' => 'A 1+ B',
'base64 character before -' => 'A 1- B',
'base64 character after +' => 'A +1 B',
'base64 character after -' => 'A -1 B',
'base64 character before and after +' => 'A 1+1 B',
'base64 character before and after -' => 'A 1-1 B',
'string ends with +' => 'A +',
'string ends with -' => 'A -',
'+ and -' => 'A +- B',
'- and +' => 'A -+ B',
'valid direct encoding character =' => 'A = B',
'invalid direct encoding character ~' => 'A ~ B',
'invalid direct encoding character \\' => 'A \\ B',
'invalid direct encoding character ESC' => "A \x1b B",
'valid direct encoding character = after +' => 'A += B',
'invalid direct encoding character ~ after +' => 'A +~ B',
'invalid direct encoding character \\ after +' => 'A +\\ B',
'invalid direct encoding character ESC after +' => "A +\x1b B",
'valid base64 character between + and -' => 'A +ZeVnLIqe- B', // 日本語 in UTF-16BE
'invalid base64 character between + and -' => 'A +ZeVnLIq- B', // 日本語 in UTF-16BE without the last character
'valid base64 character between + and non-base64 character' => 'A +ZeVnLIqe B',
'invalid base64 character between + and non-base64 character' => 'A +ZeVnLIq B',
'valid base64 character between + and base64 character' => 'A +ZeVnLIqe1 B',
'invalid base64 character between + and base64 character' => 'A +ZeVnLIq1 B',
'valid base64 character between + and end of string' => 'A +ZeVnLIqe',
'invalid base64 character between + and end of string' => 'A +ZeVnLIq',
// NOTE(review): in each of the four "valid"/"invalid" pairs below ("consisting only of +"),
// both entries carry the exact same input string, so the paired cases exercise identical
// bytes. Confirm whether the "invalid" inputs were meant to differ.
'valid base64 character consisting only of + between + and -' => 'A +++++++++- B',
'invalid base64 character consisting only of + between + and -' => 'A +++++++++- B',
'valid base64 character consisting only of + between + and non-base64 character' => 'A +++++++++ B',
'invalid base64 character consisting only of + between + and non-base64 character' => 'A +++++++++ B',
'valid base64 character consisting only of + between + and base64 character' => 'A +++++++++1 B',
'invalid base64 character consisting only of + between + and base64 character' => 'A +++++++++1 B',
'valid base64 character consisting only of + between + and end of string' => 'A +++++++++',
'invalid base64 character consisting only of + between + and end of string' => 'A +++++++++',
'valid base64 character using surrogate pair between + and -' => 'A +2GfePQ- B', // 𩸽 in UTF-16BE
'first 16 bits of base64 character using surrogate pair between + and -' => 'A +2Gc- B', // first 16 bits of 𩸽 in UTF-16BE
'valid base64 character using surrogate pair between + and non-base64 character' => 'A +2GfePQ B',
'first 16 bits of base64 character using surrogate pair between + and non-base64 character' => 'A +2Gc B',
'valid base64 character using surrogate pair between + and base64 character' => 'A +2GfePQ1 B',
'first 16 bits of base64 character using surrogate pair between + and base64 character' => 'A +2Gc1 B',
'valid base64 character using surrogate pair between + and end of string' => 'A +2GfePQ',
'first 16 bits of base64 character using surrogate pair between + and end of string' => 'A +2Gc',
'invalid base64 character using surrogate pair in reverse order between + and -' => 'A +3j3YZw- B', // 𩸽 in reverse order in UTF-16BE
'last 16 bits of base64 character using surrogate pair in reverse order between + and -' => 'A +3j0- B', // last 16 bits of 𩸽 in UTF-16BE
'invalid base64 character using surrogate pair in reverse order between + and non-base64 character' => 'A +3j3YZw B',
'last 16 bits of base64 character using surrogate pair in reverse order between + and non-base64 character' => 'A +3j0 B',
'invalid base64 character using surrogate pair in reverse order between + and base64 character' => 'A +3j3YZw1 B',
'last 16 bits of base64 character using surrogate pair in reverse order between + and base64 character' => 'A +3j01 B',
'invalid base64 character using surrogate pair in reverse order between + and end of string' => 'A +3j3YZw',
'last 16 bits of base64 character using surrogate pair in reverse order between + and end of string' => 'A +3j0'
];
// For each sample: print its label, dump the detected encoding for both candidate
// lists in strict and then non-strict mode, dump the UTF-7 validity check, dump the
// UTF-8 conversion (control bytes escaped for readability) and finally the running
// illegal-character counter. The call order is significant because that counter
// is read after the conversion.
foreach ($testcases as $label => $input) {
    echo $label, PHP_EOL;

    foreach (['UTF-8, UTF-7', 'UTF-7'] as $candidates) {
        foreach ([true, false] as $strict) {
            var_dump(mb_detect_encoding($input, $candidates, $strict));
        }
    }

    var_dump(mb_check_encoding($input, 'UTF-7'));

    $converted = mb_convert_encoding($input, 'UTF-8', 'UTF-7');
    var_dump(addcslashes($converted, "\0..\37\177"));

    var_dump(mb_get_info('illegal_chars'));
    echo PHP_EOL;
}
?>
--EXPECT--
non-base64 character after +
string(5) "UTF-8"
string(5) "UTF-7"
bool(false)
string(5) "UTF-7"
bool(false)
string(4) "A B"
int(0)
non-base64 character after -
string(5) "UTF-8"
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-7"
bool(true)
string(5) "A - B"
int(0)
base64 character before +
string(5) "UTF-8"
string(5) "UTF-7"
bool(false)
string(5) "UTF-7"
bool(false)
string(5) "A 1 B"
int(0)
base64 character before -
string(5) "UTF-8"
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-7"
bool(true)
string(6) "A 1- B"
int(0)
base64 character after +
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
string(5) "A ? B"
int(1)
base64 character after -
string(5) "UTF-8"
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-7"
bool(true)
string(6) "A -1 B"
int(1)
base64 character before and after +
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
string(6) "A 1? B"
int(2)
base64 character before and after -
string(5) "UTF-8"
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-7"
bool(true)
string(7) "A 1-1 B"
int(2)
string ends with +
string(5) "UTF-7"
string(5) "UTF-7"
string(5) "UTF-7"
string(5) "UTF-7"
bool(true)
string(2) "A "
int(2)
string ends with -
string(5) "UTF-8"
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-7"
bool(true)
string(3) "A -"
int(2)
+ and -
string(5) "UTF-7"
string(5) "UTF-7"
string(5) "UTF-7"
string(5) "UTF-7"
bool(true)
string(5) "A + B"
int(2)
- and +
string(5) "UTF-8"
string(5) "UTF-7"
bool(false)
string(5) "UTF-7"
bool(false)
string(5) "A - B"
int(2)
valid direct encoding character =
string(5) "UTF-8"
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-7"
bool(true)
string(5) "A = B"
int(2)
invalid direct encoding character ~
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
string(5) "A ~ B"
int(2)
invalid direct encoding character \
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
string(5) "A \ B"
int(2)
invalid direct encoding character ESC
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
string(8) "A \033 B"
int(2)
valid direct encoding character = after +
string(5) "UTF-8"
string(5) "UTF-7"
bool(false)
string(5) "UTF-7"
bool(false)
string(5) "A = B"
int(2)
invalid direct encoding character ~ after +
string(5) "UTF-8"
string(5) "UTF-7"
bool(false)
string(5) "UTF-7"
bool(false)
string(5) "A ~ B"
int(2)
invalid direct encoding character \ after +
string(5) "UTF-8"
string(5) "UTF-7"
bool(false)
string(5) "UTF-7"
bool(false)
string(5) "A \ B"
int(2)
invalid direct encoding character ESC after +
string(5) "UTF-8"
string(5) "UTF-7"
bool(false)
string(5) "UTF-7"
bool(false)
string(8) "A \033 B"
int(2)
valid base64 character between + and -
string(5) "UTF-7"
string(5) "UTF-7"
string(5) "UTF-7"
string(5) "UTF-7"
bool(true)
string(13) "A 日本語 B"
int(2)
invalid base64 character between + and -
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
string(11) "A 日本? B"
int(3)
valid base64 character between + and non-base64 character
string(5) "UTF-7"
string(5) "UTF-7"
string(5) "UTF-7"
string(5) "UTF-7"
bool(true)
string(13) "A 日本語 B"
int(3)
invalid base64 character between + and non-base64 character
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
string(11) "A 日本? B"
int(4)
valid base64 character between + and base64 character
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
string(14) "A 日本語? B"
int(5)
invalid base64 character between + and base64 character
string(5) "UTF-8"
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-7"
bool(true)
string(13) "A 日本誵 B"
int(5)
valid base64 character between + and end of string
string(5) "UTF-7"
string(5) "UTF-7"
string(5) "UTF-7"
string(5) "UTF-7"
bool(true)
string(11) "A 日本語"
int(5)
invalid base64 character between + and end of string
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
string(9) "A 日本?"
int(6)
valid base64 character consisting only of + between + and -
string(5) "UTF-8"
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-7"
bool(true)
string(13) "A ﯯ뻻 B"
int(6)
invalid base64 character consisting only of + between + and -
string(5) "UTF-8"
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-7"
bool(true)
string(13) "A ﯯ뻻 B"
int(6)
valid base64 character consisting only of + between + and non-base64 character
string(5) "UTF-8"
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-7"
bool(true)
string(13) "A ﯯ뻻 B"
int(6)
invalid base64 character consisting only of + between + and non-base64 character
string(5) "UTF-8"
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-7"
bool(true)
string(13) "A ﯯ뻻 B"
int(6)
valid base64 character consisting only of + between + and base64 character
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
string(14) "A ﯯ뻻? B"
int(7)
invalid base64 character consisting only of + between + and base64 character
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
string(14) "A ﯯ뻻? B"
int(8)
valid base64 character consisting only of + between + and end of string
string(5) "UTF-8"
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-7"
bool(true)
string(11) "A ﯯ뻻"
int(8)
invalid base64 character consisting only of + between + and end of string
string(5) "UTF-8"
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-7"
bool(true)
string(11) "A ﯯ뻻"
int(8)
valid base64 character using surrogate pair between + and -
string(5) "UTF-8"
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-7"
bool(true)
string(8) "A 𩸽 B"
int(8)
first 16 bits of base64 character using surrogate pair between + and -
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
string(5) "A ? B"
int(9)
valid base64 character using surrogate pair between + and non-base64 character
string(5) "UTF-8"
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-7"
bool(true)
string(8) "A 𩸽 B"
int(9)
first 16 bits of base64 character using surrogate pair between + and non-base64 character
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
string(5) "A ? B"
int(10)
valid base64 character using surrogate pair between + and base64 character
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
string(9) "A 𩸽? B"
int(11)
first 16 bits of base64 character using surrogate pair between + and base64 character
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
string(5) "A ? B"
int(12)
valid base64 character using surrogate pair between + and end of string
string(5) "UTF-8"
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-7"
bool(true)
string(6) "A 𩸽"
int(12)
first 16 bits of base64 character using surrogate pair between + and end of string
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
string(3) "A ?"
int(13)
invalid base64 character using surrogate pair in reverse order between + and -
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
string(6) "A ?? B"
int(15)
last 16 bits of base64 character using surrogate pair in reverse order between + and -
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
string(5) "A ? B"
int(16)
invalid base64 character using surrogate pair in reverse order between + and non-base64 character
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
string(6) "A ?? B"
int(18)
last 16 bits of base64 character using surrogate pair in reverse order between + and non-base64 character
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
string(5) "A ? B"
int(19)
invalid base64 character using surrogate pair in reverse order between + and base64 character
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
string(6) "A ?? B"
int(21)
last 16 bits of base64 character using surrogate pair in reverse order between + and base64 character
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
string(6) "A ?? B"
int(23)
invalid base64 character using surrogate pair in reverse order between + and end of string
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
string(4) "A ??"
int(25)
last 16 bits of base64 character using surrogate pair in reverse order between + and end of string
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
string(3) "A ?"
int(26)

View file

@ -0,0 +1,423 @@
--TEST--
GH-10192 (mb_detect_encoding() results for UTF-7 differ between PHP 8.0 and 8.1)
--EXTENSIONS--
mbstring
--FILE--
<?php
// Map of human-readable label => raw input string. The loop that follows feeds every
// input to mb_detect_encoding(), mb_check_encoding() and mb_convert_encoding() as UTF7-IMAP.
// The labels describe where '&', '-' and Base64 characters sit relative to each other.
$testcases = [
'non-base64 character after &' => 'A & B',
'non-base64 character after -' => 'A - B',
'base64 character before &' => 'A 1& B',
'base64 character before -' => 'A 1- B',
'base64 character after &' => 'A &1 B',
'base64 character after -' => 'A -1 B',
'base64 character before and after &' => 'A 1&1 B',
'base64 character before and after -' => 'A 1-1 B',
'string ends with &' => 'A &',
'string ends with -' => 'A -',
'& and -' => 'A &- B',
'- and &' => 'A -& B',
'valid direct encoding character ~' => 'A ~ B',
'invalid direct encoding character ESC' => "A \x1b B",
'valid direct encoding character ~ after &' => 'A &~ B',
'invalid direct encoding character ESC after &' => "A &\x1b B",
'valid base64 character between & and -' => 'A &ZeVnLIqe- B', // 日本語 in UTF-16BE
'invalid base64 character between & and -' => 'A &ZeVnLIq- B', // 日本語 in UTF-16BE without the last character
'valid base64 character between & and non-base64 character' => 'A &ZeVnLIqe B',
'invalid base64 character between & and non-base64 character' => 'A &ZeVnLIq B',
'valid base64 character between & and base64 character' => 'A &ZeVnLIqe1 B',
'invalid base64 character between & and base64 character' => 'A &ZeVnLIq1 B',
'valid base64 character between & and end of string' => 'A &ZeVnLIqe',
'invalid base64 character between & and end of string' => 'A &ZeVnLIq',
'valid base64 character using surrogate pair between & and -' => 'A &2GfePQ- B', // 𩸽 in UTF-16BE
'first 16 bits of base64 character using surrogate pair between & and -' => 'A &2Gc- B', // first 16 bits of 𩸽 in UTF-16BE
'valid base64 character using surrogate pair between & and non-base64 character' => 'A &2GfePQ B',
'first 16 bits of base64 character using surrogate pair between & and non-base64 character' => 'A &2Gc B',
'valid base64 character using surrogate pair between & and base64 character' => 'A &2GfePQ1 B',
'first 16 bits of base64 character using surrogate pair between & and base64 character' => 'A &2Gc1 B',
'valid base64 character using surrogate pair between & and end of string' => 'A &2GfePQ',
'first 16 bits of base64 character using surrogate pair between & and end of string' => 'A &2Gc',
'invalid base64 character using surrogate pair in reverse order between & and -' => 'A &3j3YZw- B', // 𩸽 in reverse order in UTF-16BE
'last 16 bits of base64 character using surrogate pair in reverse order between & and -' => 'A &3j0- B', // last 16 bits of 𩸽 in UTF-16BE
'invalid base64 character using surrogate pair in reverse order between & and non-base64 character' => 'A &3j3YZw B',
'last 16 bits of base64 character using surrogate pair in reverse order between & and non-base64 character' => 'A &3j0 B',
'invalid base64 character using surrogate pair in reverse order between & and base64 character' => 'A &3j3YZw1 B',
'last 16 bits of base64 character using surrogate pair in reverse order between & and base64 character' => 'A &3j01 B',
'invalid base64 character using surrogate pair in reverse order between & and end of string' => 'A &3j3YZw',
'last 16 bits of base64 character using surrogate pair in reverse order between & and end of string' => 'A &3j0'
];
// For each sample: print its label, dump the detected encoding for both candidate
// lists in strict and then non-strict mode, dump the UTF7-IMAP validity check, dump
// the UTF-8 conversion (control bytes escaped for readability) and finally the
// running illegal-character counter. The call order is significant because that
// counter is read after the conversion.
foreach ($testcases as $label => $input) {
    echo $label, PHP_EOL;

    foreach (['UTF-8, UTF7-IMAP', 'UTF7-IMAP'] as $candidates) {
        foreach ([true, false] as $strict) {
            var_dump(mb_detect_encoding($input, $candidates, $strict));
        }
    }

    var_dump(mb_check_encoding($input, 'UTF7-IMAP'));

    $converted = mb_convert_encoding($input, 'UTF-8', 'UTF7-IMAP');
    var_dump(addcslashes($converted, "\0..\37\177"));

    var_dump(mb_get_info('illegal_chars'));
    echo PHP_EOL;
}
?>
--EXPECT--
non-base64 character after &
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(4) "A ?B"
int(1)
non-base64 character after -
string(5) "UTF-8"
string(5) "UTF-8"
string(9) "UTF7-IMAP"
string(9) "UTF7-IMAP"
bool(true)
string(5) "A - B"
int(1)
base64 character before &
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(5) "A 1?B"
int(2)
base64 character before -
string(5) "UTF-8"
string(5) "UTF-8"
string(9) "UTF7-IMAP"
string(9) "UTF7-IMAP"
bool(true)
string(6) "A 1- B"
int(2)
base64 character after &
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(4) "A ?B"
int(3)
base64 character after -
string(5) "UTF-8"
string(5) "UTF-8"
string(9) "UTF7-IMAP"
string(9) "UTF7-IMAP"
bool(true)
string(6) "A -1 B"
int(3)
base64 character before and after &
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(5) "A 1?B"
int(4)
base64 character before and after -
string(5) "UTF-8"
string(5) "UTF-8"
string(9) "UTF7-IMAP"
string(9) "UTF7-IMAP"
bool(true)
string(7) "A 1-1 B"
int(4)
string ends with &
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(3) "A ?"
int(5)
string ends with -
string(5) "UTF-8"
string(5) "UTF-8"
string(9) "UTF7-IMAP"
string(9) "UTF7-IMAP"
bool(true)
string(3) "A -"
int(5)
& and -
string(9) "UTF7-IMAP"
string(9) "UTF7-IMAP"
string(9) "UTF7-IMAP"
string(9) "UTF7-IMAP"
bool(true)
string(5) "A & B"
int(5)
- and &
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(5) "A -?B"
int(6)
valid direct encoding character ~
string(5) "UTF-8"
string(5) "UTF-8"
string(9) "UTF7-IMAP"
string(9) "UTF7-IMAP"
bool(true)
string(5) "A ~ B"
int(6)
invalid direct encoding character ESC
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(5) "A ? B"
int(7)
valid direct encoding character ~ after &
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(5) "A ? B"
int(8)
invalid direct encoding character ESC after &
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(5) "A ? B"
int(9)
valid base64 character between & and -
string(9) "UTF7-IMAP"
string(9) "UTF7-IMAP"
string(9) "UTF7-IMAP"
string(9) "UTF7-IMAP"
bool(true)
string(13) "A 日本語 B"
int(9)
invalid base64 character between & and -
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(11) "A 日本? B"
int(10)
valid base64 character between & and non-base64 character
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(13) "A 日本語?B"
int(11)
invalid base64 character between & and non-base64 character
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(10) "A 日本?B"
int(12)
valid base64 character between & and base64 character
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(13) "A 日本語?B"
int(13)
invalid base64 character between & and base64 character
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(13) "A 日本誵?B"
int(14)
valid base64 character between & and end of string
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(12) "A 日本語?"
int(15)
invalid base64 character between & and end of string
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(9) "A 日本?"
int(16)
valid base64 character using surrogate pair between & and -
string(5) "UTF-8"
string(5) "UTF-8"
string(9) "UTF7-IMAP"
string(9) "UTF7-IMAP"
bool(true)
string(8) "A 𩸽 B"
int(16)
first 16 bits of base64 character using surrogate pair between & and -
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(5) "A ? B"
int(17)
valid base64 character using surrogate pair between & and non-base64 character
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(8) "A 𩸽?B"
int(18)
first 16 bits of base64 character using surrogate pair between & and non-base64 character
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(4) "A ?B"
int(19)
valid base64 character using surrogate pair between & and base64 character
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(8) "A 𩸽?B"
int(20)
first 16 bits of base64 character using surrogate pair between & and base64 character
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(4) "A ?B"
int(21)
valid base64 character using surrogate pair between & and end of string
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(7) "A 𩸽?"
int(22)
first 16 bits of base64 character using surrogate pair between & and end of string
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(4) "A ??"
int(24)
invalid base64 character using surrogate pair in reverse order between & and -
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(6) "A ?? B"
int(26)
last 16 bits of base64 character using surrogate pair in reverse order between & and -
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(5) "A ? B"
int(27)
invalid base64 character using surrogate pair in reverse order between & and non-base64 character
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(5) "A ??B"
int(29)
last 16 bits of base64 character using surrogate pair in reverse order between & and non-base64 character
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(5) "A ??B"
int(31)
invalid base64 character using surrogate pair in reverse order between & and base64 character
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(5) "A ??B"
int(33)
last 16 bits of base64 character using surrogate pair in reverse order between & and base64 character
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(5) "A ??B"
int(35)
invalid base64 character using surrogate pair in reverse order between & and end of string
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(5) "A ???"
int(38)
last 16 bits of base64 character using surrogate pair in reverse order between & and end of string
string(5) "UTF-8"
string(5) "UTF-8"
bool(false)
string(9) "UTF7-IMAP"
bool(false)
string(4) "A ??"
int(40)

View file

@ -0,0 +1,155 @@
--TEST--
GH-10648 (mb_check_encoding() returns true for incorrect but interpretable ISO-2022-JP byte sequences)
--EXTENSIONS--
mbstring
--FILE--
<?php
// Map of human-readable label => hex dump of the input bytes (decoded with hex2bin()
// by the loop that follows). In the hex strings, 1b is ESC, 0e is SO and 0f is SI;
// e.g. 1b2442 is "ESC $ B" and 1b2842 is "ESC ( B".
$testcases = [
'ISO-2022-JP bytes' => '1b244224221b2842', // 'あ' in ISO-2022-JP
'ISO-2022-JP bytes without escape sequence' => '1b24422422', // same bytes as the entry above, minus the trailing ESC ( B (1b2842)
'JIS X 0201 7bit kana with escape sequence' => '1b2849311b2842', // 'ア' in JIS
'JIS X 0201 7bit kana with SO/SI' => '0e310f', // 'ア' in JIS
'JIS X 0201 8bit kana' => 'b1', // 'ア' in JIS
'JIS X 0201 7bit kana with SO and ESC' => '0e311b2842', // 'ア' in JIS
'JIS X 0201 7bit kana with ESC and SI' => '1b2849310f', // 'ア' in JIS
'JIS X 0208 character' => '1b244242641b2842', // '鯛' in JIS and ISO-2022-JP, included in JIS X 0208
'JIS X 0212 character' => '1b2428446a591b2842', // '鮋' in JIS, included in JIS X 0212
'JIS X 0213 character' => '1b2428507d4c1b2842', // '𩸽' in ISO-2022-JP-2004, included in JIS X 0213
'JIS C 6220-1969 ESC ( H' => '1b284a1b2848', // ESC ( J followed by ESC ( H, an escape sequence transitioning to ASCII
'SO/SI when not in ASCII mode' => '1b284a0e0f1b2842', // ESC ( J, then SO and SI, then ESC ( B
];
// For each sample: print its label, then for JIS and ISO-2022-JP in turn print the
// encoding name, the validity check result, the UTF-8 conversion of the decoded
// bytes, and the running illegal-character counter.
foreach ($testcases as $label => $hex) {
    echo $label, PHP_EOL;

    foreach (['JIS', 'ISO-2022-JP'] as $encoding) {
        echo $encoding . ':', PHP_EOL;
        var_dump(mb_check_encoding(hex2bin($hex), $encoding));
        echo mb_convert_encoding(hex2bin($hex), 'UTF-8', $encoding), PHP_EOL;
        var_dump(mb_get_info('illegal_chars'));
    }

    echo PHP_EOL;
}
?>
--EXPECT--
ISO-2022-JP bytes
JIS:
bool(true)
int(0)
ISO-2022-JP:
bool(true)
int(0)
ISO-2022-JP bytes without escape sequence
JIS:
bool(false)
int(0)
ISO-2022-JP:
bool(false)
int(0)
JIS X 0201 7bit kana with escape sequence
JIS:
bool(true)
int(0)
ISO-2022-JP:
bool(false)
int(0)
JIS X 0201 7bit kana with SO/SI
JIS:
bool(true)
int(0)
ISO-2022-JP:
bool(false)
int(0)
JIS X 0201 8bit kana
JIS:
bool(true)
int(0)
ISO-2022-JP:
bool(false)
int(0)
JIS X 0201 7bit kana with SO and ESC
JIS:
bool(false)
int(0)
ISO-2022-JP:
bool(false)
int(0)
JIS X 0201 7bit kana with ESC and SI
JIS:
bool(false)
int(0)
ISO-2022-JP:
bool(false)
int(0)
JIS X 0208 character
JIS:
bool(true)
int(0)
ISO-2022-JP:
bool(true)
int(0)
JIS X 0212 character
JIS:
bool(true)
int(0)
ISO-2022-JP:
bool(false)
int(0)
JIS X 0213 character
JIS:
bool(false)
?$(P}L
int(1)
ISO-2022-JP:
bool(false)
?$(P}L
int(2)
JIS C 6220-1969 ESC ( H
JIS:
bool(true)
int(2)
ISO-2022-JP:
bool(false)
int(2)
SO/SI when not in ASCII mode
JIS:
bool(false)
int(2)
ISO-2022-JP:
bool(false)
int(2)

View file

@ -50,11 +50,6 @@ function testValid($from, $to, $encoding, $bothWays = true) {
/* ESC ( B at the beginning is redundant, since ASCII mode is the default */
if (substr($from, 0, 3) == "\x1B(B")
$from = substr($from, 3, strlen($from) - 3);
/* If the string switches to a different charset, it should switch back to
* ASCII at the end */
if (strpos($from, "\x1B\$B") !== false || strpos($from, "\x1B(J") !== false)
$from .= "\x1B(B";
convertValidString($to, $from, 'UTF-16BE', $encoding, false);
}
}
@ -66,11 +61,11 @@ function testInvalid($from, $to, $encoding) {
for ($i = 0; $i < 0x80; $i++) {
if ($i == 0xE || $i == 0xF || $i == 0x1B)
continue;
testValid(chr($i), "\x00" . chr($i), 'JIS');
testValid("\x0F" . chr($i), "\x00" . chr($i), 'JIS'); /* 0xF is 'Shift Out' code */
testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'JIS');
testValid(chr($i), "\x00" . chr($i), 'ISO-2022-JP');
testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'ISO-2022-JP');
testValid(chr($i), "\x00" . chr($i), 'JIS');
convertValidString("\x0F" . chr($i), "\x00" . chr($i), 'JIS', 'UTF-16BE', false); /* 0xF is 'Shift In' code */
testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'JIS');
testValid(chr($i), "\x00" . chr($i), 'ISO-2022-JP');
testValid("\x1B(B" . chr($i), "\x00" . chr($i), 'ISO-2022-JP');
}
for ($i = 0x80; $i < 256; $i++) {
@ -92,27 +87,27 @@ echo "ASCII support OK\n";
foreach ($jisx0201Chars as $jisx0201 => $utf16BE) {
if (ord($jisx0201) >= 128) {
$kana = chr(ord($jisx0201) - 128);
testValid("\x1B(I" . $kana, $utf16BE, 'JIS', false);
testValid("\x0E" . $kana, $utf16BE, 'JIS', false); /* 0xE is 'Shift In' code */
testValid("\x1B(I" . $kana . "\x1B(B", $utf16BE, 'JIS', false);
testValid("\x0E" . $kana . "\x0F", $utf16BE, 'JIS', false); /* 0xE is 'Shift Out' code */
testValid($jisx0201, $utf16BE, 'JIS', false);
} else {
testValid("\x1B(J" . $jisx0201, $utf16BE, 'JIS', $utf16BE > "\x00\x80");
testValid("\x1B(J" . $jisx0201 . "\x1B(B", $utf16BE, 'JIS', $utf16BE > "\x00\x80");
}
}
for ($i = 0x80; $i < 256; $i++) {
if ($i >= 0xA1 && $i <= 0xDF)
continue;
testInvalid("\x1B(I" . chr($i), "\x00%", 'JIS');
testInvalid("\x1B(J" . chr($i), "\x00%", 'JIS');
testInvalid("\x1B(I" . chr($i) . "\x1B(B", "\x00%", 'JIS');
testInvalid("\x1B(J" . chr($i) . "\x1B(B", "\x00%", 'JIS');
}
echo "JIS X 0201 support OK\n";
/* All valid JISX0208 characters */
foreach ($jisx0208Chars as $jisx0208 => $utf16BE) {
testValid("\x1B\$B" . $jisx0208, $utf16BE, 'JIS');
testValid("\x1B\$B" . $jisx0208, $utf16BE, 'ISO-2022-JP');
testValid("\x1B\$B" . $jisx0208 . "\x1B(B", $utf16BE, 'JIS');
testValid("\x1B\$B" . $jisx0208 . "\x1B(B", $utf16BE, 'ISO-2022-JP');
}
/* All invalid 2-byte JISX0208 characters */
@ -120,8 +115,8 @@ for ($i = 0x21; $i <= 0x7E; $i++) {
for ($j = 0; $j < 256; $j++) {
$testString = chr($i) . chr($j);
if (!isset($jisx0208Chars[$testString])) {
testInvalid("\x1B\$B" . $testString, "\x00%", 'JIS');
testInvalid("\x1B\$B" . $testString, "\x00%", 'ISO-2022-JP');
testInvalid("\x1B\$B" . $testString . "\x1B(B", "\x00%", 'JIS');
testInvalid("\x1B\$B" . $testString . "\x1B(B", "\x00%", 'ISO-2022-JP');
}
}
}
@ -142,7 +137,7 @@ echo "JIS X 0208 support OK\n";
/* All valid JISX0212 characters */
foreach ($jisx0212Chars as $jisx0212 => $utf16BE) {
testValid("\x1B\$(D" . $jisx0212, $utf16BE, 'JIS', false);
testValid("\x1B\$(D" . $jisx0212 . "\x1B(B", $utf16BE, 'JIS', false);
}
/* All invalid 2-byte JISX0212 characters */
@ -150,14 +145,14 @@ for ($i = 0x21; $i <= 0x7E; $i++) {
for ($j = 0; $j < 256; $j++) {
$testString = chr($i) . chr($j);
if (!isset($jisx0212Chars[$testString])) {
testInvalid("\x1B\$(D" . $testString, "\x00%", 'JIS');
testInvalid("\x1B\$(D" . $testString . "\x1B(B", "\x00%", 'JIS');
}
}
}
/* Try truncated JISX0212 characters */
for ($i = 0x21; $i <= 0x7E; $i++) {
testInvalid("\x1B\$(D" . chr($i), "\x00%", 'JIS');
testInvalid("\x1B\$(D" . chr($i) . "\x1B(B", "\x00%\x00%", 'JIS');
}
testValidString("\x00\xA1", "\x1B\$(D\x22\x42\x1B(B", "UTF-16BE", "JIS", false);
@ -167,29 +162,36 @@ convertInvalidString("\x00\xA1", "%", "UTF-16BE", "ISO-2022-JP", false);
echo "JIS X 0212 support OK\n";
/* All possible escape sequences */
$validEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B\$(@" => true, "\x1B\$(B" => true, "\x1B\$(D" => true, "\x1B(B" => true, "\x1B(H" => true, "\x1B(J" => true, "\x1B(I" => true];
$validJisEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B\$(@" => true, "\x1B\$(B" => true, "\x1B\$(D" => true, "\x1B(B" => true, "\x1B(H" => true, "\x1B(J" => true, "\x1B(I" => true];
$validIso2022jpEscapes = ["\x1B\$@" => true, "\x1B\$B" => true, "\x1B(B" => true, "\x1B(J" => true];
for ($i = 0; $i <= 0xFF; $i++) {
for ($j = 0; $j <= 0xFF; $j++) {
$escapeSequence = "\x1B" . chr($i) . chr($j);
if ($escapeSequence === "\x1B\$(")
continue;
if (isset($validEscapes[$escapeSequence])) {
testValid($escapeSequence, "", 'JIS', false);
testValid($escapeSequence, "", 'ISO-2022-JP', false);
if (isset($validJisEscapes[$escapeSequence])) {
testValid($escapeSequence . "\x1B(B", "", 'JIS', false);
} else {
identifyInvalidString($escapeSequence, 'JIS');
identifyInvalidString($escapeSequence, 'ISO-2022-JP');
identifyInvalidString($escapeSequence . "\x1B(B", 'JIS');
}
if (isset($validIso2022jpEscapes[$escapeSequence])) {
testValid($escapeSequence . "\x1B(B", "", 'ISO-2022-JP', false);
} else {
identifyInvalidString($escapeSequence . "\x1B(B", 'ISO-2022-JP');
}
}
}
for ($i = 0; $i <= 0xFF; $i++) {
$escapeSequence = "\x1B\$(" . chr($i);
if (isset($validEscapes[$escapeSequence])) {
testValid($escapeSequence, "", 'JIS', false);
testValid($escapeSequence, "", 'ISO-2022-JP', false);
if (isset($validJisEscapes[$escapeSequence])) {
testValid($escapeSequence . "\x1B(B", "", 'JIS', false);
} else {
identifyInvalidString($escapeSequence, 'JIS');
identifyInvalidString($escapeSequence, 'ISO-2022-JP');
identifyInvalidString($escapeSequence . "\x1B(B", 'JIS');
}
if (isset($validIso2022jpEscapes[$escapeSequence])) {
testValid($escapeSequence . "\x1B(B", "", 'ISO-2022-JP', false);
} else {
identifyInvalidString($escapeSequence . "\x1B(B", 'ISO-2022-JP');
}
}
/* Also try a bare ESC */

View file

@ -1036,17 +1036,8 @@ testValidString('+' . encode('AB', 'ASCII') . '-+' . encode('CD', 'ASCII') . '-'
// (Just trying to be exhaustive here)
testValidString('+' . encode('AB', 'ASCII') . '-!+' . encode('CD', 'ASCII') . '-', "\x00A\x00B\x00!\x00C\x00D", 'UTF-7', 'UTF-16BE', false);
// + section terminated by a non-Base64 ASCII character which is NOT -
for ($i = 0; $i < 128; $i++) {
if ($i >= ord('A') && $i <= ord('Z'))
continue;
if ($i >= ord('a') && $i <= ord('z'))
continue;
if ($i >= ord('0') && $i <= ord('9'))
continue;
if ($i == ord('+') || $i == ord('/') || $i == ord('-') || $i == ord('\\') || $i == ord('~'))
continue;
$char = chr($i);
// + section terminated by a non-Base64 direct character which is NOT -
foreach (str_split(" \t\r\n'(),.:?!\"#$%&*;<=>@[]^_`{|}\x00") as $char) {
testValidString('+' . encode("\x12\x34", 'UTF-16BE') . $char, "\x00\x00\x12\x34\x00\x00\x00" . $char, 'UTF-7', 'UTF-32BE', false);
}