Implement mb_encode_mimeheader using fast text conversion filters

The behavior of the new mb_encode_mimeheader implementation closely
follows the old implementation, except for three points:

• The old implementation was missing a call to the mbfl_convert_filter
  flush function. So it would sometimes truncate the input string just
  before its end.

• The old implementation would drop zero bytes when QPrint-encoding.
  So for example, if you tried to QPrint-encode the UTF-32BE string
  "\x00\x00\x12\x34", its QPrint-encoding would be "=12=34", which
  does not decode to a valid UTF-32BE string. This is now fixed.

• In some rare corner cases, the new implementation will choose to
  Base64-encode or QPrint-encode the input string, where the old
  implementation would have just added newlines to it. Specifically,
  this can happen when there is a non-space ASCII character, followed
  by a large number of ASCII spaces, followed by a non-ASCII character.

The new implementation is around 2.5-8x faster than the old one,
depending on the text encoding and transfer encoding used. Performance
gains are greater with Base64 transfer encoding than with QPrint
transfer encoding; this is not because QPrint-encoding bytes is slow,
but because QPrint-encoded output is much bigger than Base64-encoded
output and takes more lines, so we have to go through the process of
finding the right place to break a line many more times.
This commit is contained in:
Alex Dowad 2023-02-17 13:51:02 +02:00
parent 6ebb506637
commit 0ce755be26
8 changed files with 649 additions and 464 deletions

View file

@ -99,15 +99,13 @@ int mbfl_filt_conv_base64enc(int c, mbfl_convert_filter *filter)
filter->cache |= (c & 0xff) << 8;
} else {
filter->status &= ~0xff;
if ((filter->status & MBFL_BASE64_STS_MIME_HEADER) == 0) {
n = (filter->status & 0xff00) >> 8;
if (n > 72) {
CK((*filter->output_function)(0x0d, filter->data)); /* CR */
CK((*filter->output_function)(0x0a, filter->data)); /* LF */
filter->status &= ~0xff00;
}
filter->status += 0x400;
n = (filter->status & 0xff00) >> 8;
if (n > 72) {
CK((*filter->output_function)(0x0d, filter->data)); /* CR */
CK((*filter->output_function)(0x0a, filter->data)); /* LF */
filter->status &= ~0xff00;
}
filter->status += 0x400;
n = filter->cache | (c & 0xff);
CK((*filter->output_function)(mbfl_base64_table[(n >> 18) & 0x3f], filter->data));
CK((*filter->output_function)(mbfl_base64_table[(n >> 12) & 0x3f], filter->data));
@ -129,11 +127,9 @@ int mbfl_filt_conv_base64enc_flush(mbfl_convert_filter *filter)
filter->cache = 0;
/* flush fragments */
if (status >= 1) {
if ((filter->status & MBFL_BASE64_STS_MIME_HEADER) == 0) {
if (len > 72){
CK((*filter->output_function)(0x0d, filter->data)); /* CR */
CK((*filter->output_function)(0x0a, filter->data)); /* LF */
}
if (len > 72){
CK((*filter->output_function)(0x0d, filter->data)); /* CR */
CK((*filter->output_function)(0x0a, filter->data)); /* LF */
}
CK((*filter->output_function)(mbfl_base64_table[(cache >> 18) & 0x3f], filter->data));
CK((*filter->output_function)(mbfl_base64_table[(cache >> 12) & 0x3f], filter->data));

View file

@ -29,7 +29,6 @@
#include "mbfilter.h"
#include "mbfilter_qprint.h"
#include "unicode_prop.h"
static size_t mb_qprint_to_wchar(unsigned char **in, size_t *in_len, uint32_t *buf, size_t bufsize, unsigned int *state);
static void mb_wchar_to_qprint(uint32_t *in, size_t len, mb_convert_buf *buf, bool end);
@ -96,28 +95,25 @@ int mbfl_filt_conv_qprintenc(int c, mbfl_convert_filter *filter)
break;
}
if ((filter->status & MBFL_QPRINT_STS_MIME_HEADER) == 0) {
if (s == 0x0a || (s == 0x0d && c != 0x0a)) { /* line feed */
CK((*filter->output_function)(0x0d, filter->data)); /* CR */
CK((*filter->output_function)(0x0a, filter->data)); /* LF */
filter->status &= ~0xff00;
break;
} else if (s == 0x0d) {
break;
}
if (s == '\n' || (s == '\r' && c != '\n')) { /* line feed */
CK((*filter->output_function)('\r', filter->data));
CK((*filter->output_function)('\n', filter->data));
filter->status &= ~0xff00;
break;
} else if (s == 0x0d) {
break;
}
if ((filter->status & MBFL_QPRINT_STS_MIME_HEADER) == 0 && n >= 72) { /* soft line feed */
CK((*filter->output_function)(0x3d, filter->data)); /* '=' */
CK((*filter->output_function)(0x0d, filter->data)); /* CR */
CK((*filter->output_function)(0x0a, filter->data)); /* LF */
if (n >= 72) { /* soft line feed */
CK((*filter->output_function)('=', filter->data));
CK((*filter->output_function)('\r', filter->data));
CK((*filter->output_function)('\n', filter->data));
filter->status &= ~0xff00;
}
if (s <= 0 || s >= 0x80 || s == 0x3d /* not ASCII or '=' */
|| ((filter->status & MBFL_QPRINT_STS_MIME_HEADER) && mime_char_needs_qencode[s])) {
if (s <= 0 || s >= 0x80 || s == '=') { /* not ASCII or '=' */
/* hex-octet */
CK((*filter->output_function)(0x3d, filter->data)); /* '=' */
CK((*filter->output_function)('=', filter->data));
n = (s >> 4) & 0xf;
if (n < 10) {
n += 48; /* '0' */
@ -132,14 +128,10 @@ int mbfl_filt_conv_qprintenc(int c, mbfl_convert_filter *filter)
n += 55;
}
CK((*filter->output_function)(n, filter->data));
if ((filter->status & MBFL_QPRINT_STS_MIME_HEADER) == 0) {
filter->status += 0x300;
}
filter->status += 0x300;
} else {
CK((*filter->output_function)(s, filter->data));
if ((filter->status & MBFL_QPRINT_STS_MIME_HEADER) == 0) {
filter->status += 0x100;
}
filter->status += 0x100;
}
break;
}

View file

@ -523,312 +523,3 @@ mbfl_strcut(
return result;
}
/*
* MIME header encode
*/
/*
 * Working state of one MIME header encoding run. Input bytes flow through
 * conv1_filter into mime_header_encoder_collector; once encoding is needed,
 * characters go on through block_filter -> conv2_filter -> encod_filter and
 * end up in outdev.
 */
struct mime_header_encoder_data {
/* input encoding -> wchar; its output callback is mime_header_encoder_collector */
mbfl_convert_filter *conv1_filter;
/* wchar -> wchar; its output callback is mime_header_encoder_block_collector */
mbfl_convert_filter *block_filter;
/* wchar -> output charset; its output is piped into encod_filter */
mbfl_convert_filter *conv2_filter;
/* scratch filter used to save and restore conv2_filter around a trial conversion */
mbfl_convert_filter *conv2_filter_backup;
/* output charset -> transfer encoding (Base64 or QPrint); writes into outdev */
mbfl_convert_filter *encod_filter;
/* scratch filter used to save and restore encod_filter around a trial conversion */
mbfl_convert_filter *encod_filter_backup;
/* the encoded header accumulated so far */
mbfl_memory_device outdev;
/* plain ASCII characters of the word currently being collected */
mbfl_memory_device tmpdev;
/* collector state: 0 = between words, 1 = inside an ASCII word, 11 = emitting encoded words */
int status1;
/* block collector state: 0 = no encoded word opened yet, 1 = inside an encoded word */
int status2;
/* outdev position remembered before a trial conversion, so it can be rewound */
size_t prevpos;
/* outdev position at which the current output line begins */
size_t linehead;
/* columns counted as already used on the first line; zeroed at the first line break */
size_t firstindent;
/* length of encname in bytes (excluding the terminating NUL) */
int encnamelen;
/* length of lwsp in bytes (excluding the terminating NUL) */
int lwsplen;
/* encoded-word prefix, e.g. "=?ISO-2022-JP?B?" */
char encname[128];
/* line break followed by one space, inserted when a line is folded */
char lwsp[16];
};
static int
mime_header_encoder_block_collector(int c, void *data)
{
size_t n;
struct mime_header_encoder_data *pe = (struct mime_header_encoder_data *)data;
switch (pe->status2) {
case 1: /* encoded word */
pe->prevpos = pe->outdev.pos;
mbfl_convert_filter_copy(pe->conv2_filter, pe->conv2_filter_backup);
mbfl_convert_filter_copy(pe->encod_filter, pe->encod_filter_backup);
(*pe->conv2_filter->filter_function)(c, pe->conv2_filter);
(*pe->conv2_filter->filter_flush)(pe->conv2_filter);
(*pe->encod_filter->filter_flush)(pe->encod_filter);
n = pe->outdev.pos - pe->linehead + pe->firstindent;
pe->outdev.pos = pe->prevpos;
mbfl_convert_filter_copy(pe->conv2_filter_backup, pe->conv2_filter);
mbfl_convert_filter_copy(pe->encod_filter_backup, pe->encod_filter);
if (n >= 74) {
(*pe->conv2_filter->filter_flush)(pe->conv2_filter);
(*pe->encod_filter->filter_flush)(pe->encod_filter);
mbfl_memory_device_strncat(&pe->outdev, "\x3f\x3d", 2); /* ?= */
mbfl_memory_device_strncat(&pe->outdev, pe->lwsp, pe->lwsplen);
pe->linehead = pe->outdev.pos;
pe->firstindent = 0;
mbfl_memory_device_strncat(&pe->outdev, pe->encname, pe->encnamelen);
c = (*pe->conv2_filter->filter_function)(c, pe->conv2_filter);
} else {
c = (*pe->conv2_filter->filter_function)(c, pe->conv2_filter);
}
break;
default:
mbfl_memory_device_strncat(&pe->outdev, pe->encname, pe->encnamelen);
c = (*pe->conv2_filter->filter_function)(c, pe->conv2_filter);
pe->status2 = 1;
break;
}
return 0;
}
static int
mime_header_encoder_collector(int c, void *data)
{
static int qp_table[256] = {
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00 */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x00 */
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x20 */
0, 0, 0, 0, 0, 0, 0 ,0, 0, 0, 0, 0, 0, 1, 0, 1, /* 0x10 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x40 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, /* 0x50 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x60 */
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, /* 0x70 */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x80 */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x90 */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xA0 */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xB0 */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xC0 */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xD0 */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0xE0 */
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 /* 0xF0 */
};
size_t n;
struct mime_header_encoder_data *pe = (struct mime_header_encoder_data *)data;
switch (pe->status1) {
case 11: /* encoded word */
(*pe->block_filter->filter_function)(c, pe->block_filter);
break;
default: /* ASCII */
if (c <= 0x00ff && !qp_table[(c & 0xff)]) { /* ordinary characters */
mbfl_memory_device_output(c, &pe->tmpdev);
pe->status1 = 1;
} else if (pe->status1 == 0 && c == 0x20) { /* repeat SPACE */
mbfl_memory_device_output(c, &pe->tmpdev);
} else {
if (pe->tmpdev.pos < 74 && c == 0x20) {
n = pe->outdev.pos - pe->linehead + pe->tmpdev.pos + pe->firstindent;
if (n > 74) {
mbfl_memory_device_strncat(&pe->outdev, pe->lwsp, pe->lwsplen); /* LWSP */
pe->linehead = pe->outdev.pos;
pe->firstindent = 0;
} else if (pe->outdev.pos > 0) {
mbfl_memory_device_output(0x20, &pe->outdev);
}
mbfl_memory_device_devcat(&pe->outdev, &pe->tmpdev);
mbfl_memory_device_reset(&pe->tmpdev);
pe->status1 = 0;
} else {
n = pe->outdev.pos - pe->linehead + pe->encnamelen + pe->firstindent;
if (n > 60) {
mbfl_memory_device_strncat(&pe->outdev, pe->lwsp, pe->lwsplen); /* LWSP */
pe->linehead = pe->outdev.pos;
pe->firstindent = 0;
} else if (pe->outdev.pos > 0) {
mbfl_memory_device_output(0x20, &pe->outdev);
}
mbfl_convert_filter_devcat(pe->block_filter, &pe->tmpdev);
mbfl_memory_device_reset(&pe->tmpdev);
(*pe->block_filter->filter_function)(c, pe->block_filter);
pe->status1 = 11;
}
}
break;
}
return 0;
}
/*
 * Finish an encoding run and hand back the accumulated output.
 *
 * In encoded-word mode the conversion and transfer-encoding filters are
 * flushed and the open encoded word is closed with "?=". Otherwise any ASCII
 * word still waiting in tmpdev is appended, preceded by a fold or by a single
 * space when output already exists. The per-run bookkeeping is then cleared.
 *
 * Returns whatever mbfl_memory_device_result() returns for outdev and result.
 */
mbfl_string *
mime_header_encoder_result(struct mime_header_encoder_data *pe, mbfl_string *result)
{
	if (pe->status1 >= 10) {
		/* Encoded-word mode: drain both filters, then terminate the word */
		(*pe->conv2_filter->filter_flush)(pe->conv2_filter);
		(*pe->encod_filter->filter_flush)(pe->encod_filter);
		mbfl_memory_device_strncat(&pe->outdev, "\x3f\x3d", 2); /* ?= */
	} else if (pe->tmpdev.pos > 0) {
		/* Plain mode with a trailing word which was never followed by a space */
		if (pe->outdev.pos > 0) {
			size_t projected_len = pe->outdev.pos - pe->linehead + pe->tmpdev.pos + pe->firstindent;

			if (projected_len <= 74) {
				mbfl_memory_device_output(0x20, &pe->outdev);
			} else {
				mbfl_memory_device_strncat(&pe->outdev, pe->lwsp, pe->lwsplen);
			}
		}
		mbfl_memory_device_devcat(&pe->outdev, &pe->tmpdev);
	}

	/* Clear the state belonging to this run */
	pe->status1 = 0;
	pe->status2 = 0;
	pe->prevpos = 0;
	pe->linehead = 0;
	mbfl_memory_device_reset(&pe->tmpdev);

	return mbfl_memory_device_result(&pe->outdev, result);
}
/*
 * Allocate and wire up a MIME header encoder.
 *
 * incode   - encoding of the input bytes
 * outcode  - charset named in the encoded words; must have a MIME name
 * transenc - transfer encoding; QPrint selects "Q", anything else is treated
 *            as Base64 ("B")
 *
 * Returns the new encoder, or NULL when outcode has no MIME name, when that
 * name is too long for the encoded-word prefix buffer, or when any of the
 * conversion filters could not be created. Release the encoder with
 * mime_header_encoder_delete().
 */
struct mime_header_encoder_data*
mime_header_encoder_new(
	const mbfl_encoding *incode,
	const mbfl_encoding *outcode,
	const mbfl_encoding *transenc)
{
	size_t n, namelen;
	const char *s;
	struct mime_header_encoder_data *pe;

	/* get output encoding and check MIME charset name */
	if (outcode->mime_name == NULL || outcode->mime_name[0] == '\0') {
		return NULL;
	}

	/* encname has to hold "=?" + name + "?X?" + NUL, i.e. the name plus six
	 * bytes; refuse names which would overflow the fixed-size buffer.
	 * (sizeof does not evaluate pe, so it may be used before allocation.) */
	namelen = 0;
	while (outcode->mime_name[namelen] != '\0') {
		namelen++;
	}
	if (namelen + 6 > sizeof(pe->encname)) {
		return NULL;
	}

	pe = emalloc(sizeof(struct mime_header_encoder_data));
	mbfl_memory_device_init(&pe->outdev, 0, 0);
	mbfl_memory_device_init(&pe->tmpdev, 0, 0);
	pe->prevpos = 0;
	pe->linehead = 0;
	pe->firstindent = 0;
	pe->status1 = 0;
	pe->status2 = 0;

	/* make the encoding description string exp. "=?ISO-2022-JP?B?" */
	n = 0;
	pe->encname[n++] = '=';
	pe->encname[n++] = '?';
	s = outcode->mime_name;
	while (*s) {
		pe->encname[n++] = *s++;
	}
	pe->encname[n++] = '?';
	if (transenc->no_encoding == mbfl_no_encoding_qprint) {
		pe->encname[n++] = 'Q';
	} else {
		pe->encname[n++] = 'B';
		transenc = &mbfl_encoding_base64;
	}
	pe->encname[n++] = '?';
	pe->encname[n] = '\0';
	pe->encnamelen = n;

	/* default folding sequence: CR LF followed by one space */
	n = 0;
	pe->lwsp[n++] = '\r';
	pe->lwsp[n++] = '\n';
	pe->lwsp[n++] = ' ';
	pe->lwsp[n] = '\0';
	pe->lwsplen = n;

	/* transfer encode filter */
	pe->encod_filter = mbfl_convert_filter_new(outcode, transenc, mbfl_memory_device_output, 0, &(pe->outdev));
	pe->encod_filter_backup = mbfl_convert_filter_new(outcode, transenc, mbfl_memory_device_output, 0, &(pe->outdev));

	/* Output code filter */
	pe->conv2_filter = mbfl_convert_filter_new(&mbfl_encoding_wchar, outcode, mbfl_filter_output_pipe, 0, pe->encod_filter);
	pe->conv2_filter_backup = mbfl_convert_filter_new(&mbfl_encoding_wchar, outcode, mbfl_filter_output_pipe, 0, pe->encod_filter);

	/* encoded block filter */
	pe->block_filter = mbfl_convert_filter_new(&mbfl_encoding_wchar, &mbfl_encoding_wchar, mime_header_encoder_block_collector, 0, pe);

	/* Input code filter */
	pe->conv1_filter = mbfl_convert_filter_new(incode, &mbfl_encoding_wchar, mime_header_encoder_collector, 0, pe);

	/* Every filter is dereferenced later on, block_filter included */
	if (pe->encod_filter == NULL ||
		pe->encod_filter_backup == NULL ||
		pe->conv2_filter == NULL ||
		pe->conv2_filter_backup == NULL ||
		pe->block_filter == NULL ||
		pe->conv1_filter == NULL) {
		mime_header_encoder_delete(pe);
		return NULL;
	}

	/* Mark the transfer-encoding filters as working on a MIME header */
	if (transenc->no_encoding == mbfl_no_encoding_qprint) {
		pe->encod_filter->status |= MBFL_QPRINT_STS_MIME_HEADER;
		pe->encod_filter_backup->status |= MBFL_QPRINT_STS_MIME_HEADER;
	} else {
		pe->encod_filter->status |= MBFL_BASE64_STS_MIME_HEADER;
		pe->encod_filter_backup->status |= MBFL_BASE64_STS_MIME_HEADER;
	}

	return pe;
}
/*
 * Destroy an encoder created by mime_header_encoder_new(): deletes all six
 * conversion filters, clears both memory devices and frees the structure.
 * Passing NULL is allowed and does nothing.
 */
void
mime_header_encoder_delete(struct mime_header_encoder_data *pe)
{
	if (pe == NULL) {
		return;
	}

	/* filters first ... */
	mbfl_convert_filter_delete(pe->conv1_filter);
	mbfl_convert_filter_delete(pe->block_filter);
	mbfl_convert_filter_delete(pe->conv2_filter);
	mbfl_convert_filter_delete(pe->conv2_filter_backup);
	mbfl_convert_filter_delete(pe->encod_filter);
	mbfl_convert_filter_delete(pe->encod_filter_backup);

	/* ... then the buffers they wrote to ... */
	mbfl_memory_device_clear(&pe->outdev);
	mbfl_memory_device_clear(&pe->tmpdev);

	/* ... and finally the encoder itself */
	efree(pe);
}
/*
 * Encode a whole string as a MIME header value.
 *
 * string   - input text; string->encoding says how to decode it
 * result   - receives the output; its encoding is set to ASCII
 * outcode  - charset to use inside encoded words
 * encoding - transfer encoding (QPrint or Base64)
 * linefeed - line break used when folding, or NULL for the default CR LF;
 *            at most the first 8 bytes are used
 * indent   - columns already taken on the first line; only values strictly
 *            between 0 and 74 are honoured
 *
 * Returns result, or NULL if no encoder could be created for the given
 * encodings.
 */
mbfl_string *
mbfl_mime_header_encode(
	mbfl_string *string,
	mbfl_string *result,
	const mbfl_encoding *outcode,
	const mbfl_encoding *encoding,
	const char *linefeed,
	int indent)
{
	size_t n;
	unsigned char *p;
	struct mime_header_encoder_data *pe;

	mbfl_string_init(result);
	result->encoding = &mbfl_encoding_ascii;

	pe = mime_header_encoder_new(string->encoding, outcode, encoding);
	if (pe == NULL) {
		return NULL;
	}

	/* Replace the default folding sequence with "<linefeed> " */
	if (linefeed != NULL) {
		n = 0;
		while (*linefeed && n < 8) {
			pe->lwsp[n++] = *linefeed++;
		}
		pe->lwsp[n++] = 0x20;
		pe->lwsp[n] = '\0';
		pe->lwsplen = n;
	}
	if (indent > 0 && indent < 74) {
		pe->firstindent = indent;
	}

	/* Feed the input through the decoding filter, byte by byte */
	n = string->len;
	p = string->val;
	while (n > 0) {
		(*pe->conv1_filter->filter_function)(*p++, pe->conv1_filter);
		n--;
	}

	/* The decoding filter may still be holding back a character which it
	 * could not complete yet; without a flush the output could be cut off
	 * just before the end of the input. */
	if (pe->conv1_filter->filter_flush != NULL) {
		(*pe->conv1_filter->filter_flush)(pe->conv1_filter);
	}

	result = mime_header_encoder_result(pe, result);
	mime_header_encoder_delete(pe);

	return result;
}

View file

@ -168,29 +168,4 @@ static inline int mbfl_is_error(size_t len) {
MBFLAPI extern mbfl_string *
mbfl_strcut(mbfl_string *string, mbfl_string *result, size_t from, size_t length);
/*
* MIME header encode
*/
struct mime_header_encoder_data; /* forward declaration */
/*
 * Create an encoder which reads text in `incode` and writes MIME encoded
 * words in charset `outcode`, using transfer encoding `encoding` (QPrint, or
 * else Base64). Returns NULL if `outcode` has no MIME name or a conversion
 * filter cannot be created.
 */
MBFLAPI extern struct mime_header_encoder_data *
mime_header_encoder_new(
const mbfl_encoding *incode,
const mbfl_encoding *outcode,
const mbfl_encoding *encoding);
/* Destroy an encoder and everything it owns; a NULL `pe` is ignored. */
MBFLAPI extern void
mime_header_encoder_delete(struct mime_header_encoder_data *pe);
/*
 * Finish the current run (close an open encoded word or append the pending
 * ASCII word), reset the run state and store the output in `result`.
 */
MBFLAPI extern mbfl_string *
mime_header_encoder_result(struct mime_header_encoder_data *pe, mbfl_string *result);
/*
 * One-shot helper: encode `string` into `result`. `linefeed` (NULL for the
 * default CR LF) is the line break used when folding, of which at most 8
 * bytes are used; `indent` is the number of columns already taken on the
 * first line and is only honoured when 0 < indent < 74. Returns NULL if no
 * encoder could be created.
 */
MBFLAPI extern mbfl_string *
mbfl_mime_header_encode(
mbfl_string *string, mbfl_string *result,
const mbfl_encoding *outcode,
const mbfl_encoding *encoding,
const char *linefeed,
int indent);
#endif /* MBFL_MBFILTER_H */

View file

@ -44,9 +44,6 @@
/* Marker for an erroneous input byte (or sequence of bytes) */
#define MBFL_BAD_INPUT (-1)
#define MBFL_QPRINT_STS_MIME_HEADER 0x1000000
#define MBFL_BASE64_STS_MIME_HEADER 0x1000000
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE 0
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR 1
#define MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG 2

View file

@ -162,17 +162,17 @@ static inline void mb_convert_buf_init(mb_convert_buf *buf, size_t initsize, uin
#define MB_CONVERT_BUF_ENSURE(buf, out, limit, needed) \
ZEND_ASSERT(out <= limit); \
if ((limit - out) < (needed)) { \
size_t oldsize = limit - (unsigned char*)ZSTR_VAL(buf->str); \
size_t oldsize = limit - (unsigned char*)ZSTR_VAL((buf)->str); \
size_t newsize = oldsize + MAX(oldsize >> 1, needed); \
zend_string *newstr = erealloc(buf->str, _ZSTR_STRUCT_SIZE(newsize)); \
out = (unsigned char*)ZSTR_VAL(newstr) + (out - (unsigned char*)ZSTR_VAL(buf->str)); \
zend_string *newstr = erealloc((buf)->str, _ZSTR_STRUCT_SIZE(newsize)); \
out = (unsigned char*)ZSTR_VAL(newstr) + (out - (unsigned char*)ZSTR_VAL((buf)->str)); \
limit = (unsigned char*)ZSTR_VAL(newstr) + newsize; \
buf->str = newstr; \
(buf)->str = newstr; \
}
#define MB_CONVERT_BUF_STORE(buf, _out, _limit) buf->out = _out; buf->limit = _limit
#define MB_CONVERT_BUF_STORE(buf, _out, _limit) (buf)->out = _out; (buf)->limit = _limit
#define MB_CONVERT_BUF_LOAD(buf, _out, _limit) _out = buf->out; _limit = buf->limit
#define MB_CONVERT_BUF_LOAD(buf, _out, _limit) _out = (buf)->out; _limit = (buf)->limit
#define MB_CONVERT_ERROR(buf, out, limit, bad_cp, conv_fn) \
MB_CONVERT_BUF_STORE(buf, out, limit); \
@ -209,6 +209,22 @@ static inline unsigned char* mb_convert_buf_add4(unsigned char *out, char c1, ch
return out;
}
/* Copy the NUL-terminated string `s` (without its terminator) to `out` and
 * return the position just past the last byte written. No bounds checking is
 * done here, so the caller must already have made sure there is enough room. */
static inline unsigned char* mb_convert_buf_appends(unsigned char *out, const char *s)
{
	for (; *s != '\0'; s++) {
		*out++ = *s;
	}
	return out;
}
/* Copy exactly `n` bytes from `s` to `out` (embedded NUL bytes included) and
 * return the position just past the last byte written. No bounds checking is
 * done here, so the caller must already have made sure there is enough room. */
static inline unsigned char* mb_convert_buf_appendn(unsigned char *out, const char *s, size_t n)
{
	for (size_t i = 0; i < n; i++) {
		out[i] = s[i];
	}
	return out + n;
}
static inline zend_string* mb_convert_buf_result_raw(mb_convert_buf *buf)
{
ZEND_ASSERT(buf->out <= buf->limit);
@ -246,6 +262,24 @@ static inline zend_string* mb_convert_buf_result(mb_convert_buf *buf, const mbfl
return ret;
}
/* Release the storage behind an `mb_convert_buf`. For use when a buffer was
 * initialized but it then turns out that its contents will not be returned
 * as a `zend_string` after all. */
static inline void mb_convert_buf_free(mb_convert_buf *buf)
{
	zend_string *storage = buf->str;

	efree(storage);
}
/* Number of bytes written to `buf` so far: the distance from the start of
 * the string data to the current output position. */
static inline size_t mb_convert_buf_len(mb_convert_buf *buf)
{
	const unsigned char *start = (unsigned char*)ZSTR_VAL(buf->str);

	return buf->out - start;
}
/* Move the output position of `buf` to `len` bytes past the start of the
 * string data, e.g. to discard output written after that point. The new
 * position must not lie beyond the buffer limit. */
static inline void mb_convert_buf_reset(mb_convert_buf *buf, size_t len)
{
	unsigned char *start = (unsigned char*)ZSTR_VAL(buf->str);

	buf->out = start + len;
	ZEND_ASSERT(buf->out <= buf->limit);
}
MBFLAPI extern const mbfl_encoding *mbfl_name2encoding(const char *name);
MBFLAPI extern const mbfl_encoding *mbfl_no2encoding(enum mbfl_no_encoding no_encoding);
MBFLAPI extern const mbfl_encoding **mbfl_get_supported_encodings(void);

View file

@ -46,6 +46,7 @@
#include "libmbfl/filters/mbfilter_utf16.h"
#include "libmbfl/filters/mbfilter_singlebyte.h"
#include "libmbfl/filters/translit_kana_jisx0201_jisx0208.h"
#include "libmbfl/filters/unicode_prop.h"
#include "php_variables.h"
#include "php_globals.h"
@ -91,6 +92,8 @@ static bool mb_check_str_encoding(zend_string *str, const mbfl_encoding *encodin
static const mbfl_encoding* mb_guess_encoding(unsigned char *in, size_t in_len, const mbfl_encoding **elist, unsigned int elist_size, bool strict);
static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent);
/* See mbfilter_cp5022x.c */
uint32_t mb_convert_kana_codepoint(uint32_t c, uint32_t next, bool *consumed, uint32_t *second, int mode);
/* }}} */
@ -3201,66 +3204,6 @@ PHP_FUNCTION(mb_encoding_aliases)
}
/* }}} */
/* {{{ Converts the string to MIME "encoded-word" in the format of =?charset?(B|Q)?encoded_string?= */
PHP_FUNCTION(mb_encode_mimeheader)
{
const mbfl_encoding *charset, *transenc;
mbfl_string string, result, *ret;
zend_string *charset_name = NULL;
char *trans_enc_name = NULL, *string_val;
size_t trans_enc_name_len;
char *linefeed = "\r\n";
size_t linefeed_len;
zend_long indent = 0;
string.encoding = MBSTRG(current_internal_encoding);
ZEND_PARSE_PARAMETERS_START(1, 5)
Z_PARAM_STRING(string_val, string.len)
Z_PARAM_OPTIONAL
Z_PARAM_STR(charset_name)
Z_PARAM_STRING(trans_enc_name, trans_enc_name_len)
Z_PARAM_STRING(linefeed, linefeed_len)
Z_PARAM_LONG(indent)
ZEND_PARSE_PARAMETERS_END();
string.val = (unsigned char*)string_val;
charset = &mbfl_encoding_pass;
transenc = &mbfl_encoding_base64;
if (charset_name != NULL) {
charset = php_mb_get_encoding(charset_name, 2);
if (!charset) {
RETURN_THROWS();
} else if (charset->mime_name == NULL || charset->mime_name[0] == '\0') {
zend_argument_value_error(2, "\"%s\" cannot be used for MIME header encoding", ZSTR_VAL(charset_name));
RETURN_THROWS();
}
} else {
const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
if (lang != NULL) {
charset = mbfl_no2encoding(lang->mail_charset);
transenc = mbfl_no2encoding(lang->mail_header_encoding);
}
}
if (trans_enc_name != NULL) {
if (*trans_enc_name == 'B' || *trans_enc_name == 'b') {
transenc = &mbfl_encoding_base64;
} else if (*trans_enc_name == 'Q' || *trans_enc_name == 'q') {
transenc = &mbfl_encoding_qprint;
}
}
mbfl_string_init(&result);
ret = mbfl_mime_header_encode(&string, &result, charset, transenc, linefeed, indent);
ZEND_ASSERT(ret != NULL);
// TODO: avoid reallocation ???
RETVAL_STRINGL((char *)ret->val, ret->len); /* the string is already strdup()'ed */
efree(ret->val);
}
/* }}} */
static zend_string* jp_kana_convert(zend_string *input, const mbfl_encoding *encoding, unsigned int mode)
{
/* Each wchar may potentially expand to 2 when we perform kana conversion...
@ -4156,8 +4099,7 @@ PHP_FUNCTION(mb_send_mail)
size_t to_len;
char *message;
size_t message_len;
char *subject;
size_t subject_len;
zend_string *subject;
zend_string *extra_cmd = NULL;
HashTable *headers_ht = NULL;
zend_string *str_headers = NULL;
@ -4169,9 +4111,7 @@ PHP_FUNCTION(mb_send_mail)
int cnt_trans_enc:1;
} suppressed_hdrs = { 0, 0 };
char *subject_buf = NULL, *p;
mbfl_string orig_str, conv_str;
mbfl_string *pstr; /* pointer to mbfl string for return value */
char *p;
enum mbfl_no_encoding;
const mbfl_encoding *tran_cs, /* transfer text charset */
*head_enc, /* header transfer encoding */
@ -4181,10 +4121,6 @@ PHP_FUNCTION(mb_send_mail)
HashTable ht_headers;
zval *s;
/* initialize */
mbfl_string_init(&orig_str);
mbfl_string_init(&conv_str);
/* character-set, transfer-encoding */
tran_cs = &mbfl_encoding_utf8;
head_enc = &mbfl_encoding_base64;
@ -4198,7 +4134,7 @@ PHP_FUNCTION(mb_send_mail)
ZEND_PARSE_PARAMETERS_START(3, 5)
Z_PARAM_PATH(to, to_len)
Z_PARAM_PATH(subject, subject_len)
Z_PARAM_PATH_STR(subject)
Z_PARAM_PATH(message, message_len)
Z_PARAM_OPTIONAL
Z_PARAM_ARRAY_HT_OR_STR(headers_ht, str_headers)
@ -4310,22 +4246,17 @@ PHP_FUNCTION(mb_send_mail)
}
/* Subject: */
orig_str.val = (unsigned char *)subject;
orig_str.len = subject_len;
orig_str.encoding = MBSTRG(current_internal_encoding);
if (orig_str.encoding->no_encoding == mbfl_no_encoding_invalid || orig_str.encoding->no_encoding == mbfl_no_encoding_pass) {
orig_str.encoding = mb_guess_encoding((unsigned char*)subject, subject_len, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection));
const mbfl_encoding *enc = MBSTRG(current_internal_encoding);
if (enc == &mbfl_encoding_pass) {
enc = mb_guess_encoding((unsigned char*)ZSTR_VAL(subject), ZSTR_LEN(subject), MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection));
}
const char *line_sep = PG(mail_mixed_lf_and_crlf) ? "\n" : CRLF;
size_t line_sep_len = strlen(line_sep);
pstr = mbfl_mime_header_encode(&orig_str, &conv_str, tran_cs, head_enc, line_sep, strlen("Subject: [PHP-jp nnnnnnnn]") + line_sep_len);
if (pstr != NULL) {
subject_buf = subject = (char *)pstr->val;
}
subject = mb_mime_header_encode(subject, enc, tran_cs, head_enc == &mbfl_encoding_base64, (char*)line_sep, line_sep_len, strlen("Subject: [PHP-jp nnnnnnnn]") + line_sep_len);
/* message body */
const mbfl_encoding *msg_enc = MBSTRG(current_internal_encoding);
if (msg_enc == &mbfl_encoding_pass) {
msg_enc = mb_guess_encoding((unsigned char*)message, message_len, MBSTRG(current_detect_order_list), MBSTRG(current_detect_order_list_size), MBSTRG(strict_detection));
}
@ -4401,18 +4332,15 @@ PHP_FUNCTION(mb_send_mail)
extra_cmd = php_escape_shell_cmd(ZSTR_VAL(extra_cmd));
}
RETVAL_BOOL(!err && php_mail(to_r, subject, message, ZSTR_VAL(str_headers), extra_cmd ? ZSTR_VAL(extra_cmd) : NULL));
RETVAL_BOOL(!err && php_mail(to_r, ZSTR_VAL(subject), message, ZSTR_VAL(str_headers), extra_cmd ? ZSTR_VAL(extra_cmd) : NULL));
if (extra_cmd) {
zend_string_release_ex(extra_cmd, 0);
}
if (to_r != to) {
efree(to_r);
}
if (subject_buf) {
efree((void *)subject_buf);
}
zend_string_release(subject);
zend_string_free(conv);
zend_hash_destroy(&ht_headers);
if (str_headers) {
@ -5634,6 +5562,418 @@ static void php_mb_gpc_set_input_encoding(const zend_encoding *encoding) /* {{{
}
/* }}} */
/* The 64 digits of the Base64 alphabet, indexed by 6-bit value. Initializing
 * the array from a string literal also stores the literal's terminating NUL,
 * so the table has 65 entries, the last of which is 0. */
static const unsigned char base64_table[] =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	"abcdefghijklmnopqrstuvwxyz"
	"0123456789+/";
/* How many bytes will the current contents of `tmpbuf` occupy once they have
 * been transfer-encoded as Base64 (`base64` true) or QPrint (`base64` false)?
 * Nothing is written; this only measures. */
static size_t transfer_encoded_size(mb_convert_buf *tmpbuf, bool base64)
{
	if (base64) {
		/* Each group of up to 3 input bytes becomes 4 output characters */
		return ((mb_convert_buf_len(tmpbuf) + 2) / 3) * 4;
	}

	/* QPrint: a byte is either copied as is (1 char) or written as "=XX" (3 chars) */
	size_t total = 0;
	for (const unsigned char *byte = (unsigned char*)ZSTR_VAL(tmpbuf->str); byte < tmpbuf->out; byte++) {
		bool escaped = *byte > 0x7F || *byte == '=' || mime_char_needs_qencode[*byte];
		total += escaped ? 3 : 1;
	}
	return total;
}
/* Transfer-encode everything currently held in `tmpbuf` as Base64 (`base64`
 * true) or QPrint (`base64` false), append the result to `outbuf`, and then
 * empty `tmpbuf`. `outbuf` is grown first if it lacks room. */
static void transfer_encode_mime_bytes(mb_convert_buf *tmpbuf, mb_convert_buf *outbuf, bool base64)
{
	static const char hex_digits[] = "0123456789ABCDEF";

	unsigned char *dst, *dst_limit;
	MB_CONVERT_BUF_LOAD(outbuf, dst, dst_limit);

	unsigned char *src = (unsigned char*)ZSTR_VAL(tmpbuf->str);
	unsigned char *src_end = tmpbuf->out;

	if (base64) {
		/* 4 output characters for each (possibly partial) group of 3 bytes */
		MB_CONVERT_BUF_ENSURE(outbuf, dst, dst_limit, ((src_end - src) + 2) / 3 * 4);

		/* Complete 3-byte groups: 24 bits are split into four 6-bit digits */
		for (; (src_end - src) >= 3; src += 3) {
			dst = mb_convert_buf_add4(dst,
				base64_table[src[0] >> 2],
				base64_table[((src[0] & 0x3) << 4) | (src[1] >> 4)],
				base64_table[((src[1] & 0xF) << 2) | (src[2] >> 6)],
				base64_table[src[2] & 0x3F]);
		}

		/* Leftover of 1 or 2 bytes: pad the final group with '=' */
		switch (src_end - src) {
			case 1:
				dst = mb_convert_buf_add4(dst,
					base64_table[src[0] >> 2],
					base64_table[(src[0] & 0x3) << 4],
					'=', '=');
				break;
			case 2:
				dst = mb_convert_buf_add4(dst,
					base64_table[src[0] >> 2],
					base64_table[((src[0] & 0x3) << 4) | (src[1] >> 4)],
					base64_table[(src[1] & 0xF) << 2],
					'=');
				break;
			default:
				break;
		}
	} else {
		/* Worst case: every byte is written as "=XX" */
		MB_CONVERT_BUF_ENSURE(outbuf, dst, dst_limit, (src_end - src) * 3);

		for (; src < src_end; src++) {
			unsigned char byte = *src;
			bool escaped = byte > 0x7F || byte == '=' || mime_char_needs_qencode[byte];

			if (escaped) {
				dst = mb_convert_buf_add3(dst, '=', hex_digits[(byte >> 4) & 0xF], hex_digits[byte & 0xF]);
			} else {
				dst = mb_convert_buf_add(dst, byte);
			}
		}
	}

	/* All pending bytes have been consumed */
	mb_convert_buf_reset(tmpbuf, 0);
	MB_CONVERT_BUF_STORE(outbuf, dst, dst_limit);
}
static zend_string* mb_mime_header_encode(zend_string *input, const mbfl_encoding *incode, const mbfl_encoding *outcode, bool base64, char *linefeed, size_t linefeed_len, zend_long indent)
{
unsigned char *in = (unsigned char*)ZSTR_VAL(input);
size_t in_len = ZSTR_LEN(input);
if (!in_len) {
return zend_empty_string;
}
if (indent < 0 || indent >= 74) {
indent = 0;
}
if (linefeed_len > 8) {
linefeed_len = 8;
}
/* Maintain legacy behavior as regards embedded NUL (zero) bytes in linefeed string */
for (size_t i = 0; i < linefeed_len; i++) {
if (linefeed[i] == '\0') {
linefeed_len = i;
break;
}
}
unsigned int state = 0;
/* wchar_buf should be big enough that when it is full, we definitely have enough
* wchars to fill an entire line of output */
uint32_t wchar_buf[80];
uint32_t *p, *e;
/* What part of wchar_buf is filled with still-unprocessed data which should not
* be overwritten? */
unsigned int offset = 0;
size_t line_start = 0;
/* If the entire input string is ASCII with no spaces (except possibly leading
* spaces), just pass it through unchanged */
bool checking_leading_spaces = true;
while (in_len) {
size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf, 80, &state);
p = wchar_buf;
e = wchar_buf + out_len;
while (p < e) {
uint32_t w = *p++;
if (checking_leading_spaces) {
if (w == ' ') {
continue;
} else {
checking_leading_spaces = false;
}
}
if (w < 0x21 || w > 0x7E || w == '=' || w == '?' || w == '_') {
/* We cannot simply pass input string through unchanged; start again */
in = (unsigned char*)ZSTR_VAL(input);
in_len = ZSTR_LEN(input);
goto no_passthrough;
}
}
}
return zend_string_copy(input); /* This just increments refcount */
no_passthrough: ;
mb_convert_buf buf;
mb_convert_buf_init(&buf, in_len, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);
/* Encode some prefix of the input string as plain ASCII if possible
* If we find it necessary to switch to Base64/QPrint encoding, we will
* do so all the way to the end of the string */
while (in_len) {
/* Decode part of the input string, refill wchar_buf */
ZEND_ASSERT(offset < 80);
size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, 80 - offset, &state);
ZEND_ASSERT(out_len <= 80 - offset);
p = wchar_buf;
e = wchar_buf + offset + out_len;
/* ASCII output is broken into space-delimited 'words'
* If we find a non-ASCII character in the middle of a word, we will
* transfer-encode the entire word */
uint32_t *word_start = p;
/* Don't consider adding line feed for spaces at the beginning of a word */
while (p < e && *p == ' ' && (p - word_start) <= 74) {
p++;
}
while (p < e) {
uint32_t w = *p++;
if (w < 0x20 || w > 0x7E || w == '?' || w == '=' || w == '_' || (w == ' ' && (p - word_start) > 74)) {
/* Non-ASCII character (or line too long); switch to Base64/QPrint encoding
* If we are already too far along on a line to include Base64/QPrint encoded data
* on the same line (without overrunning max line length), then add a line feed
* right now */
if (mb_convert_buf_len(&buf) - line_start + indent + strlen(outcode->mime_name) > 55) {
MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
buf.out = mb_convert_buf_add(buf.out, ' ');
indent = 0;
line_start = mb_convert_buf_len(&buf);
} else if (mb_convert_buf_len(&buf) > 0) {
MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 1);
buf.out = mb_convert_buf_add(buf.out, ' ');
}
p = word_start; /* Back up to where MIME encoding of input chars should start */
goto mime_encoding_needed;
} else if (w == ' ') {
/* When we see a space, check whether we should insert a line break */
if (mb_convert_buf_len(&buf) - line_start + (p - word_start) + indent > 75) {
MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
buf.out = mb_convert_buf_add(buf.out, ' ');
indent = 0;
line_start = mb_convert_buf_len(&buf);
} else if (mb_convert_buf_len(&buf) > 0) {
MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + 1);
buf.out = mb_convert_buf_add(buf.out, ' ');
}
/* Output one (space-delimited) word as plain ASCII */
while (word_start < p-1) {
buf.out = mb_convert_buf_add(buf.out, *word_start++ & 0xFF);
}
word_start++;
while (p < e && *p == ' ') {
p++;
}
}
}
if (in_len) {
/* Copy chars which are part of an incomplete 'word' to the beginning
* of wchar_buf and reprocess them on the next iteration */
offset = e - word_start;
if (offset) {
memmove(wchar_buf, word_start, offset * sizeof(uint32_t));
}
} else {
/* We have reached the end of the input string while still in 'ASCII mode';
* process any trailing ASCII chars which were not followed by a space */
if (word_start < e && mb_convert_buf_len(&buf) > 0) {
/* The whole input string was not just one big ASCII 'word' with no spaces
* consider adding a line feed if necessary to prevent output lines from
* being too long */
if (mb_convert_buf_len(&buf) - line_start + (p - word_start) + indent > 74) {
MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + linefeed_len + 1);
buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
buf.out = mb_convert_buf_add(buf.out, ' ');
} else {
MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, (e - word_start) + 1);
buf.out = mb_convert_buf_add(buf.out, ' ');
}
}
while (word_start < e) {
buf.out = mb_convert_buf_add(buf.out, *word_start++ & 0xFF);
}
}
}
/* Ensure output string is marked as valid UTF-8 (ASCII strings are always 'valid UTF-8') */
return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
mime_encoding_needed: ;
/* We will generate the output line by line, first converting wchars to bytes
* in the requested output encoding, then transfer-encoding those bytes as
* Base64 or QPrint
* 'tmpbuf' will receive the bytes which need to be transfer-encoded before
* sending them to 'buf' */
mb_convert_buf tmpbuf;
mb_convert_buf_init(&tmpbuf, in_len, '?', MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR);
/* Do we need to refill wchar_buf to make sure we don't run out of wchars
* in the middle of a line? */
if (p == wchar_buf) {
goto start_new_line;
}
offset = e - p;
memmove(wchar_buf, p, offset * sizeof(uint32_t));
while(true) {
refill_wchar_buf: ;
ZEND_ASSERT(offset < 80);
size_t out_len = incode->to_wchar(&in, &in_len, wchar_buf + offset, 80 - offset, &state);
ZEND_ASSERT(out_len <= 80 - offset);
p = wchar_buf;
e = wchar_buf + offset + out_len;
start_new_line: ;
MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, strlen(outcode->mime_name) + 5);
buf.out = mb_convert_buf_add2(buf.out, '=', '?');
buf.out = mb_convert_buf_appends(buf.out, outcode->mime_name);
buf.out = mb_convert_buf_add3(buf.out, '?', base64 ? 'B' : 'Q', '?');
/* How many wchars should we try converting to Base64/QPrint-encoded bytes?
* We do something like a 'binary search' to find the greatest number which
* can be included on this line without exceeding max line length */
unsigned int n = 12;
size_t space_available = 73 - indent - (mb_convert_buf_len(&buf) - line_start);
while (true) {
ZEND_ASSERT(p < e);
/* Remember where we were in process of generating output, so we can back
* up if necessary */
size_t tmppos = mb_convert_buf_len(&tmpbuf);
unsigned int tmpstate = tmpbuf.state;
/* Try encoding 'n' wchars in output text encoding and sending output
* bytes to 'tmpbuf'. Hopefully this is not too many to fit on the
* current line. */
n = MIN(n, e - p);
outcode->from_wchar(p, n, &tmpbuf, false);
/* For some output text encodings, there may be a few ending bytes
* which need to be emitted to output before we break a line.
* Again, remember where we were so we can back up */
size_t tmppos2 = mb_convert_buf_len(&tmpbuf);
unsigned int tmpstate2 = tmpbuf.state;
outcode->from_wchar(NULL, 0, &tmpbuf, true);
if (transfer_encoded_size(&tmpbuf, base64) <= space_available || (n == 1 && tmppos == 0)) {
/* If we convert 'n' more wchars on the current line, it will not
* overflow the maximum line length */
p += n;
if (p == e) {
/* We are done; we shouldn't reach here if there is more remaining
* of the input string which needs to be processed */
ZEND_ASSERT(!in_len);
transfer_encode_mime_bytes(&tmpbuf, &buf, base64);
MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 2);
buf.out = mb_convert_buf_add2(buf.out, '?', '=');
mb_convert_buf_free(&tmpbuf);
return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
} else {
/* It's possible that more chars might fit on the current line,
* so back up to where we were before emitting any ending bytes */
mb_convert_buf_reset(&tmpbuf, tmppos2);
tmpbuf.state = tmpstate2;
}
} else {
/* Converting 'n' more wchars on this line would be too much.
* Back up to where we were before we tried that. */
mb_convert_buf_reset(&tmpbuf, tmppos);
tmpbuf.state = tmpstate;
if (n == 1) {
/* We have found the exact number of chars which will fit on the
* current line. Finish up and move to a new line. */
outcode->from_wchar(NULL, 0, &tmpbuf, true);
transfer_encode_mime_bytes(&tmpbuf, &buf, base64);
tmpbuf.state = 0;
MB_CONVERT_BUF_ENSURE(&buf, buf.out, buf.limit, 3 + linefeed_len);
buf.out = mb_convert_buf_add2(buf.out, '?', '=');
indent = 0; /* Indent argument must only affect the first line */
if (in_len) {
/* We still have more of input string remaining to decode */
buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
buf.out = mb_convert_buf_add(buf.out, ' ');
line_start = mb_convert_buf_len(&buf);
/* Copy remaining wchars to beginning of buffer so they will be
* processed on the next iteration of outer 'do' loop */
offset = e - p;
memmove(wchar_buf, p, offset * sizeof(uint32_t));
goto refill_wchar_buf;
} else if (p < e) {
/* Input string is finished, but we still have trailing wchars
* remaining to be processed in wchar_buf */
buf.out = mb_convert_buf_appendn(buf.out, linefeed, linefeed_len);
buf.out = mb_convert_buf_add(buf.out, ' ');
line_start = mb_convert_buf_len(&buf);
goto start_new_line;
} else {
/* We are done! */
mb_convert_buf_free(&tmpbuf);
return mb_convert_buf_result(&buf, &mbfl_encoding_utf8);
}
} else {
/* Try a smaller number of wchars */
n = MAX(n >> 1, 1);
}
}
}
}
}
/* mb_encode_mimeheader(string, [charset, [transfer_encoding, [linefeed, [indent]]]])
 *
 * Userland entry point which MIME-header-encodes `string` by delegating to
 * mb_mime_header_encode().
 *
 * - charset: name of the output text encoding. It must resolve to an encoding
 *   with a non-empty MIME name, otherwise an argument value error is raised for
 *   argument 2. When omitted, both the charset and the default transfer encoding
 *   are taken from the current mbstring language setting.
 * - transfer_encoding: only the first byte is examined; 'Q'/'q' selects QPrint,
 *   'B'/'b' selects Base64, anything else leaves the default in place.
 * - linefeed: string emitted between output lines (default "\r\n"). Only the
 *   first 8 bytes are used, and it is cut at the first NUL byte.
 * - indent: passed through to mb_mime_header_encode() unchanged (default 0).
 *
 * The input string is interpreted using the current internal encoding. */
PHP_FUNCTION(mb_encode_mimeheader)
{
	const mbfl_encoding *charset = &mbfl_encoding_pass;
	zend_string *str, *charset_name = NULL, *transenc_name = NULL;
	char *linefeed = "\r\n";
	size_t linefeed_len = 2;
	zend_long indent = 0;
	bool base64 = true;

	ZEND_PARSE_PARAMETERS_START(1, 5)
		Z_PARAM_STR(str)
		Z_PARAM_OPTIONAL
		Z_PARAM_STR(charset_name)
		Z_PARAM_STR(transenc_name)
		Z_PARAM_STRING(linefeed, linefeed_len)
		Z_PARAM_LONG(indent)
	ZEND_PARSE_PARAMETERS_END();

	if (charset_name != NULL) {
		charset = php_mb_get_encoding(charset_name, 2);
		if (!charset) {
			RETURN_THROWS();
		} else if (charset->mime_name == NULL || charset->mime_name[0] == '\0') {
			zend_argument_value_error(2, "\"%s\" cannot be used for MIME header encoding", ZSTR_VAL(charset_name));
			RETURN_THROWS();
		}
	} else {
		/* No charset requested; use the defaults of the current language */
		const mbfl_language *lang = mbfl_no2language(MBSTRG(language));
		if (lang != NULL) {
			charset = mbfl_no2encoding(lang->mail_charset);
			const mbfl_encoding *transenc = mbfl_no2encoding(lang->mail_header_encoding);
			char t = transenc->name[0];
			if (t == 'Q' || t == 'q') {
				base64 = false;
			}
		}
	}

	if (transenc_name != NULL && ZSTR_LEN(transenc_name) > 0) {
		char t = ZSTR_VAL(transenc_name)[0];
		if (t == 'Q' || t == 'q') {
			base64 = false;
		} else if (t == 'B' || t == 'b') {
			/* An explicit request for Base64 must win over a QPrint default
			 * which was picked up from the language setting above */
			base64 = true;
		}
	}

	/* 'Line feed' string is truncated to no more than 8 bytes long */
	const size_t max_linefeed_len = 8;
	if (linefeed_len > max_linefeed_len) {
		linefeed_len = max_linefeed_len;
	}
	/* Maintain legacy behavior as regards embedded NUL (zero) bytes in the
	 * linefeed string: it used to be handled as a null-terminated C string, so
	 * only the part before the first NUL byte was ever used. Only the length is
	 * adjusted; the caller's string itself is never written to. */
	for (size_t i = 0; i < linefeed_len; i++) {
		if (linefeed[i] == '\0') {
			linefeed_len = i;
			break;
		}
	}

	RETURN_STR(mb_mime_header_encode(str, MBSTRG(current_internal_encoding), charset, base64, linefeed, linefeed_len, indent));
}
static int8_t decode_base64(unsigned char c)
{
if (c >= 'A' && c <= 'Z') {

View file

@ -0,0 +1,160 @@
--TEST--
Test mb_encode_mimeheader() function : test cases found by fuzzer
--EXTENSIONS--
mbstring
--FILE--
<?php
mb_internal_encoding('UTF-8');
var_dump(mb_encode_mimeheader("", "UTF-8", "Q"));
// Regression test for QPrint-encoding of strings with embedded NUL (zero) bytes
var_dump(mb_encode_mimeheader("abc\x00abc", "UTF-8", "Q"));
// Regression test for input strings which end prematurely
var_dump(mb_encode_mimeheader("\xE2", "UTF-8", "B"));
// Handling of ? signs
var_dump(mb_encode_mimeheader("?", "ASCII", "B"));
var_dump(mb_encode_mimeheader("?", "ASCII", "Q"));
// Handling of = signs
var_dump(mb_encode_mimeheader("=", "ASCII", "B"));
var_dump(mb_encode_mimeheader("=", "ASCII", "Q"));
// Handling of underscores
var_dump(mb_encode_mimeheader("_", "ASCII", "B"));
var_dump(mb_encode_mimeheader("_", "ASCII", "Q"));
// Handling of 0x7F (DEL)
var_dump(mb_encode_mimeheader("\x7f", "ASCII", "B", ""));
// Handling of leading spaces
var_dump(mb_encode_mimeheader(" ", "ASCII", "B"));
var_dump(mb_encode_mimeheader(" ", "ASCII", "Q"));
var_dump(mb_encode_mimeheader(" ", "ASCII", "B"));
var_dump(mb_encode_mimeheader(" ", "ASCII", "Q"));
// Try multiple spaces after a word
var_dump(mb_encode_mimeheader("ab ab ", "ASCII", "B"));
var_dump(mb_encode_mimeheader("ab ab ", "ASCII", "Q"));
// Trailing spaces
var_dump(mb_encode_mimeheader("` ", "HZ", "B", ""));
var_dump(mb_encode_mimeheader("S ", "ASCII", "Q", "", 73));
// Regression test: extra spaces should not be added at beginning of ASCII string
// when entire input is one ASCII 'word' and high indent value makes us consider
// adding a line feed
var_dump(mb_encode_mimeheader("S4", "ASCII", "B", "\n", 73));
var_dump(mb_encode_mimeheader("S4", "ASCII", "Q", "\n", 73));
// Regression test: converting UTF-8 to UCS-4 and then QPrint-encoding makes string
// take a vastly larger number of bytes; make sure we don't overrun max line length
var_dump(mb_encode_mimeheader("24\x0a", "UCS-4", "Q", "", 29));
// Regression test: include space after ASCII word when switching to Base64 encoding
var_dump(mb_encode_mimeheader("o\x20\x00", "ASCII", "B"));
// Regression test for buffer overrun while performing Base64-encoding
var_dump(mb_encode_mimeheader("\x00\x11\x01\x00\x00\x00\x00\x00\x00\x00", "UCS-4", "B"));
// Regression test for incorrect calculation of when to stop generating output
var_dump(mb_encode_mimeheader("\x01\x00\xcb\xcb\xcb\xcb\xcb\xcb=\xcb\xcb\xcb=?\x01\x00a\x00\x00\xcb\xcb\xcb=?\xcb\xcb\xcb\xcb\xcb\xcb\xcb\xcb\xcb?4?4\xcb\xcb\xcb\xcb\xcb=?\x01\x00\x00\x00\x01\x00\x00\x06\xcb\xcb\xcb\xcb\xcb\xcb\xcb\xcb\xcb=?\xcb\xcb\xcb\xcb\xcb\xcb\xcb\xcb\xcb?4\xcb\xcb\xcb\xcb\xcb?4", "UCS-2", "B", ""));
// 'Line feed' string is truncated to no more than 8 bytes long
// Pass the string via a variable, so that the check below (that the caller's copy
// was left unmodified) actually covers the value which was handed to the function
$linefeed = "=aaaaaa=?";
var_dump(mb_encode_mimeheader("?", "ASCII", "Q", $linefeed, 52));
var_dump($linefeed); // Make sure 'line feed' string was not modified
// Regression test: must take ASCII characters already output at beginning of line
// into account when calculating how many QPrint-encoded characters we can output
// without overrunning max line length
var_dump(mb_encode_mimeheader(",\x20o\x00\x01\x00\x00(", "JIS", "Q", "", 40));
// Make sure we maintain legacy behavior when linefeed string contains NUL (zero) bytes
// (We treat the linefeed string as being truncated at that point)
// The reason is that in the original implementation, the linefeed string was a
// null-terminated C string, so including NUL bytes would have the side effect of
// causing only part of the linefeed string to be used
var_dump(mb_encode_mimeheader("\xff", "ASCII", "Q", "\x00", 54));
// Regression test: After we see a non-ASCII character and switch into Base64/QPrint encoding mode,
// we may need to emit a linefeed before we start the next MIME encoded word
// If so, properly record where the line start position is so we can correctly calculate
// how much output can fit on the line
var_dump(mb_encode_mimeheader("\xff~H~\xe0\xea\x00\x00\xff\xff\xff\xff\xff>\x00\x00\x00\x00", "HZ", "Q", "", 71));
// ASCII strings with no spaces should pass through unchanged
var_dump(mb_encode_mimeheader("yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyBIG5", "BIG-5", "B"));
// Regression test: After decoding part of a line as ASCII, before we switch into Base64/QPrint encoding mode,
// refill our buffer of wchars so we don't hit the end of the buffer in the middle of a line
var_dump(mb_encode_mimeheader("\x20\x20\x20\x202\x20\x20\x20sssssssssssssssssssssssssss\x20\x20\x20\x20W\x20\x20\x20\x20\x20\x20W\x20\x20\x20\x20\xb9S\x01\x00\xf0`\x00\x00\x20\x20\x20\x20mSCII\xee\x20\x20\x20\x20mSCII\xeeI\xee", "ArmSCII-8", "B", ""));
// Regression test: Input string with a huge number of spaces
var_dump(mb_encode_mimeheader("\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x00", "CP936", "Q", ""));
// Regression test: Long string, all ASCII, but with spaces at the beginning
var_dump(mb_encode_mimeheader("\x20\x201111111111111111111111111111111111111111111111111111111111111111111111111", "ASCII", "Q", ""));
// Only a single character in input, but when we convert it to outcode and then
// transfer-encode it, it takes too many bytes to fit on a single line
// Legacy implementation would always include at least one wchar in each encoded word;
// imitate the same behavior
var_dump(mb_encode_mimeheader("\xe7\xad\xb5", "HZ", "Q", "", 44));
// Regression test: Exploring corner cases of when legacy implementation would output plain ASCII
// with no transfer encoding, and when it would transfer-encode
var_dump(mb_encode_mimeheader("2\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20!3", "GB18030", "Q", ""));
var_dump(mb_encode_mimeheader("\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20!3\x20", "GB18030", "Q", ""));
// Change in behavior: The old implementation would output the following string as plain ASCII,
// but the new one transfer-encodes it
// In the general case, matching the old implementation's decision to transfer-encode or not
// perfectly would require allocating potentially unbounded scratch memory (up to the size of
// the input string), but we aim to only use a constant amount of temporarily allocated memory
var_dump(mb_encode_mimeheader("2\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20\x20!3", "GB18030", "Q", ""));
echo "Done";
?>
--EXPECT--
string(0) ""
string(21) "=?UTF-8?Q?abc=00abc?="
string(16) "=?UTF-8?B?Pw==?="
string(19) "=?US-ASCII?B?Pw==?="
string(18) "=?US-ASCII?Q?=3F?="
string(19) "=?US-ASCII?B?PQ==?="
string(18) "=?US-ASCII?Q?=3D?="
string(19) "=?US-ASCII?B?Xw==?="
string(18) "=?US-ASCII?Q?=5F?="
string(19) "=?US-ASCII?B?fw==?="
string(1) " "
string(1) " "
string(3) " "
string(3) " "
string(8) "ab ab "
string(8) "ab ab "
string(1) "`"
string(1) "S"
string(2) "S4"
string(2) "S4"
string(61) "=?UCS-4?Q?=00=00=00=32=00=00=00=34?= =?UCS-4?Q?=00=00=00=0A?="
string(21) "o =?US-ASCII?B?AA==?="
string(68) "=?UCS-4?B?AAAAAAAAABEAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA==?="
string(271) "=?UCS-2?B?AAEAAAA/AD8APwA/AD8APwA9AD8APwA/AD0APwABAAAAYQAAAAAAPwA/AD8=?= =?UCS-2?B?AD0APwA/AD8APwA/AD8APwA/AD8APwA/ADQAPwA0AD8APwA/AD8APwA9AD8=?= =?UCS-2?B?AAEAAAAAAAAAAQAAAAAABgA/AD8APwA/AD8APwA/AD8APwA9AD8APwA/AD8=?= =?UCS-2?B?AD8APwA/AD8APwA/AD8ANAA/AD8APwA/AD8APwA0?="
string(27) "=aaaaaa= =?US-ASCII?Q?=3F?="
string(9) "=aaaaaa=?"
string(55) ", =?ISO-2022-JP?Q?o=00=01=00=00?= =?ISO-2022-JP?Q?=28?="
string(19) " =?US-ASCII?Q?=3F?="
string(76) " =?HZ-GB-2312?Q?=3F=7E=7EH=7E=7E=3F=3F=00=00=3F=3F=3F=3F=3F=3E=00=00=00=00?="
string(75) "yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyBIG5"
string(108) " 2 sssssssssssssssssssssssssss W W =?ArmSCII-8?B?ICAgP1MBAD9gAAAgICAgbVNDSUk/ICAgIG1TQ0lJP0k/?="
string(294) "=?CP936?Q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?CP936?Q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?CP936?Q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?CP936?Q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=00?="
string(75) " 1111111111111111111111111111111111111111111111111111111111111111111111111"
string(33) "=?HZ-GB-2312?Q?=7E=7Bs=5B=7E=7D?="
string(77) "2 !3"
string(282) "=?GB18030?Q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?GB18030?Q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?GB18030?Q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?GB18030?Q?=20=20=20=20=20=20=20=20=20=20=20=20!=33=20?="
string(296) "2 =?GB18030?Q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?GB18030?Q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?GB18030?Q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20?= =?GB18030?Q?=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20=20!=33?="
Done