Improve and simplify UTF-8 validation in JSON

This commit is contained in:
Jakub Zelenka 2017-06-11 17:27:32 +01:00
parent 68a0639c8f
commit f6ac96b039

View file

@ -246,33 +246,13 @@ static int php_json_encode_array(smart_str *buf, zval *val, int options, php_jso
} }
/* }}} */ /* }}} */
/* Scans the `len` bytes starting at `utf8` and reports whether every
 * multi-byte sequence is accepted by php_next_utf8_char().
 * Returns 1 if the whole buffer passes, 0 at the first rejected sequence. */
static int php_json_valid_utf8(char utf8[], size_t len) /* {{{ */
{
	const unsigned char *bytes = (const unsigned char *)utf8;
	size_t offset = 0;
	int decode_status;

	while (offset < len) {
		if (bytes[offset] < 0x80) {
			/* byte below 0x80: step over it without calling the decoder */
			offset++;
			continue;
		}

		/* Only the status matters here; the decoded value is discarded.
		 * The decoder is handed &offset and is relied on to move it forward. */
		(void)php_next_utf8_char(bytes, len, &offset, &decode_status);
		if (decode_status != SUCCESS) {
			return 0;
		}
	}

	return 1;
}
/* }}} */
static int php_json_escape_string( static int php_json_escape_string(
smart_str *buf, char *s, size_t len, smart_str *buf, char *s, size_t len,
int options, php_json_encoder *encoder) /* {{{ */ int options, php_json_encoder *encoder) /* {{{ */
{ {
int status; int status;
unsigned int us; unsigned int us;
size_t pos, checkpoint; size_t prev_pos, pos, checkpoint;
if (len == 0) { if (len == 0) {
smart_str_appendl(buf, "\"\"", 2); smart_str_appendl(buf, "\"\"", 2);
@ -295,18 +275,6 @@ static int php_json_escape_string(
} }
} }
if (options & PHP_JSON_UNESCAPED_UNICODE) {
/* validate UTF-8 string first */
if (!php_json_valid_utf8(s, len)) {
encoder->error_code = PHP_JSON_ERROR_UTF8;
if (options & PHP_JSON_PARTIAL_OUTPUT_ON_ERROR) {
smart_str_appendl(buf, "null", 4);
}
return FAILURE;
}
}
pos = 0; pos = 0;
checkpoint = buf->s ? ZSTR_LEN(buf->s) : 0; checkpoint = buf->s ? ZSTR_LEN(buf->s) : 0;
@ -315,27 +283,27 @@ static int php_json_escape_string(
smart_str_appendc(buf, '"'); smart_str_appendc(buf, '"');
do { do {
us = (unsigned char)s[pos]; prev_pos = pos;
if (us >= 0x80 && (!(options & PHP_JSON_UNESCAPED_UNICODE) || us == 0xE2)) { us = php_next_utf8_char((unsigned char *)s, len, &pos, &status);
/* UTF-8 character */ /* check whether UTF8 character is correct */
us = php_next_utf8_char((const unsigned char *)s, len, &pos, &status); if (status != SUCCESS) {
if (status != SUCCESS) { if (buf->s) {
if (buf->s) { ZSTR_LEN(buf->s) = checkpoint;
ZSTR_LEN(buf->s) = checkpoint;
}
encoder->error_code = PHP_JSON_ERROR_UTF8;
if (options & PHP_JSON_PARTIAL_OUTPUT_ON_ERROR) {
smart_str_appendl(buf, "null", 4);
}
return FAILURE;
} }
encoder->error_code = PHP_JSON_ERROR_UTF8;
if (options & PHP_JSON_PARTIAL_OUTPUT_ON_ERROR) {
smart_str_appendl(buf, "null", 4);
}
return FAILURE;
}
if (us >= 0x80 && (!(options & PHP_JSON_UNESCAPED_UNICODE) || (unsigned char)s[prev_pos] == 0xE2)) {
/* Escape U+2028/U+2029 line terminators, UNLESS both /* Escape U+2028/U+2029 line terminators, UNLESS both
JSON_UNESCAPED_UNICODE and JSON_UNESCAPED_UNICODE and
JSON_UNESCAPED_LINE_TERMINATORS were provided */ JSON_UNESCAPED_LINE_TERMINATORS were provided */
if ((options & PHP_JSON_UNESCAPED_UNICODE) if ((options & PHP_JSON_UNESCAPED_UNICODE)
&& ((options & PHP_JSON_UNESCAPED_LINE_TERMINATORS) && ((options & PHP_JSON_UNESCAPED_LINE_TERMINATORS)
|| us < 0x2028 || us > 0x2029)) { || us < 0x2028 || us > 0x2029)) {
smart_str_appendl(buf, &s[pos - 3], 3); smart_str_appendl(buf, &s[prev_pos], 3);
continue; continue;
} }
/* From http://en.wikipedia.org/wiki/UTF16 */ /* From http://en.wikipedia.org/wiki/UTF16 */
@ -357,8 +325,6 @@ static int php_json_escape_string(
smart_str_appendc(buf, digits[(us & 0xf0) >> 4]); smart_str_appendc(buf, digits[(us & 0xf0) >> 4]);
smart_str_appendc(buf, digits[(us & 0xf)]); smart_str_appendc(buf, digits[(us & 0xf)]);
} else { } else {
pos++;
switch (us) { switch (us) {
case '"': case '"':
if (options & PHP_JSON_HEX_QUOT) { if (options & PHP_JSON_HEX_QUOT) {
@ -434,7 +400,7 @@ static int php_json_escape_string(
default: default:
if (us >= ' ') { if (us >= ' ') {
smart_str_appendc(buf, (unsigned char) us); smart_str_appendl(buf, s + prev_pos, pos - prev_pos);
} else { } else {
smart_str_appendl(buf, "\\u00", sizeof("\\u00")-1); smart_str_appendl(buf, "\\u00", sizeof("\\u00")-1);
smart_str_appendc(buf, digits[(us & 0xf0) >> 4]); smart_str_appendc(buf, digits[(us & 0xf0) >> 4]);