fix WS, CS and other S

This commit is contained in:
Antony Dovgal 2007-07-12 09:55:41 +00:00
parent 53c34ffb40
commit a819499088
3 changed files with 519 additions and 583 deletions

View file

@ -38,6 +38,8 @@ SOFTWARE.
significant reductions in the size of the state transition table.
*/
/* {{{ constants */
/* error */
#define S_ERR -1
@ -134,7 +136,9 @@ SOFTWARE.
/* everything else */
#define S_ETC 30
/* }}} */
/* {{{ tables */
/*
This table maps the 128 ASCII characters into the 32 character classes.
The remaining Unicode characters should be mapped to S_ETC.
@ -161,7 +165,6 @@ static const int ascii_class[128] = {
S_ETC, S_ETC, S_ETC, S_LBE, S_ETC, S_RBE, S_ETC, S_ETC
};
/*
The state transition table takes the current state and the current symbol,
and returns either a new state or an action. A new state is a number between
@ -201,8 +204,9 @@ static const int state_transition_table[30][31] = {
/*29*/ {29,29,-1,-1,-1,-1,-1,-1, 3,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1}
};
#define JSON_PARSER_MAX_DEPTH 128
/* }}} */
#define JSON_PARSER_MAX_DEPTH 128
/*
A stack maintains the states of nested structures.
@ -215,7 +219,6 @@ typedef struct json_parser
int the_top;
} json_parser;
/*
These modes can be pushed on the PDA stack.
*/
@ -227,8 +230,7 @@ typedef struct json_parser
/*
Push a mode onto the stack. Return false if there is overflow.
*/
static int
push(json_parser *json, zval *z, int mode)
static int push(json_parser *json, zval *z, int mode) /* {{{ */
{
json->the_top += 1;
if (json->the_top >= JSON_PARSER_MAX_DEPTH) {
@ -238,14 +240,13 @@ push(json_parser *json, zval *z, int mode)
json->the_stack[json->the_top] = mode;
return true;
}
/* }}} */
/*
Pop the stack, ensuring that the current mode matches the expectation.
Return false if there is underflow or if the modes mismatch.
*/
static int
pop(json_parser *json, zval *z, int mode)
static int pop(json_parser *json, zval *z, int mode) /* {{{ */
{
if (json->the_top < 0 || json->the_stack[json->the_top] != mode) {
return false;
@ -255,73 +256,53 @@ pop(json_parser *json, zval *z, int mode)
return true;
}
/* }}} */
static int dehexchar(char c)
{
if (c >= '0' && c <= '9')
static int dehexchar(char c) /* {{{ */
{
if (c >= '0' && c <= '9') {
return c - '0';
}
else if (c >= 'A' && c <= 'F')
{
} else if (c >= 'A' && c <= 'F') {
return c - ('A' - 10);
}
else if (c >= 'a' && c <= 'f')
{
} else if (c >= 'a' && c <= 'f') {
return c - ('a' - 10);
}
else
{
} else {
return -1;
}
}
/* }}} */
static void json_create_zval(zval **z, smart_str *buf, int type TSRMLS_DC)
static void json_create_zval(zval **z, smart_str *buf, int type TSRMLS_DC) /* {{{ */
{
ALLOC_INIT_ZVAL(*z);
if (type == IS_LONG)
{
if (type == IS_LONG) {
double d = zend_strtod(buf->c, NULL);
if (d > LONG_MAX || d < -LONG_MAX) {
ZVAL_DOUBLE(*z, d);
} else {
ZVAL_LONG(*z, (long)d);
}
}
else if (type == IS_DOUBLE)
{
} else if (type == IS_DOUBLE) {
ZVAL_DOUBLE(*z, zend_strtod(buf->c, NULL));
}
else if (type == IS_STRING)
{
} else if (type == IS_STRING) {
ZVAL_UTF8_STRINGL(*z, buf->c, buf->len, ZSTR_DUPLICATE);
}
else if (type == IS_BOOL)
{
} else if (type == IS_BOOL) {
ZVAL_BOOL(*z, (*(buf->c) == 't'));
}
else /* type == IS_NULL) || type unknown */
{
} else { /* type == IS_NULL) || type unknown */
ZVAL_NULL(*z);
}
}
/* }}} */
static void utf16_to_utf8(smart_str *buf, unsigned short utf16)
{
if (utf16 < 0x80)
static void utf16_to_utf8(smart_str *buf, unsigned short utf16) /* {{{ */
{
if (utf16 < 0x80) {
smart_str_appendc(buf, (unsigned char) utf16);
}
else if (utf16 < 0x800)
{
} else if (utf16 < 0x800) {
smart_str_appendc(buf, 0xc0 | (utf16 >> 6));
smart_str_appendc(buf, 0x80 | (utf16 & 0x3f));
}
else if ((utf16 & 0xfc00) == 0xdc00
} else if ((utf16 & 0xfc00) == 0xdc00
&& buf->len >= 3
&& ((unsigned char) buf->c[buf->len - 3]) == 0xed
&& ((unsigned char) buf->c[buf->len - 2] & 0xf0) == 0xa0
@ -339,42 +320,35 @@ static void utf16_to_utf8(smart_str *buf, unsigned short utf16)
smart_str_appendc(buf, 0x80 | ((utf32 >> 12) & 0x3f));
smart_str_appendc(buf, 0x80 | ((utf32 >> 6) & 0x3f));
smart_str_appendc(buf, 0x80 | (utf32 & 0x3f));
}
else
{
} else {
smart_str_appendc(buf, 0xe0 | (utf16 >> 12));
smart_str_appendc(buf, 0x80 | ((utf16 >> 6) & 0x3f));
smart_str_appendc(buf, 0x80 | (utf16 & 0x3f));
}
}
/* }}} */
static void attach_zval(json_parser *json, int up, int cur, smart_str *key, int assoc TSRMLS_DC)
static void attach_zval(json_parser *json, int up, int cur, smart_str *key, int assoc TSRMLS_DC) /* {{{ */
{
zval *root = json->the_zstack[up];
zval *child = json->the_zstack[cur];
int up_mode = json->the_stack[up];
if (up_mode == MODE_ARRAY)
{
if (up_mode == MODE_ARRAY) {
add_next_index_zval(root, child);
}
else if (up_mode == MODE_OBJECT)
{
if (!assoc)
{
} else if (up_mode == MODE_OBJECT) {
if (!assoc) {
add_utf8_property_zval_ex(root, (key->len ? key->c : "_empty_"), (key->len ? (key->len + 1) : sizeof("_empty_")), child TSRMLS_CC);
#if PHP_MAJOR_VERSION >= 5
ZVAL_DELREF(child);
#endif
}
else
{
} else {
add_utf8_assoc_zval_ex(root, (key->len ? key->c : ""), (key->len ? (key->len + 1) : sizeof("")), child);
}
key->len = 0;
}
}
/* }}} */
#define FREE_BUFFERS() do { smart_str_free(&buf); smart_str_free(&key); } while (0);
#define SWAP_BUFFERS(from, to) do { \
@ -390,7 +364,6 @@ static void attach_zval(json_parser *json, int up, int cur, smart_str *key, int
#define JSON_RESET_TYPE() do { type = -1; } while(0);
#define JSON(x) the_json.x
/*
The JSON_parser takes a UTF-16 encoded string and determines if it is a
syntactically correct JSON text. Along the way, it creates a PHP variable.
@ -398,8 +371,7 @@ static void attach_zval(json_parser *json, int up, int cur, smart_str *key, int
It is implemented as a Pushdown Automaton; that means it is a finite state
machine with a stack.
*/
int
JSON_parser(zval *z, unsigned short p[], int length, int assoc TSRMLS_DC)
int JSON_parser(zval *z, unsigned short p[], int length, int assoc TSRMLS_DC) /* {{{ */
{
int b; /* the next character */
int c; /* the next character class */
@ -437,9 +409,7 @@ JSON_parser(zval *z, unsigned short p[], int length, int assoc TSRMLS_DC)
Perform one of the predefined actions.
*/
switch (s) {
/*
empty }
*/
/* empty "}" {{{ */
case -9:
if (!pop(&the_json, z, MODE_KEY)) {
FREE_BUFFERS();
@ -447,9 +417,9 @@ JSON_parser(zval *z, unsigned short p[], int length, int assoc TSRMLS_DC)
}
the_state = 9;
break;
/*
{
*/
/* }}} */
/* "{" {{{ */
case -8:
if (!push(&the_json, z, MODE_KEY)) {
FREE_BUFFERS();
@ -457,42 +427,33 @@ JSON_parser(zval *z, unsigned short p[], int length, int assoc TSRMLS_DC)
}
the_state = 1;
if (JSON(the_top) > 0)
{
if (JSON(the_top) > 0) {
zval *obj;
if (JSON(the_top) == 1)
{
if (JSON(the_top) == 1) {
obj = z;
}
else
{
} else {
ALLOC_INIT_ZVAL(obj);
}
if (!assoc)
{
if (!assoc) {
object_init(obj);
}
else
{
} else {
array_init(obj);
}
JSON(the_zstack)[JSON(the_top)] = obj;
if (JSON(the_top) > 1)
{
if (JSON(the_top) > 1) {
attach_zval(&the_json, JSON(the_top-1), JSON(the_top), &key, assoc TSRMLS_CC);
}
JSON_RESET_TYPE();
}
break;
/*
}
*/
/* }}} */
/* "}" {{{ */
case -7:
if (type != -1 &&
(JSON(the_stack)[JSON(the_top)] == MODE_OBJECT ||
@ -503,15 +464,12 @@ JSON_parser(zval *z, unsigned short p[], int length, int assoc TSRMLS_DC)
json_create_zval(&mval, &buf, type TSRMLS_CC);
if (!assoc)
{
if (!assoc) {
add_utf8_property_zval_ex(JSON(the_zstack)[JSON(the_top)], (key.len ? key.c : "_empty_"), (key.len ? (key.len + 1) : sizeof("_empty_")), mval TSRMLS_CC);
#if PHP_MAJOR_VERSION >= 5
ZVAL_DELREF(mval);
#endif
}
else
{
} else {
add_utf8_assoc_zval_ex(JSON(the_zstack)[JSON(the_top)], (key.len ? key.c : ""), (key.len ? (key.len + 1) : sizeof("")), mval);
}
key.len = 0;
@ -526,9 +484,9 @@ JSON_parser(zval *z, unsigned short p[], int length, int assoc TSRMLS_DC)
}
the_state = 9;
break;
/*
[
*/
/* }}} */
/* "[" {{{ */
case -6:
if (!push(&the_json, z, MODE_ARRAY)) {
FREE_BUFFERS();
@ -536,24 +494,19 @@ JSON_parser(zval *z, unsigned short p[], int length, int assoc TSRMLS_DC)
}
the_state = 2;
if (JSON(the_top) > 0)
{
if (JSON(the_top) > 0) {
zval *arr;
if (JSON(the_top) == 1)
{
if (JSON(the_top) == 1) {
arr = z;
}
else
{
} else {
ALLOC_INIT_ZVAL(arr);
}
array_init(arr);
JSON(the_zstack)[JSON(the_top)] = arr;
if (JSON(the_top) > 1)
{
if (JSON(the_top) > 1) {
attach_zval(&the_json, JSON(the_top-1), JSON(the_top), &key, assoc TSRMLS_CC);
}
@ -561,9 +514,9 @@ JSON_parser(zval *z, unsigned short p[], int length, int assoc TSRMLS_DC)
}
break;
/*
]
*/
/* }}} */
/* "]" {{{ */
case -5:
{
if (type != -1 &&
@ -586,9 +539,9 @@ JSON_parser(zval *z, unsigned short p[], int length, int assoc TSRMLS_DC)
the_state = 9;
}
break;
/*
"
*/
/* }}} */
/* "\"" {{{ */
case -4:
switch (JSON(the_stack)[JSON(the_top)]) {
case MODE_KEY:
@ -614,9 +567,9 @@ JSON_parser(zval *z, unsigned short p[], int length, int assoc TSRMLS_DC)
return false;
}
break;
/*
,
*/
/* }}} */
/* "," {{{ */
case -3:
{
zval *mval;
@ -632,17 +585,13 @@ JSON_parser(zval *z, unsigned short p[], int length, int assoc TSRMLS_DC)
switch (JSON(the_stack)[JSON(the_top)]) {
case MODE_OBJECT:
if (pop(&the_json, z, MODE_OBJECT) && push(&the_json, z, MODE_KEY)) {
if (type != -1)
{
if (!assoc)
{
if (type != -1) {
if (!assoc) {
add_utf8_property_zval_ex(JSON(the_zstack)[JSON(the_top)], (key.len ? key.c : "_empty_"), (key.len ? (key.len + 1) : sizeof("_empty_")), mval TSRMLS_CC);
#if PHP_MAJOR_VERSION >= 5
ZVAL_DELREF(mval);
#endif
}
else
{
} else {
add_utf8_assoc_zval_ex(JSON(the_zstack)[JSON(the_top)], (key.len ? key.c : ""), (key.len ? (key.len + 1) : sizeof("")), mval);
}
key.len = 0;
@ -651,8 +600,7 @@ JSON_parser(zval *z, unsigned short p[], int length, int assoc TSRMLS_DC)
}
break;
case MODE_ARRAY:
if (type != -1)
{
if (type != -1) {
add_next_index_zval(JSON(the_zstack)[JSON(the_top)], mval);
}
the_state = 28;
@ -665,6 +613,9 @@ JSON_parser(zval *z, unsigned short p[], int length, int assoc TSRMLS_DC)
JSON_RESET_TYPE();
}
break;
/* }}} */
/* ":" {{{ */
/*
:
*/
@ -673,31 +624,26 @@ JSON_parser(zval *z, unsigned short p[], int length, int assoc TSRMLS_DC)
the_state = 28;
break;
}
/*
syntax error
*/
/* }}} */
/* syntax error {{{ */
case -1:
{
FREE_BUFFERS();
return false;
}
/* }}} */
}
} else {
/*
Change the state and iterate.
*/
if (type == IS_STRING)
{
if (s == 3 && the_state != 8)
{
if (the_state != 4)
{
if (type == IS_STRING) {
if (s == 3 && the_state != 8) {
if (the_state != 4) {
utf16_to_utf8(&buf, b);
}
else
{
switch (b)
{
} else {
switch (b) {
case 'b':
smart_str_appendc(&buf, '\b');
break;
@ -718,54 +664,32 @@ JSON_parser(zval *z, unsigned short p[], int length, int assoc TSRMLS_DC)
break;
}
}
}
else if (s == 6)
{
} else if (s == 6) {
utf16 = dehexchar(b) << 12;
}
else if (s == 7)
{
} else if (s == 7) {
utf16 += dehexchar(b) << 8;
}
else if (s == 8)
{
} else if (s == 8) {
utf16 += dehexchar(b) << 4;
}
else if (s == 3 && the_state == 8)
{
} else if (s == 3 && the_state == 8) {
utf16 += dehexchar(b);
utf16_to_utf8(&buf, utf16);
}
}
else if (type < IS_LONG && (c == S_DIG || c == S_ZER))
{
} else if (type < IS_LONG && (c == S_DIG || c == S_ZER)) {
type = IS_LONG;
smart_str_appendc(&buf, b);
}
else if (type == IS_LONG && s == 24)
{
} else if (type == IS_LONG && s == 24) {
type = IS_DOUBLE;
smart_str_appendc(&buf, b);
}
else if (type < IS_DOUBLE && c == S_DOT)
{
} else if (type < IS_DOUBLE && c == S_DOT) {
type = IS_DOUBLE;
smart_str_appendc(&buf, b);
}
else if (type < IS_STRING && c == S_QUO)
{
} else if (type < IS_STRING && c == S_QUO) {
type = IS_STRING;
}
else if (type < IS_BOOL && ((the_state == 12 && s == 9) || (the_state == 16 && s == 9)))
{
} else if (type < IS_BOOL && ((the_state == 12 && s == 9) || (the_state == 16 && s == 9))) {
type = IS_BOOL;
}
else if (type < IS_NULL && the_state == 19 && s == 9)
{
} else if (type < IS_NULL && the_state == 19 && s == 9) {
type = IS_NULL;
}
else if (type != IS_STRING && c > S_WSP)
{
} else if (type != IS_STRING && c > S_WSP) {
utf16_to_utf8(&buf, b);
}
@ -774,10 +698,9 @@ JSON_parser(zval *z, unsigned short p[], int length, int assoc TSRMLS_DC)
}
FREE_BUFFERS();
return the_state == 9 && pop(&the_json, z, MODE_DONE);
}
/* }}} */
/*
* Local variables:

View file

@ -55,8 +55,7 @@ SOFTWARE.
/*
Get the next byte. It returns UTF8_END if there are no more bytes.
*/
static int
get(json_utf8_decode *utf8)
static int get(json_utf8_decode *utf8) /* {{{ */
{
int c;
if (utf8->the_index >= utf8->the_length) {
@ -66,25 +65,23 @@ get(json_utf8_decode *utf8)
utf8->the_index += 1;
return c;
}
/* }}} */
/*
Get the 6-bit payload of the next continuation byte.
Return UTF8_ERROR if it is not a continuation byte.
*/
static int
cont(json_utf8_decode *utf8)
static int cont(json_utf8_decode *utf8) /* {{{ */
{
int c = get(utf8);
return ((c & 0xC0) == 0x80) ? (c & 0x3F) : UTF8_ERROR;
}
/* }}} */
/*
Initialize the UTF-8 decoder. The decoder is not reentrant.
*/
void
utf8_decode_init(json_utf8_decode *utf8, char p[], int length)
void utf8_decode_init(json_utf8_decode *utf8, char p[], int length) /* {{{ */
{
utf8->the_index = 0;
utf8->the_input = p;
@ -92,28 +89,26 @@ utf8_decode_init(json_utf8_decode *utf8, char p[], int length)
utf8->the_char = 0;
utf8->the_byte = 0;
}
/* }}} */
/*
Get the current byte offset. This is generally used in error reporting.
*/
int
utf8_decode_at_byte(json_utf8_decode *utf8)
int utf8_decode_at_byte(json_utf8_decode *utf8) /* {{{ */
{
return utf8->the_byte;
}
/* }}} */
/*
Get the current character offset. This is generally used in error reporting.
The character offset matches the byte offset if the text is strictly ASCII.
*/
int
utf8_decode_at_character(json_utf8_decode *utf8)
int utf8_decode_at_character(json_utf8_decode *utf8) /* {{{ */
{
return utf8->the_char > 0 ? utf8->the_char - 1 : 0;
}
/* }}} */
/*
Extract the next character.
@ -121,8 +116,7 @@ utf8_decode_at_character(json_utf8_decode *utf8)
or UTF8_END (the end)
or UTF8_ERROR (error)
*/
int
utf8_decode_next(json_utf8_decode *utf8)
int utf8_decode_next(json_utf8_decode *utf8) /* {{{ */
{
int c; /* the first byte of the character */
int r; /* the result */
@ -177,3 +171,13 @@ utf8_decode_next(json_utf8_decode *utf8)
}
return UTF8_ERROR;
}
/* }}} */
/*
* Local variables:
* tab-width: 4
* c-basic-offset: 4
* End:
* vim600: noet sw=4 ts=4
* vim<600: noet sw=4 ts=4
*/

View file

@ -29,8 +29,7 @@ SOFTWARE.
#include "utf8_to_utf16.h"
#include "utf8_decode.h"
int
utf8_to_utf16(unsigned short w[], char p[], int length)
int utf8_to_utf16(unsigned short w[], char p[], int length) /* {{{ */
{
int c;
int the_index = 0;
@ -54,3 +53,13 @@ utf8_to_utf16(unsigned short w[], char p[], int length)
}
}
}
/* }}} */
/*
* Local variables:
* tab-width: 4
* c-basic-offset: 4
* End:
* vim600: noet sw=4 ts=4
* vim<600: noet sw=4 ts=4
*/