diff --git a/benchmark/encoder.rb b/benchmark/encoder.rb index a50fbb06db..bd0105ae38 100644 --- a/benchmark/encoder.rb +++ b/benchmark/encoder.rb @@ -59,6 +59,9 @@ end benchmark_encoding "small nested array", [[1,2,3,4,5]]*10 benchmark_encoding "small hash", { "username" => "jhawthorn", "id" => 123, "event" => "wrote json serializer" } +# On this one we're a bit slower (~25%). +benchmark_encoding "mostly utf8", ([("€" * 3333)] * 2000), except: %i(json_state) + # On these three benchmarks we perform well. Either on par or very closely faster/slower benchmark_encoding "mixed utf8", ([("a" * 5000) + "€" + ("a" * 5000)] * 2000), except: %i(json_state) benchmark_encoding "twitter.json", JSON.load_file("#{__dir__}/data/twitter.json"), except: %i(json_state) diff --git a/ext/json/generator/generator.c b/ext/json/generator/generator.c index ed50f43aec..f784e3b4b2 100644 --- a/ext/json/generator/generator.c +++ b/ext/json/generator/generator.c @@ -25,7 +25,7 @@ static ID i_to_s, i_to_json, i_new, i_pack, i_unpack, i_create_id, i_extend; * Everything else (should be UTF-8) is just passed through and * appended to the result. */ -static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const bool escape_table[256], bool out_script_safe) +static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256], bool out_script_safe) { const char *hexdig = "0123456789abcdef"; char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' }; @@ -33,57 +33,61 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const bool esca const char *ptr = RSTRING_PTR(str); unsigned long len = RSTRING_LEN(str); - unsigned long beg = 0, pos; + unsigned long beg = 0, pos = 0; - for (pos = 0; pos < len;) { - unsigned char ch = ptr[pos]; - /* JSON encoding */ - if (escape_table[ch]) { #define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos; - switch (ch) { - case '"': FLUSH_POS(1); fbuffer_append(out_buffer, "\\\"", 2); break; - case '\\': FLUSH_POS(1); fbuffer_append(out_buffer, "\\\\", 2); break; - case '/': FLUSH_POS(1); fbuffer_append(out_buffer, "\\/", 2); break; - case '\b': FLUSH_POS(1); fbuffer_append(out_buffer, "\\b", 2); break; - case '\f': FLUSH_POS(1); fbuffer_append(out_buffer, "\\f", 2); break; - case '\n': FLUSH_POS(1); fbuffer_append(out_buffer, "\\n", 2); break; - case '\r': FLUSH_POS(1); fbuffer_append(out_buffer, "\\r", 2); break; - case '\t': FLUSH_POS(1); fbuffer_append(out_buffer, "\\t", 2); break; - default: { - if ((ch & 0x80) == 0x00) { /* leading 1 bit is 0b0 */ - FLUSH_POS(1); - scratch[2] = hexdig[ch >> 12]; - scratch[3] = hexdig[(ch >> 8) & 0xf]; - scratch[4] = hexdig[(ch >> 4) & 0xf]; - scratch[5] = hexdig[ch & 0xf]; - fbuffer_append(out_buffer, scratch, 6); - } else if ((ch & 0xE0) == 0xC0) { /* leading 3 bits are 0b110 */ - pos += 2; - } else if ((ch & 0xF0) == 0xE0) { /* leading 4 bits are 0b1110 */ - unsigned char b2 = ptr[pos + 1]; - unsigned char b3 = ptr[pos + 2]; - if (out_script_safe && (b2 == 0x80)) { - if (b3 == 0xA8) { - FLUSH_POS(3); - fprintf(stderr, "escape: \\u2028 pos = %ld\n", pos); - fbuffer_append(out_buffer, "\\u2028", 6); - } else if (b3 == 0xA9) { - FLUSH_POS(3); - fprintf(stderr, "escape: \\u2029 pos = %ld\n", pos); - fbuffer_append(out_buffer, "\\u2029", 6); - } else { - pos += 3; - } - } else { - pos += 3; + + while (pos < len) { + unsigned char ch = ptr[pos]; + unsigned char ch_len = escape_table[ch]; + /* JSON encoding */ + + if (RB_UNLIKELY(ch_len)) { + switch (ch_len) { + case 0: + pos++; + break; + case 1: { + FLUSH_POS(1); + switch (ch) { + case '"': fbuffer_append(out_buffer, "\\\"", 2); break; + case '\\': fbuffer_append(out_buffer, "\\\\", 2); break; + case '/': fbuffer_append(out_buffer, "\\/", 2); break; + case '\b': fbuffer_append(out_buffer, "\\b", 2); break; + case '\f': fbuffer_append(out_buffer, "\\f", 2); break; + case '\n': fbuffer_append(out_buffer, "\\n", 2); break; + case '\r': fbuffer_append(out_buffer, "\\r", 2); break; + case '\t': fbuffer_append(out_buffer, "\\t", 2); break; + default: { + scratch[2] = hexdig[ch >> 12]; + scratch[3] = hexdig[(ch >> 8) & 0xf]; + scratch[4] = hexdig[(ch >> 4) & 0xf]; + scratch[5] = hexdig[ch & 0xf]; + fbuffer_append(out_buffer, scratch, 6); + break; } - } else if ((ch & 0xF8) == 0xF0) { /* leading 5 bits are 0b11110 */ - pos += 4; - } else { - // This should be unreachable - rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8"); } + break; } + case 3: { + unsigned char b2 = ptr[pos + 1]; + if (out_script_safe && b2 == 0x80) { + unsigned char b3 = ptr[pos + 2]; + if (b3 == 0xA8) { + FLUSH_POS(3); + fbuffer_append(out_buffer, "\\u2028", 6); + break; + } else if (b3 == 0xA9) { + FLUSH_POS(3); + fbuffer_append(out_buffer, "\\u2029", 6); + break; + } + } + // fallthrough + } + default: + pos += ch_len; + break; } } else { pos++; @@ -98,29 +102,57 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const bool esca RB_GC_GUARD(str); } -static const bool escape_table[256] = { - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* '"' */ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, /* '\\' */ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +static const char escape_table[256] = { + // ASCII Control Characters + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + // ASCII Characters + 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, // '"' + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\' + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + // Continuation byte + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + // First byte of a 2-byte code point + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + // First byte of a 4-byte code point + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + //First byte of a 4+byte code point + 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1, }; -static const bool script_safe_escape_table[256] = { - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* '"' and '/' */ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, /* '\\' */ - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +static const char script_safe_escape_table[256] = { + // ASCII Control Characters + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + // ASCII Characters + 0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1, // '"' and '/' + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\' + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + // Continuation byte + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + // First byte of a 2-byte code point + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + // First byte of a 4-byte code point + 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, + //First byte of a 4+byte code point + 4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1, }; -static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, const bool escape_table[256]) +static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256]) { const char *hexdig = "0123456789abcdef"; char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };