[ruby/json] convert_UTF8_to_JSON: repurpose the escape tables into size tables

Since we're looking up the table anyway, we might as well store the
UTF-8 char length in it. For single byte characters that don't need
escaping we store `0`.

This helps on strings with lots of multi-byte characters:

Before:

```
== Encoding mostly utf8 (20004001 bytes)
ruby 3.3.4 (2024-07-09 revision be1089c8ec) +YJIT [arm64-darwin23]
Warming up --------------------------------------
                json     6.000 i/100ms
                  oj    10.000 i/100ms
           rapidjson     2.000 i/100ms
Calculating -------------------------------------
                json     67.978 (± 1.5%) i/s   (14.71 ms/i) -    342.000 in   5.033062s
                  oj    100.876 (± 2.0%) i/s    (9.91 ms/i) -    510.000 in   5.058080s
           rapidjson     26.389 (± 7.6%) i/s   (37.89 ms/i) -    132.000 in   5.027681s

Comparison:
                json:       68.0 i/s
                  oj:      100.9 i/s - 1.48x  faster
           rapidjson:       26.4 i/s - 2.58x  slower
```

After:

```
== Encoding mostly utf8 (20004001 bytes)
ruby 3.3.4 (2024-07-09 revision be1089c8ec) +YJIT [arm64-darwin23]
Warming up --------------------------------------
                json     7.000 i/100ms
                  oj    10.000 i/100ms
           rapidjson     2.000 i/100ms
Calculating -------------------------------------
                json     75.187 (± 2.7%) i/s   (13.30 ms/i) -    378.000 in   5.030111s
                  oj     95.196 (± 2.1%) i/s   (10.50 ms/i) -    480.000 in   5.043565s
           rapidjson     25.969 (± 3.9%) i/s   (38.51 ms/i) -    130.000 in   5.011471s

Comparison:
                json:       75.2 i/s
                  oj:       95.2 i/s - 1.27x  faster
           rapidjson:       26.0 i/s - 2.90x  slower
```

51e2631d1f
This commit is contained in:
Jean Boussier 2024-10-18 20:47:28 +02:00 committed by Hiroshi SHIBATA
parent 9f300d0541
commit 97713ac952
2 changed files with 101 additions and 66 deletions

View file

@ -59,6 +59,9 @@ end
benchmark_encoding "small nested array", [[1,2,3,4,5]]*10
benchmark_encoding "small hash", { "username" => "jhawthorn", "id" => 123, "event" => "wrote json serializer" }
# On this one we're a bit slower (~25%).
benchmark_encoding "mostly utf8", ([("" * 3333)] * 2000), except: %i(json_state)
# On these three benchmarks we perform well. Either on par or very closely faster/slower
benchmark_encoding "mixed utf8", ([("a" * 5000) + "" + ("a" * 5000)] * 2000), except: %i(json_state)
benchmark_encoding "twitter.json", JSON.load_file("#{__dir__}/data/twitter.json"), except: %i(json_state)

View file

@ -25,7 +25,7 @@ static ID i_to_s, i_to_json, i_new, i_pack, i_unpack, i_create_id, i_extend;
* Everything else (should be UTF-8) is just passed through and
* appended to the result.
*/
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const bool escape_table[256], bool out_script_safe)
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256], bool out_script_safe)
{
const char *hexdig = "0123456789abcdef";
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
@ -33,57 +33,61 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const bool esca
const char *ptr = RSTRING_PTR(str);
unsigned long len = RSTRING_LEN(str);
unsigned long beg = 0, pos;
unsigned long beg = 0, pos = 0;
for (pos = 0; pos < len;) {
unsigned char ch = ptr[pos];
/* JSON encoding */
if (escape_table[ch]) {
#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos;
switch (ch) {
case '"': FLUSH_POS(1); fbuffer_append(out_buffer, "\\\"", 2); break;
case '\\': FLUSH_POS(1); fbuffer_append(out_buffer, "\\\\", 2); break;
case '/': FLUSH_POS(1); fbuffer_append(out_buffer, "\\/", 2); break;
case '\b': FLUSH_POS(1); fbuffer_append(out_buffer, "\\b", 2); break;
case '\f': FLUSH_POS(1); fbuffer_append(out_buffer, "\\f", 2); break;
case '\n': FLUSH_POS(1); fbuffer_append(out_buffer, "\\n", 2); break;
case '\r': FLUSH_POS(1); fbuffer_append(out_buffer, "\\r", 2); break;
case '\t': FLUSH_POS(1); fbuffer_append(out_buffer, "\\t", 2); break;
default: {
if ((ch & 0x80) == 0x00) { /* leading 1 bit is 0b0 */
FLUSH_POS(1);
scratch[2] = hexdig[ch >> 12];
scratch[3] = hexdig[(ch >> 8) & 0xf];
scratch[4] = hexdig[(ch >> 4) & 0xf];
scratch[5] = hexdig[ch & 0xf];
fbuffer_append(out_buffer, scratch, 6);
} else if ((ch & 0xE0) == 0xC0) { /* leading 3 bits are 0b110 */
pos += 2;
} else if ((ch & 0xF0) == 0xE0) { /* leading 4 bits are 0b1110 */
unsigned char b2 = ptr[pos + 1];
unsigned char b3 = ptr[pos + 2];
if (out_script_safe && (b2 == 0x80)) {
if (b3 == 0xA8) {
FLUSH_POS(3);
fprintf(stderr, "escape: \\u2028 pos = %ld\n", pos);
fbuffer_append(out_buffer, "\\u2028", 6);
} else if (b3 == 0xA9) {
FLUSH_POS(3);
fprintf(stderr, "escape: \\u2029 pos = %ld\n", pos);
fbuffer_append(out_buffer, "\\u2029", 6);
} else {
pos += 3;
}
} else {
pos += 3;
while (pos < len) {
unsigned char ch = ptr[pos];
unsigned char ch_len = escape_table[ch];
/* JSON encoding */
if (RB_UNLIKELY(ch_len)) {
switch (ch_len) {
case 0:
pos++;
break;
case 1: {
FLUSH_POS(1);
switch (ch) {
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
case '/': fbuffer_append(out_buffer, "\\/", 2); break;
case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
default: {
scratch[2] = hexdig[ch >> 12];
scratch[3] = hexdig[(ch >> 8) & 0xf];
scratch[4] = hexdig[(ch >> 4) & 0xf];
scratch[5] = hexdig[ch & 0xf];
fbuffer_append(out_buffer, scratch, 6);
break;
}
} else if ((ch & 0xF8) == 0xF0) { /* leading 5 bits are 0b11110 */
pos += 4;
} else {
// This should be unreachable
rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8");
}
break;
}
case 3: {
unsigned char b2 = ptr[pos + 1];
if (out_script_safe && b2 == 0x80) {
unsigned char b3 = ptr[pos + 2];
if (b3 == 0xA8) {
FLUSH_POS(3);
fbuffer_append(out_buffer, "\\u2028", 6);
break;
} else if (b3 == 0xA9) {
FLUSH_POS(3);
fbuffer_append(out_buffer, "\\u2029", 6);
break;
}
}
// fallthrough
}
default:
pos += ch_len;
break;
}
} else {
pos++;
@ -98,29 +102,57 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const bool esca
RB_GC_GUARD(str);
}
static const bool escape_table[256] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* '"' */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, /* '\\' */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
static const char escape_table[256] = {
// ASCII Control Characters
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
// ASCII Characters
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, // '"'
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\'
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
// Continuation byte
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
// First byte of a 2-byte code point
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
// First byte of a 4-byte code point
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
//First byte of a 4+byte code point
4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
};
static const bool script_safe_escape_table[256] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* '"' and '/' */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, /* '\\' */
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
static const char script_safe_escape_table[256] = {
// ASCII Control Characters
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
// ASCII Characters
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1, // '"' and '/'
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\'
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
// Continuation byte
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
// First byte of a 2-byte code point
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
// First byte of a 4-byte code point
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
//First byte of a 4+byte code point
4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
};
static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, const bool escape_table[256])
static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256])
{
const char *hexdig = "0123456789abcdef";
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };