[ruby/json] convert_UTF8_to_JSON: repurpose the escape tables into size tables

Since we're looking up the table anyway, we might as well store the UTF-8 char length in it. For single byte characters that don't need escaping we store `0`. This helps on strings with lots of multi-byte characters: Before: ``` == Encoding mostly utf8 (20004001 bytes) ruby 3.3.4 (2024-07-09 revision be1089c8ec) +YJIT [arm64-darwin23] Warming up -------------------------------------- json 6.000 i/100ms oj 10.000 i/100ms rapidjson 2.000 i/100ms Calculating ------------------------------------- json 67.978 (± 1.5%) i/s (14.71 ms/i) - 342.000 in 5.033062s oj 100.876 (± 2.0%) i/s (9.91 ms/i) - 510.000 in 5.058080s rapidjson 26.389 (± 7.6%) i/s (37.89 ms/i) - 132.000 in 5.027681s Comparison: json: 68.0 i/s oj: 100.9 i/s - 1.48x faster rapidjson: 26.4 i/s - 2.58x slower ``` After: ``` == Encoding mostly utf8 (20004001 bytes) ruby 3.3.4 (2024-07-09 revision be1089c8ec) +YJIT [arm64-darwin23] Warming up -------------------------------------- json 7.000 i/100ms oj 10.000 i/100ms rapidjson 2.000 i/100ms Calculating ------------------------------------- json 75.187 (± 2.7%) i/s (13.30 ms/i) - 378.000 in 5.030111s oj 95.196 (± 2.1%) i/s (10.50 ms/i) - 480.000 in 5.043565s rapidjson 25.969 (± 3.9%) i/s (38.51 ms/i) - 130.000 in 5.011471s Comparison: json: 75.2 i/s oj: 95.2 i/s - 1.27x faster rapidjson: 26.0 i/s - 2.90x slower ``` 51e2631d1f
2025-08-15 13:39:04 +02:00 · 2024-10-18 20:47:28 +02:00 · 2024-10-18 20:47:28 +02:00 · 97713ac952
commit 97713ac952
parent 9f300d0541
2 changed files with 101 additions and 66 deletions
--- a/benchmark/encoder.rb
+++ b/benchmark/encoder.rb
@@ -59,6 +59,9 @@ end
 benchmark_encoding "small nested array", [[1,2,3,4,5]]*10
 benchmark_encoding "small hash", { "username" => "jhawthorn", "id" => 123, "event" => "wrote json serializer" }

+# On this one we're a bit slower (~25%).
+benchmark_encoding "mostly utf8", ([("€" * 3333)] * 2000), except: %i(json_state)
+
 # On these three benchmarks we perform well. Either on par or very closely faster/slower
 benchmark_encoding "mixed utf8", ([("a" * 5000) + "€" + ("a" * 5000)] * 2000), except: %i(json_state)
 benchmark_encoding "twitter.json", JSON.load_file("#{__dir__}/data/twitter.json"), except: %i(json_state)
--- a/ext/json/generator/generator.c
+++ b/ext/json/generator/generator.c
@@ -25,7 +25,7 @@ static ID i_to_s, i_to_json, i_new, i_pack, i_unpack, i_create_id, i_extend;
 * Everything else (should be UTF-8) is just passed through and
 * appended to the result.
 */
-static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const bool escape_table[256], bool out_script_safe)
+static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256], bool out_script_safe)
 {
    const char *hexdig = "0123456789abcdef";
    char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
@@ -33,57 +33,61 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const bool esca
    const char *ptr = RSTRING_PTR(str);
    unsigned long len = RSTRING_LEN(str);

-    unsigned long beg = 0, pos;
+    unsigned long beg = 0, pos = 0;

-    for (pos = 0; pos < len;) {
-        unsigned char ch = ptr[pos];
-        /* JSON encoding */
-        if (escape_table[ch]) {
 #define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos;
-            switch (ch) {
-                case '"':  FLUSH_POS(1); fbuffer_append(out_buffer, "\\\"", 2); break;
-                case '\\': FLUSH_POS(1); fbuffer_append(out_buffer, "\\\\", 2); break;
-                case '/':  FLUSH_POS(1); fbuffer_append(out_buffer, "\\/", 2); break;
-                case '\b': FLUSH_POS(1); fbuffer_append(out_buffer, "\\b", 2); break;
-                case '\f': FLUSH_POS(1); fbuffer_append(out_buffer, "\\f", 2); break;
-                case '\n': FLUSH_POS(1); fbuffer_append(out_buffer, "\\n", 2); break;
-                case '\r': FLUSH_POS(1); fbuffer_append(out_buffer, "\\r", 2); break;
-                case '\t': FLUSH_POS(1); fbuffer_append(out_buffer, "\\t", 2); break;
-                default: {
-                    if ((ch & 0x80) == 0x00) { /* leading 1 bit is   0b0     */
-                        FLUSH_POS(1);
-                        scratch[2] = hexdig[ch >> 12];
-                        scratch[3] = hexdig[(ch >> 8) & 0xf];
-                        scratch[4] = hexdig[(ch >> 4) & 0xf];
-                        scratch[5] = hexdig[ch & 0xf];
-                        fbuffer_append(out_buffer, scratch, 6);
-                    } else if ((ch & 0xE0) == 0xC0) { /* leading 3 bits are 0b110   */
-                        pos += 2;
-                    } else if ((ch & 0xF0) == 0xE0) { /* leading 4 bits are 0b1110  */
-                        unsigned char b2 = ptr[pos + 1];
-                        unsigned char b3 = ptr[pos + 2];
-                        if (out_script_safe && (b2 == 0x80)) {
-                            if (b3 == 0xA8) {
-                                FLUSH_POS(3);
-                                fprintf(stderr, "escape: \\u2028 pos = %ld\n", pos);
-                                fbuffer_append(out_buffer, "\\u2028", 6);
-                            } else if (b3 == 0xA9) {
-                                FLUSH_POS(3);
-                                fprintf(stderr, "escape: \\u2029 pos = %ld\n", pos);
-                                fbuffer_append(out_buffer, "\\u2029", 6);
-                            } else {
-                                pos += 3;
-                            }
-                        } else {
-                            pos += 3;
+
+    while (pos < len) {
+        unsigned char ch = ptr[pos];
+        unsigned char ch_len = escape_table[ch];
+        /* JSON encoding */
+
+        if (RB_UNLIKELY(ch_len)) {
+            switch (ch_len) {
+                case 0:
+                    pos++;
+                    break;
+                case 1: {
+                    FLUSH_POS(1);
+                    switch (ch) {
+                        case '"':  fbuffer_append(out_buffer, "\\\"", 2); break;
+                        case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
+                        case '/':  fbuffer_append(out_buffer, "\\/", 2); break;
+                        case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
+                        case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
+                        case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
+                        case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
+                        case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
+                        default: {
+                            scratch[2] = hexdig[ch >> 12];
+                            scratch[3] = hexdig[(ch >> 8) & 0xf];
+                            scratch[4] = hexdig[(ch >> 4) & 0xf];
+                            scratch[5] = hexdig[ch & 0xf];
+                            fbuffer_append(out_buffer, scratch, 6);
+                            break;
                        }
-                    } else if ((ch & 0xF8) == 0xF0) { /* leading 5 bits are 0b11110 */
-                        pos += 4;
-                    } else {
-                        // This should be unreachable
-                        rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8");
                    }
+                    break;
                }
+                case 3: {
+                    unsigned char b2 = ptr[pos + 1];
+                    if (out_script_safe && b2 == 0x80) {
+                        unsigned char b3 = ptr[pos + 2];
+                        if (b3 == 0xA8) {
+                            FLUSH_POS(3);
+                            fbuffer_append(out_buffer, "\\u2028", 6);
+                            break;
+                        } else if (b3 == 0xA9) {
+                            FLUSH_POS(3);
+                            fbuffer_append(out_buffer, "\\u2029", 6);
+                            break;
+                        }
+                    }
+                    // fallthrough
+                }
+                default:
+                    pos += ch_len;
+                    break;
            }
        } else {
            pos++;
@@ -98,29 +102,57 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const bool esca
    RB_GC_GUARD(str);
 }

-static const bool escape_table[256] = {
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* '"'  */
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, /* '\\' */
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+static const char escape_table[256] = {
+    // ASCII Control Characters
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    // ASCII Characters
+    0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, // '"'
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\'
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    // Continuation byte
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    // First byte of a 2-byte code point
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    // First byte of a 3-byte code point
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+    // First byte of a 4+ byte code point (0xFE and 0xFF are invalid in UTF-8)
+    4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
 };

-static const bool script_safe_escape_table[256] = {
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* '"' and '/' */
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, /* '\\' */
-    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
-    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1
+static const char script_safe_escape_table[256] = {
+    // ASCII Control Characters
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    // ASCII Characters
+    0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1, // '"' and '/'
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, // '\\'
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    // Continuation byte
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    // First byte of a 2-byte code point
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
+    // First byte of a 3-byte code point
+    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
+    // First byte of a 4+ byte code point (0xFE and 0xFF are invalid in UTF-8)
+    4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1,
 };

-static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, const bool escape_table[256])
+static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, const char escape_table[256])
 {
    const char *hexdig = "0123456789abcdef";
    char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };