[ruby/prism] Use MatchWriteNode on split InterpolatedREN

ee54244800
2025-08-27 15:06:10 +02:00 · 2023-10-23 14:31:30 -04:00 · 2023-10-23 14:31:30 -04:00 · a8af5d3808
commit a8af5d3808
parent 9c5b084c0a
4 changed files with 176 additions and 72 deletions
--- a/prism/prism.c
+++ b/prism/prism.c
@ -14570,6 +14570,50 @@ parse_call_operator_write(pm_parser_t *parser, pm_call_node_t *call_node, const
    }
 }
 // Potentially change a =~ with a regular expression with named captures into a
 // match write node.
 //
 // parser  - the parser that the new locals are added to.
 // content - the regular expression content that is scanned for named capture
 //           groups.
 // call    - the call node for the =~ expression.
 //
 // Returns a match write node wrapping `call` (with one local per named
 // capture appended to its locals list) when the content was scanned
 // successfully and at least one named capture was found. Otherwise returns
 // `call` itself, cast to a generic node.
 static pm_node_t *
 parse_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t *content, pm_call_node_t *call) {
    pm_string_list_t named_captures;
    pm_string_list_init(&named_captures);
    pm_node_t *result;
    if (pm_regexp_named_capture_group_names(pm_string_source(content), pm_string_length(content), &named_captures, parser->encoding_changed, &parser->encoding) && (named_captures.length > 0)) {
        pm_match_write_node_t *match = pm_match_write_node_create(parser, call);
        for (size_t index = 0; index < named_captures.length; index++) {
            pm_string_t *name = &named_captures.strings[index];
            pm_constant_id_t local;
            if (content->type == PM_STRING_SHARED) {
                // If the unescaped string is a slice of the source,
                // then we can copy the names directly. The pointers
                // will line up.
                local = pm_parser_local_add_location(parser, name->source, name->source + name->length);
            } else {
                // Otherwise, the name is a slice of the malloc-ed
                // owned string, in which case we need to copy it
                // out into a new string.
                size_t length = pm_string_length(name);
                void *memory = malloc(length);
                // malloc(0) is allowed to return NULL, so a NULL result is
                // only an allocation failure when the name is non-empty.
                // Abort rather than copy through a NULL pointer.
                if (memory == NULL && length > 0) abort();
                if (length > 0) memcpy(memory, pm_string_source(name), length);
                // NOTE(review): presumably the parser takes ownership of
                // `memory` here -- confirm against pm_parser_local_add_owned.
                local = pm_parser_local_add_owned(parser, (const uint8_t *) memory, length);
            }
            pm_constant_id_list_append(&match->locals, local);
        }
        result = (pm_node_t *) match;
    } else {
        result = (pm_node_t *) call;
    }
    // The list of names is released on both paths before returning.
    pm_string_list_free(&named_captures);
    return result;
 }
 static inline pm_node_t *
 parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t previous_binding_power, pm_binding_power_t binding_power) {
    pm_token_t token = parser->current;
@ -14995,42 +15039,51 @@ parse_expression_infix(pm_parser_t *parser, pm_node_t *node, pm_binding_power_t
            // If the receiver of this =~ is a regular expression node, then we
            // need to introduce local variables for it based on its named
            // capture groups.
-            if (PM_NODE_TYPE_P(node, PM_REGULAR_EXPRESSION_NODE)) {
+            if (PM_NODE_TYPE_P(node, PM_INTERPOLATED_REGULAR_EXPRESSION_NODE)) {
-                pm_string_list_t named_captures;
+                // It's possible to have an interpolated regular expression node
-                pm_string_list_init(&named_captures);
+                // that only contains strings. This is because it can be split
                // up by a heredoc. In this case we need to concat the unescaped
                // strings together and then parse them as a regular expression.
                pm_node_list_t *parts = &((pm_interpolated_regular_expression_node_t *) node)->parts;
-                const pm_string_t *unescaped = &((pm_regular_expression_node_t *) node)->unescaped;
+                bool interpolated = false;
-                if (pm_regexp_named_capture_group_names(pm_string_source(unescaped), pm_string_length(unescaped), &named_captures, parser->encoding_changed, &parser->encoding) && (named_captures.length > 0)) {
+                size_t total_length = 0;
                    pm_match_write_node_t *match = pm_match_write_node_create(parser, call);
-                    for (size_t index = 0; index < named_captures.length; index++) {
+                for (size_t index = 0; index < parts->size; index++) {
-                        pm_string_t *name = &named_captures.strings[index];
+                    pm_node_t *part = parts->nodes[index];
                        pm_constant_id_t local;
-                        if (unescaped->type == PM_STRING_SHARED) {
+                    if (PM_NODE_TYPE_P(part, PM_STRING_NODE)) {
-                            // If the unescaped string is a slice of the source,
+                        total_length += pm_string_length(&((pm_string_node_t *) part)->unescaped);
-                            // then we can copy the names directly. The pointers
+                    } else {
-                            // will line up.
+                        interpolated = true;
-                            local = pm_parser_local_add_location(parser, name->source, name->source + name->length);
+                        break;
                        } else {
                            // Otherwise, the name is a slice of the malloc-ed
                            // owned string, in which case we need to copy it
                            // out into a new string.
                            size_t length = pm_string_length(name);
                            void *memory = malloc(length);
                            memcpy(memory, pm_string_source(name), length);
                            local = pm_parser_local_add_owned(parser, (const uint8_t *) memory, length);
                        }
                        pm_constant_id_list_append(&match->locals, local);
                    }
                    result = (pm_node_t *) match;
                }
-                pm_string_list_free(&named_captures);
+                if (!interpolated) {
                    void *memory = malloc(total_length);
                    if (!memory) abort();
                    uint8_t *cursor = memory;
                    for (size_t index = 0; index < parts->size; index++) {
                        pm_string_t *unescaped = &((pm_string_node_t *) parts->nodes[index])->unescaped;
                        size_t length = pm_string_length(unescaped);
                        memcpy(cursor, pm_string_source(unescaped), length);
                        cursor += length;
                    }
                    pm_string_t owned;
                    pm_string_owned_init(&owned, (uint8_t *) memory, total_length);
                    result = parse_regular_expression_named_captures(parser, &owned, call);
                    pm_string_free(&owned);
                }
            } else if (PM_NODE_TYPE_P(node, PM_REGULAR_EXPRESSION_NODE)) {
                // If we have a regular expression node, then we can just parse
                // the named captures directly off the unescaped string.
                const pm_string_t *content = &((pm_regular_expression_node_t *) node)->unescaped;
                result = parse_regular_expression_named_captures(parser, content, call);
            }
            return result;
--- a/prism/regexp.c
+++ b/prism/regexp.c
@ -188,6 +188,8 @@ pm_regexp_parse_range_quantifier(pm_regexp_parser_t *parser) {
 //            ;
 static bool
 pm_regexp_parse_quantifier(pm_regexp_parser_t *parser) {
    if (pm_regexp_char_is_eof(parser)) return true;
    switch (*parser->cursor) {
        case '*':
        case '+':
--- a/test/prism/fixtures/spanning_heredoc.txt
+++ b/test/prism/fixtures/spanning_heredoc.txt
@ -49,3 +49,7 @@ pp <<-A, %I[p\
 o
 A
 p]
 <<A; /\
 A
 (?<a>)/ =~ ''
--- a/test/prism/snapshots/spanning_heredoc.txt
+++ b/test/prism/snapshots/spanning_heredoc.txt
@ -1,8 +1,8 @@
-@ ProgramNode (location: (4,0)-(51,2))
+@ ProgramNode (location: (4,0)-(55,13))
-├── locals: []
+├── locals: [:a]
 └── statements:
-    @ StatementsNode (location: (4,0)-(51,2))
+    @ StatementsNode (location: (4,0)-(55,13))
-    └── body: (length: 8)
+    └── body: (length: 10)
        ├── @ CallNode (location: (4,0)-(7,7))
        │   ├── receiver: ∅
        │   ├── call_operator_loc: ∅
@ -270,41 +270,86 @@
        │   ├── block: ∅
        │   ├── flags: ∅
        │   └── name: :pp
-        └── @ CallNode (location: (48,0)-(51,2))
+        ├── @ CallNode (location: (48,0)-(51,2))
-            ├── receiver: ∅
+        │   ├── receiver: ∅
-            ├── call_operator_loc: ∅
+        │   ├── call_operator_loc: ∅
-            ├── message_loc: (48,0)-(48,2) = "pp"
+        │   ├── message_loc: (48,0)-(48,2) = "pp"
-            ├── opening_loc: ∅
+        │   ├── opening_loc: ∅
-            ├── arguments:
+        │   ├── arguments:
-            │   @ ArgumentsNode (location: (48,3)-(51,2))
+        │   │   @ ArgumentsNode (location: (48,3)-(51,2))
-            │   ├── arguments: (length: 2)
+        │   │   ├── arguments: (length: 2)
-            │   │   ├── @ StringNode (location: (48,3)-(48,7))
+        │   │   │   ├── @ StringNode (location: (48,3)-(48,7))
-            │   │   │   ├── flags: ∅
+        │   │   │   │   ├── flags: ∅
-            │   │   │   ├── opening_loc: (48,3)-(48,7) = "<<-A"
+        │   │   │   │   ├── opening_loc: (48,3)-(48,7) = "<<-A"
-            │   │   │   ├── content_loc: (49,0)-(50,0) = "o\n"
+        │   │   │   │   ├── content_loc: (49,0)-(50,0) = "o\n"
-            │   │   │   ├── closing_loc: (50,0)-(51,0) = "A\n"
+        │   │   │   │   ├── closing_loc: (50,0)-(51,0) = "A\n"
-            │   │   │   └── unescaped: "o\n"
+        │   │   │   │   └── unescaped: "o\n"
-            │   │   └── @ ArrayNode (location: (48,9)-(51,2))
+        │   │   │   └── @ ArrayNode (location: (48,9)-(51,2))
-            │   │       ├── elements: (length: 1)
+        │   │   │       ├── elements: (length: 1)
-            │   │       │   └── @ InterpolatedSymbolNode (location: (48,12)-(48,14))
+        │   │   │       │   └── @ InterpolatedSymbolNode (location: (48,12)-(48,14))
-            │   │       │       ├── opening_loc: ∅
+        │   │   │       │       ├── opening_loc: ∅
-            │   │       │       ├── parts: (length: 2)
+        │   │   │       │       ├── parts: (length: 2)
-            │   │       │       │   ├── @ SymbolNode (location: (48,12)-(48,14))
+        │   │   │       │       │   ├── @ SymbolNode (location: (48,12)-(48,14))
-            │   │       │       │   │   ├── opening_loc: ∅
+        │   │   │       │       │   │   ├── opening_loc: ∅
-            │   │       │       │   │   ├── value_loc: (48,12)-(48,14) = "p\\"
+        │   │   │       │       │   │   ├── value_loc: (48,12)-(48,14) = "p\\"
-            │   │       │       │   │   ├── closing_loc: ∅
+        │   │   │       │       │   │   ├── closing_loc: ∅
-            │   │       │       │   │   └── unescaped: "p\n"
+        │   │   │       │       │   │   └── unescaped: "p\n"
-            │   │       │       │   └── @ StringNode (location: (48,12)-(48,14))
+        │   │   │       │       │   └── @ StringNode (location: (48,12)-(48,14))
-            │   │       │       │       ├── flags: ∅
+        │   │   │       │       │       ├── flags: ∅
-            │   │       │       │       ├── opening_loc: ∅
+        │   │   │       │       │       ├── opening_loc: ∅
-            │   │       │       │       ├── content_loc: (48,12)-(48,14) = "p\\"
+        │   │   │       │       │       ├── content_loc: (48,12)-(48,14) = "p\\"
-            │   │       │       │       ├── closing_loc: ∅
+        │   │   │       │       │       ├── closing_loc: ∅
-            │   │       │       │       └── unescaped: "p"
+        │   │   │       │       │       └── unescaped: "p"
-            │   │       │       └── closing_loc: ∅
+        │   │   │       │       └── closing_loc: ∅
-            │   │       ├── opening_loc: (48,9)-(48,12) = "%I["
+        │   │   │       ├── opening_loc: (48,9)-(48,12) = "%I["
-            │   │       └── closing_loc: (51,1)-(51,2) = "]"
+        │   │   │       └── closing_loc: (51,1)-(51,2) = "]"
-            │   └── flags: ∅
+        │   │   └── flags: ∅
-            ├── closing_loc: ∅
+        │   ├── closing_loc: ∅
-            ├── block: ∅
+        │   ├── block: ∅
-            ├── flags: ∅
+        │   ├── flags: ∅
-            └── name: :pp
+        │   └── name: :pp
        ├── @ StringNode (location: (53,0)-(53,3))
        │   ├── flags: ∅
        │   ├── opening_loc: (53,0)-(53,3) = "<<A"
        │   ├── content_loc: (54,0)-(54,0) = ""
        │   ├── closing_loc: (54,0)-(55,0) = "A\n"
        │   └── unescaped: ""
        └── @ MatchWriteNode (location: (53,5)-(55,13))
            ├── call:
            │   @ CallNode (location: (53,5)-(55,13))
            │   ├── receiver:
            │   │   @ InterpolatedRegularExpressionNode (location: (53,5)-(55,7))
            │   │   ├── opening_loc: (53,5)-(53,6) = "/"
            │   │   ├── parts: (length: 2)
            │   │   │   ├── @ StringNode (location: (53,6)-(53,7))
            │   │   │   │   ├── flags: ∅
            │   │   │   │   ├── opening_loc: ∅
            │   │   │   │   ├── content_loc: (53,6)-(53,7) = "\\"
            │   │   │   │   ├── closing_loc: ∅
            │   │   │   │   └── unescaped: ""
            │   │   │   └── @ StringNode (location: (55,0)-(55,6))
            │   │   │       ├── flags: ∅
            │   │   │       ├── opening_loc: ∅
            │   │   │       ├── content_loc: (55,0)-(55,6) = "(?<a>)"
            │   │   │       ├── closing_loc: ∅
            │   │   │       └── unescaped: "(?<a>)"
            │   │   ├── closing_loc: (55,6)-(55,7) = "/"
            │   │   └── flags: ∅
            │   ├── call_operator_loc: ∅
            │   ├── message_loc: (55,8)-(55,10) = "=~"
            │   ├── opening_loc: ∅
            │   ├── arguments:
            │   │   @ ArgumentsNode (location: (55,11)-(55,13))
            │   │   ├── arguments: (length: 1)
            │   │   │   └── @ StringNode (location: (55,11)-(55,13))
            │   │   │       ├── flags: ∅
            │   │   │       ├── opening_loc: (55,11)-(55,12) = "'"
            │   │   │       ├── content_loc: (55,12)-(55,12) = ""
            │   │   │       ├── closing_loc: (55,12)-(55,13) = "'"
            │   │   │       └── unescaped: ""
            │   │   └── flags: ∅
            │   ├── closing_loc: ∅
            │   ├── block: ∅
            │   ├── flags: ∅
            │   └── name: :=~
            └── locals: [:a]