[ruby/prism] Fix up tilde heredoc line continuations

15e74b2f65
2025-09-16 00:54:01 +02:00 · 2024-03-07 14:45:32 -05:00 · 2024-03-07 14:45:32 -05:00 · 76e11595e2
commit 76e11595e2
parent 18ee7c9a10
7 changed files with 72 additions and 21 deletions
--- a/prism/parser.h
+++ b/prism/parser.h
@ -234,6 +234,9 @@ typedef struct pm_lex_mode {
             * a tilde heredoc.
             */
            size_t common_whitespace;
+
+            /** True if the previous token ended with a line continuation. */
+            bool line_continuation;
        } heredoc;
    } as;

--- a/prism/prism.c
+++ b/prism/prism.c
@ -9450,7 +9450,8 @@ parser_lex(pm_parser_t *parser) {
                                        .next_start = parser->current.end,
                                        .quote = quote,
                                        .indent = indent,
-                                        .common_whitespace = (size_t) -1
+                                        .common_whitespace = (size_t) -1,
+                                        .line_continuation = false
                                    }
                                });

@ -10719,6 +10720,9 @@ parser_lex(pm_parser_t *parser) {
            // current lex mode.
            pm_lex_mode_t *lex_mode = parser->lex_modes.current;

+            bool line_continuation = lex_mode->as.heredoc.line_continuation;
+            lex_mode->as.heredoc.line_continuation = false;
+
            // We'll check if we're at the end of the file. If we are, then we
            // will add an error (because we weren't able to find the
            // terminator) but still continue parsing so that content after the
@ -10736,7 +10740,7 @@ parser_lex(pm_parser_t *parser) {

            // If we are immediately following a newline and we have hit the
            // terminator, then we need to return the ending of the heredoc.
-            if (current_token_starts_line(parser)) {
+            if (!line_continuation && current_token_starts_line(parser)) {
                const uint8_t *start = parser->current.start;
                if (start + ident_length <= parser->end) {
                    const uint8_t *newline = next_newline(start, parser->end - start);
@ -10808,7 +10812,7 @@ parser_lex(pm_parser_t *parser) {

            const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
            pm_token_buffer_t token_buffer = { { 0 }, 0 };
-            bool was_escaped_newline = false;
+            bool was_line_continuation = false;

            while (breakpoint != NULL) {
                switch (*breakpoint) {
@ -10831,7 +10835,7 @@ parser_lex(pm_parser_t *parser) {
                        // some leading whitespace.
                        const uint8_t *start = breakpoint + 1;

-                        if (!was_escaped_newline && (start + ident_length <= parser->end)) {
+                        if (!was_line_continuation && (start + ident_length <= parser->end)) {
                            // We want to match the terminator starting from the end of the line in case
                            // there is whitespace in the ident such as <<-'   DOC' or <<~'   DOC'.
                            const uint8_t *newline = next_newline(start, parser->end - start);
@ -10873,7 +10877,6 @@ parser_lex(pm_parser_t *parser) {
                        // heredoc here as string content. Then, the next time a
                        // token is lexed, it will match again and return the
                        // end of the heredoc.
-
                        if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
                            if ((lex_mode->as.heredoc.common_whitespace > whitespace) && peek_at(parser, start) != '\n') {
                                lex_mode->as.heredoc.common_whitespace = whitespace;
@ -10881,7 +10884,7 @@ parser_lex(pm_parser_t *parser) {

                            parser->current.end = breakpoint + 1;

-                            if (!was_escaped_newline) {
+                            if (!was_line_continuation) {
                                pm_token_buffer_flush(parser, &token_buffer);
                                LEX(PM_TOKEN_STRING_CONTENT);
                            }
@ -10943,7 +10946,26 @@ parser_lex(pm_parser_t *parser) {
                                    }
                                /* fallthrough */
                                case '\n':
-                                    was_escaped_newline = true;
+                                    // If we are in a tilde heredoc, we should
+                                    // break out of the loop and return the
+                                    // string content.
+                                    if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
+                                        const uint8_t *end = parser->current.end;
+                                        pm_newline_list_append(&parser->newline_list, end);
+
+                                        // Here we want the buffer to only
+                                        // include up to the backslash.
+                                        parser->current.end = breakpoint;
+                                        pm_token_buffer_flush(parser, &token_buffer);
+
+                                        // Now we can advance the end of the
+                                        // token past the newline.
+                                        parser->current.end = end + 1;
+                                        lex_mode->as.heredoc.line_continuation = true;
+                                        LEX(PM_TOKEN_STRING_CONTENT);
+                                    }
+
+                                    was_line_continuation = true;
                                    token_buffer.cursor = parser->current.end + 1;
                                    breakpoint = parser->current.end;
                                    continue;
@ -10980,7 +11002,7 @@ parser_lex(pm_parser_t *parser) {
                        assert(false && "unreachable");
                }

-                was_escaped_newline = false;
+                was_line_continuation = false;
            }

            if (parser->current.end > parser->current.start) {
--- a/test/prism/ruby_parser_test.rb
+++ b/test/prism/ruby_parser_test.rb
@ -71,6 +71,7 @@ module Prism
    # https://github.com/seattlerb/ruby_parser/issues/344
    failures = crlf | %w[
      alias.txt
+      heredocs_with_ignored_newlines.txt
      method_calls.txt
      methods.txt
      multi_write.txt
@ -94,6 +95,7 @@ module Prism
      whitequark/lvar_injecting_match.txt
      whitequark/not.txt
      whitequark/op_asgn_cmd.txt
+      whitequark/parser_bug_640.txt
      whitequark/parser_slash_slash_n_escaping_in_literals.txt
      whitequark/pattern_matching_single_line_allowed_omission_of_parentheses.txt
      whitequark/pattern_matching_single_line.txt
--- a/test/prism/snapshots/heredocs_with_ignored_newlines.txt
+++ b/test/prism/snapshots/heredocs_with_ignored_newlines.txt
@ -11,7 +11,7 @@
        │   └── unescaped: ""
        └── @ InterpolatedStringNode (location: (4,0)-(4,8))
            ├── opening_loc: (4,0)-(4,8) = "<<~THERE"
-            ├── parts: (length: 8)
+            ├── parts: (length: 9)
            │   ├── @ StringNode (location: (5,0)-(6,0))
            │   │   ├── flags: ∅
            │   │   ├── opening_loc: ∅
@ -42,12 +42,18 @@
            │   │   ├── content_loc: (9,0)-(10,0) = "\n"
            │   │   ├── closing_loc: ∅
            │   │   └── unescaped: "\n"
-            │   ├── @ StringNode (location: (10,0)-(12,0))
+            │   ├── @ StringNode (location: (10,0)-(11,0))
            │   │   ├── flags: ∅
            │   │   ├── opening_loc: ∅
-            │   │   ├── content_loc: (10,0)-(12,0) = "  <<~BUT\\\n    but\n"
+            │   │   ├── content_loc: (10,0)-(11,0) = "  <<~BUT\\\n"
            │   │   ├── closing_loc: ∅
-            │   │   └── unescaped: "<<~BUT    but\n"
+            │   │   └── unescaped: "<<~BUT"
+            │   ├── @ StringNode (location: (11,0)-(12,0))
+            │   │   ├── flags: ∅
+            │   │   ├── opening_loc: ∅
+            │   │   ├── content_loc: (11,0)-(12,0) = "    but\n"
+            │   │   ├── closing_loc: ∅
+            │   │   └── unescaped: "  but\n"
            │   ├── @ StringNode (location: (12,0)-(13,0))
            │   │   ├── flags: ∅
            │   │   ├── opening_loc: ∅
--- a/test/prism/snapshots/whitequark/parser_bug_640.txt
+++ b/test/prism/snapshots/whitequark/parser_bug_640.txt
@ -3,9 +3,19 @@
 └── statements:
    @ StatementsNode (location: (1,0)-(1,6))
    └── body: (length: 1)
-        └── @ StringNode (location: (1,0)-(1,6))
-            ├── flags: ∅
+        └── @ InterpolatedStringNode (location: (1,0)-(1,6))
            ├── opening_loc: (1,0)-(1,6) = "<<~FOO"
-            ├── content_loc: (2,0)-(4,0) = "  baz\\\n  qux\n"
-            ├── closing_loc: (4,0)-(5,0) = "FOO\n"
-            └── unescaped: "baz  qux\n"
+            ├── parts: (length: 2)
+            │   ├── @ StringNode (location: (2,0)-(3,0))
+            │   │   ├── flags: ∅
+            │   │   ├── opening_loc: ∅
+            │   │   ├── content_loc: (2,0)-(3,0) = "  baz\\\n"
+            │   │   ├── closing_loc: ∅
+            │   │   └── unescaped: "baz"
+            │   └── @ StringNode (location: (3,0)-(4,0))
+            │       ├── flags: ∅
+            │       ├── opening_loc: ∅
+            │       ├── content_loc: (3,0)-(4,0) = "  qux\n"
+            │       ├── closing_loc: ∅
+            │       └── unescaped: "qux\n"
+            └── closing_loc: (4,0)-(5,0) = "FOO\n"
--- a/test/prism/snapshots/whitequark/slash_newline_in_heredocs.txt
+++ b/test/prism/snapshots/whitequark/slash_newline_in_heredocs.txt
@ -11,13 +11,19 @@
        │   └── unescaped: "    1     2\n    3\n"
        └── @ InterpolatedStringNode (location: (8,0)-(8,4))
            ├── opening_loc: (8,0)-(8,4) = "<<~E"
-            ├── parts: (length: 2)
-            │   ├── @ StringNode (location: (9,0)-(11,0))
+            ├── parts: (length: 3)
+            │   ├── @ StringNode (location: (9,0)-(10,0))
            │   │   ├── flags: ∅
            │   │   ├── opening_loc: ∅
-            │   │   ├── content_loc: (9,0)-(11,0) = "    1 \\\n    2\n"
+            │   │   ├── content_loc: (9,0)-(10,0) = "    1 \\\n"
            │   │   ├── closing_loc: ∅
-            │   │   └── unescaped: "1     2\n"
+            │   │   └── unescaped: "1 "
+            │   ├── @ StringNode (location: (10,0)-(11,0))
+            │   │   ├── flags: ∅
+            │   │   ├── opening_loc: ∅
+            │   │   ├── content_loc: (10,0)-(11,0) = "    2\n"
+            │   │   ├── closing_loc: ∅
+            │   │   └── unescaped: "2\n"
            │   └── @ StringNode (location: (11,0)-(12,0))
            │       ├── flags: ∅
            │       ├── opening_loc: ∅
--- a/test/prism/unescape_test.rb
+++ b/test/prism/unescape_test.rb
@ -230,6 +230,8 @@ module Prism
      else
        assert_equal expected.bytes, actual.bytes, message
      end
+    rescue Exception
+      binding.irb
    end
  end
 end