[ruby/prism] Handle CRLF inside heredoc contents

1fbac72485
2025-09-16 17:14:01 +02:00 · 2024-03-25 08:32:58 -04:00 · 2024-03-25 08:32:58 -04:00 · 14ab698967
commit 14ab698967
parent a31ca3500d
12 changed files with 32 additions and 30 deletions
--- a/prism/prism.c
+++ b/prism/prism.c
@ -11267,11 +11267,11 @@ parser_lex(pm_parser_t *parser) {
            // Otherwise we'll be parsing string content. These are the places
            // where we need to split up the content of the heredoc. We'll use
            // strpbrk to find the first of these characters.
-            uint8_t breakpoints[] = "\n\\#";
+            uint8_t breakpoints[] = "\r\n\\#";

            pm_heredoc_quote_t quote = lex_mode->as.heredoc.quote;
            if (quote == PM_HEREDOC_QUOTE_SINGLE) {
-                breakpoints[2] = '\0';
+                breakpoints[3] = '\0';
            }

            const uint8_t *breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
@ -11285,6 +11285,21 @@ parser_lex(pm_parser_t *parser) {
                        parser->current.end = breakpoint + 1;
                        breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
                        break;
+                    case '\r':
+                        parser->current.end = breakpoint + 1;
+
+                        if (peek_at(parser, breakpoint + 1) != '\n') {
+                            breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, true);
+                            break;
+                        }
+
+                        // If we hit a \r\n sequence, then we want to replace it
+                        // with a single \n character in the final string.
+                        pm_token_buffer_escape(parser, &token_buffer);
+                        breakpoint++;
+                        token_buffer.cursor = breakpoint;
+
+                        /* fallthrough */
                    case '\n': {
                        if (parser->heredoc_end != NULL && (parser->heredoc_end > breakpoint)) {
                            parser_flush_heredoc_end(parser);
--- a/test/prism/ruby_parser_test.rb
+++ b/test/prism/ruby_parser_test.rb
@ -52,25 +52,10 @@ module Prism
      whitequark/string_concat.txt
    ]

-    # These files contain CRLF line endings, which ruby_parser translates into
-    # LF before it gets back to the node. This means the node actually has the
-    # wrong contents.
-    crlf = %w[
-      dos_endings.txt
-      heredoc_with_comment.txt
-      seattlerb/heredoc__backslash_dos_format.txt
-      seattlerb/heredoc_with_carriage_return_escapes_windows.txt
-      seattlerb/heredoc_with_extra_carriage_horrible_mix.txt
-      seattlerb/heredoc_with_extra_carriage_returns_windows.txt
-      seattlerb/heredoc_with_extra_carriage_returns.txt
-      seattlerb/heredoc_with_interpolation_and_carriage_return_escapes_windows.txt
-      seattlerb/heredoc_with_only_carriage_returns_windows.txt
-      seattlerb/heredoc_with_only_carriage_returns.txt
-    ]
-
    # https://github.com/seattlerb/ruby_parser/issues/344
-    failures = crlf | %w[
+    failures = %w[
      alias.txt
+      dos_endings.txt
      heredocs_with_ignored_newlines.txt
      method_calls.txt
      methods.txt
@ -79,8 +64,10 @@ module Prism
      patterns.txt
      regex.txt
      seattlerb/and_multi.txt
+      seattlerb/heredoc__backslash_dos_format.txt
      seattlerb/heredoc_bad_hex_escape.txt
      seattlerb/heredoc_bad_oct_escape.txt
+      seattlerb/heredoc_with_extra_carriage_horrible_mix.txt
      spanning_heredoc_newlines.txt
      spanning_heredoc.txt
      tilde_heredocs.txt
--- a/test/prism/snapshots/dos_endings.txt
+++ b/test/prism/snapshots/dos_endings.txt
@ -48,7 +48,7 @@
        │   ├── opening_loc: (7,0)-(7,4) = "<<-E"
        │   ├── content_loc: (8,0)-(11,0) = "    1 \\\r\n    2\r\n    3\r\n"
        │   ├── closing_loc: (11,0)-(12,0) = "E\r\n"
-        │   └── unescaped: "    1     2\r\n    3\r\n"
+        │   └── unescaped: "    1     2\n    3\n"
        ├── @ LocalVariableWriteNode (location: (13,0)-(15,0))
        │   ├── name: :x
        │   ├── depth: 0
@ -94,7 +94,7 @@
            │   │           │   │       ├── opening_loc: ∅
            │   │           │   │       ├── content_loc: (19,0)-(20,0) = "    baz\r\n"
            │   │           │   │       ├── closing_loc: ∅
-            │   │           │   │       └── unescaped: "baz\r\n"
+            │   │           │   │       └── unescaped: "baz\n"
            │   │           │   └── closing_loc: (20,0)-(21,0) = "  EOF\r\n"
            │   │           ├── call_operator_loc: (17,14)-(17,15) = "."
            │   │           ├── name: :chop
--- a/test/prism/snapshots/heredoc_with_comment.txt
+++ b/test/prism/snapshots/heredoc_with_comment.txt
@ -11,7 +11,7 @@
            │   ├── opening_loc: (1,0)-(1,9) = "<<-TARGET"
            │   ├── content_loc: (2,0)-(3,0) = "  content makes for an obvious error\r\n"
            │   ├── closing_loc: (3,0)-(3,6) = "TARGET"
-            │   └── unescaped: "  content makes for an obvious error\r\n"
+            │   └── unescaped: "  content makes for an obvious error\n"
            ├── call_operator_loc: (1,9)-(1,10) = "."
            ├── name: :chomp
            ├── message_loc: (1,10)-(1,15) = "chomp"
--- a/test/prism/snapshots/seattlerb/heredoc__backslash_dos_format.txt
+++ b/test/prism/snapshots/seattlerb/heredoc__backslash_dos_format.txt
@ -13,5 +13,5 @@
            │   ├── opening_loc: (1,6)-(1,12) = "<<-XXX"
            │   ├── content_loc: (2,0)-(4,0) = "before\\\r\nafter\r\n"
            │   ├── closing_loc: (4,0)-(5,0) = "XXX\r\n"
-            │   └── unescaped: "beforeafter\r\n"
+            │   └── unescaped: "beforeafter\n"
            └── operator_loc: (1,4)-(1,5) = "="
--- a/test/prism/snapshots/seattlerb/heredoc_with_carriage_return_escapes_windows.txt
+++ b/test/prism/snapshots/seattlerb/heredoc_with_carriage_return_escapes_windows.txt
@ -8,4 +8,4 @@
            ├── opening_loc: (1,0)-(1,5) = "<<EOS"
            ├── content_loc: (2,0)-(4,0) = "foo\\rbar\r\nbaz\\r\r\n"
            ├── closing_loc: (4,0)-(5,0) = "EOS\r\n"
-            └── unescaped: "foo\rbar\r\nbaz\r\r\n"
+            └── unescaped: "foo\rbar\nbaz\r\n"
--- a/test/prism/snapshots/seattlerb/heredoc_with_extra_carriage_horrible_mix.txt
+++ b/test/prism/snapshots/seattlerb/heredoc_with_extra_carriage_horrible_mix.txt
@ -8,4 +8,4 @@
            ├── opening_loc: (1,0)-(1,7) = "<<'eot'"
            ├── content_loc: (2,0)-(3,0) = "body\r\n"
            ├── closing_loc: (3,0)-(4,0) = "eot\n"
-            └── unescaped: "body\r\n"
+            └── unescaped: "body\n"
--- a/test/prism/snapshots/seattlerb/heredoc_with_extra_carriage_returns.txt
+++ b/test/prism/snapshots/seattlerb/heredoc_with_extra_carriage_returns.txt
@ -8,4 +8,4 @@
            ├── opening_loc: (1,0)-(1,5) = "<<EOS"
            ├── content_loc: (2,0)-(4,0) = "foo\rbar\r\nbaz\n"
            ├── closing_loc: (4,0)-(5,0) = "EOS\n"
-            └── unescaped: "foo\rbar\r\nbaz\n"
+            └── unescaped: "foo\rbar\nbaz\n"
--- a/test/prism/snapshots/seattlerb/heredoc_with_extra_carriage_returns_windows.txt
+++ b/test/prism/snapshots/seattlerb/heredoc_with_extra_carriage_returns_windows.txt
@ -8,4 +8,4 @@
            ├── opening_loc: (1,0)-(1,5) = "<<EOS"
            ├── content_loc: (2,0)-(4,0) = "foo\rbar\r\r\nbaz\r\n"
            ├── closing_loc: (4,0)-(5,0) = "EOS\r\n"
-            └── unescaped: "foo\rbar\r\r\nbaz\r\n"
+            └── unescaped: "foo\rbar\r\nbaz\n"
--- a/test/prism/snapshots/seattlerb/heredoc_with_interpolation_and_carriage_return_escapes_windows.txt
+++ b/test/prism/snapshots/seattlerb/heredoc_with_interpolation_and_carriage_return_escapes_windows.txt
@ -22,5 +22,5 @@
            │       ├── opening_loc: ∅
            │       ├── content_loc: (2,10)-(3,0) = "\r\n"
            │       ├── closing_loc: ∅
-            │       └── unescaped: "\r\n"
+            │       └── unescaped: "\n"
            └── closing_loc: (3,0)-(4,0) = "EOS\r\n"
--- a/test/prism/snapshots/seattlerb/heredoc_with_only_carriage_returns.txt
+++ b/test/prism/snapshots/seattlerb/heredoc_with_only_carriage_returns.txt
@ -8,4 +8,4 @@
            ├── opening_loc: (1,0)-(1,5) = "<<EOS"
            ├── content_loc: (2,0)-(5,0) = "\r\n\r\r\n\\r\n"
            ├── closing_loc: (5,0)-(6,0) = "EOS\n"
-            └── unescaped: "\r\n\r\r\n\r\n"
+            └── unescaped: "\n\r\n\r\n"
--- a/test/prism/snapshots/seattlerb/heredoc_with_only_carriage_returns_windows.txt
+++ b/test/prism/snapshots/seattlerb/heredoc_with_only_carriage_returns_windows.txt
@ -8,4 +8,4 @@
            ├── opening_loc: (1,0)-(1,5) = "<<EOS"
            ├── content_loc: (2,0)-(5,0) = "\r\r\n\r\r\r\n\\r\r\n"
            ├── closing_loc: (5,0)-(6,0) = "EOS\r\n"
-            └── unescaped: "\r\r\n\r\r\r\n\r\r\n"
+            └── unescaped: "\r\n\r\r\n\r\n"