[ruby/prism] Decode %r like % strings

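`%r` regular expression literals need to be decoded the same way as `%` string literals, including how CR, LF and CRLF are handled when the delimiter is a newline or a carriage return. This commit fixes `%r` decoding so that it behaves like string decoding. 85bfd9c0cd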
2025-08-15 13:39:04 +02:00 · 2024-12-11 15:54:56 -08:00 · 2024-12-11 15:54:56 -08:00 · 9181e8bc87
commit 9181e8bc87
parent 0a1fa99482
2 changed files with 62 additions and 18 deletions
--- a/prism/prism.c
+++ b/prism/prism.c
@ -12110,9 +12110,28 @@ parser_lex(pm_parser_t *parser) {
            pm_regexp_token_buffer_t token_buffer = { 0 };

            while (breakpoint != NULL) {
+                uint8_t term = lex_mode->as.regexp.terminator;
+                bool is_terminator = (*breakpoint == term);
+
+                // If the terminator is newline, we need to consider \r\n _also_ a newline
+                // For example: `%r\nfoo\r\n`
+                // The regexp should be /foo/, not /foo\r/
+                if (*breakpoint == '\r' && peek_at(parser, breakpoint + 1) == '\n') {
+                    if (term == '\n') {
+                        is_terminator = true;
+                    }
+
+                    // If the terminator is a CR, but we see a CRLF, we need to
+                    // treat the CRLF as a newline, meaning this is _not_ the
+                    // terminator
+                    if (term == '\r') {
+                        is_terminator = false;
+                    }
+                }
+
                // If we hit the terminator, we need to determine what kind of
                // token to return.
-                if (*breakpoint == lex_mode->as.regexp.terminator) {
+                if (is_terminator) {
                    if (lex_mode->as.regexp.nesting > 0) {
                        parser->current.end = breakpoint + 1;
                        breakpoint = pm_strpbrk(parser, parser->current.end, breakpoints, parser->end - parser->current.end, false);
@ -12342,20 +12361,21 @@ parser_lex(pm_parser_t *parser) {
                    continue;
                }

-                bool is_terminator = (*breakpoint == lex_mode->as.string.terminator);
+                uint8_t term = lex_mode->as.string.terminator;
+                bool is_terminator = (*breakpoint == term);

                // If the terminator is newline, we need to consider \r\n _also_ a newline
-                // For example: `%\nfoo\r\n`
-                // The string should be "foo", not "foo\r"
+                // For example: `%\nfoo\r\n`
+                // The string should be "foo", not "foo\r"
                if (*breakpoint == '\r' && peek_at(parser, breakpoint + 1) == '\n') {
-                    if (lex_mode->as.string.terminator == '\n') {
+                    if (term == '\n') {
                        is_terminator = true;
                    }

                    // If the terminator is a CR, but we see a CRLF, we need to
                    // treat the CRLF as a newline, meaning this is _not_ the
                    // terminator
-                    if (lex_mode->as.string.terminator == '\r') {
+                    if (term == '\r') {
                        is_terminator = false;
                    }
                }
--- a/test/prism/percent_delimiter_string_test.rb
+++ b/test/prism/percent_delimiter_string_test.rb
@ -3,56 +3,80 @@
 require_relative "test_helper"

 module Prism
-  class PercentDelimiterStringTest < TestCase
+  module PercentDelimiterTests
    def test_newline_terminator_with_lf_crlf
-      str = "%\n123456\r\n"
+      str = l "\n123456\r\n"
      assert_parse "123456", str
    end

    def test_newline_terminator_with_lf_crlf_with_extra_cr
-      str = "%\n123456\r\r\n"
+      str = l "\n123456\r\r\n"
      assert_parse "123456\r", str
    end

    def test_newline_terminator_with_crlf_pair
-      str = "%\r\n123456\r\n"
+      str = l "\r\n123456\r\n"
      assert_parse "123456", str
    end

    def test_newline_terminator_with_crlf_crlf_with_extra_cr
-      str = "%\r\n123456\r\r\n"
+      str = l "\r\n123456\r\r\n"
      assert_parse "123456\r", str
    end

    def test_newline_terminator_with_cr_cr
-      str = "%\r123456\r;\n"
+      str = l "\r123456\r;\n"
      assert_parse "123456", str
    end

    def test_newline_terminator_with_crlf_lf
-      str = "%\r\n123456\n;\n"
+      str = l "\r\n123456\n;\n"
      assert_parse "123456", str
    end

    def test_cr_crlf
-      str = "%\r1\r\n \r"
+      str = l "\r1\r\n \r"
      assert_parse "1\n ", str
    end

    def test_lf_crlf
-      str = "%\n1\r\n \n"
+      str = l "\n1\r\n \n"
      assert_parse "1", str
    end

    def test_lf_lf
-      str = "%\n1\n \n"
+      str = l "\n1\n \n"
      assert_parse "1", str
    end

    def assert_parse(expected, str)
+      assert_equal expected, find_node(str).unescaped
+    end
+  end
+
+  class PercentDelimiterStringTest < TestCase
+    include PercentDelimiterTests
+
+    def find_node(str)
      tree = Prism.parse str
-      node = tree.value.breadth_first_search { |x| Prism::StringNode === x }
-      assert_equal expected, node.unescaped
+      tree.value.breadth_first_search { |x| Prism::StringNode === x }
+    end
+
+    def l(str)
+      "%" + str
+    end
+  end
+
+  class PercentDelimiterRegexpTest < TestCase
+    include PercentDelimiterTests
+
+    def l(str)
+      "%r" + str
+    end
+
+    def find_node(str)
+      tree = Prism.parse str
+      tree.value.breadth_first_search { |x| Prism::RegularExpressionNode === x }
    end
  end
 end