[ruby/prism] Fix binary encoding for the parser translator

Skipping encoding detection is almost always right; only for binary sources should detection actually happen. A symbol containing escapes that are invalid in UTF-8 would fail to parse, since symbols must be valid in the script encoding. Additionally, the parser gem would raise an exception somewhere during string handling. (fa0154d9e4)
2025-08-15 13:39:04 +02:00 · 2025-01-11 22:25:09 +01:00 · 2025-01-11 22:25:09 +01:00 · 723f31cf6b
commit 723f31cf6b
parent 8e56d9e415
8 changed files with 119 additions and 3 deletions
--- a/lib/prism/translation/parser.rb
+++ b/lib/prism/translation/parser.rb
@ -51,7 +51,7 @@ module Prism
        source = source_buffer.source
        offset_cache = build_offset_cache(source)
-        result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache)
+        result = unwrap(Prism.parse(source, **prism_options), offset_cache)
        build_ast(result.value, offset_cache)
      ensure
@ -64,7 +64,7 @@ module Prism
        source = source_buffer.source
        offset_cache = build_offset_cache(source)
-        result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache)
+        result = unwrap(Prism.parse(source, **prism_options), offset_cache)
        [
          build_ast(result.value, offset_cache),
@ -83,7 +83,7 @@ module Prism
        offset_cache = build_offset_cache(source)
        result =
          begin
-            unwrap(Prism.parse_lex(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache)
+            unwrap(Prism.parse_lex(source, **prism_options), offset_cache)
          rescue ::Parser::SyntaxError
            raise if !recover
          end
@ -285,6 +285,20 @@ module Prism
        )
      end
      # Assembles the keyword arguments handed to Prism.parse / Prism.parse_lex
      # for the current source buffer.
      #
      # Returns a Hash containing :filepath, :version and :partial_script, plus
      # encoding: false whenever the buffer's source is not binary-encoded.
      def prism_options
        buffer = @source_buffer
        base = { filepath: buffer.name, version: convert_for_prism(version), partial_script: true }

        # The parser gem always encodes to UTF-8, unless it is binary.
        # https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/source/buffer.rb#L80-L107
        # For a binary source the :encoding key is left out entirely.
        return base if buffer.source.encoding == Encoding::BINARY

        base.merge(encoding: false)
      end
      # Converts the version format handled by Parser to the format handled by Prism.
      def convert_for_prism(version)
        case version
--- a/test/prism/fixtures/encoding_binary.txt
+++ b/test/prism/fixtures/encoding_binary.txt
@ -0,0 +1,9 @@
 # encoding: binary
 "\xcd"
 :"\xcd"
 /#{"\xcd"}/
 %W[\xC0]
--- a/test/prism/fixtures/encoding_euc_jp.txt
+++ b/test/prism/fixtures/encoding_euc_jp.txt
@ -0,0 +1,6 @@
 # encoding: euc-jp
 # \x8E indicates a double-byte character, \x01 is not a valid second byte in euc-jp
 "\x8E\x01"
 %W["\x8E\x01"]
--- a/test/prism/ruby/parser_test.rb
+++ b/test/prism/ruby/parser_test.rb
@ -17,6 +17,18 @@ end
 # First, opt in to every AST feature.
 Parser::Builders::Default.modernize
 # By default the parser gem rejects some strings that would most likely cause
 # encoding-related errors in consumers. RuboCop monkey-patches that check away
 # so such code is accepted; the same override is applied here: string_value
 # simply returns value(token).
 # https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/builders/default.rb#L2289-L2295
 permissive_string_value = Module.new do
   def string_value(token)
     value(token)
   end
 end
 Parser::Builders::Default.prepend(permissive_string_value)
 # Modify the source map == check so that it doesn't check against the node
 # itself so we don't get into a recursive loop.
 Parser::Source::Map.prepend(
--- a/test/prism/ruby/ruby_parser_test.rb
+++ b/test/prism/ruby/ruby_parser_test.rb
@ -26,6 +26,7 @@ Sexp.prepend(
 module Prism
  class RubyParserTest < TestCase
    todos = [
      "encoding_euc_jp.txt",
      "newline_terminated.txt",
      "regex_char_width.txt",
      "seattlerb/bug169.txt",
--- a/test/prism/snapshots/encoding_binary.txt
+++ b/test/prism/snapshots/encoding_binary.txt
@ -0,0 +1,49 @@
@ ProgramNode (location: (3,0)-(9,8))
 ├── flags: ∅
 ├── locals: []
 └── statements:
    @ StatementsNode (location: (3,0)-(9,8))
    ├── flags: ∅
    └── body: (length: 4)
        ├── @ StringNode (location: (3,0)-(3,6))
        │   ├── flags: newline
        │   ├── opening_loc: (3,0)-(3,1) = "\""
        │   ├── content_loc: (3,1)-(3,5) = "\\xcd"
        │   ├── closing_loc: (3,5)-(3,6) = "\""
        │   └── unescaped: "\xCD"
        ├── @ SymbolNode (location: (5,0)-(5,7))
        │   ├── flags: newline, static_literal
        │   ├── opening_loc: (5,0)-(5,2) = ":\""
        │   ├── value_loc: (5,2)-(5,6) = "\\xcd"
        │   ├── closing_loc: (5,6)-(5,7) = "\""
        │   └── unescaped: "\xCD"
        ├── @ InterpolatedRegularExpressionNode (location: (7,0)-(7,11))
        │   ├── flags: newline, static_literal
        │   ├── opening_loc: (7,0)-(7,1) = "/"
        │   ├── parts: (length: 1)
        │   │   └── @ EmbeddedStatementsNode (location: (7,1)-(7,10))
        │   │       ├── flags: ∅
        │   │       ├── opening_loc: (7,1)-(7,3) = "\#{"
        │   │       ├── statements:
        │   │       │   @ StatementsNode (location: (7,3)-(7,9))
        │   │       │   ├── flags: ∅
        │   │       │   └── body: (length: 1)
        │   │       │       └── @ StringNode (location: (7,3)-(7,9))
        │   │       │           ├── flags: static_literal, frozen
        │   │       │           ├── opening_loc: (7,3)-(7,4) = "\""
        │   │       │           ├── content_loc: (7,4)-(7,8) = "\\xcd"
        │   │       │           ├── closing_loc: (7,8)-(7,9) = "\""
        │   │       │           └── unescaped: "\xCD"
        │   │       └── closing_loc: (7,9)-(7,10) = "}"
        │   └── closing_loc: (7,10)-(7,11) = "/"
        └── @ ArrayNode (location: (9,0)-(9,8))
            ├── flags: newline
            ├── elements: (length: 1)
            │   └── @ StringNode (location: (9,3)-(9,7))
            │       ├── flags: ∅
            │       ├── opening_loc: ∅
            │       ├── content_loc: (9,3)-(9,7) = "\\xC0"
            │       ├── closing_loc: ∅
            │       └── unescaped: "\xC0"
            ├── opening_loc: (9,0)-(9,3) = "%W["
            └── closing_loc: (9,7)-(9,8) = "]"
--- a/test/prism/snapshots/encoding_euc_jp.txt
+++ b/test/prism/snapshots/encoding_euc_jp.txt
@ -0,0 +1,24 @@
@ ProgramNode (location: (4,0)-(6,14))
 ├── flags: ∅
 ├── locals: []
 └── statements:
    @ StatementsNode (location: (4,0)-(6,14))
    ├── flags: ∅
    └── body: (length: 2)
        ├── @ StringNode (location: (4,0)-(4,10))
        │   ├── flags: newline
        │   ├── opening_loc: (4,0)-(4,1) = "\""
        │   ├── content_loc: (4,1)-(4,9) = "\\x8E\\x01"
        │   ├── closing_loc: (4,9)-(4,10) = "\""
        │   └── unescaped: "\x8E\x01"
        └── @ ArrayNode (location: (6,0)-(6,14))
            ├── flags: newline
            ├── elements: (length: 1)
            │   └── @ StringNode (location: (6,3)-(6,13))
            │       ├── flags: ∅
            │       ├── opening_loc: ∅
            │       ├── content_loc: (6,3)-(6,13) = "\"\\x8E\\x01\""
            │       ├── closing_loc: ∅
            │       └── unescaped: "\"\x8E\x01\""
            ├── opening_loc: (6,0)-(6,3) = "%W["
            └── closing_loc: (6,13)-(6,14) = "]"
--- a/test/prism/snippets_test.rb
+++ b/test/prism/snippets_test.rb
@ -5,6 +5,7 @@ require_relative "test_helper"
 module Prism
  class SnippetsTest < TestCase
    except = [
      "encoding_binary.txt",
      "newline_terminated.txt",
      "seattlerb/begin_rescue_else_ensure_no_bodies.txt",
      "seattlerb/case_in.txt",