[ruby/prism] Fix binary encoding for the parser translator

Skipping encoding detection is almost always right; only for binary sources should detection actually happen.

A symbol containing escapes that are invalid
in UTF-8 would fail to parse, since symbols must be valid in the script encoding.
Additionally, the parser gem would raise an exception somewhere during string handling.

fa0154d9e4
This commit is contained in:
Earlopain 2025-01-11 22:25:09 +01:00 committed by git
parent 8e56d9e415
commit 723f31cf6b
8 changed files with 119 additions and 3 deletions

View file

@ -51,7 +51,7 @@ module Prism
source = source_buffer.source source = source_buffer.source
offset_cache = build_offset_cache(source) offset_cache = build_offset_cache(source)
result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache) result = unwrap(Prism.parse(source, **prism_options), offset_cache)
build_ast(result.value, offset_cache) build_ast(result.value, offset_cache)
ensure ensure
@ -64,7 +64,7 @@ module Prism
source = source_buffer.source source = source_buffer.source
offset_cache = build_offset_cache(source) offset_cache = build_offset_cache(source)
result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache) result = unwrap(Prism.parse(source, **prism_options), offset_cache)
[ [
build_ast(result.value, offset_cache), build_ast(result.value, offset_cache),
@ -83,7 +83,7 @@ module Prism
offset_cache = build_offset_cache(source) offset_cache = build_offset_cache(source)
result = result =
begin begin
unwrap(Prism.parse_lex(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache) unwrap(Prism.parse_lex(source, **prism_options), offset_cache)
rescue ::Parser::SyntaxError rescue ::Parser::SyntaxError
raise if !recover raise if !recover
end end
@ -285,6 +285,20 @@ module Prism
) )
end end
# Builds the keyword options handed to Prism.parse / Prism.parse_lex for the
# current source buffer.
#
# Always contains :filepath, :version and :partial_script. The :encoding key
# is only present (set to false) when the buffer's source is not binary.
def prism_options
  base = {
    filepath: @source_buffer.name,
    version: convert_for_prism(version),
    partial_script: true,
  }

  # The parser gem always encodes to UTF-8, unless it is binary.
  # https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/source/buffer.rb#L80-L107
  # For a binary source the :encoding key is left out entirely.
  return base if @source_buffer.source.encoding == Encoding::BINARY

  base.merge(encoding: false)
end
# Converts the version format handled by Parser to the format handled by Prism. # Converts the version format handled by Parser to the format handled by Prism.
def convert_for_prism(version) def convert_for_prism(version)
case version case version

View file

@ -0,0 +1,9 @@
# encoding: binary
"\xcd"
:"\xcd"
/#{"\xcd"}/
%W[\xC0]

View file

@ -0,0 +1,6 @@
# encoding: euc-jp
# \x8E indicates a double-byte character, \x01 is not a valid second byte in euc-jp
"\x8E\x01"
%W["\x8E\x01"]

View file

@ -17,6 +17,18 @@ end
# First, opt in to every AST feature. # First, opt in to every AST feature.
Parser::Builders::Default.modernize Parser::Builders::Default.modernize
# The parser gem rejects some strings that would most likely lead to errors
# in consumers due to encoding problems. RuboCop however monkey-patches this
# method out in order to accept such code.
# https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/builders/default.rb#L2289-L2295
Parser::Builders::Default.prepend(
  Module.new do
    # Overrides the builder's string_value so that it simply returns the
    # result of value(token) for the given token.
    def string_value(token)
      value(token)
    end
  end
)
# Modify the source map == check so that it doesn't check against the node # Modify the source map == check so that it doesn't check against the node
# itself so we don't get into a recursive loop. # itself so we don't get into a recursive loop.
Parser::Source::Map.prepend( Parser::Source::Map.prepend(

View file

@ -26,6 +26,7 @@ Sexp.prepend(
module Prism module Prism
class RubyParserTest < TestCase class RubyParserTest < TestCase
todos = [ todos = [
"encoding_euc_jp.txt",
"newline_terminated.txt", "newline_terminated.txt",
"regex_char_width.txt", "regex_char_width.txt",
"seattlerb/bug169.txt", "seattlerb/bug169.txt",

View file

@ -0,0 +1,49 @@
@ ProgramNode (location: (3,0)-(9,8))
├── flags: ∅
├── locals: []
└── statements:
@ StatementsNode (location: (3,0)-(9,8))
├── flags: ∅
└── body: (length: 4)
├── @ StringNode (location: (3,0)-(3,6))
│ ├── flags: newline
│ ├── opening_loc: (3,0)-(3,1) = "\""
│ ├── content_loc: (3,1)-(3,5) = "\\xcd"
│ ├── closing_loc: (3,5)-(3,6) = "\""
│ └── unescaped: "\xCD"
├── @ SymbolNode (location: (5,0)-(5,7))
│ ├── flags: newline, static_literal
│ ├── opening_loc: (5,0)-(5,2) = ":\""
│ ├── value_loc: (5,2)-(5,6) = "\\xcd"
│ ├── closing_loc: (5,6)-(5,7) = "\""
│ └── unescaped: "\xCD"
├── @ InterpolatedRegularExpressionNode (location: (7,0)-(7,11))
│ ├── flags: newline, static_literal
│ ├── opening_loc: (7,0)-(7,1) = "/"
│ ├── parts: (length: 1)
│ │ └── @ EmbeddedStatementsNode (location: (7,1)-(7,10))
│ │ ├── flags: ∅
│ │ ├── opening_loc: (7,1)-(7,3) = "\#{"
│ │ ├── statements:
│ │ │ @ StatementsNode (location: (7,3)-(7,9))
│ │ │ ├── flags: ∅
│ │ │ └── body: (length: 1)
│ │ │ └── @ StringNode (location: (7,3)-(7,9))
│ │ │ ├── flags: static_literal, frozen
│ │ │ ├── opening_loc: (7,3)-(7,4) = "\""
│ │ │ ├── content_loc: (7,4)-(7,8) = "\\xcd"
│ │ │ ├── closing_loc: (7,8)-(7,9) = "\""
│ │ │ └── unescaped: "\xCD"
│ │ └── closing_loc: (7,9)-(7,10) = "}"
│ └── closing_loc: (7,10)-(7,11) = "/"
└── @ ArrayNode (location: (9,0)-(9,8))
├── flags: newline
├── elements: (length: 1)
│ └── @ StringNode (location: (9,3)-(9,7))
│ ├── flags: ∅
│ ├── opening_loc: ∅
│ ├── content_loc: (9,3)-(9,7) = "\\xC0"
│ ├── closing_loc: ∅
│ └── unescaped: "\xC0"
├── opening_loc: (9,0)-(9,3) = "%W["
└── closing_loc: (9,7)-(9,8) = "]"

View file

@ -0,0 +1,24 @@
@ ProgramNode (location: (4,0)-(6,14))
├── flags: ∅
├── locals: []
└── statements:
@ StatementsNode (location: (4,0)-(6,14))
├── flags: ∅
└── body: (length: 2)
├── @ StringNode (location: (4,0)-(4,10))
│ ├── flags: newline
│ ├── opening_loc: (4,0)-(4,1) = "\""
│ ├── content_loc: (4,1)-(4,9) = "\\x8E\\x01"
│ ├── closing_loc: (4,9)-(4,10) = "\""
│ └── unescaped: "\x8E\x01"
└── @ ArrayNode (location: (6,0)-(6,14))
├── flags: newline
├── elements: (length: 1)
│ └── @ StringNode (location: (6,3)-(6,13))
│ ├── flags: ∅
│ ├── opening_loc: ∅
│ ├── content_loc: (6,3)-(6,13) = "\"\\x8E\\x01\""
│ ├── closing_loc: ∅
│ └── unescaped: "\"\x8E\x01\""
├── opening_loc: (6,0)-(6,3) = "%W["
└── closing_loc: (6,13)-(6,14) = "]"

View file

@ -5,6 +5,7 @@ require_relative "test_helper"
module Prism module Prism
class SnippetsTest < TestCase class SnippetsTest < TestCase
except = [ except = [
"encoding_binary.txt",
"newline_terminated.txt", "newline_terminated.txt",
"seattlerb/begin_rescue_else_ensure_no_bodies.txt", "seattlerb/begin_rescue_else_ensure_no_bodies.txt",
"seattlerb/case_in.txt", "seattlerb/case_in.txt",