[ruby/prism] Fix binary encoding for the parser translator

Skipping encoding detection is almost always right; only for binary sources should it actually happen.

A symbol containing escapes that are invalid
in UTF-8 would fail to parse, since symbols must be valid in the script encoding.
Additionally, the parser gem would raise an exception somewhere during string handling.

fa0154d9e4
This commit is contained in:
Earlopain 2025-01-11 22:25:09 +01:00 committed by git
parent 8e56d9e415
commit 723f31cf6b
8 changed files with 119 additions and 3 deletions

View file

@ -51,7 +51,7 @@ module Prism
source = source_buffer.source
offset_cache = build_offset_cache(source)
result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache)
result = unwrap(Prism.parse(source, **prism_options), offset_cache)
build_ast(result.value, offset_cache)
ensure
@ -64,7 +64,7 @@ module Prism
source = source_buffer.source
offset_cache = build_offset_cache(source)
result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache)
result = unwrap(Prism.parse(source, **prism_options), offset_cache)
[
build_ast(result.value, offset_cache),
@ -83,7 +83,7 @@ module Prism
offset_cache = build_offset_cache(source)
result =
begin
unwrap(Prism.parse_lex(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache)
unwrap(Prism.parse_lex(source, **prism_options), offset_cache)
rescue ::Parser::SyntaxError
raise if !recover
end
@ -285,6 +285,20 @@ module Prism
)
end
# Builds the keyword options passed to Prism.parse / Prism.parse_lex for the
# current source buffer.
#
# Returns a Hash that always carries :filepath, :version and :partial_script,
# and additionally `encoding: false` whenever the buffer's source is not in
# the binary encoding.
def prism_options
  base = {
    filepath: @source_buffer.name,
    version: convert_for_prism(version),
    partial_script: true,
  }

  # The parser gem always encodes to UTF-8, unless the source is binary:
  # https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/source/buffer.rb#L80-L107
  # Binary sources therefore get no :encoding override at all, while every
  # other source gets `encoding: false`.
  return base if @source_buffer.source.encoding == Encoding::BINARY

  base.merge(encoding: false)
end
# Converts the version format handled by Parser to the format handled by Prism.
def convert_for_prism(version)
case version

View file

@ -0,0 +1,9 @@
# encoding: binary
"\xcd"
:"\xcd"
/#{"\xcd"}/
%W[\xC0]

View file

@ -0,0 +1,6 @@
# encoding: euc-jp
# \x8E indicates a double-byte character, \x01 is not a valid second byte in euc-jp
"\x8E\x01"
%W["\x8E\x01"]

View file

@ -17,6 +17,18 @@ end
# First, opt in to every AST feature.
Parser::Builders::Default.modernize
# By default the parser gem's builder refuses certain string tokens that
# would most likely cause encoding-related errors in consumers. RuboCop
# monkey-patches that check away so such code is still accepted; do the same
# here by prepending an override that simply returns the token's value.
# https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/builders/default.rb#L2289-L2295
Parser::Builders::Default.prepend(
  Module.new do
    # Returns the token's value as-is.
    def string_value(token)
      value(token)
    end
  end
)
# Modify the source map == check so that it doesn't check against the node
# itself so we don't get into a recursive loop.
Parser::Source::Map.prepend(

View file

@ -26,6 +26,7 @@ Sexp.prepend(
module Prism
class RubyParserTest < TestCase
todos = [
"encoding_euc_jp.txt",
"newline_terminated.txt",
"regex_char_width.txt",
"seattlerb/bug169.txt",

View file

@ -0,0 +1,49 @@
@ ProgramNode (location: (3,0)-(9,8))
├── flags: ∅
├── locals: []
└── statements:
@ StatementsNode (location: (3,0)-(9,8))
├── flags: ∅
└── body: (length: 4)
├── @ StringNode (location: (3,0)-(3,6))
│ ├── flags: newline
│ ├── opening_loc: (3,0)-(3,1) = "\""
│ ├── content_loc: (3,1)-(3,5) = "\\xcd"
│ ├── closing_loc: (3,5)-(3,6) = "\""
│ └── unescaped: "\xCD"
├── @ SymbolNode (location: (5,0)-(5,7))
│ ├── flags: newline, static_literal
│ ├── opening_loc: (5,0)-(5,2) = ":\""
│ ├── value_loc: (5,2)-(5,6) = "\\xcd"
│ ├── closing_loc: (5,6)-(5,7) = "\""
│ └── unescaped: "\xCD"
├── @ InterpolatedRegularExpressionNode (location: (7,0)-(7,11))
│ ├── flags: newline, static_literal
│ ├── opening_loc: (7,0)-(7,1) = "/"
│ ├── parts: (length: 1)
│ │ └── @ EmbeddedStatementsNode (location: (7,1)-(7,10))
│ │ ├── flags: ∅
│ │ ├── opening_loc: (7,1)-(7,3) = "\#{"
│ │ ├── statements:
│ │ │ @ StatementsNode (location: (7,3)-(7,9))
│ │ │ ├── flags: ∅
│ │ │ └── body: (length: 1)
│ │ │ └── @ StringNode (location: (7,3)-(7,9))
│ │ │ ├── flags: static_literal, frozen
│ │ │ ├── opening_loc: (7,3)-(7,4) = "\""
│ │ │ ├── content_loc: (7,4)-(7,8) = "\\xcd"
│ │ │ ├── closing_loc: (7,8)-(7,9) = "\""
│ │ │ └── unescaped: "\xCD"
│ │ └── closing_loc: (7,9)-(7,10) = "}"
│ └── closing_loc: (7,10)-(7,11) = "/"
└── @ ArrayNode (location: (9,0)-(9,8))
├── flags: newline
├── elements: (length: 1)
│ └── @ StringNode (location: (9,3)-(9,7))
│ ├── flags: ∅
│ ├── opening_loc: ∅
│ ├── content_loc: (9,3)-(9,7) = "\\xC0"
│ ├── closing_loc: ∅
│ └── unescaped: "\xC0"
├── opening_loc: (9,0)-(9,3) = "%W["
└── closing_loc: (9,7)-(9,8) = "]"

View file

@ -0,0 +1,24 @@
@ ProgramNode (location: (4,0)-(6,14))
├── flags: ∅
├── locals: []
└── statements:
@ StatementsNode (location: (4,0)-(6,14))
├── flags: ∅
└── body: (length: 2)
├── @ StringNode (location: (4,0)-(4,10))
│ ├── flags: newline
│ ├── opening_loc: (4,0)-(4,1) = "\""
│ ├── content_loc: (4,1)-(4,9) = "\\x8E\\x01"
│ ├── closing_loc: (4,9)-(4,10) = "\""
│ └── unescaped: "\x8E\x01"
└── @ ArrayNode (location: (6,0)-(6,14))
├── flags: newline
├── elements: (length: 1)
│ └── @ StringNode (location: (6,3)-(6,13))
│ ├── flags: ∅
│ ├── opening_loc: ∅
│ ├── content_loc: (6,3)-(6,13) = "\"\\x8E\\x01\""
│ ├── closing_loc: ∅
│ └── unescaped: "\"\x8E\x01\""
├── opening_loc: (6,0)-(6,3) = "%W["
└── closing_loc: (6,13)-(6,14) = "]"

View file

@ -5,6 +5,7 @@ require_relative "test_helper"
module Prism
class SnippetsTest < TestCase
except = [
"encoding_binary.txt",
"newline_terminated.txt",
"seattlerb/begin_rescue_else_ensure_no_bodies.txt",
"seattlerb/case_in.txt",