mirror of
https://github.com/ruby/ruby.git
synced 2025-08-15 13:39:04 +02:00
[ruby/prism] Fix binary encoding for the parser translator
Skipping encoding detection is almost always right; only for binary sources should it actually happen.
A symbol containing escapes that are invalid
in UTF-8 would fail to parse, since symbols must be valid in the script encoding.
Additionally, the parser gem would raise an exception somewhere during string handling.
fa0154d9e4
This commit is contained in:
parent
8e56d9e415
commit
723f31cf6b
8 changed files with 119 additions and 3 deletions
|
@ -51,7 +51,7 @@ module Prism
|
|||
source = source_buffer.source
|
||||
|
||||
offset_cache = build_offset_cache(source)
|
||||
result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache)
|
||||
result = unwrap(Prism.parse(source, **prism_options), offset_cache)
|
||||
|
||||
build_ast(result.value, offset_cache)
|
||||
ensure
|
||||
|
@ -64,7 +64,7 @@ module Prism
|
|||
source = source_buffer.source
|
||||
|
||||
offset_cache = build_offset_cache(source)
|
||||
result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache)
|
||||
result = unwrap(Prism.parse(source, **prism_options), offset_cache)
|
||||
|
||||
[
|
||||
build_ast(result.value, offset_cache),
|
||||
|
@ -83,7 +83,7 @@ module Prism
|
|||
offset_cache = build_offset_cache(source)
|
||||
result =
|
||||
begin
|
||||
unwrap(Prism.parse_lex(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache)
|
||||
unwrap(Prism.parse_lex(source, **prism_options), offset_cache)
|
||||
rescue ::Parser::SyntaxError
|
||||
raise if !recover
|
||||
end
|
||||
|
@ -285,6 +285,20 @@ module Prism
|
|||
)
|
||||
end
|
||||
|
||||
# Builds the keyword options handed to Prism when parsing or lexing the
# current source buffer.
def prism_options
  base = {
    filepath: @source_buffer.name,
    version: convert_for_prism(version),
    partial_script: true
  }

  # The parser gem always encodes to UTF-8, unless it is binary, so
  # `encoding: false` is passed for everything except binary sources.
  # https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/source/buffer.rb#L80-L107
  return base if @source_buffer.source.encoding == Encoding::BINARY

  base.merge(encoding: false)
end
|
||||
|
||||
# Converts the version format handled by Parser to the format handled by Prism.
|
||||
def convert_for_prism(version)
|
||||
case version
|
||||
|
|
9
test/prism/fixtures/encoding_binary.txt
Normal file
9
test/prism/fixtures/encoding_binary.txt
Normal file
|
@ -0,0 +1,9 @@
|
|||
# encoding: binary
|
||||
|
||||
"\xcd"
|
||||
|
||||
:"\xcd"
|
||||
|
||||
/#{"\xcd"}/
|
||||
|
||||
%W[\xC0]
|
6
test/prism/fixtures/encoding_euc_jp.txt
Normal file
6
test/prism/fixtures/encoding_euc_jp.txt
Normal file
|
@ -0,0 +1,6 @@
|
|||
# encoding: euc-jp
|
||||
|
||||
# \x8E indicates a double-byte character, \x01 is not a valid second byte in euc-jp
|
||||
"\x8E\x01"
|
||||
|
||||
%W["\x8E\x01"]
|
|
@ -17,6 +17,18 @@ end
|
|||
# First, opt in to every AST feature.
|
||||
Parser::Builders::Default.modernize
|
||||
|
||||
# By default the parser gem rejects some strings that would most likely cause
# encoding-related errors in consumers. RuboCop monkey-patches that check out
# so such code is accepted; do the same here by returning the raw token value.
# https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/builders/default.rb#L2289-L2295
Parser::Builders::Default.prepend(
  Module.new do
    # Returns the token's value without the builder's own string checks.
    def string_value(token)
      value(token)
    end
  end
)
|
||||
|
||||
# Modify the source map == check so that it doesn't check against the node
|
||||
# itself so we don't get into a recursive loop.
|
||||
Parser::Source::Map.prepend(
|
||||
|
|
|
@ -26,6 +26,7 @@ Sexp.prepend(
|
|||
module Prism
|
||||
class RubyParserTest < TestCase
|
||||
todos = [
|
||||
"encoding_euc_jp.txt",
|
||||
"newline_terminated.txt",
|
||||
"regex_char_width.txt",
|
||||
"seattlerb/bug169.txt",
|
||||
|
|
49
test/prism/snapshots/encoding_binary.txt
Normal file
49
test/prism/snapshots/encoding_binary.txt
Normal file
|
@ -0,0 +1,49 @@
|
|||
@ ProgramNode (location: (3,0)-(9,8))
|
||||
├── flags: ∅
|
||||
├── locals: []
|
||||
└── statements:
|
||||
@ StatementsNode (location: (3,0)-(9,8))
|
||||
├── flags: ∅
|
||||
└── body: (length: 4)
|
||||
├── @ StringNode (location: (3,0)-(3,6))
|
||||
│ ├── flags: newline
|
||||
│ ├── opening_loc: (3,0)-(3,1) = "\""
|
||||
│ ├── content_loc: (3,1)-(3,5) = "\\xcd"
|
||||
│ ├── closing_loc: (3,5)-(3,6) = "\""
|
||||
│ └── unescaped: "\xCD"
|
||||
├── @ SymbolNode (location: (5,0)-(5,7))
|
||||
│ ├── flags: newline, static_literal
|
||||
│ ├── opening_loc: (5,0)-(5,2) = ":\""
|
||||
│ ├── value_loc: (5,2)-(5,6) = "\\xcd"
|
||||
│ ├── closing_loc: (5,6)-(5,7) = "\""
|
||||
│ └── unescaped: "\xCD"
|
||||
├── @ InterpolatedRegularExpressionNode (location: (7,0)-(7,11))
|
||||
│ ├── flags: newline, static_literal
|
||||
│ ├── opening_loc: (7,0)-(7,1) = "/"
|
||||
│ ├── parts: (length: 1)
|
||||
│ │ └── @ EmbeddedStatementsNode (location: (7,1)-(7,10))
|
||||
│ │ ├── flags: ∅
|
||||
│ │ ├── opening_loc: (7,1)-(7,3) = "\#{"
|
||||
│ │ ├── statements:
|
||||
│ │ │ @ StatementsNode (location: (7,3)-(7,9))
|
||||
│ │ │ ├── flags: ∅
|
||||
│ │ │ └── body: (length: 1)
|
||||
│ │ │ └── @ StringNode (location: (7,3)-(7,9))
|
||||
│ │ │ ├── flags: static_literal, frozen
|
||||
│ │ │ ├── opening_loc: (7,3)-(7,4) = "\""
|
||||
│ │ │ ├── content_loc: (7,4)-(7,8) = "\\xcd"
|
||||
│ │ │ ├── closing_loc: (7,8)-(7,9) = "\""
|
||||
│ │ │ └── unescaped: "\xCD"
|
||||
│ │ └── closing_loc: (7,9)-(7,10) = "}"
|
||||
│ └── closing_loc: (7,10)-(7,11) = "/"
|
||||
└── @ ArrayNode (location: (9,0)-(9,8))
|
||||
├── flags: newline
|
||||
├── elements: (length: 1)
|
||||
│ └── @ StringNode (location: (9,3)-(9,7))
|
||||
│ ├── flags: ∅
|
||||
│ ├── opening_loc: ∅
|
||||
│ ├── content_loc: (9,3)-(9,7) = "\\xC0"
|
||||
│ ├── closing_loc: ∅
|
||||
│ └── unescaped: "\xC0"
|
||||
├── opening_loc: (9,0)-(9,3) = "%W["
|
||||
└── closing_loc: (9,7)-(9,8) = "]"
|
24
test/prism/snapshots/encoding_euc_jp.txt
Normal file
24
test/prism/snapshots/encoding_euc_jp.txt
Normal file
|
@ -0,0 +1,24 @@
|
|||
@ ProgramNode (location: (4,0)-(6,14))
|
||||
├── flags: ∅
|
||||
├── locals: []
|
||||
└── statements:
|
||||
@ StatementsNode (location: (4,0)-(6,14))
|
||||
├── flags: ∅
|
||||
└── body: (length: 2)
|
||||
├── @ StringNode (location: (4,0)-(4,10))
|
||||
│ ├── flags: newline
|
||||
│ ├── opening_loc: (4,0)-(4,1) = "\""
|
||||
│ ├── content_loc: (4,1)-(4,9) = "\\x8E\\x01"
|
||||
│ ├── closing_loc: (4,9)-(4,10) = "\""
|
||||
│ └── unescaped: "\x8E\x01"
|
||||
└── @ ArrayNode (location: (6,0)-(6,14))
|
||||
├── flags: newline
|
||||
├── elements: (length: 1)
|
||||
│ └── @ StringNode (location: (6,3)-(6,13))
|
||||
│ ├── flags: ∅
|
||||
│ ├── opening_loc: ∅
|
||||
│ ├── content_loc: (6,3)-(6,13) = "\"\\x8E\\x01\""
|
||||
│ ├── closing_loc: ∅
|
||||
│ └── unescaped: "\"\x8E\x01\""
|
||||
├── opening_loc: (6,0)-(6,3) = "%W["
|
||||
└── closing_loc: (6,13)-(6,14) = "]"
|
|
@ -5,6 +5,7 @@ require_relative "test_helper"
|
|||
module Prism
|
||||
class SnippetsTest < TestCase
|
||||
except = [
|
||||
"encoding_binary.txt",
|
||||
"newline_terminated.txt",
|
||||
"seattlerb/begin_rescue_else_ensure_no_bodies.txt",
|
||||
"seattlerb/case_in.txt",
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue