[ruby/prism] Be more defensive in the parser translator lexer

Generally I have been good about safely accessing the tokens, but I failed
to properly guard against missing tokens in places
where that could theoretically happen through invalid syntax.

I added a test case for one occurrence; the other changes are theoretical only.

4a3866af19
This commit is contained in:
Earlopain 2025-08-14 17:05:54 +02:00 committed by git
parent a677220aba
commit a04555c8ab
2 changed files with 32 additions and 15 deletions

View file

@ -277,20 +277,20 @@ module Prism
when :tCOMMENT when :tCOMMENT
if token.type == :EMBDOC_BEGIN if token.type == :EMBDOC_BEGIN
while !((next_token = lexed[index][0]) && next_token.type == :EMBDOC_END) && (index < length - 1) while !((next_token = lexed[index]&.first) && next_token.type == :EMBDOC_END) && (index < length - 1)
value += next_token.value value += next_token.value
index += 1 index += 1
end end
value += next_token.value value += next_token.value
location = range(token.location.start_offset, lexed[index][0].location.end_offset) location = range(token.location.start_offset, next_token.location.end_offset)
index += 1 index += 1
else else
is_at_eol = value.chomp!.nil? is_at_eol = value.chomp!.nil?
location = range(token.location.start_offset, token.location.end_offset + (is_at_eol ? 0 : -1)) location = range(token.location.start_offset, token.location.end_offset + (is_at_eol ? 0 : -1))
prev_token = lexed[index - 2][0] if index - 2 >= 0 prev_token, _ = lexed[index - 2] if index - 2 >= 0
next_token = lexed[index][0] next_token, _ = lexed[index]
is_inline_comment = prev_token&.location&.start_line == token.location.start_line is_inline_comment = prev_token&.location&.start_line == token.location.start_line
if is_inline_comment && !is_at_eol && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type) if is_inline_comment && !is_at_eol && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
@ -309,7 +309,7 @@ module Prism
end end
end end
when :tNL when :tNL
next_token = next_token = lexed[index][0] next_token, _ = lexed[index]
# Newlines after comments are emitted out of order. # Newlines after comments are emitted out of order.
if next_token&.type == :COMMENT if next_token&.type == :COMMENT
comment_newline_location = location comment_newline_location = location
@ -346,8 +346,8 @@ module Prism
location = range(token.location.start_offset, token.location.start_offset + percent_array_leading_whitespace(value)) location = range(token.location.start_offset, token.location.start_offset + percent_array_leading_whitespace(value))
value = nil value = nil
when :tSTRING_BEG when :tSTRING_BEG
next_token = lexed[index][0] next_token, _ = lexed[index]
next_next_token = lexed[index + 1][0] next_next_token, _ = lexed[index + 1]
basic_quotes = value == '"' || value == "'" basic_quotes = value == '"' || value == "'"
if basic_quotes && next_token&.type == :STRING_END if basic_quotes && next_token&.type == :STRING_END
@ -415,7 +415,8 @@ module Prism
while token.type == :STRING_CONTENT while token.type == :STRING_CONTENT
current_length += token.value.bytesize current_length += token.value.bytesize
# Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line. # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line prev_token, _ = lexed[index - 2] if index - 2 >= 0
is_first_token_on_line = prev_token && token.location.start_line != prev_token.location.start_line
# The parser gem only removes indentation when the heredoc is not nested # The parser gem only removes indentation when the heredoc is not nested
not_nested = heredoc_stack.size == 1 not_nested = heredoc_stack.size == 1
if is_percent_array if is_percent_array
@ -434,7 +435,7 @@ module Prism
tokens << [:tSTRING_CONTENT, [current_string, range(start_offset, start_offset + current_length)]] tokens << [:tSTRING_CONTENT, [current_string, range(start_offset, start_offset + current_length)]]
break break
end end
token = lexed[index][0] token, _ = lexed[index]
index += 1 index += 1
end end
else else
@ -489,7 +490,7 @@ module Prism
end end
if percent_array?(quote_stack.pop) if percent_array?(quote_stack.pop)
prev_token = lexed[index - 2][0] if index - 2 >= 0 prev_token, _ = lexed[index - 2] if index - 2 >= 0
empty = %i[PERCENT_LOWER_I PERCENT_LOWER_W PERCENT_UPPER_I PERCENT_UPPER_W].include?(prev_token&.type) empty = %i[PERCENT_LOWER_I PERCENT_LOWER_W PERCENT_UPPER_I PERCENT_UPPER_W].include?(prev_token&.type)
ends_with_whitespace = prev_token&.type == :WORDS_SEP ends_with_whitespace = prev_token&.type == :WORDS_SEP
# parser always emits a space token after content in a percent array, even if no actual whitespace is present. # parser always emits a space token after content in a percent array, even if no actual whitespace is present.
@ -498,7 +499,7 @@ module Prism
end end
end end
when :tSYMBEG when :tSYMBEG
if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END if (next_token = lexed[index]&.first) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
next_location = token.location.join(next_token.location) next_location = token.location.join(next_token.location)
type = :tSYMBOL type = :tSYMBOL
value = next_token.value value = next_token.value
@ -513,13 +514,13 @@ module Prism
type = :tIDENTIFIER type = :tIDENTIFIER
end end
when :tXSTRING_BEG when :tXSTRING_BEG
if (next_token = lexed[index][0]) && !%i[STRING_CONTENT STRING_END EMBEXPR_BEGIN].include?(next_token.type) if (next_token = lexed[index]&.first) && !%i[STRING_CONTENT STRING_END EMBEXPR_BEGIN].include?(next_token.type)
# self.`() # self.`()
type = :tBACK_REF2 type = :tBACK_REF2
end end
quote_stack.push(value) quote_stack.push(value)
when :tSYMBOLS_BEG, :tQSYMBOLS_BEG, :tWORDS_BEG, :tQWORDS_BEG when :tSYMBOLS_BEG, :tQSYMBOLS_BEG, :tWORDS_BEG, :tQWORDS_BEG
if (next_token = lexed[index][0]) && next_token.type == :WORDS_SEP if (next_token = lexed[index]&.first) && next_token.type == :WORDS_SEP
index += 1 index += 1
end end
@ -595,9 +596,9 @@ module Prism
previous_line = -1 previous_line = -1
result = Float::MAX result = Float::MAX
while (lexed[next_token_index] && next_token = lexed[next_token_index][0]) while (next_token = lexed[next_token_index]&.first)
next_token_index += 1 next_token_index += 1
next_next_token = lexed[next_token_index] && lexed[next_token_index][0] next_next_token, _ = lexed[next_token_index]
first_token_on_line = next_token.location.start_column == 0 first_token_on_line = next_token.location.start_column == 0
# String content inside nested heredocs and interpolation is ignored # String content inside nested heredocs and interpolation is ignored

View file

@ -163,6 +163,22 @@ module Prism
end end
end end
# Tokenizing source that contains a `when` clause with no condition must
# raise Parser::SyntaxError once every diagnostic is treated as fatal.
def test_invalid_syntax
  translator = Prism::Translation::Parser33.new
  translator.diagnostics.all_errors_are_fatal = true

  # The snippet is deliberately invalid Ruby: `when` has nothing to match on.
  source_buffer = Parser::Source::Buffer.new("(string)")
  source_buffer.source = <<~RUBY
    foo do
    case bar
    when
    end
    end
  RUBY

  assert_raise(Parser::SyntaxError) do
    translator.tokenize(source_buffer)
  end
end
def test_it_block_parameter_syntax def test_it_block_parameter_syntax
it_fixture_path = Pathname(__dir__).join("../../../test/prism/fixtures/it.txt") it_fixture_path = Pathname(__dir__).join("../../../test/prism/fixtures/it.txt")