# frozen_string_literal: true

return if RUBY_ENGINE != "ruby"

require_relative "../test_helper"

module Prism
  class EncodingsTest < TestCase
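    # Evaluating source against an instance of this context turns a reference
    # to any undefined constant into that constant's name, which lets us
    # capture the constant that CRuby's parser saw.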
    class ConstantContext < BasicObject
      def self.const_missing(const)
        const
      end
    end

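    # Evaluating source against an instance of this context turns any bare
    # method call into the name of that method, which lets us capture the
    # identifier that CRuby's parser saw.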
    class IdentifierContext < BasicObject
      def method_missing(name, *)
        name
      end
    end

    # These test that we're correctly parsing codepoints for each alias of each
    # encoding that prism supports.
    each_encoding do |encoding, range|
      (encoding.names - %w[external internal filesystem locale]).each do |name|
        define_method(:"test_encoding_#{name}") do
          assert_encoding(encoding, name, range)
        end
      end
    end
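
    # As a rough illustration: for Encoding::UTF_8, whose aliases include
    # "CP65001", the loop above is roughly equivalent to
    #
    #   define_method(:"test_encoding_UTF-8") { assert_encoding(Encoding::UTF_8, "UTF-8", range) }
    #   define_method(:"test_encoding_CP65001") { assert_encoding(Encoding::UTF_8, "CP65001", range) }
    #
    # where range is the codepoint range yielded by each_encoding.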

    private

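    # Assert that prism and CRuby agree on which constant the given character
    # names under the given encoding: CRuby's answer comes from evaluating the
    # source against ConstantContext, prism's from the ConstantReadNode it
    # parses.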
    def assert_encoding_constant(name, character)
      source = "# encoding: #{name}\n#{character}"
      expected = ConstantContext.new.instance_eval(source)

      result = Prism.parse(source)
      assert result.success?

      actual = result.value.statements.body.last
      assert_kind_of ConstantReadNode, actual
      assert_equal expected, actual.name
    end

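    # Same as assert_encoding_constant, but for identifiers: CRuby's answer
    # comes from IdentifierContext#method_missing and prism's from the name of
    # the parsed CallNode.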
    def assert_encoding_identifier(name, character)
      source = "# encoding: #{name}\n#{character}"
      expected = IdentifierContext.new.instance_eval(source)

      result = Prism.parse(source)
      assert result.success?

      actual = result.value.statements.body.last
      assert_kind_of CallNode, actual
      assert_equal expected, actual.name
    end

    # Check that we can properly parse every codepoint in the given encoding.
    def assert_encoding(encoding, name, range)
      # I'm not entirely sure, but I believe these codepoints are parsed
      # incorrectly in CRuby. They all report as matching `[[:lower:]]` but
      # are then parsed as constants. This is because CRuby determines whether
      # an identifier is a constant by case folding it down to lowercase and
      # checking if there is a difference. Even though these codepoints report
      # themselves as lowercase, their case fold is different. I have reported
      # this bug upstream.
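      #
      # For example, U+01C5 ("Dž", a titlecase letter) matches /[[:lower:]]/,
      # yet "\u01C5".downcase is the distinct character U+01C6 ("dž"), so
      # CRuby reads the bare character as a constant while this harness would
      # route it through assert_encoding_identifier.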
      case encoding
      when Encoding::UTF_8, Encoding::UTF_8_MAC, Encoding::UTF8_DoCoMo, Encoding::UTF8_KDDI, Encoding::UTF8_SoftBank, Encoding::CESU_8
        range = range.to_a - [
          0x01c5, 0x01c8, 0x01cb, 0x01f2, 0x1f88, 0x1f89, 0x1f8a, 0x1f8b,
          0x1f8c, 0x1f8d, 0x1f8e, 0x1f8f, 0x1f98, 0x1f99, 0x1f9a, 0x1f9b,
          0x1f9c, 0x1f9d, 0x1f9e, 0x1f9f, 0x1fa8, 0x1fa9, 0x1faa, 0x1fab,
          0x1fac, 0x1fad, 0x1fae, 0x1faf, 0x1fbc, 0x1fcc, 0x1ffc,
        ]
      when Encoding::Windows_1253
        range = range.to_a - [0xb5]
      end

      range.each do |codepoint|
        character = codepoint.chr(encoding)

        if character.match?(/[[:alpha:]]/)
          if character.match?(/[[:upper:]]/)
            assert_encoding_constant(name, character)
          else
            assert_encoding_identifier(name, character)
          end
        elsif character.match?(/[[:alnum:]]/)
          assert_encoding_identifier(name, "_#{character}")
        else
          next if ["/", "{"].include?(character)

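          # Characters that are neither alphabetic nor numeric are embedded in
          # a regexp comment group, which gives the lexer a context where they
          # can legally appear.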
source = "# encoding: #{name}\n/(?##{character})/\n"
|
|
assert Prism.parse_success?(source), "Expected #{source.inspect} to parse successfully."
|
|
end
|
|
rescue RangeError
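        # Integer#chr raises RangeError when the codepoint is not valid in
        # this encoding, in which case the escaped codepoint is expected to
        # fail to parse.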
        source = "# encoding: #{name}\n\\x#{codepoint.to_s(16)}"
        assert Prism.parse_failure?(source)
      end
    end
  end
end