mirror of
https://github.com/ruby/ruby.git
synced 2025-08-27 06:56:13 +02:00
parent
7b5bb978fb
commit
ddaa073058
4 changed files with 114 additions and 81 deletions
|
@ -167,6 +167,8 @@ extern pm_encoding_t pm_encoding_cp855;
|
||||||
extern pm_encoding_t pm_encoding_cp949;
|
extern pm_encoding_t pm_encoding_cp949;
|
||||||
extern pm_encoding_t pm_encoding_cp950;
|
extern pm_encoding_t pm_encoding_cp950;
|
||||||
extern pm_encoding_t pm_encoding_euc_jp;
|
extern pm_encoding_t pm_encoding_euc_jp;
|
||||||
|
extern pm_encoding_t pm_encoding_euc_jp_ms;
|
||||||
|
extern pm_encoding_t pm_encoding_euc_jis_2004;
|
||||||
extern pm_encoding_t pm_encoding_gb1988;
|
extern pm_encoding_t pm_encoding_gb1988;
|
||||||
extern pm_encoding_t pm_encoding_gbk;
|
extern pm_encoding_t pm_encoding_gbk;
|
||||||
extern pm_encoding_t pm_encoding_ibm437;
|
extern pm_encoding_t pm_encoding_ibm437;
|
||||||
|
|
|
@ -11,8 +11,8 @@ pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) {
|
||||||
if (
|
if (
|
||||||
(n > 1) &&
|
(n > 1) &&
|
||||||
(
|
(
|
||||||
((b[0] == 0x8E) && (b[1] >= 0xA1 && b[1] <= 0xFE)) ||
|
((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) &&
|
||||||
((b[0] >= 0xA1 && b[0] <= 0xFE) && (b[1] >= 0xA1 && b[1] <= 0xFE))
|
(b[1] >= 0xA1 && b[1] <= 0xFE)
|
||||||
)
|
)
|
||||||
) {
|
) {
|
||||||
return 2;
|
return 2;
|
||||||
|
@ -60,7 +60,27 @@ pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) {
|
||||||
|
|
||||||
/** EUC-JP encoding */
|
/** EUC-JP encoding */
|
||||||
pm_encoding_t pm_encoding_euc_jp = {
|
pm_encoding_t pm_encoding_euc_jp = {
|
||||||
.name = "euc-jp",
|
.name = "EUC-JP",
|
||||||
|
.char_width = pm_encoding_euc_jp_char_width,
|
||||||
|
.alnum_char = pm_encoding_euc_jp_alnum_char,
|
||||||
|
.alpha_char = pm_encoding_euc_jp_alpha_char,
|
||||||
|
.isupper_char = pm_encoding_euc_jp_isupper_char,
|
||||||
|
.multibyte = true
|
||||||
|
};
|
||||||
|
|
||||||
|
/** eucJP-ms encoding */
|
||||||
|
pm_encoding_t pm_encoding_euc_jp_ms = {
|
||||||
|
.name = "eucJP-ms",
|
||||||
|
.char_width = pm_encoding_euc_jp_char_width,
|
||||||
|
.alnum_char = pm_encoding_euc_jp_alnum_char,
|
||||||
|
.alpha_char = pm_encoding_euc_jp_alpha_char,
|
||||||
|
.isupper_char = pm_encoding_euc_jp_isupper_char,
|
||||||
|
.multibyte = true
|
||||||
|
};
|
||||||
|
|
||||||
|
/** EUC-JIS-2004 encoding */
|
||||||
|
pm_encoding_t pm_encoding_euc_jis_2004 = {
|
||||||
|
.name = "EUC-JIS-2004",
|
||||||
.char_width = pm_encoding_euc_jp_char_width,
|
.char_width = pm_encoding_euc_jp_char_width,
|
||||||
.alnum_char = pm_encoding_euc_jp_alnum_char,
|
.alnum_char = pm_encoding_euc_jp_alnum_char,
|
||||||
.alpha_char = pm_encoding_euc_jp_alpha_char,
|
.alpha_char = pm_encoding_euc_jp_alpha_char,
|
||||||
|
|
|
@ -6248,6 +6248,8 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
|
||||||
break;
|
break;
|
||||||
case 'E': case 'e':
|
case 'E': case 'e':
|
||||||
ENCODING2("EUC-JP", "eucJP", pm_encoding_euc_jp);
|
ENCODING2("EUC-JP", "eucJP", pm_encoding_euc_jp);
|
||||||
|
ENCODING2("eucJP-ms", "euc-jp-ms", pm_encoding_euc_jp_ms);
|
||||||
|
ENCODING2("EUC-JIS-2004", "EUC-JISX0213", pm_encoding_euc_jis_2004);
|
||||||
ENCODING1("external", pm_encoding_utf_8);
|
ENCODING1("external", pm_encoding_utf_8);
|
||||||
break;
|
break;
|
||||||
case 'F': case 'f':
|
case 'F': case 'f':
|
||||||
|
|
|
@ -6,92 +6,101 @@ require_relative "test_helper"
|
||||||
|
|
||||||
module Prism
|
module Prism
|
||||||
class EncodingTest < TestCase
|
class EncodingTest < TestCase
|
||||||
|
codepoints_1byte = 0x00...0x100
|
||||||
|
codepoints_2bytes = 0x00...0x10000
|
||||||
|
|
||||||
encodings = {
|
encodings = {
|
||||||
Encoding::ASCII => 0x00...0x100,
|
Encoding::ASCII => codepoints_1byte,
|
||||||
Encoding::ASCII_8BIT => 0x00...0x100,
|
Encoding::ASCII_8BIT => codepoints_1byte,
|
||||||
Encoding::CP850 => 0x00...0x100,
|
Encoding::CP850 => codepoints_1byte,
|
||||||
Encoding::CP852 => 0x00...0x100,
|
Encoding::CP852 => codepoints_1byte,
|
||||||
Encoding::CP855 => 0x00...0x100,
|
Encoding::CP855 => codepoints_1byte,
|
||||||
Encoding::GB1988 => 0x00...0x100,
|
Encoding::GB1988 => codepoints_1byte,
|
||||||
Encoding::IBM437 => 0x00...0x100,
|
Encoding::IBM437 => codepoints_1byte,
|
||||||
Encoding::IBM720 => 0x00...0x100,
|
Encoding::IBM720 => codepoints_1byte,
|
||||||
Encoding::IBM737 => 0x00...0x100,
|
Encoding::IBM737 => codepoints_1byte,
|
||||||
Encoding::IBM775 => 0x00...0x100,
|
Encoding::IBM775 => codepoints_1byte,
|
||||||
Encoding::IBM852 => 0x00...0x100,
|
Encoding::IBM852 => codepoints_1byte,
|
||||||
Encoding::IBM855 => 0x00...0x100,
|
Encoding::IBM855 => codepoints_1byte,
|
||||||
Encoding::IBM857 => 0x00...0x100,
|
Encoding::IBM857 => codepoints_1byte,
|
||||||
Encoding::IBM860 => 0x00...0x100,
|
Encoding::IBM860 => codepoints_1byte,
|
||||||
Encoding::IBM861 => 0x00...0x100,
|
Encoding::IBM861 => codepoints_1byte,
|
||||||
Encoding::IBM862 => 0x00...0x100,
|
Encoding::IBM862 => codepoints_1byte,
|
||||||
Encoding::IBM863 => 0x00...0x100,
|
Encoding::IBM863 => codepoints_1byte,
|
||||||
Encoding::IBM864 => 0x00...0x100,
|
Encoding::IBM864 => codepoints_1byte,
|
||||||
Encoding::IBM865 => 0x00...0x100,
|
Encoding::IBM865 => codepoints_1byte,
|
||||||
Encoding::IBM866 => 0x00...0x100,
|
Encoding::IBM866 => codepoints_1byte,
|
||||||
Encoding::IBM869 => 0x00...0x100,
|
Encoding::IBM869 => codepoints_1byte,
|
||||||
Encoding::ISO_8859_1 => 0x00...0x100,
|
Encoding::ISO_8859_1 => codepoints_1byte,
|
||||||
Encoding::ISO_8859_2 => 0x00...0x100,
|
Encoding::ISO_8859_2 => codepoints_1byte,
|
||||||
Encoding::ISO_8859_3 => 0x00...0x100,
|
Encoding::ISO_8859_3 => codepoints_1byte,
|
||||||
Encoding::ISO_8859_4 => 0x00...0x100,
|
Encoding::ISO_8859_4 => codepoints_1byte,
|
||||||
Encoding::ISO_8859_5 => 0x00...0x100,
|
Encoding::ISO_8859_5 => codepoints_1byte,
|
||||||
Encoding::ISO_8859_6 => 0x00...0x100,
|
Encoding::ISO_8859_6 => codepoints_1byte,
|
||||||
Encoding::ISO_8859_7 => 0x00...0x100,
|
Encoding::ISO_8859_7 => codepoints_1byte,
|
||||||
Encoding::ISO_8859_8 => 0x00...0x100,
|
Encoding::ISO_8859_8 => codepoints_1byte,
|
||||||
Encoding::ISO_8859_9 => 0x00...0x100,
|
Encoding::ISO_8859_9 => codepoints_1byte,
|
||||||
Encoding::ISO_8859_10 => 0x00...0x100,
|
Encoding::ISO_8859_10 => codepoints_1byte,
|
||||||
Encoding::ISO_8859_11 => 0x00...0x100,
|
Encoding::ISO_8859_11 => codepoints_1byte,
|
||||||
Encoding::ISO_8859_13 => 0x00...0x100,
|
Encoding::ISO_8859_13 => codepoints_1byte,
|
||||||
Encoding::ISO_8859_14 => 0x00...0x100,
|
Encoding::ISO_8859_14 => codepoints_1byte,
|
||||||
Encoding::ISO_8859_15 => 0x00...0x100,
|
Encoding::ISO_8859_15 => codepoints_1byte,
|
||||||
Encoding::ISO_8859_16 => 0x00...0x100,
|
Encoding::ISO_8859_16 => codepoints_1byte,
|
||||||
Encoding::KOI8_R => 0x00...0x100,
|
Encoding::KOI8_R => codepoints_1byte,
|
||||||
Encoding::KOI8_U => 0x00...0x100,
|
Encoding::KOI8_U => codepoints_1byte,
|
||||||
Encoding::MACCENTEURO => 0x00...0x100,
|
Encoding::MACCENTEURO => codepoints_1byte,
|
||||||
Encoding::MACCROATIAN => 0x00...0x100,
|
Encoding::MACCROATIAN => codepoints_1byte,
|
||||||
Encoding::MACCYRILLIC => 0x00...0x100,
|
Encoding::MACCYRILLIC => codepoints_1byte,
|
||||||
Encoding::MACGREEK => 0x00...0x100,
|
Encoding::MACGREEK => codepoints_1byte,
|
||||||
Encoding::MACICELAND => 0x00...0x100,
|
Encoding::MACICELAND => codepoints_1byte,
|
||||||
Encoding::MACROMAN => 0x00...0x100,
|
Encoding::MACROMAN => codepoints_1byte,
|
||||||
Encoding::MACROMANIA => 0x00...0x100,
|
Encoding::MACROMANIA => codepoints_1byte,
|
||||||
Encoding::MACTHAI => 0x00...0x100,
|
Encoding::MACTHAI => codepoints_1byte,
|
||||||
Encoding::MACTURKISH => 0x00...0x100,
|
Encoding::MACTURKISH => codepoints_1byte,
|
||||||
Encoding::MACUKRAINE => 0x00...0x100,
|
Encoding::MACUKRAINE => codepoints_1byte,
|
||||||
Encoding::TIS_620 => 0x00...0x100,
|
Encoding::TIS_620 => codepoints_1byte,
|
||||||
Encoding::Windows_1250 => 0x00...0x100,
|
Encoding::Windows_1250 => codepoints_1byte,
|
||||||
Encoding::Windows_1251 => 0x00...0x100,
|
Encoding::Windows_1251 => codepoints_1byte,
|
||||||
Encoding::Windows_1252 => 0x00...0x100,
|
Encoding::Windows_1252 => codepoints_1byte,
|
||||||
Encoding::Windows_1253 => 0x00...0x100,
|
Encoding::Windows_1253 => codepoints_1byte,
|
||||||
Encoding::Windows_1254 => 0x00...0x100,
|
Encoding::Windows_1254 => codepoints_1byte,
|
||||||
Encoding::Windows_1255 => 0x00...0x100,
|
Encoding::Windows_1255 => codepoints_1byte,
|
||||||
Encoding::Windows_1256 => 0x00...0x100,
|
Encoding::Windows_1256 => codepoints_1byte,
|
||||||
Encoding::Windows_1257 => 0x00...0x100,
|
Encoding::Windows_1257 => codepoints_1byte,
|
||||||
Encoding::Windows_1258 => 0x00...0x100,
|
Encoding::Windows_1258 => codepoints_1byte,
|
||||||
Encoding::Windows_874 => 0x00...0x100,
|
Encoding::Windows_874 => codepoints_1byte,
|
||||||
Encoding::Big5 => 0x00...0x10000,
|
Encoding::Big5 => codepoints_2bytes,
|
||||||
Encoding::Big5_HKSCS => 0x00...0x10000,
|
Encoding::Big5_HKSCS => codepoints_2bytes,
|
||||||
Encoding::Big5_UAO => 0x00...0x10000,
|
Encoding::Big5_UAO => codepoints_2bytes,
|
||||||
Encoding::CP949 => 0x00...0x10000,
|
Encoding::CP949 => codepoints_2bytes,
|
||||||
Encoding::CP950 => 0x00...0x10000,
|
Encoding::CP950 => codepoints_2bytes,
|
||||||
Encoding::GBK => 0x00...0x10000,
|
Encoding::GBK => codepoints_2bytes,
|
||||||
Encoding::MACJAPANESE => 0x00...0x10000,
|
Encoding::MACJAPANESE => codepoints_2bytes,
|
||||||
Encoding::Shift_JIS => 0x00...0x10000,
|
Encoding::Shift_JIS => codepoints_2bytes,
|
||||||
Encoding::SJIS_DoCoMo => 0x00...0x10000,
|
Encoding::SJIS_DoCoMo => codepoints_2bytes,
|
||||||
Encoding::SJIS_KDDI => 0x00...0x10000,
|
Encoding::SJIS_KDDI => codepoints_2bytes,
|
||||||
Encoding::SJIS_SoftBank => 0x00...0x10000,
|
Encoding::SJIS_SoftBank => codepoints_2bytes,
|
||||||
Encoding::Windows_31J => 0x00...0x10000,
|
Encoding::Windows_31J => codepoints_2bytes,
|
||||||
}
|
}
|
||||||
|
|
||||||
# By default we don't test every codepoint in these encodings because they
|
# By default we don't test every codepoint in these encodings because they
|
||||||
# are 3 and 4 byte representations so it can drastically slow down the test
|
# are 3 and 4 byte representations so it can drastically slow down the test
|
||||||
# suite.
|
# suite.
|
||||||
if ENV["PRISM_TEST_ALL_ENCODINGS"]
|
if ENV["PRISM_TEST_ALL_ENCODINGS"]
|
||||||
|
codepoints_eucjp = [*(0x00...0x10000), *(0x00...0x10000).map { |bytes| bytes | 0x8F0000 }]
|
||||||
|
codepoints_unicode = 0x00...0x110000
|
||||||
|
|
||||||
|
encodings.clear
|
||||||
encodings.merge!(
|
encodings.merge!(
|
||||||
Encoding::CP51932 => [*(0x00...0x10000), *(0x00...0x10000).map { |bytes| bytes | 0x8F0000 }],
|
Encoding::CP51932 => codepoints_eucjp,
|
||||||
Encoding::EUC_JP => [*(0x00...0x10000), *(0x00...0x10000).map { |bytes| bytes | 0x8F0000 }],
|
Encoding::EUC_JP => codepoints_eucjp,
|
||||||
Encoding::UTF_8 => 0x00...0x110000,
|
Encoding::EUCJP_MS => codepoints_eucjp,
|
||||||
Encoding::UTF8_MAC => 0x00...0x110000,
|
Encoding::EUC_JIS_2004 => codepoints_eucjp,
|
||||||
Encoding::UTF8_DoCoMo => 0x00...0x110000,
|
Encoding::UTF_8 => codepoints_unicode,
|
||||||
Encoding::UTF8_KDDI => 0x00...0x110000,
|
Encoding::UTF8_MAC => codepoints_unicode,
|
||||||
Encoding::UTF8_SoftBank => 0x00...0x110000
|
Encoding::UTF8_DoCoMo => codepoints_unicode,
|
||||||
|
Encoding::UTF8_KDDI => codepoints_unicode,
|
||||||
|
Encoding::UTF8_SoftBank => codepoints_unicode
|
||||||
)
|
)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue