diff --git a/prism/enc/pm_encoding.h b/prism/enc/pm_encoding.h index 7dfc8cd982..e81ecad25b 100644 --- a/prism/enc/pm_encoding.h +++ b/prism/enc/pm_encoding.h @@ -167,6 +167,8 @@ extern pm_encoding_t pm_encoding_cp855; extern pm_encoding_t pm_encoding_cp949; extern pm_encoding_t pm_encoding_cp950; extern pm_encoding_t pm_encoding_euc_jp; +extern pm_encoding_t pm_encoding_euc_jp_ms; +extern pm_encoding_t pm_encoding_euc_jis_2004; extern pm_encoding_t pm_encoding_gb1988; extern pm_encoding_t pm_encoding_gbk; extern pm_encoding_t pm_encoding_ibm437; diff --git a/prism/enc/pm_euc_jp.c b/prism/enc/pm_euc_jp.c index 6468712607..9bee6a1292 100644 --- a/prism/enc/pm_euc_jp.c +++ b/prism/enc/pm_euc_jp.c @@ -11,8 +11,8 @@ pm_encoding_euc_jp_char_width(const uint8_t *b, ptrdiff_t n) { if ( (n > 1) && ( - ((b[0] == 0x8E) && (b[1] >= 0xA1 && b[1] <= 0xFE)) || - ((b[0] >= 0xA1 && b[0] <= 0xFE) && (b[1] >= 0xA1 && b[1] <= 0xFE)) + ((b[0] == 0x8E) || (b[0] >= 0xA1 && b[0] <= 0xFE)) && + (b[1] >= 0xA1 && b[1] <= 0xFE) ) ) { return 2; @@ -60,7 +60,27 @@ pm_encoding_euc_jp_isupper_char(const uint8_t *b, ptrdiff_t n) { /** EUC-JP encoding */ pm_encoding_t pm_encoding_euc_jp = { - .name = "euc-jp", + .name = "EUC-JP", + .char_width = pm_encoding_euc_jp_char_width, + .alnum_char = pm_encoding_euc_jp_alnum_char, + .alpha_char = pm_encoding_euc_jp_alpha_char, + .isupper_char = pm_encoding_euc_jp_isupper_char, + .multibyte = true +}; + +/** eucJP-ms encoding */ +pm_encoding_t pm_encoding_euc_jp_ms = { + .name = "eucJP-ms", + .char_width = pm_encoding_euc_jp_char_width, + .alnum_char = pm_encoding_euc_jp_alnum_char, + .alpha_char = pm_encoding_euc_jp_alpha_char, + .isupper_char = pm_encoding_euc_jp_isupper_char, + .multibyte = true +}; + +/** EUC-JIS-2004 encoding */ +pm_encoding_t pm_encoding_euc_jis_2004 = { + .name = "EUC-JIS-2004", .char_width = pm_encoding_euc_jp_char_width, .alnum_char = pm_encoding_euc_jp_alnum_char, .alpha_char = pm_encoding_euc_jp_alpha_char, diff --git a/prism/prism.c b/prism/prism.c index 0cabae6232..4679ebb4ed 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -6248,6 +6248,8 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star break; case 'E': case 'e': ENCODING2("EUC-JP", "eucJP", pm_encoding_euc_jp); + ENCODING2("eucJP-ms", "euc-jp-ms", pm_encoding_euc_jp_ms); + ENCODING2("EUC-JIS-2004", "EUC-JISX0213", pm_encoding_euc_jis_2004); ENCODING1("external", pm_encoding_utf_8); break; case 'F': case 'f': diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb index db51d653f9..b206ab20e5 100644 --- a/test/prism/encoding_test.rb +++ b/test/prism/encoding_test.rb @@ -6,92 +6,101 @@ require_relative "test_helper" module Prism class EncodingTest < TestCase + codepoints_1byte = 0x00...0x100 + codepoints_2bytes = 0x00...0x10000 + encodings = { - Encoding::ASCII => 0x00...0x100, - Encoding::ASCII_8BIT => 0x00...0x100, - Encoding::CP850 => 0x00...0x100, - Encoding::CP852 => 0x00...0x100, - Encoding::CP855 => 0x00...0x100, - Encoding::GB1988 => 0x00...0x100, - Encoding::IBM437 => 0x00...0x100, - Encoding::IBM720 => 0x00...0x100, - Encoding::IBM737 => 0x00...0x100, - Encoding::IBM775 => 0x00...0x100, - Encoding::IBM852 => 0x00...0x100, - Encoding::IBM855 => 0x00...0x100, - Encoding::IBM857 => 0x00...0x100, - Encoding::IBM860 => 0x00...0x100, - Encoding::IBM861 => 0x00...0x100, - Encoding::IBM862 => 0x00...0x100, - Encoding::IBM863 => 0x00...0x100, - Encoding::IBM864 => 0x00...0x100, - Encoding::IBM865 => 0x00...0x100, - Encoding::IBM866 => 0x00...0x100, - Encoding::IBM869 => 0x00...0x100, - Encoding::ISO_8859_1 => 0x00...0x100, - Encoding::ISO_8859_2 => 0x00...0x100, - Encoding::ISO_8859_3 => 0x00...0x100, - Encoding::ISO_8859_4 => 0x00...0x100, - Encoding::ISO_8859_5 => 0x00...0x100, - Encoding::ISO_8859_6 => 0x00...0x100, - Encoding::ISO_8859_7 => 0x00...0x100, - Encoding::ISO_8859_8 => 0x00...0x100, - Encoding::ISO_8859_9 => 0x00...0x100, - Encoding::ISO_8859_10 => 0x00...0x100, - Encoding::ISO_8859_11 => 0x00...0x100, - Encoding::ISO_8859_13 => 0x00...0x100, - Encoding::ISO_8859_14 => 0x00...0x100, - Encoding::ISO_8859_15 => 0x00...0x100, - Encoding::ISO_8859_16 => 0x00...0x100, - Encoding::KOI8_R => 0x00...0x100, - Encoding::KOI8_U => 0x00...0x100, - Encoding::MACCENTEURO => 0x00...0x100, - Encoding::MACCROATIAN => 0x00...0x100, - Encoding::MACCYRILLIC => 0x00...0x100, - Encoding::MACGREEK => 0x00...0x100, - Encoding::MACICELAND => 0x00...0x100, - Encoding::MACROMAN => 0x00...0x100, - Encoding::MACROMANIA => 0x00...0x100, - Encoding::MACTHAI => 0x00...0x100, - Encoding::MACTURKISH => 0x00...0x100, - Encoding::MACUKRAINE => 0x00...0x100, - Encoding::TIS_620 => 0x00...0x100, - Encoding::Windows_1250 => 0x00...0x100, - Encoding::Windows_1251 => 0x00...0x100, - Encoding::Windows_1252 => 0x00...0x100, - Encoding::Windows_1253 => 0x00...0x100, - Encoding::Windows_1254 => 0x00...0x100, - Encoding::Windows_1255 => 0x00...0x100, - Encoding::Windows_1256 => 0x00...0x100, - Encoding::Windows_1257 => 0x00...0x100, - Encoding::Windows_1258 => 0x00...0x100, - Encoding::Windows_874 => 0x00...0x100, - Encoding::Big5 => 0x00...0x10000, - Encoding::Big5_HKSCS => 0x00...0x10000, - Encoding::Big5_UAO => 0x00...0x10000, - Encoding::CP949 => 0x00...0x10000, - Encoding::CP950 => 0x00...0x10000, - Encoding::GBK => 0x00...0x10000, - Encoding::MACJAPANESE => 0x00...0x10000, - Encoding::Shift_JIS => 0x00...0x10000, - Encoding::SJIS_DoCoMo => 0x00...0x10000, - Encoding::SJIS_KDDI => 0x00...0x10000, - Encoding::SJIS_SoftBank => 0x00...0x10000, - Encoding::Windows_31J => 0x00...0x10000, + Encoding::ASCII => codepoints_1byte, + Encoding::ASCII_8BIT => codepoints_1byte, + Encoding::CP850 => codepoints_1byte, + Encoding::CP852 => codepoints_1byte, + Encoding::CP855 => codepoints_1byte, + Encoding::GB1988 => codepoints_1byte, + Encoding::IBM437 => codepoints_1byte, + Encoding::IBM720 => codepoints_1byte, + Encoding::IBM737 => codepoints_1byte, + Encoding::IBM775 => codepoints_1byte, + Encoding::IBM852 => codepoints_1byte, + Encoding::IBM855 => codepoints_1byte, + Encoding::IBM857 => codepoints_1byte, + Encoding::IBM860 => codepoints_1byte, + Encoding::IBM861 => codepoints_1byte, + Encoding::IBM862 => codepoints_1byte, + Encoding::IBM863 => codepoints_1byte, + Encoding::IBM864 => codepoints_1byte, + Encoding::IBM865 => codepoints_1byte, + Encoding::IBM866 => codepoints_1byte, + Encoding::IBM869 => codepoints_1byte, + Encoding::ISO_8859_1 => codepoints_1byte, + Encoding::ISO_8859_2 => codepoints_1byte, + Encoding::ISO_8859_3 => codepoints_1byte, + Encoding::ISO_8859_4 => codepoints_1byte, + Encoding::ISO_8859_5 => codepoints_1byte, + Encoding::ISO_8859_6 => codepoints_1byte, + Encoding::ISO_8859_7 => codepoints_1byte, + Encoding::ISO_8859_8 => codepoints_1byte, + Encoding::ISO_8859_9 => codepoints_1byte, + Encoding::ISO_8859_10 => codepoints_1byte, + Encoding::ISO_8859_11 => codepoints_1byte, + Encoding::ISO_8859_13 => codepoints_1byte, + Encoding::ISO_8859_14 => codepoints_1byte, + Encoding::ISO_8859_15 => codepoints_1byte, + Encoding::ISO_8859_16 => codepoints_1byte, + Encoding::KOI8_R => codepoints_1byte, + Encoding::KOI8_U => codepoints_1byte, + Encoding::MACCENTEURO => codepoints_1byte, + Encoding::MACCROATIAN => codepoints_1byte, + Encoding::MACCYRILLIC => codepoints_1byte, + Encoding::MACGREEK => codepoints_1byte, + Encoding::MACICELAND => codepoints_1byte, + Encoding::MACROMAN => codepoints_1byte, + Encoding::MACROMANIA => codepoints_1byte, + Encoding::MACTHAI => codepoints_1byte, + Encoding::MACTURKISH => codepoints_1byte, + Encoding::MACUKRAINE => codepoints_1byte, + Encoding::TIS_620 => codepoints_1byte, + Encoding::Windows_1250 => codepoints_1byte, + Encoding::Windows_1251 => codepoints_1byte, + Encoding::Windows_1252 => codepoints_1byte, + Encoding::Windows_1253 => codepoints_1byte, + Encoding::Windows_1254 => codepoints_1byte, + Encoding::Windows_1255 => codepoints_1byte, + Encoding::Windows_1256 => codepoints_1byte, + Encoding::Windows_1257 => codepoints_1byte, + Encoding::Windows_1258 => codepoints_1byte, + Encoding::Windows_874 => codepoints_1byte, + Encoding::Big5 => codepoints_2bytes, + Encoding::Big5_HKSCS => codepoints_2bytes, + Encoding::Big5_UAO => codepoints_2bytes, + Encoding::CP949 => codepoints_2bytes, + Encoding::CP950 => codepoints_2bytes, + Encoding::GBK => codepoints_2bytes, + Encoding::MACJAPANESE => codepoints_2bytes, + Encoding::Shift_JIS => codepoints_2bytes, + Encoding::SJIS_DoCoMo => codepoints_2bytes, + Encoding::SJIS_KDDI => codepoints_2bytes, + Encoding::SJIS_SoftBank => codepoints_2bytes, + Encoding::Windows_31J => codepoints_2bytes, } # By default we don't test every codepoint in these encodings because they # are 3 and 4 byte representations so it can drastically slow down the test # suite. if ENV["PRISM_TEST_ALL_ENCODINGS"] + codepoints_eucjp = [*(0x00...0x10000), *(0x00...0x10000).map { |bytes| bytes | 0x8F0000 }] + codepoints_unicode = 0x00...0x110000 + + encodings.clear encodings.merge!( - Encoding::CP51932 => [*(0x00...0x10000), *(0x00...0x10000).map { |bytes| bytes | 0x8F0000 }], - Encoding::EUC_JP => [*(0x00...0x10000), *(0x00...0x10000).map { |bytes| bytes | 0x8F0000 }], - Encoding::UTF_8 => 0x00...0x110000, - Encoding::UTF8_MAC => 0x00...0x110000, - Encoding::UTF8_DoCoMo => 0x00...0x110000, - Encoding::UTF8_KDDI => 0x00...0x110000, - Encoding::UTF8_SoftBank => 0x00...0x110000 + Encoding::CP51932 => codepoints_eucjp, + Encoding::EUC_JP => codepoints_eucjp, + Encoding::EUCJP_MS => codepoints_eucjp, + Encoding::EUC_JIS_2004 => codepoints_eucjp, + Encoding::UTF_8 => codepoints_unicode, + Encoding::UTF8_MAC => codepoints_unicode, + Encoding::UTF8_DoCoMo => codepoints_unicode, + Encoding::UTF8_KDDI => codepoints_unicode, + Encoding::UTF8_SoftBank => codepoints_unicode ) end