diff --git a/prism/enc/pm_encoding.h b/prism/enc/pm_encoding.h index 7d73576665..97ebc68a4d 100644 --- a/prism/enc/pm_encoding.h +++ b/prism/enc/pm_encoding.h @@ -180,8 +180,9 @@ extern pm_encoding_t pm_encoding_koi8_r; extern pm_encoding_t pm_encoding_shift_jis; extern pm_encoding_t pm_encoding_utf_8; extern pm_encoding_t pm_encoding_utf8_mac; -extern pm_encoding_t pm_encoding_windows_31j; +extern pm_encoding_t pm_encoding_windows_1250; extern pm_encoding_t pm_encoding_windows_1251; extern pm_encoding_t pm_encoding_windows_1252; +extern pm_encoding_t pm_encoding_windows_31j; #endif diff --git a/prism/enc/pm_tables.c b/prism/enc/pm_tables.c index 7b840acfaa..3ed5523c7f 100644 --- a/prism/enc/pm_tables.c +++ b/prism/enc/pm_tables.c @@ -408,6 +408,30 @@ static uint8_t pm_encoding_koi8_r_table[256] = { 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Fx }; +/** + * Each element of the following table contains a bitfield that indicates a + * piece of information about the corresponding windows-1250 character. 
+ */ +static uint8_t pm_encoding_windows_1250_table[256] = { +// 0 1 2 3 4 5 6 7 8 9 A B C D E F + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x + 0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x + 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 7, 7, 7, 7, // 8x + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 3, 3, 3, // 9x + 0, 0, 0, 7, 0, 7, 0, 0, 0, 0, 7, 0, 0, 0, 0, 7, // Ax + 0, 0, 0, 3, 0, 3, 0, 0, 0, 3, 3, 0, 7, 0, 3, 3, // Bx + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx + 7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex + 3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx +}; + /** * Each element of the following table contains a bitfield that indicates a * piece of information about the corresponding windows-1251 character. 
@@ -537,6 +561,7 @@ PRISM_ENCODING_TABLE(iso_8859_14) PRISM_ENCODING_TABLE(iso_8859_15) PRISM_ENCODING_TABLE(iso_8859_16) PRISM_ENCODING_TABLE(koi8_r) +PRISM_ENCODING_TABLE(windows_1250) PRISM_ENCODING_TABLE(windows_1251) PRISM_ENCODING_TABLE(windows_1252) @@ -722,6 +747,16 @@ pm_encoding_t pm_encoding_koi8_r = { .multibyte = false }; +/** Windows-1250 */ +pm_encoding_t pm_encoding_windows_1250 = { + .name = "windows-1250", + .char_width = pm_encoding_single_char_width, + .alnum_char = pm_encoding_windows_1250_alnum_char, + .alpha_char = pm_encoding_windows_1250_alpha_char, + .isupper_char = pm_encoding_windows_1250_isupper_char, + .multibyte = false +}; + /** Windows-1251 */ pm_encoding_t pm_encoding_windows_1251 = { .name = "windows-1251", diff --git a/prism/prism.c b/prism/prism.c index 572dc1f146..33d50acc3d 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -6022,10 +6022,18 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star } // Next, we're going to check for UTF-8. This is the most common encoding. - // Extensions like utf-8 can contain extra encoding details like, - // utf-8-dos, utf-8-linux, utf-8-mac. We treat these all as utf-8 should - // treat any encoding starting utf-8 as utf-8. + // utf-8 can contain extra information at the end about the platform it is + // encoded on, such as utf-8-mac or utf-8-unix. We'll ignore those suffixes. if ((start + 5 <= end) && (pm_strncasecmp(start, (const uint8_t *) "utf-8", 5) == 0)) { + // We need to explicitly handle utf-8-hfs, as that one needs to switch + // over to being utf8-mac. + if (width == 9 && (pm_strncasecmp(start + 5, (const uint8_t *) "-hfs", 4) == 0)) { + parser->encoding = pm_encoding_utf8_mac; + parser->encoding_changed = true; + if (parser->encoding_changed_callback != NULL) parser->encoding_changed_callback(parser); + return true; + } + // We don't need to do anything here because the default encoding is // already UTF-8. We'll just return. 
return true; @@ -6036,48 +6044,58 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star #define ENCODING(value, prebuilt) \ if (width == sizeof(value) - 1 && start + width <= end && pm_strncasecmp(start, (const uint8_t *) value, width) == 0) { \ parser->encoding = prebuilt; \ - parser->encoding_changed |= true; \ + parser->encoding_changed = true; \ if (parser->encoding_changed_callback != NULL) parser->encoding_changed_callback(parser); \ return true; \ } + // Build convenience macros to compare aliases for the same encoding. +#define ENCODING2(value1, value2, prebuilt) ENCODING(value1, prebuilt) ENCODING(value2, prebuilt) +#define ENCODING3(value1, value2, value3, prebuilt) ENCODING2(value1, value2, prebuilt) ENCODING(value3, prebuilt) +#define ENCODING4(value1, value2, value3, value4, prebuilt) ENCODING3(value1, value2, value3, prebuilt) ENCODING(value4, prebuilt) +#define ENCODING5(value1, value2, value3, value4, value5, prebuilt) ENCODING4(value1, value2, value3, value4, prebuilt) ENCODING(value5, prebuilt) + // Check most common first. (This is pretty arbitrary.) - ENCODING("ascii", pm_encoding_ascii); - ENCODING("ascii-8bit", pm_encoding_ascii_8bit); - ENCODING("us-ascii", pm_encoding_ascii); - ENCODING("binary", pm_encoding_ascii_8bit); - ENCODING("shift_jis", pm_encoding_shift_jis); - ENCODING("euc-jp", pm_encoding_euc_jp); + ENCODING("ASCII", pm_encoding_ascii); + ENCODING("ASCII-8BIT", pm_encoding_ascii_8bit); + ENCODING("US-ASCII", pm_encoding_ascii); + ENCODING("BINARY", pm_encoding_ascii_8bit); + ENCODING("Shift_JIS", pm_encoding_shift_jis); + ENCODING("EUC-JP", pm_encoding_euc_jp); // Then check all the others. 
- ENCODING("big5", pm_encoding_big5); + ENCODING2("ANSI_X3.4-1968", "646", pm_encoding_ascii); ENCODING("cp51932", pm_encoding_cp51932); - ENCODING("gbk", pm_encoding_gbk); - ENCODING("iso-8859-1", pm_encoding_iso_8859_1); - ENCODING("iso-8859-2", pm_encoding_iso_8859_2); - ENCODING("iso-8859-3", pm_encoding_iso_8859_3); - ENCODING("iso-8859-4", pm_encoding_iso_8859_4); - ENCODING("iso-8859-5", pm_encoding_iso_8859_5); - ENCODING("iso-8859-6", pm_encoding_iso_8859_6); - ENCODING("iso-8859-7", pm_encoding_iso_8859_7); - ENCODING("iso-8859-8", pm_encoding_iso_8859_8); - ENCODING("iso-8859-9", pm_encoding_iso_8859_9); - ENCODING("iso-8859-10", pm_encoding_iso_8859_10); - ENCODING("iso-8859-11", pm_encoding_iso_8859_11); - ENCODING("iso-8859-13", pm_encoding_iso_8859_13); - ENCODING("iso-8859-14", pm_encoding_iso_8859_14); - ENCODING("iso-8859-15", pm_encoding_iso_8859_15); - ENCODING("iso-8859-16", pm_encoding_iso_8859_16); - ENCODING("koi8-r", pm_encoding_koi8_r); - ENCODING("windows-31j", pm_encoding_windows_31j); - ENCODING("windows-1251", pm_encoding_windows_1251); - ENCODING("windows-1252", pm_encoding_windows_1252); - ENCODING("cp1251", pm_encoding_windows_1251); - ENCODING("cp1252", pm_encoding_windows_1252); - ENCODING("cp932", pm_encoding_windows_31j); - ENCODING("sjis", pm_encoding_windows_31j); - ENCODING("utf8-mac", pm_encoding_utf8_mac); + ENCODING("eucJP", pm_encoding_euc_jp); + ENCODING("Big5", pm_encoding_big5); + ENCODING2("GBK", "CP936", pm_encoding_gbk); + ENCODING2("ISO-8859-1", "ISO8859-1", pm_encoding_iso_8859_1); + ENCODING2("ISO-8859-2", "ISO8859-2", pm_encoding_iso_8859_2); + ENCODING2("ISO-8859-3", "ISO8859-3", pm_encoding_iso_8859_3); + ENCODING2("ISO-8859-4", "ISO8859-4", pm_encoding_iso_8859_4); + ENCODING2("ISO-8859-5", "ISO8859-5", pm_encoding_iso_8859_5); + ENCODING2("ISO-8859-6", "ISO8859-6", pm_encoding_iso_8859_6); + ENCODING2("ISO-8859-7", "ISO8859-7", pm_encoding_iso_8859_7); + ENCODING2("ISO-8859-8", "ISO8859-8", 
pm_encoding_iso_8859_8); + ENCODING2("ISO-8859-9", "ISO8859-9", pm_encoding_iso_8859_9); + ENCODING2("ISO-8859-10", "ISO8859-10", pm_encoding_iso_8859_10); + ENCODING2("ISO-8859-11", "ISO8859-11", pm_encoding_iso_8859_11); + ENCODING2("ISO-8859-13", "ISO8859-13", pm_encoding_iso_8859_13); + ENCODING2("ISO-8859-14", "ISO8859-14", pm_encoding_iso_8859_14); + ENCODING2("ISO-8859-15", "ISO8859-15", pm_encoding_iso_8859_15); + ENCODING2("ISO-8859-16", "ISO8859-16", pm_encoding_iso_8859_16); + ENCODING2("KOI8-R", "CP878", pm_encoding_koi8_r); + ENCODING4("CP65001", "locale", "external", "filesystem", pm_encoding_utf_8); + ENCODING3("UTF8-MAC", "UTF-8-MAC", "UTF-8-HFS", pm_encoding_utf8_mac); + ENCODING2("Windows-1250", "CP1250", pm_encoding_windows_1250); + ENCODING2("Windows-1251", "CP1251", pm_encoding_windows_1251); + ENCODING2("Windows-1252", "CP1252", pm_encoding_windows_1252); + ENCODING5("Windows-31J", "CP932", "csWindows31J", "SJIS", "PCK", pm_encoding_windows_31j); +#undef ENCODING2 +#undef ENCODING3 +#undef ENCODING4 +#undef ENCODING5 #undef ENCODING return false; diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb index f3a24fa9dc..9e18989ad3 100644 --- a/test/prism/encoding_test.rb +++ b/test/prism/encoding_test.rb @@ -4,45 +4,49 @@ require_relative "test_helper" module Prism class EncodingTest < TestCase - %w[ - ascii - ascii-8bit - big5 - binary - euc-jp - gbk - iso-8859-1 - iso-8859-2 - iso-8859-3 - iso-8859-4 - iso-8859-5 - iso-8859-6 - iso-8859-7 - iso-8859-8 - iso-8859-9 - iso-8859-10 - iso-8859-11 - iso-8859-13 - iso-8859-14 - iso-8859-15 - iso-8859-16 - koi8-r - shift_jis - sjis - us-ascii - utf-8 - utf8-mac - windows-31j - windows-1251 - windows-1252 - CP1251 - CP1252 - CP51932 - ].each do |encoding| - define_method "test_encoding_#{encoding}" do - result = Prism.parse("# encoding: #{encoding}\n'string'") - actual = result.value.statements.body.first.unescaped.encoding - assert_equal Encoding.find(encoding), actual + [ + 
"US-ASCII", + "ASCII-8BIT", + "Big5", + "CP51932", + "EUC-JP", + "GBK", + "ISO-8859-1", + "ISO-8859-2", + "ISO-8859-3", + "ISO-8859-4", + "ISO-8859-5", + "ISO-8859-6", + "ISO-8859-7", + "ISO-8859-8", + "ISO-8859-9", + "ISO-8859-10", + "ISO-8859-11", + "ISO-8859-13", + "ISO-8859-14", + "ISO-8859-15", + "ISO-8859-16", + "KOI8-R", + "Shift_JIS", + "Windows-31J", + "UTF-8", + "UTF8-MAC", + "Windows-1250", + "Windows-1251", + "Windows-1252", + ].each do |canonical_name| + encoding = Encoding.find(canonical_name) + + encoding.names.each do |name| + # Even though UTF-8-MAC is an alias for UTF8-MAC, CRuby treats it as + # UTF-8. So we'll skip this test. + next if name == "UTF-8-MAC" + + define_method "test_encoding_#{name}" do + result = Prism.parse("# encoding: #{name}\n'string'") + actual = result.value.statements.body.first.unescaped.encoding + assert_equal encoding, actual + end end end