[ruby/prism] Add windows-1250 encoding

a362535ca4
2025-08-27 23:16:42 +02:00 · 2023-11-11 22:52:23 -05:00 · 2023-11-11 22:52:23 -05:00 · ca789e7232
commit ca789e7232
parent aebc6e8b8d
4 changed files with 134 additions and 76 deletions
--- a/prism/enc/pm_encoding.h
+++ b/prism/enc/pm_encoding.h
@ -180,8 +180,9 @@ extern pm_encoding_t pm_encoding_koi8_r;
 extern pm_encoding_t pm_encoding_shift_jis;
 extern pm_encoding_t pm_encoding_utf_8;
 extern pm_encoding_t pm_encoding_utf8_mac;
-extern pm_encoding_t pm_encoding_windows_31j;
+extern pm_encoding_t pm_encoding_windows_1250;
 extern pm_encoding_t pm_encoding_windows_1251;
 extern pm_encoding_t pm_encoding_windows_1252;
 extern pm_encoding_t pm_encoding_windows_31j;
 #endif
--- a/prism/enc/pm_tables.c
+++ b/prism/enc/pm_tables.c
@ -408,6 +408,30 @@ static uint8_t pm_encoding_koi8_r_table[256] = {
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Fx
 };
 /**
 * Each element of the following table contains a bitfield that indicates a
 * piece of information about the corresponding windows-1250 character.
 *
 * The table is indexed by byte value: each row below is one high nibble and
 * each column one low nibble. Judging by the ASCII rows, entries are 0 for
 * bytes that are neither letters nor digits, 2 for digits, 3 for lowercase
 * letters and 7 for uppercase letters; the 8x-Fx rows use only the values 0,
 * 3 and 7.
 *
 * NOTE(review): presumably bit 0 means alphabetic, bit 1 alphanumeric and
 * bit 2 uppercase, matching the alpha_char / alnum_char / isupper_char
 * callbacks of pm_encoding_windows_1250 -- confirm against the bit constants,
 * which are not visible in this hunk.
 */
 static uint8_t pm_encoding_windows_1250_table[256] = {
 //  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 1x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 2x
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, // 3x
    0, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // 4x
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, 0, // 5x
    0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 6x
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, // 7x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 7, 7, 7, 7, // 8x
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 3, 3, 3, 3, // 9x
    0, 0, 0, 7, 0, 7, 0, 0, 0, 0, 7, 0, 0, 0, 0, 7, // Ax
    0, 0, 0, 3, 0, 3, 0, 0, 0, 3, 3, 0, 7, 0, 3, 3, // Bx
    7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // Cx
    7, 7, 7, 7, 7, 7, 7, 0, 7, 7, 7, 7, 7, 7, 7, 3, // Dx
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // Ex
    3, 3, 3, 3, 3, 3, 3, 0, 3, 3, 3, 3, 3, 3, 3, 0, // Fx
 };
 /**
 * Each element of the following table contains a bitfield that indicates a
 * piece of information about the corresponding windows-1251 character.
@ -537,6 +561,7 @@ PRISM_ENCODING_TABLE(iso_8859_14)
 PRISM_ENCODING_TABLE(iso_8859_15)
 PRISM_ENCODING_TABLE(iso_8859_16)
 PRISM_ENCODING_TABLE(koi8_r)
 PRISM_ENCODING_TABLE(windows_1250)
 PRISM_ENCODING_TABLE(windows_1251)
 PRISM_ENCODING_TABLE(windows_1252)
@ -722,6 +747,16 @@ pm_encoding_t pm_encoding_koi8_r = {
    .multibyte = false
 };
 /**
 * Windows-1250: a single-byte encoding (multibyte is false and the character
 * width comes from pm_encoding_single_char_width). The character class
 * callbacks are the windows-1250 specific functions; presumably these are
 * generated by PRISM_ENCODING_TABLE(windows_1250) from
 * pm_encoding_windows_1250_table -- TODO confirm, the macro body is not
 * visible in this hunk.
 */
 pm_encoding_t pm_encoding_windows_1250 = {
    .name = "windows-1250",
    .char_width = pm_encoding_single_char_width,
    .alnum_char = pm_encoding_windows_1250_alnum_char,
    .alpha_char = pm_encoding_windows_1250_alpha_char,
    .isupper_char = pm_encoding_windows_1250_isupper_char,
    .multibyte = false
 };
 /** Windows-1251 */
 pm_encoding_t pm_encoding_windows_1251 = {
    .name = "windows-1251",
--- a/prism/prism.c
+++ b/prism/prism.c
@ -6022,10 +6022,18 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
    }
    // Next, we're going to check for UTF-8. This is the most common encoding.
-    // Extensions like utf-8 can contain extra encoding details like,
+    // utf-8 can contain extra information at the end about the platform it is
-    // utf-8-dos, utf-8-linux, utf-8-mac. We treat these all as utf-8 should
+    // encoded on, such as utf-8-mac or utf-8-unix. We'll ignore those suffixes.
    // That is, any encoding starting with utf-8 (except utf-8-hfs) is utf-8.
    if ((start + 5 <= end) && (pm_strncasecmp(start, (const uint8_t *) "utf-8", 5) == 0)) {
        // We need to explicitly handle utf-8-hfs, as that one needs to switch
        // over to being utf8-mac.
        if (width == 9 && (pm_strncasecmp(start + 5, (const uint8_t *) "-hfs", 4) == 0)) {
            parser->encoding = pm_encoding_utf8_mac;
            parser->encoding_changed = true;
            if (parser->encoding_changed_callback != NULL) parser->encoding_changed_callback(parser);
            return true;
        }
        // We don't need to do anything here because the default encoding is
        // already UTF-8. We'll just return.
        return true;
@ -6036,48 +6044,58 @@ parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *star
 #define ENCODING(value, prebuilt) \
    if (width == sizeof(value) - 1 && start + width <= end && pm_strncasecmp(start, (const uint8_t *) value, width) == 0) { \
        parser->encoding = prebuilt; \
-        parser->encoding_changed |= true; \
+        parser->encoding_changed = true; \
        if (parser->encoding_changed_callback != NULL) parser->encoding_changed_callback(parser); \
        return true; \
    }
    // Build convenience macros to compare aliases for the same encoding.
 #define ENCODING2(value1, value2, prebuilt) ENCODING(value1, prebuilt) ENCODING(value2, prebuilt)
 #define ENCODING3(value1, value2, value3, prebuilt) ENCODING2(value1, value2, prebuilt) ENCODING(value3, prebuilt)
 #define ENCODING4(value1, value2, value3, value4, prebuilt) ENCODING3(value1, value2, value3, prebuilt) ENCODING(value4, prebuilt)
 #define ENCODING5(value1, value2, value3, value4, value5, prebuilt) ENCODING4(value1, value2, value3, value4, prebuilt) ENCODING(value5, prebuilt)
    // Check most common first. (This is pretty arbitrary.)
-    ENCODING("ascii", pm_encoding_ascii);
+    ENCODING("ASCII", pm_encoding_ascii);
-    ENCODING("ascii-8bit", pm_encoding_ascii_8bit);
+    ENCODING("ASCII-8BIT", pm_encoding_ascii_8bit);
-    ENCODING("us-ascii", pm_encoding_ascii);
+    ENCODING("US-ASCII", pm_encoding_ascii);
-    ENCODING("binary", pm_encoding_ascii_8bit);
+    ENCODING("BINARY", pm_encoding_ascii_8bit);
-    ENCODING("shift_jis", pm_encoding_shift_jis);
+    ENCODING("Shift_JIS", pm_encoding_shift_jis);
-    ENCODING("euc-jp", pm_encoding_euc_jp);
+    ENCODING("EUC-JP", pm_encoding_euc_jp);
    // Then check all the others.
-    ENCODING("big5", pm_encoding_big5);
+    ENCODING2("ANSI_X3.4-1968", "646", pm_encoding_ascii);
    ENCODING("cp51932", pm_encoding_cp51932);
-    ENCODING("gbk", pm_encoding_gbk);
+    ENCODING("eucJP", pm_encoding_euc_jp);
-    ENCODING("iso-8859-1", pm_encoding_iso_8859_1);
+    ENCODING("Big5", pm_encoding_big5);
-    ENCODING("iso-8859-2", pm_encoding_iso_8859_2);
+    ENCODING2("GBK", "CP936", pm_encoding_gbk);
-    ENCODING("iso-8859-3", pm_encoding_iso_8859_3);
+    ENCODING2("ISO-8859-1", "ISO8859-1", pm_encoding_iso_8859_1);
-    ENCODING("iso-8859-4", pm_encoding_iso_8859_4);
+    ENCODING2("ISO-8859-2", "ISO8859-2", pm_encoding_iso_8859_2);
-    ENCODING("iso-8859-5", pm_encoding_iso_8859_5);
+    ENCODING2("ISO-8859-3", "ISO8859-3", pm_encoding_iso_8859_3);
-    ENCODING("iso-8859-6", pm_encoding_iso_8859_6);
+    ENCODING2("ISO-8859-4", "ISO8859-4", pm_encoding_iso_8859_4);
-    ENCODING("iso-8859-7", pm_encoding_iso_8859_7);
+    ENCODING2("ISO-8859-5", "ISO8859-5", pm_encoding_iso_8859_5);
-    ENCODING("iso-8859-8", pm_encoding_iso_8859_8);
+    ENCODING2("ISO-8859-6", "ISO8859-6", pm_encoding_iso_8859_6);
-    ENCODING("iso-8859-9", pm_encoding_iso_8859_9);
+    ENCODING2("ISO-8859-7", "ISO8859-7", pm_encoding_iso_8859_7);
-    ENCODING("iso-8859-10", pm_encoding_iso_8859_10);
+    ENCODING2("ISO-8859-8", "ISO8859-8", pm_encoding_iso_8859_8);
-    ENCODING("iso-8859-11", pm_encoding_iso_8859_11);
+    ENCODING2("ISO-8859-9", "ISO8859-9", pm_encoding_iso_8859_9);
-    ENCODING("iso-8859-13", pm_encoding_iso_8859_13);
+    ENCODING2("ISO-8859-10", "ISO8859-10", pm_encoding_iso_8859_10);
-    ENCODING("iso-8859-14", pm_encoding_iso_8859_14);
+    ENCODING2("ISO-8859-11", "ISO8859-11", pm_encoding_iso_8859_11);
-    ENCODING("iso-8859-15", pm_encoding_iso_8859_15);
+    ENCODING2("ISO-8859-13", "ISO8859-13", pm_encoding_iso_8859_13);
-    ENCODING("iso-8859-16", pm_encoding_iso_8859_16);
+    ENCODING2("ISO-8859-14", "ISO8859-14", pm_encoding_iso_8859_14);
-    ENCODING("koi8-r", pm_encoding_koi8_r);
+    ENCODING2("ISO-8859-15", "ISO8859-15", pm_encoding_iso_8859_15);
-    ENCODING("windows-31j", pm_encoding_windows_31j);
+    ENCODING2("ISO-8859-16", "ISO8859-16", pm_encoding_iso_8859_16);
-    ENCODING("windows-1251", pm_encoding_windows_1251);
+    ENCODING2("KOI8-R", "CP878", pm_encoding_koi8_r);
-    ENCODING("windows-1252", pm_encoding_windows_1252);
+    ENCODING4("CP65001", "locale", "external", "filesystem", pm_encoding_utf_8);
-    ENCODING("cp1251", pm_encoding_windows_1251);
+    ENCODING3("UTF8-MAC", "UTF-8-MAC", "UTF-8-HFS", pm_encoding_utf8_mac);
-    ENCODING("cp1252", pm_encoding_windows_1252);
+    ENCODING2("Windows-1250", "CP1250", pm_encoding_windows_1250);
-    ENCODING("cp932", pm_encoding_windows_31j);
+    ENCODING2("Windows-1251", "CP1251", pm_encoding_windows_1251);
-    ENCODING("sjis", pm_encoding_windows_31j);
+    ENCODING2("Windows-1252", "CP1252", pm_encoding_windows_1252);
-    ENCODING("utf8-mac", pm_encoding_utf8_mac);
+    ENCODING5("Windows-31J", "CP932", "csWindows31J", "SJIS", "PCK", pm_encoding_windows_31j);
 #undef ENCODING2
 #undef ENCODING3
 #undef ENCODING4
 #undef ENCODING5
 #undef ENCODING
    return false;
--- a/test/prism/encoding_test.rb
+++ b/test/prism/encoding_test.rb
@ -4,45 +4,49 @@ require_relative "test_helper"
 module Prism
  class EncodingTest < TestCase
-    %w[
+    [
-      ascii
+      "US-ASCII",
-      ascii-8bit
+      "ASCII-8BIT",
-      big5
+      "Big5",
-      binary
+      "CP51932",
-      euc-jp
+      "EUC-JP",
-      gbk
+      "GBK",
-      iso-8859-1
+      "ISO-8859-1",
-      iso-8859-2
+      "ISO-8859-2",
-      iso-8859-3
+      "ISO-8859-3",
-      iso-8859-4
+      "ISO-8859-4",
-      iso-8859-5
+      "ISO-8859-5",
-      iso-8859-6
+      "ISO-8859-6",
-      iso-8859-7
+      "ISO-8859-7",
-      iso-8859-8
+      "ISO-8859-8",
-      iso-8859-9
+      "ISO-8859-9",
-      iso-8859-10
+      "ISO-8859-10",
-      iso-8859-11
+      "ISO-8859-11",
-      iso-8859-13
+      "ISO-8859-13",
-      iso-8859-14
+      "ISO-8859-14",
-      iso-8859-15
+      "ISO-8859-15",
-      iso-8859-16
+      "ISO-8859-16",
-      koi8-r
+      "KOI8-R",
-      shift_jis
+      "Shift_JIS",
-      sjis
+      "Windows-31J",
-      us-ascii
+      "UTF-8",
-      utf-8
+      "UTF8-MAC",
-      utf8-mac
+      "Windows-1250",
-      windows-31j
+      "Windows-1251",
-      windows-1251
+      "Windows-1252",
-      windows-1252
+    ].each do |canonical_name|
-      CP1251
+      encoding = Encoding.find(canonical_name)
-      CP1252
+
-      CP51932
+      encoding.names.each do |name|
-    ].each do |encoding|
+        # Even though UTF-8-MAC is an alias for UTF8-MAC, CRuby treats it as
-      define_method "test_encoding_#{encoding}" do
+        # UTF-8. So we'll skip this test.
-        result = Prism.parse("# encoding: #{encoding}\n'string'")
+        next if name == "UTF-8-MAC"
-        actual = result.value.statements.body.first.unescaped.encoding
+
-        assert_equal Encoding.find(encoding), actual
+        define_method "test_encoding_#{name}" do
          result = Prism.parse("# encoding: #{name}\n'string'")
          actual = result.value.statements.body.first.unescaped.encoding
          assert_equal encoding, actual
        end
      end
    end