[ruby/prism] More mixed encoding errors

2a43b4f55c
2025-08-15 13:39:04 +02:00 · 2024-05-16 12:29:14 -04:00 · 2024-05-16 12:29:14 -04:00 · 286d3032a7
commit 286d3032a7
parent 07241dfac8
2 changed files with 40 additions and 6 deletions
--- a/prism/prism.c
+++ b/prism/prism.c
@ -15382,6 +15382,10 @@ parse_string_part(pm_parser_t *parser) {
        //     "aaa #{bbb} #@ccc ddd"
        //          ^^^^^^
        case PM_TOKEN_EMBEXPR_BEGIN: {
+            // Ruby disallows seeing encoding around interpolation in strings,
+            // even though it is known at parse time.
+            parser->explicit_encoding = NULL;
+
            pm_lex_state_t state = parser->lex_state;
            int brace_nesting = parser->brace_nesting;

@ -15414,6 +15418,10 @@ parse_string_part(pm_parser_t *parser) {
        //     "aaa #{bbb} #@ccc ddd"
        //                 ^^^^^
        case PM_TOKEN_EMBVAR: {
+            // Ruby disallows seeing encoding around interpolation in strings,
+            // even though it is known at parse time.
+            parser->explicit_encoding = NULL;
+
            lex_state_set(parser, PM_LEX_STATE_BEG);
            parser_lex(parser);

--- a/prism/util/pm_strpbrk.c
+++ b/prism/util/pm_strpbrk.c
@ -8,6 +8,27 @@ pm_strpbrk_invalid_multibyte_character(pm_parser_t *parser, const uint8_t *start
    pm_diagnostic_list_append_format(&parser->error_list, start, end, PM_ERR_INVALID_MULTIBYTE_CHARACTER, *start);
 }

+/**
+ * Set the explicit encoding for the parser to the current encoding.
+ */
+static inline void
+pm_strpbrk_explicit_encoding_set(pm_parser_t *parser, const uint8_t *source, size_t width) {
+    if (parser->explicit_encoding != NULL) {
+        if (parser->explicit_encoding == parser->encoding) {
+            // Okay, we already locked to this encoding.
+        } else if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) {
+            // Not okay, we already found a Unicode escape sequence and this
+            // conflicts.
+            pm_diagnostic_list_append_format(&parser->error_list, source, source + width, PM_ERR_MIXED_ENCODING, parser->encoding->name);
+        } else {
+            // Should not be anything else.
+            assert(false && "unreachable");
+        }
+    }
+
+    parser->explicit_encoding = parser->encoding;
+}
+
 /**
 * This is the default path.
 */
@ -52,7 +73,7 @@ pm_strpbrk_utf8(pm_parser_t *parser, const uint8_t *source, const uint8_t *chars
 * This is the path when the encoding is ASCII-8BIT.
 */
 static inline const uint8_t *
-pm_strpbrk_ascii_8bit(const uint8_t *source, const uint8_t *charset, size_t maximum) {
+pm_strpbrk_ascii_8bit(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
    size_t index = 0;

    while (index < maximum) {
@ -60,6 +81,7 @@ pm_strpbrk_ascii_8bit(const uint8_t *source, const uint8_t *charset, size_t maxi
            return source + index;
        }

+        if (validate && source[index] >= 0x80) pm_strpbrk_explicit_encoding_set(parser, source, 1);
        index++;
    }

@ -72,6 +94,7 @@ pm_strpbrk_ascii_8bit(const uint8_t *source, const uint8_t *charset, size_t maxi
 static inline const uint8_t *
 pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
    size_t index = 0;
+    const pm_encoding_t *encoding = parser->encoding;

    while (index < maximum) {
        if (strchr((const char *) charset, source[index]) != NULL) {
@ -81,7 +104,8 @@ pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t
        if (source[index] < 0x80) {
            index++;
        } else {
-            size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
+            size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
+            if (validate) pm_strpbrk_explicit_encoding_set(parser, source, width);

            if (width > 0) {
                index += width;
@ -96,7 +120,7 @@ pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t

                do {
                    index++;
-                } while (index < maximum && parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
+                } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);

                pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
            }
@ -113,6 +137,7 @@ pm_strpbrk_multi_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t
 static inline const uint8_t *
 pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, size_t maximum, bool validate) {
    size_t index = 0;
+    const pm_encoding_t *encoding = parser->encoding;

    while (index < maximum) {
        if (strchr((const char *) charset, source[index]) != NULL) {
@ -122,7 +147,8 @@ pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t
        if (source[index] < 0x80 || !validate) {
            index++;
        } else {
-            size_t width = parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
+            size_t width = encoding->char_width(source + index, (ptrdiff_t) (maximum - index));
+            pm_strpbrk_explicit_encoding_set(parser, source, width);

            if (width > 0) {
                index += width;
@ -135,7 +161,7 @@ pm_strpbrk_single_byte(pm_parser_t *parser, const uint8_t *source, const uint8_t

                do {
                    index++;
-                } while (index < maximum && parser->encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);
+                } while (index < maximum && encoding->char_width(source + index, (ptrdiff_t) (maximum - index)) == 0);

                pm_strpbrk_invalid_multibyte_character(parser, source + start, source + index);
            }
@ -171,7 +197,7 @@ pm_strpbrk(pm_parser_t *parser, const uint8_t *source, const uint8_t *charset, p
    } else if (!parser->encoding_changed) {
        return pm_strpbrk_utf8(parser, source, charset, (size_t) length, validate);
    } else if (parser->encoding == PM_ENCODING_ASCII_8BIT_ENTRY) {
-        return pm_strpbrk_ascii_8bit(source, charset, (size_t) length);
+        return pm_strpbrk_ascii_8bit(parser, source, charset, (size_t) length, validate);
    } else if (parser->encoding->multibyte) {
        return pm_strpbrk_multi_byte(parser, source, charset, (size_t) length, validate);
    } else {