diff --git a/parse.y b/parse.y index bd3251e4e1..5642a55f67 100644 --- a/parse.y +++ b/parse.y @@ -7260,6 +7260,8 @@ tokadd_codepoint(struct parser_params *p, rb_encoding **encp, return TRUE; } +static int tokadd_mbchar(struct parser_params *p, int c); + /* return value is for ?\u3042 */ static void tokadd_utf8(struct parser_params *p, rb_encoding **encp, @@ -7277,44 +7279,71 @@ tokadd_utf8(struct parser_params *p, rb_encoding **encp, if (regexp_literal) { tokadd(p, '\\'); tokadd(p, 'u'); } if (peek(p, open_brace)) { /* handle \u{...} form */ - const char *second = NULL; - int c, last = nextc(p); - if (p->lex.pcur >= p->lex.pend) goto unterminated; - while (ISSPACE(c = *p->lex.pcur) && ++p->lex.pcur < p->lex.pend); - while (c != close_brace) { - if (c == term) goto unterminated; - if (second == multiple_codepoints) - second = p->lex.pcur; - if (regexp_literal) tokadd(p, last); - if (!tokadd_codepoint(p, encp, regexp_literal, TRUE)) { - break; - } - while (ISSPACE(c = *p->lex.pcur)) { - if (++p->lex.pcur >= p->lex.pend) goto unterminated; - last = c; - } - if (term == -1 && !second) - second = multiple_codepoints; - } + if (regexp_literal && p->lex.strterm->u.literal.u1.func == str_regexp) { + /* + * Skip parsing validation code and copy bytes as-is until term or + * closing brace, in order to correctly handle extended regexps where + * invalid unicode escapes are allowed in comments. The regexp parser + * does its own validation and will catch any issues. + */ + int c = *p->lex.pcur; + tokadd(p, c); + for (c = *++p->lex.pcur; p->lex.pcur < p->lex.pend; c = *++p->lex.pcur) { + if (c == close_brace) { + tokadd(p, c); + ++p->lex.pcur; + break; + } + else if (c == term) { + break; + } + if (c == '\\' && p->lex.pcur + 1 < p->lex.pend) { + tokadd(p, c); + c = *++p->lex.pcur; + } + tokadd_mbchar(p, c); + } + } + else { + const char *second = NULL; + int c, last = nextc(p); + if (p->lex.pcur >= p->lex.pend) goto unterminated; + while (ISSPACE(c = *p->lex.pcur) && ++p->lex.pcur < p->lex.pend); + while (c != close_brace) { + if (c == term) goto unterminated; + if (second == multiple_codepoints) + second = p->lex.pcur; + if (regexp_literal) tokadd(p, last); + if (!tokadd_codepoint(p, encp, regexp_literal, TRUE)) { + break; + } + while (ISSPACE(c = *p->lex.pcur)) { + if (++p->lex.pcur >= p->lex.pend) goto unterminated; + last = c; + } + if (term == -1 && !second) + second = multiple_codepoints; + } - if (c != close_brace) { - unterminated: - token_flush(p); - yyerror0("unterminated Unicode escape"); - return; - } - if (second && second != multiple_codepoints) { - const char *pcur = p->lex.pcur; - p->lex.pcur = second; - dispatch_scan_event(p, tSTRING_CONTENT); - token_flush(p); - p->lex.pcur = pcur; - yyerror0(multiple_codepoints); - token_flush(p); - } + if (c != close_brace) { + unterminated: + token_flush(p); + yyerror0("unterminated Unicode escape"); + return; + } + if (second && second != multiple_codepoints) { + const char *pcur = p->lex.pcur; + p->lex.pcur = second; + dispatch_scan_event(p, tSTRING_CONTENT); + token_flush(p); + p->lex.pcur = pcur; + yyerror0(multiple_codepoints); + token_flush(p); + } - if (regexp_literal) tokadd(p, close_brace); - nextc(p); + if (regexp_literal) tokadd(p, close_brace); + nextc(p); + } } else { /* handle \uxxxx form */ if (!tokadd_codepoint(p, encp, regexp_literal, FALSE)) { diff --git a/test/ruby/test_parse.rb b/test/ruby/test_parse.rb index 4488ea620e..a22f11aeae 100644 --- a/test/ruby/test_parse.rb +++ b/test/ruby/test_parse.rb @@ -1041,6 +1041,22 @@ x = __ENCODING__ assert_syntax_error(" 0b\n", /\^/) end + def test_unclosed_unicode_escape_at_eol_bug_19750 + assert_separately([], "#{<<-"begin;"}\n#{<<~'end;'}") + begin; + assert_syntax_error("/\\u", /too short escape sequence/) + assert_syntax_error("/\\u{", /unterminated regexp meets end of file/) + assert_syntax_error("/\\u{\\n", /invalid Unicode list/) + assert_syntax_error("/a#\\u{\\n/", /invalid Unicode list/) + re = eval("/a#\\u{\n$/x") + assert_match(re, 'a') + assert_not_match(re, 'a#') + re = eval("/a#\\u\n$/x") + assert_match(re, 'a') + assert_not_match(re, 'a#') + end; + end + def test_error_def_in_argument assert_separately([], "#{<<-"begin;"}\n#{<<~"end;"}") begin; diff --git a/version.h b/version.h index 403b4b020f..5c14532dd5 100644 --- a/version.h +++ b/version.h @@ -11,7 +11,7 @@ # define RUBY_VERSION_MINOR RUBY_API_VERSION_MINOR #define RUBY_VERSION_TEENY 2 #define RUBY_RELEASE_DATE RUBY_RELEASE_YEAR_STR"-"RUBY_RELEASE_MONTH_STR"-"RUBY_RELEASE_DAY_STR -#define RUBY_PATCHLEVEL 92 +#define RUBY_PATCHLEVEL 93 #include "ruby/version.h" #include "ruby/internal/abi.h"